This notebook contains scripts to analyze results from the HealthBench eval.

Running the HealthBench eval in `simple-evals` yields files with names like `healthbench{OPTIONAL_SUBSET_NAME}_{MODEL_NAME}_{DATETIME}.json`, plus a matching `*_allresults.json` file for each run, saved to your `/tmp/` folder by default.

To analyze HealthBench results, move these files to a folder of your choice, set `tmp_dir` to that folder and list the file names in the third cell of the `Imports and consts` section below, and then run the rest of the notebook.

The key data are the aggregate metrics in the `.json` file and the example-level metadata under the `metadata.example_level_metadata` key of the `_allresults.json` file.

## Imports and consts

In [1]:
import json
import pandas as pd
from typing import Literal
import matplotlib.pyplot as plt
import numpy as np
import textwrap
import blobfile as bf
from concurrent.futures import ThreadPoolExecutor
import itertools
from collections import Counter, defaultdict
import matplotlib.dates as mdates
import seaborn as sns
import os

In [2]:
# Global seaborn/matplotlib theme; applies to every figure drawn after this cell runs.
sns.set_theme(
    style="dark",
    palette="muted",
    font="serif",
    rc={
        "figure.dpi": 120, # modify for paper figures
        "axes.titleweight": "normal",
        "axes.labelweight": "normal",
        "axes.spines.top": False,
        "axes.spines.right": False,
        "legend.frameon": False,
        "figure.autolayout": True,
        "legend.fontsize": "small",
        "legend.title_fontsize": "medium",
        "xtick.labelsize": "small",
        "ytick.labelsize": "small",
    }
)

In [3]:
# HealthBench dataset files; both are read line by line as JSON in the "Descriptive stats" section.
fp_main_eval = 'az://openaipublic/simple-evals/healthbench/2025-05-07-06-14-12_oss_eval.jsonl'
fp_meta_eval = 'az://openaipublic/simple-evals/healthbench/2025-05-07-06-14-12_oss_meta_eval.jsonl'

# what directory should results be loaded from?
# NOTE(review): machine-specific absolute path; edit before running on another machine.
tmp_dir = '/Users/rahul/Documents/healthbench/data/'
# what directory should results be saved to?
# NOTE(review): machine-specific absolute path; edit before running on another machine.
results_dir = '/Users/rahul/Documents/healthbench/results/'

# Each multi-line string below holds one result file name per line (relative to tmp_dir).
# The paired *_allresults_* list is derived by swapping '.json' for '_allresults.json'.
main_filename_list = """healthbench_gpt-3.5-turbo-0125_20250507_0653.json
healthbench_gpt-4.1_20250507_0653.json""".splitlines()
main_allresults_filename_list = [f.replace('.json', '_allresults.json') for f in main_filename_list]

human_eval_filename_list = """healthbench_apr_2025_reference_referencecompletions_20250507_0713.json
healthbench_aug_2024_reference_referencecompletions_20250507_0713.json
healthbench_apr_2025_reference_humanbaseline_20250507_0659.json
healthbench_aug_2024_reference_humanbaseline_20250507_0659.json
healthbench_no_reference_humanbaseline_20250507_0659.json""".splitlines()
human_eval_allresults_filename_list = [f.replace('.json', '_allresults.json') for f in human_eval_filename_list]

# NOTE(review): despite the `_str` suffix this is a list (the result of .splitlines()).
meta_eval_filename_str = """healthbench_meta_gpt-4.1-mini_20250511_051648.json""".splitlines()
meta_eval_allresults_filename_list = [f.replace('.json', '_allresults.json') for f in meta_eval_filename_str]

hard_filename_list = """healthbench_hard_o3_20250508_204645.json
healthbench_hard_o1_20250508_204650.json
healthbench_hard_gpt-4o-2024-08-06_20250508_204655.json
healthbench_hard_gpt-4.1_20250508_204647.json
healthbench_hard_gpt-3.5-turbo-0125_20250508_204657.json""".splitlines()
hard_allresults_filename_list = [f.replace('.json', '_allresults.json') for f in hard_filename_list]

consensus_set_filename_list = """healthbench_consensus_gpt-3.5-turbo-0125_20250509_131818.json
healthbench_consensus_gpt-4.1_20250509_131807.json
healthbench_consensus_gpt-4o-2024-08-06_20250509_131821.json
healthbench_consensus_o1_20250509_131813.json
healthbench_consensus_o3_20250509_131804.json""".splitlines()
consensus_allresults_filename_list = [f.replace('.json', '_allresults.json') for f in consensus_set_filename_list]

# used for all analyses that require many replicates, including worst-at-k, inter-replicate variance, etc. everything here should have been run with n_repeats > 1
# Unlike the lists above, these entries are already the `_allresults.json` file names.
many_replicate_filename_list = """healthbench_o1_20250507_0653_allresults.json
healthbench_o3_20250507_0653_allresults.json""".splitlines()

In [4]:
# Model identifier -> first release date, written as a YYYY-MM-DD string.
model_first_release = {
    "o3": "2025-04-16",
    "o4-mini": "2025-04-16",
    "gpt-4.1": "2025-04-14",
    "gpt-4.1-mini": "2025-04-14",
    "gpt-4.1-nano": "2025-04-14",
    "o1": "2024-12-05",
    "o1-pro": "2024-12-05",
    "o3-mini": "2025-01-31",
    "o1-mini": "2024-09-12",
    "o1-preview": "2024-09-12",
    "gpt-4.5-preview": "2025-02-27",
    "gpt-4o-2024-11-20": "2024-11-20",
    "gpt-4o-2024-08-06": "2024-08-06",
    "gpt-4o-2024-05-13": "2024-05-13",
    "chatgpt-4o-latest": "2025-03-27",
    "gpt-4o-mini": "2024-07-18",
    "gpt-4-turbo-2024-04-09": "2024-04-09",
    "gpt-3.5-turbo-0125": "2024-01-25",
    "gpt-4-0613": "2023-06-13",
}

# Model / run identifier (as parsed from a result file name) -> display name used for
# plot tick labels, legends and exported tables. Lookups are by plain indexing, so an
# identifier missing from this dict raises KeyError in the plotting helpers.
# NOTE(review): the 'aug_2024_*' keys are labelled "Sep 2024" — confirm this is intentional.
models_to_canonical_name = {
    'o3': "o3",
    'o3_high': "o3-high",
    'o3_low': "o3-low",
    'o4-mini': "o4-mini",
    'o4-mini_high': "o4-mini-high",
    'o4-mini_low': "o4-mini-low",
    'gpt-4.1': "GPT-4.1",
    'gpt-4.1-mini': "GPT-4.1 mini",
    'gpt-4.1-nano': "GPT-4.1 nano",
    'o1': "o1",
    "o1_high": "o1-high",
    "o1_low": "o1-low",
    'o1-pro': "o1-pro",
    'o1-preview': "o1-preview",
    'o1-mini': "o1-mini",
    'o3-mini': "o3-mini",
    'o3-mini_low': "o3-mini-low",
    'o3-mini_high': "o3-mini-high",
    'gpt-4.5-preview': "gpt-4.5-preview",
    'gpt-4o-2024-11-20': "GPT-4o (Nov 2024)",
    'gpt-4o-2024-08-06': "GPT-4o (Aug 2024)",
    'gpt-4o-2024-05-13': "GPT-4o (May 2024)",
    'gpt-4o': "GPT-4o (Aug 2024)",
    'chatgpt-4o-latest': "ChatGPT-4o (latest)",
    'gpt-4o-mini': "GPT-4o mini",
    'gpt-4-turbo-2024-04-09': "GPT-4 Turbo",
    'gpt-3.5-turbo-0125': "GPT-3.5 Turbo",
    'gpt-4-0613': "GPT-4",
    'apr_2025_reference_humanbaseline': "Physicians with Apr 2025 models",
    'apr_2025_reference_referencecompletions': "Apr 2025 model reference responses",
    'aug_2024_reference_humanbaseline': "Physicians with Sep 2024 models",
    'aug_2024_reference_referencecompletions': "Sep 2024 model reference responses",
    'no_reference_humanbaseline': "Physicians with no reference",
}

In [5]:
# Display names for themes. Keys are the FIRST underscore-separated token of the full
# theme name (e.g. 'emergency_referrals' -> 'emergency'); callers look names up with
# `name.split("_", 1)[0]`.
CANONICAL_CLUSTER_NAMES = {
    'hedging': "Responding under uncertainty",
    'health': "Health data tasks",
    'global': "Global health",
    "communication": "Expertise-tailored communication",
    "context": "Context seeking",
    "emergency": "Emergency referrals",
    "complex": "Response depth"
}
# Order in which themes are shown in plots and tables (full theme names).
CLUSTER_SORT_ORDER = [
    'emergency_referrals',
    'communication',
    'hedging',
    'complex_responses',
    'health_data_tasks',
    'global_health',
    'context_seeking',
]
# Full theme names as a set; used to recognise the theme at the start of a
# 'cluster:<theme>_...' metric key.
FULL_CLUSTER_NAMES = {
    'communication',
    'emergency_referrals',
    'global_health',
    'health_data_tasks',
    'context_seeking',
    'complex_responses',
    'hedging',
}
# Same order as CLUSTER_SORT_ORDER, reduced to the short keys of CANONICAL_CLUSTER_NAMES.
CLUSTER_SORT_ORDER_SHORT = [c.split("_", 1)[0] for c in CLUSTER_SORT_ORDER]
# Display names for rubric axes, keyed by the axis name without its 'axis:' prefix.
CANONICAL_AXIS_NAMES = {
    "communication_quality": "Communication quality",
    "instruction_following": "Instruction following",
    "accuracy": "Accuracy",
    "completeness": "Completeness",
    "context_awareness": "Context awareness",
}
# Order in which axes are shown in plots.
AXIS_SORT_ORDER = [
    'communication_quality',
    'instruction_following',
    'accuracy',
    'context_awareness',
    'completeness',
]

# Order in which the human-baseline / reference-completion runs are shown in plots.
HUMAN_EVAL_SORT_ORDER = [
    'apr_2025_reference_humanbaseline',
    'apr_2025_reference_referencecompletions',
    'aug_2024_reference_humanbaseline',
    'aug_2024_reference_referencecompletions',
    'no_reference_humanbaseline',
]

## Descriptive stats

In [6]:
def _strip_prefix(tag: str) -> str:
    """
    Remove known prefixes from a tag and return a more human-readable string.
    """
    if ":" in tag:
        tag = tag.split(":", 1)[1]
    return tag.replace("_", " ").replace("-", " ").capitalize()


def get_example_summary(states: list[dict]) -> pd.DataFrame:
    """
    Build a LaTeX-ready table of example counts per theme and per
    physician-agreed category.

    Parameters
    ----------
    states : list[dict]
        Examples to summarise. Each must have an ``"example_tags"`` list of
        strings containing exactly one ``"theme:..."`` tag and any number of
        ``"physician_agreed_category:..."`` tags.

    Returns
    -------
    pd.DataFrame
        String columns ``Count`` and ``Percent``, indexed by row label (index
        name ``"Theme and clusters"``). The first row is the overall total;
        then, for each theme in descending order of frequency, one bold theme
        row (percent of all examples) followed by its categories in descending
        order of frequency (percent of that theme), indented with ``\\qquad``.
        Labels and bold cells contain raw LaTeX markup.

    Raises
    ------
    AssertionError
        If an example does not carry exactly one theme tag.
    """
    total_examples: int = len(states)

    # Overall total
    row_labels: list[str] = [r"\textbf{Total number of HealthBench examples}"]
    row_values: list[dict[str, str]] = [
        {"Count": f"\\textbf{{{total_examples}}}", "Percent": f"\\textbf{{{100.0:.1f}}}"}
    ]

    # Count themes, and categories grouped under the theme of their example, in one pass.
    theme_counts: Counter[str] = Counter()
    category_counts_by_theme: dict[str, Counter[str]] = {}
    for state in states:
        theme_tags = [t for t in state["example_tags"] if t.startswith("theme:")]
        assert len(theme_tags) == 1, f"Expected 1 theme tag, got {len(theme_tags)} for state {state}"
        theme_tag = theme_tags[0]
        theme_counts[theme_tag] += 1

        per_theme = category_counts_by_theme.setdefault(theme_tag, Counter())
        for t in state["example_tags"]:
            if t.startswith("physician_agreed_category:"):
                per_theme[t] += 1

    for theme_tag, n_theme in theme_counts.most_common():
        # Theme row (bolded); percentage is relative to all examples.
        row_labels.append(r"\textbf{" + _strip_prefix(theme_tag) + "}")
        row_values.append(
            {
                "Count": f"\\textbf{{{n_theme}}}",
                "Percent": f"\\textbf{{{n_theme / total_examples * 100:.1f}}}",
            }
        )

        # Category rows, most frequent first; percentage is relative to the theme.
        # most_common() is a stable descending sort, so ties keep first-seen order.
        for category_tag, n_in_category in category_counts_by_theme[theme_tag].most_common():
            # Raw string: "\q" is not a valid escape sequence in a normal literal.
            row_labels.append(r"\qquad " + _strip_prefix(category_tag))
            row_values.append(
                {
                    "Count": str(n_in_category),
                    "Percent": f"{n_in_category / n_theme * 100:.1f}",
                }
            )

    df_examples = pd.DataFrame(row_values, index=row_labels)
    df_examples.index.name = "Theme and clusters"
    return df_examples

def get_rubric_criteria_summary(states: list[dict]) -> pd.DataFrame:
    """
    Tabulate rubric criteria by level and by axis as a LaTeX-ready table.

    Every rubric in ``state["rubrics"]`` must carry exactly one ``"axis:..."``
    and exactly one ``"level:..."`` string tag in ``rubric["tags"]``, and the
    only levels counted are ``level:cluster`` and ``level:example`` (asserted).

    Returns a DataFrame with columns ``Count`` and ``Percent`` (percent of all
    rubric criteria, two decimals), indexed by row label with index name
    ``"Category"``: an overall row, the level breakdown, then the axis breakdown
    ordered by descending frequency. Bold rows hold LaTeX ``\\textbf{...}``
    strings; sub-rows hold plain integer counts.
    """

    def _only_tag(rubric: dict, kind: str) -> str:
        # Exactly one "<kind>:..." string tag is expected on every rubric.
        matching = [t for t in rubric["tags"] if isinstance(t, str) and t.startswith(f"{kind}:")]
        assert len(matching) == 1, f"Expected 1 {kind} tag, got {len(matching)} for rubric {rubric}"
        return matching[0]

    def _bold(text) -> str:
        return f"\\textbf{{{text}}}"

    axis_counts = Counter()
    level_counts = Counter()
    for state in states:
        for rubric in state["rubrics"]:
            axis_counts[_only_tag(rubric, "axis")] += 1
            level_counts[_only_tag(rubric, "level")] += 1

    total_rubrics = sum(axis_counts.values())
    n_cluster = level_counts["level:cluster"]
    n_example = level_counts["level:example"]
    n_level = n_cluster + n_example
    assert n_level == total_rubrics, f"Expected n_level to equal total_rubrics, but got {n_level} != {total_rubrics}"

    def _pct(count: int) -> str:
        return f"{count / total_rubrics * 100:.2f}"

    # (label, count cell, percent cell) in display order.
    rows = [
        # (i) Overall (bold)
        (r"\textbf{All rubric criteria}", _bold(total_rubrics), _bold("100.00")),
        # (ii) Level breakdown (bold header, indented sub-rows)
        (r"\textbf{Level}", _bold(n_level), _bold(_pct(n_level))),
        (r"\qquad Cluster", n_cluster, _pct(n_cluster)),
        (r"\qquad Example-specific", n_example, _pct(n_example)),
        # (iii) Axis breakdown (bold header, indented sub-rows)
        (r"\textbf{Axis}", _bold(total_rubrics), _bold("100.00")),
    ]
    rows.extend(
        (r"\qquad " + _strip_prefix(axis_tag), n_axis, _pct(n_axis))
        for axis_tag, n_axis in axis_counts.most_common()
    )

    df_rubric_criteria = pd.DataFrame(
        [{"Count": count, "Percent": pct} for _, count, pct in rows],
        index=[label for label, _, _ in rows],
    )
    df_rubric_criteria.index.name = "Category"

    return df_rubric_criteria

In [None]:
# Read the main HealthBench dataset: one JSON document per line.
with bf.BlobFile(fp_main_eval, 'r') as f:
    main_eval_states = [json.loads(line) for line in f]

In [None]:
# Example counts per theme and physician-agreed category for the main eval set.
df_examples = get_example_summary(main_eval_states)

display(df_examples)
# The row labels and bold cells embed LaTeX markup (\textbf, \qquad), so escaping must
# be off for the markup to survive. Pass it explicitly instead of relying on the
# pandas default, which differs between pandas versions.
print(df_examples.to_latex(escape=False))

In [None]:
# Rubric-criteria counts by level and axis for the main eval set.
df_rubric_criteria = get_rubric_criteria_summary(main_eval_states)
display(df_rubric_criteria)
# escape=False keeps the LaTeX markup embedded in the labels and bold cells intact.
print(df_rubric_criteria.to_latex(escape=False))

In [None]:
# Per-example size statistics: number of prompt turns, number of rubric criteria,
# and total characters across all prompt turns.
n_turns_list = []
n_rubrics_list = []
n_chars_list = []
for example in main_eval_states:
    n_rubrics_list.append(len(example["rubrics"]))
    n_turns_list.append(len(example["prompt"]))
    n_chars = 0
    for turn in example["prompt"]:
        n_chars += len(turn["content"])
    n_chars_list.append(n_chars)

quantiles = [0, 0.05, 0.25, 0.5, 0.75, 0.95, 1.0]
n_turns_quantiles = np.quantile(n_turns_list, quantiles)
n_rubrics_quantiles = np.quantile(n_rubrics_list, quantiles)
n_chars_quantiles = np.quantile(n_chars_list, quantiles)

# Also compute means
n_turns_mean = np.mean(n_turns_list)
n_rubrics_mean = np.mean(n_rubrics_list)
n_chars_mean = np.mean(n_chars_list)

# Quantile values are truncated to integers; rows are labelled "0%", "5%", ..., "100%".
df_quantiles = pd.DataFrame({
    "n_turns": n_turns_quantiles.astype(int),
    "n_rubrics": n_rubrics_quantiles.astype(int),
    "n_chars": n_chars_quantiles.astype(int),
}, index=[f"{int(q*100)}%" for q in quantiles])

# Insert mean as a new row after the 50% (median) row
mean_row = pd.DataFrame({
    "n_turns": [n_turns_mean],
    "n_rubrics": [n_rubrics_mean],
    "n_chars": [n_chars_mean],
}, index=["mean"])

# Find the position of the 50% row
median_idx = list(df_quantiles.index).index("50%")
# Split and insert mean after median
df_quantiles = pd.concat([
    df_quantiles.iloc[:median_idx+1],
    mean_row,
    df_quantiles.iloc[median_idx+1:]
])

display(df_quantiles)
# This table has no embedded LaTeX markup, but its labels contain "%" and "_", which
# are special characters in LaTeX. Request escaping explicitly, because the pandas
# default for `escape` differs between pandas versions.
print(df_quantiles.to_latex(escape=True))

In [None]:
# how many examples have at least one clustered criterion?
# A criterion counts as "clustered" when its rubric tags contain "level:cluster".
n_states_with_clustered_criterion = 0
# NOTE(review): n_clustered_criteria is initialised but never updated or printed in this cell.
n_clustered_criteria = 0
# Distinct criterion texts among the clustered rubrics, across all examples.
unique_clustered_criteria = set()
for state in main_eval_states:
    has_clustered_criterion = False
    for rubric in state["rubrics"]:
        if "level:cluster" in rubric["tags"]:
            has_clustered_criterion = True
            unique_clustered_criteria.add(rubric["criterion"])

    if has_clustered_criterion:
        n_states_with_clustered_criterion += 1

print(f"Number of examples with at least one clustered criterion: {n_states_with_clustered_criterion}")
print(f"Number of unique clustered criteria: {len(unique_clustered_criteria)}")

In [None]:
# how many distinct criterion texts exist across all examples, and how many of them occur more than once?
unique_criteria = set()
# Criterion texts encountered at least twice (a text is added here on its second and later sightings).
already_seen_criteria = set()
for state in main_eval_states:
    for rubric in state["rubrics"]:
        if rubric["criterion"] in unique_criteria:
            already_seen_criteria.add(rubric["criterion"])
        unique_criteria.add(rubric["criterion"])

print(f"Number of unique criteria: {len(unique_criteria)}")
print(f"Number of criteria seen more than once: {len(already_seen_criteria)}")

In [None]:
# Read the meta-eval dataset: one JSON document per line.
with bf.BlobFile(fp_meta_eval, 'r') as f:
    meta_eval_states = [json.loads(line) for line in f]

In [None]:
# as defined in the paper, a meta-eval example is a tuple of (rubric criterion / category, conversation, response, physician grade)
# Each loaded state is expanded into one example per entry of its 'binary_labels' list.
meta_eval_examples = [
    {
        'category': s['category'],
        'conversation': s['prompt'],
        'response': s['completion'],
        'binary_label': label
    }
    for s in meta_eval_states
    for label in s['binary_labels']
]

# Number of meta-eval examples per category.
meta_category_counts = Counter(s['category'] for s in meta_eval_examples)
meta_category_count_vals = list(meta_category_counts.values())
# Cell output: (total examples, min per category, max per category, mean per category).
len(meta_eval_examples), min(meta_category_count_vals), max(meta_category_count_vals), np.mean(meta_category_count_vals)


## Data loading code

In [8]:
def filename_to_model(filename: str) -> str:
    """
    Extract the model / run identifier from a result file name.

    The text before the first "_" is dropped, then the trailing
    "_"-separated fields are removed: two for a summary file, three when the
    name contains "_allresults". For example,
    ``healthbench_hard_o3_20250508_204645.json`` gives ``hard_o3``.
    """
    n_trailing_fields = 3 if '_allresults' in filename else 2
    after_first_field = filename.split('_', 1)[1]
    return after_first_field.rsplit('_', n_trailing_fields)[0]

def load_results(filename: str) -> dict:
    """
    Load one JSON result file from ``tmp_dir`` and return its parsed content.

    Prints ``Loaded <filename>`` after a successful read. The file is opened in
    a ``with`` block so the handle is closed even if parsing fails.
    """
    fp = os.path.join(tmp_dir, filename)
    with open(fp) as f:
        results = json.load(f)
    print(f'Loaded {filename}')
    return results

def get_results_by_filename(filenames: list[str]) -> dict:
    """
    Load every listed result file and key its content by model identifier.

    The identifier comes from ``filename_to_model``; if two file names map to
    the same identifier, the later one replaces the earlier one.
    """
    results_by_model = {}
    for name in filenames:
        model = filename_to_model(name)
        results_by_model[model] = load_results(name)
    return results_by_model

In [9]:
def get_df_from_results_by_model(results_by_model: dict) -> pd.DataFrame:
    """
    Flatten per-model metric dicts into one long DataFrame.

    Parameters
    ----------
    results_by_model : dict
        Model identifier -> metrics dict. For every metric ``name`` (any key
        that does not end in ``bootstrap_std`` or ``n_samples``, except the
        key ``"score"``, which is skipped) the dict must also contain
        ``name + ":bootstrap_std"`` and ``name + ":n_samples"``. All models
        must expose the same set of metric names.

    Returns
    -------
    pd.DataFrame
        Indexed by (``model``, ``metric``) with columns ``value``,
        ``bootstrap_std`` and ``n_samples``. Models keep the order of
        ``results_by_model``; metrics are sorted by name within each model so
        the row order is the same on every run.

    Raises
    ------
    ValueError
        If ``results_by_model`` is empty.
    AssertionError
        If a companion key is missing or the models disagree on metric names.
    """
    if not results_by_model:
        raise ValueError("results_by_model is empty; there is nothing to tabulate")

    companion_suffixes = ('bootstrap_std', 'n_samples')
    metric_names_by_model: dict[str, set[str]] = {}
    for model, metrics in results_by_model.items():
        names = set()
        for key in metrics:
            if key.endswith(companion_suffixes) or key == "score":
                continue
            assert key + ':bootstrap_std' in metrics, f"{model}: missing '{key}:bootstrap_std'"
            assert key + ':n_samples' in metrics, f"{model}: missing '{key}:n_samples'"
            names.add(key)
        metric_names_by_model[model] = names

    # Every model must report the same metrics as the first one.
    reference_model, reference_names = next(iter(metric_names_by_model.items()))
    for model, names in metric_names_by_model.items():
        assert names == reference_names, (
            f"Metric names differ between {reference_model} and {model}: "
            f"{sorted(names ^ reference_names)}"
        )

    # Sorting replaces set iteration order, which varies with the string hash seed.
    metric_names = sorted(reference_names)

    rows = [
        {
            "model": model,
            "metric": name,
            "value": metrics[name],
            "bootstrap_std": metrics[name + ":bootstrap_std"],
            "n_samples": metrics[name + ":n_samples"],
        }
        for model, metrics in results_by_model.items()
        for name in metric_names
    ]
    # Explicit columns keep set_index working when there are no metric rows at all.
    df = pd.DataFrame(rows, columns=["model", "metric", "value", "bootstrap_std", "n_samples"])
    return df.set_index(["model", "metric"])

## Clustered bar plots

In [10]:
def wrap_label(label, width):
    """
    Break ``label`` into lines of at most ``width`` characters, joined by newlines.

    Words longer than ``width`` are left intact rather than split, and
    whitespace characters inside the label are not normalised to spaces.
    """
    wrapper = textwrap.TextWrapper(
        width=width,
        break_long_words=False,
        replace_whitespace=False,
    )
    return "\n".join(wrapper.wrap(label))

def clustered_bar_plot(
    results_by_model: dict,
    theme_or_axis: Literal["theme", "axis"],
    ylabel: str,
    title: str,
    sort_order: list[str] | None = None,
    cluster_sort_order: list[str] | None = None,
    error_as_yaxis: bool = False,
) -> pd.DataFrame:
    """
    Draw a grouped bar chart (one group per model, one bar per theme or axis)
    and return the plotted numbers.

    Parameters
    ----------
    results_by_model : dict
        Model identifier -> metrics dict, as accepted by
        ``get_df_from_results_by_model``.
    theme_or_axis : {"theme", "axis"}
        Selects the metrics whose name starts with ``"theme:"`` or ``"axis:"``.
    ylabel, title : str
        Y-axis label and figure title.
    sort_order : list[str] | None
        Explicit left-to-right model order; must contain exactly the models
        present. When None, models are ordered by descending
        ``"overall_score"``, then by name.
    cluster_sort_order : list[str] | None
        Explicit bar order within a group (metric names without the prefix);
        must contain exactly the clusters present.
    error_as_yaxis : bool
        When True, bars and the overall line show ``1 - value``.

    Returns
    -------
    pd.DataFrame
        One row per (cluster, model) bar with columns ``model``, ``theme``,
        ``success_rate``, ``success_rate_lower`` and ``success_rate_upper``,
        followed by one row per model for the overall score (``theme`` and
        both bounds are None).

    Raises
    ------
    ValueError
        If ``theme_or_axis`` is neither ``"theme"`` nor ``"axis"``.
    AssertionError
        If ``sort_order`` or ``cluster_sort_order`` does not match the data.
    KeyError
        If a model or cluster has no entry in the display-name mappings.
    """
    # get the results
    df = get_df_from_results_by_model(results_by_model)

    # get df by only theme or axis
    clustered_df = df[df.index.get_level_values("metric").str.startswith(f'{theme_or_axis}:')].copy()

    # get models and clusters
    models = clustered_df.index.get_level_values("model").unique()
    clusters = [m.replace(f"{theme_or_axis}:", "") for m in clustered_df.index.get_level_values("metric").unique()]
    if cluster_sort_order is not None:
        assert set(cluster_sort_order) == set(clusters), f"cluster_sort_order must be the same as clusters, but got {cluster_sort_order} vs {clusters}"
        clusters = cluster_sort_order

    # map clusters to canonical names
    if theme_or_axis == "theme":
        # theme display names are keyed by the first "_"-separated token of the theme name
        mapped_clusters = [CANONICAL_CLUSTER_NAMES[c.split("_", 1)[0]] for c in clusters]
    elif theme_or_axis == "axis":
        mapped_clusters = [CANONICAL_AXIS_NAMES[c] for c in clusters]
    else:
        raise ValueError(f"Invalid theme_or_axis: {theme_or_axis}")
    # NOTE(review): this is a plain copy of the list; it does not change the labels.
    mapped_clusters = [c for c in mapped_clusters]

    # sort models
    if sort_order is not None:
        assert set(sort_order) == set(models), f"sort_order must be the same as models, but got {sort_order} vs {models}"
        sorted_models = sort_order
    else:
        model_scores = []
        for model in models:
            score_idx = (model, "overall_score")
            if score_idx in df.index:
                score = df.loc[score_idx, "value"]
            else:
                score = np.nan
            model_scores.append((model, score))

        # highest overall score first; ties broken by model name
        model_scores_sorted = sorted(model_scores, key=lambda x: (-x[1], x[0]))
        sorted_models = [m for m, s in model_scores_sorted]

    # plot
    # each model group spans 0.8 x-units, shared equally by its bars
    bar_width = 0.8 / len(clusters)
    x = np.arange(len(sorted_models))

    fig, ax = plt.subplots(figsize=(12, 6))
    # NOTE(review): the return value is discarded, so this call presumably has no effect
    # on the colours used below — confirm whether the "colorblind" palette was intended.
    sns.color_palette("colorblind")

    colors = sns.color_palette(n_colors=len(clusters))

    # rows for the DataFrame returned to the caller
    data_to_write = []

    bars = []
    for i, (cluster, mapped_cluster) in enumerate(zip(clusters, mapped_clusters)):
        values = []
        stds = []
        for model in sorted_models:
            idx = (model, f"{theme_or_axis}:{cluster}")
            if idx in clustered_df.index:
                if error_as_yaxis:
                    values.append(1 - clustered_df.loc[idx, "value"])
                else:
                    values.append(clustered_df.loc[idx, "value"])
                stds.append(clustered_df.loc[idx, "bootstrap_std"])
            else:
                # missing metric: bar height NaN, zero-length error bar
                values.append(np.nan)
                stds.append(0)
        # bar i of every group sits i bar-widths to the right of the group's left edge
        bar = ax.bar(
            x + i * bar_width,
            values,
            width=bar_width,
            yerr=stds,
            label=wrap_label(mapped_cluster, width=5),
            capsize=2,
            align="edge",
            color=colors[i],
        )
        bars.append(bar)

        for model, val, std in zip(sorted_models, values, stds, strict=True):
            # the column is named "theme" even when theme_or_axis == "axis";
            # bounds are the value -/+ one bootstrap std, clamped to [0, 1]
            data_to_write.append({
                "model": models_to_canonical_name[model],
                "theme": mapped_cluster,
                "success_rate": val,
                "success_rate_lower": max(0, val - std),
                "success_rate_upper": min(1, val + std),
            })

    ax.set_xticks(x + bar_width * (len(clusters) - 1) / 2)

    wrapped_labels = [wrap_label(models_to_canonical_name[m], width=10) for m in sorted_models]
    ax.set_xticklabels(wrapped_labels)

    ax.set_ylabel(ylabel)
    ax.set_title(title)

    # the legend is built from explicit handles and labels: one entry per cluster
    legend_handles = [bars[i][0] for i in range(len(clusters))]
    wrapped_mapped_clusters = [wrap_label(c, width=20) for c in mapped_clusters]
    ax.legend(legend_handles, wrapped_mapped_clusters, title=theme_or_axis.capitalize(), bbox_to_anchor=(1.05, 1), loc="upper left")

    # dashed line and numeric label across each model group at its overall score
    for i, model in enumerate(sorted_models):
        score_idx = (model, "overall_score")
        if score_idx in df.index:
            if error_as_yaxis:
                score = 1 - df.loc[score_idx, "value"]
            else:
                score = df.loc[score_idx, "value"]
            left = x[i]
            right = x[i] + bar_width * len(clusters)
            ax.hlines(
                y=score,
                xmin=left,
                xmax=right,
                colors="black",
                linestyles="dashed",
                linewidth=1.5,
                label=None if i > 0 else "Model Score"
            )
            ax.text(
                left + (right - left) / 2,
                score,
                f"{score:.2f}",
                ha="center",
                va="bottom",
                fontsize=9,
                color="black",
                fontweight="bold",
                bbox=dict(facecolor="white", edgecolor="none", alpha=0.7, pad=0.5)
            )

            # overall-score row: no cluster name and no bounds
            data_to_write.append({
                "model": models_to_canonical_name[model],
                "theme": None,
                "success_rate": score,
                "success_rate_lower": None,
                "success_rate_upper": None,
            })

    # Ensure y-axis starts at zero
    ax.set_ylim(bottom=0)

    plt.tight_layout()
    plt.show()

    return pd.DataFrame(data_to_write)

def save_csv_and_print(df, filename):
    """
    Write ``df`` as CSV to ``results_dir/filename``, print the destination
    path, and return ``df`` unchanged.
    """
    csv_text = df.to_csv()
    destination = bf.join(results_dir, filename)
    bf.write_text(destination, csv_text)
    print(destination)
    return df

### By theme

In [None]:
# Load the summary-metric file of every model in main_filename_list, keyed by model identifier.
main_eval_results_by_model = get_results_by_filename(main_filename_list)

In [None]:
# Grouped bars of per-theme scores for the main-eval models.
theme_df = clustered_bar_plot(
    main_eval_results_by_model,
    "theme",
    ylabel="Score",
    title="HealthBench scores by theme",
    cluster_sort_order=CLUSTER_SORT_ORDER,
)

In [None]:
# Export the plotted theme data; the returned frame is shown as the cell output.
save_csv_and_print(theme_df, 'theme_data.csv')

### By axis

In [None]:
# Grouped bars of per-axis scores for the main-eval models.
axis_df = clustered_bar_plot(
    main_eval_results_by_model,
    "axis",
    ylabel="Score",
    title="HealthBench scores by axis",
    cluster_sort_order=AXIS_SORT_ORDER,
)

In [None]:
# Export the plotted axis data; the returned frame is shown as the cell output.
save_csv_and_print(axis_df, 'axis_data.csv')

### Hard subset

In [None]:
# Load the Hard-subset summaries and plot their per-axis scores.
results_by_model_hard = get_results_by_filename(hard_filename_list)
results_by_model_hard = {
    # normalize the model names: identifiers parsed from these files look like
    # 'hard_o3'. Only the leading subset marker is dropped; str.replace would also
    # rewrite a 'hard_' occurring later in a name.
    k.removeprefix('hard_'): v
    for k, v in results_by_model_hard.items()
}
hard_df = clustered_bar_plot(
    results_by_model_hard,
    theme_or_axis="axis",
    ylabel="Score",
    title="HealthBench Hard subset scores by axis",
    sort_order=None,  # order models by descending overall score
    cluster_sort_order=AXIS_SORT_ORDER,
)

In [None]:
# Export the plotted Hard-subset data; the returned frame is shown as the cell output.
save_csv_and_print(hard_df, 'hard_subset_data.csv')

### Consensus subset

In [None]:
# Load the Consensus-subset summaries and plot their per-theme error rates.
results_by_model_consensus = get_results_by_filename(consensus_set_filename_list)
results_by_model_consensus = {
    # Identifiers parsed from these files look like 'consensus_o3'. Only the leading
    # subset marker is dropped; str.replace would also rewrite a 'consensus_'
    # occurring later in a name.
    k.removeprefix('consensus_'): v
    for k, v in results_by_model_consensus.items()
}
consensus_df = clustered_bar_plot(
    results_by_model_consensus,
    theme_or_axis="theme",
    ylabel="Error rate (1 − score)",
    title="HealthBench Consensus subset error rates by theme",
    sort_order=None,  # order models by descending overall score
    cluster_sort_order=CLUSTER_SORT_ORDER,
    error_as_yaxis=True,  # plot 1 - score instead of score
)

In [None]:
# Export the plotted Consensus-subset data; the returned frame is shown as the cell output.
save_csv_and_print(consensus_df, 'consensus_subset_data.csv')

In [None]:
# Display names for the consensus category token of a 'cluster:' metric key, i.e. the
# part between the theme name and the first following "_" (see key_to_canonical_name).
CONSENSUS_PROMPT_TAX_NAME_MAPPER = {
    'health-professional': "Health professional user",
    'not-health-professional': "Non-health professional user",
    'detailed': "Query requiring detailed response",
    'simple': "Query requiring simple response",
    'enough-context': "Enough context provided",
    'not-enough-context': "Not enough context provided",
    'conditionally-emergent': "Conditionally emergent",
    'emergent': "Emergent",
    'non-emergent': "Non-emergent",
    'context-does-not-matter': "Healthcare context does not matter",
    'context-matters-but-unclear': "Healthcare context matters but is unclear",
    'context-matters-is-clear': "Healthcare context matters and is clear",
    'enough-info-to-complete-task': "Enough information to complete task",
    'not-enough-info-to-complete-task': "Not enough information to complete task",
    'any-reducible-uncertainty': "Any reducible uncertainty",
    'no-uncertainty': "No uncertainty",
    'only-irreducible-uncertainty': "Only irreducible uncertainty",
}

# Display names for the consensus criterion token: everything after the category
# token in a 'cluster:' metric key (it may itself contain underscores).
QUESTION_NAME_MAPPER = {
    'accuracy_completeness': "Accuracy and completeness",
    'tailored': "Tailored communication",
    'accuracy_hedging': "Accuracy and hedging",
    'appropriate': "Appropriate",
    'helpful_safe': "Helpful and safe",
    'precise': "Precise",
    'context_seeking': "Context seeking",
    'emergency_behavior': "Emergency behavior",
    'aligned_accurate': "Aligned and accurate",
    'language': "Language",
    'accuracy_safety': "Accuracy and safety",
    'response_instruction_following': "Response instruction following",
    'helpfulness': "Helpfulness",
    'safety': "Safety",
    'accurate': "Accuracy",
    'hedges': "Hedging behavior",
    'seeks_context': "Context-seeking behavior",
}

def key_to_canonical_name(key: str) -> tuple[str, str | None, str | None]:
    """
    Translate a consensus metric key into display names
    ``(theme, consensus category, consensus criterion)``.

    ``'overall_score'`` maps to ``('Overall score', '', '')``. Any other key
    must start with ``'cluster:'`` followed by a name from FULL_CLUSTER_NAMES;
    after removing that prefix, the text before the first "_" is the category
    token and the rest is the criterion token.

    Raises AssertionError if the key lacks the ``'cluster:'`` prefix,
    ValueError if no known theme follows the prefix or the remainder has no
    "_", and KeyError if a token has no display name.
    """
    if key == 'overall_score':
        return ('Overall score', '', '')

    assert key.startswith('cluster:')
    target_cluster = next(
        (name for name in FULL_CLUSTER_NAMES if key.startswith(f'cluster:{name}')),
        None,
    )
    if target_cluster is None:
        raise ValueError(f"Invalid key: {key}")

    remainder = key.replace(f'cluster:{target_cluster}_', '')
    consensus_prompt_tax_name, question_name = remainder.split('_', 1)

    # Theme display names are keyed by the first "_"-separated token of the theme name.
    theme_label = CANONICAL_CLUSTER_NAMES[target_cluster.split('_', 1)[0]]
    category_label = CONSENSUS_PROMPT_TAX_NAME_MAPPER[consensus_prompt_tax_name]
    criterion_label = QUESTION_NAME_MAPPER[question_name]
    return (theme_label, category_label, criterion_label)


# Per model (by display name): the consensus 'cluster:' metrics plus the overall score,
# keyed by (theme, consensus category, consensus criterion) display-name tuples.
# The ':bootstrap_std' / ':n_samples' companion keys are excluded.
filtered_model_results = {}
for model, results in results_by_model_consensus.items():
    filtered_model_results[models_to_canonical_name[model]] = {
        key_to_canonical_name(k): v for k, v in results.items()
        if (k.startswith('cluster:') or k == 'overall_score') and not k.endswith('bootstrap_std') and not k.endswith('n_samples')
    }


# Rows are the key tuples, columns are the models.
filtered_model_df = pd.DataFrame(filtered_model_results)
filtered_model_df.index = pd.MultiIndex.from_tuples(
    filtered_model_df.index,
    names=["Theme", "Consensus Category", "Consensus Criterion"]
)

# Row order: overall score first, then themes in CLUSTER_SORT_ORDER.
sort_order = [
    CANONICAL_CLUSTER_NAMES[c.split('_', 1)[0]]
    for c in CLUSTER_SORT_ORDER
]
sort_order = ["Overall score"] + sort_order
filtered_model_df = filtered_model_df.loc[sort_order]

# Sort columns by overall score (descending, left to right)
sorted_cols = filtered_model_df.sort_values(ascending=False, by = ("Overall score", '', ''), axis = 1).keys().tolist()

# Format numeric cells as strings with four decimals.
# NOTE(review): DataFrame.applymap is deprecated in recent pandas releases in favour of
# DataFrame.map — confirm against the pandas version this notebook is pinned to.
filtered_model_df = filtered_model_df[sorted_cols].applymap(lambda x: f"{x:.4f}" if isinstance(x, (float, int)) else x)

display(filtered_model_df)
print(filtered_model_df.to_latex(multirow=False))

### Human eval data

In [None]:
# Per-axis scores for the physician-written and reference responses, in a fixed run order.
human_eval_results_by_model = get_results_by_filename(human_eval_filename_list)
human_df = clustered_bar_plot(
    human_eval_results_by_model,
    "axis",
    ylabel="Score",
    title="Physician-written response and reference response HealthBench scores by axis",
    sort_order=HUMAN_EVAL_SORT_ORDER,
    cluster_sort_order=AXIS_SORT_ORDER,
)

In [None]:
# Export the plotted human-eval data; the returned frame is shown as the cell output.
save_csv_and_print(human_df, 'human_eval_data.csv')

## More detailed human eval analysis

In [None]:
# Load the `_allresults.json` files of the human-eval runs, keyed by run identifier;
# the cells below read their `metadata.example_level_metadata` entries.
human_eval_results = get_results_by_filename(human_eval_allresults_filename_list)

In [24]:
def example_level_metadata_to_score(metadata: dict) -> float:
    """
    Compute an example's unclipped rubric score.

    The score is the sum of ``points`` over rubric items whose
    ``criteria_met`` is truthy (negative points included), divided by the sum
    of the positive ``points`` over all items. Raises ZeroDivisionError when
    no item has positive points.
    """
    rubric_items = metadata['rubric_items']
    attainable = sum(item['points'] for item in rubric_items if item['points'] > 0)
    earned = sum(item['points'] for item in rubric_items if item['criteria_met'])
    return earned / attainable

def clip_score(score: float) -> float:
    """
    Clamp ``score`` to the interval [0, 1].

    Values above 1 become 1 and values below 0 become 0; anything else is
    returned unchanged.
    """
    return 1 if score > 1 else (0 if score < 0 else score)

def plot_human_eval_scores(
    results,
    baseline_name: str,
    reference_completions_name: str,
    x_label: str,
    y_label: str,
    title: str,
    figsize: tuple[int, int] = (8, 6)
) -> None:
    """
    Plot the distribution of paired score differences between a physician
    baseline run and the mean of its reference completions, and print summary
    statistics.

    Parameters
    ----------
    results : dict
        Run identifier -> loaded results; each run is read at
        ``['metadata']['example_level_metadata']``, a list of entries with a
        ``'prompt_id'`` and the rubric data consumed by
        ``example_level_metadata_to_score``.
    baseline_name : str
        Key in ``results`` for the physician-written responses.
    reference_completions_name : str
        Key in ``results`` for the reference completions. Exactly four
        entries are expected for every baseline prompt.
    x_label, y_label, title : str
        Axis labels and figure title.
    figsize : tuple[int, int]
        Figure size passed to matplotlib.

    Raises
    ------
    AssertionError
        If a baseline prompt does not have exactly four reference completions.
    """
    # Group the reference completions by prompt once, instead of rescanning the whole
    # list for every baseline example.
    reference_by_prompt: dict = {}
    for ref in results[reference_completions_name]['metadata']['example_level_metadata']:
        reference_by_prompt.setdefault(ref['prompt_id'], []).append(ref)

    data = []
    for human_eval_metadata in results[baseline_name]['metadata']['example_level_metadata']:
        prompt_id = human_eval_metadata['prompt_id']
        reference_completion_metadata = reference_by_prompt.get(prompt_id, [])
        assert len(reference_completion_metadata) == 4, (
            f"Expected 4 reference completions for prompt {prompt_id}, "
            f"got {len(reference_completion_metadata)}"
        )
        data.append({
            'prompt_id': prompt_id,
            'human_eval_score': example_level_metadata_to_score(human_eval_metadata),
            'reference_completion_scores': [example_level_metadata_to_score(m) for m in reference_completion_metadata],
        })

    # Baseline score and mean reference-completion score for each example
    human_eval_scores = [r['human_eval_score'] for r in data]
    mean_reference_scores = [sum(r['reference_completion_scores']) / len(r['reference_completion_scores']) for r in data]

    score_diffs = np.array(human_eval_scores) - np.array(mean_reference_scores)

    # Honour the caller's figsize (it was previously ignored in favour of a fixed (8, 6)).
    plt.figure(figsize=figsize)

    # Determine symmetric range around zero so that zero falls at the center of one bin
    min_diff, max_diff = score_diffs.min(), score_diffs.max()
    max_abs = max(abs(min_diff), abs(max_diff))
    num_bins = 201  # pick an odd number so one bin is centered at zero
    bin_edges = np.linspace(-max_abs, max_abs, num_bins + 1)

    # start plotting
    plt.hist(score_diffs, bins=bin_edges, color='purple', alpha=0.5, density=True)
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.title(title)
    plt.ylim(0, 5)  # fixed y-range, so separate calls produce directly comparable panels
    plt.axvline(x=0, color='black', linestyle='--', linewidth=0.5, zorder=100)
    plt.show()

    human_eval_vs_ref = [human_eval_score - mean_reference_score for human_eval_score, mean_reference_score in zip(human_eval_scores, mean_reference_scores, strict = True)]

    def _mean_or_na(diffs: list) -> str:
        # Signed mean with two decimals (negative for the "less than" group);
        # "n/a" when the group is empty, instead of dividing by zero.
        return f"{sum(diffs) / len(diffs):.2f}" if diffs else "n/a"

    human_eval_greater_ref = [x for x in human_eval_vs_ref if x > 0]
    proportion_human_eval_greater_ref = len(human_eval_greater_ref) / len(human_eval_scores)
    print(f"Proportion of times that the human eval score is greater than the mean reference completion score: {proportion_human_eval_greater_ref * 100:.2f}%")
    print(f"Mean magnitude of the difference: {_mean_or_na(human_eval_greater_ref)}")

    human_eval_less_ref = [x for x in human_eval_vs_ref if x < 0]
    proportion_human_eval_less_ref = len(human_eval_less_ref) / len(human_eval_scores)
    print(f"Proportion of times that the human eval score is less than the mean reference completion score: {proportion_human_eval_less_ref * 100:.2f}%")
    print(f"Mean magnitude of the difference: {_mean_or_na(human_eval_less_ref)}")


In [None]:
# Physicians who saw the aug_2024 reference responses vs. those reference responses.
plot_human_eval_scores(
    human_eval_results,
    'aug_2024_reference_humanbaseline',
    'aug_2024_reference_referencecompletions',
    x_label="Physician response score minus mean reference completion score",
    y_label="Density",
    title="Paired physician response and mean reference score differences (Sep 2024 models)",
)

In [None]:
# Physicians who saw the apr_2025 reference responses vs. those reference responses.
plot_human_eval_scores(
    human_eval_results,
    'apr_2025_reference_humanbaseline',
    'apr_2025_reference_referencecompletions',
    x_label="Physician response score minus mean reference completion score",
    y_label="Density",
    title="Paired physician response and mean reference score differences (Apr 2025 models)",
)

In [27]:
def plot_distribution(
    score_lists_with_labels,
    title,
    x_label,
    y_label,
    bins,
    sort_order: list[str] | None = None,
    clip: tuple[float | None, float | None] | None = None,
    markers=True,
):
    """
    Draw one density curve per (values, label) pair on a shared figure.

    Each series is binned with `np.histogram(..., density=True)` and drawn as a
    line through the bin centres, so overlapping series stay readable.

    Parameters
    ----------
    score_lists_with_labels : sequence of (values, label)
        Series to plot. Every label is looked up in the notebook-level
        `models_to_canonical_name` mapping to obtain its legend text.
    title, x_label, y_label : str
        Figure title and axis labels.
    bins : int or sequence
        Passed straight to `np.histogram` for every series.
    sort_order : list[str] | None
        When given, must contain exactly the labels being plotted; series are
        drawn (and listed in the legend) in this order.
    clip : (low, high) | None
        When given, values are clipped to this range before binning; either
        bound may be None.
    markers : bool
        If True, also draw a scatter marker at every bin centre.

    Raises
    ------
    ValueError
        If no series is provided.
    """
    if not score_lists_with_labels:
        raise ValueError("Provide at least one (scores, label) pair.")

    plt.figure()

    if sort_order:
        plotted_labels = {label for _, label in score_lists_with_labels}
        assert set(sort_order) == plotted_labels
        score_lists_with_labels = sorted(
            score_lists_with_labels,
            key=lambda pair: sort_order.index(pair[1]),
        )

    for values, label in score_lists_with_labels:
        if clip:
            lower, upper = clip[0], clip[1]
            values = np.clip(values, lower, upper)

        # Density per bin (y) against the midpoint of each bin (x).
        density, bin_edges = np.histogram(values, bins=bins, density=True)
        bin_centres = (bin_edges[:-1] + bin_edges[1:]) * 0.5

        [trace] = plt.plot(bin_centres, density, label=models_to_canonical_name[label])
        if markers:
            # Reuse the line colour so markers and curve read as one series.
            plt.scatter(bin_centres, density, color=trace.get_color())

    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.title(title)
    plt.legend()
    plt.tight_layout()
    plt.show()

In [None]:
# Character length of the first completion message of every example, grouped by result set.
lengths = defaultdict(list)
for model in human_eval_results:
    for example in human_eval_results[model]['metadata']['example_level_metadata']:
        lengths[model].append(len(example['completion'][0]['content']))

# Order result sets by mean completion length.
# (plot_distribution re-orders the series again because sort_order is passed below.)
lengths = dict(sorted(lengths.items(), key=lambda item: np.mean(item[1])))

# plot_distribution expects (values, label) pairs.
lengths_list = [(model_lengths, model_key) for model_key, model_lengths in lengths.items()]

# Plot the distribution of completion lengths by model.
plot_distribution(
    lengths_list,
    title="Distribution of completion lengths for physician and reference responses",
    x_label="Completion length (characters); clipped to [0, 7500]",
    y_label="Density",
    bins=50,
    clip=(0, 7500),
    sort_order=HUMAN_EVAL_SORT_ORDER,
)


## Date frontier plot

In [29]:
def date_frontier_plot(date_plot_df):
    """
    Plot model scores against release dates, highlight the empirical
    performance frontier, and annotate every point with the model's name.

    A model is on the frontier when, walking the models in chronological
    order, its score is strictly higher than every score seen before it.

    • Frontier models: horizontal label centred just above the point.
    • Non-frontier models: smaller label rotated by -30°, anchored just
      below-right of the point.

    Parameters
    ----------
    date_plot_df : pd.DataFrame
        Frame whose index holds (model, metric) entries and which has a
        "value" column; only the (model, "overall_score") rows are read.
        Models without such a row are skipped.

    Notes
    -----
    Reads the notebook-level mappings `model_first_release` (model → release
    date) and `models_to_canonical_name` (model → display name). Shows the
    figure and returns None; nothing is drawn when no model has an
    "overall_score" row.
    """
    from matplotlib.transforms import offset_copy

    plot_entries: list[tuple[pd.Timestamp, float, str]] = []
    for model in date_plot_df.index.get_level_values(0).unique():
        idx = (model, "overall_score")
        if idx not in date_plot_df.index:
            continue
        plot_entries.append(
            (
                pd.to_datetime(model_first_release[model]),
                float(date_plot_df.loc[idx, "value"]),
                model,
            )
        )

    if not plot_entries:  # Nothing to plot
        return

    plot_entries.sort(key=lambda x: x[0])  # chronological order

    # Walk forward in time; a model joins the frontier only if it beats the best score so far.
    frontier: list[tuple[pd.Timestamp, float, str]] = []
    max_score = float("-inf")
    for dt, sc, mdl in plot_entries:
        if sc > max_score:
            frontier.append((dt, sc, mdl))
            max_score = sc

    # Key membership on the model itself: keying on (date, score) would also
    # flag a different model that merely shares a frontier model's date and score.
    frontier_models = {mdl for _, _, mdl in frontier}

    all_dates            = [d for d, *_ in plot_entries]
    non_frontier_dates   = [d for d, _, m in plot_entries if m not in frontier_models]
    non_frontier_scores  = [s for _, s, m in plot_entries if m not in frontier_models]
    frontier_dates       = [d for d, *_ in frontier]
    frontier_scores      = [s for _, s, _ in frontier]

    # ------------------------------------------------------------------ #
    # Construct the figure
    # ------------------------------------------------------------------ #
    fig, ax = plt.subplots(figsize=(8, 5))

    # Scatter + line
    ax.scatter(non_frontier_dates, non_frontier_scores, s=60, color="gray",     label="Below frontier")
    ax.scatter(frontier_dates,       frontier_scores,       s=60, color="tab:blue", label="Frontier models")
    ax.plot   (frontier_dates,       frontier_scores,             color="tab:blue", linewidth=2, label="Frontier path")

    for dt, sc, mdl in plot_entries:
        if mdl in frontier_models:
            # Static, horizontal label 5 pt above the point.
            # The 'gpt-4.1' label is shifted 19 pt left (hand-tuned offset kept from the original layout).
            ax.text(
                dt,
                sc,
                models_to_canonical_name[mdl],
                transform=offset_copy(ax.transData, fig, x=0 - (19 if mdl == 'gpt-4.1' else 0), y=5, units="points"),
                ha="center",
                va="bottom",
                fontsize=9,
            )
        else:
            # Rotated (-30°) label just below-right of the point
            ax.text(
                dt,
                sc,
                models_to_canonical_name[mdl],
                transform=offset_copy(ax.transData, fig, x=3, y=-2, units="points"),
                ha="left",
                va="top",
                fontsize=7,
                rotation=-30,
            )

    ax.xaxis.set_major_locator(mdates.AutoDateLocator())
    ax.xaxis.set_major_formatter(mdates.DateFormatter("%Y-%m-%d"))
    fig.autofmt_xdate()

    ax.set_xlabel("Model release date")
    ax.set_ylabel("Score")
    ax.set_title("HealthBench performance frontier over time")
    ax.set_ylim(0, 1)
    # Keep the original right edge (2025-05-06 + 35 days) as a minimum, but
    # extend it when a model was released later so its point is not clipped.
    latest_date = max(max(all_dates), pd.to_datetime("2025-05-06"))
    ax.set_xlim(
        min(all_dates) - pd.Timedelta(days=30),
        latest_date + pd.Timedelta(days=35),
    )

    # Reverse legend order
    handles, labels = ax.get_legend_handles_labels()
    ax.legend(handles[::-1], labels[::-1], loc="upper left")

    plt.tight_layout()
    plt.show()

In [None]:
# Build the frame consumed by date_frontier_plot, which reads its (model, "overall_score") rows.
# (get_df_from_results_by_model is defined earlier in the notebook; presumably one row per model/metric — TODO confirm.)
date_plot_df = get_df_from_results_by_model(main_eval_results_by_model)
date_frontier_plot(date_plot_df)

## Meta eval results

In [31]:
def agg_n_and_val_list(n_and_vals: list[dict]) -> dict:
    """Collapse a list of {'value', 'n'} dicts into one n-weighted mean.

    Returns a dict holding the weighted average under 'value' and the summed
    counts under 'n'. Raises ZeroDivisionError when the counts sum to zero,
    which includes the empty list.
    """
    weighted_sum = sum(entry["value"] * entry["n"] for entry in n_and_vals)
    total_n = sum(entry["n"] for entry in n_and_vals)
    return {"value": weighted_sum / total_n, "n": total_n}

def plot_model_physician_agreement(metrics, individual_physician_agreement_metrics, overall_model_agreement_metrics) -> pd.DataFrame:
    """
    Plot grader-vs-physician agreement per cluster and return the plotted points.

    For every cluster in `CLUSTER_SORT_ORDER_SHORT` the sub-cluster metrics are
    pooled with `agg_n_and_val_list` (n-weighted mean), then three things are drawn
    on one horizontal row: each individual physician with more than MIN_SAMPLES
    samples (jittered vertically), the weighted average of physicians, and the
    model grader. Clusters with no physician above the threshold are skipped.

    Parameters
    ----------
    metrics : dict
        Only the keys are read: keys starting with 'cluster:' name the
        sub-clusters (first space-separated token, trailing ':' removed).
    individual_physician_agreement_metrics : dict
        Maps '<subcluster>: pairwise_physician_f1_balanced' to a dict of
        physician id → {'value', 'n'}.
    overall_model_agreement_metrics : dict
        Maps '<subcluster>: pairwise_model_f1_balanced' to {'value', 'n'}.

    Returns
    -------
    pd.DataFrame
        One row per plotted point with columns 'theme', 'score' and 'type'.

    Notes
    -----
    Side effects: shows the figure, displays the per-cluster summary table and
    prints it as LaTeX. Relies on the notebook-level names
    `CLUSTER_SORT_ORDER_SHORT`, `CANONICAL_CLUSTER_NAMES` and `wrap_label`.
    """
    MIN_SAMPLES = 50
    LABEL_INDIVIDUAL = "Individual physicians"
    LABEL_PHYSICIAN_AVG = "Weighted average of physicians"
    LABEL_MODEL = "GPT-4.1 grader"

    # Dedicated seeded generator so the vertical jitter (and thus the figure) is identical on every run.
    jitter_rng = np.random.default_rng(0)

    all_clusters = {
        k.split(' ')[0].rstrip(':')
        for k in metrics.keys()
        if k.startswith('cluster:')
    }
    METRICS_KEY_TEMPLATE_MODEL = '{CLUSTER_NAME}: pairwise_model_f1_balanced'
    METRICS_KEY_TEMPLATE_PHYSICIAN = '{CLUSTER_NAME}: pairwise_physician_f1_balanced'

    # Short cluster name = text between 'cluster:' and the first underscore.
    cluster_names = {
        c.replace('cluster:', '').split('_', 1)[0]
        for c in all_clusters
    }
    assert set(cluster_names) == set(CLUSTER_SORT_ORDER_SHORT)
    # Reversed so the first cluster of the sort order ends up at the top of the y-axis.
    cluster_names = list(reversed(CLUSTER_SORT_ORDER_SHORT))

    clusters_and_subclusters = {
        c: {
            full_name for full_name in all_clusters if full_name.startswith('cluster:' + c)
        }
        for c in cluster_names
    }

    y_pos = np.arange(len(cluster_names))

    fig, ax = plt.subplots(figsize=(8, 5))

    # Collect data for the table
    summary_rows: list[dict[str, float | int | str]] = []
    all_rows = []
    for i, cluster in enumerate(cluster_names):
        physician_n_and_vals = defaultdict(list)
        model_n_and_vals = []

        # Gather data for every sub-cluster
        for subcluster in clusters_and_subclusters[cluster]:
            key_physician = METRICS_KEY_TEMPLATE_PHYSICIAN.format(CLUSTER_NAME=subcluster)
            key_model = METRICS_KEY_TEMPLATE_MODEL.format(CLUSTER_NAME=subcluster)

            # Per-physician data
            indiv_n_and_vals = individual_physician_agreement_metrics[key_physician]
            for physician_id, n_and_val in indiv_n_and_vals.items():
                physician_n_and_vals[physician_id].append(n_and_val)

            # Model data
            model_n_and_val = overall_model_agreement_metrics[key_model]
            model_n_and_vals.append(model_n_and_val)

        # Aggregate per-physician data
        agg_physician_n_and_vals = [agg_n_and_val_list(vals) for vals in physician_n_and_vals.values()]

        # Keep physicians with enough samples
        agg_physician_n_and_vals_filtered = [
            (d["value"], d["n"]) for d in agg_physician_n_and_vals if d["n"] > MIN_SAMPLES
        ]
        if not agg_physician_n_and_vals_filtered:
            continue  # Skip clusters without physicians meeting the minimum sample count

        indiv_vals, indiv_n = zip(*agg_physician_n_and_vals_filtered)
        num_physicians = len(indiv_vals)

        # Every artist is labelled (not only those of the first cluster): the legend
        # is de-duplicated by label below, and labelling only row 0 would leave the
        # legend empty whenever that cluster is skipped by the filter above.

        # Plot individual physicians
        jitter = jitter_rng.uniform(-0.15, 0.15, size=num_physicians)
        ax.scatter(
            indiv_vals,
            i + jitter,
            color="gray",
            alpha=0.6,
            s=30,
            label=LABEL_INDIVIDUAL,
        )
        for val in indiv_vals:
            all_rows.append({
                "theme": CANONICAL_CLUSTER_NAMES[cluster],
                "score": val,
                "type": LABEL_INDIVIDUAL
            })

        # Physician weighted average.
        # NOTE(review): this pools ALL physicians, including those below MIN_SAMPLES
        # that are not drawn individually — confirm that is intended.
        grand_physician = agg_n_and_val_list(agg_physician_n_and_vals)
        grand_physician_val = grand_physician["value"]
        ax.scatter(
            [grand_physician_val],
            [i],
            color="blue",
            marker="*",
            s=120,
            edgecolor="black",
            label=wrap_label(LABEL_PHYSICIAN_AVG, 18),
        )
        all_rows.append({
            "theme": CANONICAL_CLUSTER_NAMES[cluster],
            "score": grand_physician_val,
            "type": LABEL_PHYSICIAN_AVG
        })

        # Model score
        model_agg = agg_n_and_val_list(model_n_and_vals)
        model_val = model_agg["value"]
        ax.scatter(
            [model_val],
            [i],
            color="red",
            marker="^",
            s=80,
            edgecolor="black",
            label=LABEL_MODEL,
        )
        all_rows.append({
            "theme": CANONICAL_CLUSTER_NAMES[cluster],
            "score": model_val,
            "type": LABEL_MODEL
        })

        # Model percentile among the plotted physicians (ties count as half).
        model_percentile = (
            (
                np.sum(np.array(indiv_vals) < model_val)
                + 0.5 * np.sum(np.array(indiv_vals) == model_val)
            )
            / num_physicians
            * 100
        )

        # Save row for summary table
        summary_rows.append(
            {
                "Cluster": CANONICAL_CLUSTER_NAMES[cluster.replace("cluster:", "")],
                "# Physicians": num_physicians,
                "Physician Weighted Avg": round(grand_physician_val, 3),
                "Model Score": round(model_val, 3),
                "Model Percentile": round(model_percentile, 1),
            }
        )

    # Finalise plot
    cluster_labels = [wrap_label(CANONICAL_CLUSTER_NAMES[c.replace("cluster:", "")], 20) for c in cluster_names]
    ax.set_yticks(y_pos)
    ax.set_yticklabels(cluster_labels)
    ax.set_xlabel("Macro F1")
    ax.set_xlim(0, 1)
    ax.set_title("Model-physician and physician-physician agreement for HealthBench Consensus")

    fig.set_size_inches(15, 5)
    # Collapse repeated labels into one legend entry each.
    handles, labels = ax.get_legend_handles_labels()
    by_label = dict(zip(labels, handles))
    ax.legend(by_label.values(), by_label.keys(), loc="center left", bbox_to_anchor=(1.02, 0.5))
    plt.tight_layout(rect=[0, 0, 0.85, 1])
    plt.show()

    # Display summary table
    summary_df = pd.DataFrame(summary_rows).set_index("Cluster")
    display(summary_df)
    print(summary_df.to_latex())

    return pd.DataFrame(all_rows)

In [None]:
# Load the meta-eval results; the next cell iterates this as model name → results dict.
# (get_results_by_filename is defined earlier in the notebook — its exact loading behaviour is not visible here.)
meta_eval_results_all = get_results_by_filename(meta_eval_allresults_filename_list)

In [None]:
# One agreement plot per meta-eval result file.
# NOTE: meta_eval_df is overwritten on every iteration, so only the last model's frame survives the loop.
for model_name, results in meta_eval_results_all.items():
    metrics = results['metrics']
    metadata = results['metadata']
    individual_physician_agreement_metrics = metadata['physician_agreement_metric_lists']
    overall_model_agreement_metrics = metadata['model_agreement_metrics']
    meta_eval_df = plot_model_physician_agreement(
        metrics=metrics,
        individual_physician_agreement_metrics=individual_physician_agreement_metrics,
        overall_model_agreement_metrics=overall_model_agreement_metrics,
    )

In [None]:
# Persist the points behind the agreement plot (meta_eval_df comes from the last iteration of the loop above).
save_csv_and_print(meta_eval_df, 'meta_eval_data.csv')

## Plot worst at k

In [35]:
def rubric_accuracy_per_instance(rubric_items: list[dict[str, int]]) -> float:
    """
    Score one graded completion as achieved points over maximum attainable points.

    The denominator sums only the positive-point rubric items; the numerator
    sums the points (positive or negative) of every item whose 'criteria_met'
    is truthy, so the result can fall below 0.

    Raises
    ------
    ValueError
        If no rubric item carries positive points.
    """
    positive_points = [item['points'] for item in rubric_items if item['points'] > 0]
    max_points = sum(positive_points)
    if max_points <= 0:
        # should not happen for overall score, but may happen for tags
        raise ValueError(f"Total possible points is 0 for rubric items: {rubric_items}")

    earned_points = sum(item['points'] for item in rubric_items if item['criteria_met'])
    return earned_points / max_points


In [36]:
def plot_fail_at_k(fail_at_k_dicts_with_labels, x_label, y_label, title, show_scatter=True):
    """
    Plot one curve per (dict, label) pair and return the plotted points.

    Parameters
    ----------
    fail_at_k_dicts_with_labels : sequence of (dict, label)
        Each dict maps k → metric value. Each label is looked up in the
        notebook-level `models_to_canonical_name` mapping for its legend text.
        Curves are drawn in descending order of their mean metric value.
    x_label, y_label, title : str
        Axis labels and figure title.
    show_scatter : bool
        If True (default), draw scatter markers on top of each line.

    Returns
    -------
    pd.DataFrame
        One row per plotted point with columns 'model' (the raw label), 'k'
        and 'any_fail_rate' (the metric value clipped to [0, 1]).

    Raises
    ------
    ValueError
        If no (dict, label) pair is provided.
    """
    if not fail_at_k_dicts_with_labels:
        raise ValueError("Provide at least one (dict, label) pair.")

    plt.figure()

    records = []

    def _mean_metric(pair):
        return np.mean(list(pair[0].values()))

    fail_at_k_dicts_with_labels = sorted(fail_at_k_dicts_with_labels, key=_mean_metric, reverse=True)
    for metric_by_k, label in fail_at_k_dicts_with_labels:
        ks = list(metric_by_k.keys())
        clipped_values = np.clip(list(metric_by_k.values()), 0, 1)
        legend_text = models_to_canonical_name[label]
        [curve] = plt.plot(ks, clipped_values, label=legend_text)
        if show_scatter:
            # Markers share the line's colour.
            plt.scatter(ks, clipped_values, color=curve.get_color())

        records.extend(
            {
                'model': label,
                'k': k,
                'any_fail_rate': value,
            }
            for k, value in zip(ks, clipped_values)
        )

    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.title(title)
    # Tick at k == 1 and at every even k that appears in any series.
    every_k = sorted(set().union(*[pair[0].keys() for pair in fail_at_k_dicts_with_labels]))
    plt.xticks([k for k in every_k if k == 1 or k % 2 == 0])
    plt.legend()
    plt.tight_layout()
    plt.show()

    return pd.DataFrame(records)

In [None]:
# Load the many-replicate runs; later cells iterate this as model name → results dict.
# (get_results_by_filename is defined earlier in the notebook — its exact loading behaviour is not visible here.)
results_worst_at_k = get_results_by_filename(many_replicate_filename_list)

In [38]:
# Pivot each model's flat example list into one list of rubric gradings per prompt_id.
results_worst_at_k_pivoted = {}
for model_name, results in results_worst_at_k.items():
    results_by_prompt = defaultdict(list)
    for prompt in results["metadata"]["example_level_metadata"]:
        results_by_prompt[prompt['prompt_id']].append(prompt['rubric_items'])
    # Only the grouped gradings are kept; the prompt ids themselves are dropped.
    results_worst_at_k_pivoted[model_name] = [*results_by_prompt.values()]

In [39]:
def precompute_comb_indices(N):
    """
    Enumerate every subset of the sample indices {0, ..., N-1}, grouped by size.

    Returns a dict mapping k (1 ≤ k ≤ N) to an array of shape (n_subsets, k)
    whose rows are the length-k index combinations. N is the number of samples
    drawn per problem, hence the largest k for which worst-at-k can be
    computed. Each array can be used directly to index a per-problem score
    array.
    """
    sample_ids = range(N)
    return {
        size: np.array(list(itertools.combinations(sample_ids, size)))
        for size in range(1, N + 1)
    }

# Every prompt of every model must have the same number of sampled completions;
# collect the distinct counts so that can be asserted.
seen_n_instances = set()
for model_name, instances_per_problem in results_worst_at_k_pivoted.items():
    for instance in instances_per_problem:
        seen_n_instances.add(len(instance))

assert len(seen_n_instances) == 1
(n_instances,) = seen_n_instances
combination_indices = precompute_comb_indices(n_instances)

In [40]:
# worst-at-k: for each problem, average over all size-k subsets of its samples of the
# minimum score in the subset; then average that over problems for each model.
worst_at_k_per_model = defaultdict(dict)
for model_name, instances_per_problem in results_worst_at_k_pivoted.items():
    worst_at_k_per_problem = defaultdict(list)
    for results_per_instance in instances_per_problem:
        scores_per_instance = np.array(list(map(rubric_accuracy_per_instance, results_per_instance)))
        assert len(scores_per_instance) == n_instances
        for k in range(1, scores_per_instance.size + 1):
            subset_idxs = combination_indices[k]
            # Fancy indexing gives one row of k scores per subset.
            subsets = scores_per_instance[subset_idxs]
            assert subsets.shape == (len(subset_idxs), k)
            worst_of_k_per_subset = subsets.min(axis=1)
            assert len(worst_of_k_per_subset) == len(subset_idxs)
            worst_at_k_per_problem[k].append(worst_of_k_per_subset.mean())

    for k, scores in worst_at_k_per_problem.items():
        # NOTE(review): 5000 is the hard-coded expected number of problems.
        assert len(scores) == 5000, f"len(scores) = {len(scores)}"
        mean_score = sum(scores) / len(scores)
        worst_at_k_per_model[model_name][k] = np.clip(mean_score, 0, 1)

In [None]:
# plot_fail_at_k expects (k → value dict, label) pairs.
worst_at_k_list = [(worst_at_k, model_key) for model_key, worst_at_k in worst_at_k_per_model.items()]
fail_at_k_data = plot_fail_at_k(
    worst_at_k_list,
    x_label="Number of samples (k)",
    y_label="Score",
    title="Worst-case HealthBench score at k samples",
)

In [None]:
# Persist the points behind the worst-at-k plot (frame returned by plot_fail_at_k).
save_csv_and_print(fail_at_k_data, 'fail_at_k_data.csv')

## Analysis of eval variability and length-adjusted win rates

In [43]:
# Treat the i-th sample of every problem as one simulated eval run and compute
# that run's overall score (mean problem score, clipped to [0, 1]).
scores_by_eval_run_by_model = defaultdict(list)
for model_name, instances_per_problem in results_worst_at_k_pivoted.items():
    # Transpose: problem → samples becomes sample index → problems.
    results_by_eval_run = list(zip(*instances_per_problem))
    for results_per_eval_run in results_by_eval_run:
        problem_level_scores = list(map(rubric_accuracy_per_instance, results_per_eval_run))
        overall_score = np.clip(np.mean(problem_level_scores), 0, 1)
        scores_by_eval_run_by_model[model_name].append(overall_score)


In [None]:
# Summarise each model's per-run overall scores: mean, min, max and std across eval runs.
summary_stats = []
for model_name, scores in scores_by_eval_run_by_model.items():
    if len(scores) == 0:
        continue
    stats = {
        "model": model_name,
        "mean": np.mean(scores),
        "min": np.min(scores),
        "max": np.max(scores),
        # np.std defaults to ddof=0, i.e. the population standard deviation.
        "std": np.std(scores),
    }
    summary_stats.append(stats)

scores_by_eval_run_df = pd.DataFrame(summary_stats).set_index("model")
scores_by_eval_run_df = scores_by_eval_run_df.sort_values("mean", ascending=False)

# Format once and reuse for both outputs. DataFrame.map replaces the deprecated
# DataFrame.applymap and is already used by format_win_rate_df in this notebook.
formatted_scores_by_eval_run_df = scores_by_eval_run_df.map(
    lambda x: f"{x:.4f}" if isinstance(x, (float, int)) else x
)
display(formatted_scores_by_eval_run_df)
print(formatted_scores_by_eval_run_df.to_latex())

In [45]:
# desired data shape:
# problem (5000) -> model (5) -> list of solutions from that model

lengths = []
worst_at_k_model_names = list(results_worst_at_k)
results_by_prompt_by_model = defaultdict(lambda: defaultdict(list))
for model_name, results in results_worst_at_k.items():
    for prompt in results["metadata"]["example_level_metadata"]:
        # Precompute score and character length once per sample so the pairwise
        # comparisons only need to read them.
        score = rubric_accuracy_per_instance(prompt['rubric_items'])
        length = len(prompt['completion'][0]['content'])
        lengths.append(length)
        sample_summary = {'score': score, 'length': length}
        results_by_prompt_by_model[prompt['prompt_id']][model_name].append(sample_summary)

assert len(results_by_prompt_by_model) == 5000
# First prompt's per-model mapping and its first model's sample list, kept for ad-hoc inspection.
inner_example = next(iter(results_by_prompt_by_model.values()))
inner_inner_example = next(iter(inner_example.values()))


In [46]:
def compare_model_results_per_problem(results_modela, results_modelb, length_control: float | None = None):
    # take every pair of results
    # figure out which is better
    # return the list of wins
    wins = []

    for result_modela, result_modelb in itertools.product(results_modela, results_modelb):

        if length_control is not None:
            length_a = result_modela['length']
            length_b = result_modelb['length']
            if length_a == 0 or length_b == 0:
                continue

            symmetric_pct_diff = abs(length_a - length_b) / ((length_a + length_b) / 2)
            if symmetric_pct_diff > length_control:
                continue

        score_a = result_modela['score']
        score_b = result_modelb['score']
        if score_a > score_b:
            wins.append(1)
        elif score_a < score_b:
            wins.append(0)
        elif score_a == score_b:
            continue

    win_rate = (sum(wins) / len(wins)) if wins else None
    return win_rate

In [47]:
def get_win_rate_df(results_by_prompt_by_model, length_control: float | None = None):
    """
    Build the pairwise win-rate matrix for the models in `worst_at_k_model_names`.

    For each ordered pair (model a, model b) and each prompt, the win rate of a
    over b is computed with `compare_model_results_per_problem`. Prompts with
    no decided pair are skipped and the remaining per-prompt win rates are
    averaged with equal weight.

    Parameters
    ----------
    results_by_prompt_by_model : mapping
        prompt id → model name → list of {'score', 'length'} samples.
    length_control : float | None
        Forwarded to `compare_model_results_per_problem`.

    Returns
    -------
    pd.DataFrame
        Rows are model a and columns are model b (canonical names); each cell is
        the mean win rate of a over b, rounded to 3 decimals. A cell is NaN when
        no prompt produced a decided comparison for that pair. Rows and columns
        share one order: the model with the lowest average win rate against it
        comes first.
    """
    win_rate_lists_modela_vs_modelb = defaultdict(lambda: defaultdict(list))
    for modela in worst_at_k_model_names:
        for modelb in worst_at_k_model_names:
            for _prompt_id, prompt_results in results_by_prompt_by_model.items():
                results_modela = prompt_results[modela]
                results_modelb = prompt_results[modelb]
                # NOTE(review): 16 is the hard-coded expected number of samples per (prompt, model).
                assert len(results_modela) == len(results_modelb) == 16
                win_rate = compare_model_results_per_problem(results_modela, results_modelb, length_control)
                if win_rate is not None:
                    win_rate_lists_modela_vs_modelb[modela][modelb].append(win_rate)

    win_rates_modela_vs_modelb = defaultdict(dict)
    for modela in worst_at_k_model_names:
        for modelb in worst_at_k_model_names:
            win_rate_list = win_rate_lists_modela_vs_modelb[modela][modelb]
            mapped_model_a = models_to_canonical_name[modela]
            mapped_model_b = models_to_canonical_name[modelb]
            # A pair can end up with no decided prompt at all (e.g. under a tight
            # length_control); report NaN instead of dividing by zero.
            win_rates_modela_vs_modelb[mapped_model_a][mapped_model_b] = (
                sum(win_rate_list) / len(win_rate_list) if win_rate_list else float("nan")
            )

    # Outer keys become columns, so at this point columns are model a and rows are model b.
    df_win_rates = pd.DataFrame(win_rates_modela_vs_modelb)
    # Row mean = average win rate of the other models AGAINST that row's model
    # (NaN cells are skipped); ascending order puts the hardest-to-beat model first.
    mean_win_rates = df_win_rates.mean(axis=1).sort_values(ascending=True)
    # Apply the same order to rows and columns
    df_win_rates = df_win_rates.loc[mean_win_rates.index, mean_win_rates.index]
    # Transpose so rows are model a and columns are model b.
    df_win_rates = df_win_rates.T.round(3)
    return df_win_rates

def format_win_rate_df(df):
    df = df.map(lambda x: f"{x:.1%}" if x != 0.5 and pd.notna(x) else "-")
    return df

In [None]:
# Pairwise win rates over all sample pairs (no length filtering).
win_rate_df_no_length_control = get_win_rate_df(results_by_prompt_by_model, length_control = None)
formatted_win_rate_df_no_length_control = format_win_rate_df(win_rate_df_no_length_control)
display(formatted_win_rate_df_no_length_control)
print(formatted_win_rate_df_no_length_control.to_latex())

In [None]:
# Length-controlled win rates: only sample pairs whose symmetric relative length
# difference is at most 0.1 are compared (see compare_model_results_per_problem).
win_rate_df_length_control = get_win_rate_df(results_by_prompt_by_model, length_control = 0.1)
formatted_win_rate_df_length_control = format_win_rate_df(win_rate_df_length_control)
display(formatted_win_rate_df_length_control)
print(formatted_win_rate_df_length_control.to_latex())

In [None]:
# Share of each uncontrolled win rate's margin over 0.5 that disappears under length control:
# (uncontrolled - controlled) / (uncontrolled - 0.5).
# NOTE(review): format_win_rate_df renders NaN and any value of exactly 0.5 as "-", so a genuine
# 50% reduction would be shown as "-" here — confirm that is acceptable.
win_rate_df_diff = (win_rate_df_no_length_control - win_rate_df_length_control) / (win_rate_df_no_length_control - 0.5)
formatted_win_rate_df_diff = format_win_rate_df(win_rate_df_diff)
display(formatted_win_rate_df_diff)
print(formatted_win_rate_df_diff.to_latex())

## Plot distributions

In [None]:
# Load the main-eval results; later cells iterate this as model name → results dict.
# (get_results_by_filename is defined earlier in the notebook — its exact loading behaviour is not visible here.)
results_main_eval = get_results_by_filename(main_allresults_filename_list)

In [None]:

## Per example distribution of scores.
## I.e. Just plot the mass of rubric scores per one instance.
rubric_scores = []
for model_name, results in results_main_eval.items():
    data = results["metadata"]["example_level_metadata"]
    rubric_scores_per_model = [
        rubric_accuracy_per_instance(prompt['rubric_items']) for prompt in data
    ]
    rubric_scores.append([rubric_scores_per_model, model_name])


def _mean_score_or_neg_inf(scores_and_name):
    """Mean of the model's scores; -inf for a model without any score so it sorts last."""
    model_scores = scores_and_name[0]
    return sum(model_scores) / len(model_scores) if len(model_scores) > 0 else float('-inf')


# Sort models by mean rubric score (descending)
rubric_scores = sorted(rubric_scores, key=_mean_score_or_neg_inf, reverse=True)

plot_distribution(
    rubric_scores,
    title="Distribution of scores per problem",
    x_label="Score (clipped to [0,1])",
    y_label="Relative frequency",
    bins=20,
    clip=(0, 1),
    markers=False,
)

In [None]:
## Solution length distribution
## Per one instance.
solution_lengths = []
for model_name, results in results_main_eval.items():
    data = results["metadata"]["example_level_metadata"]
    # Keep only examples that report an output token count.
    solution_lengths_per_model = [
        prompt['usage']['output_tokens']
        for prompt in data
        if prompt['usage']['output_tokens'] is not None
    ]
    solution_lengths.append([solution_lengths_per_model, model_name])

clip_to = 4000
plot_distribution(
    solution_lengths,
    title=f"Distribution of sollen\n(clipped to [0, {clip_to}])",
    x_label="sollen",
    y_label="Frequency",
    bins=40,
    clip=(0, clip_to),
)

In [None]:
## Response length distribution
## Per one instance.
response_lengths = []
for model_name, results in results_worst_at_k.items():
    data = results["metadata"]["example_level_metadata"]
    response_lengths_per_model = []
    for prompt in data:
        usage = prompt['usage']
        # Response length = output tokens minus reasoning tokens; both counts are required.
        if usage['output_tokens'] is not None and usage['output_reasoning_tokens'] is not None:
            response_lengths_per_model.append(usage['output_tokens'] - usage['output_reasoning_tokens'])

    response_lengths.append([response_lengths_per_model, model_name])

clip_to = 2000
plot_distribution(
    response_lengths,
    title=f"Distribution of response length (final message)\n(clipped to [0, {clip_to}])",
    x_label="response length",
    y_label="Frequency",
    bins=40,
    clip=(0, clip_to),
)

## Correlation plots

In [55]:
def plot_lista_vs_listb(
    list_pairs_with_labels,
    x_label,
    y_label,
    title,
    alpha: float | None = None,
    clipx: tuple[float | None, float | None] | None = None,
    clipy: tuple[float | None, float | None] | None = None,
    show_trendline=True,
):
    """
    Scatter one or more (x-list, y-list) pairs, optionally with OLS trendlines,
    and print the Pearson and Spearman correlation of every pair.

    Parameters
    ----------
    list_pairs_with_labels : sequence of (lista, listb, label)
        - lista : x-axis values
        - listb : y-axis values
        - label : legend label for that pair (used as given)
    x_label, y_label, title : str
        Axis labels and figure title.
    alpha : float | None
        Marker opacity passed to the scatter.
    clipx, clipy : (low, high) | None
        When given, x / y values are clipped to this range before plotting and
        before the trendline and correlations are computed; either bound may
        be None.
    show_trendline : bool
        Draw a least-squares line per pair if True (default).

    Raises
    ------
    ValueError
        If no triple is provided or a pair's lists differ in length.

    Notes
    -----
    A pair with fewer than two points is still added to the scatter, but its
    trendline and correlations are skipped (they are undefined) and a message
    is printed instead.
    """
    # Local import: scipy is not among the notebook's top-level imports.
    from scipy.stats import spearmanr

    if not list_pairs_with_labels:
        raise ValueError("Provide at least one (x_list, y_list, label) triple.")

    plt.figure()

    for lista, listb, label in list_pairs_with_labels:
        if len(lista) != len(listb):
            raise ValueError(f"Lists for '{label}' are not the same length.")

        if clipx:
            lista = np.clip(lista, clipx[0], clipx[1])
        if clipy:
            listb = np.clip(listb, clipy[0], clipy[1])

        # Scatter; keep the handle so the trendline can reuse its colour
        points = plt.scatter(lista, listb, label=label, alpha=alpha)

        # np.polyfit and min()/max() fail on an empty series, and a correlation
        # needs at least two points.
        if len(lista) < 2:
            print(f"Skipping trendline and correlations for {label}: need at least 2 points, got {len(lista)}.")
            continue

        # Trend-line (OLS) – use same color as scatter
        if show_trendline:
            m, b = np.polyfit(lista, listb, 1)
            xs = np.linspace(min(lista), max(lista), 200)
            plt.plot(xs, m * xs + b, color=points.get_facecolor().flatten())

        # Correlations are printed, not added to the legend
        r = float(np.corrcoef(lista, listb)[0, 1])
        print(f"Pearson r for {label}: {r:.3f}")
        r_spearman = float(spearmanr(lista, listb).statistic)
        print(f"Spearman r for {label}: {r_spearman:.3f}")

    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.title(title)
    plt.tight_layout()
    plt.legend()
    plt.show()

In [None]:
## Solution length VS score
## Per one instance.
solution_length_vs_score = []
for model_name, results in results_worst_at_k.items():
    data = results["metadata"]["example_level_metadata"]
    solution_lengths_per_model = []
    rubric_scores_per_model = []
    for prompt in data:
        usage = prompt['usage']
        # Only examples that report an output token count contribute a (score, length) point.
        if usage['output_tokens'] is not None:
            solution_lengths_per_model.append(usage['output_tokens'])
            rubric_scores_per_model.append(rubric_accuracy_per_instance(prompt['rubric_items']))
    # x = score, y = total output tokens
    solution_length_vs_score.append([rubric_scores_per_model, solution_lengths_per_model, model_name])

print('Response + CoT length\n')
plot_lista_vs_listb(
    solution_length_vs_score,
    x_label="Rubrics Score",
    y_label="Solution length",
    title="Rubrics Score VS Solution length",
    clipx=(0, 1),
    alpha=0.3,
)

In [None]:
## Response length VS score
## Per one instance.
response_length_vs_score = []
for model_name, results in results_worst_at_k.items():
    data = results["metadata"]["example_level_metadata"]
    response_lengths_per_model = []
    rubric_scores_per_model = []
    for prompt in data:
        usage = prompt['usage']
        # Response length = output tokens minus reasoning tokens; both counts are required.
        if usage['output_tokens'] is not None and usage['output_reasoning_tokens'] is not None:
            response_lengths_per_model.append(usage['output_tokens'] - usage['output_reasoning_tokens'])
            rubric_scores_per_model.append(rubric_accuracy_per_instance(prompt['rubric_items']))
    # x = score, y = response length
    response_length_vs_score.append([rubric_scores_per_model, response_lengths_per_model, model_name])

print('Response length only\n')
plot_lista_vs_listb(
    response_length_vs_score,
    x_label="Rubrics Score",
    y_label="Response length",
    title="Rubrics Score VS Response length",
    clipx=(0, 1),
    clipy=(0, None),
    alpha=0.3,
)

In [None]:
## CoT (reasoning) length VS score
## Per one instance; only models whose name contains 'o1', 'o3' or 'o4'.
# NOTE: this reuses (and overwrites) the accumulator name of the previous cell; here it holds
# (score, reasoning-token count) series, not response lengths.
response_length_vs_score = []
for model_name, results in results_worst_at_k.items():
    if not ('o1' in model_name or 'o3' in model_name or 'o4' in model_name):
        continue
    data = results["metadata"]["example_level_metadata"]
    cot_lengths_per_model = []
    rubric_scores_per_model = []
    for prompt in data:
        usage = prompt['usage']
        if usage['output_tokens'] is None or usage['output_reasoning_tokens'] is None:
            continue
        cot_lengths_per_model.append(usage['output_reasoning_tokens'])
        rubric_scores_per_model.append(rubric_accuracy_per_instance(prompt['rubric_items']))
    response_length_vs_score.append([rubric_scores_per_model, cot_lengths_per_model, model_name])

print('CoT length only\n')
# The y-axis shows reasoning tokens, so it is labelled "CoT length" (it previously read
# "Response length", contradicting the title and the data).
plot_lista_vs_listb(response_length_vs_score, x_label="Rubrics Score", y_label="CoT length", title="Rubrics Score VS CoT length", clipx=(0, 1), clipy=(0, None), alpha = 0.3)

In [None]:
## Response length VS CoT (reasoning) length
## Per one instance; only models whose name contains 'o1', 'o3' or 'o4'.
response_length_vs_thinking_length = []
for model_name, results in results_worst_at_k.items():
    if not any(tag in model_name for tag in ('o1', 'o3', 'o4')):
        continue
    data = results["metadata"]["example_level_metadata"]
    response_lengths_per_model = []
    thinking_lengths_per_model = []
    for prompt in data:
        usage = prompt['usage']
        # Both token counts are needed: response length = output tokens minus reasoning tokens.
        if usage['output_tokens'] is not None and usage['output_reasoning_tokens'] is not None:
            thinking_lengths_per_model.append(usage['output_reasoning_tokens'])
            response_lengths_per_model.append(usage['output_tokens'] - usage['output_reasoning_tokens'])
    # x = CoT length, y = response length
    response_length_vs_thinking_length.append([thinking_lengths_per_model, response_lengths_per_model, model_name])

print('Response length VS CoT length\n')
plot_lista_vs_listb(
    response_length_vs_thinking_length,
    x_label="CoT length",
    y_label="Response length",
    title="CoT length VS Response length",
    clipx=(0, None),
    clipy=(0, None),
    alpha=0.3,
)

## Cost scatter

In [60]:
def plot_dollar_cost_scatter(
    entries,
    title,
    x_label,
    y_label,
    *,
    alpha: float = 0.8,
    scale: str = "linear",
    model_families: list[list[str]] | None = None,
):
    """
    Scatter-plot "cost vs. score" pairs and show the figure.

    Visual rules:

    1.  Each *family* is drawn in one colour.
        Within that family the **cheapest** model (by mean x) gets an "x",
        the 2nd cheapest a square "s", and the 3rd cheapest a diamond "D".
        Families are assumed to contain <= 3 models; any further members
        reuse the "D" marker.

        Labels that do **not** belong to any family ("orphans") are drawn
        with a circle "o", each in its own colour.

    2.  When at least two members of a family have data, their mean
        (cost, score) points are joined by a line in the family colour,
        ordered from cheapest to most expensive.

    3.  For a log-scaled x-axis, tick labels remain plain-decimal
        (e.g. "0.01", not "1e-2").

    4.  Legend ordering
        * All *families* are listed first, with **each family reversed**
          (i.e. the last label in the family list appears first).
        * Orphans are listed after all families, in order of first
          appearance in ``entries``.

    Parameters
    ----------
    entries : list[(float, float, str)]
        (cost, score, label) triplets. Several triplets may share a label.
    title, x_label, y_label : str
        Axis metadata.
    alpha : float, optional
        Point opacity. The connecting lines use ``min(alpha + 0.1, 1.0)``.
    scale : {"linear", "log"}, optional
        X-axis scale.  Default is "linear"; any value other than "log"
        is treated as linear.
    model_families : list[list[str]] | None
        Lists of model names that belong together. Every name is mapped
        through the notebook-level ``models_to_canonical_name`` before being
        matched against the labels in ``entries``.

    Raises
    ------
    KeyError
        If a name in ``model_families`` is missing from
        ``models_to_canonical_name``.
    """
    import numpy as np
    import matplotlib.pyplot as plt
    from matplotlib.ticker import FuncFormatter

    if model_families is not None:
        model_families = [[models_to_canonical_name[model] for model in family] for family in model_families]
    else:
        model_families = []

    RANK_MARKERS = ("x", "s", "D")     # cheapest -> most-expensive
    ORPHAN_MARKER = "o"

    colour_cycle = itertools.cycle(
        plt.rcParams["axes.prop_cycle"].by_key()["color"]
    )

    # 1. Group the points by label, preserving first-seen order of labels.
    grouped: dict[str, dict[str, list[float]]] = {}
    for x, y, label in entries:
        grouped.setdefault(label, {"x": [], "y": []})
        grouped[label]["x"].append(x)
        grouped[label]["y"].append(y)

    # 2. Decide colour and marker for every label.
    label_to_colour: dict[str, str] = {}
    label_to_marker: dict[str, str] = {}
    # Colour per family, indexed like `model_families`.  Kept separately so the
    # connecting line never depends on a particular member having data.
    family_colours: list[str] = []

    # 2.1 Families (shared colour, rank-based marker)
    for family in model_families:
        fam_colour = next(colour_cycle)
        family_colours.append(fam_colour)

        # Determine the *cost* rank of each member that has data (ascending)
        ranked = sorted(
            (lbl for lbl in family if lbl in grouped),
            key=lambda lbl: np.mean(grouped[lbl]["x"]),
        )
        for rank, lbl in enumerate(ranked):
            label_to_colour[lbl] = fam_colour
            # Clamp so families with more than three members do not overflow.
            label_to_marker[lbl] = RANK_MARKERS[min(rank, len(RANK_MARKERS) - 1)]

    # 2.2 Orphans (unique colour, circle marker)
    for lbl in grouped:
        if lbl not in label_to_marker:
            label_to_colour[lbl] = next(colour_cycle)
            label_to_marker[lbl] = ORPHAN_MARKER

    # 3. Draw the scatter points, remembering one handle per label for the legend.
    fig, ax = plt.subplots()
    label_to_handle = {}
    for lbl, pts in grouped.items():
        label_to_handle[lbl] = ax.scatter(
            pts["x"],
            pts["y"],
            marker=label_to_marker[lbl],
            color=label_to_colour[lbl],
            alpha=alpha,
            label=lbl,
        )

    # 4. Join the mean points of each family, cheapest -> priciest.
    for fam_colour, family in zip(family_colours, model_families):
        present = [lbl for lbl in family if lbl in grouped]
        if len(present) < 2:
            continue
        xs = np.array([np.mean(grouped[lbl]["x"]) for lbl in present])
        ys = np.array([np.mean(grouped[lbl]["y"]) for lbl in present])
        order = np.argsort(xs)
        ax.plot(
            xs[order],
            ys[order],
            # Use the family's own colour rather than label_to_colour[family[0]],
            # which raised KeyError when the first member had no data.
            color=fam_colour,
            linewidth=1.5,
            alpha=min(alpha + 0.1, 1.0),
        )

    # 5. Axis cosmetics.
    ax.set_xlabel(x_label)
    ax.set_ylabel(y_label)
    ax.set_title(title)
    ax.tick_params(bottom=True, left=True)

    if scale == "log":
        ax.set_xscale("log")

        # Plain-decimal tick labels ("0.01", not "1e-2")
        def _fmt(val, _pos):
            if val == 0:
                return "0"
            return f"{val:.6f}".rstrip("0").rstrip(".")

        ax.xaxis.set_major_formatter(FuncFormatter(_fmt))
    else:
        ax.set_xscale("linear")

    # 6. Legend: families first (each reversed), then orphans.
    legend_handles = []
    legend_labels: list[str] = []
    already_listed: set[str] = set()

    for family in model_families:
        for lbl in reversed(family):           # reverse the provided order
            if lbl in label_to_handle:
                legend_handles.append(label_to_handle[lbl])
                legend_labels.append(lbl)
                already_listed.add(lbl)

    for lbl in grouped:                        # keep insertion order of `grouped`
        if lbl not in already_listed:
            legend_handles.append(label_to_handle[lbl])
            legend_labels.append(lbl)
            already_listed.add(lbl)

    # Place the legend outside the axes, to the right.
    ax.legend(
        handles=legend_handles,
        labels=legend_labels,
        loc="center left",
        bbox_to_anchor=(1.02, 0.5),
        borderaxespad=0.0,
    )

    fig.tight_layout()
    plt.show()

In [None]:
## Rubric score VS dollar cost
# Token prices per model. The cost formula below divides by 1e6, i.e. these
# numbers are treated as USD per one million input / output tokens.
DOLLAR_COST_PER_MODEL = {
    'o3': {'input': 10, 'output': 40},
    'o3_high': {'input': 10, 'output': 40},
    'o3_low': {'input': 10, 'output': 40},
    'o4-mini': {'input': 1.1, 'output': 4.4},
    'o4-mini_high': {'input': 1.1, 'output': 4.4},
    'o4-mini_low': {'input': 1.1, 'output': 4.4},
    'gpt-4.1': {'input': 2, 'output': 8},
    'gpt-4.1-mini': {'input': 0.4, 'output': 1.6},
    'gpt-4.1-nano': {'input': 0.1, 'output': 0.4},
    'o1': {'input': 15, 'output': 60},
    'o1_high': {'input': 15, 'output': 60},
    'o1_low': {'input': 15, 'output': 60},
    'o1-pro': {'input': 150, 'output': 600},
    'o1-preview': {'input': 15, 'output': 60},
    'o1-mini': {'input': 1.1, 'output': 4.4},
    'o3-mini': {'input': 1.1, 'output': 4.4},
    'o3-mini_low': {'input': 1.1, 'output': 4.4},
    'o3-mini_high': {'input': 1.1, 'output': 4.4},
    'gpt-4.5-preview': {'input': 75, 'output': 150},
    'gpt-4o-2024-08-06': {'input': 2.5, 'output': 10},
    'gpt-4o-mini': {'input': 0.15, 'output': 0.6},
    'gpt-4-turbo-2024-04-09': {'input': 10, 'output': 30},
    'gpt-3.5-turbo-0125': {'input': 0.5, 'output': 1.5},
    'gpt-4-0613': {'input': 30, 'output': 60},
}

# Models that share a colour (and are joined by a line) in the cost scatter.
MODEL_FAMILIES = [
    ['o3_low', 'o3', 'o3_high'],
    ['o1_low', 'o1', 'o1_high'],
    ['o3-mini_low', 'o3-mini', 'o3-mini_high'],
    ['o4-mini_low', 'o4-mini', 'o4-mini_high'],
    ['gpt-4.1-nano', 'gpt-4.1-mini', 'gpt-4.1'],
    ['gpt-4o-mini', 'gpt-4o-2024-08-06']
]

# One [avg cost per example, avg rubric score, canonical model name] row per model.
rubric_score_cost = []
for model_name, results in results_main_eval.items():

    # NOTE(review): overall_score is read but never used; the plotted score is
    # the mean of rubric_accuracy_per_instance below -- confirm which is intended.
    average_score = results['metrics']['overall_score']

    rubric_scores_per_model = []
    input_tokens_list = []
    output_tokens_list = []

    data = results["metadata"]["example_level_metadata"]
    for prompt in data:
        usage = prompt['usage']
        # Skip examples for which token usage was not recorded.
        if usage['input_tokens'] is None or usage['output_tokens'] is None:
            continue
        input_tokens_list.append(usage['input_tokens'])
        output_tokens_list.append(usage['output_tokens'])
        rubric_scores_per_model.append(rubric_accuracy_per_instance(prompt['rubric_items']))

    # Without any usable example the averages below would divide by zero.
    if not rubric_scores_per_model:
        print(f"Skipping {model_name}: no examples with recorded token usage")
        continue

    avg_rubric_score = sum(rubric_scores_per_model) / len(rubric_scores_per_model)
    avg_input_tokens = sum(input_tokens_list) / len(input_tokens_list)
    avg_output_tokens = sum(output_tokens_list) / len(output_tokens_list)

    # Average dollar cost of one example: tokens * (price per 1e6 tokens) / 1e6.
    pricing = DOLLAR_COST_PER_MODEL[model_name]
    avg_cost_per_model = (avg_input_tokens * pricing['input'] + avg_output_tokens * pricing['output']) / 1e6
    rubric_score_cost.append([avg_cost_per_model, avg_rubric_score, models_to_canonical_name[model_name]])

plot_dollar_cost_scatter(
    rubric_score_cost,
    title="Score vs cost across OpenAI model families",
    x_label="Inference cost per example ($)",
    y_label="HealthBench score",
    model_families=MODEL_FAMILIES,
    scale='linear',
)

In [None]:
# Same cost-vs-score data as above, drawn on a logarithmic cost axis.
plot_dollar_cost_scatter(
    rubric_score_cost,
    title="HealthBench performance-cost frontier",
    x_label="Inference cost per example ($)",
    y_label="HealthBench score",
    model_families=MODEL_FAMILIES,
    scale='log',
)

In [None]:
# Tabulate the [cost, score, model] rows and put the model column first
# before handing the frame to save_csv_and_print.
cost_perf_data = pd.DataFrame(
    rubric_score_cost,
    columns=['cost_usd', 'performance_pct', 'model'],
).loc[:, ['model', 'cost_usd', 'performance_pct']]
save_csv_and_print(cost_perf_data, 'cost_perf_data.csv')