In [11]:
import sys
from pathlib import Path

import numpy as np
import pandas as pd

# Locate the repository relative to the directory the notebook is run from:
# the project root is taken to be two levels above the working directory.
nb_dir = Path.cwd()
project_root = nb_dir.parent.parent   # .../human-ai-collab-uq

print("Notebook dir: ", nb_dir)
print("Project root:", project_root)

# Make <project_root>/src importable; the entry is prepended only if it is
# not already on sys.path, so re-running this cell does not duplicate it.
src_path = project_root.joinpath("src")
_src_entry = str(src_path)
if _src_entry not in sys.path:
    sys.path.insert(0, _src_entry)

# These imports must come after the sys.path tweak above. The module is
# reloaded so that edits made on disk are picked up without a kernel restart.
import importlib
import llm.offline_helpers as off
off = importlib.reload(off)

print("Module path:", off.__file__)

# All input files live under <project_root>/data/llm-data.
data_root = project_root.joinpath("data", "llm-data")

AI_GPT4O_JSONL = data_root.joinpath("AI_gpt-4o_N500_seed1337.jsonl")   # adjust if different
AI_GPT5_JSONL = data_root.joinpath("AI_gpt-5-mini_N3000_seed1337_full.jsonl")
HUMAN_JSONL = data_root.joinpath("human_diffDiag_N3000_seed1337.jsonl")
ALLOWED_LABELS = data_root.joinpath("allowed_labels.json")

# Report, for every expected input file, whether it is present on disk.
for p in (AI_GPT4O_JSONL, AI_GPT5_JSONL, HUMAN_JSONL, ALLOWED_LABELS):
    print(p, "exists:", p.exists())


Notebook dir:  /Users/nooranis/Downloads/Github/human-ai-collab-uq/notebooks/llm
Project root: /Users/nooranis/Downloads/Github/human-ai-collab-uq
Module path: /Users/nooranis/Downloads/Github/human-ai-collab-uq/src/llm/offline_helpers.py
/Users/nooranis/Downloads/Github/human-ai-collab-uq/data/llm-data/AI_gpt-4o_N500_seed1337.jsonl exists: True
/Users/nooranis/Downloads/Github/human-ai-collab-uq/data/llm-data/AI_gpt-5-mini_N3000_seed1337_full.jsonl exists: True
/Users/nooranis/Downloads/Github/human-ai-collab-uq/data/llm-data/human_diffDiag_N3000_seed1337.jsonl exists: True
/Users/nooranis/Downloads/Github/human-ai-collab-uq/data/llm-data/allowed_labels.json exists: True


In [25]:
# Build the label space from both AI result files, the human file and the
# allowed-labels JSON. The result is used below as a sequence (len / slicing).
label_space = off.load_label_space(
    allowed_labels_json=ALLOWED_LABELS,
    human_jsonl_path=HUMAN_JSONL,
    ai_jsonl_paths=[AI_GPT4O_JSONL, AI_GPT5_JSONL],
)

# Quick sanity check: total count plus a short preview.
_label_preview = label_space[:5]
print("Num labels:", len(label_space))
print("First few:", _label_preview)


Num labels: 49
First few: ['Acute COPD exacerbation / infection', 'Acute dystonic reactions', 'Acute laryngitis', 'Acute otitis media', 'Acute pulmonary edema']


In [26]:
# GPT-5 model first: the human expert is constructed with the id -> label
# mapping exposed by this model.
ai_gpt5 = off.AIModel(AI_GPT5_JSONL, label_space=label_space)
human = off.HumanExpert(
    HUMAN_JSONL,
    label_space=label_space,
    id_to_label=ai_gpt5.id_to_label,
)

# GPT-4o is loaded against the same label space.
ai_gpt4o = off.AIModel(AI_GPT4O_JSONL, label_space=label_space)

# For each model, keep (in sorted order) the ids that both the model and the
# human expert report as available.
ids_5 = sorted(set(ai_gpt5.available_ids()).intersection(human.available_ids()))
ids_4o = sorted(set(ai_gpt4o.available_ids()).intersection(human.available_ids()))

print("GPT-5 overlap with human:", len(ids_5))
print("GPT-4o overlap with human:", len(ids_4o))


GPT-5 overlap with human: 3000
GPT-4o overlap with human: 500


In [28]:
# Experiment-wide settings; these are the defaults for the multi-split runner.
SEED = 1337        # base seed; split i is run with SEED + i
TEST_SIZE = 0.5    # forwarded as `test_size` to each split
N_SPLITS = 10      # number of random splits to average over

# (ε, δ) choices, keyed by table row and then by model name.
# "strategy" is the human strategy identifier used for that row.
PARAMS = {
    "Top-1": {
        "strategy": "topk_1",
        "GPT-4o": dict(eps=0.02, delta=0.70),
        "GPT-5": dict(eps=0.02, delta=0.70),
    },
    "Top-2": {
        "strategy": "topk_2",
        "GPT-4o": dict(eps=0.01, delta=0.45),
        "GPT-5": dict(eps=0.02, delta=0.45),
    },
}


In [29]:
def summarize_splits(records, attr):
    """Summarise one attribute across a collection of split results.

    Parameters
    ----------
    records : iterable of objects exposing the attribute named ``attr``.
    attr : str
        Name of the attribute to read from every record.

    Returns
    -------
    tuple[float, float]
        ``(mean, std)`` where ``std`` is the sample standard deviation
        (``ddof=1``), or ``0.0`` when there are not at least two values.
    """
    samples = np.array([getattr(rec, attr) for rec in records], dtype=float)
    mean = float(samples.mean())
    # A sample standard deviation is undefined for a single value, so report 0.
    if len(samples) > 1:
        spread = float(samples.std(ddof=1))
    else:
        spread = 0.0
    return mean, spread


def run_model_strategy_multi(
    model_name: str,
    ai_model: off.AIModel,
    ids: list[int],
    human_strategy: str,
    epsilon: float,
    delta: float,
    n_splits: int = N_SPLITS,
    base_seed: int = SEED,
    test_size: float = TEST_SIZE,
    jitter: float = 0.02,
    human_expert=None,
):
    """
    Run off.run_single_split for n_splits random seeds and return mean/std
    for human-only, H/AI CUP, and AI-only baselines.

    Parameters
    ----------
    model_name : label for the model being evaluated. Currently unused inside
        the function; kept so existing call sites continue to work.
    ai_model : AI model forwarded to ``off.run_single_split`` as ``ai``.
    ids : example ids forwarded to ``off.run_single_split``.
    human_strategy, epsilon, delta, test_size : forwarded unchanged to
        ``off.run_single_split``.
    n_splits : number of splits to run; must be at least 1.
    base_seed : split ``i`` is run with ``random_state = base_seed + i``.
    jitter : forwarded to ``off.run_single_split``; defaults to the previously
        hard-coded value of 0.02.
    human_expert : human expert forwarded as ``human``. When ``None`` (the
        default) the notebook-level ``human`` object is used, which matches
        the behaviour before this parameter existed.

    Returns
    -------
    dict
        ``<attr>_mean`` / ``<attr>_std`` entries for each of ``human_cov``,
        ``human_sz``, ``hai_cov``, ``hai_sz``, ``ai_cov``, ``ai_sz``, ``a`` and
        ``b`` (attributes read from every split result), followed by
        ``tuned_delta_mean`` / ``tuned_delta_std`` computed over the splits
        whose ``tuned_delta`` is not ``None``. ``tuned_delta_mean`` is ``None``
        when no split produced a value.

    Raises
    ------
    ValueError
        If ``n_splits`` is smaller than 1 (the statistics would otherwise be
        NaN with only a runtime warning).
    """
    if n_splits < 1:
        raise ValueError(f"n_splits must be >= 1, got {n_splits}")

    # Fall back to the notebook global only when no expert is passed in, so
    # the dependency on hidden notebook state is at least explicit.
    expert = human if human_expert is None else human_expert

    # One result per split; each split gets its own deterministic seed.
    records = [
        off.run_single_split(
            ai=ai_model,
            human=expert,
            ids=ids,
            human_strategy=human_strategy,
            epsilon=epsilon,
            delta=delta,
            test_size=test_size,
            random_state=base_seed + i,
            jitter=jitter,
        )
        for i in range(n_splits)
    ]

    # Aggregate means and stds; the attribute order fixes the key order of the
    # returned dict.
    summary = {}
    for attr in (
        "human_cov", "human_sz",
        "hai_cov", "hai_sz",
        "ai_cov", "ai_sz",
        "a", "b",
    ):
        mean, std = summarize_splits(records, attr)
        summary[f"{attr}_mean"] = mean
        summary[f"{attr}_std"] = std

    # tuned_delta can be None for some splits, so it is aggregated separately
    # over the splits that actually produced a value.
    deltas = [r.tuned_delta for r in records if r.tuned_delta is not None]
    if deltas:
        summary["tuned_delta_mean"] = float(np.mean(deltas))
        summary["tuned_delta_std"] = (
            float(np.std(deltas, ddof=1)) if len(deltas) > 1 else 0.0
        )
    else:
        summary["tuned_delta_mean"] = None
        summary["tuned_delta_std"] = 0.0

    return summary


In [31]:
def _fmt_cov_size(coverage, size):
    """Format a coverage / set-size pair with two decimals each."""
    return f"{coverage:.2f} / {size:.2f}"


def _fmt_eps_delta(params):
    """Format an (ε, δ) parameter dict as a two-decimal tuple string."""
    return f"({params['eps']:.2f}, {params['delta']:.2f})"


# Settings shared by every run that feeds the table.
_split_kwargs = dict(n_splits=N_SPLITS, base_seed=SEED)

rows = []

# One table row per entry in PARAMS; each row evaluates both models.
for row_name, cfg in PARAMS.items():
    strat = cfg["strategy"]

    # GPT-4o, evaluated on its own overlap with the human expert
    p4 = cfg["GPT-4o"]
    res4 = run_model_strategy_multi(
        model_name="GPT-4o",
        ai_model=ai_gpt4o,
        ids=ids_4o,
        human_strategy=strat,
        epsilon=p4["eps"],
        delta=p4["delta"],
        **_split_kwargs,
    )

    # GPT-5, evaluated on its own overlap with the human expert
    p5 = cfg["GPT-5"]
    res5 = run_model_strategy_multi(
        model_name="GPT-5",
        ai_model=ai_gpt5,
        ids=ids_5,
        human_strategy=strat,
        epsilon=p5["eps"],
        delta=p5["delta"],
        **_split_kwargs,
    )

    # The human-only column is taken from the GPT-5 run (i.e. over ids_5);
    # the human metrics of the GPT-4o run are not shown.
    human_cov_mean = res5["human_cov_mean"]
    human_sz_mean = res5["human_sz_mean"]

    row = {
        "Strategy": row_name,

        "Human C/S": _fmt_cov_size(human_cov_mean, human_sz_mean),

        "GPT-4o CUP C/S": _fmt_cov_size(res4["hai_cov_mean"], res4["hai_sz_mean"]),
        "GPT-4o (ε, δ)": _fmt_eps_delta(p4),
        "GPT-4o AI C/S": _fmt_cov_size(res4["ai_cov_mean"], res4["ai_sz_mean"]),

        "GPT-5 CUP C/S": _fmt_cov_size(res5["hai_cov_mean"], res5["hai_sz_mean"]),
        "GPT-5 (ε, δ)": _fmt_eps_delta(p5),
        "GPT-5 AI C/S": _fmt_cov_size(res5["ai_cov_mean"], res5["ai_sz_mean"]),
    }
    rows.append(row)

# Build the frame once from the collected records and display it.
table_df = pd.DataFrame(rows)
table_df


Unnamed: 0,Strategy,Human C/S,GPT-4o CUP C/S,"GPT-4o (ε, δ)",GPT-4o AI C/S,GPT-5 CUP C/S,"GPT-5 (ε, δ)",GPT-5 AI C/S
0,Top-1,0.73 / 1.00,0.90 / 2.90,"(0.02, 0.70)",0.89 / 8.94,0.90 / 1.49,"(0.02, 0.70)",0.90 / 1.82
1,Top-2,0.87 / 1.94,0.92 / 3.19,"(0.01, 0.45)",0.90 / 9.06,0.91 / 1.54,"(0.02, 0.45)",0.91 / 1.85
