In [None]:
import csv
from sklearn.metrics import cohen_kappa_score
from openai import OpenAI
import json
from itertools import combinations
from tqdm import tqdm
import pandas as pd
import copy
import random
import numpy as np
import re
from collections import OrderedDict

# Human Screening
In this section, we compute the agreement between the two human screener groups in the inclusion/exclusion phase, as measured by Cohen's kappa ($\kappa$).

In [None]:
# CSV exports of the human screening phase: one file per screener group, plus
# the tiebreak file that is consulted for titles where the two groups disagree.
screen1_fn = "Human Screening and Tiering Data/Nature Medicine LLM Systematic Review - Screener Group 1.csv"
screen2_fn = "Human Screening and Tiering Data/Nature Medicine LLM Systematic Review - Screener Group 2.csv"
tiebreak_fn = "Human Screening and Tiering Data/Nature Medicine LLM Systematic Review - Tiebreaks.csv"

In [None]:
# load both screener data
# data will be a list of dict_keys(['Title', 'Abstract', 'Include?', 'Screener Name', 'Comments', ''])
def _read_csv_rows(path):
    """Return every data row of the CSV at `path` as a list of dicts keyed by header.

    The file is opened with newline="" so the csv module performs its own
    line-ending handling (required for quoted fields that contain embedded
    newlines), and with an explicit encoding so parsing does not depend on the
    platform default.
    """
    # assumes the exports are UTF-8 encoded -- TODO confirm against the source files
    with open(path, "r", newline="", encoding="utf-8") as f:
        return list(csv.DictReader(f))


screen1_data = _read_csv_rows(screen1_fn)
screen2_data = _read_csv_rows(screen2_fn)
tiebreak_data = _read_csv_rows(tiebreak_fn)

In [None]:
# Collect the non-empty titles seen by each screener group and keep the ones
# both groups rated; the audit is expected to contain exactly 500 of them.
screen1_titles = set(filter(None, (row["Title"] for row in screen1_data)))
screen2_titles = set(filter(None, (row["Title"] for row in screen2_data)))
intersection = screen1_titles.intersection(screen2_titles)

assert len(intersection) == 500

In [None]:
# Gather both screeners' decisions per shared title:
#   decisions[title] = {"Screener 1": ..., "Screener 2": ..., "Abstract": ...}
decisions = {}
for row in screen1_data:
    if row["Title"] not in intersection:
        continue
    # "Screener 2" is a placeholder until the second pass below fills it in.
    decisions[row["Title"]] = {
        "Screener 1": row["Include?"],
        "Screener 2": None,
        "Abstract": row["Abstract"],
    }

for row in screen2_data:
    if row["Title"] not in intersection:
        continue
    decisions[row["Title"]]["Screener 2"] = row["Include?"]

In [None]:
# Flag whether the two screeners agree on each title; for disagreements, attach
# the "Final Decision" of the first tiebreak row with the same title (if any).
for key, record in decisions.items():
    record["agree?"] = record["Screener 1"] == record["Screener 2"]
    if record["agree?"]:
        continue

    tiebreak_row = next((row for row in tiebreak_data if row["Title"] == key), None)
    if tiebreak_row is not None:
        record["tiebreaker"] = tiebreak_row["Final Decision"]

In [None]:
# Inter-screener agreement: Cohen's kappa with a percentile-bootstrap 95% CI,
# plus the raw percent agreement.
screener1 = [decision["Screener 1"] for decision in decisions.values()]
screener2 = [decision["Screener 2"] for decision in decisions.values()]

# compute kappa
pairs = list(zip(screener1, screener2))
n = len(pairs)

obs_kappa = cohen_kappa_score(
    [h for h, _ in pairs],
    [l for _, l in pairs]
)

# Seed the stdlib RNG so the bootstrap interval is identical on every
# Restart & Run All (later cells draw from the same `random` stream).
random.seed(42)

B = 50000
boot_kappas = []
for _ in range(B):
    # sample with replacement by index
    samp = [pairs[i] for i in random.choices(range(n), k=n)]
    h_samp, l_samp = zip(*samp)
    k = cohen_kappa_score(h_samp, l_samp)
    boot_kappas.append(k)

# derive the 95% percentile CI
alpha = 0.05
lower, upper = np.percentile(boot_kappas, [100*alpha/2, 100*(1-alpha/2)])

print(f"Observed kappa: {obs_kappa:.3f}")
print(f"Bootstrap 95% CI: {lower:.3f} - {upper:.3f}")

# Share of titles on which both screeners gave the same decision.
percent_agree = sum([decision["agree?"] for decision in decisions.values()]) / len(decisions) * 100
print(f"Percent agreement: {percent_agree:.1f}%")

# LLM Screening
Here, we prepare a batch job to screen all deduplicated studies for inclusion/exclusion via GPT-5 (with high reasoning effort).

## Prepare Batch Job

In [None]:
# Screening instructions, sent as the `instructions` field of every request.
with open("Prompts/screening_instructions.txt", "r") as f:
    INSTRUCTIONS = f.read()

# One JSON object per line: the deduplicated studies to screen.
with open("deduped_and_processed_studies.jsonl", "r") as f:
    studies = [json.loads(line) for line in f]

# Skeleton of a single batch request; `custom_id` and `input` are filled per study.
template = {
    "custom_id": "",
    "method": "POST",
    "url": "/v1/responses",
    "body": {
        "model": "gpt-5",
        "reasoning": {"effort": "high"},
        "instructions": INSTRUCTIONS,
        "input": "",
    },
}

# custom_id is the study's position in `studies`, stored as a string.
out_jsonl = []
for idx, row in tqdm(enumerate(studies), total=len(studies)):
    request = copy.deepcopy(template)
    request["custom_id"] = str(idx)
    request["body"]["input"] = f"Title: {row['Title']}\nAbstract: {row['Abstract']}"
    out_jsonl.append(request)

with open("embase_pubmed_scopus_batch-GPT-5r-high.jsonl", "w") as f:
    f.writelines(json.dumps(entry) + "\n" for entry in out_jsonl)

## Process Batch Job Results
Next, we join the batch output back to the original studies: the batch job returns only the model responses (identified by `custom_id`), not the prompts that produced them.

In [None]:
# Load original batch
# original_messages maps each request's custom_id to the prompt text that was sent.
original_messages = {}
with open("embase_pubmed_scopus_batch-GPT-5r-high.jsonl", "r") as f:
    batch = [json.loads(line) for line in f]

for entry in batch:
    original_messages[entry["custom_id"]] = entry["body"]["input"]

# Keep only responses whose custom_id exists in the batch, and attach the
# prompt text to each so it can be matched against study titles/abstracts.
responses = []
with open("Batch Responses/GPT-5r-high-screening-output.jsonl", "r") as f:
    for line in f:
        response = json.loads(line)
        custom_id = response["custom_id"]
        if custom_id in original_messages:
            response["original_message"] = original_messages[custom_id]
            responses.append(response)

# Reload the studies so the decisions are written onto fresh records.
studies = []
with open("deduped_and_processed_studies.jsonl", "r") as f:
    for line in f:
        study = json.loads(line)
        studies.append(study)

# For every study, scan the responses in file order and use the first one whose
# prompt contains both the study's title and abstract as substrings.
# failures collects (study index, error message) for responses that matched but
# raised while being parsed; the scan then continues with the next response.
# NOTE(review): `overlap` is never read or updated in the visible code.
# NOTE(review): a study for which no response matches is left without
#   "Comments"/"Include?" and is NOT added to `failures`.
# NOTE(review): this is a studies x responses nested scan; custom_id is the
#   study index in the batch file written above, so a direct lookup may be
#   possible -- confirm the response file was produced from that batch first.
failures = []
overlap = 0
for i in tqdm(range(0, len(studies))):
    for response in responses:
        try:
            if str(studies[i]["Abstract"]) in response["original_message"] and str(studies[i]["Title"]) in response["original_message"]:
                # The answer text is read from output[1] (presumably output[0] is the
                # reasoning item -- TODO confirm); markdown code fences are stripped
                # before JSON decoding.
                result = json.loads(response["response"]["body"]["output"][1]["content"][0]["text"].replace("```json", "").replace("```", ""))

                studies[i]["Comments"] = result["Comments"]
                studies[i]["Include?"] = result["Include?"]

                break
        except Exception as e:
            failures.append((i, str(e)))
            continue

In [None]:
from openai import OpenAI
from tqdm import tqdm

client = OpenAI()

# Re-submit every study whose batch response could not be parsed, one
# synchronous request per (study index, error) entry in `failures`.
corrected_responses = []
for failed_idx, _error in tqdm(failures):
    corrected = client.responses.create(
        model="gpt-5",
        instructions=INSTRUCTIONS,
        input=f"Title: {studies[failed_idx]['Title']}\nAbstract: {studies[failed_idx]['Abstract']}",
        reasoning={"effort":"high"},
    )
    corrected_responses.append(corrected)

In [None]:
# Copy the re-run decisions onto their studies; anything that still cannot be
# parsed is reported and left untouched.
for (failed_idx, _error), corrected_response in zip(failures, corrected_responses):
    try:
        answer_text = corrected_response.output[1].content[0].text
        result = json.loads(answer_text.replace("```json", "").replace("```", ""))
        studies[failed_idx]["Comments"] = result["Comments"]
        studies[failed_idx]["Include?"] = result["Include?"]
    except Exception as e:
        print(f"Failed again for index {failed_idx}: {str(e)}")
        continue

# Persist all studies (with whatever decisions were attached) as JSON Lines.
with open("deduped_and_processed_studies-GPT-5r-high.jsonl", 'w') as f:
    f.writelines(json.dumps(record) + '\n' for record in studies)

### Join the LLM data with the whole data structure
Here, we take the processed LLM data and merge it with the human data for further analysis

In [None]:
# Read the LLM-screened studies back in, one JSON record per line.
with open("deduped_and_processed_studies-GPT-5r-high.jsonl", encoding="utf-8") as f:
    studies = [json.loads(line) for line in f]

In [None]:
# These two studies disappeared in the second scrape, so we manually process them to reproduce the results prior to revision 1 (after which the new scrape was done)

# Expression of Concern: Evaluating the Factors Influencing Residency Match for Surgical Specialty Applicants and Programs: Challenges and Future Directions (The American Surgeon™, (2024), (00031348241262427), 10.1177/00031348241262427)
# A Multiagent-Driven Robotic AI Chemist Enabling Autonomous Chemical Research On Demand

# Titles that have to be screened by hand; their abstracts are taken from the
# human screening data in `decisions`.
missing_screening_titles = [
    "Expression of Concern: Evaluating the Factors Influencing Residency Match for Surgical Specialty Applicants and Programs: Challenges and Future Directions (The American Surgeon™, (2024), (00031348241262427), 10.1177/00031348241262427)",
    "A Multiagent-Driven Robotic AI Chemist Enabling Autonomous Chemical Research On Demand",
]

for title in missing_screening_titles:
    abstract = decisions[title]["Abstract"]

    result = client.responses.create(
        model="gpt-5",
        instructions=INSTRUCTIONS,
        input=f"Title: {title}\nAbstract: {abstract}",
        reasoning={"effort":"high"},
    )

    # Strip markdown code fences before decoding the JSON answer.
    answer_text = result.output[1].content[0].text
    decision = json.loads(answer_text.replace("```json", "").replace("```", ""))

    record = {"Title": title, "Abstract": abstract}
    record["Comments"] = decision["Comments"]
    record["Include?"] = decision["Include?"]
    studies.append(record)



In [None]:
# Index the studies by title for direct lookup; when a title occurs more than
# once, the first record encountered is the one that is kept.
studies_by_title = {}
for study in studies:
    studies_by_title.setdefault(study["Title"], study)

In [None]:
# some of the titles changed very slightly between the first and second scrape
# we use difflib to find the matching titles
# there are so few mismatches that we can just manually check them to ensure they match, which they do

import difflib

def find_closest_title(query, titles):
    """Return the entry of `titles` most similar to `query`.

    A difflib close match at cutoff 0.9 is preferred; when nothing reaches the
    cutoff, the title with the highest SequenceMatcher ratio is returned
    anyway, so a title is always returned for a non-empty `titles`.
    """
    close = difflib.get_close_matches(query, titles, n=1, cutoff=0.9)
    if close:
        return close[0]

    def similarity(candidate):
        return difflib.SequenceMatcher(None, query, candidate).ratio()

    # fallback: best-scoring title even though it is below the cutoff
    return max(titles, key=similarity)

from sklearn.metrics import confusion_matrix

# this block computes bounds on the sensitivity and specificity of the LLM screening
# Build two index-aligned lists: the human consensus label and the LLM label
# for every audited study.
human_consensus = [] # human decisions
llm_consensus = [] # llm decisions
unresolved_titles = [] # screener disagreements that have no tiebreak decision

for study, record in decisions.items(): # recall that decisions is keyed by title
    if record["agree?"]:
        human_label = record["Screener 1"]
    elif "tiebreaker" in record:
        human_label = record["tiebreaker"]
    else:
        # No human consensus exists for this study. Skip it for BOTH lists;
        # appending only the LLM label would shift every later pair.
        unresolved_titles.append(study)
        continue

    if study in studies_by_title:
        llm_record = studies_by_title[study]
    else:
        # Unfortunately, on the second scrape, some titles changed extremely slightly
        # We manually review each of these matches to ensure they are the correct matches
        closest_title = find_closest_title(study, studies_by_title.keys())
        print(f"Could not find exact match for title:\n{study}\nClosest match is:\n{closest_title}\n")
        llm_record = studies_by_title[closest_title]

    human_consensus.append(human_label.lower())
    llm_consensus.append(llm_record["Include?"].lower())

if unresolved_titles:
    print(f"Skipped {len(unresolved_titles)} studies with a screener disagreement but no tiebreak decision.")

# Pair each human consensus label with the LLM label for the same study.
# zip() stops at the shorter list, so both lists must be index-aligned.
pairs = list(zip(human_consensus, llm_consensus))
n = len(pairs)

obs_kappa = cohen_kappa_score(
    [h for h, _ in pairs],
    [l for _, l in pairs]
) # compute the observed kappa for the human-LLM pairs

# bootstrap the kappa score to get confidence intervals
B = 50000
boot_kappas = []
for _ in range(B):
    # sample with replacement by index
    samp = [pairs[i] for i in random.choices(range(n), k=n)]
    h_samp, l_samp = zip(*samp)
    k = cohen_kappa_score(h_samp, l_samp)
    boot_kappas.append(k)

alpha = 0.05
lower, upper = np.percentile(boot_kappas, [100*alpha/2, 100*(1-alpha/2)])

print(f"Observed kappa: {obs_kappa:.3f}")
print(f"Bootstrap 95% CI: {lower:.3f} - {upper:.3f}")

print()

# Second, independent bootstrap: sensitivity and specificity of the LLM with
# the human consensus as reference ("yes" is the positive class).
# `sensitivities` / `specificities` hold one value per bootstrap draw and are
# reused by the prevalence-propagation cell further down.
n_iters = 50000
sensitivities = []
specificities = []
for _ in range(n_iters):
    # sample with replacement by index
    samp = [pairs[i] for i in random.choices(range(n), k=n)]
    h_samp, l_samp = zip(*samp)

    # labels=["no", "yes"] fixes the 2x2 layout so ravel() unpacks as tn, fp, fn, tp
    cm = confusion_matrix(h_samp, l_samp, labels=["no", "yes"])
    tn, fp, fn, tp = cm.ravel()
    # NaN marks a draw in which the metric is undefined (no positives / no negatives)
    sensitivity = tp / (tp + fn) if tp + fn else float("nan")
    specificity = tn / (tn + fp) if tn + fp else float("nan")

    sensitivities.append(sensitivity)
    specificities.append(specificity)

# --- derive the 95% percentile CI ---
# NOTE(review): if any draw produced NaN above, confirm how np.percentile treats it.
alpha = 0.05
sensitivity_lower, sensitivity_upper = np.percentile(sensitivities, [100*alpha/2, 100*(1-alpha/2)])
specificity_lower, specificity_upper = np.percentile(specificities, [100*alpha/2, 100*(1-alpha/2)])

# Point estimates on the full (non-resampled) audit set.
h_samp, l_samp = zip(*pairs)
cm = confusion_matrix(h_samp, l_samp, labels=["no", "yes"])
tn, fp, fn, tp = cm.ravel()

# The first two lines print the point estimates; the last two print the
# bootstrap 95% interval bounds.
# NOTE(review): both pairs of lines use the same "Sensitivity:" / "Specificity:" label.
print(f"Sensitivity: {tp / (tp + fn) if tp + fn else float('nan'):.3f}")
print(f"Specificity: {tn / (tn + fp) if tn + fp else float('nan'):.3f}")
print(f"Sensitivity: {sensitivity_lower:.3f} - {sensitivity_upper:.3f}")
print(f"Specificity: {specificity_lower:.3f} - {specificity_upper:.3f}")

In [None]:
# Number of audited studies on which the human consensus and the LLM differ.
disagreements = sum(h != l for h, l in pairs)
print(f"Number of disagreements between human consensus and LLM consensus: {disagreements}")

### Estimating the true inclusion bounds

In [None]:
import numpy as np
import random

# constants
N_full   = 12896 # size of the full corpus the audit results are propagated to
P_obs    = 4609 # LLM inclusions
K_yes    = sum(x == "yes" for x, _ in pairs) # human 'yes' in the audit
# Audit size is taken from the data that K_yes was counted on, so the Beta
# posterior below stays consistent even if the audit does not contain exactly
# 500 resolved studies.
# NOTE: `pairs` must still be the screening (human, LLM) pairs when this runs;
# the tiering cells later rebind the same name.
n_audit  = len(pairs) # number of samples we used in the human audit

# Jeffreys posterior for prevalence
alpha_p, beta_p = K_yes + 0.5, n_audit - K_yes + 0.5

TP = []
FN = []
FP = []
TN = []

P_pred = [] # estimating how many positives the LLM would predict
true_number_of_positives = [] # list to store the estimated number of studies that are actually positive

# One Monte Carlo draw per bootstrap (sensitivity, specificity) pair: combine
# it with a prevalence sampled from the posterior to get expected cell counts.
for se, sp in zip(sensitivities, specificities):
    # draw a random prevalence from the Beta posterior
    pi = random.betavariate(alpha_p, beta_p)

    # propagate to full corpus
    tp = N_full * pi * se
    fn = N_full * pi * (1 - se)
    fp = N_full * (1 - pi) * (1 - sp)
    tn = N_full * (1 - pi) * sp

    true_number_of_positives.append(tp + fn)

    TP.append(tp)
    FN.append(fn)
    FP.append(fp)
    TN.append(tn)

    P_pred.append(tp + fp)
# 95 % percentile intervals
def ci(x):
    """Return the 2.5th and 97.5th percentiles of `x` as an integer array (truncated)."""
    bounds = np.percentile(x, [2.5, 97.5])
    return bounds.astype(int)

# Interval and mean (truncated to int) of each simulated confusion-matrix cell.
ci_TP, ci_FN, ci_FP, ci_TN = (ci(draws) for draws in (TP, FN, FP, TN))
mean_TP, mean_FN, mean_FP, mean_TN = (int(np.mean(draws)) for draws in (TP, FN, FP, TN))

print(f"TP ≈ {mean_TP} (95 % CI {ci_TP[0]}-{ci_TP[1]})")
print(f"FP ≈ {mean_FP} (95 % CI {ci_FP[0]}-{ci_FP[1]})")
print(f"FN ≈ {mean_FN} (95 % CI {ci_FN[0]}-{ci_FN[1]})")
print(f"TN ≈ {mean_TN} (95 % CI {ci_TN[0]}-{ci_TN[1]})")
print(f"Total positives ≈ {int(np.mean(true_number_of_positives))} "
      f"(95 % CI {ci(true_number_of_positives)[0]}-{ci(true_number_of_positives)[1]})")
# Compare the simulated number of LLM positives with the count the model produced.
print(f"Predicted LLM positives ≈ {int(np.mean(P_pred))} "
      f"(the model output {P_obs})")

# Tiering
Here, we compute the agreement (Cohen's kappa) between the two human tiering groups.

In [None]:
# Human tiering exports, one CSV per tiering group.
tiering1_fn = "Human Screening and Tiering Data/Nature Medicine LLM Systematic Review - Tiering Group 1.csv"
tiering2_fn = "Human Screening and Tiering Data/Nature Medicine LLM Systematic Review - Tiering Group 2.csv"

# Each file becomes a list of row dicts (one dict per CSV row).
tiering1_data, tiering2_data = (
    pd.read_csv(path).to_dict(orient='records')
    for path in (tiering1_fn, tiering2_fn)
)

In [None]:
def _has_usable_tier(row):
    """True when the row's Tier is a string other than "X" (the rejection marker)."""
    return type(row["Tier"]) is str and row["Tier"] != "X"


# For each group-1 row, take the first group-2 row with the same title for
# which both raters gave a usable tier. Titles where either rater left the
# tier empty (non-string) or rejected the study with "X" are left out: we are
# deliberately strict so that no study failing the inclusion criteria reaches
# the LLM analysis later.
questions = {}
for row1 in tiering1_data:
    title = row1["Title"]
    partner = next(
        (
            row2
            for row2 in tiering2_data
            if row2["Title"] == title
            and _has_usable_tier(row1)
            and _has_usable_tier(row2)
        ),
        None,
    )
    if partner is None:
        continue

    questions[title] = {
        "Title": title,
        "Abstract": row1["Abstract"],
        "Tier": [row1["Tier"], partner["Tier"]],
        "Screeners": [row1["Screener Name"], partner["Screener Name"]],
    }

In [None]:
# Split the two raters' tiers into parallel lists, in `questions` order.
tiering_decisions1 = []
tiering_decisions2 = []
for item in questions.values():
    tiering_decisions1.append(item["Tier"][0])
    tiering_decisions2.append(item["Tier"][1])

# bootstrap kappa score
pairs = list(zip(tiering_decisions1, tiering_decisions2))
n = len(pairs)
obs_kappa = cohen_kappa_score(list(tiering_decisions1), list(tiering_decisions2))

B = 50000
boot_kappas = []
for _ in range(B):
    # resample the rated studies with replacement, by index
    drawn_indices = random.choices(range(n), k=n)
    h_samp, l_samp = zip(*(pairs[i] for i in drawn_indices))
    boot_kappas.append(cohen_kappa_score(h_samp, l_samp))

# percentile 95% CI from the bootstrap distribution
alpha = 0.05
lower, upper = np.percentile(boot_kappas, [100*alpha/2, 100*(1-alpha/2)])
print(f"Observed kappa (tiering): {obs_kappa:.3f}")
print(f"Bootstrap 95% CI (tiering): {lower:.3f} - {upper:.3f}")

# LLM Tiering
Here we generate the batch requests for the LLM tiering process and analyze the results

## Batch Request

In [None]:
# Tiering prompt, sent as the `instructions` field of every tiering request.
# (The variable name's spelling is kept because other cells reference it.)
with open("Prompts/tiering_prompt.txt", "r") as f:
    TIEIRNG_PROMPT = f.read()

with open("deduped_and_processed_studies-GPT-5r-high.jsonl", "r") as f:
    studies = [json.loads(line) for line in f]
    df = pd.DataFrame(studies)

# Only studies the LLM screened in (exactly "yes") go on to tiering.
included_df = df[df["Include?"] == "yes"]

# Round-trip through JSON Lines so `included_studies` is exactly what is on disk.
with open("included_studies.jsonl", "w") as f:
    for item in included_df.reset_index().to_dict(orient='records'):
        f.write(json.dumps(item) + "\n")

with open("included_studies.jsonl", "r") as f:
    included_studies = [json.loads(line) for line in f]

template = {"custom_id": "", "method": "POST", "url": "/v1/responses",
              "body": {"model": "gpt-5",
                      "reasoning": {"effort": "high"},
                      "instructions": TIEIRNG_PROMPT,
                      "input": "",
              }
            }

# Build one request per INCLUDED study. custom_id is the study's position in
# `included_studies`, which is the list the tiering responses are joined back
# to; iterating over all of `studies` here would tier excluded studies and
# make the ids refer to the wrong list.
out_jsonl = []
for idx, row in tqdm(enumerate(included_studies), total=len(included_studies)):
    out = copy.deepcopy(template)
    out["custom_id"] = str(idx)
    out["body"]["input"] = f"Title: {row['Title']}\nAbstract: {row['Abstract']}"
    out_jsonl.append(out)

with open("tiering_batch_requests.jsonl", "w") as f:
    for item in out_jsonl:
        f.write(json.dumps(item) + "\n")

# Output of the tiering batch job, one JSON object per line.
with open("Batch Responses/GPT-5r-high-tiering-output.jsonl", "r") as f:
    responses = [json.loads(line) for line in f]

In [None]:
# Attach each tiering response to its study. The join uses the response's
# custom_id (the study's index in `included_studies` when the batch was built)
# rather than the line position: batch output is not guaranteed to preserve
# request order, and failed requests may be missing from the output file.
for i, item in enumerate(responses):
    try:
        study_idx = int(item["custom_id"])
        answer_text = item["response"]["body"]["output"][1]["content"][0]["text"]
        # Strip markdown code fences before decoding the JSON answer.
        gpt5r_tier = json.loads(answer_text.replace("```json", "").replace("```", ""))
        included_studies[study_idx]["LLM-tier"] = gpt5r_tier
    except (KeyError, IndexError, TypeError, ValueError) as e:
        # ValueError covers json.JSONDecodeError and a non-numeric custom_id;
        # TypeError covers a null "response" entry.
        print(f"Error processing response {i}: {e}")

with open("included_studies_with_tiers-GPT-5r-high.jsonl", "w") as f:
    for item in included_studies:
        f.write(json.dumps(item) + "\n")

In [None]:
# Load the tiering tiebreak decisions and index them by title (titles must be unique).
with open("Human Screening and Tiering Data/Nature Medicine LLM Systematic Review - Tiering Tiebreaks.csv", "r") as f:
    reader = csv.DictReader(f)
    tiebreak_data = list(reader)

tiebreaks = {}
for item in tiebreak_data:
    title = item["Title"]
    assert title not in tiebreaks, f"Duplicate title found: {title}"
    tiebreaks[title] = item["Tiebreak"]

# Every disagreement must have a tiebreak, and no agreement may have one.
for title, question in questions.items():
    raters_agree = question["Tier"][0] == question["Tier"][1]
    if raters_agree:
        assert title not in tiebreaks, f"Tiebreak found for title with no disagreement: {title}"
        continue

    assert title in tiebreaks, f"No tiebreak found for title with disagreement: {title}"
    question["Tiebreak"] = tiebreaks[title]

In [None]:
# Same story as before, some works were removed in the second scrape
# We manually add them here to reproduce the results prior to revision 1 (after which the new scrape was done)
# Titles that have to be tiered by hand; their abstracts come from `questions`.
missing_tiering_titles = [
    "ChatGPT-o1 and the Pitfalls of Familiar Reasoning in Medical Ethics",
    "From Answers to Insights: Unveiling the Strengths and Limitations of ChatGPT and Biomedical Knowledge Graphs",
    "Leveraging artificial intelligence to detect ethical concerns in medical research: A case study",
    "Evaluating the performance of Generative Pre-trained Transformer-4 (GPT-4) in standardizing radiology reports",
    "Transforming Healthcare Education: Harnessing Large Language Models for Frontline Health Worker Capacity Building using Retrieval-Augmented Generation",
    "A Novel RAG Framework with Knowledge-Enhancement for Biomedical Question Answering",
    "MedGen: An Explainable Multi-Agent Architecture for Clinical Decision Support through Multisource Knowledge Fusion",
    "Managing class imbalance in the training of a large language model to predict patient selection for total knee arthroplasty: Results from the Artificial intelligence to Revolutionise the patient Care pathway in Hip and knEe aRthroplastY (ARCHERY) project",
    "Expert evaluation of large language models for clinical dialogue summarization",
    "Automated and code-free development of a risk calculator using ChatGPT-4 for predicting diabetic retinopathy and macular edema without retinal imaging",
]

for title in tqdm(missing_tiering_titles):
    # find associated abstract
    abstract = questions[title]["Abstract"]
    result = client.responses.create(
        model="gpt-5",
        instructions=TIEIRNG_PROMPT,
        input=f"Title: {title}\nAbstract: {abstract}",
        reasoning={"effort":"high"},
    )

    # Strip markdown code fences before decoding the JSON answer.
    answer_text = result.output[1].content[0].text
    decision = json.loads(answer_text.replace("```json", "").replace("```", ""))

    record = {"Title": title, "Abstract": abstract}
    record["LLM-tier"] = decision
    included_studies.append(record)

In [None]:
# some of the titles changed very slightly between the first and second scrape
# we use difflib to find the matching titles
# there are so few mismatches that we can just manually check them to ensure they match, which they do

import difflib

def find_closest_title(query, titles, cutoff=0.5):
    """Return the entry of `titles` that is most similar to `query`.

    Args:
        query: Title to look up.
        titles: Iterable of candidate titles. It is materialised once, so
            one-shot iterables (generators, iterators) are supported.
        cutoff: Minimum difflib similarity for the preferred close match.

    Returns:
        The close match at or above `cutoff` when one exists; otherwise the
        candidate with the highest SequenceMatcher ratio, even though it is
        below the cutoff.

    Raises:
        ValueError: If `titles` is empty.
    """
    candidates = list(titles)
    if not candidates:
        raise ValueError("find_closest_title() needs at least one candidate title")

    matches = difflib.get_close_matches(query, candidates, n=1, cutoff=cutoff)
    if matches:
        return matches[0]
    # fallback: just return the best-scoring title even if < cutoff
    best = max(candidates, key=lambda t: difflib.SequenceMatcher(None, query, t).ratio())
    return best

# compute LLM kappa
# Human reference tier per question: the tiebreak when one was recorded,
# otherwise the first rater's tier (the raters agree in that case).
ground_truth_tierings = []
for item in questions.values():
    reference_tier = item["Tiebreak"] if "Tiebreak" in item else item["Tier"][0]
    ground_truth_tierings.append(reference_tier.lower())


def _first_study_titled(wanted_title):
    """Return the first included study with exactly this title, or None."""
    return next((item for item in included_studies if item["Title"] == wanted_title), None)


# LLM tier per question, in the same order as ground_truth_tierings.
llm_tierings = []
for title in questions.keys():
    matched_study = _first_study_titled(title)

    if matched_study is None:
        # No exact title match: fall back to the most similar included title.
        closest_title = find_closest_title(title, [item["Title"] for item in included_studies])
        print(f"Could not find exact match for title:\n{title}\nClosest match is:\n{closest_title}\n")
        matched_study = _first_study_titled(closest_title)

    found_study = matched_study is not None
    if found_study:
        llm_tierings.append(matched_study["LLM-tier"]["Tier"].lower())

    assert found_study, f"Study not found for title: {title}"

# (human reference tier, LLM tier) for every question.
pairs = list(zip(ground_truth_tierings, llm_tierings))
n = len(pairs)
obs_kappa = cohen_kappa_score(
    [human for human, _ in pairs],
    [llm for _, llm in pairs],
)

# Percentile bootstrap over questions, resampled with replacement by index.
B = 50000
boot_kappas = []
for _ in range(B):
    drawn_indices = random.choices(range(n), k=n)
    h_samp, l_samp = zip(*(pairs[i] for i in drawn_indices))
    boot_kappas.append(cohen_kappa_score(h_samp, l_samp))

# 95% interval from the 2.5th and 97.5th percentiles
alpha = 0.05
lower, upper = np.percentile(boot_kappas, [100*alpha/2, 100*(1-alpha/2)])
print(f"Observed kappa (tiering): {obs_kappa:.3f}")
print(f"Bootstrap 95% CI (tiering): {lower:.3f} - {upper:.3f}")

# compute sensitivity and specificity for each tier
def compute_tiering_sensitivity_specificity(ground_truth, llm_predictions, tiers=("i", "ii", "iii")):
    """One-vs-rest sensitivity and specificity of the predictions for each tier.

    Args:
        ground_truth: Iterable of reference tier labels.
        llm_predictions: Iterable of predicted tier labels, aligned with
            `ground_truth`.
        tiers: Tier labels to evaluate; defaults to the three tiers used in
            the analysis.

    Returns:
        (sensitivity, specificity): two dicts keyed by tier. A metric whose
        denominator is zero (no positives, or no negatives) is reported as 0.
    """
    # Materialise the pairs once so one-shot iterables are not exhausted after
    # the first tier has been evaluated.
    labelled = list(zip(ground_truth, llm_predictions))

    sensitivity = {}
    specificity = {}

    for tier in tiers:
        tp = tn = fp = fn = 0
        for gt, pred in labelled:
            if gt == tier:
                if pred == tier:
                    tp += 1
                else:
                    fn += 1
            elif pred == tier:
                fp += 1
            else:
                tn += 1

        sensitivity[tier] = tp / (tp + fn) if (tp + fn) else 0
        specificity[tier] = tn / (tn + fp) if (tn + fp) else 0

    return sensitivity, specificity

# Point estimates on the full set of (human, LLM) tier pairs.
sensitivity, specificity = compute_tiering_sensitivity_specificity(ground_truth_tierings, llm_tierings)

# Bootstrap distribution of each metric: one list of draws per tier.
n_iters = 50000
sensitivities = {tier: [] for tier in sensitivity}
specificities = {tier: [] for tier in specificity}
for _ in range(n_iters):
    # resample the pairs with replacement, by index
    drawn_indices = random.choices(range(n), k=n)
    h_samp, l_samp = zip(*(pairs[i] for i in drawn_indices))
    sens, spec = compute_tiering_sensitivity_specificity(h_samp, l_samp)

    for tier in sensitivity:
        sensitivities[tier].append(sens[tier])
        specificities[tier].append(spec[tier])

# 95% percentile CI per tier
alpha = 0.05
percentile_bounds = [100*alpha/2, 100*(1-alpha/2)]
ci_sensitivities = {tier: np.percentile(draws, percentile_bounds) for tier, draws in sensitivities.items()}
ci_specificities = {tier: np.percentile(draws, percentile_bounds) for tier, draws in specificities.items()}

print("\nSensitivity and specificity for each tier:")
for tier in sensitivity.keys():
    print(f"Tier: {tier}")
    print(f"  Sensitivity: {sensitivity[tier]:.3f} (95% CI: {ci_sensitivities[tier][0]:.3f} - {ci_sensitivities[tier][1]:.3f})")
    print(f"  Specificity: {specificity[tier]:.3f} (95% CI: {ci_specificities[tier][0]:.3f} - {ci_specificities[tier][1]:.3f})")

In [None]:
# Number of questions on which the human reference tier and the LLM tier differ.
disagreements = sum(h != l for h, l in pairs)
print(f"Number of disagreements between human consensus and LLM consensus (tiering): {disagreements}")

In [None]:
# ----- macro (un-weighted) averages: plain mean over tiers -----
macro_sensitivity = np.mean([sensitivity[tier] for tier in sensitivity])
macro_specificity = np.mean([specificity[tier] for tier in specificity])

print(f"\nMacro-averaged sensitivity:  {macro_sensitivity:.3f}")
print(f"Macro-averaged specificity:  {macro_specificity:.3f}")


# ----- bootstrap macro values -----
# The i-th macro value averages the per-tier metrics of the i-th bootstrap draw.
boot_macro_sens = [
    np.mean([sensitivities[t][i] for t in sensitivity.keys()])
    for i in range(n_iters)
]
boot_macro_spec = [
    np.mean([specificities[t][i] for t in specificity.keys()])
    for i in range(n_iters)
]

# 95 % percentile CIs
alpha = 0.05
percentile_bounds = [100*alpha/2, 100*(1-alpha/2)]
ci_macro_sens = np.percentile(boot_macro_sens, percentile_bounds)
ci_macro_spec = np.percentile(boot_macro_spec, percentile_bounds)

print(f"Macro-sens (95% CI): {macro_sensitivity:.3f} "
      f"({ci_macro_sens[0]:.3f}-{ci_macro_sens[1]:.3f})")
print(f"Macro-spec (95% CI): {macro_specificity:.3f} "
      f"({ci_macro_spec[0]:.3f}-{ci_macro_spec[1]:.3f})")



In [None]:
# get number of erroneous studies that were only one tier off
# Tiers are compared by their position in this ordering.
tier_ordering = ["s", "i", "ii", "iii"]

one_too_high = 0
one_too_low = 0
off_by_one = 0
total_errors = 0

for i in range(0, len(llm_tierings)):
    if llm_tierings[i] == ground_truth_tierings[i]:
        continue

    total_errors += 1
    llm_tier_index = tier_ordering.index(llm_tierings[i])
    gt_tier_index = tier_ordering.index(ground_truth_tierings[i])

    if abs(llm_tier_index - gt_tier_index) == 1:
        off_by_one += 1

        # "too high" means the LLM's tier sits one position later in
        # tier_ordering than the human reference tier.
        if llm_tier_index > gt_tier_index:
            one_too_high += 1
        else:
            one_too_low += 1


def _percentage(part, whole):
    """Format part/whole as a percentage, or "n/a" when the denominator is zero."""
    return f"{part / whole * 100:.2f}%" if whole else "n/a"


print(f"Total errors: {total_errors}")
print(f"Off by one tier: {off_by_one}")
print(f"Percentage off by one tier: {_percentage(off_by_one, total_errors)}")
print(f"One tier too high: {one_too_high}")
print(f"One tier too low: {one_too_low}")
print(f"Percentage one tier too high: {_percentage(one_too_high, off_by_one)}")
print(f"Percentage one tier too low: {_percentage(one_too_low, off_by_one)}")

## Posterior Estimation of True Tier Prevalence
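We want to estimate the true tier prevalences. This is a bit tricky, because the tiers observed across all included studies come from the LLM and may be wrong. We therefore jointly model the true prevalence of each tier and the LLM's confusion matrix, using the human-audited subset to inform the confusion matrix.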

In [None]:
import numpy as np, pymc as pm, pytensor.tensor as pt, arviz as az
from sklearn.metrics import confusion_matrix

# Reload the tiered studies from disk so the counts below reflect the saved file.
with open("included_studies_with_tiers-GPT-5r-high.jsonl", "r") as f:
    included_studies = [json.loads(line) for line in f]

# get number of studies in each tier
# Labels other than 'i', 'ii', 'iii' and 's' are not counted at all.
tier_counts = {'i': 0, 'ii': 0, 'iii': 0}
for item in included_studies:
    tier = item["LLM-tier"]["Tier"].lower()
    if tier in tier_counts:
        tier_counts[tier] += 1
    if tier == "s":
        tier_counts['i'] += 1  # treat 's' as 'i' for the model

# confusion matrix for the audit
# Rows are the human reference tiers, columns the LLM tiers.
# NOTE(review): 's' is folded into 'i' in tier_counts above, but the audit lists
#   are passed here unchanged with labels restricted to i/ii/iii -- confirm how
#   any 's' labels in the audit are meant to be handled.
cm = confusion_matrix(ground_truth_tierings, llm_tierings, labels=['i', 'ii', 'iii'])

# INPUTS
N_tot  = sum(tier_counts.values()) # total number of LLM-tiered studies counted in tier_counts
T_vec  = np.array([tier_counts['i'], tier_counts['ii'], tier_counts['iii']]) # observed counts of each tier in vector form
M_audit = cm # matrix of observed counts from both human and LLM audits

# dirichlet priors
# NOTE: `alpha` is rebound here from the scalar 0.05 used by the bootstrap cells.
alpha = np.ones(3) # this is our prior for *each row* of the confusion matrix
beta  = np.ones(3) # this is our prior for the prevalence of each tier, we assume uniformity for simplicity, but this doesn't affect results very much

# MODEL
with pm.Model() as model:
    # prevalence
    # phi[k]: share of studies whose true tier is k
    phi = pm.Dirichlet("phi", a=beta)

    # confusion rows
    # theta[k]: distribution of the LLM's tier given true tier k
    theta = pm.Dirichlet("theta", a=alpha, shape=(3,3))

    # audit likelihood (three independent rows)
    for i in range(3):
        pm.Multinomial(f"audit_row_{i}",
                       n=M_audit[i].sum(),
                       p=theta[i],
                       observed=M_audit[i])

    # big sweep likelihood
    # pi = phi . theta is the implied distribution of LLM-assigned tiers, which
    # is tied to the tier counts observed over all included studies.
    pi = pm.Deterministic("pi", pt.dot(phi, theta))
    pm.Multinomial("big_sweep",
                   n=N_tot,
                   p=pi,
                   observed=T_vec)

    # derived true-tier counts
    N_true = pm.Deterministic("N_true", phi * N_tot)

    # 4 chains of 4000 draws after 2000 tuning steps; the seed is fixed for reproducibility
    idata = pm.sample(4000, tune=2000,
                      chains=4, target_accept=0.9, random_seed=42)

In [None]:
# plot confusion matrix in greyscale
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

# Greyscale heatmap of the audit confusion matrix (rows: human, columns: LLM),
# saved as both SVG and PNG.
tier_keys = ['i', 'ii', 'iii']
tier_ticks = ['I', 'II', 'III']
cm_df = pd.DataFrame(cm, index=tier_keys, columns=tier_keys)

fig, ax = plt.subplots(figsize=(4, 4))
sns.heatmap(cm_df, annot=True, fmt='d', cmap='Greys',
            xticklabels=tier_ticks, yticklabels=tier_ticks,
            cbar=False, linewidths=0.5, linecolor='black', annot_kws={"size": 16},
            ax=ax)
ax.set_xlabel('LLM Predicted Tier')
ax.set_ylabel('Human Labeled Tier')
for figure_path in ('confusion_matrix.svg', 'confusion_matrix.png'):
    fig.savefig(figure_path, dpi=300, bbox_inches='tight')
plt.show()



In [None]:
# Full posterior summary (prevalence, confusion rows, derived counts).
# It is printed explicitly: a notebook cell only displays its LAST expression,
# so a bare az.summary(...) call here would be computed and then discarded.
print(az.summary(idata, var_names=["phi", "theta", "N_true"], hdi_prob=0.95))
az.plot_trace(idata, var_names=["phi", "N_true"])

# idata is the object returned by pm.sample()
az.summary(
    idata,
    var_names=["N_true"],   # the derived vector φ × N_tot we defined
    hdi_prob=0.95,          # highest-density interval
    round_to=0              # no decimals for counts
)

# Data Extraction
Here, we create a batch request to extract various data fields of interest. Note that not all fields were used in the final analysis!

In [None]:
# Load the included studies (with their LLM tiers), one JSON record per line.
with open("included_studies_with_tiers-GPT-5r-high.jsonl", "r") as f:
    included_studies = [json.loads(line) for line in f]

In [None]:
# Instruction text sent with every data-extraction request. It asks the model
# for a single JSON object with the fields listed below; the `#` annotations
# inside the string are part of the prompt itself, not Python comments.
# Surrounding blank lines are removed by .strip().
data_extraction_prompt = """
Given a study's title and abstract, extract the following information according to this structured JSON format. Return only the JSON object, with no additional text or formatting.

{
  "models_used": ["", ...], # (e.g. "GPT-3.5", "GPT-4", "Gemini Pro", etc.)
  "specialty": ["", ...], # (e.g. "emergency medicine", "radiology", "oncology", etc.)
  "subspecialty": [], # (if applicable, can also list things like "medical education" here)
  "types_of_human_evaluators": ["", ...], # (e.g. "medical students", "residents", "fellows", "attendings", etc.)
  "quantitative?": "", # (yes, no, unsure) # if both qualitative and quantitative, use "yes"
  "sample_size": "",
  "task_type": ["", ...], # give a very brief description of the task type, e.g. "diagnosis", "triage", "patient question answering", etc.
  "geographical_region": "", # (e.g. "North America", "Europe", "Asia", or "unsure")
  "evaluation_type(s)": ["", ...], # (capability, etc.)
  "evaluation_metric(s)": ["", ...], # (subjective? likert? etc.)?
  "datasets_used": ["", ...], # (if applicable, e.g. "MedQA", "MIMIC-III", etc.)
  "did_the_llm_outperform_the_human?": {"answer": "", "details": ""}, # (yes, no, unsure)
  "extremely_brief_summary_of_results": "",
  "types_of_data_sources": ["", ...], # (e.g. "clinical notes", "online vignettes", "patient questions", etc.)
}
""".strip()

# Skeleton of a single batch request line; "custom_id" and the body "input"
# are filled in per study below.
template = {
    "custom_id": "",
    "method": "POST",
    "url": "/v1/responses",
    "body": {
        "model": "gpt-5",
        "reasoning": {"effort": "high"},
        "instructions": data_extraction_prompt,
        "input": "",
    },
}

# One request per included study. The custom_id is the study's position in
# included_studies, which is what ties a response back to its study.
batch_request = []
for idx, study in enumerate(included_studies):
    request = copy.deepcopy(template)
    request["custom_id"] = str(idx)
    request["body"]["input"] = "Title: {}\nAbstract: {}".format(study["Title"], study["Abstract"])
    batch_request.append(request)

# Serialize the extraction requests as JSON Lines (one request object per line).
# A single pass is sufficient: the file is opened in "w" mode, so writing the
# same content a second time would only overwrite it with identical bytes.
with open("data-extraction_batch_requests.jsonl", "w") as f:
    for item in batch_request:
        f.write(json.dumps(item) + "\n")

# Load the saved batch output for the extraction job: one JSON object per line.
responses = []
with open("Batch Responses/GPT-5r-high-data-extraction-output.jsonl", "r") as f:
    for line in f:
        responses.append(json.loads(line))

In [None]:
# Attach each parsed extraction result to the study it was generated for.
#
# - Requests were written with custom_id = str(index into included_studies),
#   so the response is matched on custom_id rather than on its position in
#   the output file (batch output lines are not guaranteed to keep the input
#   order). File position is used only if custom_id is missing or unusable.
# - A response that cannot be read or parsed is reported and stored as an
#   empty dict, so it can never inherit the previous study's data.
for position, item in enumerate(responses):
    try:
        study_idx = int(item["custom_id"])
    except (KeyError, TypeError, ValueError):
        study_idx = position

    raw_text = None
    try:
        # output[1] is the item holding the text answer in these responses
        raw_text = item["response"]["body"]["output"][1]["content"][0]["text"]
        # strip Markdown code fences the model may have wrapped around the JSON
        cleaned_text = raw_text.replace("```json", "").replace("```", "")
        extracted_data = json.loads(cleaned_text)
    except (KeyError, IndexError, TypeError, AttributeError, json.JSONDecodeError) as err:
        print(f"Could not parse the response for study {study_idx}: {err!r}")
        if raw_text is not None:
            print(raw_text.replace("```json", "").replace("```", ""))
        extracted_data = {}

    included_studies[study_idx]["extracted_data"] = extracted_data

# Collect, per extracted field, the set of distinct values seen across studies.
# List values contribute each element, dict values are skipped (the key still
# gets an empty set), and anything else is added as a single value.
extracted_fields = {}
for study in included_studies:
    for key, value in study["extracted_data"].items():
        seen_values = extracted_fields.setdefault(key, set())
        if isinstance(value, list):
            seen_values.update(value)
        elif not isinstance(value, dict):
            seen_values.add(value)

In [None]:
# Persist the studies, now carrying "extracted_data", as JSON Lines.
with open("included_studies_with_extracted_data-GPT-5r-high.jsonl", "w") as f:
    f.writelines(json.dumps(study) + "\n" for study in included_studies)

# Parsing Extracted Data
This is where we organize the data the model extracted. (This will be a long section.)

In [None]:
# Three-level specialty hierarchy embedded (as JSON) in the specialty prompt:
#   top-level specialty -> subspecialty -> list of further sub-subspecialties.
# An empty list means the subspecialty has no deeper level. Key order matters
# because the dict is serialized into the prompt as written.
specialty_categories = {
    # --- Medicine and core boards ---
    "Allergy and Immunology": {
        "Allergy and Immunology": [],
    },
    "Anesthesiology": {
        "Adult Cardiac Anesthesiology": [],
        "Critical Care Medicine": [],
        "Health Care Administration, Leadership, and Management": [],
        "Hospice and Palliative Medicine": [],
        "Neurocritical Care": [],
        "Pain Medicine": [],
        "Pediatric Anesthesiology": [],
        "Obstetric Anesthesiology": [],
        "Pediatric Cardiac Anesthesiology": [],
        "Regional Anesthesiology and Acute Pain Medicine": [],
        "Sleep Medicine": [],
    },
    "Dermatology": {
        "Dermatopathology": [],
        "Micrographic Dermatologic Surgery": [],
        "Pediatric Dermatology": [],
        "Cosmetic Dermatologic Surgery": [],
    },
    "Emergency Medicine": {
        "Anesthesiology Critical Care Medicine": [],
        "Emergency Medical Services": [],
        "Health Care Administration, Leadership, and Management": [],
        "Hospice and Palliative Medicine": [],
        "Internal Medicine-Critical Care Medicine": [],
        "Medical Toxicology": [],
        "Neurocritical Care": [],
        "Pain Medicine": [],
        "Pediatric Emergency Medicine": [],
        "Sports Medicine": [],
        "Undersea and Hyperbaric Medicine": [],
    },
    "Family Medicine": {
        "Adolescent Medicine": [],
        "Geriatric Medicine": [],
        "Health Care Administration, Leadership, and Management": [],
        "Hospice and Palliative Medicine": [],
        "Pain Medicine": [],
        "Sleep Medicine": [],
        "Sports Medicine": [],
        "Clinical Informatics": [],
    },
    "Internal Medicine": {
        "Adolescent Medicine": [],
        "Adult Congenital Heart Disease": [],
        "Advanced Heart Failure and Transplant Cardiology": [],
        "Cardiovascular Disease": [],
        "Clinical Cardiac Electrophysiology": [],
        "Critical Care Medicine": [],
        "Endocrinology, Diabetes and Metabolism": [],
        "Gastroenterology": [],
        "Geriatric Medicine": [],
        "Hematology": [],
        "Hospice and Palliative Medicine": [],
        "Infectious Disease": [],
        "Interventional Cardiology": [],
        "Medical Oncology": [],
        "Nephrology": [],
        "Neurocritical Care": [],
        "Pulmonary Disease": [
            "Pulmonary Disease and Critical Care Medicine",
            "Interventional Pulmonology",
        ],
        "Rheumatology": [],
        "Sleep Medicine": [],
        "Sports Medicine": [],
        "Transplant Hepatology": [],
    },
    "Medical Genetics and Genomics": {
        "Clinical Genetics and Genomics": [
            "Molecular Genetic Pathology",
        ],
        "Clinical Biochemical Genetics": [],
        "Laboratory Genetics and Genomics": [],
        "Medical Biochemical Genetics": [],
    },
    "Neurology": {
        "Neurology (Adult)": [
            "Clinical Neurophysiology",
            "Epilepsy",
            "Neuromuscular Medicine",
            "Sleep Medicine",
            "Pain Medicine",
            "Vascular Neurology",
            "Brain Injury Medicine",
            "Neurocritical Care",
            "Neuroimmunology and Multiple Sclerosis",
            "Movement Disorders",
        ],
        "Child Neurology": [],
        "Neurodevelopmental Disabilities": [],
    },
    "Psychiatry": {
        "Addiction Psychiatry": [],
        "Child and Adolescent Psychiatry": [],
        "Consultation-Liaison Psychiatry": [],
        "Forensic Psychiatry": [],
        "Geriatric Psychiatry": [],
        "Brain Injury Medicine": [],
        "Sleep Medicine": [],
        "Pain Medicine": [],
    },
    "Preventive Medicine": {
        "Aerospace Medicine": [],
        "Occupational and Environmental Medicine": [],
        "Public Health and General Preventive Medicine": [],
        "Addiction Medicine": [],
        "Clinical Informatics": [],
        "Health Care Administration, Leadership, and Management": [],
        "Medical Toxicology": [],
        "Undersea and Hyperbaric Medicine": [],
        "Lifestyle Medicine": [],
    },
    "Pediatrics": {
        "Pediatrics (General)": [],
        "Adolescent Medicine": [],
        "Child Abuse Pediatrics": [],
        "Developmental-Behavioral Pediatrics": [],
        "Hospice and Palliative Medicine": [],
        "Medical Toxicology": [],
        "Neonatal-Perinatal Medicine": [],
        "Pediatric Cardiology": [],
        "Pediatric Critical Care Medicine": [],
        "Pediatric Emergency Medicine": [],
        "Pediatric Endocrinology": [],
        "Pediatric Gastroenterology": [],
        "Pediatric Hematology-Oncology": [],
        "Pediatric Hospital Medicine": [],
        "Pediatric Infectious Diseases": [],
        "Pediatric Nephrology": [],
        "Pediatric Pulmonology": [],
        "Pediatric Rheumatology": [],
        "Pediatric Transplant Hepatology": [],
        "Sleep Medicine": [],
        "Sports Medicine": [],
    },

    # --- Surgery and procedure-based boards ---
    "Surgery (American Board of Surgery)": {
        "General Surgery": [
            "Complex General Surgical Oncology",
            "Pediatric Surgery",
            "Surgery of the Hand",
            "Surgical Critical Care",
        ],
        "Vascular Surgery": [
            "Integrated Vascular Surgery",
        ],
    },
    "Thoracic Surgery": {
        "Thoracic and Cardiac Surgery": [
            "Integrated Thoracic Surgery",
        ],
        "Congenital Cardiac Surgery": [],
    },
    "Colon and Rectal Surgery": {
        "Colon and Rectal Surgery": [],
    },
    "Orthopaedic Surgery": {
        "Orthopaedic Sports Medicine": [],
        "Surgery of the Hand": [],
        "Adult Reconstructive Orthopaedic Surgery": [],
        "Foot and Ankle Orthopaedic Surgery": [],
        "Orthopaedic Surgery of the Spine": [],
        "Orthopaedic Trauma": [],
        "Musculoskeletal Oncology": [],
    },
    "Plastic Surgery": {
        "Plastic Surgery": [
            "Craniofacial Surgery",
            "Microsurgery",
        ],
        "Plastic Surgery within the Head and Neck": [],
        "Surgery of the Hand": [],
    },
    "Urology": {
        "Urology": [],
        "Pediatric Urology": [],
        "Female Pelvic Medicine and Reconstructive Surgery": [],
    },
    "Obstetrics and Gynecology": {
        "Obstetrics and Gynecology": [],
        "Complex Family Planning": [],
        "Critical Care Medicine": [],
        "Gynecologic Oncology": [],
        "Maternal-Fetal Medicine": [],
        "Reproductive Endocrinology and Infertility": [],
        "Urogynecology and Reconstructive Pelvic Surgery": [],
    },
    "Otolaryngology - Head and Neck Surgery": {
        "Complex Pediatric Otolaryngology": [],
        "Neurotology": [],
        "Plastic Surgery within the Head and Neck": [],
        "Sleep Medicine": [],
        "Facial Plastic Surgery": [],
        "Rhinology/Nasal and Sinus Care": [],
        "Laryngology and Voice": [],
    },
    "Ophthalmology": {
        "Neuro-Ophthalmology": [],
        "Oculofacial Plastic/Ophthalmic Plastic and Reconstructive Surgery": [],
        "Ophthalmic Pathology": [],
        "Pediatric Ophthalmology": [],
        "Uveitis and Ocular Immunology": [],
    },

    # --- Imaging, pathology and related ---
    "Radiology": {
        "Diagnostic Radiology": [
            "Abdominal Imaging Radiology",
            "Cardiothoracic Radiology",
            "Musculoskeletal Imaging Radiology",
            "Neuroradiology",
            "Nuclear Radiology",
            "Pediatric Radiology",
            "Pain Medicine",
        ],
        "Interventional Radiology and Diagnostic Radiology": [
            "Interventional Radiology (Integrated)",
        ],
        "Radiation Oncology": [],
        "Medical Physics": [
            "Diagnostic Medical Physics",
            "Nuclear Medical Physics",
            "Therapeutic Medical Physics",
        ],
        "Endovascular Surgical Neuroradiology": [],
    },
    "Nuclear Medicine": {
        "Nuclear Medicine": [],
    },
    "Pathology": {
        "Pathology - Anatomic/Clinical (AP/CP)": [],
        "Pathology - Anatomic": [],
        "Pathology - Clinical": [],
        "Blood Banking/Transfusion Medicine": [],
        "Clinical Informatics": [],
        "Cytopathology": [],
        "Dermatopathology": [],
        "Hematopathology": [],
        "Neuropathology": [],
        "Pathology - Chemical": [],
        "Pathology - Forensic": [],
        "Pathology - Medical Microbiology": [],
        "Pathology - Molecular Genetic": [],
        "Pathology - Pediatric": [],
        "Selective Pathology": [],
        "Medical Microbiology Pathology": [],
    },

    # --- PM&R and overlapping procedural/functional areas ---
    "Physical Medicine and Rehabilitation": {
        "Brain Injury Medicine": [],
        "Neuromuscular Medicine": [],
        "Pain Medicine": [],
        "Pediatric Rehabilitation Medicine": [],
        "Spinal Cord Injury Medicine": [],
        "Sports Medicine": [],
    },

    # --- Additional boards / categories ---
    "Neurological Surgery": {
        "Neurological Surgery": [],
        "Neurocritical Care": [],
        "Endovascular Surgical Neuroradiology": [],
    },
    "Psychiatry and Neurology (Shared/Multi-board subs)": {
        "Multi-board Subspecialties": [
            "Brain Injury Medicine",
            "Clinical Informatics",
            "Neurocritical Care",
            "Pain Medicine",
            "Sleep Medicine",
        ],
    },
    "Osteopathic Neuromusculoskeletal Medicine": {
        "Osteopathic Neuromusculoskeletal Medicine": [],
    },
}


# Closed vocabulary of task labels; serialized into the task prompt as written.
task_categories = [
    "Knowledge Retrieval & Clinical QA",
    "Diagnostic Reasoning & Disease Detection",
    "Prognosis & Risk Stratification",
    "Therapeutic Decision Support & Treatment Planning",
    "Clinical Management & Workflow Guidance",
    "Patient-Facing Communication & Education",
    "Psychological & Mental Health Support",
    "Medical Imaging & Signal Interpretation",
    "Clinical Documentation, Reporting & Summarization",
    "Information Extraction, Coding & Classification",
    "Education, Assessment & Simulation",
    "Translation & Simplification",
    "Administrative & Operational Support",
    "Ethical, Safety & Quality Oversight",
    "Other",
]

# Closed vocabulary of evaluation-type labels; serialized into the evaluation prompt.
evaluation_categories = [
    "Accuracy & Core Capability",
    "Factuality and Groundedness",
    "Calibration & Uncertainty Quantification",
    "Alignment & Guideline Concordance",
    "Safety & Risk/Harm Assessment",
    "Reliability and Robustness",
    "Generalizability & External Validity",
    "Bias & Fairness",
    "Human Expert Evaluation & Comparison",
    "Accessibility & Language Coverage",
    "Explainability & Transparency",
    "Patient/End-User Outcomes & Satisfaction",
    "User Experience, Usability & Trust",
    "Education & Training Impact",
    "Efficiency & Workflow Impact",
    "Implementation & Feasibility",
    "Regulatory & Ethical Compliance",
    "Benchmarking & Head-to-Head Evaluations",
    "Content Quality & Readability",
    "Other",
]


# Closed vocabulary of evaluation-metric labels; serialized into the metric prompt.
metric_categories = [
    "Task-level Accuracy & Performance",
    "Safety & Harm Assessment",
    "Clinical Appropriateness & Guideline Concordance",
    "Reliability, Consistency & Reproducibility",
    "Inter-Rater Agreement & Concordance",
    "Statistical Significance & Effect Size",
    "Diagnostic/Screening Operating Characteristics",
    "Readability & Linguistic Complexity",
    "Empathy, Bedside Manner & Emotional Support",
    "Comprehensiveness & Coverage",
    "Clarity, Readability & Communication Quality",
    "User/Clinician Preference & Acceptability",
    "Usability & Workflow Efficiency",
    "Calibration & Confidence",
    "Faithfulness & Fact-Checking",
    "Bias & Fairness",
    "Similarity & Semantic Overlap",
    "Patient Education Readiness",
    "Cognitive/Knowledge Outcomes",
    "Resource Utilization & Cost Efficiency",
    "Other",
]

# Closed vocabulary of dataset/data-source labels; serialized into the dataset prompt.
dataset_categories = [
    # first group of labels
    "Patient-Facing Q&A & FAQs",
    "Clinician Board & Self-Assessment Questions",
    "Clinical Vignettes & Case Reports",
    "Real-World Electronic Health Records",
    "Synthetic / Simulated Clinical Data",
    "Dialogue & Chat Corpora",
    "Imaging Data & Reports",
    "Physiological Signals & Wearables",
    "Laboratory & Pathology Data",
    "Genomics & -Omics",
    "Pharmacology & Medication Knowledge",
    "Clinical Guidelines & Consensus Statements",
    "Knowledge Graphs / Ontologies / Terminologies",
    "Survey Instruments & Psychometric Scales",
    "Educational & Reference Texts",
    "Imaging Challenges & Benchmarks",
    "Research Literature & Abstract Corpora",
    "Clinical Trial Eligibility & Structured Cohorts",
    "Social-Media & Search-Query Data",
    "Regulatory & Administrative Documents",
    # second group of labels
    "Unstructured Clinical Notes",
    "Structured EHR Tables & Event Logs",
    "Operative, Procedure & Anesthesia Reports",
    "Emergency/Prehospital & Triage Notes",
    "Patient Portal & Secure Messages",
    "Telemedicine & Call-Center Transcripts",
    "Audio Dictations & Speech Corpora",
    "Digital Pathology & Whole-Slide Images",
    "ICU/Bedside Monitoring Waveforms & Time-Series",
    "Device & Equipment Telemetry",
    "Claims, Billing & Utilization Data",
    "Terminology & Code-Mapping Corpora (ICD/CPT/RxNorm mappings)",
    # fallback label
    "Other",
]

## Create the batch job to classify free-text fields into the above structures

In [None]:
# Closed vocabulary of human-evaluator classes, serialized into the evaluator
# prompt. Every entry must end with a comma: two adjacent string literals are
# silently concatenated by Python into a single (bogus) class name.
expert_classes = [
    # Physicians
    "Attending (Fellowship-trained)",
    "Attending",
    "Fellow",
    "Resident",
    "Intern",
    "Sub-intern",
    "Medical Student",
    "Student (Undergraduate)",
    "Student (Other)",
    "MD (Unspecified)",
    "DO (Unspecified)",
    "Other Trainee",

    # Non-Physician Clinical Providers
    "Nurse",
    "Advanced Practice Provider (NP/PA)",
    "Pharmacist",
    "Therapist / Rehabilitation / Mental Health",
    "Technician / EMT",
    "Other Clinical Provider",
    "Non-Physician Clinician",

    # Researchers and Academics
    "Faculty / Educator",
    "Reviewer / Panelist",
    "Researcher / Scientist",
    "Student (Post-doc)",
    "Student (Graduate, not medical)",
    "Other Academic",

    # Patients, Caregivers, and Public
    "Patient",
    "Caregiver / Family Member",
    "Lay Public / Participant",
    "Other (Public)",

    # Healthcare Support and Administration
    "Language Services",
    "Medical Coder",
    "Program Administration",
    "Information Specialist",
    "Other Support / Admin",

    # Non-Healthcare Domain Experts
    "Non-healthcare Domain Expert",

    # last resort if unsure
    "Other",
]


# Instruction strings for the classification batch. Each prompt embeds its
# category vocabulary via json.dumps(..., indent=2) and asks for JSON-only
# output. The text is sent to the model verbatim, so any edit to these
# literals changes the request.

# Specialty prompt: asks for a list of paths through specialty_categories
# (top-level specialty, then subspecialty, then sub-subspecialty).
specialty_prompt = (
    "Given a study's title, abstract, and some already extracted data, classify the study's medical specialties and subspecialties according to the following JSON format.\n\n"
    f"{json.dumps(specialty_categories, indent=2)}\n\n"
    "Output a list of lists representing the specialties and subspecialties. For example, if a study is in both Neuroradiology and Cardiology, the output should be:\n\n"
    '[["Radiology", "Diagnostic Radiology", "Neuroradiology"], ["Internal Medicine", "Cardiovascular Disease"]]\n\n'
    "Notice that each specialty must traverse the hierarchy from top to bottom. If a study is only in a top-level specialty with no subspecialty, "
    "It can be represented as a single-item list, e.g. [\"Dermatology\"]. If a study is not clearly in any specialty, return an empty list.\n\n"
    
    "Output only the JSON object, with no additional text or formatting."
)

# Task prompt: asks for a minimal list of task_categories ordered by relevance.
task_prompt = (
    "Given a study's title, abstract, and some already extracted data, classify the tasks the study evaluates according to the following categories.\n\n"
    f"{json.dumps(task_categories, indent=2)}\n\n"
    "Output a list of the most relevant categories, listed by order of relevance (most relevant first). "
    "The list should have the MINIMAL number of elements necessary to describe the task. Be selective and specific. "
    "If none apply, return \"Other\".\n\n"
    "Output only the JSON array, with no additional text or formatting."
)

# Evaluation prompt: asks for a list drawn from evaluation_categories.
evaluation_prompt = (
    "Given a study's title, abstract, and some already extracted data, classify the evaluation types the study uses according to the following categories.\n\n"
    f"{json.dumps(evaluation_categories, indent=2)}\n\n"
    "Output a list of the most relevant categories. If none apply, return an empty list.\n\n"
    "Output only the JSON array, with no additional text or formatting."
)

# Metric prompt: asks for a list drawn from metric_categories.
metric_prompt = (
    "Given a study's title, abstract, and some already extracted data, classify the evaluation metrics the study uses according to the following categories.\n\n"
    f"{json.dumps(metric_categories, indent=2)}\n\n"
    "Output a list of the most relevant categories. If none apply, return an empty list.\n\n"
    "Output only the JSON array, with no additional text or formatting."
)

# Dataset prompt: asks for one object per dataset with 'category', 'access'
# and 'parsed_name_or_description' fields (see the example inside the prompt).
dataset_prompt = (
    "Given a study's title, abstract, and some already extracted data, classify the datasets the study uses according to the following categories.\n\n"
    f"{json.dumps(dataset_categories, indent=2)}\n\n"
    "Output a list of the most relevant categories. If none apply, return an empty list. Additionally, classify whether the dataset is open-access, proprietary, or unsure. "
    "Here is an example output:\n\n"
    '[{"category": "Real-World Electronic Health Records", "access": "proprietary", "parsed_name_or_description": "MIMIC III"}, '
    '{"category": "Clinical Vignettes & Case Reports", "access": "open-access", "parsed_name_or_description": "NEJM case of the month"}...]\n\n'
    "\nOutput only the JSON array, with no additional text or formatting. "
    "The JSON should be structured as the example, with each unique dataset represented as an object with 'category', 'access', and 'parsed_name_or_description' fields. "
    "Assign at MOST one category PER DATASET mentioned in the abstract (if multiple datasets are mentioned, categorize them all. "
    "If no datasets are mentioned, return an empty list for categories)."
)

# Model prompt: the five classes are spelled out inline rather than taken from
# a list. NOTE(review): no request loop visible in this part of the notebook
# sends this prompt - confirm whether it is used elsewhere.
model_prompt = (
    "Given a study's title, abstract, and some already extracted data, classify the LLM models the study uses according to the following classes.\n\n"
    '"open-source", "proprietary", "fine-tuned", "custom", "unsure"\n\n'
    "Output a list of the most relevant categories. If none apply, return an empty list.\n\n"
    "Output only the JSON array, with no additional text or formatting."
)

# Evaluator prompt: asks for a minimal list drawn from expert_classes.
# NOTE(review): the request loop below only references this prompt in a
# commented-out line - confirm whether evaluators are classified elsewhere.
evaluator_prompt = (
    "Given a study's title, abstract, and some already extracted data, classify the types of human evaluators the study uses according to the following classes.\n\n"
    f"{json.dumps(expert_classes, indent=2)}\n\n"
    "Output a list of the most relevant categories. Use the minimal number of categories (and most descriptive) to describe all experts in the study. If none apply, return an empty list.\n\n"
    "Output only the JSON array, with no additional text or formatting."
)

In [None]:
# Reload the studies together with their extracted data (one JSON object per line).
with open("included_studies_with_extracted_data-GPT-5r-high.jsonl", "r") as f:
    included_studies = [json.loads(line) for line in f]

In [None]:
# Skeleton of one classification request; custom_id, instructions and input
# are filled in per (study, prompt) pair below.
template = {
    "custom_id": "",
    "method": "POST",
    "url": "/v1/responses",
    "body": {
        "model": "gpt-5",
        # this is mostly a classification task, so minimal effort should suffice
        "reasoning": {"effort": "minimal"},
        "instructions": "",
        "input": "",
    },
}

# Prompts included in this batch, keyed by the suffix used in custom_id.
# evaluator_prompt is defined above but is not part of this batch.
classification_prompts = {
    "specialty": specialty_prompt,
    "task": task_prompt,
    "evaluation": evaluation_prompt,
    "metric": metric_prompt,
    "dataset": dataset_prompt,
}

# custom_id has the form "<study index>-<prompt name>".
out_jsonl = []
for idx, row in tqdm(enumerate(included_studies), total=len(included_studies)):
    # the input text is the same for every prompt of a given study
    study_input = f"Title: {row['Title']}\nAbstract: {row['Abstract']}\nExtracted data: {json.dumps(row['extracted_data'], indent=2)}"
    for prompt_name, prompt in classification_prompts.items():
        out = copy.deepcopy(template)
        out["custom_id"] = f"{idx}-{prompt_name}"
        out["body"]["instructions"] = prompt
        out["body"]["input"] = study_input
        out_jsonl.append(out)

# Write the requests as JSON Lines.
with open("extracted_data_parsing.jsonl", "w") as f:
    f.writelines(json.dumps(entry) + "\n" for entry in out_jsonl)

In [None]:
# Same request skeleton as the main classification batch.
template = {
    "custom_id": "",
    "method": "POST",
    "url": "/v1/responses",
    "body": {
        "model": "gpt-5",
        # this is mostly a classification task, so minimal effort should suffice
        "reasoning": {"effort": "minimal"},
        "instructions": "",
        "input": "",
    },
}

# A task-only batch: one request per study using task_prompt, with custom_id
# "<study index>-task_specific".
out_jsonl = []
for idx, row in tqdm(enumerate(included_studies), total=len(included_studies)):
    out = copy.deepcopy(template)
    out["custom_id"] = f"{idx}-task_specific"
    out["body"]["instructions"] = task_prompt
    out["body"]["input"] = f"Title: {row['Title']}\nAbstract: {row['Abstract']}\nExtracted data: {json.dumps(row['extracted_data'], indent=2)}"
    out_jsonl.append(out)

# Write the requests as JSON Lines.
with open("extracted_data_parsing_tasks.jsonl", "w") as f:
    f.writelines(json.dumps(entry) + "\n" for entry in out_jsonl)

## Grouping the Models

In [None]:
# Distinct model names collected during extraction, whitespace-trimmed.
# - extracted_fields["models_used"] is a set, whose iteration order for strings
#   can differ between kernel sessions; sorting makes every downstream list
#   reproducible.
# - Names that differ only by surrounding whitespace collapse to one entry.
# - Non-string entries are skipped instead of crashing on .strip().
models = sorted({
    m.strip()
    for m in extracted_fields["models_used"]
    if isinstance(m, str) and m.strip()
})

# Empty bucket (its own list) for every label classify() can return, in display order.
categories = {
    label: []
    for label in (
        'A1_ClosedSource_GeneralLLM',
        'A2_OpenSource_BigLLM',
        'B1_TinyOrDistilled',
        'B2_Midsize_OpenSource',
        'C1_BiomedicalClinical',
        'C2_OtherDomain',
        'D1_FineTuned_ClinicalChat',
        'D2_FineTuned_ResearchPilot',
        'E1_RAG',
        'E2_Agentic',
        'F1_VisionLanguage',
        'F2_AudioLanguage',
        'F3_GeneralMLLM',
        'F4_VisionDxBackbone',
        'G_VisionOnly',
        'H_Embeddings',
        'I_Encoders',
        'J1_TreeEnsemble',
        'J2_LinearProb',
        'J3_Ngram',
        'K_SpeechAssistants',
        'L_OtherNeuralHybrid',
        'Uncategorized',
    )
}

# Helper functions
def has_any(name, keywords):
    """Return True if any keyword occurs in ``name`` as a substring, ignoring case."""
    for keyword in keywords:
        if keyword.lower() in name.lower():
            return True
    return False


# Ordered rule table used by classify(). Each rule is
# (category, keyword groups): the rule fires when EVERY group has at least one
# substring hit in the lower-cased model name, and the first firing rule wins.
_CLASSIFY_RULES = (
    # J categories first for traditional ML
    ('J1_TreeEnsemble', (['random forest', 'xgboost', 'gradient boosting', 'lightgbm'],)),
    ('J2_LinearProb', (['logistic regression', 'support vector machine', 'svm'],)),
    ('J3_Ngram', (['n-gram', 'bag-of-words'],)),
    # Vision-only
    ('G_VisionOnly', (['u-net', 'yolov', 'densenet', 'vgg', 'resnet', 'vision transformer', 'cnn', 'dcnn',
                       'segmentation model', 'yolo', 'transformer-based vision model'],)),
    # Embeddings
    ('H_Embeddings', (['embedding', 'sbert', 'sentence-bert', 'openai embeddings', 'text-embedding'],)),
    # Encoders
    ('I_Encoders', (['bert', 'roberta', 'deberta', 'longformer', 'albert', 'bart', 'electra', 'word2vec',
                     'fasttext', 'gpt-2'],)),
    # Audio language
    ('F2_AudioLanguage', (['whisper', 'audiopalm'],)),
    # Multimodal - vision language
    ('F1_VisionLanguage', (['gpt-4v', 'vision', 'blip', 'flamingo', 'llava', 'multimodal', 'imagebind', 'skin',
                            'm4cxr', 'cxlava', 'cxr', 'llmseg', 'mlmm'],)),
    # General MLLM
    ('F3_GeneralMLLM', (['gpt-4o', 'grok', 'gemini pro vision', 'gemini pro-v', '4o1', 'mlmm'],)),
    # Vision backbone
    ('F4_VisionDxBackbone', (['u-net +', '+ vgg', '+ inception', 'yolo', 'densenet', 'kera-cxr', 'boneview',
                              'medflamingo'],)),
    # RAG
    ('E1_RAG', (['rag', 'self-biorag', 'retrieval-augmented'],)),
    # Agentic
    ('E2_Agentic', (['agent', 'multi-agent', 'agents'],)),
    # Biomedical
    ('C1_BiomedicalClinical', (['med', 'clinical', 'bio', 'onc', 'derm', 'rad', 'gastro', 'surg', 'cardio',
                                'neuro', 'oph', 'peds', 'tcml', 'tcm', 'hpt', 'pharm', 'hiv', 'covid'],)),
    # Other domain specialized
    ('C2_OtherDomain', (['legal', 'finance', 'code', 'chem', 'econ'],)),
    # Fine-tuned clinical chat: needs a chat-style word AND a clinical word
    ('D1_FineTuned_ClinicalChat', (
        ['gpt', 'chat', 'bot', 'assistant', 'ai', 'copilot'],
        ['derm', 'skin', 'drug', 'doctor', 'clinic', 'health', 'rare', 'radiolog', 'surg', 'prostate', 'epilep',
         'cardio', 'ecg', 'rad', 'onc', 'peds', 'eau', 'gastro', 'otolaryng'],
    )),
    # Fine-tuned research pilot/general
    ('D2_FineTuned_ResearchPilot', (['custom', 'fine-tuned', 'pilot', 'research', 'tool'],)),
    # Closed source General LLM
    ('A1_ClosedSource_GeneralLLM', (['gpt', 'chatgpt', 'claude', 'gemini', 'bard', 'grok', 'copilot', 'bing', 'pi',
                                     'perplexity', 'youchat', 'alexa', 'google assistant', 'my ai'],)),
    # Open-source families: mid-size when a size tag is present, large otherwise
    ('B2_Midsize_OpenSource', (
        ['llama', 'mixtral', 'falcon', 'mistral', 'qwen', 'deepseek', 'wizardlm', 'vicuna', 'gemma', 'yi',
         'baichuan', 'bloom', 'reka', 'orca'],
        ['7b', '8b', '9b', '3b', '4b', '5b', '6b'],
    )),
    ('A2_OpenSource_BigLLM', (['llama', 'mixtral', 'falcon', 'mistral', 'qwen', 'deepseek', 'wizardlm', 'vicuna',
                               'gemma', 'yi', 'baichuan', 'bloom', 'reka', 'orca'],)),
    # Tiny / distilled
    ('B1_TinyOrDistilled', (['tiny', 'mini', 'distil', 'phi', 'orca_mini', 'o1-mini', 'bc-slm', 'slm', 'small'],)),
    # Speech assistants
    ('K_SpeechAssistants', (['alexa', 'google assistant', 'bing chatbot', 'bing ai', 'bing chat', 'google voice'],)),
    # Other neural / hybrid
    ('L_OtherNeuralHybrid', (['lstm', 'cnn', 'clip', 'fft', 'transformer'],)),
)


def classify(name):
    """Map a model name to one category label using ordered substring rules.

    The name is lower-cased and checked against ``_CLASSIFY_RULES`` from top to
    bottom; the first rule whose keyword groups all match decides the label.
    Names that match nothing are labelled ``'Uncategorized'``.

    Matching is plain substring containment, so short keywords (for example
    'pi', 'rag', 'med', 'ai') also hit inside longer words.

    NOTE(review): because the first match wins, some keywords can never decide
    the outcome: 'yolo'/'densenet' under F4 and 'cnn' under L are already
    claimed by G_VisionOnly, 'mlmm' under F3 by F1, and 'alexa',
    'google assistant' and every 'bing ...' keyword under K by the closed-source
    list. Confirm whether that ordering is intended before changing it, since
    any reordering changes the resulting classification.
    """
    lowered = name.lower()
    for category, keyword_groups in _CLASSIFY_RULES:
        if all(has_any(lowered, group) for group in keyword_groups):
            return category
    return 'Uncategorized'

# Drop every model name into the bucket chosen by classify()
for model_name in models:
    categories[classify(model_name)].append(model_name)

# Keep only the buckets that received at least one model
categories = {label: members for label, members in categories.items() if members}

## Model Brand Extraction

In [None]:
import re
from collections import OrderedDict

# ──────────────────────────────────────────────────────────────────────────────
# Brand patterns: ordered list of (label, compiled regex).
# - Consumers take the FIRST pattern that matches, so specific/versioned
#   patterns must come *before* broad brand buckets.
# - Names appear both as API ids ("org/Model-3.1-8B-Instruct",
#   "claude-3-5-sonnet-20240620") and as prose ("Claude 3.5 Sonnet",
#   "ChatGPT4"), so version patterns accept hyphens, underscores, spaces, or
#   no separator at all where that is unambiguous.
# ──────────────────────────────────────────────────────────────────────────────
brand_patterns = [
    # ─── OpenAI family (specific before generic) ────────────────────────────
    # 4o / Omni / o-series (o1/o3) are all collected under the 'GPT-4o' label.
    ('GPT-4o',          re.compile(r'\b(?:gpt|chat[\s\-]?gpt)?[\s\-]?(?:4[o0]|4-?omni)\b|\b(?:o1|o3)(?:[\-\w]*)\b', re.I)),
    ('GPT-4.1',         re.compile(r'\b(?:gpt|chat[\s\-]?gpt)[\s\-]?4\.1(?:[\s\-]?(?:mini|nano))?\b', re.I)),
    # The version digit may follow "ChatGPT" directly ("ChatGPT4", "ChatGPT3.5")
    # or after any text that ends at a word boundary ("ChatGPT-4", "ChatGPT (GPT-4)").
    ('ChatGPT-4',       re.compile(r'Chat[\s\-]?GPT(?:.*\b)?4(?:\.0|v)?\b', re.I)),
    ('ChatGPT-3.5',     re.compile(r'Chat[\s\-]?GPT(?:.*\b)?3\.5\b', re.I)),
    # "GPT-4V" / "GPT-4Vision" belong to the GPT-4 bucket (the ChatGPT-4 pattern
    # accepts a trailing "v" as well); any other letter right after the 4 is rejected.
    ('GPT-4',           re.compile(r'^\s*GPT[\s\-]?4(?:v(?:ision)?)?(?![a-zA-Z])', re.I)),
    ('GPT-3.5',         re.compile(r'^\s*GPT[\s\-]?3\.5(?![a-zA-Z0-9])', re.I)),

    # ─── Anthropic Claude (version-aware) ───────────────────────────────────
    # Matches API ids and prose forms alike:
    #   claude-3-5-sonnet-20240620, claude-3-opus, claude-2.1,
    #   "Claude 3.5 Sonnet", "Claude 3 Opus", "Claude Sonnet 3.5", "Claude 2"
    # An optional tier word and spaces/hyphens may sit between "claude" and the
    # version digit.
    ('Claude 3.5',      re.compile(r'\bclaude[\-\w]*(?:\s+(?:opus|sonnet|haiku|instant))?[\s\-]*3(?:[.\-]5)[\-\w]*\b', re.I)),
    ('Claude 3',        re.compile(r'\bclaude[\-\w]*(?:\s+(?:opus|sonnet|haiku|instant))?[\s\-]*3[\-\w]*(?:opus|sonnet|haiku)?\b', re.I)),
    ('Claude 2',        re.compile(r'\bclaude[\-\w]*(?:\s+(?:opus|sonnet|haiku|instant))?[\s\-]*2(?:\.\d+)?[\-\w]*(?:instant)?\b', re.I)),
    ('Claude (unspecified)', re.compile(r'\bClaude\b', re.I)),

    # ─── Google Gemini (version-aware) ──────────────────────────────────────
    # Matches:
    #   gemini-1.5-pro|flash|ultra|vision, gemini-1.0-pro|ultra, gemini-nano
    ('Gemini 1.5',      re.compile(r'\bgemini[\s\-]?1\.5[\-\w]*\b', re.I)),
    ('Gemini 1.0',      re.compile(r'\bgemini[\s\-]?1\.0[\-\w]*\b', re.I)),
    ('Gemini Nano',     re.compile(r'\bgemini[\s\-]?nano\b', re.I)),
    ('Gemini',          re.compile(r'\bGemini\b', re.I)),

    # ─── Meta LLaMA (version-aware + subfamilies) ───────────────────────────
    # Matches:
    #   meta-llama/Llama-3.1-8B-Instruct, Llama-3-70B, Llama 2, Code Llama, Llama Guard
    ('Code Llama',      re.compile(r'\bcode[\s\-]?llama\b', re.I)),
    ('Llama Guard',     re.compile(r'\bllama[\s\-]?guard\b', re.I)),
    ('LLaMA 3.x',       re.compile(r'\b(?:meta[\s\-]?)?llama[\s\-]?3(?:\.\d+)?\b', re.I)),
    ('LLaMA 2',         re.compile(r'\b(?:meta[\s\-]?)?llama[\s\-]?2\b', re.I)),
    ('LLaMA',           re.compile(r'\b(?:meta[\s\-]?)?llama\b', re.I)),

    # ─── Mistral AI ─────────────────────────────────────────────────────────
    ('Mixtral',         re.compile(r'\bmixtral\b', re.I)),
    ('Mistral',         re.compile(r'\bmistral(?:ai)?\b', re.I)),

    # ─── Alibaba Qwen ───────────────────────────────────────────────────────
    # Matches:
    #   Qwen2.5-72B-Instruct, Qwen2-VL, Qwen1.5, Qwen-7B-Chat
    # "qwen" must start at a word boundary, so a fused prefix such as
    # "CodeQwen" is NOT matched by this pattern.
    ('Qwen',            re.compile(r'\bqwen(?:[\-\s]?(?:1\.5|2(?:\.5)?|vl|coder|audio|vision)?[\w\-]*)?\b', re.I)),

    # ─── Microsoft Phi ──────────────────────────────────────────────────────
    # Matches: Phi-3, Phi-3.5, phi-3-mini-4k-instruct
    ('Phi',             re.compile(r'\bphi[\s\-]?\d', re.I)),

    # ─── Google Gemma ───────────────────────────────────────────────────────
    # Matches: gemma-2b-it, gemma-7b, codegemma
    ('CodeGemma',       re.compile(r'\bcode[\s\-]?gemma\b', re.I)),
    ('Gemma',           re.compile(r'\bgemma(?:[\-\s]?\d)?\b', re.I)),

    # ─── Databricks ─────────────────────────────────────────────────────────
    ('DBRX',            re.compile(r'\bdbrx\b', re.I)),

    # ─── DeepSeek (incl. Coder) ─────────────────────────────────────────────
    ('DeepSeek',        re.compile(r'\bdeepseek(?:[\-\s]?(?:coder|v\d(?:\.\d)?))?\b', re.I)),

    # ─── ZhipuAI GLM / ChatGLM ──────────────────────────────────────────────
    ('GLM / ChatGLM',   re.compile(r'\b(?:chat)?glm[\w\-\.\s]*\b', re.I)),

    # ─── 01.AI Yi ───────────────────────────────────────────────────────────
    # Guard with digit to avoid "yi" false matches.
    ('Yi',              re.compile(r'(?<![a-z])yi[\-\s]?\d', re.I)),

    # ─── Baichuan ───────────────────────────────────────────────────────────
    ('Baichuan',        re.compile(r'\bbaichuan\b', re.I)),

    # ─── xAI Grok ───────────────────────────────────────────────────────────
    # NOTE(review): a digit is required, so an unversioned "Grok" is not
    # matched here - confirm that is intended.
    ('Grok',            re.compile(r'(?<![a-z])grok[\-\s]?\d', re.I)),

    # ─── LLaVA (VLM) ────────────────────────────────────────────────────────
    ('LLaVA',           re.compile(r'\bllava\b', re.I)),

    # ─── MosaicML / Databricks MPT ──────────────────────────────────────────
    ('MPT',             re.compile(r'\bmpt[\-\s]?\d', re.I)),

    # ─── EleutherAI & community families ────────────────────────────────────
    ('GPT-NeoX',        re.compile(r'\bgpt[\-\s]?neox\b', re.I)),
    ('GPT-J',           re.compile(r'\bgpt[\-\s]?j\b', re.I)),
    ('Pythia',          re.compile(r'\bpythia[\-\s]?\d', re.I)),
    ('RedPajama',       re.compile(r'\bredpajama\b', re.I)),

    # ─── AI21 Labs ──────────────────────────────────────────────────────────
    ('Jamba (AI21)',    re.compile(r'\bjamba\b', re.I)),
    ('Jurassic (AI21)', re.compile(r'\bjurassic\b', re.I)),

    # ─── Cohere ─────────────────────────────────────────────────────────────
    # Matches: command, command-r, command-r+, command-light
    ('Cohere Command',  re.compile(r'\b(?:cohere[\-\s]?)?(?:command(?:[\s\-]?(?:r\+?|light)?)|c4ai\-command)\b', re.I)),

    # ─── NVIDIA ─────────────────────────────────────────────────────────────
    ('Nemotron (NVIDIA)', re.compile(r'\bnemotron\b', re.I)),

    # ─── TII UAE ────────────────────────────────────────────────────────────
    ('Falcon',          re.compile(r'\bfalcon[\-\s]?\d', re.I)),

    # ─── LMSYS & popular fine-tunes ─────────────────────────────────────────
    ('Vicuna',          re.compile(r'\bvicuna\b', re.I)),
    ('WizardLM',        re.compile(r'\bwizard(?:lm|coder)\b', re.I)),
    ('Zephyr',          re.compile(r'\bzephyr\b', re.I)),
    ('Nous Hermes',     re.compile(r'\b(?:nous[\-\s]?)?hermes\b|openhermes', re.I)),

    # ─── Reka AI ────────────────────────────────────────────────────────────
    ('Reka',            re.compile(r'\breka\b', re.I)),

    # ─── Perplexity (API model ids like pplx-70b-online) ───────────────────
    ('Perplexity (pplx)', re.compile(r'\b(?:perplexity|pplx)[\w\-]*\b', re.I)),

    # ─── Alibaba Tongyi (other than Qwen) ───────────────────────────────────
    ('Tongyi',          re.compile(r'\btongyi\b', re.I)),

    # ─── OpenBMB MiniCPM ────────────────────────────────────────────────────
    ('MiniCPM',         re.compile(r'\bminicpm\b', re.I)),

    # ─── BigScience BLOOM ───────────────────────────────────────────────────
    ('BLOOM',           re.compile(r'\bbloomz?\b', re.I)),

    # ─── RWKV ───────────────────────────────────────────────────────────────
    ('RWKV',            re.compile(r'\brwkv\b', re.I)),

    # ─── Google Bard (legacy) ───────────────────────────────────────────────
    ('Bard',            re.compile(r'\bBard\b', re.I)),

    # ─── Assistants / bots (broad) ──────────────────────────────────────────
    ('Bing Chat',       re.compile(r'\bBing\b', re.I)),
    ('Pi',              re.compile(r'\bPi\b', re.I)),
    ('Alexa',           re.compile(r'Alexa', re.I)),
    ('Google Assistant',re.compile(r'Google Assistant', re.I)),

    # ─── OpenAI catch-all ───────────────────────────────────────────────────
    ('OpenAI (other)',  re.compile(r'\bOpenAI\b', re.I)),

    # ─── Broad buckets LAST ─────────────────────────────────────────────────
    ('ChatGPT-unspecified',
                        re.compile(r'\bChat[\s\-]?GPT\b', re.I)),
    ('GPT-unspecified', re.compile(r'^\s*GPT\b(?![\s\-]?(?:4|3\.5))', re.I)),
]

def categorize_brands(model_list, patterns=None):
    """
    Group model names into per-brand buckets.

    Args:
        model_list: Iterable of model-name strings.
        patterns: Optional iterable of ``(brand_label, compiled_regex)`` pairs
            to match against. Defaults to the module-level `brand_patterns`.

    Returns:
        OrderedDict mapping brand -> [matching original names].

    Notes:
      - First-match wins (ordering of the pattern table matters).
      - Every brand label gets exactly one key, positioned where the label
        first appears in the pattern table; 'Other' collects leftovers.
    """
    # Materialise once: the table is walked for the keys and again per name.
    patterns = list(brand_patterns if patterns is None else patterns)

    # A label repeated in the table keeps its first position; re-assigning it
    # only swaps one empty list for another, so no explicit de-duplication
    # pass is needed.
    buckets = OrderedDict((brand, []) for brand, _ in patterns)
    buckets['Other'] = []

    for name in model_list:
        label = next(
            (brand for brand, pattern in patterns if pattern.search(name)),
            'Other',
        )
        buckets[label].append(name)

    return buckets

def categorize_brand(model_name: str, patterns=None) -> str:
    """
    Return the brand label of the first pattern that matches this model name.

    Args:
        model_name: The model name to classify.
        patterns: Optional iterable of ``(brand_label, compiled_regex)`` pairs
            to match against. Defaults to the module-level `brand_patterns`.

    Returns:
        The label of the first matching pattern (ordering of the table
        matters), or 'Other' if no pattern matches.
    """
    if patterns is None:
        patterns = brand_patterns
    return next(
        (brand for brand, pattern in patterns if pattern.search(model_name)),
        'Other',
    )

## Putting It All Together

In [None]:
# Read the batch output file: one JSON document per line.
responses = []
with open("Batch Responses/GPT-5r-minimal-data-classification-output.jsonl", "r") as f:
    for line in f:
        responses.append(json.loads(line))

In [None]:
# Give every included study an empty `processed_data` record: one fresh list
# per field, to be filled in by the parsing cells that follow.
processed_fields = (
    "models_types",
    "model_categories",
    "specialties",
    "task_types",
    "evaluation_types",
    "evaluation_metrics",
    "human_evaluators",
    "dataset_types",
)

for study in included_studies:
    # Build a new dict (and new lists) per study so no state is shared.
    study["processed_data"] = {field: [] for field in processed_fields}

In [None]:
# Load the batch requests that were submitted: one JSON document per line.
with open("extracted_data_parsing.jsonl", "r") as f:
    original_requests = [json.loads(line) for line in f]

In [None]:
# GPT-5r-minimal had formatting issues with 16 out of 23045 responses (0.07% error rate!!), so we manually fixed the formatting (not content) of these:
# Keys are 0-based positions in `responses` (the index the parsing loop gets from
# `enumerate`); values are JSON strings that the loop passes to `json.loads`
# whenever the model output at that position cannot be parsed.

corrected_failures = {
    569: '{"categories": ["Knowledge Graphs / Ontologies / Terminologies", "Pharmacology & Medication Knowledge", "Educational & Reference Texts"], "access": "proprietary"}',
    6094: '{"categories": ["Clinician Board & Self-Assessment Questions", "Educational & Reference Texts"], "access": "unsure"}',
    7419: '{"categories": ["Clinician Board & Self-Assessment Questions", "Educational & Reference Texts"], "access": "proprietary"}',
    7779: '{"categories": ["Clinician Board & Self-Assessment Questions", "Patient-Facing Q&A & FAQs", "Research Literature & Abstract Corpora", "Educational & Reference Texts"], "access": "open-access"}',
    8929: '{"categories": ["Clinician Board & Self-Assessment Questions", "Clinical Vignettes & Case Reports", "Educational & Reference Texts"], "access": "proprietary"}',
    9309: '{"categories": ["Imaging Data & Reports", "Real-World Electronic Health Records"], "access": "unsure"}',
    # Unlike the other entries, this one is a list of two-element lists rather than
    # a {"categories", "access"} object (presumably a specialty response - TODO confirm).
    9645: '[["Radiology", "Diagnostic Radiology"], ["Pathology", "Pathology - Anatomic/Clinical (AP/CP)"]]',
    9704: '{"categories": ["Clinical Vignettes & Case Reports", "Educational & Reference Texts"], "access": "unsure"}',
    11924: '{"categories": ["Real-World Electronic Health Records", "Research Literature & Abstract Corpora", "Clinical Trial Eligibility & Structured Cohorts"], "access": "unsure"}',
    13669: '{"categories": ["Synthetic / Simulated Clinical Data", "Dialogue & Chat Corpora", "Survey Instruments & Psychometric Scales", "Clinical Vignettes & Case Reports", "Social-Media & Search-Query Data"], "access": "unsure"}',
    15124: '{"categories": ["Clinical Vignettes & Case Reports", "Real-World Electronic Health Records"], "access": "unsure"}',
    16789: '{"categories": ["Patient-Facing Q&A & FAQs", "Dialogue & Chat Corpora", "Social-Media & Search-Query Data"], "access": "unsure"}',
    18244: '{"categories": ["Clinician Board & Self-Assessment Questions", "Educational & Reference Texts", "Research Literature & Abstract Corpora"], "access": "unsure"}',
    19504: '{"categories": ["Clinician Board & Self-Assessment Questions", "Educational & Reference Texts"], "access": "open-access"}',
    20834: '{"categories": ["Research Literature & Abstract Corpora", "Clinical Guidelines & Consensus Statements"], "access": "open-access"}',
    22994: '{"categories": ["Research Literature & Abstract Corpora", "Imaging Data & Reports", "Educational & Reference Texts", "Clinical Vignettes & Case Reports", "Real-World Electronic Health Records"], "access": "open-access"}'

}

# Attach every parsed batch response to the study it was generated for.

# Second "-"-separated component of a response's custom_id -> the
# `processed_data` field that stores that classification.
field_for_data_type = {
    "specialty": "specialties",
    "task": "task_types",
    "evaluation": "evaluation_types",
    "metric": "evaluation_metrics",
    "dataset": "dataset_types",
}

# Start from a fresh list so the error handler below always has somewhere to
# record problems (and so re-running the cell does not accumulate duplicates).
failures = []

for i, response in tqdm(enumerate(responses), total=len(responses)):
    try:
        try:
            data = json.loads(response["response"]["body"]["output"][1]["content"][0]["text"])
        except (KeyError, IndexError, TypeError, ValueError):
            # Output missing or not valid JSON: use the hand-corrected string for
            # this position. If none exists, the KeyError is recorded by the
            # outer handler. (A bare `except:` here would also trap Ctrl-C.)
            data = json.loads(corrected_failures[i])

        data_type = response["custom_id"].split("-")[1]

        # find original study title/abstract to match from the batch request
        if original_requests[i]["custom_id"] != response["custom_id"]:
            raise ValueError("Mismatched request/response!")
        request_input = original_requests[i]["body"]["input"]

        # find matching study (by title/abstract)
        # this isn't the most efficient thing on the planet but computers are fast :)
        for study in included_studies:
            title = study.get("Title")
            abstract = study.get("Abstract")

            if title and abstract and title in request_input:
                # A non-string abstract (sometimes there is no abstract) is
                # accepted on the title match alone; a string abstract must
                # also appear in the request input.
                if not isinstance(abstract, str) or abstract in request_input:
                    break
        else:
            # Loop ended without `break` (or there were no studies at all).
            raise LookupError("Could not find matching study for response!")

        field = field_for_data_type.get(data_type)
        if field is None:
            print(f"Unknown data type: {data_type}")
        else:
            study["processed_data"][field] = data
    except Exception as e:
        print(f"Failed to process response {i}: {e}")
        failures.append((i, str(e), response))

## Evaluator classification (run and parsed separately)
This part is extra: we initially did not put the evaluator type in the prompt list, so we run and parse it separately here. (Note that this does not change the methodology at all, since each classification task is an individual batch request anyway.)

In [None]:
# Skeleton of one batch request; custom_id, instructions and input are filled
# in per study below.
template = {
    "custom_id": "",
    "method": "POST",
    "url": "/v1/responses",
    "body": {
        "model": "gpt-5",
        "reasoning": {"effort": "minimal"},  # this is mostly a classification task, so minimal effort should suffice
        "instructions": "",
        "input": "",
    },
}

# Only the evaluator prompt is submitted in this batch.
prompts_by_name = [("evaluator", evaluator_prompt)]

out_jsonl = []

for idx, row in tqdm(enumerate(included_studies), total=len(included_studies)):
    # The study text is the same for every prompt, so build it once per study.
    study_input = (
        f"Title: {row['Title']}\n"
        f"Abstract: {row['Abstract']}\n"
        f"Extracted data: {json.dumps(row['extracted_data'], indent=2)}"
    )
    for prompt_name, prompt in prompts_by_name:
        request = copy.deepcopy(template)
        request["custom_id"] = f"{idx}-{prompt_name}"
        request["body"]["instructions"] = prompt
        request["body"]["input"] = study_input
        out_jsonl.append(request)

# One JSON request per line.
with open("extracted_data_parsing_evaluator_only.jsonl", "w") as f:
    f.writelines(json.dumps(entry) + "\n" for entry in out_jsonl)

In [None]:
# Load the evaluator-only requests and their batch responses, then store each
# parsed response on the study it was generated for.
with open("extracted_data_parsing_evaluator_only.jsonl", "r") as f:
    original_requests = [json.loads(line) for line in f]

# The response file belongs in `responses` only. Appending its lines to
# `original_requests` as well would hide a length mismatch between the two.
with open("Batch Responses/GPT-5r-minimal-expert-classification-output.jsonl", "r") as f:
    responses = [json.loads(line) for line in f]

failures = []

# Second "-"-separated component of a response's custom_id -> the
# `processed_data` field that stores that classification.
field_for_data_type = {
    "specialty": "specialties",
    "task": "task_types",
    "evaluation": "evaluation_types",
    "metric": "evaluation_metrics",
    "dataset": "dataset_types",
    "evaluator": "human_evaluators",
}

for i, response in tqdm(enumerate(responses), total=len(responses)):
    try:
        # Deliberately no fallback to `corrected_failures`: its keys are
        # positions in a different batch, so using them here could silently
        # attach an unrelated correction. Unparseable output is recorded in
        # `failures` instead.
        data = json.loads(response["response"]["body"]["output"][1]["content"][0]["text"])

        data_type = response["custom_id"].split("-")[1]

        # find original study title/abstract to match from the batch request
        if original_requests[i]["custom_id"] != response["custom_id"]:
            raise ValueError("Mismatched request/response!")
        request_input = original_requests[i]["body"]["input"]

        # find matching study (by title/abstract)
        # this isn't the most efficient thing on the planet but computers are fast :)
        for study in included_studies:
            title = study.get("Title")
            abstract = study.get("Abstract")

            if title and abstract and title in request_input:
                # A non-string abstract (sometimes there is no abstract) is
                # accepted on the title match alone; a string abstract must
                # also appear in the request input.
                if not isinstance(abstract, str) or abstract in request_input:
                    break
        else:
            # Loop ended without `break` (or there were no studies at all).
            raise LookupError("Could not find matching study for response!")

        field = field_for_data_type.get(data_type)
        if field is None:
            print(f"Unknown data type: {data_type}")
        else:
            study["processed_data"][field] = data
    except Exception as e:
        print(f"Failed to process response {i}: {e}")
        failures.append((i, str(e), response))

# Finally putting it all together!

In [None]:
# Load the dataset re-classification requests and responses, then store each
# parsed response on the study it was generated for.
with open("extracted_data_parsing_dataset.jsonl", "r") as f:
    original_requests = [json.loads(line) for line in f]

with open("Batch Responses/GPT-5r-minimal-dataset-reclassification_output.jsonl", "r") as f:
    responses = [json.loads(line) for line in f]

failures = []

# Second "-"-separated component of a response's custom_id -> the
# `processed_data` field that stores that classification.
field_for_data_type = {
    "specialty": "specialties",
    "task": "task_types",
    "evaluation": "evaluation_types",
    "metric": "evaluation_metrics",
    "dataset_specific": "dataset_types",
    "evaluator": "human_evaluators",
}

for i, response in tqdm(enumerate(responses), total=len(responses)):
    try:
        # Deliberately no fallback to `corrected_failures`: its keys are
        # positions in a different batch, so using them here could silently
        # attach an unrelated correction. Unparseable output is recorded in
        # `failures` instead.
        data = json.loads(response["response"]["body"]["output"][1]["content"][0]["text"])

        data_type = response["custom_id"].split("-")[1]

        # find original study title/abstract to match from the batch request
        if original_requests[i]["custom_id"] != response["custom_id"]:
            raise ValueError("Mismatched request/response!")
        request_input = original_requests[i]["body"]["input"]

        # find matching study (by title/abstract)
        # this isn't the most efficient thing on the planet but computers are fast :)
        for study in included_studies:
            title = study.get("Title")
            abstract = study.get("Abstract")

            if title and abstract and title in request_input:
                # A non-string abstract (sometimes there is no abstract) is
                # accepted on the title match alone; a string abstract must
                # also appear in the request input.
                if not isinstance(abstract, str) or abstract in request_input:
                    break
        else:
            # Loop ended without `break` (or there were no studies at all).
            raise LookupError("Could not find matching study for response!")

        field = field_for_data_type.get(data_type)
        if field is None:
            print(f"Unknown data type: {data_type}")
        else:
            study["processed_data"][field] = data
    except Exception as e:
        print(f"Failed to process response {i}: {e}")
        failures.append((i, str(e), response))

In [None]:
# Load the task-specific requests and responses, then store each parsed
# response on the study it was generated for.
with open("extracted_data_parsing_tasks.jsonl", "r") as f:
    original_requests = [json.loads(line) for line in f]

with open("Batch Responses/GPT-5r-minimal-task-specific-output.jsonl", "r") as f:
    responses = [json.loads(line) for line in f]

failures = []

# Second "-"-separated component of a response's custom_id -> the
# `processed_data` field that stores that classification.
field_for_data_type = {
    "specialty": "specialties",
    "task_specific": "task_types",
    "evaluation": "evaluation_types",
    "metric": "evaluation_metrics",
    "dataset_specific": "dataset_types",
    "evaluator": "human_evaluators",
}

for i, response in tqdm(enumerate(responses), total=len(responses)):
    try:
        # Deliberately no fallback to `corrected_failures`: its keys are
        # positions in a different batch, so using them here could silently
        # attach an unrelated correction. Unparseable output is recorded in
        # `failures` instead.
        data = json.loads(response["response"]["body"]["output"][1]["content"][0]["text"])

        data_type = response["custom_id"].split("-")[1]

        # find original study title/abstract to match from the batch request
        if original_requests[i]["custom_id"] != response["custom_id"]:
            raise ValueError("Mismatched request/response!")
        request_input = original_requests[i]["body"]["input"]

        # find matching study (by title/abstract)
        # this isn't the most efficient thing on the planet but computers are fast :)
        for study in included_studies:
            title = study.get("Title")
            abstract = study.get("Abstract")

            if title and abstract and title in request_input:
                # A non-string abstract (sometimes there is no abstract) is
                # accepted on the title match alone; a string abstract must
                # also appear in the request input.
                if not isinstance(abstract, str) or abstract in request_input:
                    break
        else:
            # Loop ended without `break` (or there were no studies at all).
            raise LookupError("Could not find matching study for response!")

        field = field_for_data_type.get(data_type)
        if field is None:
            print(f"Unknown data type: {data_type}")
        else:
            study["processed_data"][field] = data
    except Exception as e:
        print(f"Failed to process response {i}: {e}")
        failures.append((i, str(e), response))

In [None]:
# For every model a study used, record the key of `categories` whose value
# contains it together with its brand, then write all studies out as JSON lines.
for study in included_studies:
    model_types = []
    model_brands = []

    for model in study["extracted_data"]["models_used"]:
        # Empty model names carry no information; skip them.
        if model == "":
            continue

        for key, value in categories.items():
            if model in value:
                model_types.append(key)
                model_brands.append(categorize_brand(model))
                break
        else:
            # No entry of `categories` contains this model.
            raise ValueError(f"Model '{model}' not found in any category.")

    study["processed_data"].update(
        {"models_types": model_types, "model_categories": model_brands}
    )

# One JSON document per study.
with open("final_processed_studies.jsonl", "w") as f:
    f.writelines(json.dumps(study) + "\n" for study in included_studies)