Here we want to provide statistical evidence that Exomiser's top-1 performance differs significantly from that of each of the other models. To do this, we use a row-level (paired) McNemar test.

In [None]:
import os
import pandas as pd
import ast
from statsmodels.stats.contingency_tables import mcnemar

In [None]:
# Folder containing the result TSV files to compare.
full_results = "../data/results/ex_vs_llm_review/full_results/"
# Collect all TSV file names in the folder. os.listdir returns entries in
# arbitrary order, so sort them to keep downstream processing and printed
# output reproducible across runs and machines.
tsv_files = sorted(f for f in os.listdir(full_results) if f.endswith(".tsv"))

# Helper to extract top-1 is_correct from a row
def get_top1_is_correct(row):
    """Return whether the first entry of a row's ``scored`` list is a correct rank-1 hit.

    Args:
        row: Mapping-like object (e.g. a DataFrame row) whose ``scored`` entry
            is either a sequence of dicts or the string repr of one. Only the
            first dict is inspected; it is expected to carry ``rank`` and
            ``is_correct`` keys.

    Returns:
        bool: truthiness of the first entry's ``is_correct`` when its ``rank``
        equals 1. ``False`` when ``scored`` is empty, the first entry is not
        rank 1, or the value is missing or cannot be parsed.
    """
    try:
        scored = row['scored']
        if isinstance(scored, str):
            # The list is stored as a Python literal in the TSV cell.
            scored = ast.literal_eval(scored)
        if scored and scored[0]['rank'] == 1:
            # Normalise to a real bool so callers can safely use the result
            # as a 0/1 index even if the stored value is None or an int.
            return bool(scored[0]['is_correct'])
    except (ValueError, SyntaxError, TypeError, KeyError, IndexError):
        # Best effort: a missing, unparsable or mis-shaped cell counts as
        # "not correct". Only parse and lookup errors are absorbed, so
        # unrelated failures are no longer silently hidden.
        return False
    return False

# Load all models and extract top-1 is_correct for each.
# model_results maps a model name (file name without its extension) to a list
# holding one top-1 correctness value per row of that model's TSV.
model_results = {}
for fname in tsv_files:
    # splitext removes only the trailing extension, whereas str.replace would
    # also strip a ".tsv" occurring elsewhere in the file name.
    model_name = os.path.splitext(fname)[0]
    df = pd.read_csv(os.path.join(full_results, fname), sep='\t')
    model_results[model_name] = df.apply(get_top1_is_correct, axis=1).tolist()

# Pick exomiser as reference (first model whose name contains 'exomiser').
exomiser_candidates = [k for k in model_results if 'exomiser' in k]
if not exomiser_candidates:
    # Fail with an explicit message instead of a bare IndexError.
    raise ValueError(
        f"No results file with 'exomiser' in its name found in {full_results!r}"
    )
exomiser_key = exomiser_candidates[0]
exomiser = model_results[exomiser_key]

# Run McNemar test for exomiser vs each other model.
# NOTE(review): the p-values printed below are not adjusted for multiple
# comparisons.
for model, results in model_results.items():
    if model == exomiser_key:
        continue
    # McNemar is a paired test and rows are paired by position. zip() would
    # silently truncate to the shorter list, so refuse to compare models
    # whose row counts differ.
    # NOTE(review): equal length does not prove the same cases appear in the
    # same order in every TSV - confirm against the data.
    if len(results) != len(exomiser):
        print(
            f"Skipping {model}: {len(results)} rows vs {len(exomiser)} rows "
            f"for {exomiser_key}; rows cannot be paired\n"
        )
        continue
    # Build contingency table: table[i][j] counts rows where exomiser's top-1
    # correctness is i and the other model's is j (0 = incorrect, 1 = correct).
    # The discordant cells table[0][1] and table[1][0] drive the test.
    table = [[0, 0], [0, 0]]
    for e, r in zip(exomiser, results):
        table[int(e)][int(r)] += 1
    # exact=True requests the exact binomial version of the test.
    result = mcnemar(table, exact=True)
    print(f"Exomiser vs {model}")
    print(f"Contingency table: {table}")
    print(f"McNemar statistic: {result.statistic}, p-value: {result.pvalue}\n")