In [11]:
import pandas as pd
import numpy as np
from scipy.stats import bootstrap
import matplotlib.pyplot as plt
import json
from pathlib import Path

In [12]:
# ROOT_DIR = Path("C:/Users/user/Desktop/pb_msc/llm_judge/results_new_rag_llama_3_3/")
ROOT_DIR = Path("C:/Users/user/Desktop/MSc/pb_msc/llm_judge/results_llm_retrival_reranker_llama_3_3/")

file_paths = sorted(ROOT_DIR.rglob("judgements_*.json"))

print(f"Found {len(file_paths)} files")
# file_paths[:5]

Found 27 files


In [None]:
def parse_filename_simple(p):
    name = p.stem
    parts = name.split("_")

    if parts[0] == "judgements":
        parts = parts[1:]

    bucket = parts[-1]
    rag_type = "_".join(parts[-3:-1])
    model = "_".join(parts[:-3])

    return model, rag_type, bucket


rows = []
all_items = []

for p in file_paths:
    try:
        with open(p, "r", encoding="utf-8") as f:
            data = json.load(f)

        n_items = len(data) if isinstance(data, list) else 0
        model, rag_type, bucket = parse_filename_simple(p)

        rows.append({
            "file": str(p),
            "model": model,
            "rag_type": rag_type,
            "bucket": bucket,
            "n_questions": n_items
        })

        if isinstance(data, list) and n_items > 0:
            all_items.extend(data)

    except Exception as e:
        rows.append({
            "file": str(p),
            "model": None,
            "rag_type": None,
            "bucket": None,
            "n_questions": 0
        })


df_file_overview = pd.DataFrame(rows)

print("Total valid questions loaded:", len(all_items))
display(df_file_overview.sort_values(["model", "rag_type", "bucket"]))


Total valid questions loaded: 2700


Unnamed: 0,file,model,rag_type,bucket,n_questions
0,C:\Users\user\Desktop\MSc\pb_msc\llm_judge\res...,deepseek_llm_7b,advanced_rag,average,100
1,C:\Users\user\Desktop\MSc\pb_msc\llm_judge\res...,deepseek_llm_7b,advanced_rag,good,100
2,C:\Users\user\Desktop\MSc\pb_msc\llm_judge\res...,deepseek_llm_7b,advanced_rag,poor,100
3,C:\Users\user\Desktop\MSc\pb_msc\llm_judge\res...,deepseek_llm_7b,baseline_rag,average,100
4,C:\Users\user\Desktop\MSc\pb_msc\llm_judge\res...,deepseek_llm_7b,baseline_rag,good,100
5,C:\Users\user\Desktop\MSc\pb_msc\llm_judge\res...,deepseek_llm_7b,baseline_rag,poor,100
6,C:\Users\user\Desktop\MSc\pb_msc\llm_judge\res...,deepseek_llm_7b,naive_rag,average,100
7,C:\Users\user\Desktop\MSc\pb_msc\llm_judge\res...,deepseek_llm_7b,naive_rag,good,100
8,C:\Users\user\Desktop\MSc\pb_msc\llm_judge\res...,deepseek_llm_7b,naive_rag,poor,100
9,C:\Users\user\Desktop\MSc\pb_msc\llm_judge\res...,llama_2_7b_hf,advanced_rag,average,100


In [14]:
def json_to_long_df(items):
    rows = []
    for it in items:
        qid = it["id"]
        model = it["model"]
        rag = it["rag_method"]
        bucket = it["bucket"]

        for j in it["judgements"]:
            qtype = j["question_type"]
            for criterion in ["correctness", "relevance", "completeness"]:
                rows.append({
                    "question_id": qid,
                    "model": model,
                    "rag_method": rag,
                    "bucket": bucket,
                    "question_type": qtype,
                    "criterion": criterion,
                    "score": int(j[criterion]),  # 0/1
                })

    return pd.DataFrame(rows)

df_long = json_to_long_df(all_items)

print("Long DF shape:", df_long.shape)
df_long.head()

Long DF shape: (24300, 7)


Unnamed: 0,question_id,model,rag_method,bucket,question_type,criterion,score
0,335,deepseek_llm_7b,advanced_rag,average,comprehension,correctness,1
1,335,deepseek_llm_7b,advanced_rag,average,comprehension,relevance,1
2,335,deepseek_llm_7b,advanced_rag,average,comprehension,completeness,1
3,335,deepseek_llm_7b,advanced_rag,average,analytical,correctness,1
4,335,deepseek_llm_7b,advanced_rag,average,analytical,relevance,1


In [15]:
def diff_mean(a, b):
    return np.mean(a) - np.mean(b)

def bootstrap_diff_ratio_scipy(scores_a, scores_b, n_boot=20000, ci=0.95, random_state=42):
    scores_a = np.asarray(scores_a)
    scores_b = np.asarray(scores_b)

    p1_obs = scores_a.mean()
    p2_obs = scores_b.mean()
    obs_diff = p1_obs - p2_obs
    obs_ratio = p1_obs / p2_obs if p2_obs > 0 else np.inf

    res = bootstrap(
        data=(scores_a, scores_b),
        statistic=diff_mean,
        n_resamples=n_boot,
        vectorized=False,
        paired=True,
        confidence_level=ci,
        method="percentile",
        random_state=random_state
    )

    ci_low = float(res.confidence_interval.low)
    ci_high = float(res.confidence_interval.high)
    boot_diffs = res.bootstrap_distribution
    
    # one sided p-value
    if obs_diff > 0:
        p_value_one_sided = float(np.mean(boot_diffs >= obs_diff))
    else:
        p_value_one_sided = 1.0

    #two sided p-value
    p_value_two_sided  = float(np.mean(np.abs(boot_diffs) >= np.abs(obs_diff)))

    return {
        "n1": len(scores_a),
        "n2": len(scores_b),
        "p1": p1_obs,
        "p2": p2_obs,
        "diff": obs_diff,
        "ratio": obs_ratio,
        "ci_low": ci_low,
        "ci_high": ci_high,
        "p_value_one_sided": p_value_one_sided,
        "p_value_two_sided": p_value_two_sided,
    }

def run_full_bootstrap(df_long, n_boot=20000, ci=0.95, random_state=42):
    bucket_pairs = [("good", "average"), ("good", "poor"), ("average", "poor")]
    group_cols = ["model", "rag_method", "question_type", "criterion"]

    out_rows = []

    for keys, df_g in df_long.groupby(group_cols):
        key_dict = dict(zip(group_cols, keys))

        bucket2scores = {
            b: df_g[df_g["bucket"] == b]["score"].values
            for b in df_g["bucket"].unique()
        }

        for b1, b2 in bucket_pairs:
            if b1 not in bucket2scores or b2 not in bucket2scores:
                continue

            res = bootstrap_diff_ratio_scipy(
                bucket2scores[b1],
                bucket2scores[b2],
                n_boot=n_boot,
                ci=ci,
                random_state=random_state
            )

            out_rows.append({
                **key_dict,
                "bucket_pair": f"{b1}_vs_{b2}",
                "bucket_1": b1,
                "bucket_2": b2,
                **res
            })

    return pd.DataFrame(out_rows)

df_boot = run_full_bootstrap(df_long, n_boot=20000, ci=0.95, random_state=42)

print("Bootstrap results shape:", df_boot.shape)
df_boot.head(10)


Bootstrap results shape: (243, 17)


Unnamed: 0,model,rag_method,question_type,criterion,bucket_pair,bucket_1,bucket_2,n1,n2,p1,p2,diff,ratio,ci_low,ci_high,p_value_one_sided,p_value_two_sided
0,deepseek_llm_7b,advanced_rag,analytical,completeness,good_vs_average,good,average,100,100,0.97,0.99,-0.02,0.979798,-0.06,0.02,1.0,0.6192
1,deepseek_llm_7b,advanced_rag,analytical,completeness,good_vs_poor,good,poor,100,100,0.97,1.0,-0.03,0.97,-0.07,0.0,1.0,0.5824
2,deepseek_llm_7b,advanced_rag,analytical,completeness,average_vs_poor,average,poor,100,100,0.99,1.0,-0.01,0.99,-0.03,0.0,1.0,0.63185
3,deepseek_llm_7b,advanced_rag,analytical,correctness,good_vs_average,good,average,100,100,0.97,1.0,-0.03,0.97,-0.07,0.0,1.0,0.58105
4,deepseek_llm_7b,advanced_rag,analytical,correctness,good_vs_poor,good,poor,100,100,0.97,0.99,-0.02,0.979798,-0.06,0.02,1.0,0.6143
5,deepseek_llm_7b,advanced_rag,analytical,correctness,average_vs_poor,average,poor,100,100,1.0,0.99,0.01,1.010101,0.0,0.03,0.63115,0.63115
6,deepseek_llm_7b,advanced_rag,analytical,relevance,good_vs_average,good,average,100,100,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0
7,deepseek_llm_7b,advanced_rag,analytical,relevance,good_vs_poor,good,poor,100,100,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0
8,deepseek_llm_7b,advanced_rag,analytical,relevance,average_vs_poor,average,poor,100,100,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0
9,deepseek_llm_7b,advanced_rag,comprehension,completeness,good_vs_average,good,average,100,100,1.0,0.99,0.01,1.010101,0.0,0.03,0.6316,0.6316


# Continuous per-question scoring

In [17]:
df_question = (
    df_long
    .groupby(["model", "rag_method", "bucket", "question_type", "question_id"], as_index=False)
    .agg(score_cont=("score", "mean"))
)

print("Continuous per-question DF shape:", df_question.shape)
display(df_question.head())

Continuous per-question DF shape: (8100, 6)


Unnamed: 0,model,rag_method,bucket,question_type,question_id,score_cont
0,deepseek_llm_7b,advanced_rag,average,analytical,335,1.0
1,deepseek_llm_7b,advanced_rag,average,analytical,828,1.0
2,deepseek_llm_7b,advanced_rag,average,analytical,1270,1.0
3,deepseek_llm_7b,advanced_rag,average,analytical,1408,1.0
4,deepseek_llm_7b,advanced_rag,average,analytical,1418,1.0


In [None]:
def diff_mean(a, b):
    """Return mean difference a - b."""
    return np.mean(a) - np.mean(b)


def bootstrap_continuous_bca(scores_a, scores_b, n_boot=100000, ci=0.95, random_state=42):
    scores_a = np.asarray(scores_a)
    scores_b = np.asarray(scores_b)

    p1 = scores_a.mean()
    p2 = scores_b.mean()
    obs_diff = p1 - p2

    res = bootstrap(
        data=(scores_a, scores_b),
        statistic=diff_mean,
        vectorized=False,
        paired=False,
        n_resamples=n_boot,
        confidence_level=ci,
        method="percentile",
        random_state=random_state
    )

    ci_low = float(res.confidence_interval.low)
    ci_high = float(res.confidence_interval.high)
    boot_diffs = res.bootstrap_distribution

    p_value_one_sided = float(np.mean(boot_diffs >= obs_diff))

    return {
        "n1": len(scores_a),
        "n2": len(scores_b),
        "p1": p1,
        "p2": p2,
        "diff": obs_diff,
        "ratio": p1 / p2 if p2 > 0 else np.inf,
        "ci_low": ci_low,
        "ci_high": ci_high,
        "p_value_one_sided": p_value_one_sided,
    }

In [None]:
def run_bootstrap_continuous(df_question, n_boot=100000, ci=0.95, random_state=42):
    bucket_1 = "good"
    bucket_2 = "poor"

    out_rows = []
    group_cols = ["model", "rag_method", "question_type"]

    for keys, df_g in df_question.groupby(group_cols):
        key_dict = dict(zip(group_cols, keys))

        df_b1 = df_g[df_g["bucket"] == bucket_1]["score_cont"].values
        df_b2 = df_g[df_g["bucket"] == bucket_2]["score_cont"].values

        res = bootstrap_continuous_bca(
            df_b1, df_b2,
            n_boot=n_boot,
            ci=ci,
            random_state=random_state
        )

        out_rows.append({
            **key_dict,
            "bucket_pair": f"{bucket_1}_vs_{bucket_2}",
            **res
        })

    return pd.DataFrame(out_rows)


df_boot_cont = run_bootstrap_continuous(df_question)

print("Final continuous bootstrap results:", df_boot_cont.shape)
display(df_boot_cont.head())

Final continuous bootstrap results: (27, 13)


Unnamed: 0,model,rag_method,question_type,bucket_pair,n1,n2,p1,p2,diff,ratio,ci_low,ci_high,p_value_one_sided
0,deepseek_llm_7b,advanced_rag,analytical,good_vs_poor,100,100,0.98,0.996667,-0.016667,0.983278,-0.033333,0.0,0.51211
1,deepseek_llm_7b,advanced_rag,comprehension,good_vs_poor,100,100,0.996667,1.0,-0.003333,0.996667,-0.01,0.0,0.49902
2,deepseek_llm_7b,advanced_rag,textual_stylistic,good_vs_poor,100,100,0.966667,0.903333,0.063333,1.070111,0.023333,0.103333,0.48888
3,deepseek_llm_7b,baseline_rag,analytical,good_vs_poor,100,100,0.806667,0.836667,-0.03,0.964143,-0.103333,0.043333,0.49781
4,deepseek_llm_7b,baseline_rag,comprehension,good_vs_poor,100,100,0.743333,0.716667,0.026667,1.037209,-0.066667,0.12,0.49047


In [21]:
output_path = "bootstrap_results.xlsx"

with pd.ExcelWriter(output_path, engine="openpyxl") as writer:
    df_boot.to_excel(writer, index=False, sheet_name="per_criterion_results")
    df_boot_cont.to_excel(writer, index=False, sheet_name="per_question_results")

print(f"Saved both sheets to: {output_path}")

Saved both sheets to: bootstrap_results.xlsx
