In [2]:
# %pip install scipy

In [20]:
import pandas as pd
import numpy as np
from scipy.stats import bootstrap
import matplotlib.pyplot as plt
import json
from pathlib import Path

# Single file test

In [4]:
base = "C:/Users/user/Desktop/pb_msc/llm_judge/results_new_rag_llama_3_3/"
path_1 = base + "deepseek_llm_7b/judgements_deepseek_llm_7b_naive_rag_good.json"
path_2 = base + "deepseek_llm_7b/judgements_deepseek_llm_7b_naive_rag_average.json"

In [5]:
def load_json(path):
    with open(path, "r", encoding="utf-8") as f:
        return json.load(f)

In [6]:
data1 = load_json(path_1)
data2 = load_json(path_2)

In [7]:
raw_data = data1 + data2

In [8]:
pd.DataFrame(raw_data)

Unnamed: 0,id,model,rag_method,bucket,judgements
0,8,deepseek_llm_7b,naive_rag,good,"[{'correctness': 1, 'relevance': 1, 'completen..."
1,44,deepseek_llm_7b,naive_rag,good,"[{'correctness': 1, 'relevance': 1, 'completen..."
2,228,deepseek_llm_7b,naive_rag,good,"[{'correctness': 1, 'relevance': 1, 'completen..."
3,319,deepseek_llm_7b,naive_rag,good,"[{'correctness': 1, 'relevance': 1, 'completen..."
4,472,deepseek_llm_7b,naive_rag,good,"[{'correctness': 1, 'relevance': 1, 'completen..."
...,...,...,...,...,...
195,18736,deepseek_llm_7b,naive_rag,average,"[{'correctness': 1, 'relevance': 1, 'completen..."
196,18825,deepseek_llm_7b,naive_rag,average,"[{'correctness': 1, 'relevance': 1, 'completen..."
197,18839,deepseek_llm_7b,naive_rag,average,"[{'correctness': 1, 'relevance': 1, 'completen..."
198,18899,deepseek_llm_7b,naive_rag,average,"[{'correctness': 1, 'relevance': 1, 'completen..."


In [9]:
def json_to_long_df(items):
    rows = []

    for it in items:
        qid = it["id"]
        model = it["model"]
        rag = it["rag_method"]
        bucket = it["bucket"]

        # each question has 3 judgements — one per criterion
        for j in it["judgements"]:
            qtype = j["question_type"]

            for criterion in ["correctness", "relevance", "completeness"]:
                rows.append({
                    "question_id": qid,
                    "model": model,
                    "rag_method": rag,
                    "bucket": bucket,
                    "question_type": qtype,
                    "criterion": criterion,
                    "score": int(j[criterion])  # 0/1
                })

    return pd.DataFrame(rows)

In [10]:
df_long = json_to_long_df(raw_data)
df_long.tail()

Unnamed: 0,question_id,model,rag_method,bucket,question_type,criterion,score
1795,19146,deepseek_llm_7b,naive_rag,average,analytical,relevance,1
1796,19146,deepseek_llm_7b,naive_rag,average,analytical,completeness,1
1797,19146,deepseek_llm_7b,naive_rag,average,textual_stylistic,correctness,0
1798,19146,deepseek_llm_7b,naive_rag,average,textual_stylistic,relevance,0
1799,19146,deepseek_llm_7b,naive_rag,average,textual_stylistic,completeness,0


In [11]:
def get_scores(df, model, rag, qtype, criterion, bucket):
    df_f = df[
        (df["model"] == model) &
        (df["rag_method"] == rag) &
        (df["question_type"] == qtype) &
        (df["criterion"] == criterion) &
        (df["bucket"] == bucket)
    ]
    return df_f["score"].values

In [12]:
# Example usage:
scores_A = get_scores(df_long,
                      model="deepseek_llm_7b",
                      rag="naive_rag",
                      qtype="analytical",
                      criterion="correctness",
                      bucket="good")

scores_B = get_scores(df_long,
                      model="deepseek_llm_7b",
                      rag="naive_rag",
                      qtype="analytical",
                      criterion="correctness",
                      bucket="average")

scores_A[:10], scores_B[:10]

(array([0, 1, 1, 0, 1, 1, 1, 0, 0, 0]), array([1, 0, 1, 1, 0, 0, 1, 1, 1, 1]))

In [17]:
from scipy.stats import bootstrap
import numpy as np

def diff_mean(a, b):
    return np.mean(a) - np.mean(b)

res = bootstrap(
    data=(scores_A, scores_B),
    statistic=diff_mean,
    n_resamples=1000,
    vectorized=False,
    paired=False,
    confidence_level=0.95,
    method="percentile",
    random_state=42
)

print("95% CI:", res.confidence_interval)
print("Example bootstrap diffs:", res.bootstrap_distribution[:10])


95% CI: ConfidenceInterval(low=np.float64(-0.30999999999999994), high=np.float64(-0.06000000000000005))
Example bootstrap diffs: [-0.17 -0.2  -0.19 -0.29 -0.15 -0.28 -0.2  -0.22 -0.25 -0.2 ]


In [18]:
len(res.bootstrap_distribution)

1000

# Judgements input

In [23]:
ROOT_DIR = Path("C:/Users/user/Desktop/pb_msc/llm_judge/results_new_rag_llama_3_3/")

file_paths = sorted(ROOT_DIR.rglob("judgements_*.json"))

print(f"Found {len(file_paths)} files")
file_paths[:5]

Found 27 files


[WindowsPath('C:/Users/user/Desktop/pb_msc/llm_judge/results_new_rag_llama_3_3/deepseek_llm_7b/judgements_deepseek_llm_7b_advanced_rag_average.json'),
 WindowsPath('C:/Users/user/Desktop/pb_msc/llm_judge/results_new_rag_llama_3_3/deepseek_llm_7b/judgements_deepseek_llm_7b_advanced_rag_good.json'),
 WindowsPath('C:/Users/user/Desktop/pb_msc/llm_judge/results_new_rag_llama_3_3/deepseek_llm_7b/judgements_deepseek_llm_7b_advanced_rag_poor.json'),
 WindowsPath('C:/Users/user/Desktop/pb_msc/llm_judge/results_new_rag_llama_3_3/deepseek_llm_7b/judgements_deepseek_llm_7b_modular_rag_average.json'),
 WindowsPath('C:/Users/user/Desktop/pb_msc/llm_judge/results_new_rag_llama_3_3/deepseek_llm_7b/judgements_deepseek_llm_7b_modular_rag_good.json')]

In [24]:
def parse_filename(p: Path):
    """
    Expected patterns like:
      judgements_deepseek_llm_7b_advanced_rag_average.json
      judgements_llama_2_7b_hf_naive_rag_poor.json
      judgements_mistral-7b_v01_modular_rag_good.json
    """
    name = p.stem  # without .json
    parts = name.split("_")

    # remove leading "judgements"
    if parts[0] == "judgements":
        parts = parts[1:]

    bucket = parts[-1]                 # good / average / poor
    rag_method = "_".join(parts[-3:-1])  # advanced_rag / naive_rag / modular_rag
    model = "_".join(parts[:-3])       # rest

    return model, rag_method, bucket

meta_rows = []
for p in file_paths:
    model, rag_method, bucket = parse_filename(p)
    meta_rows.append({
        "path": str(p),
        "model": model,
        "rag_method": rag_method,
        "bucket": bucket
    })

df_files = pd.DataFrame(meta_rows)

display(df_files.head(10))
print("\nCounts per model/rag/bucket:")
display(df_files.groupby(["model","rag_method","bucket"]).size().reset_index(name="n_files"))


Unnamed: 0,path,model,rag_method,bucket
0,C:\Users\user\Desktop\pb_msc\llm_judge\results...,deepseek_llm_7b,advanced_rag,average
1,C:\Users\user\Desktop\pb_msc\llm_judge\results...,deepseek_llm_7b,advanced_rag,good
2,C:\Users\user\Desktop\pb_msc\llm_judge\results...,deepseek_llm_7b,advanced_rag,poor
3,C:\Users\user\Desktop\pb_msc\llm_judge\results...,deepseek_llm_7b,modular_rag,average
4,C:\Users\user\Desktop\pb_msc\llm_judge\results...,deepseek_llm_7b,modular_rag,good
5,C:\Users\user\Desktop\pb_msc\llm_judge\results...,deepseek_llm_7b,modular_rag,poor
6,C:\Users\user\Desktop\pb_msc\llm_judge\results...,deepseek_llm_7b,naive_rag,average
7,C:\Users\user\Desktop\pb_msc\llm_judge\results...,deepseek_llm_7b,naive_rag,good
8,C:\Users\user\Desktop\pb_msc\llm_judge\results...,deepseek_llm_7b,naive_rag,poor
9,C:\Users\user\Desktop\pb_msc\llm_judge\results...,llama_2_7b_hf,advanced_rag,average



Counts per model/rag/bucket:


Unnamed: 0,model,rag_method,bucket,n_files
0,deepseek_llm_7b,advanced_rag,average,1
1,deepseek_llm_7b,advanced_rag,good,1
2,deepseek_llm_7b,advanced_rag,poor,1
3,deepseek_llm_7b,modular_rag,average,1
4,deepseek_llm_7b,modular_rag,good,1
5,deepseek_llm_7b,modular_rag,poor,1
6,deepseek_llm_7b,naive_rag,average,1
7,deepseek_llm_7b,naive_rag,good,1
8,deepseek_llm_7b,naive_rag,poor,1
9,llama_2_7b_hf,advanced_rag,average,1


In [32]:
import json
import pandas as pd

def parse_filename_simple(p):
    """
    Expected filename example:
    judgements_deepseek_llm_7b_advanced_rag_average.json

    model      = deepseek_llm_7b
    rag_type   = advanced_rag
    bucket     = average
    """
    name = p.stem
    parts = name.split("_")

    # remove "judgements"
    if parts[0] == "judgements":
        parts = parts[1:]

    bucket = parts[-1]
    rag_type = "_".join(parts[-3:-1])
    model = "_".join(parts[:-3])

    return model, rag_type, bucket


rows = []
all_items = []

for p in file_paths:   # file_paths = list of paths found earlier
    try:
        with open(p, "r", encoding="utf-8") as f:
            data = json.load(f)

        n_items = len(data) if isinstance(data, list) else 0
        model, rag_type, bucket = parse_filename_simple(p)

        rows.append({
            "file": str(p),
            "model": model,
            "rag_type": rag_type,
            "bucket": bucket,
            "n_questions": n_items
        })

        # add only if looks like valid list
        if isinstance(data, list) and n_items > 0:
            all_items.extend(data)

    except Exception as e:
        rows.append({
            "file": str(p),
            "model": None,
            "rag_type": None,
            "bucket": None,
            "n_questions": 0
        })


df_file_overview = pd.DataFrame(rows)

print("Total valid questions loaded:", len(all_items))
display(df_file_overview.sort_values(["model", "rag_type", "bucket"]))


Total valid questions loaded: 2529


Unnamed: 0,file,model,rag_type,bucket,n_questions
0,C:\Users\user\Desktop\pb_msc\llm_judge\results...,deepseek_llm_7b,advanced_rag,average,100
1,C:\Users\user\Desktop\pb_msc\llm_judge\results...,deepseek_llm_7b,advanced_rag,good,100
2,C:\Users\user\Desktop\pb_msc\llm_judge\results...,deepseek_llm_7b,advanced_rag,poor,100
3,C:\Users\user\Desktop\pb_msc\llm_judge\results...,deepseek_llm_7b,modular_rag,average,100
4,C:\Users\user\Desktop\pb_msc\llm_judge\results...,deepseek_llm_7b,modular_rag,good,100
5,C:\Users\user\Desktop\pb_msc\llm_judge\results...,deepseek_llm_7b,modular_rag,poor,100
6,C:\Users\user\Desktop\pb_msc\llm_judge\results...,deepseek_llm_7b,naive_rag,average,100
7,C:\Users\user\Desktop\pb_msc\llm_judge\results...,deepseek_llm_7b,naive_rag,good,100
8,C:\Users\user\Desktop\pb_msc\llm_judge\results...,deepseek_llm_7b,naive_rag,poor,100
9,C:\Users\user\Desktop\pb_msc\llm_judge\results...,llama_2_7b_hf,advanced_rag,average,100


In [26]:
def json_to_long_df(items):
    rows = []
    for it in items:
        qid = it["id"]
        model = it["model"]
        rag = it["rag_method"]
        bucket = it["bucket"]

        for j in it["judgements"]:
            qtype = j["question_type"]
            for criterion in ["correctness", "relevance", "completeness"]:
                rows.append({
                    "question_id": qid,
                    "model": model,
                    "rag_method": rag,
                    "bucket": bucket,
                    "question_type": qtype,
                    "criterion": criterion,
                    "score": int(j[criterion]),  # 0/1
                })

    return pd.DataFrame(rows)

df_long = json_to_long_df(all_items)

print("Long DF shape:", df_long.shape)
df_long.head()


Long DF shape: (22761, 7)


Unnamed: 0,question_id,model,rag_method,bucket,question_type,criterion,score
0,335,deepseek_llm_7b,advanced_rag,average,comprehension,correctness,1
1,335,deepseek_llm_7b,advanced_rag,average,comprehension,relevance,1
2,335,deepseek_llm_7b,advanced_rag,average,comprehension,completeness,1
3,335,deepseek_llm_7b,advanced_rag,average,analytical,correctness,1
4,335,deepseek_llm_7b,advanced_rag,average,analytical,relevance,1


In [27]:
def diff_mean(a, b):
    return np.mean(a) - np.mean(b)

def bootstrap_diff_ratio_scipy(scores_a, scores_b, n_boot=20000, ci=0.95, random_state=42):
    scores_a = np.asarray(scores_a)
    scores_b = np.asarray(scores_b)

    p1_obs = scores_a.mean()
    p2_obs = scores_b.mean()
    obs_diff = p1_obs - p2_obs
    obs_ratio = p1_obs / p2_obs if p2_obs > 0 else np.inf

    res = bootstrap(
        data=(scores_a, scores_b),
        statistic=diff_mean,
        n_resamples=n_boot,
        vectorized=False,
        paired=False,
        confidence_level=ci,
        method="percentile",
        random_state=random_state
    )

    ci_low = float(res.confidence_interval.low)
    ci_high = float(res.confidence_interval.high)
    boot_diffs = res.bootstrap_distribution
    p_value = float(np.mean(np.abs(boot_diffs) >= np.abs(obs_diff)))

    return {
        "n1": len(scores_a),
        "n2": len(scores_b),
        "p1": p1_obs,
        "p2": p2_obs,
        "diff": obs_diff,
        "ratio": obs_ratio,
        "ci_low": ci_low,
        "ci_high": ci_high,
        "p_value": p_value,
    }

def run_full_bootstrap(df_long, n_boot=20000, ci=0.95, random_state=42):
    bucket_pairs = [("good", "average"), ("good", "poor"), ("average", "poor")]
    group_cols = ["model", "rag_method", "question_type", "criterion"]

    out_rows = []

    for keys, df_g in df_long.groupby(group_cols):
        key_dict = dict(zip(group_cols, keys))

        bucket2scores = {
            b: df_g[df_g["bucket"] == b]["score"].values
            for b in df_g["bucket"].unique()
        }

        for b1, b2 in bucket_pairs:
            if b1 not in bucket2scores or b2 not in bucket2scores:
                continue

            res = bootstrap_diff_ratio_scipy(
                bucket2scores[b1],
                bucket2scores[b2],
                n_boot=n_boot,
                ci=ci,
                random_state=random_state
            )

            out_rows.append({
                **key_dict,
                "bucket_pair": f"{b1}_vs_{b2}",
                "bucket_1": b1,
                "bucket_2": b2,
                **res
            })

    return pd.DataFrame(out_rows)

df_boot = run_full_bootstrap(df_long, n_boot=20000, ci=0.95, random_state=42)

print("Bootstrap results shape:", df_boot.shape)
df_boot.head(10)


Bootstrap results shape: (243, 16)


Unnamed: 0,model,rag_method,question_type,criterion,bucket_pair,bucket_1,bucket_2,n1,n2,p1,p2,diff,ratio,ci_low,ci_high,p_value
0,deepseek_llm_7b,advanced_rag,analytical,completeness,good_vs_average,good,average,100,100,0.97,0.98,-0.01,0.989796,-0.05,0.03,0.8308
1,deepseek_llm_7b,advanced_rag,analytical,completeness,good_vs_poor,good,poor,100,100,0.97,0.99,-0.02,0.979798,-0.06,0.02,0.6177
2,deepseek_llm_7b,advanced_rag,analytical,completeness,average_vs_poor,average,poor,100,100,0.98,0.99,-0.01,0.989899,-0.04,0.02,0.7864
3,deepseek_llm_7b,advanced_rag,analytical,correctness,good_vs_average,good,average,100,100,0.99,0.99,0.0,1.0,-0.03,0.03,1.0
4,deepseek_llm_7b,advanced_rag,analytical,correctness,good_vs_poor,good,poor,100,100,0.99,1.0,-0.01,0.99,-0.03,0.0,0.6352
5,deepseek_llm_7b,advanced_rag,analytical,correctness,average_vs_poor,average,poor,100,100,0.99,1.0,-0.01,0.99,-0.03,0.0,0.63615
6,deepseek_llm_7b,advanced_rag,analytical,relevance,good_vs_average,good,average,100,100,1.0,1.0,0.0,1.0,0.0,0.0,1.0
7,deepseek_llm_7b,advanced_rag,analytical,relevance,good_vs_poor,good,poor,100,100,1.0,1.0,0.0,1.0,0.0,0.0,1.0
8,deepseek_llm_7b,advanced_rag,analytical,relevance,average_vs_poor,average,poor,100,100,1.0,1.0,0.0,1.0,0.0,0.0,1.0
9,deepseek_llm_7b,advanced_rag,comprehension,completeness,good_vs_average,good,average,100,100,0.99,0.98,0.01,1.010204,-0.02,0.04,0.78915


In [30]:
cols_order = [
    "model",
    "rag_method",
    "question_type",
    "criterion",
    "bucket_pair",
    "bucket_1",
    "bucket_2",
    "n1",
    "n2",
    "p1",
    "p2",
    "diff",
    "ci_low",
    "ci_high",
    "p_value",
    "ratio",
]

df_boot = df_boot[cols_order]

display(df_boot.head(20))

print("\nCounts per model/rag/qtype/criterion:")
display(df_boot.groupby(["model","rag_method","question_type","criterion"]).size().reset_index(name="n_tests"))


Unnamed: 0,model,rag_method,question_type,criterion,bucket_pair,bucket_1,bucket_2,n1,n2,p1,p2,diff,ci_low,ci_high,p_value,ratio
0,deepseek_llm_7b,advanced_rag,analytical,completeness,good_vs_average,good,average,100,100,0.97,0.98,-0.01,-0.05,0.03,0.8308,0.989796
1,deepseek_llm_7b,advanced_rag,analytical,completeness,good_vs_poor,good,poor,100,100,0.97,0.99,-0.02,-0.06,0.02,0.6177,0.979798
2,deepseek_llm_7b,advanced_rag,analytical,completeness,average_vs_poor,average,poor,100,100,0.98,0.99,-0.01,-0.04,0.02,0.7864,0.989899
3,deepseek_llm_7b,advanced_rag,analytical,correctness,good_vs_average,good,average,100,100,0.99,0.99,0.0,-0.03,0.03,1.0,1.0
4,deepseek_llm_7b,advanced_rag,analytical,correctness,good_vs_poor,good,poor,100,100,0.99,1.0,-0.01,-0.03,0.0,0.6352,0.99
5,deepseek_llm_7b,advanced_rag,analytical,correctness,average_vs_poor,average,poor,100,100,0.99,1.0,-0.01,-0.03,0.0,0.63615,0.99
6,deepseek_llm_7b,advanced_rag,analytical,relevance,good_vs_average,good,average,100,100,1.0,1.0,0.0,0.0,0.0,1.0,1.0
7,deepseek_llm_7b,advanced_rag,analytical,relevance,good_vs_poor,good,poor,100,100,1.0,1.0,0.0,0.0,0.0,1.0,1.0
8,deepseek_llm_7b,advanced_rag,analytical,relevance,average_vs_poor,average,poor,100,100,1.0,1.0,0.0,0.0,0.0,1.0,1.0
9,deepseek_llm_7b,advanced_rag,comprehension,completeness,good_vs_average,good,average,100,100,0.99,0.98,0.01,-0.02,0.04,0.78915,1.010204



Counts per model/rag/qtype/criterion:


Unnamed: 0,model,rag_method,question_type,criterion,n_tests
0,deepseek_llm_7b,advanced_rag,analytical,completeness,3
1,deepseek_llm_7b,advanced_rag,analytical,correctness,3
2,deepseek_llm_7b,advanced_rag,analytical,relevance,3
3,deepseek_llm_7b,advanced_rag,comprehension,completeness,3
4,deepseek_llm_7b,advanced_rag,comprehension,correctness,3
...,...,...,...,...,...
76,mistral-7b_v01,naive_rag,comprehension,correctness,3
77,mistral-7b_v01,naive_rag,comprehension,relevance,3
78,mistral-7b_v01,naive_rag,textual_stylistic,completeness,3
79,mistral-7b_v01,naive_rag,textual_stylistic,correctness,3
