In [2]:
import pandas as pd
from sklearn.metrics import cohen_kappa_score
from scipy.stats import pearsonr
import numpy as np
from itertools import combinations
import json
import os
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.metrics import confusion_matrix


In [3]:
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

In [None]:
# ==================== Load data safely ====================
print("Loading data...")
# Update this path if your file is named differently
file_path = "raw_results/complete_judge_result.jsonl" 

if not os.path.exists(file_path):
    # Dummy data creation for code integrity if file doesn't exist
    print(f"File {file_path} not found. Ensure your data is in the current directory.")
    exit()

with open(file_path, "r", encoding="utf-8") as f:
    data = [json.loads(line.strip()) for line in f if line.strip()]

df = pd.DataFrame(data)
print(f"Total samples: {len(df):,}")

# Split identification
train_df = df[df['split'] == 'train'].reset_index(drop=True)
test_df  = df[df['split'] == 'test'].reset_index(drop=True)

print(f"Train : {len(train_df):,}")
print(f"Test  : {len(test_df):,}")

def evaluate_judge(df_subset, method):
    if len(df_subset) == 0:
        return None

    # --- Logic for extracting "Accepted" Mask ---
    if method == "BGE@0.85":
        accepted = df_subset['bge_retrieved_top1_score'] >= 0.85
    elif method == "BGE@0.85 Key":
        accepted = df_subset['bge_keyword_retrieved_top1_score'] >= 0.85
    elif method == "BGE@0.85 Intent":
        accepted = df_subset['bge_intent_retrieved_top1_score'] >= 0.85
    # Cleaned Base Names
    elif method == "Qwen":
        accepted = df_subset['qwen'].fillna(False).astype(bool)
    elif method == "Llama":
        accepted = df_subset['llama'].fillna(False).astype(bool)
    elif method == "Mistral":
        accepted = df_subset['mistral'].fillna(False).astype(bool)
    elif method == "Gemma":
        accepted = df_subset['gemma'].fillna(False).astype(bool)
    elif method == "ChatGLM":
        accepted = df_subset['chatglm'].fillna(False).astype(bool)
    
    # Augmented Methods (using the y_true fallback versions)
    elif method == "Qwen Aug":
        accepted = df_subset['qwen_aug'].astype(bool)
    elif method == "Llama Aug":
        accepted = df_subset['llama_aug'].astype(bool)
    elif method == "Mistral Aug":
        accepted = df_subset['mistral_aug'].astype(bool)
    elif method == "Gemma Aug":
        accepted = df_subset['gemma_aug'].astype(bool)

    elif method == "ChatGLM Aug":
        accepted = df_subset['chatglm_aug'].astype(bool)

    elif method == "GPT5":
        accepted = df_subset['gpt5'].astype(bool)
    
    elif method == "Majority Vote":
        # Using the cleaned names
        base_judges = ['qwen', 'llama', 'mistral', 'gemma', 'chatglm']
        accepted = df_subset[base_judges].fillna(0).sum(axis=1) >= 3
        vote_df = df_subset[base_judges].fillna(0).astype(int)
        true_votes = vote_df.sum(axis=1)
        false_votes = 5 - true_votes
        accepted = true_votes > false_votes
    elif method == "Majority Vote Aug":
            # Using the cleaned names
            base_judges = ['qwen_aug', 'llama_aug', 'mistral_aug', 'gemma_aug', 'chatglm_aug']
            accepted = df_subset[base_judges].fillna(0).sum(axis=1) >= 3
            vote_df = df_subset[base_judges].fillna(0).astype(int)
            true_votes = vote_df.sum(axis=1)
            false_votes = 5 - true_votes
            accepted = true_votes > false_votes
    elif method == "Weighted Vote":
        base_judges = ['qwen', 'llama', 'mistral', 'gemma', 'chatglm']
        # We calculate the weighted sum of bipolar votes
        weighted_score = pd.Series(0.0, index=df_subset.index)
        
        for judge in base_judges:
            # Map True -> 1, False -> -1
            # .astype(bool) ensures logic holds for different data types
            v_i = df_subset[judge].fillna(False).astype(bool).map({True: 1, False: -1})
            weighted_score += v_i * learned_weights[judge]
            
        # If the weighted sum > 0, the collective decision is True
        accepted = weighted_score > 0
    else:
        accepted = df_subset[method.lower()]
    if method != "BGE@0.85 Key":
        y_true = df_subset['y_true'].astype(bool)
    else:
        y_true = (df_subset['sample_class'] == 'positive') & (
        df_subset['bge_keyword_retrieved_top1_id'] == df_subset['question_id']
    )
    
    tn, fp, fn, tp = confusion_matrix(y_true, accepted, labels=[False, True]).ravel()
    
    total = len(y_true)
    n_pos = tp + fn
    n_neg = tn + fp
    
    accuracy = (tp + tn) / total
    recall = tp / n_pos if n_pos > 0 else 0.0
    specificity = tn / n_neg if n_neg > 0 else 0.0
    precision = tp / (tp + fp) if (tp + fp) > 0 else 0.0
    f1 = 2 * precision * recall / (precision + recall) if (precision + recall) > 0 else 0.0
    hallu_risk = fp / (fp + tn)
    accept_ratio = accepted.sum() / y_true.sum() if y_true.sum() > 0 else 0
    
    return {
        "Method": method,
        "Accept%": accept_ratio,
        "Accuracy": accuracy,
        "Hallu Risk": hallu_risk,
        "Recall%": recall,
        "Neg Reject%": specificity,
        "Precision": precision,
        "F1": f1
    }

# --- Execution ---
methods = [
    "BGE@0.85", 
    "BGE@0.85 Key", 
    "BGE@0.85 Intent",
    "Qwen", "Qwen Aug",
    "Llama", "Llama Aug",
    "Mistral", "Mistral Aug",
    "Gemma", "Gemma Aug",
    "ChatGLM", "ChatGLM Aug",
    "GPT5",
    "Majority Vote", "Majority Vote Aug",
    "Weighted Vote",
    "neural_model",
    "ral2m"
]

# 1. PRE-CALCULATE WEIGHTS (Run this before defining/calling the eval loop)
base_judges = ['qwen', 'llama', 'mistral', 'gemma', 'chatglm']

# Calculate accuracy on train split to determine ranks
judge_accs = {}
for judge in base_judges:
    correct = (train_df[judge].fillna(False).astype(bool) == train_df['y_true'].astype(bool))
    judge_accs[judge] = correct.mean()

# Sort judges by accuracy (ascending)
sorted_judges = sorted(base_judges, key=lambda x: judge_accs[x])

# Assign normalized rank weights: rank / sum(ranks)
# For 5 judges, sum is 15
sum_ranks = sum(range(1, len(base_judges) + 1))
learned_weights = {judge: (i + 1) / sum_ranks for i, judge in enumerate(sorted_judges)}

print("Assigned Weights (Normalized Rank):")
for j in sorted_judges:
    print(f"  {j}: {learned_weights[j]:.3f} (Acc: {judge_accs[j]:.2%})")

results = [evaluate_judge(test_df, m) for m in methods if evaluate_judge(test_df, m)]

final_df = pd.DataFrame(results)
# final_df = final_df[["Method", "Accept%", "Accuracy", "Hallu Risk", "Recall%", "Neg Reject%", "Precision", "F1"]]
final_df = final_df[["Method", "Accuracy", "Hallu Risk", "Precision", "F1"]]

# Formatting
display_df = final_df.copy()
# display_df["Accept%"] = display_df["Accept%"].map("{:.1%}".format)
display_df["Accuracy"] = display_df["Accuracy"].map("{:.1%}".format)
display_df["Hallu Risk"] = display_df["Hallu Risk"].map("{:.1%}".format)
# display_df["Recall%"] = display_df["Recall%"].map("{:.1%}".format)
# display_df["Neg Reject%"] = display_df["Neg Reject%"].map("{:.1%}".format)
display_df["Precision"] = display_df["Precision"].map("{:.3f}".format)
display_df["F1"] = display_df["F1"].map("{:.3f}".format)

print("\n" + "="*100)
print(f"{'LLM JUDGE EVALUATION (TEST SET)':^100}")
print("="*100)
print(display_df.to_string(index=False))
print("="*100)

Loading data...
Total samples: 82,603
Train : 57,564
Test  : 25,039
Assigned Weights (Normalized Rank):
  mistral: 0.067 (Acc: 51.86%)
  chatglm: 0.133 (Acc: 75.67%)
  llama: 0.200 (Acc: 79.29%)
  qwen: 0.267 (Acc: 87.16%)
  gemma: 0.333 (Acc: 90.27%)

                                  LLM JUDGE EVALUATION (TEST SET)                                   
           Method Accuracy Hallu Risk Precision    F1
         BGE@0.85    51.3%      41.4%     0.430 0.423
     BGE@0.85 Key    42.2%      99.7%     0.424 0.592
  BGE@0.85 Intent    42.2%      99.7%     0.424 0.592
             Qwen    65.9%       6.9%     0.764 0.426
         Qwen Aug    63.1%       2.4%     0.842 0.285
            Llama    61.5%      18.7%     0.586 0.440
        Llama Aug    63.6%      27.6%     0.585 0.550
          Mistral    42.9%      99.7%     0.428 0.599
      Mistral Aug    43.3%      97.2%     0.429 0.596
            Gemma    58.6%      42.0%     0.515 0.552
        Gemma Aug    60.9%      38.4%     0.539 0.56

In [33]:
from scipy.stats import binom

def get_mcnemar_p_value(df, model_a_name, model_b_name):
    """
    Computes the exact p-value using the binomial distribution.
    model_a_name: Name of the baseline in the dataframe (e.g., 'majority_vote')
    model_b_name: Name of the target model in the dataframe (e.g., 'ral2m')
    """
    y_true = df['y_true'].astype(bool)
    
    # Identify correct predictions for both models
    # Note: ensure column names match your dataframe keys exactly
    correct_a = (df[model_a_name].fillna(False).astype(bool) == y_true)
    correct_b = (df[model_b_name].fillna(False).astype(bool) == y_true)
    
    # Count discordant pairs
    # b: Baseline correct, but ral2m wrong
    # c: ral2m correct, but Baseline wrong
    b = (correct_a & ~correct_b).sum()
    c = (~correct_a & correct_b).sum()
    
    # n is the total number of disagreements
    n = b + c
    if n == 0:
        return 1.0
    
    # The exact p-value is based on the binomial distribution B(n, 0.5)
    # We calculate the probability of seeing a split as extreme as b and c
    k = min(b, c)
    p_value = 2 * binom.cdf(k, n, 0.5)
    return min(1.0, p_value)

# --- Statistical Calculation Logic ---

# We compare against 'Majority Vote' as the primary baseline
# Note: Ensure these strings match the actual columns in your test_df
# If 'Majority Vote' is computed inside evaluate_judge, you may need to 
# pre-compute it in test_df before running this test.
baseline_key = 'majority_vote' 
target_key = 'ral2m'

# Check if keys exist in df, if not, create them for the test
if baseline_key not in test_df.columns:
    base_judges = ['qwen', 'llama', 'mistral', 'gemma', 'chatglm']
    test_df[baseline_key] = test_df[base_judges].fillna(0).sum(axis=1) >= 3

p_val = get_mcnemar_p_value(test_df, baseline_key, target_key)

# Determine Significance Asterisks
sig_stars = ""
if p_val < 0.001: sig_stars = "***"
elif p_val < 0.01: sig_stars = "**"
elif p_val < 0.05: sig_stars = "*"

# Apply formatting to the display_df
display_df['Accuracy'] = display_df.apply(
    lambda x: f"{x['Accuracy']}{sig_stars}" if x['Method'] == 'ral2m' else x['Accuracy'], 
    axis=1
)

# --- Final Output ---
print("\n" + "="*100)
print(f"{'LLM JUDGE EVALUATION (TEST SET)':^100}")
print(f"{f'Stat. Significance (ral2m vs {baseline_key}): p = {p_val:.4f}':^100}")
print("="*100)
print(display_df.to_string(index=False))
print("="*100)
print(f"Significance levels: * p < 0.05, ** p < 0.01, *** p < 0.001")


                                  LLM JUDGE EVALUATION (TEST SET)                                   
                      Stat. Significance (ral2m vs majority_vote): p = 0.0000                       
           Method Accuracy Hallu Risk Precision    F1
         BGE@0.85    51.3%      41.4%     0.430 0.423
     BGE@0.85 Key    42.2%      99.7%     0.424 0.592
  BGE@0.85 Intent    42.2%      99.7%     0.424 0.592
             Qwen    65.9%       6.9%     0.764 0.426
         Qwen Aug    63.1%       2.4%     0.842 0.285
            Llama    61.5%      18.7%     0.586 0.440
        Llama Aug    63.6%      27.6%     0.585 0.550
          Mistral    42.9%      99.7%     0.428 0.599
      Mistral Aug    43.3%      97.2%     0.429 0.596
            Gemma    58.6%      42.0%     0.515 0.552
        Gemma Aug    60.9%      38.4%     0.539 0.567
          ChatGLM    48.9%      85.8%     0.454 0.615
      ChatGLM Aug    51.5%      80.0%     0.468 0.624
             GPT5    55.7%      61.3%    

In [8]:
cols = ['qwen', 'llama', 'mistral', 'gemma', 'chatglm']

# 1. Prepare base data
y_true = test_df['y_true'].astype(int)
vote_df = test_df[cols].fillna(0).astype(int)

# 2. Calculate counts per sample
num_yes = vote_df.sum(axis=1)
num_no = 5 - num_yes

# 3. Calculate "Correctness" (Alignment to $y_{true}$)
# This compares each judge column to the ground truth
alignment_df = vote_df.apply(lambda col: col == y_true)
num_correct = alignment_df.sum(axis=1)

# 4. Create the new tracking DataFrame
# This records the breakdown for every single sample
sample_stats_df = pd.DataFrame({
    'num_yes': num_yes,
    'num_no': num_no,
    'num_correct': num_correct,
    'ground_truth': y_true
})

# 5. Determine majority decision and binary correctness
# The judge majority votes '1' if num_yes >= 3
majority_decision = (num_yes >= 3).astype(int)
# The majority is correct ONLY if the decision matches ground truth
is_majority_correct = (majority_decision == y_true)

sample_stats_df['majority_decision'] = majority_decision
sample_stats_df['is_majority_correct'] = is_majority_correct

# The final 'accepted' Series for your evaluation function
accepted = majority_decision.astype(bool)

In [9]:
import pandas as pd
import numpy as np
import json
import seaborn as sns
import matplotlib.pyplot as plt

# --- 1. Data Processing ---
# (Assumes test_df is already loaded)

cols = ['qwen', 'llama', 'mistral', 'gemma', 'chatglm']

# A. Prepare base data
y_true = test_df['y_true'].astype(int)
vote_df = test_df[cols].fillna(0).astype(int)

# B. Explicitly calculate counts and correctness per sample
num_yes = vote_df.sum(axis=1)
num_no = 5 - num_yes

# Alignment: Compare each judge column strictly to the ground truth
alignment_df = vote_df.apply(lambda col: col == y_true)
num_correct = alignment_df.sum(axis=1)

# C. Build the Stats DataFrame
sample_stats_df = pd.DataFrame({
    'num_yes': num_yes,
    'num_no': num_no,
    'num_correct': num_correct,
    'ground_truth': y_true
})

# D. Define Majority Decision Logic
majority_decision = (num_yes >= 3).astype(int)
is_majority_correct = (majority_decision == y_true)

sample_stats_df['majority_decision'] = majority_decision
sample_stats_df['is_majority_correct'] = is_majority_correct

# --- 2. Generate Correctness Plotting Data (k-values) ---
acc_counts = sample_stats_df['num_correct'].value_counts().sort_index()
total = len(sample_stats_df)

k_values = []
percentages = []
cumulative_correct = 0

# Calculating cumulative percentages for k = 5, 4, 3, 2, 1
for k in range(5, 0, -1):
    cumulative_correct += acc_counts.get(k, 0)
    if k == 5:
        k_values.append(f"$k = {k}$")
    else:
        k_values.append(f"$k \\geq {k}$")
    percentages.append((cumulative_correct / total) * 100)

# --- 3. Generate Consensus Plotting Data (Agreement) ---
def get_consensus_level(row):
    """Categorizes the agreement level for a 5-judge panel."""
    top_vote_count = row.value_counts().max()
    if top_vote_count == 5:
        return "Full (5/5)"
    elif top_vote_count == 4:
        return "Strong (4/1)"
    elif top_vote_count == 3:
        return "Weak (3/2)"
    else:
        return "Major Discrepancy (Split)"

# Apply function to get the series of levels
consensus_series = vote_df.apply(get_consensus_level, axis=1)

# Create the consensus_df
consensus_counts = consensus_series.value_counts(normalize=True) * 100
consensus_df = consensus_counts.reset_index()
consensus_df.columns = ['Level', 'Percentage']

# Define the logical order and sort
level_order = ["Full (5/5)", "Strong (4/1)", "Weak (3/2)"]
consensus_df['Level'] = pd.Categorical(consensus_df['Level'], categories=level_order, ordered=True)
consensus_df = consensus_df.sort_values('Level').reset_index(drop=True)

# --- Verification ---
print("--- Correctness Data (Main Plot) ---")
for k, p in zip(k_values, percentages):
    print(f"{k}: {p:.2f}%")

print("\n--- Consensus Data (Inset Plot) ---")
print(consensus_df)

--- Correctness Data (Main Plot) ---
$k = 5$: 5.13%
$k \geq 4$: 23.32%
$k \geq 3$: 60.23%
$k \geq 2$: 90.55%
$k \geq 1$: 98.48%

--- Consensus Data (Inset Plot) ---
          Level  Percentage
0    Full (5/5)    6.649627
1  Strong (4/1)   26.111266
2    Weak (3/2)   67.239107


In [29]:
# DEBUG: Print the relationship to see why accuracy != 80%
total_samples = len(sample_stats_df)
samples_with_3_correct = (sample_stats_df['num_correct'] >= 3).sum()
actual_accuracy = is_majority_correct.sum()

print(f"Samples with $k \geq 3$ judges correct: {samples_with_3_correct / total_samples:.2%}")
print(f"Actual Majority Vote Accuracy: {actual_accuracy / total_samples:.2%}")

Samples with $k \geq 3$ judges correct: 60.23%
Actual Majority Vote Accuracy: 60.23%


In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# --- Global Config ---
plt.rcParams['font.family'] = 'serif'
sns.set_context("paper", font_scale=1.1)

# 1. Initialize Figure
fig = plt.figure(figsize=(5, 3), dpi=1000)
ax1 = fig.add_subplot(111)

# Reverse k_values to plot as 5,4,3,2,1
k_values_rev = k_values[::-1]
percentages_rev = percentages[::-1]

ax1 = sns.barplot(x=k_values_rev, y=percentages_rev, palette="Blues_d",
                  edgecolor="0.2", linewidth=1, width=0.8, ax=ax1)

# ax1.set_ylabel("Samples by Correct Judges (%)", fontsize=9, fontweight='bold')
ax1.yaxis.labelpad = 2
ax1.set_xlabel("", fontsize=0)
ax1.grid(False)
ax1.set_ylim(0, 105) 
ax1.set_yticks([20, 40, 60, 80, 100])
sns.despine(ax=ax1, left=True)
ax1.tick_params(labelsize=9, pad=2)
ax1.tick_params(axis='y', labelsize=8, pad=0)
ax1.tick_params(axis='y', length=0)
# Main Plot Annotations
for p in ax1.patches:
    ax1.annotate(f'{p.get_height():.1f}%',
                  (p.get_x() + p.get_width() / 2., p.get_height() + 3),
                  ha='center', va='center', fontsize=8, fontweight='500')



plt.savefig("figures/correctness_distribution.png", dpi=1600, bbox_inches='tight')
plt.savefig("figures/correctness_distribution.pdf", dpi=1600, bbox_inches='tight', format='pdf')
plt.show()

In [12]:
import pandas as pd

# --- Table 1: Top-level k-values ---
# Reverse to match plotting order (5,4,3,2,1)
table_k = pd.DataFrame({
    'k': k_values[::-1],
    'Number of Correct Judges (%)': percentages[::-1]
})

print("=== Top-level k-values Table ===")
print(table_k.to_string(index=False))

# --- Table 2: Consensus Levels ---
# Clean Level names (remove text in parentheses)
consensus_df['Consensus Level'] = [lvl.split(' (')[0] for lvl in consensus_df['Level']]

table_consensus = consensus_df[['Consensus Level', 'Percentage']].rename(
    columns={'Percentage': 'Agreement (%)'}
)

print("\n=== Consensus Levels Table ===")
print(table_consensus.to_string(index=False))


=== Top-level k-values Table ===
         k  Number of Correct Judges (%)
$k \geq 1$                     98.478374
$k \geq 2$                     90.554735
$k \geq 3$                     60.234035
$k \geq 4$                     23.315628
   $k = 5$                      5.128000

=== Consensus Levels Table ===
Consensus Level  Agreement (%)
           Full       6.649627
         Strong      26.111266
           Weak      67.239107


In [35]:
import pandas as pd
import numpy as np
import json
from sklearn.metrics import cohen_kappa_score
from itertools import combinations


df_subset = df
df_subset = df_subset.apply(pd.to_numeric, errors='coerce').fillna(0).astype(int)

# --- 2. Models ---
models = ['qwen', 'llama', 'mistral', 'gemma', 'chatglm']

# --- 3. Cohen's Kappa Matrix ---
kappa_matrix = pd.DataFrame(index=models, columns=models, dtype=float)
for c1, c2 in combinations(models, 2):
    k = cohen_kappa_score(df_subset[c1], df_subset[c2])
    kappa_matrix.loc[c1, c2] = k
    kappa_matrix.loc[c2, c1] = k
kappa_matrix.fillna(1.0, inplace=True)

# --- 4. Pearson Correlation Matrix ---
pearson_matrix = df_subset.corr(method='pearson')

# --- 5. Print Matrices ---
print("\n--- Cohen's Kappa Matrix ---")
print(kappa_matrix.round(3))

print("\n--- Pearson Correlation Matrix ---")
print(pearson_matrix.round(3))



--- Cohen's Kappa Matrix ---
          qwen  llama  mistral  gemma  chatglm
qwen     1.000  0.478    0.004  0.543    0.213
llama    0.478  1.000    0.004  0.419    0.197
mistral  0.004  0.004    1.000  0.008    0.020
gemma    0.543  0.419    0.008  1.000    0.374
chatglm  0.213  0.197    0.020  0.374    1.000

--- Pearson Correlation Matrix ---
                                  query_id  question_id  dataset  user_query  \
query_id                               NaN          NaN      NaN         NaN   
question_id                            NaN        1.000      NaN         NaN   
dataset                                NaN          NaN      NaN         NaN   
user_query                             NaN          NaN      NaN         NaN   
sample_class                           NaN          NaN      NaN         NaN   
split                                  NaN          NaN      NaN         NaN   
qwen                                   NaN       -0.156      NaN         NaN   
mistral     