In [1]:
import json
import pandas as pd

# Load data
with open("raw_results/complete_judge_result.jsonl", "r", encoding="utf-8") as f:
    data = [json.loads(line) for line in f if line.strip()]
df = pd.DataFrame(data)

print(f"Total samples: {len(df):,}\n")

# 1. Train/Test distribution
print("1. Train/Test split:")
print(df['split'].value_counts())

# 2. Overall positive/negative
print("\n2. Overall Positive/Negative:")
print(df['y_true'].value_counts())
print(df['y_true'].value_counts(normalize=True).map("{:.1%}".format))

# 3. By split: Positive/Negative
print("\n3. By split - Positive/Negative counts:")
split_dist = df.groupby(['split', 'y_true']).size().unstack(fill_value=0)
split_dist.columns = ['Negative', 'Positive']
split_dist['Total'] = split_dist.sum(axis=1)
print(split_dist)

print("\nBy split - Positive/Negative %:")
split_perc = df.groupby('split')['y_true'].value_counts(normalize=True).unstack() * 100
split_perc.columns = ['Negative %', 'Positive %']
split_perc = split_perc.round(1)
print(split_perc)

# Dataset-wise
print("\n" + "="*60)
print("DATASET-WISE STATISTICS")
print("="*60)

datasets = sorted(df['dataset'].unique())

for ds in datasets:
    sub = df[df['dataset'] == ds]
    print(f"\nDataset: {ds}")
    print(f"  Total: {len(sub):,}")
    print("  Train/Test:")
    print(sub['split'].value_counts().to_string())
    print("  Positive/Negative counts:")
    counts = sub['y_true'].value_counts().sort_index()
    print(f"    Negative: {counts.get(False, 0):,}   Positive: {counts.get(True, 0):,}")
    print("  Positive %: {sub['y_true'].mean():.1%}")
    print("  By split:")
    for split in ['train', 'test']:
        ss = sub[sub['split'] == split]
        if len(ss) > 0:
            print(f"    {split.capitalize()}: {len(ss):,} samples, Positive {ss['y_true'].mean():.1%}")
        else:
            print(f"    {split.capitalize()}: 0 samples")

Total samples: 82,603

1. Train/Test split:
split
train    57564
test     25039
Name: count, dtype: int64

2. Overall Positive/Negative:
y_true
0    42227
1    40376
Name: count, dtype: int64
y_true
0    51.1%
1    48.9%
Name: proportion, dtype: object

3. By split - Positive/Negative counts:
       Negative  Positive  Total
split                           
test      14303     10736  25039
train     27924     29640  57564

By split - Positive/Negative %:
       Negative %  Positive %
split                        
test         57.1        42.9
train        48.5        51.5

DATASET-WISE STATISTICS

Dataset: covidqa
  Total: 14,958
  Train/Test:
split
train    10419
test      4539
  Positive/Negative counts:
    Negative: 0   Positive: 0
  Positive %: {sub['y_true'].mean():.1%}
  By split:
    Train: 10,419 samples, Positive 43.7%
    Test: 4,539 samples, Positive 47.2%

Dataset: expertqa
  Total: 14,089
  Train/Test:
split
train    9806
test     4283
  Positive/Negative counts:
    Nega