In [1]:
import json
from pathlib import Path
import yaml

import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

from pan25_genai_detection.evaluator import evaluator

In [28]:
OUT_DIR = Path('../data/pan-task-eval-out/')
OUT_DIR.mkdir(parents=True, exist_ok=True)

### Load Data

In [2]:
def load_run_jsonl(infile):
    for l in open(infile):
        j = json.loads(l)
        obj = {k: v for k, v in j.items() if k in ['dataset', 'team', 'software', 'run_id']}
        obj['eval_field'] = 'all'
        obj['eval_value'] = 'all'
        eval_keys = [k for k in j['evaluation'] if k.startswith('_eval-')]
        if obj['team'] == 'baseline':
            obj['team'] =  obj['software']
        obj_all = obj.copy()
        obj_all.update({'resource_' + k: v for k, v in j['used_resources'].items()})
        obj_all.update({k: v for k, v in j['evaluation'].items() if k not in eval_keys})
        yield obj_all

        for k in eval_keys:
            obj['eval_field'] = k.replace('_eval-', '')
            for kv, v in j['evaluation'][k].items():
                obj['eval_value'] = kv
                obj.update(**v)
                yield obj.copy()

# Load run evaluations, but keep only last runif softwares were executed multiple times
df_run_eval_raw = (pd.DataFrame(load_run_jsonl('../data/tira/runs-gen-ai-authorship-verification/runs.jsonl'))
                   .sort_values('run_id')
                   .drop_duplicates(subset=['dataset', 'team', 'software', 'eval_field', 'eval_value'], keep='last')
                   .set_index(['dataset', 'team', 'software', 'run_id', 'eval_field', 'eval_value']))
_resource_cols = [c for c in df_run_eval_raw.columns if c.startswith('resource_')]
df_resources = df_run_eval_raw[_resource_cols].dropna()
df_run_eval_raw = df_run_eval_raw.drop(columns=_resource_cols)

# Keep only highest by mean score on main test set
softwares = set(i[1] for i in df_run_eval_raw.loc['pan25-generative-ai-detection-20250604-test']
                .query('eval_field == "all"')
                .groupby('team')
                .idxmax()['mean'])
df_run_eval_raw = df_run_eval_raw.query('software in @softwares')

# Drop "all"
df_run_eval_raw = df_run_eval_raw.query('eval_field != "all"')

In [3]:
# Load case ID map
id_map = {}
for f in Path('../data/sampled').glob('*-orig-ids.jsonl'):
    for l in open(f, 'r'):
        j = json.loads(l)
        assert j['id'] not in id_map
        id_map[j['id']] = j['orig_id']

In [4]:
# Load raw predictions and back-translate case IDs
df_pred_raw = []
for pt in tqdm(list(Path('../data/tira/runs-gen-ai-authorship-verification').glob('*/run.prototext')), desc='Loading raw predictions'):
    y = yaml.full_load(open(pt))
    try:
        t, s = df_run_eval_raw.xs((y['inputDataset'], y['runId']), level=['dataset', 'run_id']).index.unique().values[0][:2]
    except KeyError:
        # Software filtered
        continue
    df_pred_raw.append(pd.read_json(next(pt.parent.glob('output/*.jsonl')), lines=True)[['id', 'label']])
    df_pred_raw[-1]['dataset'] = y['inputDataset']
    df_pred_raw[-1]['run_id'] = y['runId']
    df_pred_raw[-1]['team'] = t if t != 'baseline' else s
    df_pred_raw[-1]['software'] = s
    df_pred_raw[-1]['id'] = df_pred_raw[-1]['id'].map(lambda x: id_map[x])

df_pred_raw = pd.concat(df_pred_raw).sort_values('id').set_index(['dataset', 'team', 'software', 'run_id', 'id'])

Loading raw predictions:   0%|          | 0/150 [00:00<?, ?it/s]

In [5]:
# Merge ELOQUENT submissions
rows = []
for (t, s), d in df_pred_raw.query('dataset.str.contains("-eloquent-")').groupby(['team', 'software']):
    r = {
            'dataset': 'pan25-generative-ai-detection-eloquent-test-all',
            'team': t,
            'software': s,
            'run_id': tuple(d.index.get_level_values('run_id').unique()),
            'eval_field': 'model',
        }

    for i, d_ in enumerate([d.query('id.str.startswith("eloquent25/human/")'),
                            d.query('not id.str.startswith("eloquent25/human/")'),
                            d]):
        if i in [0, 1]:
            r_ = {
                **r,
                'eval_field': 'model',
                'eval_value': 'eloquent-human' if i == 0 else 'eloquent-llm',
                **{k: (np.float64(v) if type(v) is not list else v)
                   for k, v in evaluator.evaluate_all(np.array([i] * len(d_)), np.squeeze(d_['label'].values)).items()},
            }
        else:
            r_ = {
                **r,
                'eval_field': 'source',
                'eval_value': 'eloquent',
                **evaluator.evaluate_all(
                    d_.reset_index()['id'].str.startswith('eloquent25/human/').values,
                    np.squeeze(d_['label'].values)),
            }

        rows.append(pd.Series(list(r_.values()), index=list(r_.keys())))

df_run_eval_eloquent_raw = df_run_eval_raw.query('dataset.str.contains("-eloquent-")')
df_run_eval = pd.concat([
    df_run_eval_raw.query('not dataset.str.contains("-eloquent-")'),
    pd.DataFrame(rows).set_index(df_run_eval_raw.index.names)
])

# Fix F1 and F0.5u NaN values
def _fix_f1_nan(s):
    if np.isnan(s['f1']) and s['c@1'] == 1.0:
        s['f1'] = 1.0
        s['f05u'] = 1.0
    return s

df_run_eval = df_run_eval.apply(_fix_f1_nan, axis=1)

def _recalc_means(s):
    s['mean'] = float(np.mean([v for k, v in s.dropna().items() if type(v) is float and k != 'mean']))
    return s

# Update means
df_run_eval = df_run_eval.apply(_recalc_means, axis=1)

# Calculate FPR, FNR
df_run_eval['fpr'] = df_run_eval['confusion'].map(lambda s: s[1][0] / np.sum(s[1]) if np.sum(s[1]) else 0)
df_run_eval['fnr'] = df_run_eval['confusion'].map(lambda s: s[0][1] / np.sum(s[0]) if np.sum(s[0]) else 0)

# Split off ELOQUENT
df_run_eval_eloquent = df_run_eval.query('dataset.str.contains("-eloquent-")')

### PAN Evaluation

In [29]:
print('Datasets:\n - ', end='')
print('\n - '.join(df_run_eval.query('eval_field == "source"').reset_index()['eval_value'].unique()))

Datasets:
 - riddell-juola-obfuscated
 - riddell-juola-o1-deepseek
 - riddell-juola
 - pan24-test
 - pan24-o1
 - gutenberg-19c-fiction-obfuscated
 - gutenberg-19c-fiction-o1
 - gutenberg-19c-fiction
 - brennan-greenstadt-obfuscated
 - brennan-greenstadt
 - eloquent


In [50]:
pan_leaderboard_source = df_run_eval.query('eval_field == "source"').groupby(['team', 'software']).mean(numeric_only=True).sort_values(['mean'], ascending=False)
pan_leaderboard_source.reset_index().to_html(OUT_DIR / 'leaderboard.html', float_format=lambda f: f'{f:.3f}')
pan_leaderboard_source.round(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,roc-auc,brier,c@1,f1,f05u,mean,fpr,fnr
team,software,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
mdok,mdok,0.853,0.896,0.894,0.898,0.903,0.899,0.108,0.094
steely,fine-roberta,0.842,0.879,0.877,0.865,0.881,0.88,0.151,0.1
nexus-interrogators,sensitive-liason,0.865,0.874,0.87,0.86,0.881,0.879,0.159,0.083
yangjlg,pink-condenser,0.845,0.878,0.871,0.856,0.881,0.877,0.172,0.062
cnlp-nits-pp,tomato-conduction,0.825,0.873,0.873,0.854,0.882,0.874,0.176,0.05
unibuc-nlp,tangy-arch,0.828,0.885,0.864,0.845,0.876,0.872,0.187,0.052
moadmoad,modernbert,0.822,0.866,0.865,0.855,0.882,0.871,0.175,0.058
iimasnlp,isg-graph-v3,0.838,0.868,0.856,0.851,0.877,0.869,0.171,0.077
bohan-li,distinct-dachshund,0.848,0.858,0.852,0.847,0.87,0.866,0.174,0.092
advacheck,watery-bag,0.802,0.855,0.855,0.854,0.879,0.863,0.169,0.084


In [8]:
pan_leaderboard_model = df_run_eval.query('eval_field == "model"').groupby(['team', 'software']).mean(numeric_only=True).sort_values(['mean'], ascending=False)
pan_leaderboard_model.round(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,roc-auc,brier,c@1,f1,f05u,mean,fpr,fnr
team,software,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
mdok,mdok,,0.97,0.966,0.953,0.964,0.963,0.034,0.0
hiwiy,connected-svn,,0.855,0.949,0.949,0.949,0.925,0.0,0.051
team-a,deafening-template,,0.868,0.868,0.883,0.908,0.882,0.088,0.044
iunlp,persistent-strut,,0.873,0.873,0.871,0.909,0.881,0.12,0.008
steely,fine-roberta,,0.859,0.857,0.842,0.883,0.86,0.141,0.002
baseline-binoculars-llama-3.1,baseline-binoculars-llama-3.1,,0.865,0.827,0.833,0.884,0.852,0.157,0.012
baseline-ppmd,baseline-ppmd,,0.828,0.831,0.836,0.881,0.844,0.119,0.036
baseline-tf-idf,baseline-tf-idf,,0.877,0.841,0.821,0.84,0.843,0.135,0.006
hello-world,tart-objective,,0.877,0.841,0.821,0.84,0.843,0.135,0.006
nexus-interrogators,sensitive-liason,,0.84,0.833,0.818,0.858,0.837,0.164,0.003


In [9]:
pan_leaderboard_genre = df_run_eval.query('eval_field == "genre"').groupby(['team', 'software']).mean(numeric_only=True).sort_values(['mean'], ascending=False)
pan_leaderboard_genre.round(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,roc-auc,brier,c@1,f1,f05u,mean,fpr,fnr
team,software,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
mdok,mdok,0.996,0.987,0.985,0.991,0.994,0.991,0.013,0.007
steely,fine-roberta,0.95,0.924,0.922,0.947,0.964,0.943,0.072,0.065
moadmoad,modernbert,0.974,0.914,0.913,0.94,0.97,0.943,0.094,0.005
yangjlg,pink-condenser,0.966,0.92,0.913,0.94,0.968,0.943,0.091,0.016
nexus-interrogators,sensitive-liason,0.969,0.918,0.914,0.941,0.965,0.942,0.085,0.041
cnlp-nits-pp,tomato-conduction,0.939,0.913,0.913,0.94,0.97,0.937,0.094,0.005
hello-world,tart-objective,0.976,0.914,0.914,0.923,0.954,0.937,0.076,0.069
baseline-tf-idf,baseline-tf-idf,0.976,0.914,0.914,0.923,0.954,0.937,0.076,0.069
unibuc-nlp,tangy-arch,0.968,0.916,0.899,0.927,0.962,0.935,0.109,0.009
advacheck,watery-bag,0.927,0.911,0.911,0.937,0.964,0.933,0.09,0.026


### ELOQUENT Evaluation

In [21]:
pan_leaderboard_eloquent = df_run_eval_eloquent.groupby(['team', 'software', 'eval_field']).mean(numeric_only=True).sort_values(['eval_field', 'mean'], ascending=False)
pan_leaderboard_eloquent.round(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,roc-auc,brier,c@1,f1,f05u,mean,fpr,fnr
team,software,eval_field,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
baseline-binoculars-tiny-llama,baseline-binoculars-tiny-llama,source,0.62,0.874,0.806,0.0,0.0,0.46,1.0,0.14
asdkklkk,chromatic-fruit,source,0.444,0.746,0.746,0.049,0.038,0.405,0.909,0.204
baseline-ppmd,baseline-ppmd,source,0.473,0.693,0.366,0.134,0.088,0.351,0.227,0.684
team-a,deafening-template,source,0.541,0.264,0.264,0.144,0.096,0.262,0.136,0.782
baseline-binoculars-llama-3.1,baseline-binoculars-llama-3.1,source,0.265,0.575,0.365,0.059,0.04,0.261,0.727,0.632
s-titze,undecidable-muenster,source,0.299,0.549,0.446,0.0,0.0,0.259,1.0,0.519
shushantatud,dense-casket,source,0.229,0.488,0.479,0.0,0.0,0.239,1.0,0.484
ds-gt-pan,metallic-artillery,source,0.26,0.443,0.443,0.012,0.008,0.233,0.955,0.526
cnlp-nits-pp,tomato-conduction,source,0.246,0.456,0.456,0.0,0.0,0.232,1.0,0.509
unibuc-nlp,tangy-arch,source,0.092,0.556,0.472,0.0,0.0,0.224,1.0,0.491


In [22]:
df_eval_eloquent = df_run_eval_eloquent.xs(('model',), level=['eval_field'])
df_eval_eloquent.index.rename('model', level='eval_value', inplace=True)
df_eval_eloquent.to_csv('../data/tira/eloquent-stats.csv')
df_eval_eloquent.round(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,roc-auc,brier,c@1,f1,f05u,mean,confusion,fpr,fnr
dataset,team,software,run_id,model,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
pan25-generative-ai-detection-eloquent-test-all,advacheck,watery-bag,"(2025-06-11-00-07-47, 2025-06-05-11-20-09)",eloquent-human,,0.955,0.955,0.0,0.0,0.478,"[[21, 1], [0, 0]]",0.0,0.045
pan25-generative-ai-detection-eloquent-test-all,advacheck,watery-bag,"(2025-06-11-00-07-47, 2025-06-05-11-20-09)",eloquent-llm,,0.719,0.719,0.837,0.928,0.801,"[[0, 0], [80, 205]]",0.281,0.0
pan25-generative-ai-detection-eloquent-test-all,asdkklkk,chromatic-fruit,"(2025-06-11-00-08-06, 2025-06-05-11-20-25)",eloquent-human,,0.909,0.909,0.0,0.0,0.454,"[[20, 2], [0, 0]]",0.0,0.091
pan25-generative-ai-detection-eloquent-test-all,asdkklkk,chromatic-fruit,"(2025-06-11-00-08-06, 2025-06-05-11-20-25)",eloquent-llm,,0.204,0.204,0.338,0.561,0.327,"[[0, 0], [227, 58]]",0.796,0.0
pan25-generative-ai-detection-eloquent-test-all,baseline-binoculars-llama-3.1,baseline-binoculars-llama-3.1,"(2025-06-11-00-10-24, 2025-06-05-11-23-05)",eloquent-human,,0.786,0.727,0.0,0.0,0.378,"[[16, 6], [0, 0]]",0.0,0.273
pan25-generative-ai-detection-eloquent-test-all,baseline-binoculars-llama-3.1,baseline-binoculars-llama-3.1,"(2025-06-11-00-10-24, 2025-06-05-11-23-05)",eloquent-llm,,0.812,0.628,0.766,0.891,0.774,"[[0, 0], [105, 180]]",0.368,0.0
pan25-generative-ai-detection-eloquent-test-all,baseline-binoculars-tiny-llama,baseline-binoculars-tiny-llama,"(2025-06-11-00-09-39, 2025-06-05-11-22-27)",eloquent-human,,0.915,1.0,1.0,1.0,0.979,"[[22, 0], [0, 0]]",0.0,0.0
pan25-generative-ai-detection-eloquent-test-all,baseline-binoculars-tiny-llama,baseline-binoculars-tiny-llama,"(2025-06-11-00-09-39, 2025-06-05-11-22-27)",eloquent-llm,,0.392,0.131,0.23,0.427,0.295,"[[0, 0], [245, 40]]",0.86,0.0
pan25-generative-ai-detection-eloquent-test-all,baseline-ppmd,baseline-ppmd,"(2025-06-11-00-08-43, 2025-06-05-11-21-50)",eloquent-human,,0.707,0.258,0.0,0.0,0.241,"[[5, 17], [0, 0]]",0.0,0.773
pan25-generative-ai-detection-eloquent-test-all,baseline-ppmd,baseline-ppmd,"(2025-06-11-00-08-43, 2025-06-05-11-21-50)",eloquent-llm,,0.781,0.654,0.755,0.885,0.769,"[[0, 0], [90, 195]]",0.316,0.0


In [23]:
df_eval_eloquent_mean = df_eval_eloquent.groupby('model').mean(numeric_only=True).sort_values('c@1', ascending=True)
df_eval_eloquent_mean.to_csv('../data/tira/eloquent-stats-mean.csv')
df_eval_eloquent_mean.round(3)

Unnamed: 0_level_0,roc-auc,brier,c@1,f1,f05u,mean,fpr,fnr
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
eloquent-llm,,0.676,0.633,0.75,0.87,0.732,0.362,0.0
eloquent-human,,0.901,0.869,0.5,0.464,0.686,0.0,0.143


In [24]:
df_pred_raw_eloquent = df_pred_raw.query('dataset.str.contains("-eloquent-")')
df_pred_raw_eloquent.to_csv('../data/tira/eloquent-pred-raw.csv')
df_pred_raw_eloquent.round(3)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,label
dataset,team,software,run_id,id,Unnamed: 5_level_1
pan25-generative-ai-detection-eloquent-20250610-test,baseline-binoculars-tiny-llama,baseline-binoculars-tiny-llama,2025-06-11-00-09-39,eloquent25/adhd/030,0.156
pan25-generative-ai-detection-eloquent-20250610-test,styloch,big-cv,2025-06-11-00-43-53,eloquent25/adhd/030,0.202
pan25-generative-ai-detection-eloquent-20250610-test,baseline-binoculars-llama-3.1,baseline-binoculars-llama-3.1,2025-06-11-00-10-24,eloquent25/adhd/030,0.816
pan25-generative-ai-detection-eloquent-20250610-test,hello-world,tart-objective,2025-06-11-00-28-56,eloquent25/adhd/030,0.605
pan25-generative-ai-detection-eloquent-20250610-test,iimasnlp,isg-graph-v3,2025-06-11-12-48-56,eloquent25/adhd/030,1.000
...,...,...,...,...,...
pan25-generative-ai-detection-eloquent-20250605-test,xlbniu,poky-corgie,2025-06-05-13-14-14,eloquent25/translation/052,0.000
pan25-generative-ai-detection-eloquent-20250605-test,iimasnlp,isg-graph-v3,2025-06-05-12-31-52,eloquent25/translation/052,0.000
pan25-generative-ai-detection-eloquent-20250605-test,pindrop,blistering-band,2025-06-05-12-44-03,eloquent25/translation/052,0.257
pan25-generative-ai-detection-eloquent-20250605-test,yangjlg,pink-condenser,2025-06-05-13-14-25,eloquent25/translation/052,0.000
