In [1]:
import json
from pathlib import Path
import yaml

import pandas as pd
import numpy as np
from tqdm.notebook import tqdm

from pan25_genai_detection.evaluator import evaluator

### Load Data

In [2]:
def load_run_jsonl(infile):
    for l in open(infile):
        j = json.loads(l)
        obj = {k: v for k, v in j.items() if k in ['dataset', 'team', 'software', 'run_id']}
        obj['eval_field'] = 'all'
        obj['eval_value'] = 'all'
        eval_keys = [k for k in j['evaluation'] if k.startswith('_eval-')]
        if obj['team'] == 'baseline':
            obj['team'] =  obj['software']
        obj_all = obj.copy()
        obj_all.update({'resource_' + k: v for k, v in j['used_resources'].items()})
        obj_all.update({k: v for k, v in j['evaluation'].items() if k not in eval_keys})
        yield obj_all

        for k in eval_keys:
            obj['eval_field'] = k.replace('_eval-', '')
            for kv, v in j['evaluation'][k].items():
                obj['eval_value'] = kv
                obj.update(**v)
                yield obj.copy()

# Load run evaluations, but keep only last runif softwares were executed multiple times
df_run_eval_raw = (pd.DataFrame(load_run_jsonl('../data/tira/runs-gen-ai-authorship-verification/runs.jsonl'))
                   .sort_values('run_id')
                   .drop_duplicates(subset=['dataset', 'team', 'software', 'eval_field', 'eval_value'], keep='last')
                   .set_index(['dataset', 'team', 'software', 'run_id', 'eval_field', 'eval_value']))
_resource_cols = [c for c in df_run_eval_raw.columns if c.startswith('resource_')]
df_resources = df_run_eval_raw[_resource_cols].dropna()
df_run_eval_raw = df_run_eval_raw.drop(columns=_resource_cols)

# Keep only highest by mean score on main test set
softwares = set(i[1] for i in df_run_eval_raw.loc['pan25-generative-ai-detection-20250604-test']
                .query('eval_field == "all"')
                .groupby('team')
                .idxmax()['mean'])
df_run_eval_raw = df_run_eval_raw.query('software in @softwares')

In [3]:
# Load case ID map
id_map = {}
for f in Path('../data/sampled').glob('*-orig-ids.jsonl'):
    for l in open(f, 'r'):
        j = json.loads(l)
        assert j['id'] not in id_map
        id_map[j['id']] = j['orig_id']

In [4]:
# Load raw predictions and back-translate case IDs
df_pred_raw = []
for pt in tqdm(list(Path('../data/tira/runs-gen-ai-authorship-verification').glob('*/run.prototext')), desc='Loading raw predictions'):
    y = yaml.full_load(open(pt))
    try:
        t, s = df_run_eval_raw.xs((y['inputDataset'], y['runId']), level=['dataset', 'run_id']).index.unique().values[0][:2]
    except KeyError:
        # Software filtered
        continue
    df_pred_raw.append(pd.read_json(next(pt.parent.glob('output/*.jsonl')), lines=True)[['id', 'label']])
    df_pred_raw[-1]['dataset'] = y['inputDataset']
    df_pred_raw[-1]['run_id'] = y['runId']
    df_pred_raw[-1]['team'] = t if t != 'baseline' else s
    df_pred_raw[-1]['software'] = s
    df_pred_raw[-1]['id'] = df_pred_raw[-1]['id'].map(lambda x: id_map[x])

df_pred_raw = pd.concat(df_pred_raw).sort_values('id').set_index(['dataset', 'team', 'software', 'run_id', 'id'])

Loading raw predictions:   0%|          | 0/150 [00:00<?, ?it/s]

In [15]:
# Merge ELOQUENT submissions
rows = []
for (t, s), d in df_pred_raw.query('dataset.str.contains("-eloquent-")').groupby(['team', 'software']):
    r = {
            'dataset': 'pan25-generative-ai-detection-eloquent-test-all',
            'team': t,
            'software': s,
            'run_id': tuple(d.index.get_level_values('run_id').unique()),
            'eval_field': 'model',
        }

    for i, d_ in enumerate([d.query('id.str.startswith("eloquent25/human/")'),
                            d.query('not id.str.startswith("eloquent25/human/")'),
                            d, d]):
        if i in [0, 1]:
            r_ = {
                **r,
                'eval_field': 'model',
                'eval_value': 'eloquent-human' if i == 0 else 'eloquent-llm',
                **{k: (np.float64(v) if type(v) is not list else v)
                   for k, v in evaluator.evaluate_all(np.array([i] * len(d_)), np.squeeze(d_['label'].values)).items()},
            }
        else:
            r_ = {
                **r,
                'eval_field': 'all' if i == 2 else 'source',
                'eval_value': 'all' if i == 2 else 'eloquent',
                **evaluator.evaluate_all(
                    d_.reset_index()['id'].str.startswith('eloquent25/human/').values,
                    np.squeeze(d_['label'].values)),
            }

        rows.append(pd.Series(list(r_.values()), index=list(r_.keys())))

df_run_eval_eloquent_raw = df_run_eval_raw.query('dataset.str.contains("-eloquent-")')
df_run_eval = pd.concat([
    df_run_eval_raw.query('not dataset.str.contains("-eloquent-")'),
    pd.DataFrame(rows).set_index(df_run_eval_raw.index.names)
])

# Fix F1 and F0.5u NaN values
def _fix_f1_nan(s):
    if np.isnan(s['f1']) and s['c@1'] == 1.0:
        s['f1'] = 1.0
        s['f05u'] = 1.0
    return s

df_run_eval = df_run_eval.apply(_fix_f1_nan, axis=1)

def _recalc_means(s):
    s['mean'] = float(np.mean([v for k, v in s.dropna().items() if type(v) is float and k != 'mean']))
    return s

# Update means
df_run_eval = df_run_eval.apply(_recalc_means, axis=1)

# Calculate FPR, FNR
df_run_eval['fpr'] = df_run_eval['confusion'].map(lambda s: s[1][0] / np.sum(s[1]) if np.sum(s[1]) else 0)
df_run_eval['fnr'] = df_run_eval['confusion'].map(lambda s: s[0][1] / np.sum(s[0]) if np.sum(s[0]) else 0)

### PAN Evaluation

In [6]:
pan_leaderboard_all = df_run_eval.groupby(['team', 'software', 'eval_field']).mean(numeric_only=True).sort_values(['eval_field', 'mean'], ascending=False)
pan_leaderboard_all

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,roc-auc,brier,c@1,f1,f05u,mean,fpr,fnr
team,software,eval_field,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
mdok,mdok,source,0.852714,0.896091,0.893909,0.898273,0.902818,0.899473,0.108044,0.093667
steely,fine-roberta,source,0.842143,0.878727,0.877000,0.864818,0.880545,0.879668,0.151166,0.100330
nexus-interrogators,sensitive-liason,source,0.864857,0.873636,0.869818,0.859727,0.881000,0.878773,0.158979,0.083389
yangjlg,pink-condenser,source,0.844857,0.878091,0.870727,0.855636,0.881455,0.877341,0.171935,0.062166
cnlp-nits-pp,tomato-conduction,source,0.824571,0.873000,0.873000,0.854455,0.882091,0.873936,0.175687,0.049651
...,...,...,...,...,...,...,...,...,...,...
baseline-binoculars-tiny-llama,baseline-binoculars-tiny-llama,all,0.680000,0.773500,0.654000,0.219500,0.329000,0.531200,0.850940,0.075593
iunlp,persistent-strut,all,0.529000,0.524500,0.524500,0.479000,0.479500,0.507300,0.453062,0.488618
mdok,mdok,all,0.499000,0.502500,0.499500,0.495000,0.497000,0.498600,0.508435,0.496169
hiwiy,connected-svn,all,0.478000,0.699000,0.384500,0.478000,0.415000,0.490900,0.000000,1.000000


### ELOQUENT Evaluation

In [7]:
df_eval_eloquent = df_run_eval.query('dataset.str.contains("-eloquent-")')
df_eval_eloquent = df_eval_eloquent.xs(('model',), level=['eval_field'])
df_eval_eloquent.index.rename('model', level='eval_value', inplace=True)
df_eval_eloquent.to_csv('../data/tira/eloquent-stats.csv')
df_eval_eloquent

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,roc-auc,brier,c@1,f1,f05u,mean,confusion,fpr,fnr
dataset,team,software,run_id,model,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1
pan25-generative-ai-detection-eloquent-test-all,advacheck,watery-bag,"(2025-06-11-00-07-47, 2025-06-05-11-20-09)",eloquent-human,,0.955,0.955,0.0,0.0,0.4775,"[[21, 1], [0, 0]]",0.0,0.045455
pan25-generative-ai-detection-eloquent-test-all,advacheck,watery-bag,"(2025-06-11-00-07-47, 2025-06-05-11-20-09)",eloquent-llm,,0.719,0.719,0.837,0.928,0.80075,"[[0, 0], [80, 205]]",0.280702,0.0
pan25-generative-ai-detection-eloquent-test-all,asdkklkk,chromatic-fruit,"(2025-06-11-00-08-06, 2025-06-05-11-20-25)",eloquent-human,,0.909,0.909,0.0,0.0,0.4545,"[[20, 2], [0, 0]]",0.0,0.090909
pan25-generative-ai-detection-eloquent-test-all,asdkklkk,chromatic-fruit,"(2025-06-11-00-08-06, 2025-06-05-11-20-25)",eloquent-llm,,0.204,0.204,0.338,0.561,0.32675,"[[0, 0], [227, 58]]",0.796491,0.0
pan25-generative-ai-detection-eloquent-test-all,baseline-binoculars-llama-3.1,baseline-binoculars-llama-3.1,"(2025-06-11-00-10-24, 2025-06-05-11-23-05)",eloquent-human,,0.786,0.727,0.0,0.0,0.37825,"[[16, 6], [0, 0]]",0.0,0.272727
pan25-generative-ai-detection-eloquent-test-all,baseline-binoculars-llama-3.1,baseline-binoculars-llama-3.1,"(2025-06-11-00-10-24, 2025-06-05-11-23-05)",eloquent-llm,,0.812,0.628,0.766,0.891,0.77425,"[[0, 0], [105, 180]]",0.368421,0.0
pan25-generative-ai-detection-eloquent-test-all,baseline-binoculars-tiny-llama,baseline-binoculars-tiny-llama,"(2025-06-11-00-09-39, 2025-06-05-11-22-27)",eloquent-human,,0.915,1.0,1.0,1.0,0.97875,"[[22, 0], [0, 0]]",0.0,0.0
pan25-generative-ai-detection-eloquent-test-all,baseline-binoculars-tiny-llama,baseline-binoculars-tiny-llama,"(2025-06-11-00-09-39, 2025-06-05-11-22-27)",eloquent-llm,,0.392,0.131,0.23,0.427,0.295,"[[0, 0], [245, 40]]",0.859649,0.0
pan25-generative-ai-detection-eloquent-test-all,baseline-ppmd,baseline-ppmd,"(2025-06-11-00-08-43, 2025-06-05-11-21-50)",eloquent-human,,0.707,0.258,0.0,0.0,0.24125,"[[5, 17], [0, 0]]",0.0,0.772727
pan25-generative-ai-detection-eloquent-test-all,baseline-ppmd,baseline-ppmd,"(2025-06-11-00-08-43, 2025-06-05-11-21-50)",eloquent-llm,,0.781,0.654,0.755,0.885,0.76875,"[[0, 0], [90, 195]]",0.315789,0.0


In [8]:
df_eval_eloquent_mean = df_eval_eloquent.groupby('model').mean(numeric_only=True).sort_values('c@1', ascending=True)
df_eval_eloquent_mean.to_csv('../data/tira/eloquent-stats-mean.csv')
df_eval_eloquent_mean

Unnamed: 0_level_0,roc-auc,brier,c@1,f1,f05u,mean,fpr,fnr
model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
eloquent-llm,,0.675929,0.632679,0.750143,0.869857,0.732152,0.36203,0.0
eloquent-human,,0.901464,0.868929,0.5,0.464286,0.685848,0.0,0.142857


In [9]:
df_pred_raw_eloquent = df_pred_raw.query('dataset.str.contains("-eloquent-")')
df_pred_raw_eloquent.to_csv('../data/tira/eloquent-pred-raw.csv')
df_pred_raw_eloquent

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,label
dataset,team,software,run_id,id,Unnamed: 5_level_1
pan25-generative-ai-detection-eloquent-20250610-test,baseline-binoculars-tiny-llama,baseline-binoculars-tiny-llama,2025-06-11-00-09-39,eloquent25/adhd/030,0.156250
pan25-generative-ai-detection-eloquent-20250610-test,styloch,big-cv,2025-06-11-00-43-53,eloquent25/adhd/030,0.202366
pan25-generative-ai-detection-eloquent-20250610-test,baseline-binoculars-llama-3.1,baseline-binoculars-llama-3.1,2025-06-11-00-10-24,eloquent25/adhd/030,0.816406
pan25-generative-ai-detection-eloquent-20250610-test,hello-world,tart-objective,2025-06-11-00-28-56,eloquent25/adhd/030,0.604634
pan25-generative-ai-detection-eloquent-20250610-test,iimasnlp,isg-graph-v3,2025-06-11-12-48-56,eloquent25/adhd/030,1.000000
...,...,...,...,...,...
pan25-generative-ai-detection-eloquent-20250605-test,xlbniu,poky-corgie,2025-06-05-13-14-14,eloquent25/translation/052,0.000000
pan25-generative-ai-detection-eloquent-20250605-test,iimasnlp,isg-graph-v3,2025-06-05-12-31-52,eloquent25/translation/052,0.000000
pan25-generative-ai-detection-eloquent-20250605-test,pindrop,blistering-band,2025-06-05-12-44-03,eloquent25/translation/052,0.257436
pan25-generative-ai-detection-eloquent-20250605-test,yangjlg,pink-condenser,2025-06-05-13-14-25,eloquent25/translation/052,0.000100
