In [1]:
import json
from pathlib import Path
import yaml

import pandas as pd
from tqdm.notebook import tqdm

### Load Data

In [2]:
def load_run_jsonl(infile):
    """Flatten a runs JSON-lines file into one flat record per evaluation slice.

    Every non-blank line of ``infile`` is parsed as a JSON object that must
    contain an ``evaluation`` mapping and a ``used_resources`` mapping. For
    each line the generator yields:

    * one aggregate record with ``eval_field == eval_value == 'all'`` holding
      the run identifiers present on the line (``dataset``, ``team``,
      ``software``, ``run_id``), every ``used_resources`` entry under a
      ``resource_`` key prefix, and every ``evaluation`` entry whose key does
      not start with ``_eval-``;
    * one record per entry of each ``_eval-<field>`` mapping, holding the run
      identifiers, ``eval_field = <field>``, ``eval_value = <entry key>`` and
      the metrics of that entry only.

    Args:
        infile: Path of the JSON-lines file to read.

    Yields:
        dict: A newly built flat record; no dict is shared between yields.

    Raises:
        KeyError: If a line lacks ``evaluation`` or ``used_resources``.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    id_keys = ('dataset', 'team', 'software', 'run_id')

    # The context manager closes the file even if the consumer abandons the
    # generator early or a line fails to parse.
    with open(infile, encoding='utf-8') as fh:
        for line in fh:
            if not line.strip():
                # Tolerate blank / trailing lines instead of failing in json.loads.
                continue
            j = json.loads(line)
            base = {k: v for k, v in j.items() if k in id_keys}
            evaluation = j['evaluation']
            eval_keys = [k for k in evaluation if k.startswith('_eval-')]

            # Aggregate row: identifiers + resource usage + top-level metrics.
            obj_all = {**base, 'eval_field': 'all', 'eval_value': 'all'}
            obj_all.update({'resource_' + k: v for k, v in j['used_resources'].items()})
            obj_all.update({k: v for k, v in evaluation.items() if k not in eval_keys})
            yield obj_all

            # Breakdown rows: build a fresh dict per entry so that metrics of
            # one entry never carry over into a later entry that lacks them.
            for k in eval_keys:
                field = k.replace('_eval-', '')
                for kv, v in evaluation[k].items():
                    yield {**base, 'eval_field': field, 'eval_value': kv, **v}

# Flatten the run log into a frame indexed by (dataset, team, software, run_id).
df_run_eval = pd.DataFrame(
    load_run_jsonl('../data/tira/runs-gen-ai-authorship-verification/runs.jsonl')
).set_index(['dataset', 'team', 'software', 'run_id'])

# Split the resource-usage columns off into their own frame (rows with any
# missing resource value are dropped) and keep only the metrics in df_run_eval.
_resource_cols = list(filter(lambda c: c.startswith('resource_'), df_run_eval.columns))
df_resources = df_run_eval.loc[:, _resource_cols].dropna()
df_run_eval = df_run_eval.drop(columns=_resource_cols)
df_run_eval

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,eval_field,eval_value,roc-auc,brier,c@1,f1,f05u,mean,confusion
dataset,team,software,run_id,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
pan25-generative-ai-detection-eloquent-20250605-test,hello-world,tart-objective,2025-06-05-12-11-02,all,all,0.937,0.852,0.823,0.846,0.929,0.878,"[[18, 4], [42, 221]]"
pan25-generative-ai-detection-eloquent-20250605-test,hello-world,tart-objective,2025-06-05-12-11-02,source,eloquent,0.937,0.852,0.823,0.846,0.929,0.878,"[[18, 4], [42, 221]]"
pan25-generative-ai-detection-eloquent-20250605-test,hello-world,tart-objective,2025-06-05-12-11-02,model,Homoglyph,,0.895,0.950,0.952,0.980,0.756,"[[0, 0], [1, 21]]"
pan25-generative-ai-detection-eloquent-20250605-test,hello-world,tart-objective,2025-06-05-12-11-02,model,HumanAIzers-0,,0.825,0.775,0.811,0.915,0.665,"[[0, 0], [4, 18]]"
pan25-generative-ai-detection-eloquent-20250605-test,hello-world,tart-objective,2025-06-05-12-11-02,model,HumanAIzers-1,,0.895,0.950,0.952,0.980,0.756,"[[0, 0], [1, 21]]"
...,...,...,...,...,...,...,...,...,...,...,...,...
pan25-generative-ai-detection-20250604-test,ds-gt-pan,metallic-artillery,2025-06-04-17-23-58,genre,essays,0.973,0.988,0.988,0.993,0.990,0.986,"[[254, 14], [3, 1160]]"
pan25-generative-ai-detection-20250604-test,ds-gt-pan,metallic-artillery,2025-06-04-17-23-58,genre,essays-obfs,0.734,0.531,0.531,0.662,0.828,0.657,"[[70, 2], [454, 446]]"
pan25-generative-ai-detection-20250604-test,ds-gt-pan,metallic-artillery,2025-06-04-17-23-58,genre,fiction,0.993,0.992,0.992,0.993,0.996,0.993,"[[956, 1], [15, 1065]]"
pan25-generative-ai-detection-20250604-test,ds-gt-pan,metallic-artillery,2025-06-04-17-23-58,genre,fiction-obfs,,0.960,0.960,0.980,0.992,0.778,"[[0, 0], [4, 96]]"


In [3]:
# Load case ID map: 'id' -> 'orig_id' from every *-orig-ids.jsonl file
id_map = {}
for f in Path('../data/sampled').glob('*-orig-ids.jsonl'):
    # Context manager so each map file is closed as soon as it has been read.
    with open(f, 'r', encoding='utf-8') as fh:
        for l in fh:
            j = json.loads(l)
            # IDs must be unique across all map files; otherwise a later entry
            # would silently overwrite an earlier mapping.
            assert j['id'] not in id_map, f"duplicate case ID {j['id']!r} in {f}"
            id_map[j['id']] = j['orig_id']

In [4]:
# Load raw predictions and back-translate case IDs
_pred_frames = []  # one frame per run; concatenated once after the loop
for pt in tqdm(list(Path('../data/tira/runs-gen-ai-authorship-verification').glob('*/run.prototext')), desc='Loading raw predictions'):
    # NOTE(review): full_load uses PyYAML's FullLoader; if run.prototext files can
    # come from untrusted sources, prefer yaml.safe_load -- confirm before changing.
    with open(pt) as fh:  # close the metadata file right after parsing
        y = yaml.full_load(fh)

    # Only the first JSONL file under output/ is read; next() raises
    # StopIteration if the run directory has none.
    preds = pd.read_json(next(pt.parent.glob('output/*.jsonl')), lines=True)[['id', 'label']]

    # Resolve team/software from the evaluation index, whose levels are
    # (dataset, team, software, run_id); index[0] raises IndexError if the
    # run is not present in df_run_eval.
    r = df_run_eval.query('dataset == @y["inputDataset"] and run_id == @y["runId"]').index[0]

    _pred_frames.append(preds.assign(
        dataset=y['inputDataset'],
        run_id=y['runId'],
        team=r[1],
        software=r[2],
        # Strict lookup: an ID missing from id_map raises KeyError rather than
        # silently turning into NaN.
        id=preds['id'].map(lambda x: id_map[x]),
    ))

df_pred_raw = pd.concat(_pred_frames).set_index(['dataset', 'team', 'software', 'run_id', 'id'])
df_pred_raw

Loading raw predictions:   0%|          | 0/148 [00:00<?, ?it/s]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,label
dataset,team,software,run_id,id,Unnamed: 5_level_1
pan25-generative-ai-detection-20250604-test,mdok,mdok,2025-06-04-16-11-23,gutenberg-19c-fiction/PG5673_9,0.000308
pan25-generative-ai-detection-20250604-test,mdok,mdok,2025-06-04-16-11-23,riddell-juola-obfuscated/gpt-4o/59db82,1.000000
pan25-generative-ai-detection-20250604-test,mdok,mdok,2025-06-04-16-11-23,gutenberg-19c-fiction/deepseek-r1-distill-qwen-32b-obfs-temp-1.1/PG4062_9,1.000000
pan25-generative-ai-detection-20250604-test,mdok,mdok,2025-06-04-16-11-23,brennan-greenstadt/gpt-4o-mini/j_12_4,1.000000
pan25-generative-ai-detection-20250604-test,mdok,mdok,2025-06-04-16-11-23,pan24-test/text-bison-002/news-2021-01-01-2021-12-31-colonialpipelinehack/art-064,1.000000
...,...,...,...,...,...
pan25-generative-ai-detection-eloquent-20250610-test,iimasnlp,isg-graph-v3,2025-06-11-12-48-56,eloquent25/adhd/037,1.000000
pan25-generative-ai-detection-eloquent-20250610-test,iimasnlp,isg-graph-v3,2025-06-11-12-48-56,eloquent25/adhd/038,0.901600
pan25-generative-ai-detection-eloquent-20250610-test,iimasnlp,isg-graph-v3,2025-06-11-12-48-56,eloquent25/adhd/030,1.000000
pan25-generative-ai-detection-eloquent-20250610-test,iimasnlp,isg-graph-v3,2025-06-11-12-48-56,eloquent25/adhd/033,0.991400


### ELOQUENT Evaluation

In [5]:
# Datasets whose name contains '-eloquent-', in order of first appearance.
eloquent_ds_names = [
    name
    for name in df_run_eval.index.get_level_values('dataset').unique()
    if '-eloquent-' in name
]

# Keep only the per-model breakdown rows of those datasets and export them.
df_eval_eloquent = df_run_eval[
    df_run_eval.index.get_level_values('dataset').isin(eloquent_ds_names)
    & df_run_eval['eval_field'].eq('model').to_numpy()
]
df_eval_eloquent.to_csv('../data/tira/eloquent-stats.csv')
df_eval_eloquent

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,eval_field,eval_value,roc-auc,brier,c@1,f1,f05u,mean,confusion
dataset,team,software,run_id,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
pan25-generative-ai-detection-eloquent-20250605-test,hello-world,tart-objective,2025-06-05-12-11-02,model,Homoglyph,,0.895,0.950,0.952,0.980,0.756,"[[0, 0], [1, 21]]"
pan25-generative-ai-detection-eloquent-20250605-test,hello-world,tart-objective,2025-06-05-12-11-02,model,HumanAIzers-0,,0.825,0.775,0.811,0.915,0.665,"[[0, 0], [4, 18]]"
pan25-generative-ai-detection-eloquent-20250605-test,hello-world,tart-objective,2025-06-05-12-11-02,model,HumanAIzers-1,,0.895,0.950,0.952,0.980,0.756,"[[0, 0], [1, 21]]"
pan25-generative-ai-detection-eloquent-20250605-test,hello-world,tart-objective,2025-06-05-12-11-02,model,HumanAIzers-2,,0.967,1.000,1.000,1.000,0.793,"[[0, 0], [0, 21]]"
pan25-generative-ai-detection-eloquent-20250605-test,hello-world,tart-objective,2025-06-05-12-11-02,model,HumanAIzers-3,,0.898,0.855,0.900,0.957,0.722,"[[0, 0], [3, 19]]"
...,...,...,...,...,...,...,...,...,...,...,...,...
pan25-generative-ai-detection-eloquent-20250610-test,cnlp-nits-pp,tomato-conduction,2025-06-11-00-11-42,model,adhd,,0.773,0.773,0.872,0.944,0.672,"[[0, 0], [5, 17]]"
pan25-generative-ai-detection-eloquent-20250610-test,nexus-interrogators,sensitive-liason,2025-06-11-00-35-49,model,adhd,,1.000,1.000,1.000,1.000,0.800,"[[0, 0], [0, 22]]"
pan25-generative-ai-detection-eloquent-20250610-test,bohan-li,distinct-dachshund,2025-06-11-00-10-49,model,adhd,,0.784,0.773,0.872,0.944,0.675,"[[0, 0], [5, 17]]"
pan25-generative-ai-detection-eloquent-20250610-test,ds-gt-pan,flavorful-concourse,2025-06-11-00-28-29,model,adhd,,0.091,0.091,0.167,0.333,0.136,"[[0, 0], [20, 2]]"


In [6]:
# Average every numeric metric per eval_value over the 'model' rows and
# rank the groups by their 'mean' column, highest first.
df_eval_eloquent_mean = (
    df_eval_eloquent[df_eval_eloquent['eval_field'].eq('model')]
    .groupby('eval_value')
    .mean(numeric_only=True)
    .sort_values(by='mean', ascending=False)
)
df_eval_eloquent_mean.to_csv('../data/tira/eloquent-stats-mean.csv')
df_eval_eloquent_mean

Unnamed: 0_level_0,roc-auc,brier,c@1,f1,f05u,mean
eval_value,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
HumanAIzers-2,,0.84512,0.81436,0.85888,0.904,0.68452
Moa,,0.78138,0.7496,0.8152,0.89176,0.64756
TeamUTK,,0.78814,0.74118,0.80166,0.87424,0.64104
TeamUTK_bis,,0.77414,0.7336,0.80422,0.88384,0.63916
HumanAIzers-3,,0.78072,0.73082,0.79156,0.86042,0.6328
PJs-team-v1,,0.72224,0.6656,0.75438,0.851,0.59864
adhd,,0.68386,0.64232,0.72598,0.8228,0.57488
HumanAIzers-1,,0.59754,0.54884,0.59466,0.6545,0.47912
Homoglyph,,0.59754,0.54884,0.59466,0.6545,0.47912
JUNLP_SS,,0.51444,0.43666,0.53274,0.668,0.43034


In [11]:
# Restrict the raw predictions to the ELOQUENT datasets and export them.
df_pred_raw_eloquent = df_pred_raw[
    df_pred_raw.index.get_level_values('dataset').isin(eloquent_ds_names)
]
df_pred_raw_eloquent.to_csv('../data/tira/eloquent-pred-raw.csv')
df_pred_raw_eloquent

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,label
dataset,team,software,run_id,id,Unnamed: 5_level_1
pan25-generative-ai-detection-eloquent-20250605-test,advacheck,watery-bag,2025-06-05-11-20-09,eloquent25/humanaizers-0/047,0.0000
pan25-generative-ai-detection-eloquent-20250605-test,advacheck,watery-bag,2025-06-05-11-20-09,eloquent25/junlp_ss/033,1.0000
pan25-generative-ai-detection-eloquent-20250605-test,advacheck,watery-bag,2025-06-05-11-20-09,eloquent25/pjs-team-v2/031,0.0000
pan25-generative-ai-detection-eloquent-20250605-test,advacheck,watery-bag,2025-06-05-11-20-09,eloquent25/humanaizers-2/031,1.0000
pan25-generative-ai-detection-eloquent-20250605-test,advacheck,watery-bag,2025-06-05-11-20-09,eloquent25/moa/036,1.0000
...,...,...,...,...,...
pan25-generative-ai-detection-eloquent-20250610-test,iimasnlp,isg-graph-v3,2025-06-11-12-48-56,eloquent25/adhd/037,1.0000
pan25-generative-ai-detection-eloquent-20250610-test,iimasnlp,isg-graph-v3,2025-06-11-12-48-56,eloquent25/adhd/038,0.9016
pan25-generative-ai-detection-eloquent-20250610-test,iimasnlp,isg-graph-v3,2025-06-11-12-48-56,eloquent25/adhd/030,1.0000
pan25-generative-ai-detection-eloquent-20250610-test,iimasnlp,isg-graph-v3,2025-06-11-12-48-56,eloquent25/adhd/033,0.9914
