# Evaluate the generalisation ability of RoBERTa on the DetectRL dataset

General info: This notebook aggregates the training results. The training itself is done in the notebook 'T02'.

0. Setup and data analysis
     0. Setup and imports
     1. Dataset analysis:
        - closer look at DetectRL data folder "/Benchmark/Benchmark_Data/Tasks"
            --> the contents of the folders Task1 and Task2 are completely identical
        - short analysis of the label, domain and attack distribution for the Task1 folder
    
1. Analyse the generalisation performance of models trained on the regex-cleaned data compared to models trained on the uncleaned data
    - the models are trained on a single-LLM, single-domain dataset and evaluated on the other LLMs in that domain --> the goal is to analyse the generalisation performance across multiple LLMs; we especially want to compare the models trained on uncleaned data with those trained on cleaned data to identify the LLMs and domains where the cleaning had the highest (positive or negative) impact on the generalisation performance
    - Results: 
        -  Overall, there are only minor differences between the regex-cleaned data and the uncleaned data
        -  Max. absolute difference: a change of around 0.07 in F1-score

# 0. Setup

## 0.0 imports

In [1]:
import os
import sys
import logging
import pandas as pd
from tqdm import tqdm
import json
from sklearn.model_selection import train_test_split

BASE_DIR = "../../"
sys.path.append(BASE_DIR)


from src.general_functions_and_patterns_for_detection import (
    file_hash,
    get_info_based_on_input_path,
    load_dataframe_from_json,
    TrainRobertaHelper,
    RESULT_DIR, CLEANED_FILES_DIR, ORIGINAL_DATA_DIR, 
    DETECT_RL_DIR, TASK_DIR,
    LLMs, DOMAINS, COLUMNS_DIRECTLY_LLM_GENERATED_DETECT_RL as LLM_PROMPTS,
)

prepare_df_for_roberta_training = TrainRobertaHelper.prepare_df_for_roberta_training

sys.path.append(os.path.join(DETECT_RL_DIR, "Detectors"))
import DetectRL.Detectors.train_roberta as train_roberta

# Notebook-wide configuration.
# NOTE(review): DEBUG, DRY_RUN and ALL_DATA are not referenced in the visible cells of this
# notebook — presumably carried over from the training notebook; confirm before removing.
DEBUG = True
DRY_RUN = False
ALL_DATA = True
# Random state passed to train_test_split when the test split is created in section 1.1.
SEED = 2023
# Only WARNING and above are logged; each record is prefixed with timestamp and level.
logging.basicConfig(level=logging.WARNING, format="%(asctime)s %(levelname)s %(message)s")

2025-09-06 14:28:26.857443: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-09-06 14:28:26.875544: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1757168906.897090 1447122 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1757168906.903043 1447122 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1757168906.919281 1447122 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

In [2]:
# Folders with the Task1 / Task2 JSON files below TASK_DIR (trailing slash included).
task1_path = f"{TASK_DIR}/Task1/"
task2_path = f"{TASK_DIR}/Task2/"

## 0.1 Take a look at the different paths

In [3]:
task1_dir = task1_path
task2_dir = task2_path

# Compare every Task2 JSON file against the file of the same name in Task1 and count
# how many rows are identical / different between the two folders.
unequal_row_frames = []  # differing rows per file; concatenated once after the loop
equal_count = 0
unequal_count = 0

for filename in os.listdir(task2_dir):
    if not filename.endswith(".json"):
        continue

    path2 = os.path.join(task2_dir, filename)
    path1 = os.path.join(task1_dir, filename)

    if not os.path.exists(path1):
        print(f"File {filename} not found in Task1. Skipping.")
        continue

    if file_hash(path1) == file_hash(path2):
        # Identical file hashes: every row counts as equal, only the row count is needed.
        equal_count += len(pd.read_json(path2, encoding="utf-8"))
        continue

    print(path1, "and", path2, "are not the same by hash, checking for the rows")
    df1 = pd.read_json(path1, encoding="utf-8")
    df2 = pd.read_json(path2, encoding="utf-8")

    # Restrict both frames to their shared columns so the comparison is column-aligned.
    common_columns = df1.columns.intersection(df2.columns)
    df1 = df1[common_columns]
    df2 = df2[common_columns]

    if df1.equals(df2):
        equal_count += len(df1)
        continue

    # Row-wise comparison, evaluated on the rows of df2 (Task2).
    # Align Task1 to the Task2 index first so the mask always fits df2, even when the
    # two files differ in length.
    df1_aligned = df1.reindex(df2.index)
    # eq() alone reports NaN != NaN; treat a value missing on both sides as equal so the
    # row-wise result agrees with DataFrame.equals() used above.
    both_missing = df1_aligned.isna() & df2.isna()
    comparison = df1_aligned.eq(df2) | both_missing
    # Rows that exist only in Task2 can never be equal to a Task1 row.
    row_equality = comparison.all(axis=1) & df2.index.isin(df1.index)

    unequal_rows = df2[~row_equality]
    equal_count += int(row_equality.sum())
    unequal_count += len(unequal_rows)
    unequal_row_frames.append(unequal_rows)

# Build the result frame with a single concat instead of growing it inside the loop.
unequal_rows_df = (
    pd.concat(unequal_row_frames, ignore_index=True) if unequal_row_frames else pd.DataFrame()
)

print(f"Equal rows: {equal_count}")
print(f"Unequal rows: {unequal_count}")

Equal rows: 349165
Unequal rows: 0


In [4]:
# Load the Task1 train/test split of one LLM to inspect its columns and distributions.
# Loaded directly instead of iterating over LLMs until "ChatGPT" is reached, which read
# the files of every LLM listed before it only to discard them again.
llm = "ChatGPT"
train_df = load_dataframe_from_json(os.path.join(task1_path, f"multi_llms_{llm}_train.json"))
test_df = load_dataframe_from_json(os.path.join(task1_path, f"multi_llms_{llm}_test.json"))

print(train_df.columns, test_df.columns)

Index(['text', 'label', 'data_type', 'llm_type'], dtype='object') Index(['text', 'label', 'data_type', 'llm_type'], dtype='object')


In [6]:
# Class balance of the loaded training split (row count per label).
train_df[["label"]].value_counts()

label
llm      24187
human     1800
Name: count, dtype: int64

In [7]:
# Distribution of the data_type values among the human-labelled training rows.
is_human = train_df["label"] == "human"
human_df_1 = train_df[is_human]
human_df_1["data_type"].value_counts()

data_type
abstract    450
document    450
story       450
content     450
Name: count, dtype: int64

In [8]:
# Row count per (data_type, label) combination in the training split.
train_df[["data_type", "label"]].value_counts()

data_type                        label
adversarial_character_llm        llm      2688
adversarial_word_llm             llm      2688
adversarial_character_word_llm   llm      2688
paraphrase_back_translation_llm  llm      2688
direct_prompt                    llm      2688
prompt_few_shot                  llm      2688
paraphrase_polish_llm            llm      2688
prompt_SICO                      llm      2688
paraphrase_dipper_llm            llm      2683
content                          human     450
abstract                         human     450
document                         human     450
story                            human     450
Name: count, dtype: int64

In [9]:
# Row count per (data_type, label, llm_type) combination in the test split.
test_df[["data_type", "label", "llm_type"]].value_counts()


data_type                        label  llm_type
abstract                         human  ChatGPT     250
content                          human  ChatGPT     250
document                         human  ChatGPT     250
story                            human  ChatGPT     250
adversarial_character_llm        llm    ChatGPT     112
adversarial_word_llm             llm    ChatGPT     112
adversarial_character_word_llm   llm    ChatGPT     112
paraphrase_back_translation_llm  llm    ChatGPT     112
direct_prompt                    llm    ChatGPT     112
paraphrase_dipper_llm            llm    ChatGPT     112
paraphrase_polish_llm            llm    ChatGPT     112
prompt_SICO                      llm    ChatGPT     112
prompt_few_shot                  llm    ChatGPT     112
Name: count, dtype: int64

In [10]:
# Load the Task2 train/test split of one LLM for the same inspection as done for Task1.
# Loaded directly instead of iterating over LLMs until "ChatGPT" is reached, which read
# the files of every LLM listed before it only to discard them again.
llm = "ChatGPT"
train_df = load_dataframe_from_json(os.path.join(task2_path, f"multi_llms_{llm}_train.json"))
test_df = load_dataframe_from_json(os.path.join(task2_path, f"multi_llms_{llm}_test.json"))

# Row count per (data_type, label, llm_type) combination in the training split.
train_df[["data_type", "label", "llm_type"]].value_counts()

data_type                        label  llm_type
adversarial_character_llm        llm    ChatGPT     2688
adversarial_word_llm             llm    ChatGPT     2688
adversarial_character_word_llm   llm    ChatGPT     2688
paraphrase_back_translation_llm  llm    ChatGPT     2688
direct_prompt                    llm    ChatGPT     2688
prompt_few_shot                  llm    ChatGPT     2688
paraphrase_polish_llm            llm    ChatGPT     2688
prompt_SICO                      llm    ChatGPT     2688
paraphrase_dipper_llm            llm    ChatGPT     2683
content                          human  ChatGPT      450
abstract                         human  ChatGPT      450
document                         human  ChatGPT      450
story                            human  ChatGPT      450
Name: count, dtype: int64

In [11]:
# Label balance of the test split, followed by the breakdown per
# (data_type, label, llm_type) combination.
print(test_df["label"].value_counts())
test_df[["data_type", "label", "llm_type"]].value_counts()


label
llm      1008
human    1000
Name: count, dtype: int64


data_type                        label  llm_type
abstract                         human  ChatGPT     250
content                          human  ChatGPT     250
document                         human  ChatGPT     250
story                            human  ChatGPT     250
adversarial_character_llm        llm    ChatGPT     112
adversarial_word_llm             llm    ChatGPT     112
adversarial_character_word_llm   llm    ChatGPT     112
paraphrase_back_translation_llm  llm    ChatGPT     112
direct_prompt                    llm    ChatGPT     112
paraphrase_dipper_llm            llm    ChatGPT     112
paraphrase_polish_llm            llm    ChatGPT     112
prompt_SICO                      llm    ChatGPT     112
prompt_few_shot                  llm    ChatGPT     112
Name: count, dtype: int64

# 1. Training

## 1.0 General functions

## 1.1 Aggregate Training Results

In [5]:
# Sanity check: print every (LLM, domain, prompt) combination whose prepared training
# subset has the same hash for the original and the regex-cleaned data, i.e. where the
# cleaning did not change the data at all.
for _domain in DOMAINS:
    # The input files and the column keys depend on the domain only, so they are read
    # once per domain instead of once per (LLM, prompt) combination.
    domain_df_original = load_dataframe_from_json(f"{ORIGINAL_DATA_DIR}{_domain}_2800.json")
    domain_df_cleaned = pd.read_parquet(f"{CLEANED_FILES_DIR}{_domain}_2800_cleaned_all_v2.parquet")
    _, prompt_key, human_key = get_info_based_on_input_path(_domain)

    for _llm in LLMs:
        for prompt in LLM_PROMPTS:
            # Copies are handed over in case the helper modifies its input in place;
            # the cached domain frames must stay untouched for the next iteration.
            prepared_original = prepare_df_for_roberta_training(
                domain_df_original.copy(),
                column_to_be_used_for_text=prompt,
                column_to_be_used_for_human=human_key,
                column_title=prompt_key,
            )
            prepared_cleaned = prepare_df_for_roberta_training(
                domain_df_cleaned.copy(),
                column_to_be_used_for_text=prompt,
                column_to_be_used_for_human=human_key,
                column_title=prompt_key,
            )

            hash_original = train_roberta.hash_dataframe_as_parquet(
                prepared_original[prepared_original["llm_type"] == _llm]
            )
            hash_cleaned = train_roberta.hash_dataframe_as_parquet(
                prepared_cleaned[prepared_cleaned["llm_type"] == _llm]
            )

            if hash_cleaned == hash_original:
                print(_llm, _domain, prompt)

ChatGPT xsum paraphrase_polish_human


In [14]:
# Folder with the stored training results; replaces the RESULT_DIR imported from the
# project module. Set the TRAINING_RESULT_DIR environment variable to point the notebook
# at another location without editing this cell; the previous path stays the default.
RESULT_DIR = os.environ.get("TRAINING_RESULT_DIR", "/mnt/hdd-baracuda/pdingfelder/Training_Results")

In [15]:
# Collect the stored evaluation results of all training runs.
# For every (domain, training LLM, prompt, cleaned/original) combination the evaluation
# frames are rebuilt, hashed, and the result JSON named after that hash is looked up in
# RESULT_DIR. Found results are appended to result_list; missing ones are only counted.
data_paths = [CLEANED_FILES_DIR, ORIGINAL_DATA_DIR]
result_list = []
count_runs_done = 0
count_runs_missing = 0

for _domain in tqdm(DOMAINS):
    # Input files and column keys depend on the domain only: read each data variant once
    # per domain instead of once per (LLM, prompt, variant) combination.
    _, prompt_key, human_key = get_info_based_on_input_path(_domain)
    raw_frames = []
    for _train_path in data_paths:
        # The same test decides which file is loaded and which "cleaned" flag is stored.
        is_cleaned = _train_path == CLEANED_FILES_DIR
        if is_cleaned:
            raw_df = pd.read_parquet(f"{CLEANED_FILES_DIR}{_domain}_2800_cleaned_all_v2.parquet")
        else:
            raw_df = load_dataframe_from_json(f"{ORIGINAL_DATA_DIR}{_domain}_2800.json")
        raw_frames.append((is_cleaned, raw_df))

    for _llm in LLMs:
        other_llms = [name for name in LLMs if name != _llm]
        for prompt in LLM_PROMPTS:
            save_model_path = f"{RESULT_DIR}/{_llm}_{prompt}_test"
            for is_cleaned, raw_df in raw_frames:
                # A copy is handed over in case the helper modifies its input in place.
                training_df = prepare_df_for_roberta_training(
                    raw_df.copy(),
                    column_to_be_used_for_text=prompt,
                    column_to_be_used_for_human=human_key,
                    column_title=prompt_key,
                )

                # Rows of the training LLM; only the 20 % test part of the split is
                # looked up. No dropna here, unlike for the other LLMs below.
                # NOTE(review): presumably this mirrors the training notebook exactly so
                # that the frame hashes match the stored file names — confirm.
                own_llm_df = training_df[training_df["llm_type"] == _llm]
                _train_split, test_split = train_test_split(
                    own_llm_df, test_size=0.2, random_state=SEED, shuffle=True
                )

                # Evaluation frames: the held-out split of the training LLM first, then
                # the complete data of every other LLM (works for any number of LLMs).
                eval_frames = {_llm: test_split}
                for other_llm in other_llms:
                    eval_frames[other_llm] = training_df[
                        training_df["llm_type"] == other_llm
                    ].dropna(subset=["label", "text"])

                for test_llm, eval_df in eval_frames.items():
                    df_hash = train_roberta.hash_dataframe_as_parquet(eval_df)
                    result_path = f"{save_model_path}/{df_hash}.roberta-base_result.json"
                    if not os.path.exists(result_path):
                        count_runs_missing += 1
                        continue

                    # The with-block closes the file; no explicit close() needed.
                    with open(result_path, encoding="utf-8") as fp:
                        result = json.load(fp)
                    result["training_llm"] = _llm
                    result["test_llm"] = test_llm
                    result["hash_df"] = df_hash
                    result["domain"] = _domain
                    result["cleaned"] = is_cleaned
                    result["llm_prompt"] = prompt
                    result_list.append(result)
                    count_runs_done += 1
print(count_runs_done, count_runs_missing)

100%|██████████| 4/4 [01:20<00:00, 20.01s/it]


In [None]:
# One row per stored result, ordered by ascending roc_auc.
df_results = pd.DataFrame(result_list).sort_values(["roc_auc"])

# Preview of the 20 lowest-roc_auc rows without the bulky helper columns.
columns_hidden_in_preview = ["hash_df", "optimal_threshold", "conf_matrix"]
df_results.drop(columns=columns_hidden_in_preview).reset_index(drop=True).head(20)

In [27]:
# Print the prompt_SICO texts of the first 20 rows of the uncleaned writing_prompt data.
# NOTE(review): no explicit llm_type filter is applied here; whether filter_llm=True
# restricts the rows to Claude-instant (as the variable name suggests) is not visible
# in this notebook — confirm.
claude_writing_original = load_dataframe_from_json(
    f"{ORIGINAL_DATA_DIR}writing_prompt_2800.json", filter_llm=True
)
for sico_text in claude_writing_original["prompt_SICO"].head(20):
    print("\n", sico_text)


 Here is a 22 sentence story in a more human writing style based on the prompt:It was a quiet autumn evening when Death realized something was stirring within her cosmic being. As her spectral hands wandered over her robed figure, she felt an unexpected warmth in her core.  Puzzled, she retired to her gloomy realm for deliberations. Through the long night, mysterious flutters arose within the void of her soul. Come dawn, Death ventured to the lands of the living to seek counsel. Among the bereaved arranging funerals scheduled for dreary weeks ahead, Life spotted Death's troubled gaze."What ails you my old friend?" inquired Life with sincere concern. Death grasped Life's vibrant hands and confided her strange discovery through tears of netherworld mist.  Stunned into rare silence, Life processed this unthinkable notion. "But how can it be?" wondered Life aloud. Death had no answers to the question echoing in her hollow bones.Together they sought the wisdom of Time in his endless halls.

In [26]:
# Print the prompt_SICO texts of the first 20 Claude-instant rows of the cleaned
# writing_prompt data, for a side-by-side look at what the cleaning removed.
_df = pd.read_parquet(f"{CLEANED_FILES_DIR}writing_prompt_2800_cleaned_all_v2.parquet")
claude_writing = _df.loc[_df["llm_type"] == "Claude-instant"]
for sico_text in claude_writing["prompt_SICO"].head(20):
    print("\n", sico_text)


 It was a quiet autumn evening when Death realized something was stirring within her cosmic being. As her spectral hands wandered over her robed figure, she felt an unexpected warmth in her core.  Puzzled, she retired to her gloomy realm for deliberations. Through the long night, mysterious flutters arose within the void of her soul. Come dawn, Death ventured to the lands of the living to seek counsel. Among the bereaved arranging funerals scheduled for dreary weeks ahead, Life spotted Death's troubled gaze."What ails you my old friend?" inquired Life with sincere concern. Death grasped Life's vibrant hands and confided her strange discovery through tears of netherworld mist.  Stunned into rare silence, Life processed this unthinkable notion. "But how can it be?" wondered Life aloud. Death had no answers to the question echoing in her hollow bones.Together they sought the wisdom of Time in his endless halls. Upon hearing their troubling news, even the ancient clockmaker was left perpl

In [28]:
# All aggregated runs in ascending order of tpr_at_fpr_0_01 (weakest runs first).
df_results.sort_values(["tpr_at_fpr_0_01"])

Unnamed: 0,roc_auc,optimal_threshold,conf_matrix,precision,recall,f1,accuracy,tpr_at_fpr_0_01,training_llm,test_llm,hash_df,domain,cleaned,llm_prompt
193,0.927236,-0.503357,"[[611, 89], [116, 583]]",0.867560,0.834049,0.850474,0.853467,0.021459,Claude-instant,Llama-2-70b,a2bf050f3dd9c70d5ddfc7e19bf42c5d97120a69cca947...,writing_prompt,True,prompt_SICO
185,0.940752,-0.510063,"[[612, 88], [73, 626]]",0.876751,0.895565,0.886058,0.884918,0.032904,Claude-instant,Llama-2-70b,44f3aae5e8f0f3cee831bb0714df8c95063d650f1677e9...,writing_prompt,True,prompt_few_shot
174,0.968916,-0.504111,"[[645, 55], [52, 648]]",0.921764,0.925714,0.923735,0.923571,0.037143,Claude-instant,Google-PaLM,d1bd8e8b775a140ad7d5034375839a9487f01eadc37803...,writing_prompt,False,paraphrase_polish_human
249,0.962028,-0.545794,"[[618, 82], [46, 593]]",0.878519,0.928013,0.902588,0.904406,0.046948,Google-PaLM,Claude-instant,0c1376403e18a9d03b494872e08334ca8272bd226340a9...,writing_prompt,True,paraphrase_polish_human
250,0.966120,-0.540817,"[[630, 70], [52, 645]]",0.902098,0.925395,0.913598,0.912670,0.073171,Google-PaLM,Llama-2-70b,2396f3c025f2f16bc686b250b64c9c6cef9fff1f6e08bc...,writing_prompt,True,paraphrase_polish_human
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
336,1.000000,-0.610236,"[[131, 0], [0, 147]]",1.000000,1.000000,1.000000,1.000000,1.000000,Claude-instant,Claude-instant,376a85ceb35b82317685de3a0f9a763103fdd448bd1ed8...,xsum,True,paraphrase_polish_llm
340,1.000000,-0.575462,"[[133, 0], [0, 147]]",1.000000,1.000000,1.000000,1.000000,1.000000,Claude-instant,Claude-instant,52713a321afe02f4ac79a7fb4506fb887f67ab27993894...,xsum,False,paraphrase_polish_llm
103,1.000000,-0.482712,"[[700, 0], [0, 700]]",1.000000,1.000000,1.000000,1.000000,1.000000,Google-PaLM,ChatGPT,4aac458ad644a4130b558c4fd1a6a992f2fec3f908f203...,arxiv,False,paraphrase_polish_llm
348,1.000000,-0.518056,"[[133, 0], [0, 147]]",1.000000,1.000000,1.000000,1.000000,1.000000,Claude-instant,Claude-instant,b2bc9bb292a110b9296c558ef60845d23a844dc3eed769...,xsum,False,prompt_few_shot


In [10]:
# Results without the runs whose prompt column is "paraphrase_polish_human".
is_polish_human = df_results["llm_prompt"] == "paraphrase_polish_human"
df_results_wo_human = df_results[~is_polish_human]
df_results_wo_human.head()

Unnamed: 0,roc_auc,optimal_threshold,conf_matrix,precision,recall,f1,accuracy,tpr_at_fpr_0_01,training_llm,test_llm,hash_df,domain,cleaned,llm_prompt
0,1.0,-0.492397,"[[133, 0], [0, 147]]",1.0,1.0,1.0,1.0,1.0,Claude-instant,Claude-instant,2e20b8e1716c086366ea1da5303ecf95d4c8c9f7a02e78...,arxiv,True,direct_prompt
1,1.0,-0.505136,"[[700, 0], [0, 700]]",1.0,1.0,1.0,1.0,1.0,Claude-instant,Llama-2-70b,85ab1b630691f37a4f385717a62b378df41ca5dc28f522...,arxiv,True,direct_prompt
2,0.99936,-0.515875,"[[700, 0], [4, 692]]",1.0,0.994253,0.997118,0.997135,0.994253,Claude-instant,Google-PaLM,eb6a9e1b095764a39d9d4aec157fb6525efe17a616d96a...,arxiv,True,direct_prompt
3,1.0,-0.488934,"[[700, 0], [0, 700]]",1.0,1.0,1.0,1.0,1.0,Claude-instant,ChatGPT,3faa71ffb2522bb968592762bbc61e227d95177f37a732...,arxiv,True,direct_prompt
4,1.0,-0.501424,"[[133, 0], [0, 147]]",1.0,1.0,1.0,1.0,1.0,Claude-instant,Claude-instant,446dbd8bd3c9581ea057b9363e3e557ca4e448fb3cb999...,arxiv,False,direct_prompt


In [29]:
# Share of expected runs for which a result file was found. Guarded so the summary
# still prints (with nan) when no run was counted at all instead of raising
# ZeroDivisionError.
total_runs = count_runs_done + count_runs_missing
share_done = count_runs_done / total_runs if total_runs else float("nan")
print(f"Runs executed: {count_runs_done}, missing: {count_runs_missing}, {share_done}")

Runs executed: 441, missing: 199, 0.6890625


In [12]:
# Domains for which at least one result file was found.
df_results["domain"].unique()

array(['arxiv', 'writing_prompt', 'xsum'], dtype=object)

In [13]:
# All xsum / prompt_few_shot results, ordered by training LLM and test LLM.
is_xsum = df_results["domain"] == "xsum"
is_few_shot = df_results["llm_prompt"] == "prompt_few_shot"
df_results[is_xsum & is_few_shot].sort_values(["training_llm", "test_llm"])

Unnamed: 0,roc_auc,optimal_threshold,conf_matrix,precision,recall,f1,accuracy,tpr_at_fpr_0_01,training_llm,test_llm,hash_df,domain,cleaned,llm_prompt
433,1.0,-0.513668,"[[133, 0], [0, 147]]",1.0,1.0,1.0,1.0,1.0,ChatGPT,ChatGPT,ba91d0773ef08661745bf343c5c6f4f4608faa48fc7521...,xsum,True,prompt_few_shot
434,0.999926,-0.577739,"[[698, 2], [1, 671]]",0.997028,0.998512,0.99777,0.997813,0.96131,ChatGPT,Claude-instant,592cfa0aa166f1c4ad4c35b913009774bddd8d65758421...,xsum,True,prompt_few_shot
436,0.922886,-0.660083,"[[657, 43], [93, 455]]",0.913655,0.830292,0.869981,0.891026,0.54927,ChatGPT,Google-PaLM,0b042bfd8735b78967638630404a5415dfd2ab9d1422df...,xsum,True,prompt_few_shot
435,0.997618,-0.60482,"[[689, 11], [10, 690]]",0.984308,0.985714,0.985011,0.985,0.857143,ChatGPT,Llama-2-70b,1c3e99e696c1b014d3cc624638e95fe3789d2a57755499...,xsum,True,prompt_few_shot
347,0.998916,-0.663829,"[[695, 5], [6, 694]]",0.992847,0.991429,0.992137,0.992143,0.614286,Claude-instant,ChatGPT,96cdf9ce21e5202d5020711865821a860976c0e175ee6a...,xsum,True,prompt_few_shot
351,0.999092,-0.619488,"[[692, 8], [5, 695]]",0.98862,0.992857,0.990734,0.990714,0.607143,Claude-instant,ChatGPT,db019cdb3f2c58d0fbc4dc5026d2427a4f686af132cd00...,xsum,False,prompt_few_shot
344,1.0,-0.537093,"[[135, 0], [0, 140]]",1.0,1.0,1.0,1.0,1.0,Claude-instant,Claude-instant,b6c55736280df67cbf9e948ca514b508d79cafdf62f0db...,xsum,True,prompt_few_shot
348,1.0,-0.518056,"[[133, 0], [0, 147]]",1.0,1.0,1.0,1.0,1.0,Claude-instant,Claude-instant,b2bc9bb292a110b9296c558ef60845d23a844dc3eed769...,xsum,False,prompt_few_shot
346,0.926449,-0.723472,"[[670, 30], [86, 462]]",0.939024,0.843066,0.888462,0.907051,0.653285,Claude-instant,Google-PaLM,0b042bfd8735b78967638630404a5415dfd2ab9d1422df...,xsum,True,prompt_few_shot
350,0.930718,-0.67036,"[[664, 36], [102, 598]]",0.943218,0.854286,0.896552,0.901429,0.674286,Claude-instant,Google-PaLM,c9909605e35c21164f508d02da26267467ef61c8678bda...,xsum,False,prompt_few_shot


In [14]:
# Mean of every numeric metric for the cleaned (True) vs. the original (False) runs.
df_results.groupby("cleaned", as_index=False).mean(numeric_only=True)

Unnamed: 0,cleaned,roc_auc,optimal_threshold,precision,recall,f1,accuracy,tpr_at_fpr_0_01
0,False,0.990681,-0.515216,0.978927,0.973146,0.975622,0.976119,0.824088
1,True,0.989848,-0.527514,0.976161,0.971347,0.973392,0.975501,0.834621


In [31]:
# Mean metrics per domain, ordered by ascending roc_auc. The mean of the boolean
# `cleaned` column is the share of cleaned runs among the results of that domain.
df_results.groupby("domain", as_index=False).mean(numeric_only=True).sort_values(by="roc_auc").reset_index(drop=True)

Unnamed: 0,domain,roc_auc,optimal_threshold,precision,recall,f1,accuracy,tpr_at_fpr_0_01,cleaned
0,writing_prompt,0.979544,-0.491796,0.948902,0.943296,0.945194,0.947664,0.608984,0.5
1,xsum,0.991576,-0.598557,0.985945,0.975025,0.980277,0.98202,0.900628,0.661157
2,arxiv,0.999891,-0.494057,0.999496,0.998876,0.999184,0.999185,0.997107,0.5


In [15]:
# Same cleaned vs. original comparison, restricted to the rows of df_results_wo_human.
df_results_wo_human.groupby("cleaned", as_index=False).mean(numeric_only=True)

Unnamed: 0,cleaned,roc_auc,optimal_threshold,precision,recall,f1,accuracy,tpr_at_fpr_0_01
0,False,0.990523,-0.514943,0.980333,0.971965,0.975664,0.976393,0.845153
1,True,0.990286,-0.526181,0.978575,0.971389,0.974583,0.976715,0.850313


In [16]:
# Runs evaluated on Claude-instant, ordered by ascending f1, without the hash and
# threshold columns.
tested_on_claude = df_results_wo_human["test_llm"] == "Claude-instant"
(
    df_results_wo_human[tested_on_claude]
    .sort_values(by="f1")
    .drop(columns=["hash_df", "optimal_threshold"])
)

Unnamed: 0,roc_auc,conf_matrix,precision,recall,f1,accuracy,tpr_at_fpr_0_01,training_llm,test_llm,domain,cleaned,llm_prompt
273,0.749314,"[[632, 68], [343, 340]]",0.833333,0.497804,0.623281,0.702820,0.273792,Google-PaLM,Claude-instant,writing_prompt,True,prompt_SICO
277,0.755400,"[[635, 65], [347, 353]]",0.844498,0.504286,0.631485,0.705714,0.277143,Google-PaLM,Claude-instant,writing_prompt,False,prompt_SICO
317,0.863724,"[[608, 92], [226, 474]]",0.837456,0.677143,0.748815,0.772857,0.268571,ChatGPT,Claude-instant,writing_prompt,False,prompt_SICO
313,0.890398,"[[598, 102], [170, 513]]",0.834146,0.751098,0.790447,0.803326,0.278184,ChatGPT,Claude-instant,writing_prompt,True,prompt_SICO
233,0.898875,"[[631, 69], [171, 512]]",0.881239,0.749634,0.810127,0.826464,0.360176,Llama-2-70b,Claude-instant,writing_prompt,True,prompt_SICO
...,...,...,...,...,...,...,...,...,...,...,...,...
340,1.000000,"[[133, 0], [0, 147]]",1.000000,1.000000,1.000000,1.000000,1.000000,Claude-instant,Claude-instant,xsum,False,paraphrase_polish_llm
344,1.000000,"[[135, 0], [0, 140]]",1.000000,1.000000,1.000000,1.000000,1.000000,Claude-instant,Claude-instant,xsum,True,prompt_few_shot
348,1.000000,"[[133, 0], [0, 147]]",1.000000,1.000000,1.000000,1.000000,1.000000,Claude-instant,Claude-instant,xsum,False,prompt_few_shot
352,1.000000,"[[139, 0], [0, 136]]",1.000000,1.000000,1.000000,1.000000,1.000000,Claude-instant,Claude-instant,xsum,True,prompt_SICO


In [17]:
# Mean metrics of the runs evaluated on Claude-instant, grouped by the training LLM.
df_results_wo_human[df_results_wo_human.test_llm=="Claude-instant"].groupby(["training_llm"], as_index=False).mean(numeric_only=True)

Unnamed: 0,training_llm,roc_auc,optimal_threshold,precision,recall,f1,accuracy,tpr_at_fpr_0_01,cleaned
0,ChatGPT,0.981837,-0.527498,0.965111,0.949738,0.956799,0.959059,0.731664,0.6
1,Claude-instant,0.992715,-0.513124,0.982632,0.981611,0.981882,0.981762,0.882144,0.5
2,Google-PaLM,0.965214,-0.531573,0.953049,0.922592,0.933388,0.941189,0.766582,0.6
3,Llama-2-70b,0.990267,-0.542659,0.980866,0.973065,0.976502,0.977791,0.874352,0.5


In [36]:
# Average the numeric metrics over all test LLMs for each
# (training LLM, domain, cleaned, prompt) configuration.
grouped = (
    df_results
    .groupby(["training_llm", "domain", "cleaned", "llm_prompt"], as_index=False)
    .mean(numeric_only=True)
)

# Ascending sort: configurations with the lowest tpr_at_fpr_0_01 come first.
grouped.sort_values("tpr_at_fpr_0_01")

Unnamed: 0,training_llm,domain,cleaned,llm_prompt,roc_auc,optimal_threshold,precision,recall,f1,accuracy,tpr_at_fpr_0_01
45,Claude-instant,writing_prompt,True,prompt_SICO,0.943680,-0.503004,0.884387,0.891900,0.887480,0.885770,0.207707
73,Google-PaLM,writing_prompt,True,paraphrase_polish_human,0.966188,-0.541437,0.909680,0.921425,0.915198,0.922493,0.228651
46,Claude-instant,writing_prompt,True,prompt_few_shot,0.972884,-0.503488,0.902330,0.933553,0.917571,0.930323,0.247373
68,Google-PaLM,writing_prompt,False,paraphrase_polish_human,0.970353,-0.516351,0.923675,0.928061,0.925093,0.923929,0.250085
38,Claude-instant,writing_prompt,False,paraphrase_polish_human,0.973160,-0.505484,0.914952,0.938639,0.926604,0.925000,0.290782
...,...,...,...,...,...,...,...,...,...,...,...
59,Google-PaLM,arxiv,False,paraphrase_polish_llm,0.999999,-0.493948,1.000000,0.999643,0.999821,0.999821,0.999643
62,Google-PaLM,arxiv,True,direct_prompt,0.999999,-0.499061,1.000000,0.999643,0.999821,0.999821,0.999643
21,ChatGPT,xsum,False,paraphrase_polish_llm,1.000000,-0.393586,1.000000,1.000000,1.000000,1.000000,1.000000
8,ChatGPT,arxiv,True,prompt_SICO,1.000000,-0.475486,1.000000,1.000000,1.000000,1.000000,1.000000


In [37]:
# Number of aggregated results per variant (cleaned True / False). A mismatch between
# the two counts indicates that some results exist for only one of the variants.
df_results.cleaned.value_counts()

cleaned
True     240
False    201
Name: count, dtype: int64

In [38]:
# Keep only the combinations that have a result for BOTH variants (cleaned and
# original), so the comparison of the means below is based on matched pairs.
COLUMNS = ["training_llm", "domain", "llm_prompt", "test_llm"]

# Number of distinct `cleaned` values within each combination, broadcast back to the rows.
cleaned_variants_per_row = df_results_wo_human.groupby(COLUMNS)["cleaned"].transform("nunique")

# Exactly 2 distinct values means both True and False are present for the combination.
has_both_variants = cleaned_variants_per_row == 2
filtered_df = df_results_wo_human[has_both_variants].reset_index(drop=True)

filtered_df.groupby("cleaned", as_index=False).mean(numeric_only=True)


Unnamed: 0,cleaned,roc_auc,optimal_threshold,precision,recall,f1,accuracy,tpr_at_fpr_0_01
0,False,0.990523,-0.514943,0.980333,0.971965,0.975664,0.976393,0.845153
1,True,0.990083,-0.515535,0.977874,0.971204,0.9741,0.976073,0.840754


In [43]:
# Step 1: put the f1 values of the cleaned (True) and original (False) runs side by side,
# one row per combination. pivot_table uses its default aggregation (mean) if a
# combination occurs more than once for the same variant.
f1_by_variant = filtered_df.pivot_table(index=COLUMNS, columns="cleaned", values="f1").dropna()

# Step 2: diff = f1(cleaned) - f1(original); a negative value means the cleaned run
# scored lower.
pivoted = f1_by_variant.assign(
    diff=lambda scores: scores[True] - scores[False],
    abs_diff=lambda scores: scores["diff"].abs(),
)

# Step 3: order ALL combinations by descending absolute difference. Despite its name,
# top10 is not truncated; take .head(10) of it for the actual top ten.
top10 = pivoted.sort_values("abs_diff", ascending=False)

# Step 4: keep only the difference columns and turn the index levels into columns.
top10_display = top10.loc[:, ["diff", "abs_diff"]].reset_index()
top10_display

cleaned,training_llm,domain,llm_prompt,test_llm,diff,abs_diff
0,Google-PaLM,writing_prompt,prompt_few_shot,Google-PaLM,-0.069101,0.069101
1,Claude-instant,writing_prompt,prompt_few_shot,Google-PaLM,-0.068263,0.068263
2,Claude-instant,writing_prompt,prompt_SICO,Claude-instant,-0.065995,0.065995
3,Claude-instant,writing_prompt,prompt_few_shot,Claude-instant,-0.058813,0.058813
4,ChatGPT,writing_prompt,prompt_SICO,Claude-instant,0.041632,0.041632
...,...,...,...,...,...,...
154,Llama-2-70b,xsum,direct_prompt,ChatGPT,0.000000,0.000000
155,Llama-2-70b,xsum,paraphrase_polish_llm,ChatGPT,0.000000,0.000000
156,Llama-2-70b,xsum,paraphrase_polish_llm,Llama-2-70b,0.000000,0.000000
157,Llama-2-70b,xsum,prompt_few_shot,ChatGPT,0.000000,0.000000


In [44]:
# The ten combinations with the largest absolute f1 difference (top10_display is
# ordered by descending abs_diff).
top10_display.head(10)

cleaned,training_llm,domain,llm_prompt,test_llm,diff,abs_diff
0,Google-PaLM,writing_prompt,prompt_few_shot,Google-PaLM,-0.069101,0.069101
1,Claude-instant,writing_prompt,prompt_few_shot,Google-PaLM,-0.068263,0.068263
2,Claude-instant,writing_prompt,prompt_SICO,Claude-instant,-0.065995,0.065995
3,Claude-instant,writing_prompt,prompt_few_shot,Claude-instant,-0.058813,0.058813
4,ChatGPT,writing_prompt,prompt_SICO,Claude-instant,0.041632,0.041632
5,Llama-2-70b,xsum,prompt_few_shot,Google-PaLM,-0.026355,0.026355
6,Google-PaLM,writing_prompt,prompt_few_shot,Llama-2-70b,0.021409,0.021409
7,Claude-instant,writing_prompt,prompt_SICO,Llama-2-70b,-0.019831,0.019831
8,Llama-2-70b,writing_prompt,prompt_few_shot,Google-PaLM,-0.019075,0.019075
9,ChatGPT,writing_prompt,prompt_few_shot,Google-PaLM,-0.01867,0.01867


In [40]:
# All results of the models trained on Claude-instant / writing_prompt / prompt_few_shot.
trained_on_claude = df_results_wo_human["training_llm"] == "Claude-instant"
is_writing_prompt = df_results_wo_human["domain"] == "writing_prompt"
is_few_shot = df_results_wo_human["llm_prompt"] == "prompt_few_shot"
df_results_wo_human[trained_on_claude & is_writing_prompt & is_few_shot]

Unnamed: 0,roc_auc,optimal_threshold,conf_matrix,precision,recall,f1,accuracy,tpr_at_fpr_0_01,training_llm,test_llm,hash_df,domain,cleaned,llm_prompt
184,0.979957,-0.505787,"[[144, 13], [5, 104]]",0.888889,0.954128,0.920354,0.932331,0.275229,Claude-instant,Claude-instant,890386674417d8ddb48f96dea27e383b92165c3db6c5dc...,writing_prompt,True,prompt_few_shot
185,0.940752,-0.510063,"[[612, 88], [73, 626]]",0.876751,0.895565,0.886058,0.884918,0.032904,Claude-instant,Llama-2-70b,44f3aae5e8f0f3cee831bb0714df8c95063d650f1677e9...,writing_prompt,True,prompt_few_shot
186,0.977923,-0.498682,"[[665, 35], [29, 275]]",0.887097,0.904605,0.895765,0.936255,0.223684,Claude-instant,Google-PaLM,fa3295986d18c71b05ac05e549c573a6c89a55d207c419...,writing_prompt,True,prompt_few_shot
187,0.992902,-0.49942,"[[669, 31], [14, 683]]",0.956583,0.979914,0.968108,0.967788,0.457676,Claude-instant,ChatGPT,2b91a125f0c7f39f4bee5cdb90e5a5cbe91cd76e8a02ab...,writing_prompt,True,prompt_few_shot
188,0.996368,-0.485927,"[[133, 0], [6, 141]]",1.0,0.959184,0.979167,0.978571,0.959184,Claude-instant,Claude-instant,8262be56c5018c8817ad61af58ab724ecc543452a919be...,writing_prompt,False,prompt_few_shot
189,0.952296,-0.502553,"[[633, 67], [88, 612]]",0.901325,0.874286,0.8876,0.889286,0.09,Claude-instant,Llama-2-70b,f9babf5b1bbfc0b7816bf9778fcf43586c41c87780c52e...,writing_prompt,False,prompt_few_shot
190,0.983965,-0.492166,"[[680, 20], [30, 670]]",0.971014,0.957143,0.964029,0.964286,0.185714,Claude-instant,Google-PaLM,6cda2362c15d7011036d4a166b347abb6ba17192a87a49...,writing_prompt,False,prompt_few_shot
191,0.99542,-0.490298,"[[688, 12], [24, 676]]",0.982558,0.965714,0.974063,0.974286,0.432857,Claude-instant,ChatGPT,228f44b376166a75b25d94ddc61c71e6aacb44869ca9dc...,writing_prompt,False,prompt_few_shot
