# 0. Setup

In [1]:
import os
import sys
import pandas as pd
from sklearn.model_selection import train_test_split
from tqdm import tqdm

BASE_DIR = "../../"
sys.path.append(BASE_DIR)

from src.general_functions_and_patterns_for_detection import (
    get_info_based_on_input_path,
    load_dataframe_from_json,
    TrainRobertaHelper, RESULT_DIR, CLEANED_FILES_DIR, ORIGINAL_DATA_DIR, TASK_DIR,
    LLMs, DOMAINS, COLUMNS_DIRECTLY_LLM_GENERATED_DETECT_RL as LLM_PROMPTS,
    json_path_abstract, json_path_writing, )

prepare_df_for_roberta_training = TrainRobertaHelper.prepare_df_for_roberta_training
import DetectRL.Detectors.train_roberta as train_roberta

# Seed handed to `train_test_split(random_state=...)` so the train/test split is reproducible.
SEED = 2023

# Run-mode switches (not referenced by the cells visible in this notebook).
ALL_DATA = True
DRY_RUN = False
DEBUG = True

[nltk_data] Downloading package punkt to
[nltk_data]     /home/pdingfelder/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
2025-09-08 08:43:56.718543: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-09-08 08:43:56.733448: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1757321036.752058 1811177 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1757321036.757523 1811177 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W000

In [2]:
# TODO: adjust CUDA setup depending on your setup
os.environ.update({
    # Disable NCCL features incompatible with RTX 40xx
    "NCCL_P2P_DISABLE": "1",
    "NCCL_IB_DISABLE": "1",
    # Restrict to only GPU 0 (CUDA:0)
    "CUDA_VISIBLE_DEVICES": "0",
})

# 1. Execute training for all LLMs, domains, prompts

- Run the training for both the cleaned and the uncleaned (original) data.
- Check the generalisation performance from cleaned to uncleaned data.

In [None]:
# Train one RoBERTa detector per (domain, data variant, LLM, prompt) combination and evaluate
# it on the texts of every *other* LLM taken from the same prepared dataframe.
# Combinations whose transfer results already exist on disk are skipped.
data_paths = [CLEANED_FILES_DIR, ORIGINAL_DATA_DIR]
counter = 0
total = len(DOMAINS) * len(data_paths) * len(LLMs) * len(LLM_PROMPTS)

for _domain in tqdm(DOMAINS):
    # The column keys depend on the domain only, so look them up once per domain.
    _, prompt_key, human_key = get_info_based_on_input_path(_domain)

    for training_data_path in data_paths:
        # Read the raw file once per (domain, data variant) instead of once per (LLM, prompt).
        # Compare by equality: a prefix test would misclassify the cleaned directory if it
        # happened to be nested below the original data directory.
        if training_data_path == ORIGINAL_DATA_DIR:
            raw_df = load_dataframe_from_json(f"{ORIGINAL_DATA_DIR}{_domain}_2800.json")
        else:
            raw_df = pd.read_parquet(f"{CLEANED_FILES_DIR}{_domain}_2800_cleaned_all_v2.parquet")

        for _llm in LLMs:
            for prompt in LLM_PROMPTS:
                # Work on a copy so the cached raw frame stays untouched even if the helper
                # modifies its input in place.
                training_df = prepare_df_for_roberta_training(raw_df.copy(),
                                                              column_to_be_used_for_text=prompt,
                                                              column_to_be_used_for_human=human_key,
                                                              column_title=prompt_key)

                # Rows of the LLM the detector is trained on ...
                df_train_llm = training_df[training_df["llm_type"] == _llm]
                # ... and one transfer (evaluation) set per remaining LLM.
                transfer_df = [
                    training_df[training_df["llm_type"] == other_llm].dropna(subset=["label", "text"])
                    for other_llm in LLMs if other_llm != _llm
                ]

                train_df, test_df = train_test_split(df_train_llm, test_size=0.2, random_state=SEED, shuffle=True)

                # NOTE(review): the path contains neither the domain nor the data variant, so the
                # model checkpoints of different domains / variants share one directory -
                # confirm that this is intended.
                save_model_path = f"{RESULT_DIR}{_llm}_{prompt}_test"

                # Keep only the transfer sets for which no result file exists yet.
                not_executed_df: list = [
                    _df for _df in transfer_df
                    if not os.path.exists(
                        f"{save_model_path}/{train_roberta.hash_dataframe_as_parquet(_df)}.roberta-base_result.json"
                    )
                ]

                # Train as soon as at least ONE transfer result is missing
                # (the previous `> 1` check silently skipped a single missing result).
                if not_executed_df:
                    # NOTE(review): device is "cpu" although the setup cell selects GPU 0 - confirm.
                    args = train_roberta.generate_args_for_training_roberta(
                        train_df=train_df, test_df=test_df, transfer_df=not_executed_df,
                        save_model_path=save_model_path, device="cpu"
                    )

                    train_roberta.run(args)
                    status = "model saved to:"
                else:
                    status = "skipped (all results exist), model path:"

                counter += 1
                # `:.1%` scales the fraction by 100, so the printed value really is a percentage.
                print(f"Iteration: {counter}/{total} ({counter / total:.1%}) {status} {save_model_path}")

  0%|          | 0/4 [00:00<?, ?it/s]

{'eval_loss': 0.6696428656578064, 'eval_accuracy': 1.0, 'eval_f1': 1.0, 'eval_precision': 1.0, 'eval_recall': 1.0, 'eval_runtime': 1.6796, 'eval_samples_per_second': 130.984, 'eval_steps_per_second': 16.671, 'epoch': 1.0}
{'eval_loss': 0.6051893830299377, 'eval_accuracy': 1.0, 'eval_f1': 1.0, 'eval_precision': 1.0, 'eval_recall': 1.0, 'eval_runtime': 1.7017, 'eval_samples_per_second': 129.279, 'eval_steps_per_second': 16.454, 'epoch': 2.0}
{'eval_loss': 0.5443264245986938, 'eval_accuracy': 1.0, 'eval_f1': 1.0, 'eval_precision': 1.0, 'eval_recall': 1.0, 'eval_runtime': 1.7186, 'eval_samples_per_second': 128.011, 'eval_steps_per_second': 16.292, 'epoch': 3.0}
{'train_runtime': 76.2849, 'train_samples_per_second': 35.394, 'train_steps_per_second': 4.444, 'train_loss': 0.6293072995886339, 'epoch': 3.0}
{'eval_loss': 0.5443264245986938, 'eval_accuracy': 1.0, 'eval_f1': 1.0, 'eval_precision': 1.0, 'eval_recall': 1.0, 'eval_runtime': 1.741, 'eval_samples_per_second': 126.363, 'eval_steps_per_

 25%|██▌       | 1/4 [4:36:54<13:50:43, 16614.54s/it]

Iteration:  40 0.25 % model saved to: ..//Detector_Results/ChatGPT_prompt_SICO_test
{'eval_loss': 0.6811037063598633, 'eval_accuracy': 0.8409090909090909, 'eval_f1': 0.8409090909090909, 'eval_precision': 0.8409090909090909, 'eval_recall': 0.8409090909090909, 'eval_runtime': 1.7585, 'eval_samples_per_second': 125.105, 'eval_steps_per_second': 15.923, 'epoch': 1.0}
{'eval_loss': 0.6613017916679382, 'eval_accuracy': 0.9318181818181818, 'eval_f1': 0.9318181818181818, 'eval_precision': 0.9318181818181818, 'eval_recall': 0.9318181818181818, 'eval_runtime': 1.7745, 'eval_samples_per_second': 123.979, 'eval_steps_per_second': 15.779, 'epoch': 2.0}
{'eval_loss': 0.6409193277359009, 'eval_accuracy': 0.9363636363636364, 'eval_f1': 0.9363636363636364, 'eval_precision': 0.9363636363636364, 'eval_recall': 0.9363636363636364, 'eval_runtime': 1.7923, 'eval_samples_per_second': 122.749, 'eval_steps_per_second': 15.623, 'epoch': 3.0}
{'train_runtime': 76.9253, 'train_samples_per_second': 34.67, 'train_s

 50%|█████     | 2/4 [10:10:07<10:20:03, 18601.91s/it]

Iteration:  80 0.5 % model saved to: ..//Detector_Results/ChatGPT_prompt_SICO_test
{'eval_loss': 0.6644893288612366, 'eval_accuracy': 0.6157407407407407, 'eval_f1': 0.6157407407407407, 'eval_precision': 0.6157407407407407, 'eval_recall': 0.6157407407407407, 'eval_runtime': 1.6545, 'eval_samples_per_second': 130.55, 'eval_steps_per_second': 16.319, 'epoch': 1.0}
{'eval_loss': 0.5865931510925293, 'eval_accuracy': 0.8888888888888888, 'eval_f1': 0.8888888888888888, 'eval_precision': 0.8888888888888888, 'eval_recall': 0.8888888888888888, 'eval_runtime': 1.69, 'eval_samples_per_second': 127.811, 'eval_steps_per_second': 15.976, 'epoch': 2.0}
{'eval_loss': 0.5238308906555176, 'eval_accuracy': 0.9351851851851852, 'eval_f1': 0.9351851851851852, 'eval_precision': 0.9351851851851852, 'eval_recall': 0.9351851851851852, 'eval_runtime': 1.7087, 'eval_samples_per_second': 126.411, 'eval_steps_per_second': 15.801, 'epoch': 3.0}
{'train_runtime': 76.1398, 'train_samples_per_second': 34.831, 'train_step

# 2. Compare transfer between cleaned and uncleaned data

In [8]:
# Baseline experiment: train on the original arXiv abstracts (Claude-instant, direct prompt)
# and evaluate on writing-prompt stories of all four LLMs, cleaned and uncleaned.
llm_names = ["Claude-instant", "Llama-2-70b", "Google-PaLM", "ChatGPT"]

# Training data: original arXiv abstracts, restricted to Claude-instant.
training_df = prepare_df_for_roberta_training(load_dataframe_from_json(json_path_abstract), "direct_prompt")
is_claude = training_df["llm_type"] == "Claude-instant"
df_claude = training_df[is_claude]

# Transfer data (uncleaned): one frame per LLM from the original writing-prompt file.
test_data_nc = prepare_df_for_roberta_training(
    load_dataframe_from_json(json_path_writing), "direct_prompt", "story", "story_prompt"
)
uncleaned_frames = []
for llm_name in llm_names:
    uncleaned_frames.append(test_data_nc[test_data_nc["llm_type"] == llm_name].dropna(subset=["label", "text"]))
df_claude_nc, df_llama, df_palm, df_chatgpt = uncleaned_frames

# Transfer data (cleaned): same split per LLM from the cleaned writing-prompt parquet file.
test_data_path = f"{CLEANED_FILES_DIR}writing_prompt_2800_cleaned_all_v2.parquet"
test_data_cleaned = prepare_df_for_roberta_training(
    pd.read_parquet(test_data_path), "direct_prompt", "story", "story_prompt"
)
cleaned_frames = []
for llm_name in llm_names:
    cleaned_frames.append(
        test_data_cleaned[test_data_cleaned["llm_type"] == llm_name].dropna(subset=["label", "text"])
    )
df_claude_cleaned, df_llama_cleaned, df_palm_cleaned, df_chatgpt_cleaned = cleaned_frames
print(df_claude.head())

# 80/20 split of the Claude-instant training rows.
train_df, test_df = train_test_split(df_claude, shuffle=True, random_state=SEED, test_size=0.2)
print(train_df.shape, test_df.shape, len(df_llama), len(df_claude), len(df_chatgpt),
      train_df.columns)

# Transfer order: the four cleaned frames first, then the four uncleaned ones.
all_transfer_frames = [
    df_claude_cleaned, df_llama_cleaned, df_palm_cleaned, df_chatgpt_cleaned,
    df_claude_nc, df_llama, df_palm, df_chatgpt,
]
args = train_roberta.generate_args_for_training_roberta(
    train_df=train_df,
    test_df=test_df,
    transfer_df=all_transfer_frames,
    save_model_path=f"{RESULT_DIR}claude_direct_prompt_test",
    device="cpu",
)

df_result_generalisation_capability = train_roberta.run(args)

        id                                              title        llm_type  \
1400  1401  Real Time Turbulent Video Perfecting by Image ...  Claude-instant   
1401  1402   Finite Euler products and the Riemann Hypothesis  Claude-instant   
1402  1403  An Adaptive Strategy for the Classification of...  Claude-instant   
1403  1404  Detailed Models of super-Earths: How well can ...  Claude-instant   
1404  1405    The Distribution of AGN in Clusters of Galaxies  Claude-instant   

                                                   text  label  
1400  Image and video quality in Long Range Observat...  human  
1401  We show that if the Riemann Hypothesis is true...  human  
1402  One of the major problems in computational bio...  human  
1403  The field of extrasolar planets has rapidly ex...  human  
1404  We present a study of the distribution of AGN ...  human  
(1120, 5) (280, 5) 1400 1400 1400 Index(['id', 'title', 'llm_type', 'text', 'label'], dtype='object')
{'eval_loss': 0.66645

{'e902b4653fc37f0f78c0e7c2dcbd55892a06d50d2436f6ccaf4584e43f504bb7_train': {'roc_auc': 0.8344687045123726,
  'optimal_threshold': -0.49324870109558105,
  'conf_matrix': [[573, 127], [192, 495]],
  'precision': 0.7958199356913184,
  'recall': 0.7205240174672489,
  'f1': 0.7563025210084033,
  'accuracy': 0.7700072098053352,
  'tpr_at_fpr_0_01': 0.07278020378457059},
 '55b89cf7534df82f9a25dfb2676ac48d2ba065af29950638b800a366fb30fd31_train': {'roc_auc': 0.8960226104830422,
  'optimal_threshold': -0.4939974546432495,
  'conf_matrix': [[547, 153], [90, 605]],
  'precision': 0.7981530343007915,
  'recall': 0.8705035971223022,
  'f1': 0.8327598072952512,
  'accuracy': 0.8258064516129032,
  'tpr_at_fpr_0_01': 0.17985611510791366},
 '027aa0a9831e0ddbbdd7a0c6b65ab5a8e3824db1ffdcc5a87dd3d5ac27944228_train': {'roc_auc': 0.8404580397181296,
  'optimal_threshold': -0.49456560611724854,
  'conf_matrix': [[520, 180], [140, 529]],
  'precision': 0.7461212976022567,
  'recall': 0.7907324364723468,
  'f1'

In [18]:
# TODO refactor and delete if possible
# Hard-coded snapshot of the metrics of the detector trained on the original arXiv data.
# Keys are "<hash>_train"; the entry order is the transfer order of the experiment:
# four cleaned writing-prompt sets first, then the four uncleaned ones.
_baseline_metrics = {
    'e902b4653fc37f0f78c0e7c2dcbd55892a06d50d2436f6ccaf4584e43f504bb7_train': {
        'roc_auc': 0.8344687045123726, 'optimal_threshold': -0.49324870109558105,
        'conf_matrix': [[573, 127], [192, 495]],
        'precision': 0.7958199356913184, 'recall': 0.7205240174672489,
        'f1': 0.7563025210084033, 'accuracy': 0.7700072098053352,
        'tpr_at_fpr_0_01': 0.07278020378457059,
    },
    '55b89cf7534df82f9a25dfb2676ac48d2ba065af29950638b800a366fb30fd31_train': {
        'roc_auc': 0.8960226104830422, 'optimal_threshold': -0.4939974546432495,
        'conf_matrix': [[547, 153], [90, 605]],
        'precision': 0.7981530343007915, 'recall': 0.8705035971223022,
        'f1': 0.8327598072952512, 'accuracy': 0.8258064516129032,
        'tpr_at_fpr_0_01': 0.17985611510791366,
    },
    '027aa0a9831e0ddbbdd7a0c6b65ab5a8e3824db1ffdcc5a87dd3d5ac27944228_train': {
        'roc_auc': 0.8404580397181296, 'optimal_threshold': -0.49456560611724854,
        'conf_matrix': [[520, 180], [140, 529]],
        'precision': 0.7461212976022567, 'recall': 0.7907324364723468,
        'f1': 0.7677793904208998, 'accuracy': 0.7662527392257122,
        'tpr_at_fpr_0_01': 0.11210762331838565,
    },
    'cae350ec72bfed35e4b12d723ce7f3b9f0675b820ec6aab4558dee91b6c3469f_train': {
        'roc_auc': 0.8968591836734694, 'optimal_threshold': -0.48945221304893494,
        'conf_matrix': [[629, 71], [175, 525]],
        'precision': 0.8808724832214765, 'recall': 0.75,
        'f1': 0.8101851851851852, 'accuracy': 0.8242857142857143,
        'tpr_at_fpr_0_01': 0.24428571428571427,
    },
    'd63ba4ece3f3ec5db675869df54a1e72f3fc437fe91ca5d4eb05b84bc5eb2fa4_train': {
        'roc_auc': 0.9395499999999999, 'optimal_threshold': -0.4869958460330963,
        'conf_matrix': [[654, 46], [116, 584]],
        'precision': 0.926984126984127, 'recall': 0.8342857142857143,
        'f1': 0.8781954887218045, 'accuracy': 0.8842857142857142,
        'tpr_at_fpr_0_01': 0.38142857142857145,
    },
    '1d7e405be7b1a17406ccb92039bada8fd16621fd9bd68130850c8af790f82c2b_train': {
        'roc_auc': 0.8994387755102042, 'optimal_threshold': -0.4939974546432495,
        'conf_matrix': [[547, 153], [89, 611]],
        'precision': 0.7997382198952879, 'recall': 0.8728571428571429,
        'f1': 0.8346994535519126, 'accuracy': 0.8271428571428572,
        'tpr_at_fpr_0_01': 0.19,
    },
    '06d2693f5dfd8639ab3d8ce7d5f4c81c7f925e2d94edd7463a339c7f618c9243_train': {
        'roc_auc': 0.8357826530612245, 'optimal_threshold': -0.49361446499824524,
        'conf_matrix': [[547, 153], [173, 527]],
        'precision': 0.775, 'recall': 0.7528571428571429,
        'f1': 0.763768115942029, 'accuracy': 0.7671428571428571,
        'tpr_at_fpr_0_01': 0.10857142857142857,
    },
    '941f984101c349c3e16f19cc0a6f59700baaf9e5852c5a0e7712bfc80be20e88_train': {
        'roc_auc': 0.8975244897959184, 'optimal_threshold': -0.48945221304893494,
        'conf_matrix': [[629, 71], [173, 527]],
        'precision': 0.8812709030100334, 'recall': 0.7528571428571429,
        'f1': 0.8120184899845917, 'accuracy': 0.8257142857142857,
        'tpr_at_fpr_0_01': 0.24714285714285714,
    },
}

# One row per transfer set, one column per metric.
df_result_generalisation_capability = pd.DataFrame(_baseline_metrics).T

# Label the rows with the experiment setup (same order as the dict entries above).
_llm_test_order = ["Claude-instant", "Llama-2-70b", "Google-PaLM", "ChatGPT"]
df_result_generalisation_capability["domain_train"] = "arxiv"
df_result_generalisation_capability["domain_test"] = "writing_prompt"
df_result_generalisation_capability["cleaned"] = [True] * 4 + [False] * 4
df_result_generalisation_capability["llm_train"] = "Claude-instant"
df_result_generalisation_capability["llm_test"] = _llm_test_order * 2

df_result_generalisation_capability

Unnamed: 0,roc_auc,optimal_threshold,conf_matrix,precision,recall,f1,accuracy,tpr_at_fpr_0_01,domain_train,domain_test,cleaned,llm_train,llm_test
e902b4653fc37f0f78c0e7c2dcbd55892a06d50d2436f6ccaf4584e43f504bb7_train,0.834469,-0.493249,"[[573, 127], [192, 495]]",0.79582,0.720524,0.756303,0.770007,0.07278,arxiv,writing_prompt,True,Claude-instant,Claude-instant
55b89cf7534df82f9a25dfb2676ac48d2ba065af29950638b800a366fb30fd31_train,0.896023,-0.493997,"[[547, 153], [90, 605]]",0.798153,0.870504,0.83276,0.825806,0.179856,arxiv,writing_prompt,True,Claude-instant,Llama-2-70b
027aa0a9831e0ddbbdd7a0c6b65ab5a8e3824db1ffdcc5a87dd3d5ac27944228_train,0.840458,-0.494566,"[[520, 180], [140, 529]]",0.746121,0.790732,0.767779,0.766253,0.112108,arxiv,writing_prompt,True,Claude-instant,Google-PaLM
cae350ec72bfed35e4b12d723ce7f3b9f0675b820ec6aab4558dee91b6c3469f_train,0.896859,-0.489452,"[[629, 71], [175, 525]]",0.880872,0.75,0.810185,0.824286,0.244286,arxiv,writing_prompt,True,Claude-instant,ChatGPT
d63ba4ece3f3ec5db675869df54a1e72f3fc437fe91ca5d4eb05b84bc5eb2fa4_train,0.93955,-0.486996,"[[654, 46], [116, 584]]",0.926984,0.834286,0.878195,0.884286,0.381429,arxiv,writing_prompt,False,Claude-instant,Claude-instant
1d7e405be7b1a17406ccb92039bada8fd16621fd9bd68130850c8af790f82c2b_train,0.899439,-0.493997,"[[547, 153], [89, 611]]",0.799738,0.872857,0.834699,0.827143,0.19,arxiv,writing_prompt,False,Claude-instant,Llama-2-70b
06d2693f5dfd8639ab3d8ce7d5f4c81c7f925e2d94edd7463a339c7f618c9243_train,0.835783,-0.493614,"[[547, 153], [173, 527]]",0.775,0.752857,0.763768,0.767143,0.108571,arxiv,writing_prompt,False,Claude-instant,Google-PaLM
941f984101c349c3e16f19cc0a6f59700baaf9e5852c5a0e7712bfc80be20e88_train,0.897524,-0.489452,"[[629, 71], [173, 527]]",0.881271,0.752857,0.812018,0.825714,0.247143,arxiv,writing_prompt,False,Claude-instant,ChatGPT


In [14]:
# Counterpart to the baseline experiment: train on the *cleaned* arXiv abstracts
# (Claude-instant, direct prompt) and evaluate on the same eight writing-prompt transfer sets.
train_data_path = f"{CLEANED_FILES_DIR}arxiv_2800_cleaned_all_v2.parquet"
training_df = pd.read_parquet(train_data_path)
training_df = prepare_df_for_roberta_training(training_df, "direct_prompt")
df_claude = training_df[training_df["llm_type"] == "Claude-instant"]

# Uncleaned transfer sets, one per LLM.
test_data_nc = load_dataframe_from_json(json_path_writing)
test_data_nc = prepare_df_for_roberta_training(test_data_nc, "direct_prompt", "story", "story_prompt")
df_claude_nc, df_llama, df_palm, df_chatgpt = [
    test_data_nc[test_data_nc["llm_type"] == _llm].dropna(subset=["label", "text"]) for _llm in
    ["Claude-instant", "Llama-2-70b", "Google-PaLM", "ChatGPT"]]

# Cleaned transfer sets, one per LLM. Kept under its own name so that `test_df` only ever
# refers to the held-out part of the training split below.
test_data_path = f"{CLEANED_FILES_DIR}writing_prompt_2800_cleaned_all_v2.parquet"
test_data_cleaned = pd.read_parquet(test_data_path)
test_data_cleaned = prepare_df_for_roberta_training(test_data_cleaned, "direct_prompt", "story", "story_prompt")
df_claude_cleaned, df_llama_cleaned, df_palm_cleaned, df_chatgpt_cleaned = [
    test_data_cleaned[test_data_cleaned["llm_type"] == _llm].dropna(subset=["label", "text"]) for _llm in
    ["Claude-instant", "Llama-2-70b", "Google-PaLM", "ChatGPT"]]
print(df_claude.head())

train_df, test_df = train_test_split(df_claude, test_size=0.2, random_state=SEED, shuffle=True)
print(train_df.shape, test_df.shape, len(df_llama), len(df_claude), len(df_chatgpt),
      train_df.columns)

# Use a directory of its own: the baseline experiment writes to
# "{RESULT_DIR}claude_direct_prompt_test" and evaluates the very same transfer sets, so sharing
# that path lets this run overwrite the baseline's checkpoint and hash-keyed result files.
args = train_roberta.generate_args_for_training_roberta(
    train_df=train_df, test_df=test_df,
    transfer_df=[df_claude_cleaned, df_llama_cleaned, df_palm_cleaned, df_chatgpt_cleaned, df_claude_nc, df_llama,
                 df_palm, df_chatgpt],
    save_model_path=f"{RESULT_DIR}claude_direct_prompt_cleaned_arxiv_test", device="cpu"
)

results = train_roberta.run(args)

        id                                              title        llm_type  \
1400  1401  Real Time Turbulent Video Perfecting by Image ...  Claude-instant   
1401  1402   Finite Euler products and the Riemann Hypothesis  Claude-instant   
1402  1403  An Adaptive Strategy for the Classification of...  Claude-instant   
1403  1404  Detailed Models of super-Earths: How well can ...  Claude-instant   
1404  1405    The Distribution of AGN in Clusters of Galaxies  Claude-instant   

                                                   text  label  
1400  Image and video quality in Long Range Observat...  human  
1401  We show that if the Riemann Hypothesis is true...  human  
1402  One of the major problems in computational bio...  human  
1403  The field of extrasolar planets has rapidly ex...  human  
1404  We present a study of the distribution of AGN ...  human  
(1120, 5) (280, 5) 1400 1400 1400 Index(['id', 'title', 'llm_type', 'text', 'label'], dtype='object')
{'eval_loss': 0.66977

In [17]:
# Build the result table of the detector trained on the CLEANED arXiv data.
# The source must be `results` (returned by `train_roberta.run` for the cleaned-arXiv run),
# not `df_result_generalisation_capability`: that name holds the already labelled baseline
# table, and transposing it again yields 13 rows, so the 8-element column assignments below fail.
df_result_generalisation_capability_cleaned_arxiv = pd.DataFrame(results).T

# Row order follows the transfer order of the run: four cleaned sets, then four uncleaned sets.
df_result_generalisation_capability_cleaned_arxiv["domain_train"] = "arxiv_cleaned"
df_result_generalisation_capability_cleaned_arxiv["domain_test"] = "writing_prompt"
df_result_generalisation_capability_cleaned_arxiv["cleaned"] = [True, True, True, True, False, False, False, False]
df_result_generalisation_capability_cleaned_arxiv["llm_train"] = "Claude-instant"
df_result_generalisation_capability_cleaned_arxiv["llm_test"] = ["Claude-instant", "Llama-2-70b", "Google-PaLM",
                                                                 "ChatGPT", "Claude-instant", "Llama-2-70b",
                                                                 "Google-PaLM", "ChatGPT"]

df_result_generalisation_capability_cleaned_arxiv

Unnamed: 0,roc_auc,optimal_threshold,conf_matrix,precision,recall,f1,accuracy,tpr_at_fpr_0_01,domain_train,domain_test,cleaned,llm_train,llm_test
e902b4653fc37f0f78c0e7c2dcbd55892a06d50d2436f6ccaf4584e43f504bb7_train,0.798775,-0.493716,"[[653, 47], [327, 360]]",0.884521,0.524017,0.658135,0.730353,0.120815,arxiv_cleaned,writing_prompt,True,Claude-instant,Claude-instant
55b89cf7534df82f9a25dfb2676ac48d2ba065af29950638b800a366fb30fd31_train,0.757049,-0.504179,"[[467, 233], [172, 523]]",0.691799,0.752518,0.720882,0.709677,0.070504,arxiv_cleaned,writing_prompt,True,Claude-instant,Llama-2-70b
027aa0a9831e0ddbbdd7a0c6b65ab5a8e3824db1ffdcc5a87dd3d5ac27944228_train,0.867549,-0.496443,"[[607, 93], [163, 506]]",0.844741,0.756353,0.798107,0.813002,0.149477,arxiv_cleaned,writing_prompt,True,Claude-instant,Google-PaLM
cae350ec72bfed35e4b12d723ce7f3b9f0675b820ec6aab4558dee91b6c3469f_train,0.936529,-0.497638,"[[605, 95], [101, 599]]",0.863112,0.855714,0.859397,0.86,0.312857,arxiv_cleaned,writing_prompt,True,Claude-instant,ChatGPT
d63ba4ece3f3ec5db675869df54a1e72f3fc437fe91ca5d4eb05b84bc5eb2fa4_train,0.466047,-0.521376,"[[85, 615], [51, 649]]",0.513449,0.927143,0.660896,0.524286,0.001429,arxiv_cleaned,writing_prompt,False,Claude-instant,Claude-instant
1d7e405be7b1a17406ccb92039bada8fd16621fd9bd68130850c8af790f82c2b_train,0.753385,-0.504153,"[[468, 232], [183, 517]]",0.690254,0.738571,0.713596,0.703571,0.067143,arxiv_cleaned,writing_prompt,False,Claude-instant,Llama-2-70b
06d2693f5dfd8639ab3d8ce7d5f4c81c7f925e2d94edd7463a339c7f618c9243_train,0.847173,-0.496463,"[[607, 93], [201, 499]]",0.842905,0.712857,0.772446,0.79,0.108571,arxiv_cleaned,writing_prompt,False,Claude-instant,Google-PaLM
941f984101c349c3e16f19cc0a6f59700baaf9e5852c5a0e7712bfc80be20e88_train,0.93638,-0.497435,"[[609, 91], [105, 595]]",0.867347,0.85,0.858586,0.86,0.314286,arxiv_cleaned,writing_prompt,False,Claude-instant,ChatGPT


In [27]:
# Stack the baseline and the cleaned-arXiv result tables, turn the hash index into an
# "index" column and sort by it so that rows for the same transfer set end up adjacent.
_stacked_results = pd.concat(
    [df_result_generalisation_capability, df_result_generalisation_capability_cleaned_arxiv]
)
_stacked_results = _stacked_results.reset_index()
df_claude_combined = _stacked_results.sort_values(by="index").reset_index(drop=True)

# Keep only the rows where the test LLM differs from the training LLM (Claude-instant).
_is_other_llm = df_claude_combined["llm_test"] != "Claude-instant"
df_claude_combined_generalisation = df_claude_combined[_is_other_llm]
df_claude_combined_generalisation.head()

Unnamed: 0,index,roc_auc,optimal_threshold,conf_matrix,precision,recall,f1,accuracy,tpr_at_fpr_0_01,domain_train,domain_test,cleaned,llm_train,llm_test
0,027aa0a9831e0ddbbdd7a0c6b65ab5a8e3824db1ffdcc5...,0.840458,-0.494566,"[[520, 180], [140, 529]]",0.746121,0.790732,0.767779,0.766253,0.112108,arxiv,writing_prompt,True,Claude-instant,Google-PaLM
1,027aa0a9831e0ddbbdd7a0c6b65ab5a8e3824db1ffdcc5...,0.867549,-0.496443,"[[607, 93], [163, 506]]",0.844741,0.756353,0.798107,0.813002,0.149477,arxiv_cleaned,writing_prompt,True,Claude-instant,Google-PaLM
2,06d2693f5dfd8639ab3d8ce7d5f4c81c7f925e2d94edd7...,0.835783,-0.493614,"[[547, 153], [173, 527]]",0.775,0.752857,0.763768,0.767143,0.108571,arxiv,writing_prompt,False,Claude-instant,Google-PaLM
3,06d2693f5dfd8639ab3d8ce7d5f4c81c7f925e2d94edd7...,0.847173,-0.496463,"[[607, 93], [201, 499]]",0.842905,0.712857,0.772446,0.79,0.108571,arxiv_cleaned,writing_prompt,False,Claude-instant,Google-PaLM
4,1d7e405be7b1a17406ccb92039bada8fd16621fd9bd681...,0.899439,-0.493997,"[[547, 153], [89, 611]]",0.799738,0.872857,0.834699,0.827143,0.19,arxiv,writing_prompt,False,Claude-instant,Llama-2-70b


In [33]:
# Mean of the metrics per (training domain, cleaned transfer data?) combination.
_group_keys = ["domain_train", "cleaned"]
_metric_columns = ["roc_auc", "f1", "accuracy", "tpr_at_fpr_0_01"]
df_claude_combined_generalisation[_metric_columns + _group_keys].groupby(_group_keys).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,roc_auc,f1,accuracy,tpr_at_fpr_0_01
domain_train,cleaned,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
arxiv,False,0.877582,0.803495,0.806667,0.181905
arxiv,True,0.87778,0.803575,0.805448,0.17875
arxiv_cleaned,False,0.845646,0.781542,0.784524,0.163333
arxiv_cleaned,True,0.853709,0.792796,0.794227,0.177613
