# 0. Setup

## 0.0 imports

In [1]:
import os
import pandas as pd
import sys
import transformers

BASE_DIR = "../../"
sys.path.append(BASE_DIR)

from src.general_functions_and_patterns_for_detection import (
    TrainRobertaHelper, TrainingDataHandler,
    RESULT_DIR, REGEX_CLEANED_FILES, ORIGINAL_DATA_DIR,
    seed_everything
)

SEED = 2023
seed_everything(SEED)

prepare_df_for_roberta_training = TrainRobertaHelper.prepare_df_for_roberta_training
import DetectRL.Detectors.train_roberta as train_roberta

2025-09-09 13:10:49.323934: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2025-09-09 13:10:49.342479: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1757423449.362017 2076975 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1757423449.368076 2076975 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1757423449.382654 2076975 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking 

In [2]:
# TODO: adjust CUDA setup depending on your setup
# Disable NCCL features incompatible with RTX 40xx
os.environ["NCCL_P2P_DISABLE"] = "1"
os.environ["NCCL_IB_DISABLE"] = "1"

# Restrict to only GPU 0 (CUDA:0)
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

In [3]:
RESULT_DIR = os.path.join(RESULT_DIR, "T02")
model_cleaned_dataset = f"{RESULT_DIR}claude_direct_prompt_all_domains_multi_llm_cleaned"
model_uncleaned_dataset = f"{RESULT_DIR}claude_direct_prompt_all_domains_multi_llm_not_cleaned"
os.makedirs(RESULT_DIR, exist_ok=True)

# 1. Train for Claude direct prompts

## 1.0 Get data files

In [4]:
PROMPTS = ["direct_prompt"]
_prompt_str = "_" + "-".join(PROMPTS)

df_claude_cleaned, df_llama_cleaned, df_palm_cleaned, df_chatgpt_cleaned = \
    TrainingDataHandler.load_dataframes_all_llms_all_domains(REGEX_CLEANED_FILES,
                                                             _suffix_path="_cleaned_all_v3.parquet", prompts=PROMPTS,
                                                             paraphrase_polish_human_as_ai=False)

df_claude_nc, df_llama_nc, df_palm_nc, df_chatgpt_nc = \
    TrainingDataHandler.load_dataframes_all_llms_all_domains(ORIGINAL_DATA_DIR, _suffix_path=".json", prompts=PROMPTS,
                                                             paraphrase_polish_human_as_ai=False)

# evaluate
# train using the cleaned domains and training structures
train_df_claude, test_df_claude, adjusted_df_claude, sample_ids_claude = \
    TrainingDataHandler.split_training_data_frame_and_adjust_transfer_test_df(df_claude_cleaned, df_claude_nc, _seed=SEED)

## 1.1 Using cleaned data

In [5]:
print(train_df_claude.shape, test_df_claude.shape, adjusted_df_claude.shape)
adjusted_df_claude.tail()

(4448, 7) (1111, 7) (1120, 7)


Unnamed: 0,id,context,llm_type,text,domain,label,llm_prompting_strategy
5577,2078,Barnes & Noble in Southpark is a great place t...,Claude-instant,Here is a 17 sentence continuation of the revi...,yelp_review,llm,direct_prompt
5583,2084,"So help me God, if I ever get married or have ...",Claude-instant,Here are 23 additional sentences continuing th...,yelp_review,llm,direct_prompt
5584,2085,I drive out here for these reasons: nnI like m...,Claude-instant,Here is a continued 13 sentence review based o...,yelp_review,llm,direct_prompt
5586,2087,While in Charlotte for business we ventured to...,Claude-instant,Here is a continued 14 sentence review based o...,yelp_review,llm,direct_prompt
5593,2094,Oh upstream how I love thee.,Claude-instant,Here is a continued 20 sentence review based o...,yelp_review,llm,direct_prompt


In [6]:
test_df_claude["text"].value_counts(dropna=False)

text
Oh upstream how I love thee. Your bright colors and flowing waters fill me with such joy. Every morning I awake eager to see what beauty you will bring. You never cease to amaze with your restless motion, now rushing quickly, now slowly winding your way. There is such peace to be found in your calming sound and flow.  All who live nearby are lucky indeed to have your catchy melodies as part of their everyday lives. Your banks are decorated with lovely wildflowers that wave happily in the breeze. Small creatures live along your edges, knowing you will provide for their needs. The trees that line you are always shady and refreshing. Birds sing joyously as they bathe in your refreshing waters. Their songs are like music to my ears each day. How magical it is to see the sunrise and sunset painted across your reflective surface. I never grow tired of watching you throughout the changing seasons. When summer is hot, your cool depths are so welcoming. In winter, you are transformed with 

In [7]:
adjusted_df_claude["text"].value_counts(dropna=False)

text
As an AI language model, I am unable to engage with content that may violate my usage guidelines. To learn more, visit https://poe.com/usage_guidelines. As an AI language model, I am unable to engage with content that may violate my usage guidelines. To learn more, visit https://poe.com/usage_guidelines.                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                  

In [8]:
adjusted_df_claude[adjusted_df_claude["llm_prompting_strategy"] == "direct_prompt"].label.value_counts()

label
human    560
llm      560
Name: count, dtype: int64

In [9]:
args = train_roberta.generate_args_for_training_roberta(
    train_df=train_df_claude, test_df=test_df_claude, transfer_df=[test_df_claude],
    save_model_path=model_cleaned_dataset, seed=SEED
)
results_cleaned_train = train_roberta.run(args)
df_cleaned_train = pd.DataFrame(results_cleaned_train).T
df_cleaned_train.to_csv(f"{RESULT_DIR}/claude_trained_cleaned_samples_results{_prompt_str}.csv")
df_cleaned_train

  return forward_call(*args, **kwargs)


{'eval_loss': 0.2446754425764084, 'eval_accuracy': 0.9399092970521542, 'eval_f1': 0.9399092970521542, 'eval_precision': 0.9399092970521542, 'eval_recall': 0.9399092970521542, 'eval_runtime': 8.2968, 'eval_samples_per_second': 106.306, 'eval_steps_per_second': 13.379, 'epoch': 1.0}


  return forward_call(*args, **kwargs)


{'loss': 0.527, 'grad_norm': 27.198169708251953, 'learning_rate': 6.270553064275037e-07, 'epoch': 1.1210762331838564}
{'eval_loss': 0.15168491005897522, 'eval_accuracy': 0.9569160997732427, 'eval_f1': 0.9569160997732427, 'eval_precision': 0.9569160997732427, 'eval_recall': 0.9569160997732427, 'eval_runtime': 8.2902, 'eval_samples_per_second': 106.391, 'eval_steps_per_second': 13.389, 'epoch': 2.0}


  return forward_call(*args, **kwargs)


{'loss': 0.0796, 'grad_norm': 1.5711628198623657, 'learning_rate': 2.533632286995516e-07, 'epoch': 2.242152466367713}
{'eval_loss': 0.15191736817359924, 'eval_accuracy': 0.9591836734693877, 'eval_f1': 0.9591836734693877, 'eval_precision': 0.9591836734693877, 'eval_recall': 0.9591836734693877, 'eval_runtime': 8.284, 'eval_samples_per_second': 106.47, 'eval_steps_per_second': 13.399, 'epoch': 3.0}
{'train_runtime': 385.0437, 'train_samples_per_second': 27.784, 'train_steps_per_second': 3.475, 'train_loss': 0.24019161218663146, 'epoch': 3.0}


  return forward_call(*args, **kwargs)


{'eval_loss': 0.15191736817359924, 'eval_accuracy': 0.9591836734693877, 'eval_f1': 0.9591836734693877, 'eval_precision': 0.9591836734693877, 'eval_recall': 0.9591836734693877, 'eval_runtime': 8.2863, 'eval_samples_per_second': 106.441, 'eval_steps_per_second': 13.396, 'epoch': 3.0}


  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)


Unnamed: 0,roc_auc,optimal_threshold,conf_matrix,precision,recall,f1,accuracy,tpr_at_fpr_0_01
2be8f87f817457a355cb9f0daccd9c9429c1a8e21304fea208bdf18da1187297_train,0.996429,-0.06589,"[[547, 13], [4, 547]]",0.976786,0.99274,0.984698,0.984698,0.009074


## 1.2 Using original data

In [10]:
# train using the uncleaned domains and training structures
# evaluate
train_df_claude_nc, test_df_claude_nc, adjusted_df_claude_cleaned, sample_ids_claude_nc = \
    TrainingDataHandler.split_training_data_frame_and_adjust_transfer_test_df(df_claude_nc, df_claude_cleaned, _seed=SEED)

args = train_roberta.generate_args_for_training_roberta(
    train_df=train_df_claude_nc, test_df=test_df_claude_nc, transfer_df=[adjusted_df_claude_cleaned],
    save_model_path=model_uncleaned_dataset, seed=SEED
)

In [11]:
results_cleaned_train = train_roberta.run(args)
df_not_cleaned_train = pd.DataFrame(results_cleaned_train).T
df_not_cleaned_train.to_csv(f"{RESULT_DIR}/claude_trained_original_samples_results{_prompt_str}.csv")
df_not_cleaned_train

  return forward_call(*args, **kwargs)


{'eval_loss': 0.1766575127840042, 'eval_accuracy': 0.9866071428571429, 'eval_f1': 0.9866071428571429, 'eval_precision': 0.9866071428571429, 'eval_recall': 0.9866071428571429, 'eval_runtime': 8.5029, 'eval_samples_per_second': 105.376, 'eval_steps_per_second': 13.172, 'epoch': 1.0}


  return forward_call(*args, **kwargs)


{'loss': 0.5312, 'grad_norm': 4.64056396484375, 'learning_rate': 6.287202380952381e-07, 'epoch': 1.1160714285714286}
{'eval_loss': 0.008780060335993767, 'eval_accuracy': 0.9988839285714286, 'eval_f1': 0.9988839285714286, 'eval_precision': 0.9988839285714286, 'eval_recall': 0.9988839285714286, 'eval_runtime': 8.4868, 'eval_samples_per_second': 105.575, 'eval_steps_per_second': 13.197, 'epoch': 2.0}


  return forward_call(*args, **kwargs)


{'loss': 0.0302, 'grad_norm': 0.6168240904808044, 'learning_rate': 2.566964285714285e-07, 'epoch': 2.232142857142857}
{'eval_loss': 0.007081814110279083, 'eval_accuracy': 0.9988839285714286, 'eval_f1': 0.9988839285714286, 'eval_precision': 0.9988839285714286, 'eval_recall': 0.9988839285714286, 'eval_runtime': 8.4895, 'eval_samples_per_second': 105.542, 'eval_steps_per_second': 13.193, 'epoch': 3.0}
{'train_runtime': 389.4598, 'train_samples_per_second': 27.607, 'train_steps_per_second': 3.451, 'train_loss': 0.21249082258769444, 'epoch': 3.0}


  return forward_call(*args, **kwargs)


{'eval_loss': 0.007081814110279083, 'eval_accuracy': 0.9988839285714286, 'eval_f1': 0.9988839285714286, 'eval_precision': 0.9988839285714286, 'eval_recall': 0.9988839285714286, 'eval_runtime': 8.4804, 'eval_samples_per_second': 105.656, 'eval_steps_per_second': 13.207, 'epoch': 3.0}


  return forward_call(*args, **kwargs)
  return forward_call(*args, **kwargs)


Unnamed: 0,roc_auc,optimal_threshold,conf_matrix,precision,recall,f1,accuracy,tpr_at_fpr_0_01
2be8f87f817457a355cb9f0daccd9c9429c1a8e21304fea208bdf18da1187297_train,0.996429,-0.993189,"[[539, 21], [4, 547]]",0.963028,0.99274,0.977659,0.977498,0.551724


# 2. "Attack" the models (adversarial attacks)

In [12]:
test_df_human = test_df_claude[test_df_claude["label"] == "human"]
example_human_cleaned = test_df_human.iloc[0].text
example_human_cleaned

'We present a study of the distribution of AGN in clusters of galaxies with auniformly selected, spectroscopically complete sample of 35 AGN in eightclusters at z = 0.06 to 0.31. We find that the 12 AGN with L_X > 10^42 erg/s ingalaxies more luminous than a rest-frame M_R < -20 mag are more centrallyconcentrated than typical cluster galaxies of this luminosity, although theseAGN have comparable velocity and substructure distributions to other clustermembers. In contrast, a larger sample of 30 cluster AGN with L_X > 10^41 erg/sdo not show evidence for greater concentration than inactive members, norevidence for a different kinematic or substructure distribution. As we do seeclear differences in the spatial and kinematic distributions of the blueButcher-Oemler and red galaxy populations, any difference in the AGN andinactive galaxy population must be less distinct than that between these twopairs of populations. Comparison of the AGN fraction selected via X-rayemission in this study to s

In [13]:
# direct prompt, claude contaminated, id 1401
test_df_llm = test_df_claude[test_df_claude["label"] == "llm"]
example_llm_cleaned = test_df_llm.iloc[0].text

test_df_llm_nc = test_df_claude_nc[test_df_claude_nc["label"] == "llm"]
example_llm_not_cleaned = test_df_llm_nc.iloc[0].text

example_llm_not_cleaned, example_llm_cleaned

('Here is a 6 sentence abstract for the article title "The Distribution of AGN in Clusters of Galaxies":Active galactic nuclei (AGN) are found throughout the universe, however their distribution within galaxy clusters is not well understood. This study analyzes observational data from X-ray and radio surveys to identify AGN located within hundreds of galaxy clusters. The positions and luminosities of detected AGN are compared to properties of their host clusters such as galaxy number, mass, and spatial distribution. Preliminary results suggest that brighter AGN prefer clusters with fewer member galaxies and lower masses. Further analysis is needed to better understand whether cluster environment plays a role in triggering or suppressing AGN activity. This work aims to provide new insights into the connection between galaxy cluster evolution and growth of supermassive black holes.',
 'Active galactic nuclei (AGN) are found throughout the universe, however their distribution within galax

In [14]:
detector = transformers.AutoModelForSequenceClassification.from_pretrained(model_uncleaned_dataset).to("cuda")
tokenizer = transformers.AutoTokenizer.from_pretrained(model_uncleaned_dataset)
tokenized = tokenizer(example_llm_not_cleaned, padding=True, truncation=True, max_length=512,
                      return_tensors="pt").to("cuda")
_prediction = detector(**tokenized).logits.softmax(-1).tolist()
_prediction

  return forward_call(*args, **kwargs)


[[0.0019935283344238997, 0.9980065226554871]]

In [15]:
tokenized = tokenizer(example_llm_cleaned, padding=True, truncation=True, max_length=512,
                      return_tensors="pt").to("cuda")
_prediction = detector(**tokenized).logits.softmax(-1).tolist()[0]
_prediction

[0.08858197927474976, 0.911418080329895]

In [16]:
detector_cleaned = transformers.AutoModelForSequenceClassification.from_pretrained(model_cleaned_dataset).to("cuda")
tokenizer_cleaned = transformers.AutoTokenizer.from_pretrained(model_cleaned_dataset)
tokenized = tokenizer_cleaned(example_llm_not_cleaned, padding=True, truncation=True, max_length=512,
                              return_tensors="pt").to("cuda")
_prediction = detector_cleaned(**tokenized).logits.softmax(-1)[:, 0].tolist()[0]
_prediction

0.0037773523945361376

In [17]:
print(TrainRobertaHelper.predict_example(_example=example_human_cleaned, _detector=detector, _tokenizer=tokenizer),
      TrainRobertaHelper.predict_example(_example=example_human_cleaned, _detector=detector_cleaned, _tokenizer=tokenizer_cleaned))

[[0.9976292252540588, 0.0023708133958280087]] [[0.9971010088920593, 0.0028989289421588182]]


In [18]:
print(TrainRobertaHelper.predict_example(tokenizer, detector,
                                         "Here is a 6 sentence abstract for the provided article title: " + example_human_cleaned),
      TrainRobertaHelper.predict_example(tokenizer_cleaned, detector_cleaned,
                                         "Here is a 6 sentence abstract for the provided article title:" + example_human_cleaned))

[[0.5029028654098511, 0.4970971643924713]] [[0.9970930814743042, 0.0029069383163005114]]


In [19]:
print(TrainRobertaHelper.predict_example(tokenizer, detector,
                                         "Here is a 7 sentence abstract for the provided article title: " + example_human_cleaned),
      TrainRobertaHelper.predict_example(tokenizer_cleaned, detector_cleaned,
                                         "Here is a 7 sentence abstract for the provided article title:" + example_human_cleaned))

[[0.4383700489997864, 0.5616299510002136]] [[0.9970946311950684, 0.002905398840084672]]


In [20]:
averages: list = []
test_df_human = test_df_claude[test_df_claude["label"] == "human"]
for i in range(len(test_df_human)):
    example_human_cleaned = test_df_human.iloc[i].text
    attack_cleaned = "Here is a 7 sentence abstract for the provided article title: " + example_human_cleaned
    averages.append({"original": TrainRobertaHelper.predict_example(tokenizer, detector, attack_cleaned)[0],
                     "cleaned": TrainRobertaHelper.predict_example(tokenizer_cleaned, detector_cleaned, attack_cleaned)[
                         0], "text": attack_cleaned})

result = pd.DataFrame(averages)

In [21]:
result["pred_human_orginal_attacked"] = result.original.apply(lambda x: x[0])
result["pred_human_cleaned_attacked"] = result.cleaned.apply(lambda x: x[0])

result[["pred_human_cleaned_attacked", "pred_human_orginal_attacked"]].describe()

Unnamed: 0,pred_human_cleaned_attacked,pred_human_orginal_attacked
count,560.0,560.0
mean,0.965904,0.056307
std,0.160239,0.120665
min,0.003415,0.002094
25%,0.996223,0.004993
50%,0.996675,0.009762
75%,0.997002,0.045985
max,0.997491,0.850058


In [22]:
(result["pred_human_cleaned_attacked"] > 0.5).value_counts()

pred_human_cleaned_attacked
True     544
False     16
Name: count, dtype: int64

In [23]:
(result["pred_human_orginal_attacked"] > 0.5).value_counts()

pred_human_orginal_attacked
False    547
True      13
Name: count, dtype: int64

In [24]:
result.sort_values(by=["pred_human_cleaned_attacked"], inplace=True)
for _, item in result.iloc[:10].iterrows():
    print(item["pred_human_cleaned_attacked"], "\n", item["text"], "\n\n")

0.003414899343624711 
 Here is a 7 sentence abstract for the provided article title: As children we watched them. Lying on our backs in the grass we watched the stars, letting them dazzle and illuminate our minds. The peepers sang the song of spring and the sweet, crisp air told us that spring had finally come to us. But that was gone now. The pounding of our own hearts in our ears tore away any magic that could have been felt that night. We ran through the darkness like animals, our eyes straining to see what was before us. Shouts in the distance made us run faster as I clutched the bundle to my chest. Somewhere beside me he told me to run faster and when I tried to look at him I fell. Crumbling to the forest floor with my arms still wrapped tightly around my precious gift I felt the blood start to escape from my body. In the darkness my eyes strained to see him. He paused momentarily to look down on us before continuing to run. I knew it would come to that. I knew he would leave us. 

In [25]:
TrainRobertaHelper.predict_example(tokenizer, detector, result.text.iloc[2])[0]

  return forward_call(*args, **kwargs)


[0.002206467790529132, 0.9977935552597046]