# Evaluation of our final model on the RAID benchmark

In [22]:
# import packages
import json
import os
import warnings

# Installed before the heavy third-party imports below, as in the original cell
# order — presumably because the deprecation warning is emitted at import time.
warnings.filterwarnings("ignore", message="torch.utils._pytree._register_pytree_node is deprecated")

from datasets import Dataset, load_dataset
from peft import PeftModel, PeftConfig
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# import RAID
from raid import run_detection, run_evaluation
from raid.utils import load_data

In [23]:
# Turn off parallelism in the tokenizers library
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# Hide FutureWarnings whose message mentions `resume_download`
warnings.filterwarnings(
    action="ignore",
    message=".*resume_download.*",
    category=FutureWarning,
)

In [24]:
# All locations are resolved relative to the directory the kernel was started in
current_dir = os.getcwd()
# Data root and its evaluation sub-folder
data_dir = os.path.join(current_dir, "data/")
eval_dir = os.path.join(data_dir, "evaluation/")
# Folder that contains the LoRA fine-tuning checkpoints
model_dir = os.path.join(current_dir, "ML-LoRA-E5/twitter_raid_data/raid_twitter_LoRA_e5")

In [69]:
# import helper functions
# Each helper script under src/ is executed with the IPython `%run` magic, which
# makes the names defined by the script available in this notebook's namespace.
# NOTE(review): names used in later cells without a visible import
# (inference_model, process_data, get_predicted_labels, PredictionResults, ...)
# presumably come from these scripts — confirm.
# NOTE(review): `$path` is expanded unquoted; a working directory that contains
# spaces may break these calls — verify.
model_helpers_path = os.path.join(current_dir, "src/model_helpers.py")
%run $model_helpers_path

data_helpers_path = os.path.join(current_dir, "src/data_helpers.py")
%run $data_helpers_path

evaluate_helpers_path = os.path.join(current_dir, "src/evaluate_helpers.py")
%run $evaluate_helpers_path

## Load the fine-tuned model

In [5]:
# Directory of the LoRA checkpoint to evaluate.
# NOTE(review): the step number is hard-coded; nothing here verifies that
# checkpoint-29745 is actually the latest checkpoint under model_dir.
lora_checkpoints_dir = os.path.join(model_dir, "checkpoint-29745")

In [6]:
# Load the tokenizer and the base sequence-classification model from the Hub.
# NOTE(review): `tokenizer` is not referenced by any other cell visible in this
# notebook — confirm whether it is still needed.
base_model_name = "intfloat/e5-small"  # NOTE(review): presumably the base model used for fine-tuning — confirm
tokenizer = AutoTokenizer.from_pretrained(base_model_name)
base_model = AutoModelForSequenceClassification.from_pretrained(base_model_name)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at intfloat/e5-small and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [7]:
# Wrap the base model with the LoRA weights stored in lora_checkpoints_dir.
# NOTE(review): the base model load reports a newly initialized classifier head;
# presumably the checkpoint restores trained classifier weights — confirm.
our_model = PeftModel.from_pretrained(base_model, lora_checkpoints_dir)

## 1. Evaluate our detector on the test set

In [8]:
# Load the RAID test split through the raid package helper
test_df = load_data(split="test")
# Disabled alternative: read a local copy from eval_dir instead
# test_df = read_csv(os.path.join(eval_dir, "test_raid_set.csv"))

In [9]:
# Convert the pandas frame into a datasets.Dataset, then print a type check
# followed by the dataset summary (one item per line)
test_ds = Dataset.from_pandas(test_df)
print(isinstance(test_ds, Dataset), test_ds, sep="\n")

True
Dataset({
    features: ['id', 'generation'],
    num_rows: 672000
})


In [10]:
# Draw a small random sample of the test set; the fixed seed keeps the sample
# reproducible across runs
test_subset = test_ds.train_test_split(test_size=0.0001, shuffle=True, seed=12)["test"]
# Call to_pandas() on the instance instead of going through the Dataset class,
# so this cell does not depend on the `Dataset` name being in scope
test_subset_df = test_subset.to_pandas()
print(test_subset_df.head(3))
print(test_subset_df.shape)

                                     id  \
0  c39744bc-bfb4-49b4-8508-4149547be2a5   
1  9c5acac0-def5-44d5-a154-9863a6eab11b   
2  7b8b7930-8aee-4be5-8208-ff41e742d669   

                                          generation  
0  Ingredients:\n- 1 loaf of corn bread, crumbled...  
1  Feker Libi, is a song  by Ehud Banai  and Yeho...  
2  Rebecca L. Gottesman (born 1962) is an America...  
(68, 2)


In [11]:
# Detector callable that is handed to RAID's run_detection
def my_detector(texts: list[str], model = our_model) -> list[float]:
    """Run `inference_model` on `texts` and return its `.predictions`.

    Args:
        texts: Passages to score.
        model: Model forwarded to `inference_model`. The default is bound to
            the `our_model` object that exists when this cell is executed.

    Returns:
        The `predictions` attribute of the object returned by `inference_model`.
        NOTE(review): a later cell reads each score via
        item['score']['Predicted_Probs(1)'], so the elements are presumably
        mappings rather than bare floats — confirm and align the annotation.
    """
    inference_output = inference_model(model, texts)
    return inference_output.predictions

In [12]:
# Run our detector over the sampled test rows.
# The result is iterated in a later cell as records that each carry a 'score' entry.
predictions = run_detection(my_detector, test_subset_df)

Tokenizing data:   0%|          | 0/68 [00:00<?, ? examples/s]



In [13]:
# Pull the 'Predicted_Probs(1)' value out of every prediction record and show the first two
pred_probs = [record['score']['Predicted_Probs(1)'] for record in predictions]
print(pred_probs[:2])

[0.9807470440864563, 0.9815977811813354]


In [14]:
# Write the raw prediction records to <data_dir>/predictions.json (overwrites an existing file).
# NOTE(review): each record's 'score' is read as a mapping elsewhere in this
# notebook; RAID's evaluation presumably expects a numeric score per record —
# confirm before using this file with run_evaluation or a submission.
with open(os.path.join(data_dir, "predictions.json"), 'w') as f:
    json.dump(predictions, f, indent=4)

In [15]:
# Turn the probabilities into labels and print the first ten.
# NOTE(review): 0.95 is presumably the decision threshold; the extra-set
# evaluation further down uses 0.6 — confirm the difference is intentional.
pred_label = get_predicted_labels(pred_probs, 0.95)
print(pred_label[:10])

[1, 1, 1, 1, 1, 0, 1, 1, 1, 1]


## 2. Evaluate our detector on an extra set with a new domain

### 2.1 Load the extra set

In [16]:
#import zipfile
#
## Path to the zip file
#zip_file_path = os.path.join(eval_dir, "RAID_extra_noadv_df.csv.zip")
## Directory to extract to (can be the same as zip_file_path or another directory)
#
## Create the directory if it doesn't exist
#os.makedirs(eval_dir, exist_ok=True)
#
## Open the zip file
#with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
#    # Extract all contents into the specified directory
#    zip_ref.extractall(eval_dir)
#
#print(f"Extracted all files successfully.")

In [17]:
# Load the extra evaluation set from CSV and print its summary.
# The file name (RAID_extra_adv_df.csv) indicates the adversarial-attack version.
extra_set = load_dataset(
    "csv",
    data_files=os.path.join(eval_dir, "RAID_extra_adv_df.csv"),
    split="all",
)
print(extra_set)

Dataset({
    features: ['id', 'adv_source_id', 'source_id', 'model', 'decoding', 'repetition_penalty', 'attack', 'domain', 'title', 'prompt', 'generation'],
    num_rows: 2039100
})


In [18]:
# Process the extra (adversarial-attack) set for evaluation.
# NOTE(review): "generation" and "model" are presumably the text column and the
# column the labels are derived from — confirm against process_data.
processed_set = process_data(extra_set, "generation", "model")

In [19]:
# Print a summary of the processed set: its repr, the result of the
# is_processed_data check, and the label counts reported by label_counter
print(processed_set)
print(is_processed_data(processed_set))
label_counter(processed_set)

ProcessedData with 2039100 rows
True
Count of human-written entries: 58260
Count of machine-generated entries: 1980840


In [70]:
# Draw a small random sample of the processed set for evaluation (fixed seed).
# NOTE(review): excluded_data is presumably the remainder that is not sampled — confirm.
excluded_data, processed_subset = train_test_split_unequal_class(
    processed_set,
    test_size=0.001,
    seed=620,
)

In [71]:
# Show the full processed set first, for comparison with the reduced sample printed below
print(processed_set)

# Remove columns other than text and labels
reduced_processed_subset = process_data(processed_subset, "text", "labels", reduced=True)
print(reduced_processed_subset)

ProcessedData with 2039100 rows
ProcessedData with 2040 rows


### 2.2 Evaluate our detector on the extra set

In [72]:
# Run the fine-tuned model over the sampled extra set.
# Unlike my_detector, which forwards a list of strings, this call passes the processed data object.
predicted_probs = inference_model(our_model, reduced_processed_subset)

Tokenizing data:   0%|          | 0/2040 [00:00<?, ? examples/s]

Processed 100 entries...
Processed 200 entries...
Processed 300 entries...
Processed 400 entries...
Processed 500 entries...
Processed 600 entries...
Processed 700 entries...
Processed 800 entries...
Processed 900 entries...
Processed 1000 entries...
Processed 1100 entries...
Processed 1200 entries...
Processed 1300 entries...
Processed 1400 entries...
Processed 1500 entries...
Processed 1600 entries...
Processed 1700 entries...
Processed 1800 entries...
Processed 1900 entries...
Processed 2000 entries...


In [75]:
# Write the predictions for the sampled extra set to a CSV file under eval_dir
res_csv_path = os.path.join(eval_dir, "pred_extra_adv_subset.csv")
save_inference_to_csv(predicted_probs, reduced_processed_subset, res_csv_path)

Combined DataFrame saved successfully.


In [80]:
# Sanity check: the inference output should be a PredictionResults instance
print(isinstance(predicted_probs, PredictionResults))

# Compute metrics
# NOTE(review): 0.6 is presumably the decision threshold; the RAID test-set cell
# uses 0.95 — confirm the difference is intentional.
predicted_label = get_predicted_labels(predicted_probs, 0.6)
res = evaluate_model(predicted_label, reduced_processed_subset)
print(res)

True
{'Accuracy': 0.956, 'F1 score': 0.955, 'FPR': 0.044}


In [81]:
# Read the saved results back as a pandas.DataFrame and show the first row
res_dataframe = read_inference_as_DataFrame(res_csv_path)
print(res_dataframe.head(1))

                                                text  labels  \
0  import cmath\ndef phase_angle(complex_number):...       1   

   Predicted_Probs(1)  
0            0.718956  


In [82]:
# Read the saved results back as a datasets.Dataset and show the first row
res_dataset = read_inference_as_Dataset(res_csv_path)
print(res_dataset[:1])

{'text': ['import cmath\ndef phase_angle(complex_number):\n    # Convert complex number to Python complex type\n    num = complex(complex_number)\n    \n    # Calculate the phase angle of the complex number\n    angle = cmath.phase(num)\n    \n    # Return the phase angle of complex number\n    return angle'], 'labels': [1], 'Predicted_Probs(1)': [0.718956]}
