# Evaluation of our final model on the RAID benchmark

In [1]:
# import packages
import os
import warnings
import zipfile

# Registered ahead of the third-party imports so the filter is already active while they load.
warnings.filterwarnings("ignore", message="torch.utils._pytree._register_pytree_node is deprecated")

import torch
from datasets import load_dataset
from peft import PeftModel, PeftConfig
from transformers import AutoModelForSequenceClassification, AutoTokenizer

In [2]:
# disable parallel computing
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# suppress warnings
warnings.filterwarnings("ignore", message=".*resume_download.*", category=FutureWarning)
os.environ["MKL_SERVICE_FORCE_INTEL"] = "1"  # Suppresses Intel MKL warnings

In [3]:
# Get the current working directory; the helper, data and model paths below are all joined onto it.
# NOTE(review): presumably the notebook is expected to be launched from the repository root — confirm.
current_dir = os.getcwd()

In [4]:
# import helper functions
# Each helper script is executed with IPython's `%run`, which runs the file and then
# makes the names it defines available in this notebook's namespace.
# NOTE(review): names used later without an import here (e.g. process_data,
# inference_model, evaluate_model, PredictionResults) are presumably defined in
# these scripts — confirm.
model_helpers_path = os.path.join(current_dir, "src/model_helpers.py")
%run $model_helpers_path

data_helpers_path = os.path.join(current_dir, "src/data_helpers.py")
%run $data_helpers_path

evaluate_helpers_path = os.path.join(current_dir, "src/evaluate_helpers.py")
%run $evaluate_helpers_path

## Load the validation dataset

In [5]:
# Unpack the zipped RAID evaluation CSV next to the archive.
# `zipfile` is imported with the other packages at the top of the notebook.
data_dir = os.path.join(current_dir, "data/evaluation/")

# Path to the zip file
zip_file_path = os.path.join(data_dir, "RAID_extra_noadv_df.csv.zip")

# The archive is extracted into its own directory (`data_dir`); make sure it exists
os.makedirs(data_dir, exist_ok=True)

# Open the zip file and extract all of its contents into `data_dir`
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    zip_ref.extractall(data_dir)

# Plain string: there is nothing to interpolate, so no f-prefix is needed
print("Extracted all files successfully.")

Extracted all files successfully.


In [6]:
# Read the extracted RAID CSV (no adversarial attacks) into a single dataset
raid_csv_path = os.path.join(data_dir, "RAID_extra_noadv_df.csv")
validation_set = load_dataset("csv", data_files=raid_csv_path, split="all")
print(validation_set)

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset({
    features: ['id', 'adv_source_id', 'source_id', 'model', 'decoding', 'repetition_penalty', 'attack', 'domain', 'title', 'prompt', 'generation'],
    num_rows: 169925
})


In [7]:
# Process the no-adversarial-attack validation set for evaluation.
# "generation" and "model" are presumably the text column and the column the labels are derived from.
# NOTE(review): judging by the progress output, process_data creates a labels column,
# drops NAs and maps the labels to binary 0/1 — confirm in src/data_helpers.py.
processed_set = process_data(validation_set, "generation", "model")

Creating labels column:   0%|          | 0/169925 [00:00<?, ? examples/s]

Dropping NAs:   0%|          | 0/169925 [00:00<?, ? examples/s]

Transforming labels to binary 0/1:   0%|          | 0/169925 [00:00<?, ? examples/s]

In [8]:
# Print summary of the processed validation set:
# its repr, the result of the is_processed_data check, and the per-label counts.
print(processed_set)
print(is_processed_data(processed_set))
label_counter(processed_set)

ProcessedData with 169925 rows
True
Count of human-written entries: 4855
Count of machine-generated entries: 165070


In [9]:
# TODO: Remove this chunk and use the full dataset for evaluation
# NOTE(review): this cell rebinds `processed_set` to the sampled subset, so running it
# a second time samples from the already-reduced set. Re-run the processing cell
# before re-running this one.

# Alternative (disabled): a random subset of .1% entries without balanced labels
#excluded_set, processed_set = train_test_split(validation_set, test_size=0.0001, seed=7)

# Choose a random subset of 1% entries with balanced labels (fixed seed for reproducibility)
excluded_set, processed_set = train_test_split_equal_size(processed_set, test_size=0.01, seed=7)

Filtering label 0:   0%|          | 0/169925 [00:00<?, ? examples/s]

Filtering label 1:   0%|          | 0/169925 [00:00<?, ? examples/s]

In [10]:
# Show the evaluation subset before the column reduction
print(processed_set)

# Remove columns other than text and labels
# (the already-processed set is addressed through its "text" and "labels" columns here)
reduced_processed_set = process_data(processed_set, "text", "labels", reduced=True)
print(reduced_processed_set)

ProcessedData with 96 rows


Dropping NAs:   0%|          | 0/96 [00:00<?, ? examples/s]

ProcessedData with 96 rows


## Load the fine-tuned model

In [11]:
# Specify the model directory
# NOTE(review): `model_dir` is not used in the cells shown here — confirm it is still needed.
model_dir = os.path.join(current_dir, "ML-LoRA-E5/mix_data/final_model")

# Get the directory of the latest model checkpoints (hard-coded to checkpoint-53390)
lora_checkpoints_dir = os.path.join(current_dir, "ML-LoRA-E5/mix_data/results_LoRA_e5/checkpoint-53390")

In [12]:
# Load the tokenizer and the base model that the LoRA adapter is applied to
base_model_name = "intfloat/e5-small"  # presumably must match the base model used for fine-tuning
tokenizer = AutoTokenizer.from_pretrained(base_model_name)
# The sequence-classification head is not part of this checkpoint and is newly
# initialized here (see the warning printed by this cell).
base_model = AutoModelForSequenceClassification.from_pretrained(base_model_name)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at intfloat/e5-small and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
# Load the LoRA fine-tuned model (with the latest checkpoints) on top of the base model
# NOTE(review): the base model's classifier head is freshly initialized, so the trained
# head presumably has to come from this checkpoint — confirm it is saved there.
our_model = PeftModel.from_pretrained(base_model, lora_checkpoints_dir)

In [14]:
# Smoke test: push one hand-written sentence through the loaded model.
# Sample text for inference
text = "This is an example text to test the model."

# Tokenize the input (return_tensors="pt" yields PyTorch tensors)
inputs = tokenizer(text, return_tensors="pt")

# Run inference without tracking gradients
# (`torch` must already be available in the notebook namespace at this point)
with torch.no_grad():
    outputs = our_model(**inputs)

# Access the raw model logits (not normalized to probabilities)
logits = outputs.logits
print(logits)

tensor([[-2.1088,  2.0489]])


## Evaluate our detector on the validation set

In [15]:
# Run the detector on the reduced (text + labels) evaluation subset.
# NOTE(review): the result is later checked to be a PredictionResults instance;
# presumably it holds per-class probabilities — confirm in src/model_helpers.py.
predicted_probs = inference_model(our_model, reduced_processed_set)

Tokenizing data:   0%|          | 0/96 [00:00<?, ? examples/s]

In [16]:
# Save predictions to a CSV file
# The full-column `processed_set` (not the reduced one) is passed, so the saved file
# carries the metadata columns alongside the predictions.
# NOTE(review): this assumes `processed_set` and `reduced_processed_set` have the same
# rows in the same order (no rows dropped during the reduction) — confirm.
res_csv_path = os.path.join(data_dir, 'pred_extra_nonadv_subset.csv')
save_inference_to_csv(predicted_probs, processed_set, res_csv_path)

Combined DataFrame saved successfully.


In [17]:
# Sanity check: the inference output should be a PredictionResults object
print(isinstance(predicted_probs, PredictionResults))

# Derive labels from the predictions using a named threshold, then score them
# against the reduced evaluation subset
decision_threshold = 0.9
predicted_label = get_predicted_labels(predicted_probs, decision_threshold)
res = evaluate_model(predicted_label, reduced_processed_set)
print(res)

True
{'Accuracy': 0.583, 'F1 score': 0.583, 'FPR': 0.417}


In [18]:
# Read the results as pandas.DataFrame
# (reloads the CSV written above and shows its first row as a quick check)
res_dataframe = read_inference_as_DataFrame(res_csv_path)
print(res_dataframe.head(1))

                                     id                         adv_source_id  \
0  ed337d3f-b9d7-4980-b60c-a1961f73bd05  ed337d3f-b9d7-4980-b60c-a1961f73bd05   

                              source_id model  decoding repetition_penalty  \
0  5795208a-5b5c-456d-a137-994297c76d03  gpt4  sampling                 no   

  attack  domain                                          title  \
0   none  german  Barcelona nur 2:2 in Villarreal, Real gewinnt   

                                              prompt  \
0  Schreiben Sie einen Nachrichtenartikel mit dem...   

                                                text  labels  Prediction_0  \
0  In der spanischen Fußball-Liga Primera Divisió...       1      0.038074   

   Prediction_1  
0      0.961926  


In [19]:
# Read the results as datasets.Dataset
# (same CSV as above; slicing with [:1] prints the first row as a dict of lists)
res_dataset = read_inference_as_Dataset(res_csv_path)
print(res_dataset[:1])

{'id': ['ed337d3f-b9d7-4980-b60c-a1961f73bd05'], 'adv_source_id': ['ed337d3f-b9d7-4980-b60c-a1961f73bd05'], 'source_id': ['5795208a-5b5c-456d-a137-994297c76d03'], 'model': ['gpt4'], 'decoding': ['sampling'], 'repetition_penalty': ['no'], 'attack': ['none'], 'domain': ['german'], 'title': ['Barcelona nur 2:2 in Villarreal, Real gewinnt'], 'prompt': ['Schreiben Sie einen Nachrichtenartikel mit dem Titel "Barcelona nur 2:2 in Villarreal, Real gewinnt".'], 'text': ['In der spanischen Fußball-Liga Primera División hat der Rennleiter FC Barcelona nur ein Unentschieden gegen den CF Villarreal erreicht. Damit hat Barcelona trotz starker Bemühungen wichtige Punkte verloren und ist jetzt auf dem dritten Platz in der Tabelle gelandet. \n\nDas Spiel in Villarreal endete mit einem 2:2. Barcelona hatte die Führung in der ersten Spielhälfte mit Toren von Messi und Griezmann übernommen. In der zweiten Spielhälfte kämpfte Villarreal jedoch zurück und erzielte in den letzten Minuten zwei Tore, was zu ei