# Evaluation of our final model on benchmark RAID

In [2]:
# import packages
import os
import warnings
warnings.filterwarnings("ignore", message="torch.utils._pytree._register_pytree_node is deprecated")
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from peft import PeftModel, PeftConfig
from datasets import load_dataset
# import RAID
import json
from raid import run_detection, run_evaluation
from raid.utils import load_data

In [12]:
# disable parallel computing
os.environ["TOKENIZERS_PARALLELISM"] = "false"
# suppress warnings
warnings.filterwarnings("ignore", message=".*resume_download.*", category=FutureWarning)
os.environ["MKL_SERVICE_FORCE_INTEL"] = "0"  # Suppresses Intel MKL warnings

In [55]:
# Get the working directories
current_dir = os.getcwd()
model_dir = os.path.join(current_dir, "ML-LoRA-E5/mix_data/")
data_dir = os.path.join(current_dir, "data/")
eval_dir = os.path.join(data_dir, "evaluation/")

In [6]:
# import helper functions
model_helpers_path = os.path.join(current_dir, "src/model_helpers.py")
%run $model_helpers_path

data_helpers_path = os.path.join(current_dir, "src/data_helpers.py")
%run $data_helpers_path

evaluate_helpers_path = os.path.join(current_dir, "src/evaluate_helpers.py")
%run $evaluate_helpers_path

## Load the fine-tuned model

In [56]:
# Get the directory of the latest model checkpoints
lora_checkpoints_dir = os.path.join(model_dir, "results_LoRA_e5/checkpoint-53390")

In [8]:
# Load the tokenizer and base model (e.g., BERT, GPT-2)
base_model_name = "intfloat/e5-small"  # Change to the base model you used
tokenizer = AutoTokenizer.from_pretrained(base_model_name)
base_model = AutoModelForSequenceClassification.from_pretrained(base_model_name)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at intfloat/e5-small and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [9]:
# Load the LoRA fine-tuned model (with the latest checkpoints)
our_model = PeftModel.from_pretrained(base_model, lora_checkpoints_dir)

In [13]:
# Sample text for inference
text = "This is an example text to test the model."

# Tokenize the input
inputs = tokenizer(text, return_tensors="pt")

# Run inference
with torch.no_grad():
    outputs = our_model(**inputs)

# Access model logits or predictions
logits = outputs.logits
print(logits)

tensor([[-2.1088,  2.0489]])


## 1. Evaluate our detector on the validation set

### 1.1 Load the validation set

### 1.2 Evaluate the detector on the validation set

## 2. Evaluate our detector on the test set

In [31]:
# Load the RAID test data
test_df = load_data(split="test")

In [28]:
print(test_df)

test_ds = Dataset.from_pandas(test_df)
print(isinstance(test_ds, Dataset))
print(test_ds)

                                          id  \
0       64005577-3d63-4583-8945-7541d3e53e7d   
1       c2b9df67-4e29-45ca-bdcc-7065fb907b77   
2       07904f22-8530-4d3b-bf49-6bd1642d89f7   
3       dc5aa023-6f57-4f9c-833a-c0f322a994fa   
4       1b1ab19b-fe6f-458d-a666-06bbc1791534   
...                                      ...   
671995  b2694dd7-1c4d-4bef-8e52-0c1e13d54130   
671996  9e5c1a37-9305-4ca7-8dc0-ab1ed763231e   
671997  a233aa5d-b375-423f-ad7a-ffc5045398c5   
671998  e267ebb4-b1f7-4af4-b68b-ecd4ba565f93   
671999  45df9738-b31c-495b-9a0d-2c62220df990   

                                               generation  
0         The Sunspot Number, created by R.Wolf in 184...  
1         We present several analogies between convex ...  
2         Let H be a homology theory for algebraic var...  
3         The two parallel concepts of "small" sets of...  
4         We present new solutions to the strong explo...  
...                                                   ...  
671

In [48]:
# Generate a random subset for testing
test_subset = test_ds.train_test_split(test_size=0.001, shuffle=True, seed=12)['test']
test_subset_df = Dataset.to_pandas(test_subset)
print(test_subset_df.head(3))

                                     id  \
0  c39744bc-bfb4-49b4-8508-4149547be2a5   
1  9c5acac0-def5-44d5-a154-9863a6eab11b   
2  7b8b7930-8aee-4be5-8208-ff41e742d669   

                                          generation  
0  Ingredients:\n- 1 loaf of corn bread, crumbled...  
1  Feker Libi, is a song  by Ehud Banai  and Yeho...  
2  Rebecca L. Gottesman (born 1962) is an America...  


In [49]:
# Define my detector function for evaluation on RAID test set
def my_detector(texts: list[str], model = our_model) -> list[float]:
    
    #processed_data = process_data(texts)
    #predicted_prob = inference_model(model, processed_ata)

    data = Dataset.from_dict({"text": texts})
    
    # tokenize input data
    data = data.map(lambda x: tokenize_data(x), batched=True, desc="Tokenizing data")
    # load batches of tokenized data
    dataloader = DataLoader(data, batch_size=1, collate_fn=custom_collate_fn, shuffle=False)
    # create a list for the output prediction
    predictions = []
    # Counter for entries processed
    entry_count = 0

    with torch.no_grad():
        for batch in dataloader:
            batch = {k: v.to(model.device) for k, v in batch.items()}
            outputs = model(**batch)
            logits = outputs.logits
            # Apply softmax to get probabilities
            probs = softmax(logits, dim=-1).cpu().numpy()
            predictions.extend(probs)
            
            # Update counter and show progress every 100 entries
            entry_count += len(probs)
            if entry_count % 100 == 0:
                print(f"Processed {entry_count} entries...")

    return predictions

In [50]:
# Run your detector on the dataset
predictions = run_detection(my_detector, test_subset_df)

Tokenizing data:   0%|          | 0/672 [00:00<?, ? examples/s]

Processed 100 entries...
Processed 200 entries...
Processed 300 entries...
Processed 400 entries...
Processed 500 entries...
Processed 600 entries...


FileNotFoundError: [Errno 2] No such file or directory: 'predictions.json'

In [81]:
predictions_dicts = [{"id": pred['id'], "score": pred['score']} for pred in predictions]
for pred in predictions_dicts:
    pred['score'] = pred['score'].tolist()  # Convert the NumPy array to a list

print(predictions_dicts[0])

{'id': 'c39744bc-bfb4-49b4-8508-4149547be2a5', 'score': [0.03664873167872429, 0.963351309299469]}


In [82]:
with open(os.path.join(data_dir, "predictions.json"), 'w') as f:
    json.dump(predictions_dicts, f, indent=4)

## 3. Evaluate our detector on an extra set with new domain

### 3.1 Load the extra set

In [5]:
import zipfile

# Path to the zip file
zip_file_path = os.path.join(data_dir, "RAID_extra_noadv_df.csv.zip")
# Directory to extract to (can be the same as zip_file_path or another directory)

# Create the directory if it doesn't exist
os.makedirs(data_dir, exist_ok=True)

# Open the zip file
with zipfile.ZipFile(zip_file_path, 'r') as zip_ref:
    # Extract all contents into the specified directory
    zip_ref.extractall(data_dir)

print(f"Extracted all files successfully.")

Extracted all files successfully.


In [6]:
# Load validation set without adversarial attacks
validation_set = load_dataset("csv", data_files=os.path.join(eval_dir, "RAID_extra_noadv_df.csv"), split = 'all')
print(validation_set)

Downloading data files:   0%|          | 0/1 [00:00<?, ?it/s]

Extracting data files:   0%|          | 0/1 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Dataset({
    features: ['id', 'adv_source_id', 'source_id', 'model', 'decoding', 'repetition_penalty', 'attack', 'domain', 'title', 'prompt', 'generation'],
    num_rows: 169925
})


In [7]:
# Process the no-adversarial-attack validation set for training
processed_set = process_data(validation_set, "generation", "model")

Creating labels column:   0%|          | 0/169925 [00:00<?, ? examples/s]

Dropping NAs:   0%|          | 0/169925 [00:00<?, ? examples/s]

Transforming labels to binary 0/1:   0%|          | 0/169925 [00:00<?, ? examples/s]

In [8]:
# Print summary of the processed validation set
print(processed_set)
print(is_processed_data(processed_set))
label_counter(processed_set)

ProcessedData with 169925 rows
True
Count of human-written entries: 4855
Count of machine-generated entries: 165070


In [9]:
#! Remove this chunk and use the full dataset for evaluation

# Choose a random subset of .1% entries without balanced labels
#excluded_set, processed_set = train_test_split(validation_set, test_size=0.0001, seed=7)

# Choose a random subset of 1% entries with balanced labels
excluded_set, processed_set = train_test_split_equal_size(processed_set, test_size=0.01, seed=7)

Filtering label 0:   0%|          | 0/169925 [00:00<?, ? examples/s]

Filtering label 1:   0%|          | 0/169925 [00:00<?, ? examples/s]

In [10]:
print(processed_set)

# Remove columns other than text and labels
reduced_processed_set = process_data(processed_set, "text", "labels", reduced=True)
print(reduced_processed_set)

ProcessedData with 96 rows


Dropping NAs:   0%|          | 0/96 [00:00<?, ? examples/s]

ProcessedData with 96 rows


### 3.2 Evaluate our detector on the extra set

In [15]:
predicted_probs = inference_model(our_model, reduced_processed_set)

Tokenizing data:   0%|          | 0/96 [00:00<?, ? examples/s]

In [16]:
# Save predictions to a CSV file
res_csv_path = os.path.join(eval_dir, 'pred_extra_nonadv_subset.csv')
save_inference_to_csv(predicted_probs, processed_set, res_csv_path)

Combined DataFrame saved successfully.


In [17]:
print(isinstance(predicted_probs, PredictionResults))

# Compute metrics
predicted_label = get_predicted_labels(predicted_probs, 0.9)
res = evaluate_model(predicted_label, reduced_processed_set)
print(res)

True
{'Accuracy': 0.583, 'F1 score': 0.583, 'FPR': 0.417}


In [18]:
# Read the results as pandas.DataFrame
res_dataframe = read_inference_as_DataFrame(res_csv_path)
print(res_dataframe.head(1))

                                     id                         adv_source_id  \
0  ed337d3f-b9d7-4980-b60c-a1961f73bd05  ed337d3f-b9d7-4980-b60c-a1961f73bd05   

                              source_id model  decoding repetition_penalty  \
0  5795208a-5b5c-456d-a137-994297c76d03  gpt4  sampling                 no   

  attack  domain                                          title  \
0   none  german  Barcelona nur 2:2 in Villarreal, Real gewinnt   

                                              prompt  \
0  Schreiben Sie einen Nachrichtenartikel mit dem...   

                                                text  labels  Prediction_0  \
0  In der spanischen Fußball-Liga Primera Divisió...       1      0.038074   

   Prediction_1  
0      0.961926  


In [19]:
# Read the results as datasets.Dataset
res_dataset = read_inference_as_Dataset(res_csv_path)
print(res_dataset[:1])

{'id': ['ed337d3f-b9d7-4980-b60c-a1961f73bd05'], 'adv_source_id': ['ed337d3f-b9d7-4980-b60c-a1961f73bd05'], 'source_id': ['5795208a-5b5c-456d-a137-994297c76d03'], 'model': ['gpt4'], 'decoding': ['sampling'], 'repetition_penalty': ['no'], 'attack': ['none'], 'domain': ['german'], 'title': ['Barcelona nur 2:2 in Villarreal, Real gewinnt'], 'prompt': ['Schreiben Sie einen Nachrichtenartikel mit dem Titel "Barcelona nur 2:2 in Villarreal, Real gewinnt".'], 'text': ['In der spanischen Fußball-Liga Primera División hat der Rennleiter FC Barcelona nur ein Unentschieden gegen den CF Villarreal erreicht. Damit hat Barcelona trotz starker Bemühungen wichtige Punkte verloren und ist jetzt auf dem dritten Platz in der Tabelle gelandet. \n\nDas Spiel in Villarreal endete mit einem 2:2. Barcelona hatte die Führung in der ersten Spielhälfte mit Toren von Messi und Griezmann übernommen. In der zweiten Spielhälfte kämpfte Villarreal jedoch zurück und erzielte in den letzten Minuten zwei Tore, was zu ei