# ESA Project: Fake or Real: The Impostor Hunt in Texts

This notebook is dedicated to **model evaluation**.  
It covers:

- Load the saved, best-performing `DistilBERT` model from the training phase and the tokenized validation dataset
- Run inference on the validation set to obtain the model's predicted probability of the "Real" class (`label=1`) for every individual chunk
- Apply the Realness score comparison strategy to combine chunk-level predictions: the average "Real" probability of all chunks is calculated for `file1` and `file2`
- Calculate the final Accuracy of the `DistilBERT` model on the original text pair classification task
- Compare the `DistilBERT` model's final accuracy against the baseline model's performance to validate the use of the complex model

# Import librairies

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import os
import sys
from pathlib import Path

from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from sklearn.metrics import accuracy_score, precision_recall_fscore_support, confusion_matrix, classification_report
from scipy.special import softmax
import torch
from datasets import Dataset

# Add the src folder to Python path
sys.path.append(os.path.abspath(os.path.join('..', 'src')))
import config

# Custom classes from the model's training notebook

In [2]:
# Define the custom dataset class
class TextPairDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

    def __len__(self):
        return len(self.labels)

# Define the custom trainer class with weighted loss
class WeightedTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        
        # Define class weights
        # Had 0 - 138 and 1 - 74
        weights = torch.tensor([1.0, 138/74]).to(logits.device)
        loss_fct = torch.nn.CrossEntropyLoss(weight=weights)
        
        loss = loss_fct(logits, labels)
        return (loss, outputs) if return_outputs else loss

# Define the compute_metrics function
def compute_metrics(p):
    preds = np.argmax(p.predictions, axis=1)
    # Note: 'binary' average is used because the task is binary classification (0 or 1)
    precision, recall, f1, _ = precision_recall_fscore_support(p.label_ids, preds, average='binary')
    acc = accuracy_score(p.label_ids, preds)
    return {
        'accuracy': acc,
        'f1': f1,
        'precision': precision,
        'recall': recall
    }

# Detect device
device = "cuda" if torch.cuda.is_available() else "mps" if torch.backends.mps.is_available() else "cpu"
print(f"Using device: {device}")

# Define paths
MODEL_PATH = config.OUTPUT_DIR / "distilbert_fake_or_real" / "final_model"
TOKENIZED_TEST_PATH = config.PROCESSED_DATA_DIR / "tokenized_test.pt"

sns.set_theme()

Using device: mps


# Load data and final model

In [3]:
# Load tokenized test dataset
try:
    # Note: You must have a tokenized_test.pt file ready
    test_dataset = torch.load(TOKENIZED_TEST_PATH, weights_only=False)
    print(f"Test dataset size: {len(test_dataset)}")
except FileNotFoundError:
    print(f"Error: Tokenized test data not found at {TOKENIZED_TEST_PATH}")
    sys.exit(1)

# Add labels if needed
if "labels" not in test_dataset:
    n = test_dataset["input_ids"].size(0)
    test_dataset["labels"] = torch.zeros(n, dtype=torch.long)

# Check
print(test_dataset["labels"].shape)

# Convert to HuggingFace Dataset
hf_dataset = Dataset.from_dict({
    "input_ids": test_dataset["input_ids"].tolist(),
    "attention_mask": test_dataset["attention_mask"].tolist(),
    "labels": test_dataset["labels"].tolist()
})

# Load the final model and tokenizer
model = AutoModelForSequenceClassification.from_pretrained(MODEL_PATH)
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
print(f"Model and tokenizer loaded from: {MODEL_PATH}")

# Define minimal training arguments for evaluation
eval_args = TrainingArguments(
    output_dir=str(config.OUTPUT_DIR / "evaluation"),
    per_device_eval_batch_size=64,
    report_to="none",
)

# Instantiate Trainer for evaluation
trainer = WeightedTrainer(
    model=model,
    args=eval_args,
    eval_dataset=hf_dataset,
    compute_metrics=compute_metrics,
)


Test dataset size: 2
torch.Size([2608])
Model and tokenizer loaded from: /Users/photoli93/Desktop/Projets perso Python/esa_fake_or_real/results/distilbert_fake_or_real/final_model


# Model prediction on test set

In [4]:
print("-" * 80)
print("Starting final prediction on test set")
predictions = trainer.predict(hf_dataset)

print("\nTest Set Metrics")
print(predictions)
print("-" * 80)

# Logits from Hugging Face trainer.predict
logits = predictions.predictions
# Convert logits to probabilities
probs = softmax(logits, axis=1)

--------------------------------------------------------------------------------
Starting final prediction on test set




  0%|          | 0/41 [00:00<?, ?it/s]


Test Set Metrics
PredictionOutput(predictions=array([[-0.8699257 ,  1.1607869 ],
       [ 0.89736944, -0.85450935],
       [-0.76462257,  1.1284206 ],
       ...,
       [ 0.63882816, -0.4171373 ],
       [-0.82367814,  1.222086  ],
       [ 0.09846318,  0.1728112 ]], dtype=float32), label_ids=array([0, 0, 0, ..., 0, 0, 0]), metrics={'test_loss': 1.1365916728973389, 'test_model_preparation_time': 0.0007, 'test_accuracy': 0.42829754601226994, 'test_f1': 0.0, 'test_precision': 0.0, 'test_recall': 0.0, 'test_runtime': 124.2052, 'test_samples_per_second': 20.998, 'test_steps_per_second': 0.33})
--------------------------------------------------------------------------------


  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# Reconstruction of the test dataset (unchunked)

In [5]:
test_df_exploded = pd.read_csv("../data/processed/test_exploded.csv")

test_df_exploded["prob_file1_real"] = probs[:,0]
test_df_exploded["prob_file2_real"] = probs[:,1]
display(test_df_exploded.head())

# Sum probabilities over chunks per pair
agg_probs = test_df_exploded.groupby("pair_id")[["prob_file1_real", "prob_file2_real"]].sum().reset_index()
# Decide which file is real
agg_probs["real_text_id"] = np.where(
    agg_probs["prob_file1_real"] > agg_probs["prob_file2_real"], 1, 2
)

agg_probs = agg_probs[["pair_id", "real_text_id"]].rename(columns={"pair_id": "id"})
display(agg_probs.head())

Unnamed: 0,pair_id,file1_text,file2_text,file1_text_cleaned,file2_text_cleaned,file1_text_cleaned_tokens,file1_text_cleaned_num_tokens,file2_text_cleaned_tokens,file2_text_cleaned_num_tokens,file1_text_cleaned_chunks,file1_text_cleaned_num_chunks,file2_text_cleaned_chunks,file2_text_cleaned_num_chunks,text_chunk,source,prob_file1_real,prob_file2_real
0,0,The brightest blue star found within an area c...,GEBURTSiostream\nThe primary standout object o...,bright blue star find within area call h ii re...,geburtsiostream primary standout object observ...,"['bright', 'blue', 'star', 'find', 'within', '...",100,"['ge', '##bu', '##rts', '##ios', '##tream', 'p...",198,['bright blue star find within area call h ii ...,1,['geburtsiostream primary standout object obse...,1,bright blue star find within area call h ii re...,file1,0.116016,0.883984
1,1,The frontiers of space medicine are exploding ...,"In recent decades, scientists have focused on ...",frontier space medicine explode exciting disco...,recent decade scientist focus understand super...,"['frontier', 'space', 'medicine', 'explode', '...",214,"['recent', 'decade', 'scientist', 'focus', 'un...",153,['frontier space medicine explode exciting dis...,1,['recent decade scientist focus understand sup...,1,frontier space medicine explode exciting disco...,file1,0.85219,0.14781
2,2,The second-generation Variable Light Telescope...,**A Giant Leap: The DSM Takes Flight!**\n\nLik...,second generation variable light telescope vlt...,giant leap dsm take flight like giant star rea...,"['second', 'generation', 'variable', 'light', ...",175,"['giant', 'leap', 'ds', '##m', 'take', 'flight...",214,['second generation variable light telescope v...,1,['giant leap dsm take flight like giant star r...,1,second generation variable light telescope vlt...,file1,0.130898,0.869102
3,3,This study examined whether articles published...,underground\nWe analyzed how often articles pu...,study examine whether article publish arxiv re...,underground analyze often article publish arxi...,"['study', 'examine', 'whether', 'article', 'pu...",126,"['underground', 'analyze', 'often', 'article',...",135,['study examine whether article publish arxiv ...,1,['underground analyze often article publish ar...,1,study examine whether article publish arxiv re...,file1,0.241505,0.758495
4,4,The special abilities of ALMA were highlighted...,The distinctive abilities of ALMA were highlig...,special ability alma highlight improve underst...,distinctive ability alma highlight enhance und...,"['special', 'ability', 'alma', 'highlight', 'i...",134,"['distinctive', 'ability', 'alma', 'highlight'...",144,['special ability alma highlight improve under...,1,['distinctive ability alma highlight enhance u...,1,special ability alma highlight improve underst...,file1,0.122651,0.877349


Unnamed: 0,id,real_text_id
0,0,2
1,1,2
2,2,1
3,3,2
4,4,2


# Export final df to csv

In [6]:
agg_probs.to_csv("../data/processed/final_predictions.csv", index=False)

# End of model evaluation notebook