# ESA Project: Fake or Real: The Impostor Hunt in Texts

This notebook is dedicated to **model training**.  
It covers:

- Load the pre-tokenized and saved PyTorch datasets (`tokenized_train.pt` and `tokenized_val.pt`) created in the data creation step.
- Instantiate the pre-trained **DistilBertForSequenceClassification** model, configured for a binary classification task (Real or fake).
- Define the **TrainingArguments** for the Hugging Face Trainer, including hyperparameters such as batch size, learning rate, number of epochs, and logging settings.
- Implemente a function to compute and track key performance metrics on the validation set including **Accuracy, Precision, Recall and F1-score**.
- Executing the training loop using the Hugging Face **Trainer** class which manages the entire process including checkpointing and model saving.
- Save the best-performing model and its tokenizer to disk for later use in inference and deployment.

# Import librairies

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path
import matplotlib.pyplot as plt
import seaborn as sns
import re

import torch
from torch.nn import CrossEntropyLoss
from transformers import DistilBertForSequenceClassification, Trainer, TrainingArguments, AutoTokenizer, EarlyStoppingCallback

import os
import sys

# Add the src folder to Python path
sys.path.append(os.path.abspath(os.path.join('..', 'src')))
import config
from preprocessing import TextPreprocessor, get_text_statistics

import warnings
warnings.filterwarnings("ignore")

sns.set_theme()

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/photoli93/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


# Create Custom Dataset Class

In [2]:
class TextPairDataset(torch.utils.data.Dataset):
    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __getitem__(self, idx):
        item = {key: torch.tensor(val[idx]) for key, val in self.encodings.items()}
        item['labels'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

    def __len__(self):
        return len(self.labels)

# Load tokenized datasets

In [3]:
TOKENIZED_TRAIN_PATH = config.PROCESSED_DATA_DIR / "tokenized_train.pt"
TOKENIZED_VAL_PATH = config.PROCESSED_DATA_DIR / "tokenized_val.pt"

OUTPUT_DIR = config.OUTPUT_DIR / "distilbert_fake_or_real"
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Load tokenized datasets
print(f"Loading tokenized training data from: {TOKENIZED_TRAIN_PATH}")
try:
    train_dataset = torch.load(TOKENIZED_TRAIN_PATH, weights_only=False)
    val_dataset = torch.load(TOKENIZED_VAL_PATH, weights_only=False)
except FileNotFoundError:
    print("Error: Tokenized data not found. Please ensure 03_dataset_creation.py (or your data creation notebook) was run successfully")
    sys.exit(1)

print(f"Training dataset size: {len(train_dataset)}")
print(f"Validation dataset size: {len(val_dataset)}")

Loading tokenized training data from: /Users/photoli93/Desktop/Projets perso Python/esa_fake_or_real/data/processed/tokenized_train.pt
Training dataset size: 212
Validation dataset size: 41


# Load model and tokenizer

In [4]:
# Load the model for sequence classification with 2 labels (Real/Fake)
print(f"Loading model: {config.TOKENIZER_NAME} for Sequence Classification")
model = DistilBertForSequenceClassification.from_pretrained(
    config.TOKENIZER_NAME, 
    num_labels=2
)

# Might need the tokenizer later for prediction/evaluation
tokenizer = AutoTokenizer.from_pretrained(config.TOKENIZER_NAME)

Loading model: distilbert-base-uncased for Sequence Classification


Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


# Define parameters and metrics

In [None]:
# Training configuration
training_args = TrainingArguments(
    output_dir=str(OUTPUT_DIR),          
    num_train_epochs=10,                 # 10 epochs but with early stopping to avoid overfitting
    per_device_train_batch_size=16,      
    per_device_eval_batch_size=64,       
    warmup_steps=500,                    # Nb of steps during which the learning rate increases learning
    weight_decay=0.01,                   # Regularization term to reduce overfitting by penalizing large weights
    logging_dir='./logs',                
    logging_steps=50,                    # Log metrics every 50 training steps
    evaluation_strategy="epoch",         # Evaluate at the end of each epoch
    save_strategy="epoch",               # Save model checkpoint at the end of each epoch
    load_best_model_at_end=True,         # Load the best model found during training
    metric_for_best_model="eval_f1",     # Metric to monitor for best model
    fp16=True,                           # Use 16-bit precision for faster training
    report_to="none"                     # disables wandb, tensorboard, etc. (disabling sending logs online)
)

# Define Evaluation Metric
try:
    from sklearn.metrics import accuracy_score, precision_recall_fscore_support
    
    # p is an object passed by the Hugging Face Trainer. It contains "predictions" and "label_ids"
    def compute_metrics(p):
        preds = np.argmax(p.predictions, axis=1)
        precision, recall, f1, _ = precision_recall_fscore_support(p.label_ids, preds, average='binary')
        acc = accuracy_score(p.label_ids, preds)
        return {
            'accuracy': acc,
            'f1': f1,
            'precision': precision,
            'recall': recall
        }
except ImportError:
    print("Warning: scikit-learn not found. Install with 'pip install scikit-learn' to use advanced metrics")
    def compute_metrics(p):
        return {}

# Custom Loss function

By exploding chunks, it has added unbalance in train dataset (138 for class 0 and 74 for class 1) so custom loss function has to be defined in order to add weights on classes

In [6]:
train_df = pd.read_csv(config.PROCESSED_DATA_DIR / "train_exploded.csv")

# Count of each class
counts = train_df['label'].value_counts().sort_index()
print(counts)

# Dynamic class weights (inverse frequency)
weights = 1.0 / counts
weights = weights / weights.sum()  # Normalize
weights = torch.tensor(weights.values, dtype=torch.float)
print(weights)

# Override the default loss
def compute_loss(model, inputs, return_outputs=False):
    labels = inputs.pop("labels")
    outputs = model(**inputs)
    logits = outputs.logits
    loss_fct = CrossEntropyLoss(weight=weights.to(logits.device))
    loss = loss_fct(logits, labels)
    return (loss, outputs) if return_outputs else loss

label
0    138
1     74
Name: count, dtype: int64
tensor([0.3491, 0.6509])


# Custom subclass Trainer

Since in Hugging Face Transformers, the **Trainer** class does not have a `compute_loss` argument in its constructor, I created a subclass called **WeightedTrainer** to use the custom `compute_loss` method.

In [7]:
class WeightedTrainer(Trainer):
    def compute_loss(self, model, inputs, return_outputs=False):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        
        # Define class weights
        weights = torch.tensor([1.0, 138/74]).to(logits.device)
        loss_fct = CrossEntropyLoss(weight=weights)
        
        loss = loss_fct(logits, labels)
        return (loss, outputs) if return_outputs else loss

# Training process

In [8]:
# Initialize the WeightedTrainer
trainer = WeightedTrainer(
    model=model,                                                  # The instantiated Transformers model to be trained
    args=training_args,                                           # Training arguments, defined above
    train_dataset=train_dataset,                                  # Training dataset
    eval_dataset=val_dataset,                                     # Evaluation dataset
    compute_metrics=compute_metrics,                              # Function to compute metrics
    callbacks=[EarlyStoppingCallback(early_stopping_patience=3)]  # stop if no improvement in 3 epochs
)

# Start Training
print("\nStarting training")
trainer.train()
print("Training complete!")

# Save Final Model
final_model_path = OUTPUT_DIR / "final_model"
trainer.save_model(str(final_model_path))
tokenizer.save_pretrained(str(final_model_path))
print(f"Final model and tokenizer saved to: {final_model_path}")

# Final Evaluation
print("\nFinal evaluation on validation set:")
results = trainer.evaluate()
print(results)

print("\nModel Training Complete")


Starting training


  0%|          | 0/280 [00:00<?, ?it/s]

  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 0.6819775700569153, 'eval_accuracy': 0.4634146341463415, 'eval_f1': 0.6333333333333333, 'eval_precision': 0.4634146341463415, 'eval_recall': 1.0, 'eval_runtime': 2.4839, 'eval_samples_per_second': 16.506, 'eval_steps_per_second': 0.403, 'epoch': 1.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 0.6654515862464905, 'eval_accuracy': 0.6585365853658537, 'eval_f1': 0.7307692307692307, 'eval_precision': 0.5757575757575758, 'eval_recall': 1.0, 'eval_runtime': 2.3248, 'eval_samples_per_second': 17.636, 'eval_steps_per_second': 0.43, 'epoch': 2.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 0.6270184516906738, 'eval_accuracy': 0.7560975609756098, 'eval_f1': 0.7916666666666666, 'eval_precision': 0.6551724137931034, 'eval_recall': 1.0, 'eval_runtime': 2.2782, 'eval_samples_per_second': 17.996, 'eval_steps_per_second': 0.439, 'epoch': 3.0}
{'loss': 0.6564, 'grad_norm': 1.9313827753067017, 'learning_rate': 5e-06, 'epoch': 3.57}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 0.5431719422340393, 'eval_accuracy': 0.8292682926829268, 'eval_f1': 0.8444444444444444, 'eval_precision': 0.7307692307692307, 'eval_recall': 1.0, 'eval_runtime': 2.2877, 'eval_samples_per_second': 17.922, 'eval_steps_per_second': 0.437, 'epoch': 4.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 0.4610339403152466, 'eval_accuracy': 0.7560975609756098, 'eval_f1': 0.7916666666666666, 'eval_precision': 0.6551724137931034, 'eval_recall': 1.0, 'eval_runtime': 2.3416, 'eval_samples_per_second': 17.509, 'eval_steps_per_second': 0.427, 'epoch': 5.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 0.35856807231903076, 'eval_accuracy': 0.8780487804878049, 'eval_f1': 0.8717948717948718, 'eval_precision': 0.85, 'eval_recall': 0.8947368421052632, 'eval_runtime': 2.3095, 'eval_samples_per_second': 17.753, 'eval_steps_per_second': 0.433, 'epoch': 6.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 0.3103157579898834, 'eval_accuracy': 0.8292682926829268, 'eval_f1': 0.8292682926829268, 'eval_precision': 0.7727272727272727, 'eval_recall': 0.8947368421052632, 'eval_runtime': 2.2457, 'eval_samples_per_second': 18.257, 'eval_steps_per_second': 0.445, 'epoch': 7.0}
{'loss': 0.3926, 'grad_norm': 4.192562103271484, 'learning_rate': 9.900000000000002e-06, 'epoch': 7.14}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 0.2897054851055145, 'eval_accuracy': 0.8292682926829268, 'eval_f1': 0.8292682926829268, 'eval_precision': 0.7727272727272727, 'eval_recall': 0.8947368421052632, 'eval_runtime': 2.3161, 'eval_samples_per_second': 17.702, 'eval_steps_per_second': 0.432, 'epoch': 8.0}


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 0.34315812587738037, 'eval_accuracy': 0.8780487804878049, 'eval_f1': 0.8717948717948718, 'eval_precision': 0.85, 'eval_recall': 0.8947368421052632, 'eval_runtime': 2.3241, 'eval_samples_per_second': 17.641, 'eval_steps_per_second': 0.43, 'epoch': 9.0}
{'train_runtime': 203.3584, 'train_samples_per_second': 20.85, 'train_steps_per_second': 1.377, 'train_loss': 0.4672575488923088, 'epoch': 9.0}
Training complete!
Final model and tokenizer saved to: /Users/photoli93/Desktop/Projets perso Python/esa_fake_or_real/results/distilbert_fake_or_real/final_model

Final evaluation on validation set:


  0%|          | 0/1 [00:00<?, ?it/s]

{'eval_loss': 0.35856807231903076, 'eval_accuracy': 0.8780487804878049, 'eval_f1': 0.8717948717948718, 'eval_precision': 0.85, 'eval_recall': 0.8947368421052632, 'eval_runtime': 2.1548, 'eval_samples_per_second': 19.027, 'eval_steps_per_second': 0.464, 'epoch': 9.0}

Model Training Complete


# End of model training notebook