# Ensemble Voting Model — Don't Patronize Me!

**Binary PCL classification** using RoBERTa, DistilBERT, and DeBERTa with majority-vote ensemble.

## 1. Imports

In [1]:
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
)
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
from datasets import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score,
    precision_recall_fscore_support,
    confusion_matrix,
    classification_report,
    ConfusionMatrixDisplay,
)
import matplotlib.pyplot as plt
import optuna

print(f"PyTorch version : {torch.__version__}")
print(f"CUDA available  : {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU device      : {torch.cuda.get_device_name(0)}")
    print(f"GPU memory      : {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")

  from .autonotebook import tqdm as notebook_tqdm


PyTorch version : 2.10.0+cu128
CUDA available  : True
GPU device      : Tesla T4
GPU memory      : 15.6 GB


## 2. Device Setup

In [2]:
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
NUM_LABELS = 2
LABEL_NAMES = ["Non-PCL", "PCL"]

# Class weights for the ~9.5:1 imbalance (Non-PCL : PCL).
# Placing on DEVICE once avoids repeated .to() calls inside compute_loss.
CLASS_WEIGHTS = torch.tensor([1.0, 5.0], dtype=torch.float32).to(DEVICE)

print(f"Using device    : {DEVICE}")
print(f"Class weights   : {CLASS_WEIGHTS}  (device: {CLASS_WEIGHTS.device})")

Using device    : cuda
Class weights   : tensor([1., 5.], device='cuda:0')  (device: cuda:0)


## 3. Load & Preprocess Dataset

Binary labels as per the paper: labels 0-1 → **Non-PCL (0)**, labels 2-4 → **PCL (1)**.

We split 80/10/10 into train / val / test. The test set is held out entirely until final evaluation.

In [3]:
def load_data():
    """Load Don't Patronize Me PCL dataset and binarise labels."""
    pcl_columns = ["par_id", "art_id", "keyword", "country_code", "text", "label"]
    df = pd.read_csv(
        "dontpatronizeme_pcl.tsv",
        sep="\t",
        skiprows=4,
        names=pcl_columns,
        on_bad_lines="skip",
        engine="python",
    )

    # Drop rows with missing text or labels
    df = df.dropna(subset=["text", "label"])
    df["label"] = df["label"].astype(int)

    # Binary: 0-1 → Non-PCL (0),  2-4 → PCL (1)
    df["binary_label"] = (df["label"] >= 2).astype(int)

    print(f"Total samples  : {len(df)}")
    print(f"Label distribution:\n{df['binary_label'].value_counts().rename({0: 'Non-PCL', 1: 'PCL'})}")
    print(f"Imbalance ratio: {(df['binary_label'] == 0).sum() / (df['binary_label'] == 1).sum():.2f}:1")

    return df


df = load_data()

# 80 / 10 / 10 stratified split
train_df, temp_df = train_test_split(
    df, test_size=0.2, stratify=df["binary_label"], random_state=42
)
val_df, test_df = train_test_split(
    temp_df, test_size=0.5, stratify=temp_df["binary_label"], random_state=42
)

# Convert to HuggingFace Datasets
train_dataset = Dataset.from_dict({"text": train_df["text"].tolist(), "label": train_df["binary_label"].tolist()})
val_dataset   = Dataset.from_dict({"text": val_df["text"].tolist(),   "label": val_df["binary_label"].tolist()})
test_dataset  = Dataset.from_dict({"text": test_df["text"].tolist(),  "label": test_df["binary_label"].tolist()})

print(f"\nSplit sizes — train: {len(train_dataset)}, val: {len(val_dataset)}, test: {len(test_dataset)}")

Total samples  : 10468
Label distribution:
binary_label
Non-PCL    9475
PCL         993
Name: count, dtype: int64
Imbalance ratio: 9.54:1

Split sizes — train: 8374, val: 1047, test: 1047


## 4. Model Definitions & Tokenisation

We define:
- **Model catalogue** — three transformer architectures
- **`WeightedTrainer`** — custom Trainer that uses class-weighted CrossEntropyLoss. The class weights tensor is moved to device **once** (at init), not on every forward pass.
- **`compute_metrics`** — accuracy, precision, recall, F1
- Per-model tokenisation functions

In [4]:
MODEL_CATALOGUE = {
    "RoBERTa":    "FacebookAI/roberta-base",
    "DistilBERT": "distilbert-base-uncased",
    "DeBERTa":    "microsoft/deberta-v3-base",
}

MAX_LENGTH = 128  # EDA: median 42 word tokens, 95th pct ~105; subword inflation ~1.3x → 128 is safe


# ---------------------------------------------------------------------------
# Weighted Trainer — class weights live on the same device as the model
# ---------------------------------------------------------------------------
class WeightedTrainer(Trainer):
    """Trainer that applies class weights to CrossEntropyLoss.
    
    The weights are sent to `self.args.device` once when training begins,
    rather than on every forward pass.
    """

    def __init__(self, class_weights: torch.Tensor, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Store weights; they'll be moved to the training device in compute_loss
        # via logits.device (which Trainer guarantees is correct).
        self._class_weights = class_weights

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        # Move weights to the same device as the logits (handles multi-GPU too)
        loss_fn = nn.CrossEntropyLoss(
            weight=self._class_weights.to(logits.device)
        )
        loss = loss_fn(logits, labels)
        return (loss, outputs) if return_outputs else loss


# ---------------------------------------------------------------------------
# Metrics
# ---------------------------------------------------------------------------
def compute_metrics(eval_pred):
    """Compute accuracy, precision, recall, F1 for the positive class (PCL)."""
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average="binary", pos_label=1
    )
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "precision": precision, "recall": recall, "f1": f1}


# ---------------------------------------------------------------------------
# Tokenisers
# ---------------------------------------------------------------------------
tokenisers = {}
for name, path in MODEL_CATALOGUE.items():
    tokenisers[name] = AutoTokenizer.from_pretrained(path)
    print(f"Loaded tokeniser for {name}")


def tokenize_dataset(dataset, tokenizer):
    """Tokenise a HuggingFace Dataset with the given tokenizer."""
    def _tok(examples):
        return tokenizer(
            examples["text"], truncation=True, padding="max_length", max_length=MAX_LENGTH
        )
    return dataset.map(_tok, batched=True)

Loaded tokeniser for RoBERTa
Loaded tokeniser for DistilBERT
Loaded tokeniser for DeBERTa


## 5. Bayesian Hyperparameter Optimisation (Optuna)

For each model we run `trainer.hyperparameter_search` with an Optuna backend. This performs **Bayesian optimisation** (Tree-structured Parzen Estimator by default) over learning rate, number of epochs, batch size, and weight decay.

Key requirements:
- A **`model_init`** function (not a pre-built model) so Trainer can reinitialise fresh weights each trial
- An **`hp_space`** function defining the search ranges
- `direction="maximize"` because we optimise F1

In [5]:
def optuna_hp_space(trial):
    """Define the Bayesian search space."""
    return {
        "learning_rate":             trial.suggest_float("learning_rate", 1e-5, 5e-5, log=True),
        "num_train_epochs":          trial.suggest_int("num_train_epochs", 2, 5),
        "per_device_train_batch_size": trial.suggest_categorical("per_device_train_batch_size", [8, 16, 32]),
        "weight_decay":              trial.suggest_float("weight_decay", 0.0, 0.3),
    }


N_TRIALS = 10  # Increase for better search (20-30+), reduce if GPU-constrained

best_hparams = {}   # {model_name: BestRun}
trained_models = {} # {model_name: fine-tuned model}

for name, model_path in MODEL_CATALOGUE.items():

    print(f"\n{'='*60}")
    print(f"  Hyperparameter search for {name}")
    print(f"{'='*60}")

    tokenizer = tokenisers[name]
    train_tok = tokenize_dataset(train_dataset, tokenizer)
    val_tok   = tokenize_dataset(val_dataset, tokenizer)

    # model_init: required so Trainer can create a fresh model each trial
    def make_model_init(path):
        def model_init():
            return AutoModelForSequenceClassification.from_pretrained(
                path, num_labels=NUM_LABELS
            )
        return model_init

    training_args = TrainingArguments(
        output_dir=f"./results/{name}",
        eval_strategy="epoch",
        save_strategy="epoch",
        save_total_limit=1,
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        logging_steps=50,
        fp16=torch.cuda.is_available(),  # mixed precision if GPU available
        report_to="none",                # disable W&B / MLflow
    )

    trainer = WeightedTrainer(
        class_weights=CLASS_WEIGHTS,
        model_init=make_model_init(model_path),
        args=training_args,
        train_dataset=train_tok,
        eval_dataset=val_tok,
        compute_metrics=compute_metrics,
        processing_class=tokenizer,
    )

    best_run = trainer.hyperparameter_search(
        direction="maximize",
        backend="optuna",
        hp_space=optuna_hp_space,
        n_trials=N_TRIALS,
        compute_objective=lambda metrics: metrics["eval_f1"],
    )

    best_hparams[name] = best_run
    print(f"\n✓ {name} best trial — F1: {best_run.objective:.4f}")
    print(f"  Hyperparameters: {best_run.hyperparameters}")


  Hyperparameter search for RoBERTa


Map: 100%|██████████| 8374/8374 [00:00<00:00, 12546.23 examples/s]
Map: 100%|██████████| 1047/1047 [00:00<00:00, 12627.63 examples/s]
Loading weights: 100%|██████████| 197/197 [00:00<00:00, 935.82it/s, Materializing param=roberta.encoder.layer.11.output.dense.weight]              
[1mRobertaForSequenceClassification LOAD REPORT[0m from: FacebookAI/roberta-base
Key                             | Status     | 
--------------------------------+------------+-
lm_head.layer_norm.bias         | UNEXPECTED | 
lm_head.dense.weight            | UNEXPECTED | 
lm_head.dense.bias              | UNEXPECTED | 
lm_head.layer_norm.weight       | UNEXPECTED | 
roberta.embeddings.position_ids | UNEXPECTED | 
lm_head.bias                    | UNEXPECTED | 
classifier.out_proj.bias        | MISSING    | 
classifier.dense.weight         | MISSING    | 
classifier.out_proj.weight      | MISSING    | 
classifier.dense.bias           | MISSING    | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading f

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.60232,0.794196,0.685769,0.218673,0.89,0.351085
2,0.47429,0.42542,0.893028,0.453125,0.58,0.508772
3,0.400352,0.375009,0.864374,0.387097,0.72,0.503497


Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  1.78it/s]
Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  1.77it/s]
Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  1.78it/s]
There were missing keys in the checkpoint model loaded: ['roberta.embeddings.LayerNorm.weight', 'roberta.embeddings.LayerNorm.bias', 'roberta.encoder.layer.0.attention.output.LayerNorm.weight', 'roberta.encoder.layer.0.attention.output.LayerNorm.bias', 'roberta.encoder.layer.0.output.LayerNorm.weight', 'roberta.encoder.layer.0.output.LayerNorm.bias', 'roberta.encoder.layer.1.attention.output.LayerNorm.weight', 'roberta.encoder.layer.1.attention.output.LayerNorm.bias', 'roberta.encoder.layer.1.output.LayerNorm.weight', 'roberta.encoder.layer.1.output.LayerNorm.bias', 'roberta.encoder.layer.2.attention.output.LayerNorm.weight', 'roberta.encoder.layer.2.attention.output.LayerNorm.bias', 'roberta.encoder.layer.2.output.LayerNorm.weight', 'roberta.encoder.layer.2.output.LayerNorm.bias', 'ro

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.556165,0.483427,0.877746,0.393939,0.52,0.448276
2,0.45358,0.389737,0.880611,0.420382,0.66,0.513619
3,0.403508,0.531346,0.905444,0.505051,0.5,0.502513


Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  1.78it/s]
Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  1.80it/s]
Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  1.80it/s]
There were missing keys in the checkpoint model loaded: ['roberta.embeddings.LayerNorm.weight', 'roberta.embeddings.LayerNorm.bias', 'roberta.encoder.layer.0.attention.output.LayerNorm.weight', 'roberta.encoder.layer.0.attention.output.LayerNorm.bias', 'roberta.encoder.layer.0.output.LayerNorm.weight', 'roberta.encoder.layer.0.output.LayerNorm.bias', 'roberta.encoder.layer.1.attention.output.LayerNorm.weight', 'roberta.encoder.layer.1.attention.output.LayerNorm.bias', 'roberta.encoder.layer.1.output.LayerNorm.weight', 'roberta.encoder.layer.1.output.LayerNorm.bias', 'roberta.encoder.layer.2.attention.output.LayerNorm.weight', 'roberta.encoder.layer.2.attention.output.LayerNorm.bias', 'roberta.encoder.layer.2.output.LayerNorm.weight', 'roberta.encoder.layer.2.output.LayerNorm.bias', 'ro

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.425063,0.409238,0.82235,0.332031,0.85,0.477528
2,0.311603,0.350196,0.924546,0.598131,0.64,0.618357


Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  1.79it/s]
Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  1.79it/s]
There were missing keys in the checkpoint model loaded: ['roberta.embeddings.LayerNorm.weight', 'roberta.embeddings.LayerNorm.bias', 'roberta.encoder.layer.0.attention.output.LayerNorm.weight', 'roberta.encoder.layer.0.attention.output.LayerNorm.bias', 'roberta.encoder.layer.0.output.LayerNorm.weight', 'roberta.encoder.layer.0.output.LayerNorm.bias', 'roberta.encoder.layer.1.attention.output.LayerNorm.weight', 'roberta.encoder.layer.1.attention.output.LayerNorm.bias', 'roberta.encoder.layer.1.output.LayerNorm.weight', 'roberta.encoder.layer.1.output.LayerNorm.bias', 'roberta.encoder.layer.2.attention.output.LayerNorm.weight', 'roberta.encoder.layer.2.attention.output.LayerNorm.bias', 'roberta.encoder.layer.2.output.LayerNorm.weight', 'roberta.encoder.layer.2.output.LayerNorm.bias', 'roberta.encoder.layer.3.attention.output.LayerNorm.weight', 'roberta.e

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.473269,0.499902,0.816619,0.314516,0.78,0.448276
2,0.358873,0.430419,0.892073,0.459119,0.73,0.563707
3,0.341936,0.554975,0.917861,0.563636,0.62,0.590476


Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  1.78it/s]
Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  1.79it/s]
Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  1.79it/s]
There were missing keys in the checkpoint model loaded: ['roberta.embeddings.LayerNorm.weight', 'roberta.embeddings.LayerNorm.bias', 'roberta.encoder.layer.0.attention.output.LayerNorm.weight', 'roberta.encoder.layer.0.attention.output.LayerNorm.bias', 'roberta.encoder.layer.0.output.LayerNorm.weight', 'roberta.encoder.layer.0.output.LayerNorm.bias', 'roberta.encoder.layer.1.attention.output.LayerNorm.weight', 'roberta.encoder.layer.1.attention.output.LayerNorm.bias', 'roberta.encoder.layer.1.output.LayerNorm.weight', 'roberta.encoder.layer.1.output.LayerNorm.bias', 'roberta.encoder.layer.2.attention.output.LayerNorm.weight', 'roberta.encoder.layer.2.attention.output.LayerNorm.bias', 'roberta.encoder.layer.2.output.LayerNorm.weight', 'roberta.encoder.layer.2.output.LayerNorm.bias', 'ro

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.68204,0.403257,0.878701,0.418182,0.69,0.520755
2,0.569201,0.613116,0.927412,0.65,0.52,0.577778


Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  1.77it/s]
Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  1.80it/s]
There were missing keys in the checkpoint model loaded: ['roberta.embeddings.LayerNorm.weight', 'roberta.embeddings.LayerNorm.bias', 'roberta.encoder.layer.0.attention.output.LayerNorm.weight', 'roberta.encoder.layer.0.attention.output.LayerNorm.bias', 'roberta.encoder.layer.0.output.LayerNorm.weight', 'roberta.encoder.layer.0.output.LayerNorm.bias', 'roberta.encoder.layer.1.attention.output.LayerNorm.weight', 'roberta.encoder.layer.1.attention.output.LayerNorm.bias', 'roberta.encoder.layer.1.output.LayerNorm.weight', 'roberta.encoder.layer.1.output.LayerNorm.bias', 'roberta.encoder.layer.2.attention.output.LayerNorm.weight', 'roberta.encoder.layer.2.attention.output.LayerNorm.bias', 'roberta.encoder.layer.2.output.LayerNorm.weight', 'roberta.encoder.layer.2.output.LayerNorm.bias', 'roberta.encoder.layer.3.attention.output.LayerNorm.weight', 'roberta.e

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.484138,0.52935,0.765998,0.276923,0.9,0.423529


[32m[I 2026-02-09 20:40:21,083][0m Trial 5 pruned. [0m
Loading weights: 100%|██████████| 197/197 [00:00<00:00, 1011.82it/s, Materializing param=roberta.encoder.layer.11.output.dense.weight]             
[1mRobertaForSequenceClassification LOAD REPORT[0m from: FacebookAI/roberta-base
Key                             | Status     | 
--------------------------------+------------+-
lm_head.layer_norm.bias         | UNEXPECTED | 
lm_head.dense.weight            | UNEXPECTED | 
lm_head.dense.bias              | UNEXPECTED | 
lm_head.layer_norm.weight       | UNEXPECTED | 
roberta.embeddings.position_ids | UNEXPECTED | 
lm_head.bias                    | UNEXPECTED | 
classifier.out_proj.bias        | MISSING    | 
classifier.dense.weight         | MISSING    | 
classifier.out_proj.weight      | MISSING    | 
classifier.dense.bias           | MISSING    | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISS

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.444432,0.370925,0.87297,0.409836,0.75,0.530035
2,0.341266,0.31744,0.905444,0.503704,0.68,0.578723
3,0.282946,0.399294,0.913085,0.537815,0.64,0.584475
4,0.130023,0.488487,0.91595,0.54918,0.67,0.603604


Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  1.77it/s]
Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  1.79it/s]
Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  1.79it/s]
Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  1.77it/s]
There were missing keys in the checkpoint model loaded: ['roberta.embeddings.LayerNorm.weight', 'roberta.embeddings.LayerNorm.bias', 'roberta.encoder.layer.0.attention.output.LayerNorm.weight', 'roberta.encoder.layer.0.attention.output.LayerNorm.bias', 'roberta.encoder.layer.0.output.LayerNorm.weight', 'roberta.encoder.layer.0.output.LayerNorm.bias', 'roberta.encoder.layer.1.attention.output.LayerNorm.weight', 'roberta.encoder.layer.1.attention.output.LayerNorm.bias', 'roberta.encoder.layer.1.output.LayerNorm.weight', 'roberta.encoder.layer.1.output.LayerNorm.bias', 'roberta.encoder.layer.2.attention.output.LayerNorm.weight', 'roberta.encoder.layer.2.attention.output.LayerNorm.bias', 'roberta.encoder.layer.2.output.Lay

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.466961,0.453259,0.817574,0.320158,0.81,0.458924
2,0.331886,0.370945,0.895893,0.470968,0.73,0.572549
3,0.332039,0.506767,0.91404,0.539683,0.68,0.60177


Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  1.77it/s]
Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  1.79it/s]
Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  1.79it/s]
There were missing keys in the checkpoint model loaded: ['roberta.embeddings.LayerNorm.weight', 'roberta.embeddings.LayerNorm.bias', 'roberta.encoder.layer.0.attention.output.LayerNorm.weight', 'roberta.encoder.layer.0.attention.output.LayerNorm.bias', 'roberta.encoder.layer.0.output.LayerNorm.weight', 'roberta.encoder.layer.0.output.LayerNorm.bias', 'roberta.encoder.layer.1.attention.output.LayerNorm.weight', 'roberta.encoder.layer.1.attention.output.LayerNorm.bias', 'roberta.encoder.layer.1.output.LayerNorm.weight', 'roberta.encoder.layer.1.output.LayerNorm.bias', 'roberta.encoder.layer.2.attention.output.LayerNorm.weight', 'roberta.encoder.layer.2.attention.output.LayerNorm.bias', 'roberta.encoder.layer.2.output.LayerNorm.weight', 'roberta.encoder.layer.2.output.LayerNorm.bias', 'ro

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.458885,0.544526,0.751671,0.266082,0.91,0.411765


[32m[I 2026-02-09 20:48:21,122][0m Trial 8 pruned. [0m
Loading weights: 100%|██████████| 197/197 [00:00<00:00, 1034.36it/s, Materializing param=roberta.encoder.layer.11.output.dense.weight]             
[1mRobertaForSequenceClassification LOAD REPORT[0m from: FacebookAI/roberta-base
Key                             | Status     | 
--------------------------------+------------+-
lm_head.layer_norm.bias         | UNEXPECTED | 
lm_head.dense.weight            | UNEXPECTED | 
lm_head.dense.bias              | UNEXPECTED | 
lm_head.layer_norm.weight       | UNEXPECTED | 
roberta.embeddings.position_ids | UNEXPECTED | 
lm_head.bias                    | UNEXPECTED | 
classifier.out_proj.bias        | MISSING    | 
classifier.dense.weight         | MISSING    | 
classifier.out_proj.weight      | MISSING    | 
classifier.dense.bias           | MISSING    | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISS

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.522013,0.561587,0.696275,0.226131,0.9,0.361446


[32m[I 2026-02-09 20:49:12,775][0m Trial 9 pruned. [0m



✓ RoBERTa best trial — F1: 0.6184
  Hyperparameters: {'learning_rate': 1.2761060278761314e-05, 'num_train_epochs': 2, 'per_device_train_batch_size': 16, 'weight_decay': 0.04901994531265201}

  Hyperparameter search for DistilBERT


Map: 100%|██████████| 8374/8374 [00:00<00:00, 10118.12 examples/s]
Map: 100%|██████████| 1047/1047 [00:00<00:00, 9940.30 examples/s] 
Loading weights: 100%|██████████| 100/100 [00:00<00:00, 1091.32it/s, Materializing param=distilbert.transformer.layer.5.sa_layer_norm.weight]   
[1mDistilBertForSequenceClassification LOAD REPORT[0m from: distilbert-base-uncased
Key                     | Status     | 
------------------------+------------+-
vocab_transform.weight  | UNEXPECTED | 
vocab_layer_norm.bias   | UNEXPECTED | 
vocab_layer_norm.weight | UNEXPECTED | 
vocab_transform.bias    | UNEXPECTED | 
vocab_projector.bias    | UNEXPECTED | 
pre_classifier.bias     | MISSING    | 
pre_classifier.weight   | MISSING    | 
classifier.bias         | MISSING    | 
classifier.weight       | MISSING    | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING[3m	:those params were newly initialized because missing

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.419363,0.383917,0.850048,0.362319,0.75,0.488599
2,0.344226,0.371944,0.855778,0.37799,0.79,0.511327
3,0.302159,0.359886,0.890162,0.451613,0.7,0.54902


Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  3.35it/s]
Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  3.38it/s]
Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  3.38it/s]
There were missing keys in the checkpoint model loaded: ['distilbert.embeddings.LayerNorm.weight', 'distilbert.embeddings.LayerNorm.bias'].
There were unexpected keys in the checkpoint model loaded: ['distilbert.embeddings.LayerNorm.beta', 'distilbert.embeddings.LayerNorm.gamma'].
[32m[I 2026-02-09 20:50:39,148][0m Trial 0 finished with value: 0.5490196078431373 and parameters: {'learning_rate': 1.4018800278946007e-05, 'num_train_epochs': 3, 'per_device_train_batch_size': 32, 'weight_decay': 0.25365076113429197}. Best is trial 0 with value: 0.5490196078431373.[0m
Loading weights: 100%|██████████| 100/100 [00:00<00:00, 1074.45it/s, Materializing param=distilbert.transformer.layer.5.sa_layer_norm.weight]   
[1mDistilBertForSequenceClassification LOAD REPORT[0m from: distilbert-base-

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.635235,0.398325,0.880611,0.421384,0.67,0.517375
2,0.46442,0.537095,0.924546,0.636364,0.49,0.553672
3,0.383585,0.712546,0.920726,0.60241,0.5,0.546448
4,0.213117,0.772691,0.914995,0.550459,0.6,0.574163
5,0.140251,0.842662,0.91213,0.535714,0.6,0.566038


Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  3.37it/s]
Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  3.37it/s]
Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  3.37it/s]
Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  3.36it/s]
Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  3.38it/s]
There were missing keys in the checkpoint model loaded: ['distilbert.embeddings.LayerNorm.weight', 'distilbert.embeddings.LayerNorm.bias'].
There were unexpected keys in the checkpoint model loaded: ['distilbert.embeddings.LayerNorm.beta', 'distilbert.embeddings.LayerNorm.gamma'].
[32m[I 2026-02-09 20:54:33,966][0m Trial 1 finished with value: 0.5660377358490566 and parameters: {'learning_rate': 1.3017569305715164e-05, 'num_train_epochs': 5, 'per_device_train_batch_size': 8, 'weight_decay': 0.24294477592798808}. Best is trial 1 with value: 0.5660377358490566.[0m
Loading weights: 100%|██████████| 100/100 [00:00<00:00, 1051.37it/s, Materializing param

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.623882,0.410186,0.879656,0.416667,0.65,0.507812
2,0.453791,0.575939,0.924546,0.62963,0.51,0.563536


Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  3.38it/s]
Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  3.37it/s]
There were missing keys in the checkpoint model loaded: ['distilbert.embeddings.LayerNorm.weight', 'distilbert.embeddings.LayerNorm.bias'].
There were unexpected keys in the checkpoint model loaded: ['distilbert.embeddings.LayerNorm.beta', 'distilbert.embeddings.LayerNorm.gamma'].
[32m[I 2026-02-09 20:56:08,377][0m Trial 2 finished with value: 0.56353591160221 and parameters: {'learning_rate': 1.9796451687711585e-05, 'num_train_epochs': 2, 'per_device_train_batch_size': 8, 'weight_decay': 0.1687498944913866}. Best is trial 1 with value: 0.5660377358490566.[0m
Loading weights: 100%|██████████| 100/100 [00:00<00:00, 1108.42it/s, Materializing param=distilbert.transformer.layer.5.sa_layer_norm.weight]   
[1mDistilBertForSequenceClassification LOAD REPORT[0m from: distilbert-base-uncased
Key                     | Status     | 
------------------------

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.420179,0.412999,0.802292,0.306859,0.85,0.450928
2,0.357725,0.379504,0.886342,0.43871,0.68,0.533333
3,0.307745,0.405479,0.914995,0.551402,0.59,0.570048


Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  3.37it/s]
Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  3.37it/s]
Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  3.37it/s]
There were missing keys in the checkpoint model loaded: ['distilbert.embeddings.LayerNorm.weight', 'distilbert.embeddings.LayerNorm.bias'].
There were unexpected keys in the checkpoint model loaded: ['distilbert.embeddings.LayerNorm.beta', 'distilbert.embeddings.LayerNorm.gamma'].
[32m[I 2026-02-09 20:57:54,800][0m Trial 3 finished with value: 0.5700483091787439 and parameters: {'learning_rate': 1.2039046270060845e-05, 'num_train_epochs': 3, 'per_device_train_batch_size': 16, 'weight_decay': 0.23633029792620497}. Best is trial 3 with value: 0.5700483091787439.[0m
Loading weights: 100%|██████████| 100/100 [00:00<00:00, 1096.77it/s, Materializing param=distilbert.transformer.layer.5.sa_layer_norm.weight]   
[1mDistilBertForSequenceClassification LOAD REPORT[0m from: distilbert-base-

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.595025,0.423595,0.882521,0.42069,0.61,0.497959
2,0.452327,0.619568,0.925501,0.652778,0.47,0.546512
3,0.172576,0.732805,0.916905,0.56701,0.55,0.558376


Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  3.35it/s]
Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  3.37it/s]
Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  3.38it/s]
There were missing keys in the checkpoint model loaded: ['distilbert.embeddings.LayerNorm.weight', 'distilbert.embeddings.LayerNorm.bias'].
There were unexpected keys in the checkpoint model loaded: ['distilbert.embeddings.LayerNorm.beta', 'distilbert.embeddings.LayerNorm.gamma'].
[32m[I 2026-02-09 21:00:16,150][0m Trial 4 finished with value: 0.5583756345177665 and parameters: {'learning_rate': 2.8793830534251965e-05, 'num_train_epochs': 3, 'per_device_train_batch_size': 8, 'weight_decay': 0.08838708835655204}. Best is trial 3 with value: 0.5700483091787439.[0m
Loading weights: 100%|██████████| 100/100 [00:00<00:00, 1095.25it/s, Materializing param=distilbert.transformer.layer.5.sa_layer_norm.weight]   
[1mDistilBertForSequenceClassification LOAD REPORT[0m from: distilbert-base-u

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.611574,0.417959,0.880611,0.418301,0.64,0.505929


[32m[I 2026-02-09 21:01:02,447][0m Trial 5 pruned. [0m
Loading weights: 100%|██████████| 100/100 [00:00<00:00, 1095.82it/s, Materializing param=distilbert.transformer.layer.5.sa_layer_norm.weight]   
[1mDistilBertForSequenceClassification LOAD REPORT[0m from: distilbert-base-uncased
Key                     | Status     | 
------------------------+------------+-
vocab_transform.weight  | UNEXPECTED | 
vocab_layer_norm.bias   | UNEXPECTED | 
vocab_layer_norm.weight | UNEXPECTED | 
vocab_transform.bias    | UNEXPECTED | 
vocab_projector.bias    | UNEXPECTED | 
pre_classifier.bias     | MISSING    | 
pre_classifier.weight   | MISSING    | 
classifier.bias         | MISSING    | 
classifier.weight       | MISSING    | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING[3m	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.[0m


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.48864,0.488185,0.825215,0.318777,0.73,0.443769


[32m[I 2026-02-09 21:01:36,106][0m Trial 6 pruned. [0m
Loading weights: 100%|██████████| 100/100 [00:00<00:00, 1083.63it/s, Materializing param=distilbert.transformer.layer.5.sa_layer_norm.weight]   
[1mDistilBertForSequenceClassification LOAD REPORT[0m from: distilbert-base-uncased
Key                     | Status     | 
------------------------+------------+-
vocab_transform.weight  | UNEXPECTED | 
vocab_layer_norm.bias   | UNEXPECTED | 
vocab_layer_norm.weight | UNEXPECTED | 
vocab_transform.bias    | UNEXPECTED | 
vocab_projector.bias    | UNEXPECTED | 
pre_classifier.bias     | MISSING    | 
pre_classifier.weight   | MISSING    | 
classifier.bias         | MISSING    | 
classifier.weight       | MISSING    | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING[3m	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.[0m


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.407706,0.431916,0.787011,0.290102,0.85,0.43257


[32m[I 2026-02-09 21:02:09,720][0m Trial 7 pruned. [0m
Loading weights: 100%|██████████| 100/100 [00:00<00:00, 1072.88it/s, Materializing param=distilbert.transformer.layer.5.sa_layer_norm.weight]   
[1mDistilBertForSequenceClassification LOAD REPORT[0m from: distilbert-base-uncased
Key                     | Status     | 
------------------------+------------+-
vocab_transform.weight  | UNEXPECTED | 
vocab_layer_norm.bias   | UNEXPECTED | 
vocab_layer_norm.weight | UNEXPECTED | 
vocab_transform.bias    | UNEXPECTED | 
vocab_projector.bias    | UNEXPECTED | 
pre_classifier.bias     | MISSING    | 
pre_classifier.weight   | MISSING    | 
classifier.bias         | MISSING    | 
classifier.weight       | MISSING    | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING[3m	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.[0m


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.4395,0.398064,0.817574,0.318725,0.8,0.45584


[32m[I 2026-02-09 21:02:43,369][0m Trial 8 pruned. [0m
Loading weights: 100%|██████████| 100/100 [00:00<00:00, 1062.92it/s, Materializing param=distilbert.transformer.layer.5.sa_layer_norm.weight]   
[1mDistilBertForSequenceClassification LOAD REPORT[0m from: distilbert-base-uncased
Key                     | Status     | 
------------------------+------------+-
vocab_transform.weight  | UNEXPECTED | 
vocab_layer_norm.bias   | UNEXPECTED | 
vocab_layer_norm.weight | UNEXPECTED | 
vocab_transform.bias    | UNEXPECTED | 
vocab_projector.bias    | UNEXPECTED | 
pre_classifier.bias     | MISSING    | 
pre_classifier.weight   | MISSING    | 
classifier.bias         | MISSING    | 
classifier.weight       | MISSING    | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING[3m	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.[0m


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.405444,0.427539,0.797517,0.301418,0.85,0.445026


[32m[I 2026-02-09 21:03:17,002][0m Trial 9 pruned. [0m



✓ DistilBERT best trial — F1: 0.5700
  Hyperparameters: {'learning_rate': 1.2039046270060845e-05, 'num_train_epochs': 3, 'per_device_train_batch_size': 16, 'weight_decay': 0.23633029792620497}

  Hyperparameter search for DeBERTa


Map: 100%|██████████| 8374/8374 [00:00<00:00, 10204.01 examples/s]
Map: 100%|██████████| 1047/1047 [00:00<00:00, 10138.65 examples/s]
Loading weights: 100%|██████████| 198/198 [00:00<00:00, 1272.07it/s, Materializing param=deberta.encoder.rel_embeddings.weight]                     
[1mDebertaV2ForSequenceClassification LOAD REPORT[0m from: microsoft/deberta-v3-base
Key                                     | Status     | 
----------------------------------------+------------+-
mask_predictions.classifier.bias        | UNEXPECTED | 
lm_predictions.lm_head.LayerNorm.bias   | UNEXPECTED | 
mask_predictions.LayerNorm.bias         | UNEXPECTED | 
mask_predictions.classifier.weight      | UNEXPECTED | 
lm_predictions.lm_head.dense.weight     | UNEXPECTED | 
lm_predictions.lm_head.bias             | UNEXPECTED | 
mask_predictions.LayerNorm.weight       | UNEXPECTED | 
lm_predictions.lm_head.dense.bias       | UNEXPECTED | 
lm_predictions.lm_head.LayerNorm.weight | UNEXPECTED | 
mask_predictio

ValueError: Attempting to unscale FP16 gradients.

## 6. Train Each Model with Best Hyperparameters

Re-train each model from scratch using the best hyperparameters found above.

In [None]:
trainers = {}  # keep trainers around for prediction

for name, model_path in MODEL_CATALOGUE.items():

    print(f"\n{'='*60}")
    print(f"  Final training: {name}")
    print(f"{'='*60}")

    best = best_hparams[name]
    hp = best.hyperparameters

    tokenizer = tokenisers[name]
    train_tok = tokenize_dataset(train_dataset, tokenizer)
    val_tok   = tokenize_dataset(val_dataset, tokenizer)

    # Build fresh model with best HPs
    model = AutoModelForSequenceClassification.from_pretrained(
        model_path, num_labels=NUM_LABELS
    )

    # Verify the model is on the expected device (Trainer will move it, but let's show it)
    print(f"  Model device before Trainer: {next(model.parameters()).device}")

    training_args = TrainingArguments(
        output_dir=f"./results/{name}_final",
        num_train_epochs=hp.get("num_train_epochs", 3),
        per_device_train_batch_size=hp.get("per_device_train_batch_size", 16),
        per_device_eval_batch_size=32,
        learning_rate=hp.get("learning_rate", 2e-5),
        weight_decay=hp.get("weight_decay", 0.01),
        eval_strategy="epoch",
        save_strategy="epoch",
        save_total_limit=1,
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        logging_steps=50,
        fp16=torch.cuda.is_available(),
        report_to="none",
    )

    trainer = WeightedTrainer(
        class_weights=CLASS_WEIGHTS,
        model=model,
        args=training_args,
        train_dataset=train_tok,
        eval_dataset=val_tok,
        compute_metrics=compute_metrics,
        processing_class=tokenizer,
    )

    trainer.train()

    # After training, Trainer has moved the model to GPU (if available)
    print(f"  Model device after Trainer : {next(model.parameters()).device}")

    trained_models[name] = model
    trainers[name] = trainer
    print(f"✓ {name} final training complete.")

## 7. Per-Model Evaluation — Results & Confusion Matrices

Evaluate each model individually on the **test set**, print classification reports, and plot confusion matrices.

In [None]:
per_model_preds = {}  # {name: np.array of predictions on test set}

for name in MODEL_CATALOGUE:
    print(f"\n{'='*60}")
    print(f"  Test Evaluation: {name}")
    print(f"{'='*60}")

    tokenizer = tokenisers[name]
    test_tok = tokenize_dataset(test_dataset, tokenizer)
    trainer = trainers[name]

    # Predict on test set
    predictions = trainer.predict(test_tok)
    preds = np.argmax(predictions.predictions, axis=-1)
    labels = predictions.label_ids
    per_model_preds[name] = preds

    # Classification report
    print(f"\n{name} — Classification Report:")
    print(classification_report(labels, preds, target_names=LABEL_NAMES, digits=4))

    # Confusion matrix
    fig, axes = plt.subplots(1, 2, figsize=(12, 4))

    cm = confusion_matrix(labels, preds)
    ConfusionMatrixDisplay(cm, display_labels=LABEL_NAMES).plot(
        ax=axes[0], cmap="Blues", colorbar=False
    )
    axes[0].set_title(f"{name} — Counts")

    cm_norm = confusion_matrix(labels, preds, normalize="true")
    ConfusionMatrixDisplay(cm_norm, display_labels=LABEL_NAMES).plot(
        ax=axes[1], cmap="Blues", colorbar=False, values_format=".2%"
    )
    axes[1].set_title(f"{name} — Normalised")

    plt.tight_layout()
    plt.show()

## 8. Overall Ensemble — Majority Vote, Results & Confusion Matrix

Each of the 3 models votes; a sample is classified as **PCL** if **2 or more** models agree.

In [None]:
# Majority vote: PCL (1) if >= 2 out of 3 models predict PCL
votes = np.stack(list(per_model_preds.values()), axis=0)  # (3, n_test)
ensemble_preds = (votes.sum(axis=0) >= 2).astype(int)
true_labels = np.array(test_dataset["label"])

# ---------------------------------------------------------------------------
# Overall classification report
# ---------------------------------------------------------------------------
print("=" * 60)
print("  ENSEMBLE (Majority Vote) — Test Set Results")
print("=" * 60)
print(classification_report(true_labels, ensemble_preds, target_names=LABEL_NAMES, digits=4))

# Per-model vs ensemble summary table
rows = []
for name, preds in per_model_preds.items():
    p, r, f1, _ = precision_recall_fscore_support(true_labels, preds, average="binary", pos_label=1)
    acc = accuracy_score(true_labels, preds)
    rows.append({"Model": name, "Accuracy": acc, "Precision": p, "Recall": r, "F1": f1})

p, r, f1, _ = precision_recall_fscore_support(true_labels, ensemble_preds, average="binary", pos_label=1)
acc = accuracy_score(true_labels, ensemble_preds)
rows.append({"Model": "ENSEMBLE", "Accuracy": acc, "Precision": p, "Recall": r, "F1": f1})

summary_df = pd.DataFrame(rows).set_index("Model")
print("\nSummary comparison:")
display(summary_df.style.format("{:.4f}").highlight_max(axis=0, color="lightgreen"))

# ---------------------------------------------------------------------------
# Confusion matrices — ensemble
# ---------------------------------------------------------------------------
fig, axes = plt.subplots(1, 2, figsize=(12, 4))

cm = confusion_matrix(true_labels, ensemble_preds)
ConfusionMatrixDisplay(cm, display_labels=LABEL_NAMES).plot(
    ax=axes[0], cmap="Oranges", colorbar=False
)
axes[0].set_title("Ensemble — Counts")

cm_norm = confusion_matrix(true_labels, ensemble_preds, normalize="true")
ConfusionMatrixDisplay(cm_norm, display_labels=LABEL_NAMES).plot(
    ax=axes[1], cmap="Oranges", colorbar=False, values_format=".2%"
)
axes[1].set_title("Ensemble — Normalised")

plt.tight_layout()
plt.show()

# ---------------------------------------------------------------------------
# Voting agreement heatmap
# ---------------------------------------------------------------------------
print("\nPer-sample voting agreement:")
agreement = votes.sum(axis=0)
for v in [0, 1, 2, 3]:
    count = (agreement == v).sum()
    print(f"  {v}/3 models predict PCL: {count} samples ({count/len(agreement)*100:.1f}%)")