# Ensemble Voting Model — Don't Patronize Me!

**Binary PCL classification** using RoBERTa, DistilBERT, and DeBERTa with majority-vote ensemble.

## 1. Imports

In [1]:
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
)
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
from datasets import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score,
    precision_recall_fscore_support,
    confusion_matrix,
    classification_report,
    ConfusionMatrixDisplay,
)
import matplotlib.pyplot as plt
import optuna
from optuna.pruners import MedianPruner
import gc

print(f"PyTorch version : {torch.__version__}")
print(f"CUDA available  : {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU device      : {torch.cuda.get_device_name(0)}")
    print(f"GPU memory      : {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")

  from .autonotebook import tqdm as notebook_tqdm


PyTorch version : 2.10.0+cu128
CUDA available  : True
GPU device      : Tesla T4
GPU memory      : 15.6 GB


## 2. Device Setup

In [None]:
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
NUM_LABELS = 2
LABEL_NAMES = ["Non-PCL", "PCL"]

# Class weights for the ~9.5:1 imbalance (Non-PCL : PCL).
# Placing on DEVICE once avoids repeated .to() calls inside compute_loss.
CLASS_WEIGHTS = torch.tensor([1.0, 9.0], dtype=torch.float32).to(DEVICE)

# Mixed-precision strategy:
#   bf16 preferred (Ampere+ GPUs) — works with all models including DeBERTa v3.
#   fp16 as fallback for older GPUs — but NOT safe for DeBERTa v3 (FP16 gradient error).
_BF16_OK = torch.cuda.is_available() and torch.cuda.is_bf16_supported()

print(f"Using device    : {DEVICE}")
print(f"bf16 supported  : {_BF16_OK}")
print(f"Class weights   : {CLASS_WEIGHTS}  (device: {CLASS_WEIGHTS.device})")

Using device    : cuda
Class weights   : tensor([1., 9.], device='cuda:0')  (device: cuda:0)


## 3. Load & Preprocess Dataset

Binary labels as per the paper: labels 0-1 → **Non-PCL (0)**, labels 2-4 → **PCL (1)**.

We split 80/10/10 into train / val / test. The test set is held out entirely until final evaluation.

In [3]:
def load_data():
    """Load Don't Patronize Me PCL dataset and binarise labels."""
    pcl_columns = ["par_id", "art_id", "keyword", "country_code", "text", "label"]
    df = pd.read_csv(
        "dontpatronizeme_pcl.tsv",
        sep="\t",
        skiprows=4,
        names=pcl_columns,
        on_bad_lines="skip",
        engine="python",
    )

    # Drop rows with missing text or labels
    df = df.dropna(subset=["text", "label"])
    df["label"] = df["label"].astype(int)

    # Binary: 0-1 → Non-PCL (0),  2-4 → PCL (1)
    df["binary_label"] = (df["label"] >= 2).astype(int)

    print(f"Total samples  : {len(df)}")
    print(f"Label distribution:\n{df['binary_label'].value_counts().rename({0: 'Non-PCL', 1: 'PCL'})}")
    print(f"Imbalance ratio: {(df['binary_label'] == 0).sum() / (df['binary_label'] == 1).sum():.2f}:1")

    return df


df = load_data()

# 80 / 10 / 10 stratified split
train_df, temp_df = train_test_split(
    df, test_size=0.2, stratify=df["binary_label"], random_state=42
)
val_df, test_df = train_test_split(
    temp_df, test_size=0.5, stratify=temp_df["binary_label"], random_state=42
)

# Convert to HuggingFace Datasets
train_dataset = Dataset.from_dict({"text": train_df["text"].tolist(), "label": train_df["binary_label"].tolist()})
val_dataset   = Dataset.from_dict({"text": val_df["text"].tolist(),   "label": val_df["binary_label"].tolist()})
test_dataset  = Dataset.from_dict({"text": test_df["text"].tolist(),  "label": test_df["binary_label"].tolist()})

print(f"\nSplit sizes — train: {len(train_dataset)}, val: {len(val_dataset)}, test: {len(test_dataset)}")

Total samples  : 10468
Label distribution:
binary_label
Non-PCL    9475
PCL         993
Name: count, dtype: int64
Imbalance ratio: 9.54:1

Split sizes — train: 8374, val: 1047, test: 1047


## 4. Model Definitions & Tokenisation

We define:
- **Model catalogue** — three transformer architectures
- **`WeightedTrainer`** — custom Trainer that uses class-weighted CrossEntropyLoss. The class weights tensor is moved to device **once** (at init), not on every forward pass.
- **`compute_metrics`** — accuracy, precision, recall, F1
- Per-model tokenisation functions

In [None]:
MODEL_CATALOGUE = {
    "RoBERTa":    "FacebookAI/roberta-base",
    "DistilBERT": "distilbert-base-uncased",
    "DeBERTa":    "microsoft/deberta-v3-base",
}

MAX_LENGTH = 128  # EDA: median 42 word tokens, 95th pct ~105; subword inflation ~1.3x → 128 is safe


# ---------------------------------------------------------------------------
# Mixed-precision helper — decides fp16 vs bf16 per model
# ---------------------------------------------------------------------------
def get_mixed_precision_flags(model_name: str):
    """Return (fp16, bf16) flags for a given model.

    • bf16 is preferred for ALL models when the GPU supports it (Ampere+).
    • fp16 is used as fallback — except for DeBERTa v3 which produces
      gradient-unscale errors under fp16.
    • DeBERTa v3 falls back to fp32 if bf16 is unavailable.
    """
    if _BF16_OK:
        return False, True          # bf16 for everything
    if model_name == "DeBERTa":
        return False, False         # fp32 fallback (fp16 is unsafe)
    if torch.cuda.is_available():
        return True, False          # fp16 for other models
    return False, False             # CPU


# ---------------------------------------------------------------------------
# Weighted Trainer — class weights live on the same device as the model
# ---------------------------------------------------------------------------
class WeightedTrainer(Trainer):
    """Trainer that applies class weights to CrossEntropyLoss.

    Supports per-trial class_weight_pos override via self.args (set by
    Optuna hp search). Falls back to the weights passed at init.
    """

    def __init__(self, class_weights: torch.Tensor, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._class_weights = class_weights

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits

        # ---- NaN guard: clamp logits to prevent NaN propagation ----------
        if torch.isnan(logits).any() or torch.isinf(logits).any():
            logits = torch.nan_to_num(logits, nan=0.0, posinf=1e4, neginf=-1e4)

        # Use per-trial class_weight_pos if set by hp search, else default
        pos_w = getattr(self.args, "class_weight_pos", None)
        if pos_w is not None:
            weights = torch.tensor([1.0, pos_w], dtype=logits.dtype, device=logits.device)
        else:
            weights = self._class_weights.to(dtype=logits.dtype, device=logits.device)
        loss_fn = nn.CrossEntropyLoss(weight=weights)
        loss = loss_fn(logits, labels)
        return (loss, outputs) if return_outputs else loss


# ---------------------------------------------------------------------------
# Metrics
# ---------------------------------------------------------------------------
def compute_metrics(eval_pred):
    """Compute accuracy, precision, recall, F1 for the positive class (PCL)."""
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average="binary", pos_label=1, zero_division=0
    )
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "precision": precision, "recall": recall, "f1": f1}


# ---------------------------------------------------------------------------
# Tokenisers — NO padding here; DataCollatorWithPadding pads per-batch
# (median text ~42 tokens → dynamic padding is ~2x faster than pad-to-128)
# ---------------------------------------------------------------------------
tokenisers = {}
for name, path in MODEL_CATALOGUE.items():
    tokenisers[name] = AutoTokenizer.from_pretrained(path)
    print(f"Loaded tokeniser for {name}")


def tokenize_dataset(dataset, tokenizer):
    """Tokenise a HuggingFace Dataset with the given tokenizer (no padding)."""
    def _tok(examples):
        return tokenizer(
            examples["text"], truncation=True, max_length=MAX_LENGTH
        )
    return dataset.map(_tok, batched=True)

Loaded tokeniser for RoBERTa




Loaded tokeniser for DistilBERT
Loaded tokeniser for DeBERTa


## 5. Bayesian Hyperparameter Optimisation (Optuna)

For each model we run `trainer.hyperparameter_search` with an Optuna backend. This performs **Bayesian optimisation** (Tree-structured Parzen Estimator by default) over learning rate, number of epochs, batch size, weight decay, and **class weight for PCL** (searched 8–10 around the ~9.5:1 natural ratio).

Key design decisions:
- **`model_init`** function (not a pre-built model) so Trainer can reinitialise fresh weights each trial
- **`class_weight_pos`** in the search space — the most impactful knob for imbalanced classification
- **Dynamic padding** (`DataCollatorWithPadding`) — pads per-batch instead of to `MAX_LENGTH`, ~2× faster
- **DeBERTa v3** uses `bf16` (or fp32 fallback) instead of `fp16` which causes gradient unscale errors
- `direction="maximize"` because we optimise F1

In [None]:
def optuna_hp_space(trial):
    """Define the Bayesian search space (includes class weight for PCL)."""
    return {
        "learning_rate":             trial.suggest_float("learning_rate", 1e-5, 3e-5, log=True),
        "num_train_epochs":          trial.suggest_int("num_train_epochs", 2, 5),
        "per_device_train_batch_size": trial.suggest_categorical("per_device_train_batch_size", [16, 32]),
        "weight_decay":              trial.suggest_float("weight_decay", 0.0, 0.2),
        "class_weight_pos":          trial.suggest_float("class_weight_pos", 8.0, 10.0),
    }


N_TRIALS = 10  # Increase for better search (20-30+), reduce if GPU-constrained

best_hparams = {}   # {model_name: BestRun}
trained_models = {} # {model_name: fine-tuned model}

for name, model_path in MODEL_CATALOGUE.items():

    print(f"\n{'='*60}")
    print(f"  Hyperparameter search for {name}")
    print(f"{'='*60}")

    tokenizer = tokenisers[name]
    train_tok = tokenize_dataset(train_dataset, tokenizer)
    val_tok   = tokenize_dataset(val_dataset, tokenizer)

    # model_init: required so Trainer can create a fresh model each trial
    def make_model_init(path):
        def model_init():
            return AutoModelForSequenceClassification.from_pretrained(
                path, num_labels=NUM_LABELS
            )
        return model_init

    # Mixed-precision: bf16 when available (all models); fp16 fallback (not DeBERTa)
    use_fp16, use_bf16 = get_mixed_precision_flags(name)
    print(f"  Mixed precision — fp16: {use_fp16}, bf16: {use_bf16}")

    training_args = TrainingArguments(
        output_dir=f"./results/{name}",
        eval_strategy="epoch",
        save_strategy="epoch",
        save_total_limit=1,
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        logging_steps=50,
        fp16=use_fp16,
        bf16=use_bf16,
        warmup_ratio=0.1,
        max_grad_norm=1.0,
        report_to="none",
    )
    # Seed attribute so Optuna's setattr succeeds for class_weight_pos
    training_args.class_weight_pos = 9.0

    trainer = WeightedTrainer(
        class_weights=CLASS_WEIGHTS,
        model_init=make_model_init(model_path),
        args=training_args,
        train_dataset=train_tok.shuffle(seed=42).select(range(2000)),
        eval_dataset=val_tok.shuffle(seed=42).select(range(250)),
        compute_metrics=compute_metrics,
        processing_class=tokenizer,
        data_collator=DataCollatorWithPadding(tokenizer),
    )

    best_run = trainer.hyperparameter_search(
        direction="maximize",
        backend="optuna",
        hp_space=optuna_hp_space,
        n_trials=N_TRIALS,
        pruner=MedianPruner(),
        compute_objective=lambda metrics: metrics["eval_f1"],
    )

    best_hparams[name] = best_run
    print(f"\n✓ {name} best trial — F1: {best_run.objective:.4f}")
    print(f"  Hyperparameters: {best_run.hyperparameters}")

    # Free GPU memory before next model
    del trainer
    gc.collect()
    torch.cuda.empty_cache()


  Hyperparameter search for RoBERTa


Map: 100%|██████████| 8374/8374 [00:00<00:00, 16986.06 examples/s]
Map: 100%|██████████| 1047/1047 [00:00<00:00, 17211.40 examples/s]
Loading weights: 100%|██████████| 197/197 [00:00<00:00, 952.52it/s, Materializing param=roberta.encoder.layer.11.output.dense.weight]              
[1mRobertaForSequenceClassification LOAD REPORT[0m from: FacebookAI/roberta-base
Key                             | Status     | 
--------------------------------+------------+-
roberta.embeddings.position_ids | UNEXPECTED | 
lm_head.bias                    | UNEXPECTED | 
lm_head.dense.weight            | UNEXPECTED | 
lm_head.layer_norm.weight       | UNEXPECTED | 
lm_head.layer_norm.bias         | UNEXPECTED | 
lm_head.dense.bias              | UNEXPECTED | 
classifier.out_proj.bias        | MISSING    | 
classifier.dense.weight         | MISSING    | 
classifier.dense.bias           | MISSING    | 
classifier.out_proj.weight      | MISSING    | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading f

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.691849,0.463888,0.916,0.5,0.380952,0.432432
2,0.548217,0.510994,0.908,0.454545,0.47619,0.465116
3,0.372262,0.593476,0.904,0.44,0.52381,0.478261


Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  1.79it/s]
Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  1.82it/s]
Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  1.80it/s]
There were missing keys in the checkpoint model loaded: ['roberta.embeddings.LayerNorm.weight', 'roberta.embeddings.LayerNorm.bias', 'roberta.encoder.layer.0.attention.output.LayerNorm.weight', 'roberta.encoder.layer.0.attention.output.LayerNorm.bias', 'roberta.encoder.layer.0.output.LayerNorm.weight', 'roberta.encoder.layer.0.output.LayerNorm.bias', 'roberta.encoder.layer.1.attention.output.LayerNorm.weight', 'roberta.encoder.layer.1.attention.output.LayerNorm.bias', 'roberta.encoder.layer.1.output.LayerNorm.weight', 'roberta.encoder.layer.1.output.LayerNorm.bias', 'roberta.encoder.layer.2.attention.output.LayerNorm.weight', 'roberta.encoder.layer.2.attention.output.LayerNorm.bias', 'roberta.encoder.layer.2.output.LayerNorm.weight', 'roberta.encoder.layer.2.output.LayerNorm.bias', 'ro

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.667313,0.478602,0.912,0.48,0.571429,0.521739
2,0.443149,0.410599,0.82,0.285714,0.761905,0.415584
3,0.468084,0.410292,0.852,0.318182,0.666667,0.430769


Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  1.80it/s]
Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  1.80it/s]
Writing model shards: 100%|██████████| 1/1 [00:01<00:00,  1.31s/it]
There were missing keys in the checkpoint model loaded: ['roberta.embeddings.LayerNorm.weight', 'roberta.embeddings.LayerNorm.bias', 'roberta.encoder.layer.0.attention.output.LayerNorm.weight', 'roberta.encoder.layer.0.attention.output.LayerNorm.bias', 'roberta.encoder.layer.0.output.LayerNorm.weight', 'roberta.encoder.layer.0.output.LayerNorm.bias', 'roberta.encoder.layer.1.attention.output.LayerNorm.weight', 'roberta.encoder.layer.1.attention.output.LayerNorm.bias', 'roberta.encoder.layer.1.output.LayerNorm.weight', 'roberta.encoder.layer.1.output.LayerNorm.bias', 'roberta.encoder.layer.2.attention.output.LayerNorm.weight', 'roberta.encoder.layer.2.attention.output.LayerNorm.bias', 'roberta.encoder.layer.2.output.LayerNorm.weight', 'roberta.encoder.layer.2.output.LayerNorm.bias', 'ro

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.688828,0.634278,0.724,0.2,0.761905,0.316832
2,0.558421,0.532407,0.736,0.228916,0.904762,0.365385
3,0.544274,0.4157,0.828,0.303571,0.809524,0.441558


Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  1.80it/s]
Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  1.81it/s]
Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  1.81it/s]
There were missing keys in the checkpoint model loaded: ['roberta.embeddings.LayerNorm.weight', 'roberta.embeddings.LayerNorm.bias', 'roberta.encoder.layer.0.attention.output.LayerNorm.weight', 'roberta.encoder.layer.0.attention.output.LayerNorm.bias', 'roberta.encoder.layer.0.output.LayerNorm.weight', 'roberta.encoder.layer.0.output.LayerNorm.bias', 'roberta.encoder.layer.1.attention.output.LayerNorm.weight', 'roberta.encoder.layer.1.attention.output.LayerNorm.bias', 'roberta.encoder.layer.1.output.LayerNorm.weight', 'roberta.encoder.layer.1.output.LayerNorm.bias', 'roberta.encoder.layer.2.attention.output.LayerNorm.weight', 'roberta.encoder.layer.2.attention.output.LayerNorm.bias', 'roberta.encoder.layer.2.output.LayerNorm.weight', 'roberta.encoder.layer.2.output.LayerNorm.bias', 'ro

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.676037,0.443213,0.924,0.545455,0.571429,0.55814
2,0.458827,0.396371,0.852,0.326087,0.714286,0.447761
3,0.514805,0.420564,0.86,0.340909,0.714286,0.461538
4,0.266306,0.440828,0.916,0.5,0.571429,0.533333


Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  1.79it/s]
Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  1.79it/s]
Writing model shards: 100%|██████████| 1/1 [00:01<00:00,  1.86s/it]
Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  1.81it/s]
There were missing keys in the checkpoint model loaded: ['roberta.embeddings.LayerNorm.weight', 'roberta.embeddings.LayerNorm.bias', 'roberta.encoder.layer.0.attention.output.LayerNorm.weight', 'roberta.encoder.layer.0.attention.output.LayerNorm.bias', 'roberta.encoder.layer.0.output.LayerNorm.weight', 'roberta.encoder.layer.0.output.LayerNorm.bias', 'roberta.encoder.layer.1.attention.output.LayerNorm.weight', 'roberta.encoder.layer.1.attention.output.LayerNorm.bias', 'roberta.encoder.layer.1.output.LayerNorm.weight', 'roberta.encoder.layer.1.output.LayerNorm.bias', 'roberta.encoder.layer.2.attention.output.LayerNorm.weight', 'roberta.encoder.layer.2.attention.output.LayerNorm.bias', 'roberta.encoder.layer.2.output.Lay

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.686288,0.528872,0.884,0.394737,0.714286,0.508475
2,0.505392,0.457101,0.788,0.264706,0.857143,0.404494
3,0.53184,0.411345,0.856,0.333333,0.714286,0.454545
4,0.296784,0.40169,0.884,0.4,0.761905,0.52459
5,0.235665,0.432122,0.888,0.393939,0.619048,0.481481


Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  1.80it/s]
Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  1.80it/s]
Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  1.81it/s]
Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  1.81it/s]
Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  1.79it/s]
There were missing keys in the checkpoint model loaded: ['roberta.embeddings.LayerNorm.weight', 'roberta.embeddings.LayerNorm.bias', 'roberta.encoder.layer.0.attention.output.LayerNorm.weight', 'roberta.encoder.layer.0.attention.output.LayerNorm.bias', 'roberta.encoder.layer.0.output.LayerNorm.weight', 'roberta.encoder.layer.0.output.LayerNorm.bias', 'roberta.encoder.layer.1.attention.output.LayerNorm.weight', 'roberta.encoder.layer.1.attention.output.LayerNorm.bias', 'roberta.encoder.layer.1.output.LayerNorm.weight', 'roberta.encoder.layer.1.output.LayerNorm.bias', 'roberta.encoder.layer.2.attention.output.LayerNorm.weight', 'roberta.encoder.layer.2.a

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.646549,1.180705,0.916,0.0,0.0,0.0


[32m[I 2026-02-10 00:02:19,766][0m Trial 5 pruned. [0m
Loading weights: 100%|██████████| 197/197 [00:00<00:00, 1023.05it/s, Materializing param=roberta.encoder.layer.11.output.dense.weight]             
[1mRobertaForSequenceClassification LOAD REPORT[0m from: FacebookAI/roberta-base
Key                             | Status     | 
--------------------------------+------------+-
roberta.embeddings.position_ids | UNEXPECTED | 
lm_head.bias                    | UNEXPECTED | 
lm_head.dense.weight            | UNEXPECTED | 
lm_head.layer_norm.weight       | UNEXPECTED | 
lm_head.layer_norm.bias         | UNEXPECTED | 
lm_head.dense.bias              | UNEXPECTED | 
classifier.out_proj.bias        | MISSING    | 
classifier.dense.weight         | MISSING    | 
classifier.dense.bias           | MISSING    | 
classifier.out_proj.weight      | MISSING    | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISS

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.64571,1.137349,0.916,0.0,0.0,0.0


[32m[I 2026-02-10 00:02:34,522][0m Trial 6 pruned. [0m
Loading weights: 100%|██████████| 197/197 [00:00<00:00, 964.60it/s, Materializing param=roberta.encoder.layer.11.output.dense.weight]              
[1mRobertaForSequenceClassification LOAD REPORT[0m from: FacebookAI/roberta-base
Key                             | Status     | 
--------------------------------+------------+-
roberta.embeddings.position_ids | UNEXPECTED | 
lm_head.bias                    | UNEXPECTED | 
lm_head.dense.weight            | UNEXPECTED | 
lm_head.layer_norm.weight       | UNEXPECTED | 
lm_head.layer_norm.bias         | UNEXPECTED | 
lm_head.dense.bias              | UNEXPECTED | 
classifier.out_proj.bias        | MISSING    | 
classifier.dense.weight         | MISSING    | 
classifier.dense.bias           | MISSING    | 
classifier.out_proj.weight      | MISSING    | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISS

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.680268,1.004167,0.916,0.0,0.0,0.0


[32m[I 2026-02-10 00:02:49,321][0m Trial 7 pruned. [0m
Loading weights: 100%|██████████| 197/197 [00:00<00:00, 983.10it/s, Materializing param=roberta.encoder.layer.11.output.dense.weight]              
[1mRobertaForSequenceClassification LOAD REPORT[0m from: FacebookAI/roberta-base
Key                             | Status     | 
--------------------------------+------------+-
roberta.embeddings.position_ids | UNEXPECTED | 
lm_head.bias                    | UNEXPECTED | 
lm_head.dense.weight            | UNEXPECTED | 
lm_head.layer_norm.weight       | UNEXPECTED | 
lm_head.layer_norm.bias         | UNEXPECTED | 
lm_head.dense.bias              | UNEXPECTED | 
classifier.out_proj.bias        | MISSING    | 
classifier.dense.weight         | MISSING    | 
classifier.dense.bias           | MISSING    | 
classifier.out_proj.weight      | MISSING    | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISS

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.670725,0.973001,0.916,0.0,0.0,0.0


[32m[I 2026-02-10 00:03:04,084][0m Trial 8 pruned. [0m
Loading weights: 100%|██████████| 197/197 [00:00<00:00, 986.93it/s, Materializing param=roberta.encoder.layer.11.output.dense.weight]              
[1mRobertaForSequenceClassification LOAD REPORT[0m from: FacebookAI/roberta-base
Key                             | Status     | 
--------------------------------+------------+-
roberta.embeddings.position_ids | UNEXPECTED | 
lm_head.bias                    | UNEXPECTED | 
lm_head.dense.weight            | UNEXPECTED | 
lm_head.layer_norm.weight       | UNEXPECTED | 
lm_head.layer_norm.bias         | UNEXPECTED | 
lm_head.dense.bias              | UNEXPECTED | 
classifier.out_proj.bias        | MISSING    | 
classifier.dense.weight         | MISSING    | 
classifier.dense.bias           | MISSING    | 
classifier.out_proj.weight      | MISSING    | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISS

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.669168,0.958809,0.912,0.333333,0.047619,0.083333


[32m[I 2026-02-10 00:03:18,869][0m Trial 9 pruned. [0m



✓ RoBERTa best trial — F1: 0.5333
  Hyperparameters: {'learning_rate': 1.6241008241875646e-05, 'num_train_epochs': 4, 'per_device_train_batch_size': 32, 'weight_decay': 0.1326211852579204, 'class_weight_pos': 8.267693207145015}

  Hyperparameter search for DistilBERT


Map: 100%|██████████| 8374/8374 [00:00<00:00, 14319.54 examples/s]
Map: 100%|██████████| 1047/1047 [00:00<00:00, 14048.60 examples/s]
Loading weights: 100%|██████████| 100/100 [00:00<00:00, 1028.65it/s, Materializing param=distilbert.transformer.layer.5.sa_layer_norm.weight]   
[1mDistilBertForSequenceClassification LOAD REPORT[0m from: distilbert-base-uncased
Key                     | Status     | 
------------------------+------------+-
vocab_projector.bias    | UNEXPECTED | 
vocab_transform.bias    | UNEXPECTED | 
vocab_layer_norm.weight | UNEXPECTED | 
vocab_layer_norm.bias   | UNEXPECTED | 
vocab_transform.weight  | UNEXPECTED | 
classifier.bias         | MISSING    | 
pre_classifier.bias     | MISSING    | 
classifier.weight       | MISSING    | 
pre_classifier.weight   | MISSING    | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING[3m	:those params were newly initialized because missing

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.656416,0.484462,0.896,0.407407,0.52381,0.458333
2,0.474447,0.444986,0.876,0.361111,0.619048,0.45614


Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  3.33it/s]
Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  3.34it/s]
There were missing keys in the checkpoint model loaded: ['distilbert.embeddings.LayerNorm.weight', 'distilbert.embeddings.LayerNorm.bias'].
There were unexpected keys in the checkpoint model loaded: ['distilbert.embeddings.LayerNorm.beta', 'distilbert.embeddings.LayerNorm.gamma'].
[32m[I 2026-02-10 00:03:35,440][0m Trial 0 finished with value: 0.45614035087719296 and parameters: {'learning_rate': 2.218956096299018e-05, 'num_train_epochs': 2, 'per_device_train_batch_size': 32, 'weight_decay': 0.17196386499823244, 'class_weight_pos': 8.129042021862196}. Best is trial 0 with value: 0.45614035087719296.[0m
Loading weights: 100%|██████████| 100/100 [00:00<00:00, 1031.66it/s, Materializing param=distilbert.transformer.layer.5.sa_layer_norm.weight]   
[1mDistilBertForSequenceClassification LOAD REPORT[0m from: distilbert-base-uncased
Key                 

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.647488,0.462106,0.888,0.37931,0.52381,0.44
2,0.423871,0.48409,0.748,0.223684,0.809524,0.350515
3,0.50251,0.440791,0.828,0.28,0.666667,0.394366


Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  3.32it/s]
Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  3.32it/s]
Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  3.33it/s]
There were missing keys in the checkpoint model loaded: ['distilbert.embeddings.LayerNorm.weight', 'distilbert.embeddings.LayerNorm.bias'].
There were unexpected keys in the checkpoint model loaded: ['distilbert.embeddings.LayerNorm.beta', 'distilbert.embeddings.LayerNorm.gamma'].
[32m[I 2026-02-10 00:04:00,921][0m Trial 1 finished with value: 0.39436619718309857 and parameters: {'learning_rate': 2.60972541484489e-05, 'num_train_epochs': 3, 'per_device_train_batch_size': 32, 'weight_decay': 0.15913618652963546, 'class_weight_pos': 8.115037439212538}. Best is trial 0 with value: 0.45614035087719296.[0m
Loading weights: 100%|██████████| 100/100 [00:00<00:00, 1001.95it/s, Materializing param=distilbert.transformer.layer.5.sa_layer_norm.weight]   
[1mDistilBertForSequenceClassification

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.649512,0.467586,0.896,0.4,0.47619,0.434783
2,0.418949,0.476244,0.756,0.22973,0.809524,0.357895
3,0.523411,0.477448,0.892,0.4,0.571429,0.470588
4,0.222188,0.49802,0.892,0.40625,0.619048,0.490566
5,0.142654,0.558966,0.928,0.588235,0.47619,0.526316


Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  3.33it/s]
Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  3.33it/s]
Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  3.33it/s]
Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  3.32it/s]
Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  3.35it/s]
There were missing keys in the checkpoint model loaded: ['distilbert.embeddings.LayerNorm.weight', 'distilbert.embeddings.LayerNorm.bias'].
There were unexpected keys in the checkpoint model loaded: ['distilbert.embeddings.LayerNorm.beta', 'distilbert.embeddings.LayerNorm.gamma'].
[32m[I 2026-02-10 00:04:47,037][0m Trial 2 finished with value: 0.5263157894736842 and parameters: {'learning_rate': 2.477059450355478e-05, 'num_train_epochs': 5, 'per_device_train_batch_size': 32, 'weight_decay': 0.038945602660169026, 'class_weight_pos': 8.959285634831025}. Best is trial 2 with value: 0.5263157894736842.[0m
Loading weights: 100%|██████████| 100/100 [00:00

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.660145,0.480494,0.892,0.392857,0.52381,0.44898
2,0.457664,0.516632,0.712,0.206897,0.857143,0.333333
3,0.546732,0.474749,0.86,0.315789,0.571429,0.40678
4,0.299409,0.484875,0.9,0.428571,0.571429,0.489796
5,0.210083,0.500634,0.904,0.44,0.52381,0.478261


Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  3.33it/s]
Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  3.34it/s]
Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  3.34it/s]
Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  3.33it/s]
Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  3.32it/s]
There were missing keys in the checkpoint model loaded: ['distilbert.embeddings.LayerNorm.weight', 'distilbert.embeddings.LayerNorm.bias'].
There were unexpected keys in the checkpoint model loaded: ['distilbert.embeddings.LayerNorm.beta', 'distilbert.embeddings.LayerNorm.gamma'].
[32m[I 2026-02-10 00:05:25,669][0m Trial 3 finished with value: 0.4782608695652174 and parameters: {'learning_rate': 1.9721475115869326e-05, 'num_train_epochs': 5, 'per_device_train_batch_size': 32, 'weight_decay': 0.008083130841472807, 'class_weight_pos': 8.858032769330855}. Best is trial 2 with value: 0.5263157894736842.[0m
Loading weights: 100%|██████████| 100/100 [00:0

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.677431,0.576761,0.892,0.4,0.571429,0.470588
2,0.571963,0.500506,0.776,0.230769,0.714286,0.348837
3,0.520748,0.446967,0.848,0.302326,0.619048,0.40625
4,0.39615,0.442505,0.844,0.295455,0.619048,0.4


Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  3.31it/s]
Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  3.34it/s]
Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  3.33it/s]
Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  3.34it/s]
There were missing keys in the checkpoint model loaded: ['distilbert.embeddings.LayerNorm.weight', 'distilbert.embeddings.LayerNorm.bias'].
There were unexpected keys in the checkpoint model loaded: ['distilbert.embeddings.LayerNorm.beta', 'distilbert.embeddings.LayerNorm.gamma'].
[32m[I 2026-02-10 00:06:01,727][0m Trial 4 finished with value: 0.4 and parameters: {'learning_rate': 1.1503214170176946e-05, 'num_train_epochs': 4, 'per_device_train_batch_size': 32, 'weight_decay': 0.10198138294624436, 'class_weight_pos': 8.624742894472401}. Best is trial 2 with value: 0.5263157894736842.[0m
Loading weights: 100%|██████████| 100/100 [00:00<00:00, 996.23it/s, Materializing param=distilbert.transformer.layer.5.sa_layer_nor

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.645726,0.515256,0.916,0.5,0.285714,0.363636
2,0.533796,0.465899,0.892,0.4,0.571429,0.470588


Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  3.33it/s]
Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  3.30it/s]
There were missing keys in the checkpoint model loaded: ['distilbert.embeddings.LayerNorm.weight', 'distilbert.embeddings.LayerNorm.bias'].
There were unexpected keys in the checkpoint model loaded: ['distilbert.embeddings.LayerNorm.beta', 'distilbert.embeddings.LayerNorm.gamma'].
[32m[I 2026-02-10 00:06:19,476][0m Trial 5 finished with value: 0.47058823529411764 and parameters: {'learning_rate': 1.6451926444237365e-05, 'num_train_epochs': 2, 'per_device_train_batch_size': 16, 'weight_decay': 0.14649756126159788, 'class_weight_pos': 9.124792706408154}. Best is trial 2 with value: 0.5263157894736842.[0m
Loading weights: 100%|██████████| 100/100 [00:00<00:00, 1011.75it/s, Materializing param=distilbert.transformer.layer.5.sa_layer_norm.weight]   
[1mDistilBertForSequenceClassification LOAD REPORT[0m from: distilbert-base-uncased
Key                 

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.664346,0.501648,0.88,0.363636,0.571429,0.444444


[32m[I 2026-02-10 00:06:26,140][0m Trial 6 pruned. [0m
Loading weights: 100%|██████████| 100/100 [00:00<00:00, 1018.35it/s, Materializing param=distilbert.transformer.layer.5.sa_layer_norm.weight]   
[1mDistilBertForSequenceClassification LOAD REPORT[0m from: distilbert-base-uncased
Key                     | Status     | 
------------------------+------------+-
vocab_projector.bias    | UNEXPECTED | 
vocab_transform.bias    | UNEXPECTED | 
vocab_layer_norm.weight | UNEXPECTED | 
vocab_layer_norm.bias   | UNEXPECTED | 
vocab_transform.weight  | UNEXPECTED | 
classifier.bias         | MISSING    | 
pre_classifier.bias     | MISSING    | 
classifier.weight       | MISSING    | 
pre_classifier.weight   | MISSING    | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING[3m	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.[0m


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.613746,0.730815,0.912,0.4,0.095238,0.153846


[32m[I 2026-02-10 00:06:34,067][0m Trial 7 pruned. [0m
Loading weights: 100%|██████████| 100/100 [00:00<00:00, 996.09it/s, Materializing param=distilbert.transformer.layer.5.sa_layer_norm.weight]    
[1mDistilBertForSequenceClassification LOAD REPORT[0m from: distilbert-base-uncased
Key                     | Status     | 
------------------------+------------+-
vocab_projector.bias    | UNEXPECTED | 
vocab_transform.bias    | UNEXPECTED | 
vocab_layer_norm.weight | UNEXPECTED | 
vocab_layer_norm.bias   | UNEXPECTED | 
vocab_transform.weight  | UNEXPECTED | 
classifier.bias         | MISSING    | 
pre_classifier.bias     | MISSING    | 
classifier.weight       | MISSING    | 
pre_classifier.weight   | MISSING    | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING[3m	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.[0m


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.656475,0.480754,0.88,0.363636,0.571429,0.444444


[32m[I 2026-02-10 00:06:40,671][0m Trial 8 pruned. [0m
Loading weights: 100%|██████████| 100/100 [00:00<00:00, 1026.16it/s, Materializing param=distilbert.transformer.layer.5.sa_layer_norm.weight]   
[1mDistilBertForSequenceClassification LOAD REPORT[0m from: distilbert-base-uncased
Key                     | Status     | 
------------------------+------------+-
vocab_projector.bias    | UNEXPECTED | 
vocab_transform.bias    | UNEXPECTED | 
vocab_layer_norm.weight | UNEXPECTED | 
vocab_layer_norm.bias   | UNEXPECTED | 
vocab_transform.weight  | UNEXPECTED | 
classifier.bias         | MISSING    | 
pre_classifier.bias     | MISSING    | 
classifier.weight       | MISSING    | 
pre_classifier.weight   | MISSING    | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING[3m	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.[0m


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.660163,0.485927,0.892,0.384615,0.47619,0.425532


[32m[I 2026-02-10 00:06:47,327][0m Trial 9 pruned. [0m



✓ DistilBERT best trial — F1: 0.5263
  Hyperparameters: {'learning_rate': 2.477059450355478e-05, 'num_train_epochs': 5, 'per_device_train_batch_size': 32, 'weight_decay': 0.038945602660169026, 'class_weight_pos': 8.959285634831025}

  Hyperparameter search for DeBERTa


Map: 100%|██████████| 8374/8374 [00:00<00:00, 14501.33 examples/s]
Map: 100%|██████████| 1047/1047 [00:00<00:00, 14423.83 examples/s]
Loading weights: 100%|██████████| 198/198 [00:00<00:00, 1130.97it/s, Materializing param=deberta.encoder.rel_embeddings.weight]                     
[1mDebertaV2ForSequenceClassification LOAD REPORT[0m from: microsoft/deberta-v3-base
Key                                     | Status     | 
----------------------------------------+------------+-
lm_predictions.lm_head.LayerNorm.weight | UNEXPECTED | 
lm_predictions.lm_head.bias             | UNEXPECTED | 
lm_predictions.lm_head.LayerNorm.bias   | UNEXPECTED | 
lm_predictions.lm_head.dense.weight     | UNEXPECTED | 
mask_predictions.LayerNorm.bias         | UNEXPECTED | 
mask_predictions.classifier.weight      | UNEXPECTED | 
mask_predictions.classifier.bias        | UNEXPECTED | 
mask_predictions.dense.bias             | UNEXPECTED | 
lm_predictions.lm_head.dense.bias       | UNEXPECTED | 
mask_predictio

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.736426,0.625977,0.916,0.0,0.0,0.0
2,0.640884,,0.916,0.0,0.0,0.0
3,0.0,,0.916,0.0,0.0,0.0


Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  2.44it/s]
Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  2.44it/s]
Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  2.44it/s]
There were missing keys in the checkpoint model loaded: ['deberta.embeddings.LayerNorm.weight', 'deberta.embeddings.LayerNorm.bias', 'deberta.encoder.layer.0.attention.output.LayerNorm.weight', 'deberta.encoder.layer.0.attention.output.LayerNorm.bias', 'deberta.encoder.layer.0.output.LayerNorm.weight', 'deberta.encoder.layer.0.output.LayerNorm.bias', 'deberta.encoder.layer.1.attention.output.LayerNorm.weight', 'deberta.encoder.layer.1.attention.output.LayerNorm.bias', 'deberta.encoder.layer.1.output.LayerNorm.weight', 'deberta.encoder.layer.1.output.LayerNorm.bias', 'deberta.encoder.layer.2.attention.output.LayerNorm.weight', 'deberta.encoder.layer.2.attention.output.LayerNorm.bias', 'deberta.encoder.layer.2.output.LayerNorm.weight', 'deberta.encoder.layer.2.output.LayerNorm.bias', 'de

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.733588,,0.916,0.0,0.0,0.0
2,0.0,,0.916,0.0,0.0,0.0
3,0.0,,0.916,0.0,0.0,0.0
4,0.0,,0.916,0.0,0.0,0.0


Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  2.43it/s]
Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  2.43it/s]
Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  2.43it/s]
Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  2.44it/s]
There were missing keys in the checkpoint model loaded: ['deberta.embeddings.LayerNorm.weight', 'deberta.embeddings.LayerNorm.bias', 'deberta.encoder.layer.0.attention.output.LayerNorm.weight', 'deberta.encoder.layer.0.attention.output.LayerNorm.bias', 'deberta.encoder.layer.0.output.LayerNorm.weight', 'deberta.encoder.layer.0.output.LayerNorm.bias', 'deberta.encoder.layer.1.attention.output.LayerNorm.weight', 'deberta.encoder.layer.1.attention.output.LayerNorm.bias', 'deberta.encoder.layer.1.output.LayerNorm.weight', 'deberta.encoder.layer.1.output.LayerNorm.bias', 'deberta.encoder.layer.2.attention.output.LayerNorm.weight', 'deberta.encoder.layer.2.attention.output.LayerNorm.bias', 'deberta.encoder.layer.2.output.Lay

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.725046,,0.916,0.0,0.0,0.0
2,0.0,,0.916,0.0,0.0,0.0


Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  2.45it/s]
Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  2.44it/s]
[33m[W 2026-02-10 00:09:38,715][0m Trial 2 failed with parameters: {'learning_rate': 1.6299451710021822e-05, 'num_train_epochs': 3, 'per_device_train_batch_size': 32, 'weight_decay': 0.0338200430445502, 'class_weight_pos': 8.163209343907633} because of the following error: KeyboardInterrupt().[0m
Traceback (most recent call last):
  File "/home/azureuser/nlp/.venv/lib/python3.12/site-packages/optuna/study/_optimize.py", line 206, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "/home/azureuser/nlp/.venv/lib/python3.12/site-packages/transformers/integrations/integration_utils.py", line 253, in _objective
    trainer.train(resume_from_checkpoint=checkpoint, trial=trial)
  File "/home/azureuser/nlp/.venv/lib/python3.12/site-packages/transformers/trainer.py", line 2170, in train
    return inner_training_loop(
  

KeyboardInterrupt: 

## 6. Train Each Model with Best Hyperparameters

Re-train each model from scratch using the best hyperparameters found above.

In [None]:
trainers = {}  # keep trainers around for prediction

for name, model_path in MODEL_CATALOGUE.items():

    print(f"\n{'='*60}")
    print(f"  Final training: {name}")
    print(f"{'='*60}")

    best = best_hparams[name]
    hp = best.hyperparameters

    tokenizer = tokenisers[name]
    train_tok = tokenize_dataset(train_dataset, tokenizer)
    val_tok   = tokenize_dataset(val_dataset, tokenizer)

    # Build fresh model with best HPs
    model = AutoModelForSequenceClassification.from_pretrained(
        model_path, num_labels=NUM_LABELS
    )

    # Verify the model is on the expected device (Trainer will move it, but let's show it)
    print(f"  Model device before Trainer: {next(model.parameters()).device}")

    # Mixed-precision: bf16 when available (all models); fp16 fallback (not DeBERTa)
    use_fp16, use_bf16 = get_mixed_precision_flags(name)
    print(f"  Mixed precision — fp16: {use_fp16}, bf16: {use_bf16}")

    training_args = TrainingArguments(
        output_dir=f"./results/{name}_final",
        num_train_epochs=hp.get("num_train_epochs", 3),
        per_device_train_batch_size=hp.get("per_device_train_batch_size", 16),
        per_device_eval_batch_size=32,
        learning_rate=hp.get("learning_rate", 2e-5),
        weight_decay=hp.get("weight_decay", 0.01),
        eval_strategy="epoch",
        save_strategy="epoch",
        save_total_limit=1,
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        logging_steps=50,
        fp16=use_fp16,
        bf16=use_bf16,
        warmup_ratio=0.1,
        max_grad_norm=1.0,
        dataloader_num_workers=2,
        report_to="none",
    )

    # Apply best class weight from hp search
    class_w = hp.get("class_weight_pos", 9.0)
    final_weights = torch.tensor([1.0, class_w], dtype=torch.float32).to(DEVICE)

    trainer = WeightedTrainer(
        class_weights=final_weights,
        model=model,
        args=training_args,
        train_dataset=train_tok,
        eval_dataset=val_tok,
        compute_metrics=compute_metrics,
        processing_class=tokenizer,
        data_collator=DataCollatorWithPadding(tokenizer),
    )

    trainer.train()

    # After training, Trainer has moved the model to GPU (if available)
    print(f"  Model device after Trainer : {next(model.parameters()).device}")
    print(f"  Class weight (PCL)        : {class_w:.2f}")

    trained_models[name] = model
    trainers[name] = trainer
    print(f"✓ {name} final training complete.")

## 7. Per-Model Evaluation — Results & Confusion Matrices

Evaluate each model individually on the **test set**, print classification reports, and plot confusion matrices.

In [None]:
per_model_preds = {}  # {name: np.array of predictions on test set}

for name in MODEL_CATALOGUE:
    print(f"\n{'='*60}")
    print(f"  Test Evaluation: {name}")
    print(f"{'='*60}")

    tokenizer = tokenisers[name]
    test_tok = tokenize_dataset(test_dataset, tokenizer)
    trainer = trainers[name]

    # Predict on test set
    predictions = trainer.predict(test_tok)
    preds = np.argmax(predictions.predictions, axis=-1)
    labels = predictions.label_ids
    per_model_preds[name] = preds

    # Classification report
    print(f"\n{name} — Classification Report:")
    print(classification_report(labels, preds, target_names=LABEL_NAMES, digits=4))

    # Confusion matrix
    fig, axes = plt.subplots(1, 2, figsize=(12, 4))

    cm = confusion_matrix(labels, preds)
    ConfusionMatrixDisplay(cm, display_labels=LABEL_NAMES).plot(
        ax=axes[0], cmap="Blues", colorbar=False
    )
    axes[0].set_title(f"{name} — Counts")

    cm_norm = confusion_matrix(labels, preds, normalize="true")
    ConfusionMatrixDisplay(cm_norm, display_labels=LABEL_NAMES).plot(
        ax=axes[1], cmap="Blues", colorbar=False, values_format=".2%"
    )
    axes[1].set_title(f"{name} — Normalised")

    plt.tight_layout()
    plt.show()

## 8. Overall Ensemble — Majority Vote, Results & Confusion Matrix

Each of the 3 models votes; a sample is classified as **PCL** if **2 or more** models agree.

In [None]:
# Majority vote: PCL (1) if >= 2 out of 3 models predict PCL
votes = np.stack(list(per_model_preds.values()), axis=0)  # (3, n_test)
ensemble_preds = (votes.sum(axis=0) >= 2).astype(int)
true_labels = np.array(test_dataset["label"])

# ---------------------------------------------------------------------------
# Overall classification report
# ---------------------------------------------------------------------------
print("=" * 60)
print("  ENSEMBLE (Majority Vote) — Test Set Results")
print("=" * 60)
print(classification_report(true_labels, ensemble_preds, target_names=LABEL_NAMES, digits=4))

# Per-model vs ensemble summary table
rows = []
for name, preds in per_model_preds.items():
    p, r, f1, _ = precision_recall_fscore_support(true_labels, preds, average="binary", pos_label=1)
    acc = accuracy_score(true_labels, preds)
    rows.append({"Model": name, "Accuracy": acc, "Precision": p, "Recall": r, "F1": f1})

p, r, f1, _ = precision_recall_fscore_support(true_labels, ensemble_preds, average="binary", pos_label=1)
acc = accuracy_score(true_labels, ensemble_preds)
rows.append({"Model": "ENSEMBLE", "Accuracy": acc, "Precision": p, "Recall": r, "F1": f1})

summary_df = pd.DataFrame(rows).set_index("Model")
print("\nSummary comparison:")
display(summary_df.style.format("{:.4f}").highlight_max(axis=0, color="lightgreen"))

# ---------------------------------------------------------------------------
# Confusion matrices — ensemble
# ---------------------------------------------------------------------------
fig, axes = plt.subplots(1, 2, figsize=(12, 4))

cm = confusion_matrix(true_labels, ensemble_preds)
ConfusionMatrixDisplay(cm, display_labels=LABEL_NAMES).plot(
    ax=axes[0], cmap="Oranges", colorbar=False
)
axes[0].set_title("Ensemble — Counts")

cm_norm = confusion_matrix(true_labels, ensemble_preds, normalize="true")
ConfusionMatrixDisplay(cm_norm, display_labels=LABEL_NAMES).plot(
    ax=axes[1], cmap="Oranges", colorbar=False, values_format=".2%"
)
axes[1].set_title("Ensemble — Normalised")

plt.tight_layout()
plt.show()

# ---------------------------------------------------------------------------
# Voting agreement heatmap
# ---------------------------------------------------------------------------
print("\nPer-sample voting agreement:")
agreement = votes.sum(axis=0)
for v in [0, 1, 2, 3]:
    count = (agreement == v).sum()
    print(f"  {v}/3 models predict PCL: {count} samples ({count/len(agreement)*100:.1f}%)")