# Ensemble Voting Model — Don't Patronize Me!

**Binary PCL classification** using RoBERTa, DistilBERT, and DeBERTa with majority-vote ensemble.

## 1. Imports

In [1]:
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
)
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
from datasets import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score,
    precision_recall_fscore_support,
    confusion_matrix,
    classification_report,
    ConfusionMatrixDisplay,
)
import matplotlib.pyplot as plt
import optuna
from optuna.pruners import MedianPruner
import gc

print(f"PyTorch version : {torch.__version__}")
print(f"CUDA available  : {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU device      : {torch.cuda.get_device_name(0)}")
    print(f"GPU memory      : {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")

  from .autonotebook import tqdm as notebook_tqdm


PyTorch version : 2.10.0+cu128
CUDA available  : True
GPU device      : Tesla T4
GPU memory      : 15.6 GB


## 2. Device Setup

In [2]:
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
NUM_LABELS = 2
LABEL_NAMES = ["Non-PCL", "PCL"]

# Class weights for the ~9.5:1 imbalance (Non-PCL : PCL).
# Placing on DEVICE once avoids repeated .to() calls inside compute_loss.
CLASS_WEIGHTS = torch.tensor([1.0, 9.0], dtype=torch.float32).to(DEVICE)

# Mixed-precision strategy:
#   bf16 preferred (Ampere+ GPUs) — works with all models including DeBERTa v3.
#   fp16 as fallback for older GPUs — but NOT safe for DeBERTa v3 (FP16 gradient error).
_BF16_OK = torch.cuda.is_available() and torch.cuda.is_bf16_supported()

print(f"Using device    : {DEVICE}")
print(f"bf16 supported  : {_BF16_OK}")
print(f"Class weights   : {CLASS_WEIGHTS}  (device: {CLASS_WEIGHTS.device})")

Using device    : cuda
bf16 supported  : True
Class weights   : tensor([1., 9.], device='cuda:0')  (device: cuda:0)


## 3. Load & Preprocess Dataset

Binary labels as per the paper: labels 0-1 → **Non-PCL (0)**, labels 2-4 → **PCL (1)**.

We split 80/10/10 into train / val / test. The test set is held out entirely until final evaluation.

In [3]:
def load_data():
    """Load Don't Patronize Me PCL dataset and binarise labels."""
    pcl_columns = ["par_id", "art_id", "keyword", "country_code", "text", "label"]
    df = pd.read_csv(
        "dontpatronizeme_pcl.tsv",
        sep="\t",
        skiprows=4,
        names=pcl_columns,
        on_bad_lines="skip",
        engine="python",
    )

    # Drop rows with missing text or labels
    df = df.dropna(subset=["text", "label"])
    df["label"] = df["label"].astype(int)

    # Binary: 0-1 → Non-PCL (0),  2-4 → PCL (1)
    df["binary_label"] = (df["label"] >= 2).astype(int)

    print(f"Total samples  : {len(df)}")
    print(f"Label distribution:\n{df['binary_label'].value_counts().rename({0: 'Non-PCL', 1: 'PCL'})}")
    print(f"Imbalance ratio: {(df['binary_label'] == 0).sum() / (df['binary_label'] == 1).sum():.2f}:1")

    return df


df = load_data()

# 80 / 10 / 10 stratified split
train_df, temp_df = train_test_split(
    df, test_size=0.2, stratify=df["binary_label"], random_state=42
)
val_df, test_df = train_test_split(
    temp_df, test_size=0.5, stratify=temp_df["binary_label"], random_state=42
)

# Convert to HuggingFace Datasets
train_dataset = Dataset.from_dict({"text": train_df["text"].tolist(), "label": train_df["binary_label"].tolist()})
val_dataset   = Dataset.from_dict({"text": val_df["text"].tolist(),   "label": val_df["binary_label"].tolist()})
test_dataset  = Dataset.from_dict({"text": test_df["text"].tolist(),  "label": test_df["binary_label"].tolist()})

print(f"\nSplit sizes — train: {len(train_dataset)}, val: {len(val_dataset)}, test: {len(test_dataset)}")

Total samples  : 10468
Label distribution:
binary_label
Non-PCL    9475
PCL         993
Name: count, dtype: int64
Imbalance ratio: 9.54:1

Split sizes — train: 8374, val: 1047, test: 1047


## 4. Model Definitions & Tokenisation

We define:
- **Model catalogue** — three transformer architectures
- **`WeightedTrainer`** — custom Trainer that uses class-weighted CrossEntropyLoss. The class weights tensor is moved to device **once** (at init), not on every forward pass.
- **`compute_metrics`** — accuracy, precision, recall, F1
- Per-model tokenisation functions

In [4]:
MODEL_CATALOGUE = {
    "RoBERTa":    "FacebookAI/roberta-base",
    "DistilBERT": "distilbert-base-uncased",
    "DeBERTa":    "microsoft/deberta-v3-base",
}

MAX_LENGTH = 128  # EDA: median 42 word tokens, 95th pct ~105; subword inflation ~1.3x → 128 is safe


# ---------------------------------------------------------------------------
# Mixed-precision helper — decides fp16 vs bf16 per model
# ---------------------------------------------------------------------------
def get_mixed_precision_flags(model_name: str):
    """Return (fp16, bf16) flags for a given model.

    • bf16 is preferred for ALL models when the GPU supports it (Ampere+).
    • fp16 is used as fallback — except for DeBERTa v3 which produces
      gradient-unscale errors under fp16.
    • DeBERTa v3 falls back to fp32 if bf16 is unavailable.
    """
    if _BF16_OK:
        return False, True          # bf16 for everything
    if model_name == "DeBERTa":
        return False, False         # fp32 fallback (fp16 is unsafe)
    if torch.cuda.is_available():
        return True, False          # fp16 for other models
    return False, False             # CPU


# ---------------------------------------------------------------------------
# Weighted Trainer — class weights live on the same device as the model
# ---------------------------------------------------------------------------
class WeightedTrainer(Trainer):
    """Trainer that applies class weights to CrossEntropyLoss.

    Supports per-trial class_weight_pos override via self.args (set by
    Optuna hp search). Falls back to the weights passed at init.
    """

    def __init__(self, class_weights: torch.Tensor, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._class_weights = class_weights

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits

        # ---- NaN guard: clamp logits to prevent NaN propagation ----------
        if torch.isnan(logits).any() or torch.isinf(logits).any():
            logits = torch.nan_to_num(logits, nan=0.0, posinf=1e4, neginf=-1e4)

        # Use per-trial class_weight_pos if set by hp search, else default
        pos_w = getattr(self.args, "class_weight_pos", None)
        if pos_w is not None:
            weights = torch.tensor([1.0, pos_w], dtype=logits.dtype, device=logits.device)
        else:
            weights = self._class_weights.to(dtype=logits.dtype, device=logits.device)
        loss_fn = nn.CrossEntropyLoss(weight=weights)
        loss = loss_fn(logits, labels)
        return (loss, outputs) if return_outputs else loss


# ---------------------------------------------------------------------------
# Metrics
# ---------------------------------------------------------------------------
def compute_metrics(eval_pred):
    """Compute accuracy, precision, recall, F1 for the positive class (PCL)."""
    logits, labels = eval_pred
    preds = np.argmax(logits, axis=-1)
    precision, recall, f1, _ = precision_recall_fscore_support(
        labels, preds, average="binary", pos_label=1, zero_division=0
    )
    acc = accuracy_score(labels, preds)
    return {"accuracy": acc, "precision": precision, "recall": recall, "f1": f1}


# ---------------------------------------------------------------------------
# Tokenisers — NO padding here; DataCollatorWithPadding pads per-batch
# (median text ~42 tokens → dynamic padding is ~2x faster than pad-to-128)
# ---------------------------------------------------------------------------
tokenisers = {}
for name, path in MODEL_CATALOGUE.items():
    tokenisers[name] = AutoTokenizer.from_pretrained(path)
    print(f"Loaded tokeniser for {name}")


def tokenize_dataset(dataset, tokenizer):
    """Tokenise a HuggingFace Dataset with the given tokenizer (no padding)."""
    def _tok(examples):
        return tokenizer(
            examples["text"], truncation=True, max_length=MAX_LENGTH
        )
    return dataset.map(_tok, batched=True)



Loaded tokeniser for RoBERTa
Loaded tokeniser for DistilBERT
Loaded tokeniser for DeBERTa


## 5. Bayesian Hyperparameter Optimisation (Optuna)

For each model we run `trainer.hyperparameter_search` with an Optuna backend. This performs **Bayesian optimisation** (Tree-structured Parzen Estimator by default) over learning rate, number of epochs, batch size, weight decay, and **class weight for PCL** (searched 8–10 around the ~9.5:1 natural ratio).

Key design decisions:
- **`model_init`** function (not a pre-built model) so Trainer can reinitialise fresh weights each trial
- **`class_weight_pos`** in the search space — the most impactful knob for imbalanced classification
- **Dynamic padding** (`DataCollatorWithPadding`) — pads per-batch instead of to `MAX_LENGTH`, ~2× faster
- **DeBERTa v3** uses `bf16` (or fp32 fallback) instead of `fp16` which causes gradient unscale errors
- `direction="maximize"` because we optimise F1

In [5]:
def optuna_hp_space(trial):
    """Bayesian search over the two highest-impact hyperparameters.

    Fixed with educated defaults (not worth searching):
      • per_device_train_batch_size = 32  (marginal quality diff, 2× faster than 16)
      • num_train_epochs = 3              (standard for transformer fine-tuning)
    """
    return {
        "learning_rate":             trial.suggest_float("learning_rate", 1e-5, 3e-5, log=True),
        "weight_decay":              trial.suggest_float("weight_decay", 0.0, 0.2),
        "class_weight_pos":          trial.suggest_float("class_weight_pos", 8.0, 10.0),
    }


N_TRIALS = 5  # 2-dimensional search → 5 trials gives good coverage

best_hparams = {}   # {model_name: BestRun}
trained_models = {} # {model_name: fine-tuned model}

for name, model_path in MODEL_CATALOGUE.items():

    print(f"\n{'='*60}")
    print(f"  Hyperparameter search for {name}")
    print(f"{'='*60}")

    tokenizer = tokenisers[name]
    train_tok = tokenize_dataset(train_dataset, tokenizer)
    val_tok   = tokenize_dataset(val_dataset, tokenizer)

    # model_init: required so Trainer can create a fresh model each trial
    def make_model_init(path):
        def model_init():
            return AutoModelForSequenceClassification.from_pretrained(
                path, num_labels=NUM_LABELS
            )
        return model_init

    # Mixed-precision: bf16 when available (all models); fp16 fallback (not DeBERTa)
    use_fp16, use_bf16 = get_mixed_precision_flags(name)
    print(f"  Mixed precision — fp16: {use_fp16}, bf16: {use_bf16}")

    training_args = TrainingArguments(
        output_dir=f"./results/{name}",
        num_train_epochs=3,                # fixed — standard for fine-tuning
        per_device_train_batch_size=32,    # fixed — faster than 16, similar quality
        eval_strategy="epoch",
        save_strategy="no",               # no checkpoints during HP search (saves disk)
        load_best_model_at_end=False,
        metric_for_best_model="f1",
        logging_steps=50,
        fp16=use_fp16,
        bf16=use_bf16,
        warmup_ratio=0.1,
        max_grad_norm=1.0,
        report_to="none",
    )
    # Seed attribute so Optuna's setattr succeeds for class_weight_pos
    training_args.class_weight_pos = 9.0

    trainer = WeightedTrainer(
        class_weights=CLASS_WEIGHTS,
        model_init=make_model_init(model_path),
        args=training_args,
        train_dataset=train_tok.shuffle(seed=42).select(range(2000)),
        eval_dataset=val_tok.shuffle(seed=42).select(range(250)),
        compute_metrics=compute_metrics,
        processing_class=tokenizer,
        data_collator=DataCollatorWithPadding(tokenizer),
    )

    best_run = trainer.hyperparameter_search(
        direction="maximize",
        backend="optuna",
        hp_space=optuna_hp_space,
        n_trials=N_TRIALS,
        pruner=MedianPruner(n_startup_trials=2, n_warmup_steps=1),
        compute_objective=lambda metrics: metrics["eval_f1"],
    )

    best_hparams[name] = best_run
    print(f"\n✓ {name} best trial — F1: {best_run.objective:.4f}")
    print(f"  Hyperparameters: {best_run.hyperparameters}")

    # Free GPU memory before next model
    del trainer
    gc.collect()
    torch.cuda.empty_cache()


  Hyperparameter search for RoBERTa


Map: 100%|██████████| 8374/8374 [00:00<00:00, 17671.66 examples/s]
Map: 100%|██████████| 1047/1047 [00:00<00:00, 17312.91 examples/s]
warmup_ratio is deprecated and will be removed in v5.2. Use `warmup_steps` instead.


  Mixed precision — fp16: False, bf16: True


Loading weights: 100%|██████████| 197/197 [00:00<00:00, 983.69it/s, Materializing param=roberta.encoder.layer.11.output.dense.weight]              
[1mRobertaForSequenceClassification LOAD REPORT[0m from: FacebookAI/roberta-base
Key                             | Status     | 
--------------------------------+------------+-
roberta.embeddings.position_ids | UNEXPECTED | 
lm_head.dense.weight            | UNEXPECTED | 
lm_head.dense.bias              | UNEXPECTED | 
lm_head.bias                    | UNEXPECTED | 
lm_head.layer_norm.weight       | UNEXPECTED | 
lm_head.layer_norm.bias         | UNEXPECTED | 
classifier.out_proj.weight      | MISSING    | 
classifier.dense.bias           | MISSING    | 
classifier.dense.weight         | MISSING    | 
classifier.out_proj.bias        | MISSING    | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING[3m	:those params were newly initialized because missi

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.678291,0.515019,0.92,0.533333,0.380952,0.444444
2,0.475872,0.403342,0.832,0.301887,0.761905,0.432432
3,0.493968,0.412185,0.852,0.333333,0.761905,0.463768


[32m[I 2026-02-10 16:47:33,438][0m Trial 0 finished with value: 0.463768115942029 and parameters: {'learning_rate': 1.9694467477810197e-05, 'weight_decay': 0.14430748653421396, 'class_weight_pos': 8.203403785315341}. Best is trial 0 with value: 0.463768115942029.[0m
Loading weights: 100%|██████████| 197/197 [00:00<00:00, 1007.42it/s, Materializing param=roberta.encoder.layer.11.output.dense.weight]             
[1mRobertaForSequenceClassification LOAD REPORT[0m from: FacebookAI/roberta-base
Key                             | Status     | 
--------------------------------+------------+-
roberta.embeddings.position_ids | UNEXPECTED | 
lm_head.dense.weight            | UNEXPECTED | 
lm_head.dense.bias              | UNEXPECTED | 
lm_head.bias                    | UNEXPECTED | 
lm_head.layer_norm.weight       | UNEXPECTED | 
lm_head.layer_norm.bias         | UNEXPECTED | 
classifier.out_proj.weight      | MISSING    | 
classifier.dense.bias           | MISSING    | 
classifier.dense.we

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.683338,0.60228,0.916,0.5,0.047619,0.086957
2,0.54921,0.406949,0.84,0.306122,0.714286,0.428571
3,0.501658,0.404051,0.832,0.301887,0.761905,0.432432


[32m[I 2026-02-10 16:50:21,419][0m Trial 1 finished with value: 0.43243243243243246 and parameters: {'learning_rate': 1.19864923657565e-05, 'weight_decay': 0.05715588276704438, 'class_weight_pos': 8.053462063594088}. Best is trial 0 with value: 0.463768115942029.[0m
Loading weights: 100%|██████████| 197/197 [00:00<00:00, 995.01it/s, Materializing param=roberta.encoder.layer.11.output.dense.weight]              
[1mRobertaForSequenceClassification LOAD REPORT[0m from: FacebookAI/roberta-base
Key                             | Status     | 
--------------------------------+------------+-
roberta.embeddings.position_ids | UNEXPECTED | 
lm_head.dense.weight            | UNEXPECTED | 
lm_head.dense.bias              | UNEXPECTED | 
lm_head.bias                    | UNEXPECTED | 
lm_head.layer_norm.weight       | UNEXPECTED | 
lm_head.layer_norm.bias         | UNEXPECTED | 
classifier.out_proj.weight      | MISSING    | 
classifier.dense.bias           | MISSING    | 
classifier.dense.we

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.683094,0.511272,0.92,0.526316,0.47619,0.5
2,0.475813,0.417009,0.82,0.285714,0.761905,0.415584
3,0.512144,0.442178,0.848,0.326531,0.761905,0.457143


[32m[I 2026-02-10 16:53:11,361][0m Trial 2 finished with value: 0.45714285714285713 and parameters: {'learning_rate': 1.947233740455189e-05, 'weight_decay': 0.10385095864858529, 'class_weight_pos': 9.76936002334696}. Best is trial 0 with value: 0.463768115942029.[0m
Loading weights: 100%|██████████| 197/197 [00:00<00:00, 1028.19it/s, Materializing param=roberta.encoder.layer.11.output.dense.weight]             
[1mRobertaForSequenceClassification LOAD REPORT[0m from: FacebookAI/roberta-base
Key                             | Status     | 
--------------------------------+------------+-
roberta.embeddings.position_ids | UNEXPECTED | 
lm_head.dense.weight            | UNEXPECTED | 
lm_head.dense.bias              | UNEXPECTED | 
lm_head.bias                    | UNEXPECTED | 
lm_head.layer_norm.weight       | UNEXPECTED | 
lm_head.layer_norm.bias         | UNEXPECTED | 
classifier.out_proj.weight      | MISSING    | 
classifier.dense.bias           | MISSING    | 
classifier.dense.we

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.685113,0.596188,0.916,0.5,0.142857,0.222222


[32m[I 2026-02-10 16:54:08,836][0m Trial 3 pruned. [0m
Loading weights: 100%|██████████| 197/197 [00:00<00:00, 969.77it/s, Materializing param=roberta.encoder.layer.11.output.dense.weight]              
[1mRobertaForSequenceClassification LOAD REPORT[0m from: FacebookAI/roberta-base
Key                             | Status     | 
--------------------------------+------------+-
roberta.embeddings.position_ids | UNEXPECTED | 
lm_head.dense.weight            | UNEXPECTED | 
lm_head.dense.bias              | UNEXPECTED | 
lm_head.bias                    | UNEXPECTED | 
lm_head.layer_norm.weight       | UNEXPECTED | 
lm_head.layer_norm.bias         | UNEXPECTED | 
classifier.out_proj.weight      | MISSING    | 
classifier.dense.bias           | MISSING    | 
classifier.dense.weight         | MISSING    | 
classifier.out_proj.bias        | MISSING    | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISS

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.677769,0.489566,0.912,0.478261,0.52381,0.5
2,0.462854,0.398484,0.828,0.303571,0.809524,0.441558
3,0.508907,0.418185,0.852,0.34,0.809524,0.478873


[32m[I 2026-02-10 16:56:59,890][0m Trial 4 finished with value: 0.4788732394366197 and parameters: {'learning_rate': 2.132434171951195e-05, 'weight_decay': 0.0798457371027612, 'class_weight_pos': 8.567344860075119}. Best is trial 4 with value: 0.4788732394366197.[0m



✓ RoBERTa best trial — F1: 0.4789
  Hyperparameters: {'learning_rate': 2.132434171951195e-05, 'weight_decay': 0.0798457371027612, 'class_weight_pos': 8.567344860075119}

  Hyperparameter search for DistilBERT


Map: 100%|██████████| 8374/8374 [00:00<00:00, 14526.97 examples/s]
Map: 100%|██████████| 1047/1047 [00:00<00:00, 14145.62 examples/s]
warmup_ratio is deprecated and will be removed in v5.2. Use `warmup_steps` instead.


  Mixed precision — fp16: False, bf16: True


Loading weights: 100%|██████████| 100/100 [00:00<00:00, 407.86it/s, Materializing param=distilbert.transformer.layer.5.sa_layer_norm.weight]   
[1mDistilBertForSequenceClassification LOAD REPORT[0m from: distilbert-base-uncased
Key                     | Status     | 
------------------------+------------+-
vocab_projector.bias    | UNEXPECTED | 
vocab_transform.weight  | UNEXPECTED | 
vocab_layer_norm.bias   | UNEXPECTED | 
vocab_layer_norm.weight | UNEXPECTED | 
vocab_transform.bias    | UNEXPECTED | 
classifier.weight       | MISSING    | 
pre_classifier.weight   | MISSING    | 
pre_classifier.bias     | MISSING    | 
classifier.bias         | MISSING    | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING[3m	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.[0m
[32m[I 2026-02-10 16:57:06,464][0m A new study created in memory

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.686796,0.590645,0.868,0.35,0.666667,0.459016
2,0.577975,0.51189,0.756,0.214286,0.714286,0.32967
3,0.518122,0.4673,0.812,0.267857,0.714286,0.38961


[32m[I 2026-02-10 16:58:33,275][0m Trial 0 finished with value: 0.38961038961038963 and parameters: {'learning_rate': 1.3905632831511787e-05, 'weight_decay': 0.03558678803511963, 'class_weight_pos': 9.172857141790868}. Best is trial 0 with value: 0.38961038961038963.[0m
Loading weights: 100%|██████████| 100/100 [00:00<00:00, 1055.48it/s, Materializing param=distilbert.transformer.layer.5.sa_layer_norm.weight]   
[1mDistilBertForSequenceClassification LOAD REPORT[0m from: distilbert-base-uncased
Key                     | Status     | 
------------------------+------------+-
vocab_projector.bias    | UNEXPECTED | 
vocab_transform.weight  | UNEXPECTED | 
vocab_layer_norm.bias   | UNEXPECTED | 
vocab_layer_norm.weight | UNEXPECTED | 
vocab_transform.bias    | UNEXPECTED | 
classifier.weight       | MISSING    | 
pre_classifier.weight   | MISSING    | 
pre_classifier.bias     | MISSING    | 
classifier.bias         | MISSING    | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loadi

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.677527,0.508987,0.892,0.375,0.428571,0.4
2,0.491045,0.52015,0.732,0.2125,0.809524,0.336634
3,0.513526,0.453968,0.816,0.272727,0.714286,0.394737


[32m[I 2026-02-10 17:00:00,265][0m Trial 1 finished with value: 0.39473684210526316 and parameters: {'learning_rate': 2.1647883049730073e-05, 'weight_decay': 0.04910889965504239, 'class_weight_pos': 8.992252623188367}. Best is trial 1 with value: 0.39473684210526316.[0m
Loading weights: 100%|██████████| 100/100 [00:00<00:00, 968.82it/s, Materializing param=distilbert.transformer.layer.5.sa_layer_norm.weight]   
[1mDistilBertForSequenceClassification LOAD REPORT[0m from: distilbert-base-uncased
Key                     | Status     | 
------------------------+------------+-
vocab_projector.bias    | UNEXPECTED | 
vocab_transform.weight  | UNEXPECTED | 
vocab_layer_norm.bias   | UNEXPECTED | 
vocab_layer_norm.weight | UNEXPECTED | 
vocab_transform.bias    | UNEXPECTED | 
classifier.weight       | MISSING    | 
pre_classifier.weight   | MISSING    | 
pre_classifier.bias     | MISSING    | 
classifier.bias         | MISSING    | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loadin

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.684345,0.568047,0.892,0.375,0.428571,0.4


[32m[I 2026-02-10 17:00:29,591][0m Trial 2 pruned. [0m
Loading weights: 100%|██████████| 100/100 [00:00<00:00, 999.90it/s, Materializing param=distilbert.transformer.layer.5.sa_layer_norm.weight]    
[1mDistilBertForSequenceClassification LOAD REPORT[0m from: distilbert-base-uncased
Key                     | Status     | 
------------------------+------------+-
vocab_projector.bias    | UNEXPECTED | 
vocab_transform.weight  | UNEXPECTED | 
vocab_layer_norm.bias   | UNEXPECTED | 
vocab_layer_norm.weight | UNEXPECTED | 
vocab_transform.bias    | UNEXPECTED | 
classifier.weight       | MISSING    | 
pre_classifier.weight   | MISSING    | 
pre_classifier.bias     | MISSING    | 
classifier.bias         | MISSING    | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING[3m	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.[0m


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.680049,0.523997,0.892,0.375,0.428571,0.4


[32m[I 2026-02-10 17:00:58,997][0m Trial 3 pruned. [0m
Loading weights: 100%|██████████| 100/100 [00:00<00:00, 987.97it/s, Materializing param=distilbert.transformer.layer.5.sa_layer_norm.weight]   
[1mDistilBertForSequenceClassification LOAD REPORT[0m from: distilbert-base-uncased
Key                     | Status     | 
------------------------+------------+-
vocab_projector.bias    | UNEXPECTED | 
vocab_transform.weight  | UNEXPECTED | 
vocab_layer_norm.bias   | UNEXPECTED | 
vocab_layer_norm.weight | UNEXPECTED | 
vocab_transform.bias    | UNEXPECTED | 
classifier.weight       | MISSING    | 
pre_classifier.weight   | MISSING    | 
pre_classifier.bias     | MISSING    | 
classifier.bias         | MISSING    | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING[3m	:those params were newly initialized because missing from the checkpoint. Consider training on your downstream task.[0m


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.679452,0.519614,0.892,0.375,0.428571,0.4


[32m[I 2026-02-10 17:01:28,289][0m Trial 4 pruned. [0m



✓ DistilBERT best trial — F1: 0.3947
  Hyperparameters: {'learning_rate': 2.1647883049730073e-05, 'weight_decay': 0.04910889965504239, 'class_weight_pos': 8.992252623188367}

  Hyperparameter search for DeBERTa


Map: 100%|██████████| 8374/8374 [00:00<00:00, 14984.97 examples/s]
Map: 100%|██████████| 1047/1047 [00:00<00:00, 14669.61 examples/s]
warmup_ratio is deprecated and will be removed in v5.2. Use `warmup_steps` instead.


  Mixed precision — fp16: False, bf16: True


Loading weights: 100%|██████████| 198/198 [00:00<00:00, 1277.11it/s, Materializing param=deberta.encoder.rel_embeddings.weight]                     
[1mDebertaV2ForSequenceClassification LOAD REPORT[0m from: microsoft/deberta-v3-base
Key                                     | Status     | 
----------------------------------------+------------+-
mask_predictions.dense.bias             | UNEXPECTED | 
lm_predictions.lm_head.bias             | UNEXPECTED | 
mask_predictions.LayerNorm.bias         | UNEXPECTED | 
lm_predictions.lm_head.dense.weight     | UNEXPECTED | 
mask_predictions.classifier.bias        | UNEXPECTED | 
lm_predictions.lm_head.LayerNorm.bias   | UNEXPECTED | 
mask_predictions.dense.weight           | UNEXPECTED | 
lm_predictions.lm_head.LayerNorm.weight | UNEXPECTED | 
mask_predictions.classifier.weight      | UNEXPECTED | 
mask_predictions.LayerNorm.weight       | UNEXPECTED | 
lm_predictions.lm_head.dense.bias       | UNEXPECTED | 
pooler.dense.bias                   

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.708392,0.645954,0.916,0.0,0.0,0.0
2,0.703431,0.676645,0.916,0.0,0.0,0.0
3,0.688829,0.680808,0.916,0.0,0.0,0.0


[32m[I 2026-02-10 17:05:00,739][0m Trial 0 finished with value: 0.0 and parameters: {'learning_rate': 1.023843533490562e-05, 'weight_decay': 0.09078497329645818, 'class_weight_pos': 8.609598584900937}. Best is trial 0 with value: 0.0.[0m
Loading weights: 100%|██████████| 198/198 [00:00<00:00, 1172.06it/s, Materializing param=deberta.encoder.rel_embeddings.weight]                     
[1mDebertaV2ForSequenceClassification LOAD REPORT[0m from: microsoft/deberta-v3-base
Key                                     | Status     | 
----------------------------------------+------------+-
mask_predictions.dense.bias             | UNEXPECTED | 
lm_predictions.lm_head.bias             | UNEXPECTED | 
mask_predictions.LayerNorm.bias         | UNEXPECTED | 
lm_predictions.lm_head.dense.weight     | UNEXPECTED | 
mask_predictions.classifier.bias        | UNEXPECTED | 
lm_predictions.lm_head.LayerNorm.bias   | UNEXPECTED | 
mask_predictions.dense.weight           | UNEXPECTED | 
lm_predictions.lm_h

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.753577,0.831927,0.916,0.0,0.0,0.0
2,1.027853,0.791786,0.084,0.084,1.0,0.154982
3,0.747893,0.663471,0.916,0.0,0.0,0.0


[32m[I 2026-02-10 17:08:22,474][0m Trial 1 finished with value: 0.0 and parameters: {'learning_rate': 1.499614194477049e-05, 'weight_decay': 0.0540273395429711, 'class_weight_pos': 8.749417271870307}. Best is trial 0 with value: 0.0.[0m
Loading weights: 100%|██████████| 198/198 [00:00<00:00, 1155.20it/s, Materializing param=deberta.encoder.rel_embeddings.weight]                     
[1mDebertaV2ForSequenceClassification LOAD REPORT[0m from: microsoft/deberta-v3-base
Key                                     | Status     | 
----------------------------------------+------------+-
mask_predictions.dense.bias             | UNEXPECTED | 
lm_predictions.lm_head.bias             | UNEXPECTED | 
mask_predictions.LayerNorm.bias         | UNEXPECTED | 
lm_predictions.lm_head.dense.weight     | UNEXPECTED | 
mask_predictions.classifier.bias        | UNEXPECTED | 
lm_predictions.lm_head.LayerNorm.bias   | UNEXPECTED | 
mask_predictions.dense.weight           | UNEXPECTED | 
lm_predictions.lm_he

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.751962,0.628898,0.916,0.0,0.0,0.0
2,0.732722,0.731243,0.084,0.084,1.0,0.154982
3,0.707636,0.683043,0.916,0.0,0.0,0.0


[32m[I 2026-02-10 17:11:44,128][0m Trial 2 finished with value: 0.0 and parameters: {'learning_rate': 1.8963698406793235e-05, 'weight_decay': 0.130097146034042, 'class_weight_pos': 9.173895424547274}. Best is trial 0 with value: 0.0.[0m
Loading weights: 100%|██████████| 198/198 [00:00<00:00, 1131.99it/s, Materializing param=deberta.encoder.rel_embeddings.weight]                     
[1mDebertaV2ForSequenceClassification LOAD REPORT[0m from: microsoft/deberta-v3-base
Key                                     | Status     | 
----------------------------------------+------------+-
mask_predictions.dense.bias             | UNEXPECTED | 
lm_predictions.lm_head.bias             | UNEXPECTED | 
mask_predictions.LayerNorm.bias         | UNEXPECTED | 
lm_predictions.lm_head.dense.weight     | UNEXPECTED | 
mask_predictions.classifier.bias        | UNEXPECTED | 
lm_predictions.lm_head.LayerNorm.bias   | UNEXPECTED | 
mask_predictions.dense.weight           | UNEXPECTED | 
lm_predictions.lm_he

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.775009,0.639667,0.916,0.0,0.0,0.0
2,0.70279,0.77952,0.084,0.084,1.0,0.154982
3,0.727684,0.685689,0.916,0.0,0.0,0.0


[32m[I 2026-02-10 17:15:05,858][0m Trial 3 finished with value: 0.0 and parameters: {'learning_rate': 1.8398322698148384e-05, 'weight_decay': 0.12387171609043818, 'class_weight_pos': 9.846986528438917}. Best is trial 0 with value: 0.0.[0m
Loading weights: 100%|██████████| 198/198 [00:00<00:00, 1204.60it/s, Materializing param=deberta.encoder.rel_embeddings.weight]                     
[1mDebertaV2ForSequenceClassification LOAD REPORT[0m from: microsoft/deberta-v3-base
Key                                     | Status     | 
----------------------------------------+------------+-
mask_predictions.dense.bias             | UNEXPECTED | 
lm_predictions.lm_head.bias             | UNEXPECTED | 
mask_predictions.LayerNorm.bias         | UNEXPECTED | 
lm_predictions.lm_head.dense.weight     | UNEXPECTED | 
mask_predictions.classifier.bias        | UNEXPECTED | 
lm_predictions.lm_head.LayerNorm.bias   | UNEXPECTED | 
mask_predictions.dense.weight           | UNEXPECTED | 
lm_predictions.lm_

Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.55092,1.204318,0.916,0.0,0.0,0.0
2,0.965424,0.776806,0.084,0.084,1.0,0.154982
3,0.761134,0.668692,0.916,0.0,0.0,0.0


[32m[I 2026-02-10 17:18:28,113][0m Trial 4 finished with value: 0.0 and parameters: {'learning_rate': 2.5231076504025018e-05, 'weight_decay': 0.17658090154738026, 'class_weight_pos': 8.303777365927857}. Best is trial 0 with value: 0.0.[0m



✓ DeBERTa best trial — F1: 0.0000
  Hyperparameters: {'learning_rate': 1.023843533490562e-05, 'weight_decay': 0.09078497329645818, 'class_weight_pos': 8.609598584900937}


## 6. Train Each Model with Best Hyperparameters

Re-train each model from scratch using the best hyperparameters found above.

In [6]:
trainers = {}  # keep trainers around for prediction

for name, model_path in MODEL_CATALOGUE.items():

    print(f"\n{'='*60}")
    print(f"  Final training: {name}")
    print(f"{'='*60}")

    best = best_hparams[name]
    hp = best.hyperparameters

    tokenizer = tokenisers[name]
    train_tok = tokenize_dataset(train_dataset, tokenizer)
    val_tok   = tokenize_dataset(val_dataset, tokenizer)

    # Build fresh model with best HPs
    model = AutoModelForSequenceClassification.from_pretrained(
        model_path, num_labels=NUM_LABELS
    )

    # Verify the model is on the expected device (Trainer will move it, but let's show it)
    print(f"  Model device before Trainer: {next(model.parameters()).device}")

    # Mixed-precision: bf16 when available (all models); fp16 fallback (not DeBERTa)
    use_fp16, use_bf16 = get_mixed_precision_flags(name)
    print(f"  Mixed precision — fp16: {use_fp16}, bf16: {use_bf16}")

    training_args = TrainingArguments(
        output_dir=f"./results/{name}_final",
        num_train_epochs=3,                                      # fixed
        per_device_train_batch_size=32,                          # fixed
        per_device_eval_batch_size=32,
        learning_rate=hp.get("learning_rate", 2e-5),
        weight_decay=hp.get("weight_decay", 0.01),
        eval_strategy="epoch",
        save_strategy="epoch",
        save_total_limit=1,
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        logging_steps=50,
        fp16=use_fp16,
        bf16=use_bf16,
        warmup_ratio=0.1,
        max_grad_norm=1.0,
        dataloader_num_workers=2,
        report_to="none",
    )

    # Apply best class weight from hp search
    class_w = hp.get("class_weight_pos", 9.0)
    final_weights = torch.tensor([1.0, class_w], dtype=torch.float32).to(DEVICE)

    trainer = WeightedTrainer(
        class_weights=final_weights,
        model=model,
        args=training_args,
        train_dataset=train_tok,
        eval_dataset=val_tok,
        compute_metrics=compute_metrics,
        processing_class=tokenizer,
        data_collator=DataCollatorWithPadding(tokenizer),
    )

    trainer.train()

    # After training, Trainer has moved the model to GPU (if available)
    print(f"  Model device after Trainer : {next(model.parameters()).device}")
    print(f"  Class weight (PCL)        : {class_w:.2f}")

    trained_models[name] = model
    trainers[name] = trainer
    print(f"✓ {name} final training complete.")


  Final training: RoBERTa


Map: 100%|██████████| 8374/8374 [00:00<00:00, 18535.69 examples/s]
Map: 100%|██████████| 1047/1047 [00:00<00:00, 17352.31 examples/s]
Loading weights: 100%|██████████| 197/197 [00:00<00:00, 792.43it/s, Materializing param=roberta.encoder.layer.11.output.dense.weight]              
[1mRobertaForSequenceClassification LOAD REPORT[0m from: FacebookAI/roberta-base
Key                             | Status     | 
--------------------------------+------------+-
roberta.embeddings.position_ids | UNEXPECTED | 
lm_head.dense.weight            | UNEXPECTED | 
lm_head.dense.bias              | UNEXPECTED | 
lm_head.bias                    | UNEXPECTED | 
lm_head.layer_norm.weight       | UNEXPECTED | 
lm_head.layer_norm.bias         | UNEXPECTED | 
classifier.out_proj.weight      | MISSING    | 
classifier.dense.bias           | MISSING    | 
classifier.dense.weight         | MISSING    | 
classifier.out_proj.bias        | MISSING    | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading f

  Model device before Trainer: cpu
  Mixed precision — fp16: False, bf16: True


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.479007,0.365185,0.846227,0.365639,0.83,0.507645
2,0.343715,0.342505,0.82999,0.348837,0.9,0.502793
3,0.231397,0.439241,0.903534,0.496732,0.76,0.600791


Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  1.76it/s]
Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  1.77it/s]
Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  1.77it/s]
There were missing keys in the checkpoint model loaded: ['roberta.embeddings.LayerNorm.weight', 'roberta.embeddings.LayerNorm.bias', 'roberta.encoder.layer.0.attention.output.LayerNorm.weight', 'roberta.encoder.layer.0.attention.output.LayerNorm.bias', 'roberta.encoder.layer.0.output.LayerNorm.weight', 'roberta.encoder.layer.0.output.LayerNorm.bias', 'roberta.encoder.layer.1.attention.output.LayerNorm.weight', 'roberta.encoder.layer.1.attention.output.LayerNorm.bias', 'roberta.encoder.layer.1.output.LayerNorm.weight', 'roberta.encoder.layer.1.output.LayerNorm.bias', 'roberta.encoder.layer.2.attention.output.LayerNorm.weight', 'roberta.encoder.layer.2.attention.output.LayerNorm.bias', 'roberta.encoder.layer.2.output.LayerNorm.weight', 'roberta.encoder.layer.2.output.LayerNorm.bias', 'ro

  Model device after Trainer : cuda:0
  Class weight (PCL)        : 8.57
✓ RoBERTa final training complete.

  Final training: DistilBERT


Map: 100%|██████████| 8374/8374 [00:00<00:00, 14123.43 examples/s]
Map: 100%|██████████| 1047/1047 [00:00<00:00, 13997.86 examples/s]
Loading weights: 100%|██████████| 100/100 [00:00<00:00, 1049.15it/s, Materializing param=distilbert.transformer.layer.5.sa_layer_norm.weight]   
[1mDistilBertForSequenceClassification LOAD REPORT[0m from: distilbert-base-uncased
Key                     | Status     | 
------------------------+------------+-
vocab_projector.bias    | UNEXPECTED | 
vocab_transform.weight  | UNEXPECTED | 
vocab_layer_norm.bias   | UNEXPECTED | 
vocab_layer_norm.weight | UNEXPECTED | 
vocab_transform.bias    | UNEXPECTED | 
classifier.weight       | MISSING    | 
pre_classifier.weight   | MISSING    | 
pre_classifier.bias     | MISSING    | 
classifier.bias         | MISSING    | 

[3mNotes:
- UNEXPECTED[3m	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.
- MISSING[3m	:those params were newly initialized because missing

  Model device before Trainer: cpu
  Mixed precision — fp16: False, bf16: True


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,0.463273,0.435884,0.805158,0.295276,0.75,0.423729
2,0.345962,0.392214,0.845272,0.36036,0.8,0.496894
3,0.230847,0.481754,0.891117,0.453333,0.68,0.544


Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  3.30it/s]
Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  3.31it/s]
Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  3.33it/s]
There were missing keys in the checkpoint model loaded: ['distilbert.embeddings.LayerNorm.weight', 'distilbert.embeddings.LayerNorm.bias'].
There were unexpected keys in the checkpoint model loaded: ['distilbert.embeddings.LayerNorm.beta', 'distilbert.embeddings.LayerNorm.gamma'].


  Model device after Trainer : cuda:0
  Class weight (PCL)        : 8.99
✓ DistilBERT final training complete.

  Final training: DeBERTa


Map: 100%|██████████| 8374/8374 [00:00<00:00, 15311.30 examples/s]
Map: 100%|██████████| 1047/1047 [00:00<00:00, 14695.73 examples/s]
Loading weights: 100%|██████████| 198/198 [00:00<00:00, 1175.93it/s, Materializing param=deberta.encoder.rel_embeddings.weight]                     
[1mDebertaV2ForSequenceClassification LOAD REPORT[0m from: microsoft/deberta-v3-base
Key                                     | Status     | 
----------------------------------------+------------+-
mask_predictions.dense.bias             | UNEXPECTED | 
lm_predictions.lm_head.bias             | UNEXPECTED | 
mask_predictions.LayerNorm.bias         | UNEXPECTED | 
lm_predictions.lm_head.dense.weight     | UNEXPECTED | 
mask_predictions.classifier.bias        | UNEXPECTED | 
lm_predictions.lm_head.LayerNorm.bias   | UNEXPECTED | 
mask_predictions.dense.weight           | UNEXPECTED | 
lm_predictions.lm_head.LayerNorm.weight | UNEXPECTED | 
mask_predictions.classifier.weight      | UNEXPECTED | 
mask_predictio

  Model device before Trainer: cpu
  Mixed precision — fp16: False, bf16: True


Epoch,Training Loss,Validation Loss,Accuracy,Precision,Recall,F1
1,1.227539,0.777258,0.095511,0.095511,1.0,0.174368
2,0.812098,0.693162,0.904489,0.0,0.0,0.0
3,0.756232,0.688421,0.904489,0.0,0.0,0.0


Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  2.33it/s]
Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  2.34it/s]
Writing model shards: 100%|██████████| 1/1 [00:00<00:00,  2.37it/s]
There were missing keys in the checkpoint model loaded: ['deberta.embeddings.LayerNorm.weight', 'deberta.embeddings.LayerNorm.bias', 'deberta.encoder.layer.0.attention.output.LayerNorm.weight', 'deberta.encoder.layer.0.attention.output.LayerNorm.bias', 'deberta.encoder.layer.0.output.LayerNorm.weight', 'deberta.encoder.layer.0.output.LayerNorm.bias', 'deberta.encoder.layer.1.attention.output.LayerNorm.weight', 'deberta.encoder.layer.1.attention.output.LayerNorm.bias', 'deberta.encoder.layer.1.output.LayerNorm.weight', 'deberta.encoder.layer.1.output.LayerNorm.bias', 'deberta.encoder.layer.2.attention.output.LayerNorm.weight', 'deberta.encoder.layer.2.attention.output.LayerNorm.bias', 'deberta.encoder.layer.2.output.LayerNorm.weight', 'deberta.encoder.layer.2.output.LayerNorm.bias', 'de

  Model device after Trainer : cuda:0
  Class weight (PCL)        : 8.61
✓ DeBERTa final training complete.


## 7. Per-Model Evaluation — Results & Confusion Matrices

Evaluate each model individually on the **test set**, print classification reports, and plot confusion matrices.

In [None]:
per_model_preds = {}  # {name: np.array of predictions on test set}

for name in MODEL_CATALOGUE:
    print(f"\n{'='*60}")
    print(f"  Test Evaluation: {name}")
    print(f"{'='*60}")

    tokenizer = tokenisers[name]
    test_tok = tokenize_dataset(test_dataset, tokenizer)
    trainer = trainers[name]

    # Predict on test set
    predictions = trainer.predict(test_tok)
    preds = np.argmax(predictions.predictions, axis=-1)
    labels = predictions.label_ids
    per_model_preds[name] = preds

    # Classification report
    print(f"\n{name} — Classification Report:")
    print(classification_report(labels, preds, target_names=LABEL_NAMES, digits=4))

    # Confusion matrix
    fig, axes = plt.subplots(1, 2, figsize=(12, 4))

    cm = confusion_matrix(labels, preds)
    ConfusionMatrixDisplay(cm, display_labels=LABEL_NAMES).plot(
        ax=axes[0], cmap="Blues", colorbar=False
    )
    axes[0].set_title(f"{name} — Counts")

    cm_norm = confusion_matrix(labels, preds, normalize="true")
    ConfusionMatrixDisplay(cm_norm, display_labels=LABEL_NAMES).plot(
        ax=axes[1], cmap="Blues", colorbar=False, values_format=".2%"
    )
    axes[1].set_title(f"{name} — Normalised")

    plt.tight_layout()
    plt.show()

## 8. Overall Ensemble — Majority Vote, Results & Confusion Matrix

Each of the 3 models votes; a sample is classified as **PCL** if **2 or more** models agree.

In [None]:
# Majority vote: PCL (1) if >= 2 out of 3 models predict PCL
votes = np.stack(list(per_model_preds.values()), axis=0)  # (3, n_test)
ensemble_preds = (votes.sum(axis=0) >= 2).astype(int)
true_labels = np.array(test_dataset["label"])

# ---------------------------------------------------------------------------
# Overall classification report
# ---------------------------------------------------------------------------
print("=" * 60)
print("  ENSEMBLE (Majority Vote) — Test Set Results")
print("=" * 60)
print(classification_report(true_labels, ensemble_preds, target_names=LABEL_NAMES, digits=4))

# Per-model vs ensemble summary table
rows = []
for name, preds in per_model_preds.items():
    p, r, f1, _ = precision_recall_fscore_support(true_labels, preds, average="binary", pos_label=1)
    acc = accuracy_score(true_labels, preds)
    rows.append({"Model": name, "Accuracy": acc, "Precision": p, "Recall": r, "F1": f1})

p, r, f1, _ = precision_recall_fscore_support(true_labels, ensemble_preds, average="binary", pos_label=1)
acc = accuracy_score(true_labels, ensemble_preds)
rows.append({"Model": "ENSEMBLE", "Accuracy": acc, "Precision": p, "Recall": r, "F1": f1})

summary_df = pd.DataFrame(rows).set_index("Model")
print("\nSummary comparison:")
display(summary_df.style.format("{:.4f}").highlight_max(axis=0, color="lightgreen"))

# ---------------------------------------------------------------------------
# Confusion matrices — ensemble
# ---------------------------------------------------------------------------
fig, axes = plt.subplots(1, 2, figsize=(12, 4))

cm = confusion_matrix(true_labels, ensemble_preds)
ConfusionMatrixDisplay(cm, display_labels=LABEL_NAMES).plot(
    ax=axes[0], cmap="Oranges", colorbar=False
)
axes[0].set_title("Ensemble — Counts")

cm_norm = confusion_matrix(true_labels, ensemble_preds, normalize="true")
ConfusionMatrixDisplay(cm_norm, display_labels=LABEL_NAMES).plot(
    ax=axes[1], cmap="Oranges", colorbar=False, values_format=".2%"
)
axes[1].set_title("Ensemble — Normalised")

plt.tight_layout()
plt.show()

# ---------------------------------------------------------------------------
# Voting agreement heatmap
# ---------------------------------------------------------------------------
print("\nPer-sample voting agreement:")
agreement = votes.sum(axis=0)
for v in [0, 1, 2, 3]:
    count = (agreement == v).sum()
    print(f"  {v}/3 models predict PCL: {count} samples ({count/len(agreement)*100:.1f}%)")