# Ensemble Voting Model — Don't Patronize Me!

**Binary PCL classification** using RoBERTa, DistilBERT, and DeBERTa with majority-vote ensemble.

## 1. Imports

In [1]:
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorWithPadding,
)
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
import json
from pathlib import Path
from datasets import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score,
    precision_recall_fscore_support,
    confusion_matrix,
    classification_report,
    ConfusionMatrixDisplay,
)
import matplotlib.pyplot as plt
import optuna
from optuna.pruners import MedianPruner
import gc

# Report the runtime environment so the recorded outputs can be interpreted later.
cuda_ready = torch.cuda.is_available()
print(f"PyTorch version : {torch.__version__}")
print(f"CUDA available  : {cuda_ready}")
if cuda_ready:
    gpu_props = torch.cuda.get_device_properties(0)
    print(f"GPU device      : {torch.cuda.get_device_name(0)}")
    print(f"GPU memory      : {gpu_props.total_memory / 1e9:.1f} GB")

  from .autonotebook import tqdm as notebook_tqdm


PyTorch version : 2.10.0+cu128
CUDA available  : True
GPU device      : Tesla T4
GPU memory      : 15.6 GB


## 2. Device Setup

In [2]:
# Target device and the label vocabulary shared by every model in the notebook.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
NUM_LABELS = 2
LABEL_NAMES = ["Non-PCL", "PCL"]

# Default loss weights for the roughly 9.5:1 Non-PCL : PCL imbalance.
# The tensor is created directly on DEVICE so later .to() calls are no-ops.
CLASS_WEIGHTS = torch.tensor([1.0, 9.0], dtype=torch.float32, device=DEVICE)

# Mixed-precision capability probe:
#   bf16 is the preferred reduced precision when the GPU reports support;
#   fp16 is the documented fallback, but the notebook's notes flag it as
#   unsafe for DeBERTa v3 (fp16 gradient error).
_BF16_OK = False
if torch.cuda.is_available():
    _BF16_OK = torch.cuda.is_bf16_supported()

print(f"Using device    : {DEVICE}")
print(f"bf16 supported  : {_BF16_OK}")
print(f"Class weights   : {CLASS_WEIGHTS}  (device: {CLASS_WEIGHTS.device})")

Using device    : cuda
bf16 supported  : True
Class weights   : tensor([1., 9.], device='cuda:0')  (device: cuda:0)


## 3. Load & Preprocess Dataset

Binary labels as per the paper: labels 0-1 → **Non-PCL (0)**, labels 2-4 → **PCL (1)**.

We split 80/10/10 into train / val / test. The test set is held out entirely until final evaluation.

In [3]:
def load_data(path="dontpatronizeme_pcl.tsv"):
    """Load the Don't Patronize Me PCL dataset and binarise its labels.

    Args:
        path: Location of the tab-separated PCL file. Defaults to
            ``"dontpatronizeme_pcl.tsv"`` in the working directory, which is
            what the notebook used before this parameter existed.

    Returns:
        pandas.DataFrame with the columns ``par_id``, ``art_id``, ``keyword``,
        ``country_code``, ``text``, ``label`` (cast to int) and an added
        ``binary_label`` column (0 for labels 0-1, 1 for labels 2-4).
        Rows with a missing ``text`` or ``label`` are dropped.

    Side effects:
        Prints the sample count, the binary label distribution and the
        Non-PCL : PCL imbalance ratio.
    """
    pcl_columns = ["par_id", "art_id", "keyword", "country_code", "text", "label"]
    raw = pd.read_csv(
        path,
        sep="\t",
        skiprows=4,             # the first four lines are not data rows (presumably a file preamble)
        names=pcl_columns,
        on_bad_lines="skip",    # malformed rows are dropped silently by the parser
        engine="python",
    )

    # Drop rows with missing text or labels; copy so the column assignments
    # below operate on an independent frame rather than a derived view.
    df = raw.dropna(subset=["text", "label"]).copy()
    df["label"] = df["label"].astype(int)

    # Binary: 0-1 -> Non-PCL (0),  2-4 -> PCL (1)
    df["binary_label"] = (df["label"] >= 2).astype(int)

    print(f"Total samples  : {len(df)}")
    print(f"Label distribution:\n{df['binary_label'].value_counts().rename({0: 'Non-PCL', 1: 'PCL'})}")
    print(f"Imbalance ratio: {(df['binary_label'] == 0).sum() / (df['binary_label'] == 1).sum():.2f}:1")

    return df


df = load_data()

# 80 / 10 / 10 stratified split: hold out 20 %, then halve the hold-out
# into validation and test. Both splits are stratified on the binary label.
train_df, temp_df = train_test_split(
    df,
    test_size=0.2,
    stratify=df["binary_label"],
    random_state=42,
)
val_df, test_df = train_test_split(
    temp_df,
    test_size=0.5,
    stratify=temp_df["binary_label"],
    random_state=42,
)


def _as_hf_dataset(frame):
    """Wrap the text and binary label columns of ``frame`` in a HuggingFace Dataset."""
    return Dataset.from_dict(
        {"text": frame["text"].tolist(), "label": frame["binary_label"].tolist()}
    )


# Convert each partition to a HuggingFace Dataset
train_dataset = _as_hf_dataset(train_df)
val_dataset = _as_hf_dataset(val_df)
test_dataset = _as_hf_dataset(test_df)

print(f"\nSplit sizes ‚Äî train: {len(train_dataset)}, val: {len(val_dataset)}, test: {len(test_dataset)}")

Total samples  : 10468
Label distribution:
binary_label
Non-PCL    9475
PCL         993
Name: count, dtype: int64
Imbalance ratio: 9.54:1

Split sizes ‚Äî train: 8374, val: 1047, test: 1047


## 4. Model Definitions & Tokenisation

We define:
- **Model catalogue** — three transformer architectures
- **`WeightedTrainer`** — custom Trainer that uses class-weighted CrossEntropyLoss. The default class-weight tensor is created on the target device up front, so the `.to(...)` call inside `compute_loss` is a no-op rather than a per-step transfer.
- **`compute_metrics`** — accuracy, precision, recall, F1
- Per-model tokenisation functions

In [4]:
# Display name -> Hugging Face checkpoint id for each ensemble member.
# Insertion order is the order in which models are searched, trained and evaluated.
MODEL_CATALOGUE = dict(
    RoBERTa="FacebookAI/roberta-base",
    DistilBERT="distilbert-base-uncased",
    DeBERTa="microsoft/deberta-v3-base",
)

# Truncation length in subword tokens.
# EDA: median 42 word tokens, 95th pct ~105; subword inflation ~1.3x -> 128 is safe
MAX_LENGTH = 128


# ---------------------------------------------------------------------------
# Mixed-precision helper — decides fp16 vs bf16 per model
# ---------------------------------------------------------------------------
def get_mixed_precision_flags(model_name: str):
    """Return the ``(fp16, bf16)`` flags to pass to ``TrainingArguments``.

    Mixed precision is currently switched off: the function returns
    ``(False, False)`` for every model, so training runs in full precision
    and ``model_name`` is not consulted.

    The policy that was previously implemented here is kept below as a
    disabled reference:
      * bf16 for all models when the GPU reports bf16 support;
      * otherwise fp32 for DeBERTa v3 (noted as producing gradient-unscale
        errors under fp16);
      * otherwise fp16 for the remaining models when CUDA is available.

    Args:
        model_name: Catalogue key of the model (e.g. ``"DeBERTa"``).

    Returns:
        Tuple ``(use_fp16, use_bf16)`` of booleans.
    """
    # Disabled policy (reference only):
    #   if _BF16_OK:
    #       return False, True          # bf16 for everything
    #   if model_name == "DeBERTa":
    #       return False, False         # fp32 fallback (fp16 is unsafe)
    #   if torch.cuda.is_available():
    #       return True, False          # fp16 for other models
    use_fp16 = False
    use_bf16 = False
    return use_fp16, use_bf16


# ---------------------------------------------------------------------------
# Weighted Trainer — class weights live on the same device as the model
# ---------------------------------------------------------------------------
class WeightedTrainer(Trainer):
    """Trainer whose training loss is class-weighted cross-entropy.

    The weight for the positive class (index 1) is resolved per step, in
    this order:

    1. ``self.args.class_weight_pos`` if such an attribute exists;
    2. the ``"class_weight_pos"`` parameter of the hyperparameter-search
       trial currently attached to the trainer (``self._trial``);
    3. the ``class_weights`` tensor supplied at construction.

    Step 2 is needed because the Trainer's hp-search setup only copies
    search-space entries onto ``self.args`` when ``TrainingArguments``
    already has a field of that name; ``class_weight_pos`` is not such a
    field, so without this lookup the searched value would never reach the
    loss. NOTE(review): ``_trial`` is a private Trainer attribute - confirm
    it is still populated by the installed transformers version.

    Args:
        class_weights: 1-D tensor with one weight per class, used when no
            per-trial override is available.
        *args, **kwargs: Forwarded unchanged to ``Trainer.__init__``.
    """

    def __init__(self, class_weights: torch.Tensor, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self._class_weights = class_weights

    def _resolve_pos_weight(self):
        """Return the per-trial positive-class weight as a float, or None if unset."""
        pos_w = getattr(self.args, "class_weight_pos", None)
        if pos_w is None:
            trial = getattr(self, "_trial", None)
            if trial is not None:
                # Optuna trials expose suggested values via `.params`;
                # dict-based backends pass the parameter dict itself.
                params = trial if isinstance(trial, dict) else getattr(trial, "params", None)
                if params:
                    pos_w = params.get("class_weight_pos")
        return None if pos_w is None else float(pos_w)

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute class-weighted cross-entropy for one batch.

        Args:
            model: The model being trained; called as ``model(**inputs)``.
            inputs: Batch mapping; its ``"labels"`` entry is popped and used
                as the loss target, the rest is passed to the model.
            return_outputs: When true, return ``(loss, outputs)``.
            **kwargs: Extra keyword arguments passed by the Trainer; ignored.

        Returns:
            The scalar loss, or ``(loss, outputs)`` if ``return_outputs``.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits

        # A NaN/inf guard on the logits was tried at this point and is
        # currently disabled.

        pos_w = self._resolve_pos_weight()
        if pos_w is not None:
            weights = torch.tensor([1.0, pos_w], dtype=logits.dtype, device=logits.device)
        else:
            # No-op when the stored tensor already matches dtype and device.
            weights = self._class_weights.to(dtype=logits.dtype, device=logits.device)

        loss_fn = nn.CrossEntropyLoss(weight=weights, reduction="mean")
        loss = loss_fn(logits, labels)
        return (loss, outputs) if return_outputs else loss


# ---------------------------------------------------------------------------
# Metrics
# ---------------------------------------------------------------------------
def compute_metrics(eval_pred):
    """Score predictions for the positive class (PCL, label 1).

    Args:
        eval_pred: Pair ``(logits, labels)``; the predicted class is the
            arg-max of ``logits`` over the last axis.

    Returns:
        Dict with ``accuracy``, ``precision``, ``recall`` and ``f1``.
        Precision, recall and F1 are binary metrics for label 1 and are
        reported as 0 when undefined.
    """
    logits, labels = eval_pred
    predicted = np.argmax(logits, axis=-1)
    prf = precision_recall_fscore_support(
        labels,
        predicted,
        average="binary",
        pos_label=1,
        zero_division=0,
    )
    scores = {"accuracy": accuracy_score(labels, predicted)}
    scores["precision"] = prf[0]
    scores["recall"] = prf[1]
    scores["f1"] = prf[2]
    return scores


# ---------------------------------------------------------------------------
# Tokenisers — NO padding here; DataCollatorWithPadding pads per-batch
# (median text ~42 tokens → dynamic padding is ~2x faster than pad-to-128)
# ---------------------------------------------------------------------------
# One tokeniser per catalogue entry, keyed by the model's display name.
tokenisers = {}
for name, path in MODEL_CATALOGUE.items():
    loaded_tokeniser = AutoTokenizer.from_pretrained(path)
    tokenisers[name] = loaded_tokeniser
    print(f"Loaded tokeniser for {name}")


def tokenize_dataset(dataset, tokenizer):
    """Tokenise the ``text`` column of a HuggingFace Dataset.

    Sequences are truncated to ``MAX_LENGTH`` tokens and left unpadded;
    padding is expected to happen later, per batch.

    Args:
        dataset: Dataset exposing a ``text`` column and a ``map`` method.
        tokenizer: Callable tokeniser applied to batches of texts.

    Returns:
        The result of ``dataset.map`` with the tokeniser outputs added.
    """
    def encode_batch(batch):
        texts = batch["text"]
        return tokenizer(texts, truncation=True, max_length=MAX_LENGTH)

    tokenised = dataset.map(encode_batch, batched=True)
    return tokenised



Loaded tokeniser for RoBERTa
Loaded tokeniser for DistilBERT
Loaded tokeniser for DeBERTa


## 5. Bayesian Hyperparameter Optimisation (Optuna)

For each model we run `trainer.hyperparameter_search` with an Optuna backend. This performs **Bayesian optimisation** (Tree-structured Parzen Estimator by default) over learning rate, weight decay, batch size, and **class weight for PCL** (searched over 3–8, i.e. below the ~9.5:1 natural ratio). The number of epochs is fixed at 4 rather than searched.

Key design decisions:
- **`model_init`** function (not a pre-built model) so Trainer can reinitialise fresh weights each trial
- **`class_weight_pos`** in the search space ‚Äî the most impactful knob for imbalanced classification
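- **Dynamic padding** (`DataCollatorWithPadding`) — pads per-batch instead of to `MAX_LENGTH`, ~2× faster
- **Mixed precision** is currently disabled for every model (`get_mixed_precision_flags` returns fp32). The intended policy is `bf16` for **DeBERTa v3** (or fp32 fallback) instead of `fp16`, which causes gradient unscale errors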
- `direction="maximize"` because we optimise F1

Set `USE_PREVIOUS_HPARAMS` per-model in the cell below to **load saved results** from `best_hparams.json` or **re-run** the Optuna search (which overwrites the file on completion).

In [10]:
# -- Toggle: reuse saved hyperparameters or re-run the Optuna search --------
# Per model: True  = load from best_hparams.json (fast, no GPU needed)
#            False = run the Bayesian HP search with Optuna (overwrites file)

HPARAMS_FILE = Path("best_hparams.json")

USE_PREVIOUS_HPARAMS = dict(
    RoBERTa=True,
    DistilBERT=True,
    DeBERTa=False,
)

N_TRIALS = 10


# Registry of previously found hyperparameters; empty when no file exists yet.
if HPARAMS_FILE.exists():
    with open(HPARAMS_FILE) as registry_file:
        hparams_registry = json.load(registry_file)
else:
    hparams_registry = {}

In [11]:
def optuna_hp_space(trial):
    """Define the Optuna search space for one trial.

    Sampled from ``trial``:
      * ``learning_rate``    - log-uniform in [5e-6, 1e-5]
      * ``weight_decay``     - uniform in [0.0, 0.2]
      * ``batch_size``       - 16 or 32, returned under the key
                               ``per_device_train_batch_size``
      * ``class_weight_pos`` - uniform in [3.0, 8.0]

    Held constant:
      * ``num_train_epochs`` = 4

    NOTE(review): ``class_weight_pos`` is not a ``TrainingArguments`` field;
    confirm the trainer in use actually consumes it.

    Args:
        trial: Object exposing ``suggest_float`` and ``suggest_categorical``.

    Returns:
        Dict mapping training-argument names to this trial's values.
    """
    space = {}
    space["learning_rate"] = trial.suggest_float("learning_rate", 5e-6, 1e-5, log=True)
    space["weight_decay"] = trial.suggest_float("weight_decay", 0.0, 0.2)
    space["per_device_train_batch_size"] = trial.suggest_categorical("batch_size", [16, 32])
    space["class_weight_pos"] = trial.suggest_float("class_weight_pos", 3.0, 8.0)
    space["num_train_epochs"] = 4  # fixed; previously a candidate for search over [3, 5]
    return space


# Best hyperparameters per model, as plain dicts keyed by Optuna parameter name.
best_hparams = {}

for name, model_path in MODEL_CATALOGUE.items():
    should_reuse = USE_PREVIOUS_HPARAMS.get(name, False)

    # CASE 1: LOAD SAVED PARAMS
    if should_reuse and name in hparams_registry:
        print(f"üìÇ {name:12s} | Loading saved (F1: {hparams_registry[name]['objective']:.4f})")
        best_hparams[name] = hparams_registry[name]["hyperparameters"]
        continue
    if should_reuse:
        # Reuse was requested but nothing is stored for this model; make the
        # fall-through to a (slow) search visible instead of silent.
        print(f"{name}: no saved hyperparameters in {HPARAMS_FILE}; running search instead")

    # CASE 2: RUN SEARCH
    print(f"\n{'='*40}\nüîç Searching: {name}\n{'='*40}")

    tokenizer = tokenisers[name]
    use_fp16, use_bf16 = get_mixed_precision_flags(name)

    training_args = TrainingArguments(
        output_dir=f"./results/{name}",
        eval_strategy="epoch",
        save_strategy="no",
        metric_for_best_model="f1",
        fp16=use_fp16,
        bf16=use_bf16,
        report_to="none",
        disable_tqdm=True,  # Cleaner logs during HP search
    )

    trainer = WeightedTrainer(
        # Must stay a zero-argument callable: it is invoked during this same
        # loop iteration, so closing over `model_path` is safe here.
        model_init=lambda: AutoModelForSequenceClassification.from_pretrained(model_path, num_labels=NUM_LABELS),
        args=training_args,
        train_dataset=tokenize_dataset(train_dataset, tokenizer),
        eval_dataset=tokenize_dataset(val_dataset, tokenizer),
        compute_metrics=compute_metrics,
        data_collator=DataCollatorWithPadding(tokenizer),
        class_weights=CLASS_WEIGHTS,
    )

    best_run = trainer.hyperparameter_search(
        direction="maximize",
        backend="optuna",
        hp_space=optuna_hp_space,
        n_trials=N_TRIALS,
        pruner=MedianPruner(n_startup_trials=2),
        compute_objective=lambda m: m["eval_f1"],
    )

    # Store results
    best_hparams[name] = best_run.hyperparameters
    hparams_registry[name] = {
        "objective": best_run.objective,
        "hyperparameters": best_run.hyperparameters,
    }

    # Persist immediately so a later interruption (e.g. during the next
    # model's search) does not discard this model's completed search.
    with open(HPARAMS_FILE, "w") as f:
        json.dump(hparams_registry, f, indent=4)

    # Clean up GPU: drop Python references first so the cached blocks are
    # actually free by the time the CUDA cache is emptied.
    del trainer
    gc.collect()
    torch.cuda.empty_cache()

# ---------------------------------------------------------------------------
# Final Save
# ---------------------------------------------------------------------------
with open(HPARAMS_FILE, "w") as f:
    json.dump(hparams_registry, f, indent=4)

print(f"\n‚úÖ All hyperparameters synced to {HPARAMS_FILE}")

üìÇ RoBERTa      | Loading saved (F1: 0.4789)
üìÇ DistilBERT   | Loading saved (F1: 0.3947)

üîç Searching: DeBERTa


Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 8374/8374 [00:00<00:00, 10986.59 examples/s]
Map: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 1047/1047 [00:00<00:00, 14881.13 examples/s]
Loading weights: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 198/198 [00:00<00:00, 1101.52it/s, Materializing param=deberta.encoder.rel_embeddings.weight]                     
[1mDebertaV2ForSequenceClassification LOAD REPORT[0m from: microsoft/deberta-v3-base
Key                                     | Status     | 
----------------------------------------+------------+-
lm_predictions.lm_head.dense.bias       | UNEXPECTED | 
mask_predictions.LayerNorm.weight       | UNEXPECTED | 
mask_predictions.LayerNorm.bias         | UNEXPECTED | 
lm_predictions.lm_head.LayerNorm.weight | UNEXPECTED | 
lm_predictions.lm_head.dense.weight     | UNEXPECTED | 
mask_predictions.classifier.bias        | UNEXPECTED | 
mask_predictions.classifier.weight      | UNEXPECTED | 
lm_predictions.lm_head.bias             | UNEXPECTED | 
lm_predict

KeyboardInterrupt: 

## 6. Train Each Model with Best Hyperparameters

Re-train each model from scratch using the best hyperparameters found above.

In [None]:
trainers = {}        # keep trainers around for prediction
trained_models = {}  # fine-tuned models, keyed by catalogue name

for name, model_path in MODEL_CATALOGUE.items():

    print(f"\n{'='*60}")
    print(f"  Final training: {name}")
    print(f"{'='*60}")

    # best_hparams holds plain dicts (loaded from the JSON registry or taken
    # from the search result's `.hyperparameters`); also tolerate an object
    # that still carries a `.hyperparameters` attribute.
    best = best_hparams[name]
    hp = getattr(best, "hyperparameters", best)

    tokenizer = tokenisers[name]
    train_tok = tokenize_dataset(train_dataset, tokenizer)
    val_tok   = tokenize_dataset(val_dataset, tokenizer)

    # Build fresh model with best HPs
    model = AutoModelForSequenceClassification.from_pretrained(
        model_path, num_labels=NUM_LABELS
    )

    # Verify the model is on the expected device (Trainer will move it, but let's show it)
    print(f"  Model device before Trainer: {next(model.parameters()).device}")

    # Precision flags come from the shared helper so search and final training agree
    use_fp16, use_bf16 = get_mixed_precision_flags(name)
    print(f"  Mixed precision ‚Äî fp16: {use_fp16}, bf16: {use_bf16}")

    # Apply best batch size from hp search. The search records it under the
    # Optuna name "batch_size"; accept the TrainingArguments name as well
    # and default to 32 when neither is present.
    batch_size = int(hp.get("batch_size", hp.get("per_device_train_batch_size", 32)))

    training_args = TrainingArguments(
        output_dir=f"./results/{name}_final",
        num_train_epochs=hp.get("num_train_epochs", 4),
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        learning_rate=hp.get("learning_rate", 2e-5),
        weight_decay=hp.get("weight_decay", 0.01),
        eval_strategy="epoch",
        save_strategy="epoch",
        save_total_limit=1,
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        logging_steps=50,
        fp16=use_fp16,
        bf16=use_bf16,
        warmup_ratio=0.1,
        max_grad_norm=1.0,
        dataloader_num_workers=2,
        report_to="none",
    )

    # Apply best class weight from hp search (default 9.0 when not searched)
    class_w = float(hp.get("class_weight_pos", 9.0))
    final_weights = torch.tensor([1.0, class_w], dtype=torch.float32).to(DEVICE)

    print(f"  Batch size (train & eval)  : {batch_size}")
    print(f"  Class weight (PCL)        : {class_w:.2f}")

    trainer = WeightedTrainer(
        class_weights=final_weights,
        model=model,
        args=training_args,
        train_dataset=train_tok,
        eval_dataset=val_tok,
        compute_metrics=compute_metrics,
        processing_class=tokenizer,
        data_collator=DataCollatorWithPadding(tokenizer),
    )

    trainer.train()

    # Show where the model's parameters live once training has finished
    print(f"  Model device after Trainer : {next(model.parameters()).device}")

    trained_models[name] = model
    trainers[name] = trainer
    print(f"‚úì {name} final training complete.")

## 7. Per-Model Evaluation — Results & Confusion Matrices

Evaluate each model individually on the **test set**, print classification reports, and plot confusion matrices.

In [None]:
per_model_preds = {}  # {name: np.array of predictions on test set}

# (normalize argument, cell format, title suffix) for the two panels per model
_CM_PANELS = (
    (None, None, "Counts"),
    ("true", ".2%", "Normalised"),
)

for name in MODEL_CATALOGUE:
    banner = "=" * 60
    print(f"\n{banner}")
    print(f"  Test Evaluation: {name}")
    print(f"{banner}")

    # Tokenise the held-out test set with this model's own tokeniser
    test_tok = tokenize_dataset(test_dataset, tokenisers[name])

    # Predict on test set
    predictions = trainers[name].predict(test_tok)
    labels = predictions.label_ids
    preds = np.argmax(predictions.predictions, axis=-1)
    per_model_preds[name] = preds

    # Classification report
    print(f"\n{name} ‚Äî Classification Report:")
    print(classification_report(labels, preds, target_names=LABEL_NAMES, digits=4))

    # Confusion matrices: raw counts on the left, row-normalised on the right
    fig, axes = plt.subplots(1, 2, figsize=(12, 4))
    for ax, (norm_mode, cell_fmt, suffix) in zip(axes, _CM_PANELS):
        matrix = confusion_matrix(labels, preds, normalize=norm_mode)
        ConfusionMatrixDisplay(matrix, display_labels=LABEL_NAMES).plot(
            ax=ax, cmap="Blues", colorbar=False, values_format=cell_fmt
        )
        ax.set_title(f"{name} ‚Äî {suffix}")

    plt.tight_layout()
    plt.show()

## 8. Overall Ensemble — Majority Vote, Results & Confusion Matrix

Each of the 3 models votes; a sample is classified as **PCL** if **2 or more** models agree.

In [None]:
# Majority vote: PCL (1) when strictly more than half of the models predict
# PCL. With the three catalogue models this is "2 or more out of 3".
votes = np.stack(list(per_model_preds.values()), axis=0)  # (n_models, n_test)
n_models = votes.shape[0]
pcl_votes = votes.sum(axis=0)
ensemble_preds = (pcl_votes > n_models / 2).astype(int)
true_labels = np.array(test_dataset["label"])

# ---------------------------------------------------------------------------
# Overall classification report
# ---------------------------------------------------------------------------
print("=" * 60)
print("  ENSEMBLE (Majority Vote) ‚Äî Test Set Results")
print("=" * 60)
print(classification_report(true_labels, ensemble_preds, target_names=LABEL_NAMES, digits=4))

# Per-model vs ensemble summary table.
# zero_division=0 matches compute_metrics, so a model that never predicts PCL
# scores 0 instead of emitting an undefined-metric warning.
rows = []
for name, preds in per_model_preds.items():
    p, r, f1, _ = precision_recall_fscore_support(
        true_labels, preds, average="binary", pos_label=1, zero_division=0
    )
    acc = accuracy_score(true_labels, preds)
    rows.append({"Model": name, "Accuracy": acc, "Precision": p, "Recall": r, "F1": f1})

p, r, f1, _ = precision_recall_fscore_support(
    true_labels, ensemble_preds, average="binary", pos_label=1, zero_division=0
)
acc = accuracy_score(true_labels, ensemble_preds)
rows.append({"Model": "ENSEMBLE", "Accuracy": acc, "Precision": p, "Recall": r, "F1": f1})

summary_df = pd.DataFrame(rows).set_index("Model")
print("\nSummary comparison:")
display(summary_df.style.format("{:.4f}").highlight_max(axis=0, color="lightgreen"))

# ---------------------------------------------------------------------------
# Confusion matrices - ensemble (raw counts, then row-normalised)
# ---------------------------------------------------------------------------
fig, axes = plt.subplots(1, 2, figsize=(12, 4))

cm = confusion_matrix(true_labels, ensemble_preds)
ConfusionMatrixDisplay(cm, display_labels=LABEL_NAMES).plot(
    ax=axes[0], cmap="Oranges", colorbar=False
)
axes[0].set_title("Ensemble ‚Äî Counts")

cm_norm = confusion_matrix(true_labels, ensemble_preds, normalize="true")
ConfusionMatrixDisplay(cm_norm, display_labels=LABEL_NAMES).plot(
    ax=axes[1], cmap="Oranges", colorbar=False, values_format=".2%"
)
axes[1].set_title("Ensemble ‚Äî Normalised")

plt.tight_layout()
plt.show()

# ---------------------------------------------------------------------------
# Voting agreement breakdown: how many samples received k PCL votes
# ---------------------------------------------------------------------------
print("\nPer-sample voting agreement:")
agreement = pcl_votes
for v in range(n_models + 1):
    count = (agreement == v).sum()
    print(f"  {v}/{n_models} models predict PCL: {count} samples ({count/len(agreement)*100:.1f}%)")