# Ensemble Voting Model — Don't Patronize Me!

**Binary PCL classification** using RoBERTa, DistilBERT, and DeBERTa with majority-vote ensemble.

## 1. Imports

In [1]:
from transformers import (
    AutoModelForSequenceClassification,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
)
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
from datasets import Dataset
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score,
    precision_recall_fscore_support,
    confusion_matrix,
    classification_report,
    ConfusionMatrixDisplay,
)
import matplotlib.pyplot as plt
import optuna

# Report the runtime environment so results can be tied to a PyTorch build / GPU.
print(f"PyTorch version : {torch.__version__}")
print(f"CUDA available  : {torch.cuda.is_available()}")
if torch.cuda.is_available():
    # Index 0 is the first CUDA device visible to this process.
    print(f"GPU device      : {torch.cuda.get_device_name(0)}")
    # The device-properties object reports capacity in bytes as `total_memory`
    # (there is no `total_mem` attribute, so the old spelling raised AttributeError).
    print(f"GPU memory      : {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")

PyTorch version : 2.9.0+cu128
CUDA available  : False


## 2. Device Setup

In [None]:
# Run on the GPU when PyTorch can see one; otherwise fall back to the CPU.
DEVICE = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Binary task: class index 0 is Non-PCL, class index 1 is PCL.
LABEL_NAMES = ["Non-PCL", "PCL"]
NUM_LABELS = 2

# Per-class loss weights (indexed like LABEL_NAMES) that up-weight the PCL
# class to counter the ~9.5:1 (Non-PCL : PCL) imbalance.
# The tensor is allocated directly on DEVICE so the loss computation does not
# need a host-to-device copy when training runs on that same device.
CLASS_WEIGHTS = torch.tensor([1.0, 5.0], dtype=torch.float32, device=DEVICE)

print(f"Using device    : {DEVICE}")
print(f"Class weights   : {CLASS_WEIGHTS}  (device: {CLASS_WEIGHTS.device})")

## 3. Load & Preprocess Dataset

Binary labels as per the paper: labels 0-1 → **Non-PCL (0)**, labels 2-4 → **PCL (1)**.

We split 80/10/10 into train / val / test. The test set is held out entirely until final evaluation.

In [None]:
def load_data(path: str = "dontpatronizeme_pcl.tsv") -> pd.DataFrame:
    """Load the Don't Patronize Me PCL dataset and binarise its labels.

    Args:
        path: Location of the tab-separated PCL file. Defaults to the file
            name in the current working directory, so existing
            ``load_data()`` calls behave exactly as before.

    Returns:
        A DataFrame with the six raw columns (``par_id``, ``art_id``,
        ``keyword``, ``country_code``, ``text``, ``label``) plus
        ``binary_label`` (0 = Non-PCL for labels 0-1, 1 = PCL for labels 2-4).
        Rows missing ``text`` or ``label`` are removed.

    NOTE(review): malformed rows are dropped silently (``on_bad_lines="skip"``)
    and pandas' default quote handling is in effect; compare the printed
    total against the expected dataset size to confirm nothing was lost.
    """
    pcl_columns = ["par_id", "art_id", "keyword", "country_code", "text", "label"]
    df = pd.read_csv(
        path,
        sep="\t",
        skiprows=4,  # first four lines are skipped — presumably the file preamble; TODO confirm
        names=pcl_columns,
        on_bad_lines="skip",
        engine="python",
    )

    # Drop rows with missing text or labels before casting labels to int
    # (NaN cannot be represented in an integer column).
    df = df.dropna(subset=["text", "label"])
    df["label"] = df["label"].astype(int)

    # Binary: 0-1 → Non-PCL (0),  2-4 → PCL (1)
    df["binary_label"] = (df["label"] >= 2).astype(int)

    print(f"Total samples  : {len(df)}")
    print(f"Label distribution:\n{df['binary_label'].value_counts().rename({0: 'Non-PCL', 1: 'PCL'})}")
    print(f"Imbalance ratio: {(df['binary_label'] == 0).sum() / (df['binary_label'] == 1).sum():.2f}:1")

    return df


df = load_data()

# Stratified 80 / 10 / 10 split: hold out 20 % first, then halve that
# hold-out into validation and test. Both steps stratify on the binary label
# and share one random_state so the partition is repeatable.
train_df, temp_df = train_test_split(
    df,
    test_size=0.2,
    stratify=df["binary_label"],
    random_state=42,
)
val_df, test_df = train_test_split(
    temp_df,
    test_size=0.5,
    stratify=temp_df["binary_label"],
    random_state=42,
)

# Wrap each partition as a HuggingFace Dataset exposing only the raw text
# and the binary target (renamed to "label").
train_dataset, val_dataset, test_dataset = (
    Dataset.from_dict(
        {"text": part["text"].tolist(), "label": part["binary_label"].tolist()}
    )
    for part in (train_df, val_df, test_df)
)

print(f"\nSplit sizes — train: {len(train_dataset)}, val: {len(val_dataset)}, test: {len(test_dataset)}")

## 4. Model Definitions & Tokenisation

We define:
- **Model catalogue** — three transformer architectures
- **`WeightedTrainer`** — custom Trainer that uses class-weighted CrossEntropyLoss. The class-weight tensor is created on the target device up front (Section 2) and aligned with the logits' device inside `compute_loss`, so no per-step host-to-device copy is needed.
- **`compute_metrics`** — accuracy, precision, recall, F1
- Per-model tokenisation functions

In [None]:
# Display name → Hugging Face Hub checkpoint for each ensemble member.
# Insertion order is the order in which models are tuned, trained and evaluated.
MODEL_CATALOGUE = dict(
    RoBERTa="FacebookAI/roberta-base",
    DistilBERT="distilbert-base-uncased",
    DeBERTa="microsoft/deberta-v3-base",
)

# Token budget per example. EDA: median 42 word tokens, 95th pct ~105;
# subword inflation ~1.3x → 128 is safe.
MAX_LENGTH = 128


# ---------------------------------------------------------------------------
# Weighted Trainer — class weights live on the same device as the model
# ---------------------------------------------------------------------------
class WeightedTrainer(Trainer):
    """Trainer that applies per-class weights to the cross-entropy loss.

    The weight tensor is kept on the instance. The first time its device
    differs from the logits' device it is moved there and the moved copy is
    cached, so later steps reuse it instead of transferring it again.

    Args:
        class_weights: 1-D tensor with one weight per class index.
        *args, **kwargs: Forwarded unchanged to ``Trainer.__init__``.
    """

    def __init__(self, class_weights: torch.Tensor, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Device placement is resolved lazily in compute_loss, where the
        # logits tell us which device the loss is actually computed on.
        self._class_weights = class_weights

    def compute_loss(self, model, inputs, return_outputs=False, **kwargs):
        """Compute class-weighted cross-entropy for one batch.

        Args:
            model: The model being trained or evaluated.
            inputs: Batch dict; its ``"labels"`` entry is removed before the
                forward pass so the model does not compute its own loss.
            return_outputs: When true, also return the model outputs.
            **kwargs: Accepted for signature compatibility and ignored here.

        Returns:
            ``loss`` or ``(loss, outputs)`` depending on ``return_outputs``.
        """
        labels = inputs.pop("labels")
        outputs = model(**inputs)
        logits = outputs.logits
        # Align the weights with the logits once; afterwards this is a cheap
        # device comparison rather than a tensor transfer on every step.
        if self._class_weights.device != logits.device:
            self._class_weights = self._class_weights.to(logits.device)
        loss = nn.functional.cross_entropy(logits, labels, weight=self._class_weights)
        return (loss, outputs) if return_outputs else loss


# ---------------------------------------------------------------------------
# Metrics
# ---------------------------------------------------------------------------
def compute_metrics(eval_pred):
    """Score predictions with accuracy plus PCL-class precision, recall and F1.

    Args:
        eval_pred: Pair of ``(logits, labels)``; the predicted class is the
            argmax over the last axis of the logits.

    Returns:
        Dict with keys ``accuracy``, ``precision``, ``recall`` and ``f1``,
        where the latter three treat label 1 (PCL) as the positive class.
    """
    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=-1)
    # Index 3 of the result (support) is not reported.
    prf = precision_recall_fscore_support(
        gold, predicted, average="binary", pos_label=1
    )
    return {
        "accuracy": accuracy_score(gold, predicted),
        "precision": prf[0],
        "recall": prf[1],
        "f1": prf[2],
    }


# ---------------------------------------------------------------------------
# Tokenisers
# ---------------------------------------------------------------------------
# One tokenizer per catalogue entry, keyed by the model's display name.
tokenisers = {}
for model_name, checkpoint in MODEL_CATALOGUE.items():
    loaded = AutoTokenizer.from_pretrained(checkpoint)
    tokenisers[model_name] = loaded
    print(f"Loaded tokeniser for {model_name}")


def tokenize_dataset(dataset, tokenizer):
    """Return `dataset` with tokenizer outputs added for its ``text`` column.

    Every example is truncated and padded to exactly ``MAX_LENGTH`` tokens.
    The mapping runs in batched mode.
    """
    encode_options = {
        "truncation": True,
        "padding": "max_length",
        "max_length": MAX_LENGTH,
    }

    def _encode_batch(batch):
        return tokenizer(batch["text"], **encode_options)

    return dataset.map(_encode_batch, batched=True)

## 5. Bayesian Hyperparameter Optimisation (Optuna)

For each model we run `trainer.hyperparameter_search` with an Optuna backend. This performs **Bayesian optimisation** (Tree-structured Parzen Estimator by default) over learning rate, number of epochs, batch size, and weight decay.

Key requirements:
- A **`model_init`** function (not a pre-built model) so Trainer can reinitialise fresh weights each trial
- An **`hp_space`** function defining the search ranges
- `direction="maximize"` because we optimise F1

In [None]:
def optuna_hp_space(trial):
    """Sample one hyperparameter configuration from the search space.

    Args:
        trial: Object providing ``suggest_float``, ``suggest_int`` and
            ``suggest_categorical`` (an Optuna trial in this notebook).

    Returns:
        Dict keyed by the training-argument names being tuned.
    """
    # Learning rate is sampled on a log scale; the rest are sampled uniformly.
    lr = trial.suggest_float("learning_rate", 1e-5, 5e-5, log=True)
    epochs = trial.suggest_int("num_train_epochs", 2, 5)
    batch_size = trial.suggest_categorical("per_device_train_batch_size", [8, 16, 32])
    decay = trial.suggest_float("weight_decay", 0.0, 0.3)
    return {
        "learning_rate": lr,
        "num_train_epochs": epochs,
        "per_device_train_batch_size": batch_size,
        "weight_decay": decay,
    }


N_TRIALS = 10  # Increase for better search (20-30+), reduce if GPU-constrained
HPO_SEED = 42  # Same seed as the data splits, so the search itself is repeatable

best_hparams = {}   # {model_name: BestRun}
trained_models = {} # {model_name: fine-tuned model}


def make_model_init(path):
    """Return a zero-argument factory that loads a fresh classifier from `path`.

    Trainer needs a factory (not a pre-built model) so it can start every
    trial from newly loaded weights. Binding `path` here keeps each factory
    tied to its own checkpoint.
    """
    def model_init():
        return AutoModelForSequenceClassification.from_pretrained(
            path, num_labels=NUM_LABELS
        )
    return model_init


for name, model_path in MODEL_CATALOGUE.items():

    print(f"\n{'='*60}")
    print(f"  Hyperparameter search for {name}")
    print(f"{'='*60}")

    # Each architecture has its own vocabulary, so the splits are
    # re-tokenised per model.
    tokenizer = tokenisers[name]
    train_tok = tokenize_dataset(train_dataset, tokenizer)
    val_tok   = tokenize_dataset(val_dataset, tokenizer)

    # Learning rate, epochs, train batch size and weight decay are left at
    # their defaults here; the search overrides them per trial.
    training_args = TrainingArguments(
        output_dir=f"./results/{name}",
        eval_strategy="epoch",
        save_strategy="epoch",
        save_total_limit=1,
        load_best_model_at_end=True,
        metric_for_best_model="f1",
        logging_steps=50,
        fp16=torch.cuda.is_available(),  # mixed precision if GPU available
        report_to="none",                # disable W&B / MLflow
    )

    trainer = WeightedTrainer(
        class_weights=CLASS_WEIGHTS,
        model_init=make_model_init(model_path),
        args=training_args,
        train_dataset=train_tok,
        eval_dataset=val_tok,
        compute_metrics=compute_metrics,
        processing_class=tokenizer,
    )

    best_run = trainer.hyperparameter_search(
        direction="maximize",
        backend="optuna",
        hp_space=optuna_hp_space,
        n_trials=N_TRIALS,
        # Objective is the validation F1 reported by compute_metrics.
        compute_objective=lambda metrics: metrics["eval_f1"],
        # Extra keyword arguments are handed to the Optuna study; a seeded
        # TPE sampler makes the sequence of sampled configurations repeatable.
        sampler=optuna.samplers.TPESampler(seed=HPO_SEED),
    )

    best_hparams[name] = best_run
    print(f"\n✓ {name} best trial — F1: {best_run.objective:.4f}")
    print(f"  Hyperparameters: {best_run.hyperparameters}")

## 6. Train Each Model with Best Hyperparameters

Fine-tune a fresh copy of each pretrained checkpoint using the best hyperparameters found above.

In [None]:
trainers = {}  # {model_name: fitted WeightedTrainer}, reused later for prediction

for name, model_path in MODEL_CATALOGUE.items():

    rule = "=" * 60
    print(f"\n{rule}")
    print(f"  Final training: {name}")
    print(f"{rule}")

    # Winning configuration from the search; any missing key falls back to
    # the default given below.
    hp = best_hparams[name].hyperparameters
    tuned_args = {
        "num_train_epochs": hp.get("num_train_epochs", 3),
        "per_device_train_batch_size": hp.get("per_device_train_batch_size", 16),
        "learning_rate": hp.get("learning_rate", 2e-5),
        "weight_decay": hp.get("weight_decay", 0.01),
    }

    tokenizer = tokenisers[name]
    train_tok, val_tok = (
        tokenize_dataset(split, tokenizer) for split in (train_dataset, val_dataset)
    )

    # Start from a newly loaded pretrained checkpoint.
    model = AutoModelForSequenceClassification.from_pretrained(
        model_path, num_labels=NUM_LABELS
    )

    # Show where the weights live before the Trainer takes over.
    print(f"  Model device before Trainer: {next(model.parameters()).device}")

    training_args = TrainingArguments(
        output_dir=f"./results/{name}_final",
        per_device_eval_batch_size=32,
        eval_strategy="epoch",
        save_strategy="epoch",
        save_total_limit=1,
        load_best_model_at_end=True,   # keep the checkpoint with the best validation F1
        metric_for_best_model="f1",
        logging_steps=50,
        fp16=torch.cuda.is_available(),
        report_to="none",
        **tuned_args,
    )

    trainer = WeightedTrainer(
        class_weights=CLASS_WEIGHTS,
        model=model,
        args=training_args,
        train_dataset=train_tok,
        eval_dataset=val_tok,
        compute_metrics=compute_metrics,
        processing_class=tokenizer,
    )
    trainer.train()

    # Report the device again once training has finished.
    print(f"  Model device after Trainer : {next(model.parameters()).device}")

    trained_models[name] = model
    trainers[name] = trainer
    print(f"✓ {name} final training complete.")

## 7. Per-Model Evaluation — Results & Confusion Matrices

Evaluate each model individually on the **test set**, print classification reports, and plot confusion matrices.

In [None]:
per_model_preds = {}  # {model_name: predicted class per test sample}

for name in MODEL_CATALOGUE:
    rule = "=" * 60
    print(f"\n{rule}")
    print(f"  Test Evaluation: {name}")
    print(f"{rule}")

    # Tokenise the held-out split with this model's own tokenizer and predict.
    test_tok = tokenize_dataset(test_dataset, tokenisers[name])
    predictions = trainers[name].predict(test_tok)
    labels = predictions.label_ids
    preds = np.argmax(predictions.predictions, axis=-1)
    per_model_preds[name] = preds

    # Classification report
    print(f"\n{name} — Classification Report:")
    print(classification_report(labels, preds, target_names=LABEL_NAMES, digits=4))

    # Two confusion-matrix panels: raw counts on the left, row-normalised
    # (per true class) percentages on the right.
    fig, axes = plt.subplots(1, 2, figsize=(12, 4))
    panels = (
        (axes[0], None, None, "Counts"),
        (axes[1], "true", ".2%", "Normalised"),
    )
    for ax, normalize, values_format, title in panels:
        cm = confusion_matrix(labels, preds, normalize=normalize)
        ConfusionMatrixDisplay(cm, display_labels=LABEL_NAMES).plot(
            ax=ax, cmap="Blues", colorbar=False, values_format=values_format
        )
        ax.set_title(f"{name} — {title}")

    plt.tight_layout()
    plt.show()

## 8. Overall Ensemble — Majority Vote, Results & Confusion Matrix

Each of the 3 models votes; a sample is classified as **PCL** if **2 or more** models predict PCL.

In [None]:
# Majority vote: a sample is PCL (1) when strictly more than half of the
# models predict PCL. With the three models used here that means >= 2 votes;
# deriving the threshold from the model count keeps the rule correct if the
# catalogue grows or shrinks.
votes = np.stack(list(per_model_preds.values()), axis=0)  # (n_models, n_test)
n_models = votes.shape[0]
agreement = votes.sum(axis=0)  # number of PCL votes per test sample
ensemble_preds = (agreement > n_models / 2).astype(int)
true_labels = np.array(test_dataset["label"])

# ---------------------------------------------------------------------------
# Overall classification report
# ---------------------------------------------------------------------------
print("=" * 60)
print("  ENSEMBLE (Majority Vote) — Test Set Results")
print("=" * 60)
print(classification_report(true_labels, ensemble_preds, target_names=LABEL_NAMES, digits=4))

# Per-model vs ensemble summary table (metrics for the PCL class)
rows = []
for name, preds in per_model_preds.items():
    p, r, f1, _ = precision_recall_fscore_support(true_labels, preds, average="binary", pos_label=1)
    acc = accuracy_score(true_labels, preds)
    rows.append({"Model": name, "Accuracy": acc, "Precision": p, "Recall": r, "F1": f1})

p, r, f1, _ = precision_recall_fscore_support(true_labels, ensemble_preds, average="binary", pos_label=1)
acc = accuracy_score(true_labels, ensemble_preds)
rows.append({"Model": "ENSEMBLE", "Accuracy": acc, "Precision": p, "Recall": r, "F1": f1})

summary_df = pd.DataFrame(rows).set_index("Model")
print("\nSummary comparison:")
# Highlight the best value in each metric column.
display(summary_df.style.format("{:.4f}").highlight_max(axis=0, color="lightgreen"))

# ---------------------------------------------------------------------------
# Confusion matrices — ensemble
# ---------------------------------------------------------------------------
fig, axes = plt.subplots(1, 2, figsize=(12, 4))

cm = confusion_matrix(true_labels, ensemble_preds)
ConfusionMatrixDisplay(cm, display_labels=LABEL_NAMES).plot(
    ax=axes[0], cmap="Oranges", colorbar=False
)
axes[0].set_title("Ensemble — Counts")

# Row-normalised: each cell is a fraction of its true class.
cm_norm = confusion_matrix(true_labels, ensemble_preds, normalize="true")
ConfusionMatrixDisplay(cm_norm, display_labels=LABEL_NAMES).plot(
    ax=axes[1], cmap="Oranges", colorbar=False, values_format=".2%"
)
axes[1].set_title("Ensemble — Normalised")

plt.tight_layout()
plt.show()

# ---------------------------------------------------------------------------
# Voting agreement breakdown (printed, one line per possible vote count)
# ---------------------------------------------------------------------------
print("\nPer-sample voting agreement:")
for v in range(n_models + 1):
    count = (agreement == v).sum()
    print(f"  {v}/{n_models} models predict PCL: {count} samples ({count/len(agreement)*100:.1f}%)")