# QEvasion – Multi-Model Transformer Comparison (v2)

This notebook extends the **v2** pipeline to test and compare multiple Transformer architectures on the QEvasion dataset.

## Objectives
1.  **Iterate through 3 Models**:
    *   `distilbert-base-uncased` (Baseline)
    *   `bert-base-uncased` 
    *   `albert-base-v2` 
2.  **Train Multi-Task Models**: Jointly predict Clarity (3-way) and Evasion (9-way).
3.  **Visualize Per-Model Performance**: Training curves and Confusion Matrices.
4.  **Final Comparison**: Aggregate results and plot a comparative Bar Chart.

## Setup

In [None]:
# If Colab, run this cell to clone the repo
# Remove any checkout left over from a previous run; `git clone` refuses to
# write into an existing non-empty directory.
!rm -rf political-evasion-classifier
!git clone https://github.com/minh-de-rien/political-evasion-classifier.git
# %cd changes the kernel's working directory, so later relative paths resolve
# inside the fresh clone.
%cd political-evasion-classifier
import sys
# Put the clone at the front of the import path so the `src.*` imports below
# are looked up there first.
# NOTE(review): the absolute path assumes the notebook started in /content — confirm.
sys.path.insert(0, "/content/political-evasion-classifier")

In [1]:
# If local
import os
import sys
from pathlib import Path

# Treat the parent of the current working directory as the project root and
# make it importable (appended once; re-running the cell does not duplicate it).
project_root = Path.cwd().parent
if not any(entry == str(project_root) for entry in sys.path):
    sys.path.append(str(project_root))

In [None]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import matplotlib.pyplot as plt
import seaborn as sns
from transformers import AutoTokenizer

# Import src modules
from src.data import (
    load_qevasion_prepared,
    prepare_task1_data,
    prepare_task2_data,
    CLARITY_LABELS, EVASION_LABELS,
    CLARITY_TO_ID, EVASION_TO_ID,
    ID_TO_CLARITY, ID_TO_EVASION,
    build_text_column, add_label_ids, get_annotator_labels
)
from src.models import MultiTaskTransformer
from src.training import train_model, EarlyStopping, evaluate_multitask
from src.metrics import evaluate_task2_multi_annotator, plot_confusion_matrix

# Prefer the GPU when torch reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Device: {device}")

Device: cpu


## 1. Data Preparation
We load the dataset once. Tokenization happens inside the training loop as it depends on the specific tokenizer.

In [9]:
# Load the prepared dataset once and materialise both splits as pandas frames.
dataset = load_qevasion_prepared()
train_df_raw, test_df_raw = (
    dataset[split_name].to_pandas() for split_name in ("train", "test")
)

# Standard Preprocessing (Text & Labels)
# Note: In multi-task, we want rows that have at least a Clarity label.
# Evasion labels might be NaN for some (handled by mask).

def preprocess_for_multitask(df):
    """Run the two shared preprocessing steps on a raw split.

    Applies ``build_text_column`` and then ``add_label_ids`` (both imported
    from ``src.data``) and returns the result of the second call.

    Args:
        df: Raw split as a pandas DataFrame.

    Returns:
        Whatever ``add_label_ids`` returns — presumably the frame extended
        with the ``text``, ``clarity_id`` and ``evasion_id`` columns read
        later in this notebook (TODO confirm against ``src.data``).
    """
    with_text = build_text_column(df)
    return add_label_ids(with_text)

# Preprocess both splits (train first, then test).
train_full, test_full = (
    preprocess_for_multitask(raw_split) for raw_split in (train_df_raw, test_df_raw)
)

# Per-row annotator labels on the test split; used later for the
# multi-annotator Evasion accuracy.
test_full["annotator_labels"] = test_full.apply(get_annotator_labels, axis=1)

print(f"Train size: {len(train_full)}")
print(f"Test size:  {len(test_full)}")

# Label distribution of the training split, one panel per task.
clarity_counts = train_full['clarity_label'].value_counts()
evasion_counts = train_full['evasion_label'].value_counts()

fig, axes = plt.subplots(1, 2, figsize=(15, 5))
panels = (
    (clarity_counts, 'Clarity Label Distribution', 'viridis', {}),
    # The evasion panel has more classes, so its tick labels are shrunk.
    (evasion_counts, 'Evasion Label Distribution', 'magma', {'labelsize': 8}),
)
for ax, (counts, title, palette, tick_kwargs) in zip(axes, panels):
    sns.barplot(x=counts.index, y=counts.values, ax=ax, palette=palette)
    ax.set_title(title)
    ax.tick_params(axis='x', rotation=45, **tick_kwargs)

plt.tight_layout()
plt.show()

In [10]:
from torch.utils.data import Dataset

class MultiTaskDataset(Dataset):
    """Torch dataset that tokenizes one row at a time for joint Clarity/Evasion training.

    Every item is a dict with the keys ``input_ids``, ``attention_mask``,
    ``clarity_labels``, ``evasion_labels`` and ``evasion_mask``. Rows without
    an Evasion label get ``evasion_labels = -1`` and ``evasion_mask = 0``.

    Args:
        df: DataFrame with ``text``, ``clarity_id`` and ``evasion_id`` columns.
        tokenizer: Callable tokenizer; called with ``max_length``, ``padding``,
            ``truncation`` and ``return_tensors`` keyword arguments and expected
            to return a mapping with ``input_ids`` and ``attention_mask``.
        max_length: Length every sequence is padded/truncated to.
    """

    def __init__(self, df, tokenizer, max_length=256):
        # Positional indexing in __getitem__ requires a clean 0..n-1 index.
        self.df = df.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Return the number of rows in the underlying frame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Tokenize row ``idx`` and return its tensors and labels as a dict."""
        row = self.df.iloc[idx]

        enc = self.tokenizer(
            row['text'],
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        # A missing Evasion label may arrive as the -1 sentinel or as NaN/None.
        # Both must be masked out: casting NaN to an integer tensor would
        # otherwise yield a meaningless class id flagged as valid.
        ev_id = row['evasion_id']
        has_evasion = bool(pd.notna(ev_id)) and bool(ev_id != -1)

        return {
            # return_tensors='pt' adds a leading batch dimension of 1; drop it
            # so the DataLoader can stack items itself.
            'input_ids': enc['input_ids'].squeeze(0),
            'attention_mask': enc['attention_mask'].squeeze(0),
            'clarity_labels': torch.tensor(row['clarity_id'], dtype=torch.long),
            'evasion_labels': torch.tensor(int(ev_id) if has_evasion else -1, dtype=torch.long),
            'evasion_mask': torch.tensor(int(has_evasion), dtype=torch.long),
        }

## 2. Experiment Configuration
We define the list of models to loop over, the hyperparameters shared by all of them, and a stratified train/validation split.

In [11]:
# Checkpoint names compared in this notebook, in the order they are trained.
MODELS_TO_TEST = [
    "distilbert-base-uncased",
    "bert-base-uncased",
    "albert-base-v2",
]

# Hyperparameters shared by every model.
BATCH_SIZE = 16        # examples per DataLoader batch
MAX_LENGTH = 256       # tokens per example after padding/truncation
NUM_EPOCHS = 5         # passed to train_model as num_epochs
LEARNING_RATE = 2e-5   # AdamW learning rate

from sklearn.model_selection import train_test_split

# Hold out 10% of the training rows for validation. The split is stratified
# on the Clarity id and uses a fixed random_state so it is the same on every run.
row_positions = np.arange(len(train_full))
train_indices, val_indices = train_test_split(
    row_positions,
    test_size=0.1,
    random_state=42,
    stratify=train_full["clarity_id"].values,
)

df_train, df_val = (
    train_full.iloc[positions] for positions in (train_indices, val_indices)
)
df_test = test_full

print(f"Train split: {len(df_train)}")
print(f"Val split:   {len(df_val)}")

## 3. Training & Evaluation Loop

In [None]:
# One result dict per model; turned into a DataFrame in the comparison section.
all_results = []

for model_name in MODELS_TO_TEST:
    print("\n" + "="*60)
    print(f"MODEL: {model_name}")
    print("="*60)

    # Reseed torch's global RNG for every model so that randomly initialised
    # layers and DataLoader shuffling start from the same state for each
    # architecture (same seed value as the train/val split). GPU kernels may
    # still introduce some run-to-run variation.
    torch.manual_seed(42)

    # 1. Tokenizer (model-specific, hence built inside the loop)
    tokenizer = AutoTokenizer.from_pretrained(model_name)

    # 2. DataLoaders — only the training loader is shuffled, so test
    #    predictions stay aligned with the row order of df_test.
    train_ds = MultiTaskDataset(df_train, tokenizer, MAX_LENGTH)
    val_ds   = MultiTaskDataset(df_val, tokenizer, MAX_LENGTH)
    test_ds  = MultiTaskDataset(df_test, tokenizer, MAX_LENGTH)

    train_loader = DataLoader(train_ds, batch_size=BATCH_SIZE, shuffle=True)
    val_loader   = DataLoader(val_ds, batch_size=BATCH_SIZE)
    test_loader  = DataLoader(test_ds, batch_size=BATCH_SIZE)

    # 3. Model Setup
    model = MultiTaskTransformer(
        model_name=model_name,
        num_clarity_labels=len(CLARITY_TO_ID),
        num_evasion_labels=len(EVASION_TO_ID)
    ).to(device)

    optimizer = torch.optim.AdamW(model.parameters(), lr=LEARNING_RATE)
    criterion = nn.CrossEntropyLoss()
    early_stopping = EarlyStopping(patience=2, mode='max', verbose=True)

    # 4. Training
    print(f"Training {model_name}...")
    history = train_model(
        model=model,
        train_loader=train_loader,
        val_loader=val_loader,
        optimizer=optimizer,
        scheduler=None,
        clarity_loss_fn=criterion,
        evasion_loss_fn=criterion,
        device=device,
        num_epochs=NUM_EPOCHS,
        early_stopping=early_stopping,
        is_multitask=True,
        verbose=True
    )

    # 5. Plot Training History (train loss on the left, validation Clarity F1 on the right)
    fig, (loss_ax, f1_ax) = plt.subplots(1, 2, figsize=(12, 4))

    loss_ax.plot(history['train_loss'], label='Train Loss')
    loss_ax.set_title(f'{model_name}: Loss over Epochs')
    loss_ax.set_xlabel('Epoch')
    loss_ax.set_ylabel('Loss')
    loss_ax.legend()

    f1_ax.plot(history['val_macro_f1'], label='Val Clarity F1', color='orange')
    f1_ax.set_title(f'{model_name}: Validation Clarity F1')
    f1_ax.set_xlabel('Epoch')
    f1_ax.set_ylabel('Macro F1')
    f1_ax.legend()

    plt.tight_layout()
    plt.show()

    # 6. Final Evaluation on Test
    # Standard metrics as computed by the project helper.
    test_metrics = evaluate_multitask(model, test_loader, device)

    # Single inference pass that collects everything needed below: Evasion
    # predictions (multi-annotator accuracy) and Clarity predictions/labels
    # (confusion matrix).
    model.eval()
    all_ev_preds = []
    all_cl_preds = []
    all_cl_labels = []
    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch["input_ids"].to(device)
            mask = batch["attention_mask"].to(device)
            cl_logits, ev_logits = model(input_ids, mask)
            all_ev_preds.extend(torch.argmax(ev_logits, dim=-1).cpu().numpy())
            all_cl_preds.extend(torch.argmax(cl_logits, dim=-1).cpu().numpy())
            all_cl_labels.extend(batch["clarity_labels"].cpu().numpy())

    pred_labels_str = [ID_TO_EVASION[i] for i in all_ev_preds]
    gold_sets = df_test["annotator_labels"].tolist()

    # Multi-annotator accuracy: a prediction counts as correct when it matches
    # any annotator's label. Rows without annotator labels are skipped.
    hits = [
        pred in gold
        for pred, gold in zip(pred_labels_str, gold_sets)
        if len(gold) > 0
    ]
    ev_multi_acc = sum(hits) / len(hits) if hits else 0.0

    # Store Results
    clarity_f1 = test_metrics.get("clarity_macro_f1", 0)
    res = {
        "Model": model_name,
        "Clarity_Acc": test_metrics.get("clarity_accuracy", 0),
        "Clarity_F1": clarity_f1,
        "Evasion_Acc_Std": test_metrics.get("evasion_accuracy", 0),
        "Evasion_F1_Std": test_metrics.get("evasion_macro_f1", 0),
        "Evasion_Acc_Multi": ev_multi_acc,
        # Mean of Clarity macro-F1 and multi-annotator Evasion accuracy.
        "Total_Score": (clarity_f1 + ev_multi_acc) / 2
    }
    all_results.append(res)

    print(f"Results for {model_name}:")
    print(f"  Clarity Acc: {res['Clarity_Acc']:.4f}, F1: {res['Clarity_F1']:.4f}")
    print(f"  Evasion Acc (Std): {res['Evasion_Acc_Std']:.4f}, F1 (Std): {res['Evasion_F1_Std']:.4f}")
    print(f"  Evasion Acc (Multi): {res['Evasion_Acc_Multi']:.4f}")
    print(f"  Total Score: {res['Total_Score']:.4f}")

    # 7. Confusion Matrix (Clarity), from the predictions gathered in step 6
    plot_confusion_matrix(
        np.array(all_cl_labels),
        np.array(all_cl_preds),
        label_names=CLARITY_LABELS,
        title=f"{model_name}: Clarity Confusion Matrix"
    )

    # Free memory before the next (possibly larger) model is loaded
    del model, optimizer, train_loader, val_loader
    torch.cuda.empty_cache()

## 4. Final Comparison
Aggregating results across all tested models.

In [None]:
results_df = pd.DataFrame(all_results)
print("\n=== FINAL SUMMARY ===")
display(results_df)

# One bar chart per metric, left to right: (results column, panel title, palette).
# Driving the panels from this table keeps them formatted identically and
# makes adding or removing a metric a one-line change.
comparison_panels = [
    ("Clarity_F1", "Clarity F1", "Blues_d"),
    ("Clarity_Acc", "Clarity Accuracy", "Purples_d"),
    ("Evasion_Acc_Multi", "Evasion Acc (Multi)", "Greens_d"),
    ("Total_Score", "Total Score (Avg)", "Reds_d"),
]

fig, axes = plt.subplots(1, len(comparison_panels), figsize=(20, 6))

for ax, (metric, title, palette) in zip(axes, comparison_panels):
    sns.barplot(data=results_df, x="Model", y=metric, ax=ax, palette=palette)
    ax.set_title(title, fontsize=12)
    # All metrics are plotted on a common 0-1 axis so panels are comparable.
    ax.set_ylim(0, 1.0)
    # Print each value just above its bar.
    for i, v in enumerate(results_df[metric]):
        ax.text(i, v + 0.02, f"{v:.3f}", ha='center', fontweight='bold')

plt.tight_layout()
plt.show()

In [None]:
# Persist the per-model summary table (without the index column) in the
# current working directory.
results_df.to_csv(
    path_or_buf="multimodel_v2_results.csv",
    index=False,
)
print("Results saved to multimodel_v2_results.csv")