# 🚀 Final Training Notebook for Multilingual Hate Detection
This notebook is optimized for Google Colab (T4/A100 GPU).
It uses the **ENHANCED** dataset (`UNIFIED_ALL_SPLIT_ENHANCED.csv`) which includes:
- 14,000+ recovered neutral Bengali samples.
- Banglish keyword fixes.
- Strict consistency checks (No Safe=Hate errors).

**Steps:**
1. Mount Drive.
2. Install Dependencies.
3. Load Data.
4. Train with Mixed Precision (AMP).
5. Save Checkpoints.

In [None]:
# 1. Mount Google Drive
# Colab-only: `google.colab` is importable only inside a Colab runtime.
from google.colab import drive
drive.mount('/content/drive')

# Create checkpoint directory
# CHECKPOINT_DIR is read later in this file by the training and evaluation
# functions when they build their checkpoint file paths.
import os
CHECKPOINT_DIR = '/content/drive/MyDrive/thesis/checkpoints/'
os.makedirs(CHECKPOINT_DIR, exist_ok=True)  # no error if the directory already exists
print(f"‚úÖ Checkpoints will be saved to: {CHECKPOINT_DIR}")

In [None]:
# 2. Install Dependencies & Check GPU
# Force upgrade to fix 'GenerationMixin' error and ensure compatibility
# (Lines starting with `!` are IPython shell escapes, so this cell only runs
# inside a notebook, not as a plain Python script.)
!pip install -U transformers accelerate wandb scikit-learn

import torch
# Report the first CUDA device and its total memory, or tell the user how to
# enable a GPU runtime when none is visible.
if torch.cuda.is_available():
    print(f"‚úÖ GPU Available: {torch.cuda.get_device_name(0)}")
    print(f"Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.2f} GB")
else:
    print("‚ùå No GPU detected. Please enable GPU in Runtime > Change runtime type.")

In [None]:
# 2.1 Pre-download Model (Turbo Mode 🚀)
# Downloads the backbone weights into ./xlm-roberta-large, the directory the
# model class later in this file checks for before loading from the Hub.
import os

print("‚è≥ Downloading XLM-RoBERTa Large (High Speed)...")

# 1. Install hf_transfer (Rust-based accelerator)
!pip install -q huggingface_hub hf_transfer

# 2. Enable the accelerator
os.environ["HF_HUB_ENABLE_HF_TRANSFER"] = "1"

# 3. Download
# Note: If this fails with "command not found", restart the runtime once.
# NOTE(review): the repo id used here is `facebook/xlm-roberta-large`, while the
# tokenizer later in this file is loaded with the id 'xlm-roberta-large' —
# confirm that both resolve to the same checkpoint on the Hub.
# The --exclude patterns skip the .msgpack, .h5 and .ot files.
!huggingface-cli download facebook/xlm-roberta-large --local-dir ./xlm-roberta-large --exclude "*.msgpack" "*.h5" "*.ot"

print("‚úÖ Model downloaded to ./xlm-roberta-large")

In [None]:
# 3. Load Dataset
import pandas as pd
import numpy as np

# Location of the balanced dataset CSV on the mounted Drive
DATASET_PATH = '/content/drive/MyDrive/thesis/dataset/UNIFIED_BALANCED_GENERATED.csv'

if os.path.exists(DATASET_PATH):
    df = pd.read_csv(DATASET_PATH)
    print(f"‚úÖ Loaded dataset: {len(df)} rows")

    # One frequency table per column. The boolean says whether the table is
    # ordered by label value (True) or left in value_counts() order (False).
    for heading, column, by_label in (
        ("\nSplit distribution:", 'split', False),
        ("\nLanguage distribution:", 'language', False),
        ("\nSeverity distribution:", 'severity', True),
        ("\nHate Type distribution:", 'hate_type', True),
    ):
        print(heading)
        counts = df[column].value_counts()
        print(counts.sort_index() if by_label else counts)
else:
    # Nothing is loaded in this branch, so `df` stays undefined for later cells.
    print(f"‚ùå Error: Dataset not found at {DATASET_PATH}")
    print("Please upload 'UNIFIED_BALANCED_GENERATED.csv' to 'thesis/dataset/' in your Drive.")

In [None]:
# 4. Define Dataset Class
from torch.utils.data import Dataset, DataLoader
from transformers import XLMRobertaTokenizer

class HateDataset(Dataset):
    """Map-style dataset yielding tokenized text plus three task labels.

    Each row of ``df`` must provide the columns ``text``, ``hate_type``,
    ``target_group`` and ``severity``. A label that is missing (NaN/None),
    unparseable, or negative is treated as "no label": its mask is ``False``
    and the stored label is clamped to 0 so it stays a valid class index
    (the mask tells the loss to ignore it).

    Args:
        df: pandas DataFrame holding the samples; its index is reset.
        tokenizer: callable used as ``tokenizer(text, max_length=...,
            padding='max_length', truncation=True, return_tensors='pt')`` and
            returning a mapping with ``input_ids`` and ``attention_mask``.
        max_length: padded/truncated sequence length passed to the tokenizer.
    """

    def __init__(self, df, tokenizer, max_length=160):
        self.df = df.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        return len(self.df)

    @staticmethod
    def _parse_label(value):
        """Return ``value`` as an int, or -1 when it is missing or not a number."""
        try:
            # float() first so that NaN / inf surface as exceptions from int()
            # and values such as 2.0 or "2" are still accepted.
            return int(float(value))
        except (TypeError, ValueError, OverflowError):
            return -1

    def __getitem__(self, idx):
        """Return one sample as a dict of tensors.

        Keys: ``input_ids``, ``attention_mask``, the three label tensors
        (``torch.long``) and the three ``*_mask`` tensors (``torch.bool``).
        """
        row = self.df.iloc[idx]
        text = str(row['text'])

        encoding = self.tokenizer(
            text,
            max_length=self.max_length,
            padding='max_length',
            truncation=True,
            return_tensors='pt'
        )

        item = {
            # The tokenizer returns a leading batch dimension of 1; drop it.
            'input_ids': encoding['input_ids'].squeeze(0),
            'attention_mask': encoding['attention_mask'].squeeze(0),
        }

        labels = {
            task: self._parse_label(row[task])
            for task in ('hate_type', 'target_group', 'severity')
        }
        for task, label in labels.items():
            item[task] = torch.tensor(max(0, label), dtype=torch.long)
        for task, label in labels.items():
            # Only non-negative labels are real class indices.
            item[f'{task}_mask'] = torch.tensor(label >= 0, dtype=torch.bool)

        return item

# Initialize Tokenizer
# NOTE(review): this loads by the id 'xlm-roberta-large', whereas the model class
# later in this file switches to the local './xlm-roberta-large' directory when it
# exists — confirm whether the tokenizer should use the same local copy.
tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-large')
print("‚úÖ Tokenizer loaded")

In [None]:
# 3 (alternate). Load Dataset — Colab Drive path with a local fallback
import pandas as pd
import numpy as np
import os

# Mount Drive if needed (Colab specific)
try:
    from google.colab import drive
    drive.mount('/content/drive')
    DATASET_PATH = '/content/drive/MyDrive/thesis/dataset/UNIFIED_BALANCED_GENERATED.csv'
    if not os.path.exists(DATASET_PATH):
        print("Please upload 'UNIFIED_BALANCED_GENERATED.csv' to 'thesis/dataset/' in your Drive.")
except Exception as exc:
    # Not running on Colab (ImportError) or the mount failed: fall back to a
    # local copy. `Exception` rather than a bare `except` so that
    # KeyboardInterrupt / SystemExit are not swallowed, and the reason is shown.
    print(f"Google Drive unavailable ({type(exc).__name__}: {exc}); using local dataset path.")
    DATASET_PATH = 'dataset/UNIFIED_BALANCED_GENERATED.csv' # Local fallback

print(f"üìÇ Loading dataset from: {DATASET_PATH}")
df = pd.read_csv(DATASET_PATH)

# Basic cleaning
# Missing values in every column become -1, which the dataset class uses as
# the "no label" marker. NOTE(review): this also turns a missing `text` cell
# into -1 — confirm that is acceptable.
df = df.fillna(-1)
print(f"‚úÖ Loaded {len(df)} samples")
print(df['language'].value_counts())


In [None]:
import torch.nn as nn
import torch.nn.functional as F
import torch

class FocalLoss(nn.Module):
    """Multi-class focal loss with optional per-class weights.

    Per sample: ``alpha[target] * (1 - p_t) ** gamma * (-log p_t)``, where
    ``p_t`` is the softmax probability of the true class.

    ``p_t`` is computed from the *unweighted* cross-entropy. Passing ``alpha``
    as the ``weight`` of ``F.cross_entropy`` and then exponentiating would
    yield ``p_t ** alpha`` rather than ``p_t``, distorting the modulating
    factor for every class whose weight is not 1.

    Args:
        alpha: optional 1-D tensor (or sequence) of per-class weights.
        gamma: focusing parameter; 0 reduces to (weighted) cross-entropy.
        reduction: ``'mean'``, ``'sum'``, or anything else for per-sample loss.
    """

    def __init__(self, alpha=None, gamma=2.0, reduction='mean'):
        super().__init__()
        self.alpha = alpha
        self.gamma = gamma
        self.reduction = reduction

    def forward(self, inputs, targets):
        """Return the focal loss for ``inputs`` (N, C) logits and ``targets`` (N,)."""
        # Unweighted CE equals -log(p_t), so exp(-CE) is the true probability.
        ce_loss = F.cross_entropy(inputs, targets, reduction='none')
        pt = torch.exp(-ce_loss)
        focal_loss = (1 - pt) ** self.gamma * ce_loss

        if self.alpha is not None:
            # Apply class weights after the modulating factor, on the same
            # device/dtype as the loss so CPU-built weights also work on GPU.
            alpha = torch.as_tensor(
                self.alpha, dtype=focal_loss.dtype, device=focal_loss.device
            )
            focal_loss = alpha[targets] * focal_loss

        if self.reduction == 'mean':
            return focal_loss.mean()
        elif self.reduction == 'sum':
            return focal_loss.sum()
        else:
            return focal_loss

# Class weights for the balanced dataset (original author's notes, not
# verifiable from this file: roughly 3:1 "Not Hate" (~80k) vs each specific
# hate class (~30k), i.e. a mild ~2.7:1 imbalance, hence mild weights).
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# All weight tensors share the same placement and dtype.
_weight_options = {'device': device, 'dtype': torch.float}

# Hate type (6 classes), approximately inverse frequency per the author:
#   Not Hate: 1.0 (baseline); specific hate (Pol, Rel, Gen, Geo): ~2.5; Personal: ~2.0
ht_weights = torch.tensor([1.0, 2.5, 2.5, 2.5, 2.0, 2.5], **_weight_options)

# Target group (4 classes): per the author, Community dominates (~90k) versus
# Individual/Group (~20k) -> Community=1.0, others=3.0
tg_weights = torch.tensor([1.0, 3.0, 3.0, 1.0], **_weight_options)

# Severity (4 classes): described by the author as roughly balanced
sv_weights = torch.tensor([1.0, 1.5, 1.5, 2.0], **_weight_options)

# One weighted focal loss per task, all with gamma=2.0
loss_fn_ht, loss_fn_tg, loss_fn_sv = (
    FocalLoss(alpha=class_weights, gamma=2.0)
    for class_weights in (ht_weights, tg_weights, sv_weights)
)

def multitask_loss(hate_type_logits, target_group_logits, severity_logits,
                   targets, masks, task_weights=(1.0, 1.0, 1.0)):
    """Sum the per-task focal losses over the samples that carry a valid label.

    Args:
        hate_type_logits, target_group_logits, severity_logits: per-task logits,
            indexed along dim 0 by the matching mask.
        targets: dict with keys ``hate_type``, ``target_group``, ``severity``
            holding the label tensors.
        masks: dict with the same keys; truthy entries mark valid labels.
        task_weights: multipliers for (hate_type, target_group, severity).

    Returns:
        A scalar tensor. If no task has a single valid label in the batch, the
        result is a zero tensor that is still connected to the logits' autograd
        graph, so ``.item()``, ``GradScaler.scale`` and ``.backward()`` keep
        working (a plain Python ``0.0`` would break all three).
    """
    terms = []

    # Hate type loss
    ht_mask = masks['hate_type'].bool()
    if ht_mask.any():
        loss_ht = loss_fn_ht(hate_type_logits[ht_mask], targets['hate_type'][ht_mask])
        terms.append(task_weights[0] * loss_ht)

    # Target group loss
    tg_mask = masks['target_group'].bool()
    if tg_mask.any():
        loss_tg = loss_fn_tg(target_group_logits[tg_mask], targets['target_group'][tg_mask])
        terms.append(task_weights[1] * loss_tg)

    # Severity loss
    sv_mask = masks['severity'].bool()
    if sv_mask.any():
        loss_sv = loss_fn_sv(severity_logits[sv_mask], targets['severity'][sv_mask])
        terms.append(task_weights[2] * loss_sv)

    if not terms:
        # Summing empty slices gives an exact zero (never NaN, even when the
        # logits hold inf/NaN) that still has a grad_fn.
        return (hate_type_logits[:0].sum()
                + target_group_logits[:0].sum()
                + severity_logits[:0].sum())

    total_loss = terms[0]
    for term in terms[1:]:
        # Out-of-place add so mixed-precision terms are promoted, not downcast.
        total_loss = total_loss + term
    return total_loss

In [None]:
# 5. Define Model Architecture
import torch.nn as nn
import os
from transformers import XLMRobertaModel

class MultiTaskXLMRRoberta(nn.Module):
    """XLM-RoBERTa backbone with three linear classification heads.

    The heads produce logits for hate type (6 classes), target group
    (4 classes) and severity (4 classes), all fed by the dropout-regularized
    hidden state of the first token.

    Args:
        model_name: identifier or path passed to
            ``XLMRobertaModel.from_pretrained``. The default name is replaced
            by ``./xlm-roberta-large`` when that directory exists.
        dropout: dropout probability applied before the heads.
    """

    def __init__(self, model_name='xlm-roberta-large', dropout=0.3):
        super().__init__()

        # Prefer the copy fetched by the CLI download, if it is present.
        local_copy = './xlm-roberta-large'
        wants_default = model_name == 'xlm-roberta-large'
        if wants_default and os.path.exists(local_copy):
            print(f"üìÇ Loading model from local cache: ./xlm-roberta-large")
            model_name = local_copy

        self.backbone = XLMRobertaModel.from_pretrained(model_name)
        self.hidden_size = self.backbone.config.hidden_size
        self.dropout = nn.Dropout(dropout)

        # Task heads, created in a fixed order so parameter order is stable.
        width = self.hidden_size
        self.hate_type_head = nn.Linear(width, 6)     # classes 0-5
        self.target_group_head = nn.Linear(width, 4)  # classes 0-3
        self.severity_head = nn.Linear(width, 4)      # classes 0-3

    def forward(self, input_ids, attention_mask):
        """Return ``(hate_type_logits, target_group_logits, severity_logits)``."""
        encoded = self.backbone(input_ids=input_ids, attention_mask=attention_mask)
        # First-token hidden state, then dropout.
        features = self.dropout(encoded.last_hidden_state[:, 0, :])

        return (
            self.hate_type_head(features),
            self.target_group_head(features),
            self.severity_head(features),
        )

print("‚úÖ MultiTaskXLMRRoberta model defined")

In [None]:
# 7. Training Loop with Smart LR Adjustment
# We need to lower LR when unfreezing to prevent divergence

def train_model(model, train_loader, val_loader, total_epochs=8):
    """Train ``model`` with a two-phase freeze/unfreeze schedule and manual LR.

    NOTE: a later cell in this file defines ``train_model(total_epochs=10,
    resume_from_epoch=None)``; once that cell runs, the name ``train_model``
    refers to the later definition, not this one.

    Behaviour visible in this function:
      * Epochs 1-2 (``epoch < 2``): backbone frozen, LR forced to 2e-5.
      * Epoch 3 onwards: backbone unfrozen, LR forced to 2e-6.
      * Training uses autocast + GradScaler with gradient-norm clipping at 1.0.
      * The warmup scheduler is created but never stepped (its ``step()`` call
        is commented out), so the LR is exactly the value set per epoch.
      * Validation runs under ``torch.no_grad()`` without autocast.
      * The best model (lowest mean validation loss) is saved as a bare
        ``state_dict``; the last checkpoint stores epoch, model and optimizer
        state and the best validation loss (no scheduler/scaler state).

    Relies on names defined outside this function: ``amp``, ``tqdm``, ``time``,
    ``get_linear_schedule_with_warmup``, ``freeze_backbone``,
    ``multitask_loss``, ``device`` and ``CHECKPOINT_DIR``.

    Args:
        model: module exposing ``backbone`` and returning three logit tensors.
        train_loader: iterable of training batches (dicts of tensors).
        val_loader: iterable of validation batches.
        total_epochs: number of epochs to run.
    """
    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, weight_decay=1e-2)
    
    # Scheduler: Linear warmup then decay
    # (constructed for completeness; never stepped below)
    total_steps = len(train_loader) * total_epochs
    scheduler = get_linear_schedule_with_warmup(optimizer, 
                                                num_warmup_steps=int(0.1 * total_steps), 
                                                num_training_steps=total_steps)
    
    scaler = amp.GradScaler()
    
    best_val_loss = float('inf')
    
    # Checkpoint names
    last_ckpt_path = os.path.join(CHECKPOINT_DIR, 'xlmr_smart_last.pt')
    best_ckpt_path = os.path.join(CHECKPOINT_DIR, 'xlmr_smart_best.pt')
    
    # No resume support in this version: training always starts at epoch 0.
    start_epoch = 0
    
    print(f"üöÄ Starting SMART Training Strategy")
    print(f"   Phase 1 (Epoch 1-2): Backbone Frozen (Heads only)")
    print(f"   Phase 2 (Epoch 3+):  Full Fine-Tuning (Lower LR)")
    
    for epoch in range(start_epoch, total_epochs):
        start_time = time.time()
        print(f"\nEpoch {epoch+1}/{total_epochs}")
        
        # SMART FREEZING & LR LOGIC
        if epoch < 2:
            freeze_backbone(model, freeze=True)
            # Standard LR for heads
            for param_group in optimizer.param_groups:
                param_group['lr'] = 2e-5
        else:
            freeze_backbone(model, freeze=False)
            # CRITICAL FIX: Lower LR by 10x when unfreezing backbone
            # This prevents the "Epoch 4 Divergence" we saw earlier
            for param_group in optimizer.param_groups:
                param_group['lr'] = 2e-6 
            
        model.train()
        total_loss = 0
        
        loop = tqdm(train_loader, leave=True)
        for batch in loop:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            
            # Labels and their validity masks, keyed by task name.
            targets = {k: batch[k].to(device) for k in ['hate_type', 'target_group', 'severity']}
            masks = {k: batch[f'{k}_mask'].to(device) for k in ['hate_type', 'target_group', 'severity']}
            
            optimizer.zero_grad()
            
            with amp.autocast():
                ht_logits, tg_logits, sv_logits = model(input_ids, attention_mask)
                loss = multitask_loss(ht_logits, tg_logits, sv_logits, targets, masks)
            
            scaler.scale(loss).backward()
            # Unscale before clipping so the threshold applies to true gradients.
            scaler.unscale_(optimizer)
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
            scaler.step(optimizer)
            scaler.update()
            # scheduler.step() # Disable scheduler since we manually control LR
            
            total_loss += loss.item()
            loop.set_description(f"Loss: {loss.item():.4f}")
            
        avg_train_loss = total_loss / len(train_loader)
        print(f"Average Train Loss: {avg_train_loss:.4f}")
        
        # Validation
        model.eval()
        val_loss = 0
        with torch.no_grad():
            for batch in val_loader:
                input_ids = batch['input_ids'].to(device)
                attention_mask = batch['attention_mask'].to(device)
                targets = {k: batch[k].to(device) for k in ['hate_type', 'target_group', 'severity']}
                masks = {k: batch[f'{k}_mask'].to(device) for k in ['hate_type', 'target_group', 'severity']}
                
                ht_logits, tg_logits, sv_logits = model(input_ids, attention_mask)
                loss = multitask_loss(ht_logits, tg_logits, sv_logits, targets, masks)
                val_loss += loss.item()
                
        avg_val_loss = val_loss / len(val_loader)
        epoch_time = (time.time() - start_time) / 60  # minutes
        print(f"Validation Loss: {avg_val_loss:.4f} | Time: {epoch_time:.1f} min")
        
        # Save Best Checkpoint
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            torch.save(model.state_dict(), best_ckpt_path)
            print(f"üî• New Best Model Saved: {best_ckpt_path}")
            
        # Save Last Checkpoint
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'best_val_loss': best_val_loss
        }, last_ckpt_path)

In [None]:
# 7. Prepare Data Loaders
from torch.utils.data import DataLoader

# NOTE(review): an earlier note here said manual class weights were removed
# because Focal Loss handles imbalance dynamically (to avoid a "Paranoia" issue
# of over-predicting rare classes). The loss functions defined earlier in this
# file are nevertheless built with explicit `alpha` class weights — confirm
# which of the two is intended.

BATCH_SIZE = 16

# Row subsets selected by the dataset's own 'split' column
train_df = df[df['split'] == 'train']
val_df = df[df['split'] == 'val']

train_dataset = HateDataset(train_df, tokenizer)
val_dataset = HateDataset(val_df, tokenizer)

# Settings shared by both loaders; only the shuffling differs.
_loader_options = {'batch_size': BATCH_SIZE, 'num_workers': 2, 'pin_memory': True}
train_loader = DataLoader(train_dataset, shuffle=True, **_loader_options)
val_loader = DataLoader(val_dataset, shuffle=False, **_loader_options)

print(f"Train batches: {len(train_loader)}")
print(f"Val batches: {len(val_loader)}")

In [None]:
# 8. Training Loop (Smart Layer-Wise Learning)
from transformers import get_linear_schedule_with_warmup
from sklearn.metrics import f1_score
from tqdm.notebook import tqdm
import torch.cuda.amp as amp
import time
import os
import torch

def freeze_backbone(model, freeze=True):
    """Turn gradient tracking off (``freeze``) or on for all backbone parameters.

    Only ``model.backbone`` is touched; other parameters keep their current
    ``requires_grad`` setting. The new state is printed.
    """
    trainable = not freeze
    for weight in model.backbone.parameters():
        weight.requires_grad = trainable

    if freeze:
        status = "‚ùÑÔ∏è FROZEN"
    else:
        status = "üî• UNFROZEN"
    print(f"Model Backbone is now {status}")

def evaluate(model, loader, device):
    """Compute the mean multi-task loss and per-task F1 scores over ``loader``.

    Uses ``multitask_loss`` and ``f1_score`` from the enclosing module scope.

    Args:
        model: module returning ``(hate_type, target_group, severity)`` logits.
        loader: iterable of batches (dicts holding ``input_ids``,
            ``attention_mask``, the three labels and their ``*_mask`` tensors).
        device: device the batch tensors are moved to.

    Returns:
        ``(avg_loss, metrics)``. ``avg_loss`` is the mean per-batch loss, or
        NaN when ``loader`` yields no batches; NaN never compares as "less
        than", so an empty validation set cannot be mistaken for a new best.
        ``metrics[task]`` holds ``macro_f1`` and ``micro_f1``, both 0.0 when
        the task has no valid labels.
    """
    task_names = ('hate_type', 'target_group', 'severity')

    model.eval()
    total_loss = 0.0
    num_batches = 0

    all_preds = {task: [] for task in task_names}
    all_labels = {task: [] for task in task_names}

    with torch.no_grad():
        for batch in loader:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            targets = {task: batch[task].to(device) for task in task_names}
            # multitask_loss expects the masks keyed by plain task name.
            masks = {task: batch[f'{task}_mask'].to(device) for task in task_names}

            ht_logits, tg_logits, sv_logits = model(input_ids, attention_mask)
            loss = multitask_loss(ht_logits, tg_logits, sv_logits, targets, masks)
            # float() accepts both a scalar tensor and a plain Python number.
            total_loss += float(loss)
            num_batches += 1

            # Collect predictions/labels only where a valid label exists.
            for task, logits in zip(task_names, (ht_logits, tg_logits, sv_logits)):
                mask = masks[task].bool()
                if mask.any():
                    preds = torch.argmax(logits, dim=1)
                    all_preds[task].extend(preds[mask].tolist())
                    all_labels[task].extend(targets[task][mask].tolist())

    # Count the batches actually seen: avoids ZeroDivisionError on an empty
    # loader and does not require the loader to implement __len__.
    avg_loss = total_loss / num_batches if num_batches else float('nan')

    metrics = {}
    for task in task_names:
        if all_labels[task]:
            metrics[task] = {
                'macro_f1': f1_score(all_labels[task], all_preds[task], average='macro'),
                'micro_f1': f1_score(all_labels[task], all_preds[task], average='micro')
            }
        else:
            metrics[task] = {'macro_f1': 0.0, 'micro_f1': 0.0}

    return avg_loss, metrics

def train_model(total_epochs=10, resume_from_epoch=None):
    """Train the multi-task model with mixed precision, checkpoints and resume.

    Builds a fresh ``MultiTaskXLMRRoberta``, AdamW optimizer, linear
    warmup/decay scheduler (10% warmup) and GradScaler, then:

      * resumes from ``xlmr_smart_last.pt`` in ``CHECKPOINT_DIR`` when present
        (full checkpoint, or weights-only as a fallback);
      * keeps the backbone frozen during epoch 1 only, then fine-tunes fully;
      * after each epoch evaluates on ``val_loader``, saves the best weights
        (lowest validation loss) and a full "last" checkpoint.

    Relies on module-level names: ``train_loader``, ``val_loader``,
    ``CHECKPOINT_DIR``, ``MultiTaskXLMRRoberta``, ``multitask_loss``,
    ``freeze_backbone``, ``evaluate``, ``get_linear_schedule_with_warmup``,
    ``tqdm`` and ``time``.

    Args:
        total_epochs: total number of epochs for the whole run, including any
            epochs already completed before a resume.
        resume_from_epoch: zero-based epoch index to start from when the
            checkpoint carries no epoch information (or none was found).

    Returns:
        None. Results are written to the checkpoint files.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model = MultiTaskXLMRRoberta(dropout=0.3).to(device)

    # Optimizer
    # Lower learning rate for stability over the full run
    optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, weight_decay=0.01)

    total_steps = len(train_loader) * total_epochs

    # Warmup for 10% of steps
    scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=int(0.1*total_steps), num_training_steps=total_steps)
    scaler = torch.amp.GradScaler('cuda')

    start_epoch = 0
    best_val_loss = float('inf')

    # Checkpoint names
    last_ckpt_path = os.path.join(CHECKPOINT_DIR, 'xlmr_smart_last.pt')
    best_ckpt_path = os.path.join(CHECKPOINT_DIR, 'xlmr_smart_best.pt')

    # Resume logic
    if os.path.exists(last_ckpt_path):
        print(f"üîÑ Found checkpoint at {last_ckpt_path}. Attempting to resume...")
        try:
            # map_location lets a checkpoint written on a GPU runtime be
            # restored on whatever device this session has (including CPU).
            checkpoint = torch.load(last_ckpt_path, map_location=device)

            # CASE 1: Full Checkpoint (Ideal)
            if isinstance(checkpoint, dict) and 'model_state_dict' in checkpoint:
                model.load_state_dict(checkpoint['model_state_dict'])
                optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
                if 'scheduler_state_dict' in checkpoint:
                    scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
                if 'scaler_state_dict' in checkpoint:
                    scaler.load_state_dict(checkpoint['scaler_state_dict'])

                # The stored epoch is the last one that completed.
                start_epoch = checkpoint['epoch'] + 1
                best_val_loss = checkpoint.get('best_val_loss', float('inf'))
                print(f"‚úÖ Successfully resumed from Epoch {start_epoch+1}")

            # CASE 2: Weights Only (Fallback)
            else:
                print("‚ö†Ô∏è Checkpoint contains weights only (no optimizer state).")
                model.load_state_dict(checkpoint) # Try loading directly
                print("‚úÖ Weights loaded.")

                # If user didn't specify where to start, we have to guess or start at 0
                if resume_from_epoch is not None:
                    start_epoch = resume_from_epoch
                    print(f"‚è© Forcing start from Epoch {start_epoch+1} (User specified)")
                else:
                    print("‚ö†Ô∏è No epoch info found. Starting from Epoch 1 (but with pre-trained weights).")
                    print("üí° Tip: Call train_model(total_epochs=10, resume_from_epoch=2) to set correct epoch.")

        except Exception as e:
            # Best effort: an unreadable or incompatible checkpoint must not
            # block training. NOTE: if the failure happened after the model
            # weights were loaded, those weights stay in place.
            print(f"‚ùå Error loading checkpoint: {e}")
            print("üöÄ Starting FRESH Smart Mode training.")
    else:
        print("üöÄ Starting FRESH Smart Mode training.")

    # Override if user specified
    if resume_from_epoch is not None and start_epoch == 0:
         start_epoch = resume_from_epoch
         print(f"‚è© Manual override: Starting from Epoch {start_epoch+1}")

    if start_epoch >= total_epochs:
        print("‚úÖ Training already completed!")
        return

    # Report the configured epoch count rather than a hard-coded 10.
    print(f"üöÄ Starting SMART Training Strategy ({total_epochs} Epochs)")
    print(f"   Phase 1 (Epoch 1):   Backbone Frozen (Heads Warmup)")
    print(f"   Phase 2 (Epoch 2-{total_epochs}): Full Fine-Tuning")

    for epoch in range(start_epoch, total_epochs):
        start_time = time.time()
        print(f"\nEpoch {epoch+1}/{total_epochs}")

        # SMART FREEZING LOGIC
        # Freeze only for the very first epoch to align the heads
        if epoch < 1:
            freeze_backbone(model, freeze=True)
        else:
            freeze_backbone(model, freeze=False)

        model.train()
        total_loss = 0

        loop = tqdm(train_loader, leave=True)

        for batch in loop:
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            # Forward (Mixed Precision)
            with torch.amp.autocast('cuda'):
                ht_logits, tg_logits, sv_logits = model(input_ids, attention_mask)

                # Move targets to device
                targets = {k: v.to(device) for k, v in batch.items() if k in ['hate_type', 'target_group', 'severity']}

                # multitask_loss expects the masks keyed by plain task name
                masks = {
                    'hate_type': batch['hate_type_mask'].to(device),
                    'target_group': batch['target_group_mask'].to(device),
                    'severity': batch['severity_mask'].to(device)
                }

                loss = multitask_loss(ht_logits, tg_logits, sv_logits, targets, masks)

            # A batch in which no task has a valid label yields a loss with
            # nothing to differentiate; skip it instead of crashing in
            # scaler.scale()/backward().
            if not (torch.is_tensor(loss) and loss.requires_grad):
                continue

            # Backward
            optimizer.zero_grad()
            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()
            scheduler.step()

            total_loss += loss.item()
            loop.set_description(f"Loss: {loss.item():.4f}")

        # max(..., 1) guards against an empty training loader.
        avg_train_loss = total_loss / max(len(train_loader), 1)

        # Validation
        val_loss, metrics = evaluate(model, val_loader, device)

        print(f"üìâ Train Loss: {avg_train_loss:.4f} | Val Loss: {val_loss:.4f}")
        print(f"üèÜ Val Macro F1 - HT: {metrics['hate_type']['macro_f1']:.3f} | TG: {metrics['target_group']['macro_f1']:.3f} | SV: {metrics['severity']['macro_f1']:.3f}")

        # Save Best Model
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), best_ckpt_path)
            print(f"üî• New Best Model Saved: {best_ckpt_path}")

        # Save Last Checkpoint (Include Scheduler & Scaler)
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'scheduler_state_dict': scheduler.state_dict(),
            'scaler_state_dict': scaler.state_dict(),
            'best_val_loss': best_val_loss
        }, last_ckpt_path)

In [None]:
# 🚀 START TRAINING
# We run for 10 epochs as requested to ensure deep learning on the balanced dataset.
# This calls the most recently defined train_model, i.e. the
# (total_epochs, resume_from_epoch) version, which resumes automatically when
# a "last" checkpoint already exists in CHECKPOINT_DIR.
train_model(total_epochs=10)

In [None]:
# 9. Final Evaluation on Test Set (Smart Model)
from sklearn.metrics import classification_report
import torch
import os
from tqdm.notebook import tqdm

def evaluate_test_set():
    """Print a per-task classification report for the best checkpoint on the test split.

    Loads ``xlmr_smart_best.pt`` from ``CHECKPOINT_DIR`` and runs it over the
    rows of the module-level ``df`` whose ``split`` is ``'test'``. For each
    task, only samples whose label mask is set are included in the report.

    Relies on module-level names: ``CHECKPOINT_DIR``, ``df``, ``tokenizer``,
    ``HateDataset``, ``DataLoader``, ``MultiTaskXLMRRoberta``, ``tqdm`` and
    ``classification_report``.

    Returns:
        None. Returns early (after a message) when no checkpoint exists.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    print("‚è≥ Loading best SMART model for evaluation...")
    # Check for the checkpoint before building the model, so a missing file
    # does not cost a full backbone load first.
    ckpt_path = os.path.join(CHECKPOINT_DIR, 'xlmr_smart_best.pt')
    if not os.path.exists(ckpt_path):
        print("‚ùå No smart checkpoint found. Run training first.")
        return

    model = MultiTaskXLMRRoberta(dropout=0.3)
    # map_location lets weights saved on a GPU runtime load on a CPU-only one.
    model.load_state_dict(torch.load(ckpt_path, map_location=device))
    model.to(device)
    model.eval()

    # Prepare Test Loader
    test_df = df[df['split'] == 'test']
    test_dataset = HateDataset(test_df, tokenizer)
    test_loader = DataLoader(test_dataset, batch_size=16, shuffle=False, num_workers=2)

    print(f"üöÄ Evaluating on {len(test_df)} test samples...")

    # Storage for predictions and labels, in the order the model returns logits
    task_names = ('hate_type', 'target_group', 'severity')
    task_metrics = {task: {'preds': [], 'labels': []} for task in task_names}

    with torch.no_grad():
        for batch in tqdm(test_loader, desc="Evaluating"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)

            all_logits = model(input_ids, attention_mask)

            for task, logits in zip(task_names, all_logits):
                # Labels and masks stay on the CPU; bring predictions to them.
                valid = batch[f'{task}_mask'].bool()
                preds = torch.argmax(logits, dim=1).cpu()
                # Only keep samples that carry a valid label for this task.
                task_metrics[task]['preds'].extend(preds[valid].tolist())
                task_metrics[task]['labels'].extend(batch[task][valid].tolist())

    # Print Reports
    for task, data in task_metrics.items():
        print(f"\nüìä --- {task.upper().replace('_', ' ')} Report ---")
        if data['labels']:
            print(classification_report(data['labels'], data['preds'], digits=4))
        else:
            print("No valid labels found for this task in test set.")

# Run the final test-set evaluation; it prints a message and returns early if
# the best checkpoint file does not exist yet.
evaluate_test_set()