In [None]:
# 0. Install dependencies and verify GPU
!pip install transformers scikit-learn wandb sentencepiece -q

import torch
print(f'PyTorch version: {torch.__version__}')
print(f'CUDA available: {torch.cuda.is_available()}')
if torch.cuda.is_available():
    print(f'GPU: {torch.cuda.get_device_name(0)}')
    print(f'GPU Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB')
else:
    print('‚ö†Ô∏è NO GPU DETECTED! Go to Runtime ‚Üí Change runtime type ‚Üí GPU')

In [None]:
# 1. Mount Google Drive & Setup (RUN THIS FIRST!)
from google.colab import drive
import os

# Mount Google Drive
drive.mount('/content/drive')

# Setup paths
DRIVE_FOLDER = '/content/drive/MyDrive/thesis_training'
os.makedirs(DRIVE_FOLDER, exist_ok=True)
os.makedirs(f'{DRIVE_FOLDER}/checkpoints_enhanced', exist_ok=True)
os.makedirs('dataset', exist_ok=True)

# =====================================================
# CHOOSE YOUR DATASET:
# - FILTERED: High quality labels, smaller size (25K), 86% hate_type coverage
# - ENHANCED: Auto-labeled, larger size (75K), 95% hate_type coverage ‚≠ê RECOMMENDED
# =====================================================

# OPTION 1: Filtered dataset (no toxic_comments) - Fast training
# DRIVE_DATASET = f'{DRIVE_FOLDER}/UNIFIED_ALL_SPLIT_FILTERED.csv'

# OPTION 2: Enhanced dataset (with auto-labeled toxic_comments) - Best performance ‚≠ê
DRIVE_DATASET = f'{DRIVE_FOLDER}/UNIFIED_ALL_SPLIT_ENHANCED.csv'

LOCAL_DATASET = 'dataset/UNIFIED_ALL_SPLIT.csv'

if os.path.exists(DRIVE_DATASET):
    !cp '{DRIVE_DATASET}' '{LOCAL_DATASET}'
    print('‚úÖ Dataset loaded from Google Drive!')
    print(f'   Source: {DRIVE_DATASET.split("/")[-1]}')
else:
    print(f'‚ùå Dataset not found! Please upload to:')
    print(f'   Google Drive ‚Üí My Drive ‚Üí thesis_training/')
    print(f'   File needed: UNIFIED_ALL_SPLIT_ENHANCED.csv')
    print(f'   (or UNIFIED_ALL_SPLIT_FILTERED.csv if using filtered option)')

# Checkpoint directory (saves to Drive - survives disconnects!)
CHECKPOINT_DIR = f'{DRIVE_FOLDER}/checkpoints_enhanced/'
print(f'‚úÖ Checkpoints will save to: {CHECKPOINT_DIR}')

!ls -lh dataset/


In [None]:
# 2. Load and verify dataset
import pandas as pd

# Read the local copy of the dataset.
df = pd.read_csv('dataset/UNIFIED_ALL_SPLIT.csv')

print(f'üìä Dataset: {len(df)} samples from {df["source_dataset"].nunique()} sources')
print('=' * 60)

# Partition the rows by the value of the `split` column.
train_df, val_df, test_df = (
    df.loc[df['split'].eq(split_name)] for split_name in ('train', 'val', 'test')
)
print(f'Train: {len(train_df)} | Val: {len(val_df)} | Test: {len(test_df)}\n')

# Rows whose label is anything other than -1 count as "covered" for that task.
ht_valid = df.loc[df['hate_type'].ne(-1)]
tg_valid = df.loc[df['target_group'].ne(-1)]
print(f'Label Coverage:')
print(f'  Hate Type:    {len(ht_valid)/len(df)*100:.1f}% ({len(ht_valid)}/{len(df)})')
print(f'  Target Group: {len(tg_valid)/len(df)*100:.1f}% ({len(tg_valid)}/{len(df)})')

# Per-class counts among the covered rows.
print(f'\nClass Distribution:')
print(f'  Hate Type:    {ht_valid["hate_type"].value_counts().to_dict()}')
print(f'  Target Group: {tg_valid["target_group"].value_counts().to_dict()}')

# Report which dataset variant appears to be loaded: the enhanced one is
# recognised by its 'toxic_comments_labeled' source, the filtered one by size.
print('\n' + '=' * 60)
if df['source_dataset'].eq('toxic_comments_labeled').any():
    print('‚úÖ ENHANCED dataset loaded (auto-labeled toxic_comments)')
    print('   ‚Üí 95% hate_type | 77% target_group coverage')
elif len(df) < 30000:
    print('‚úÖ FILTERED dataset loaded (no toxic_comments)')
    print('   ‚Üí 86% hate_type | 34% target_group coverage')
print('=' * 60)


In [None]:
# 3. HateDataset: PyTorch Dataset with tokenization and masking
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import XLMRobertaTokenizer

class HateDataset(Dataset):
    """Map-style dataset yielding tokenized text plus masked multi-task labels.

    Each item is a dict with ``input_ids`` / ``attention_mask`` (batch dimension
    squeezed away), one ``long`` label tensor per task (``hate_type``,
    ``target_group``, ``severity``) and one ``bool`` ``<task>_mask`` tensor.
    A label of -1 is treated as "not annotated": its mask is False and the
    stored label is clamped to 0 so it stays a valid class index.
    """

    def __init__(self, df, tokenizer, max_length=160):
        """Keep a re-indexed copy of ``df`` together with the tokenizer settings."""
        self.df = df.reset_index(drop=True)
        self.tokenizer = tokenizer
        self.max_length = max_length

    def __len__(self):
        """Number of rows in the wrapped frame."""
        return len(self.df)

    def __getitem__(self, idx):
        """Tokenize row ``idx`` and build its label / mask tensors."""
        record = self.df.iloc[idx]

        # Pad/truncate every sample to exactly `max_length` tokens.
        tokens = self.tokenizer(
            str(record['text']),
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt',
        )

        raw_labels = {
            task: int(record[task])
            for task in ('hate_type', 'target_group', 'severity')
        }

        item = {
            'input_ids': tokens['input_ids'].squeeze(0),
            'attention_mask': tokens['attention_mask'].squeeze(0),
        }
        # Negative labels are clamped to 0; the masks below record validity.
        for task, value in raw_labels.items():
            item[task] = torch.tensor(value if value > 0 else 0, dtype=torch.long)
        for task, value in raw_labels.items():
            item[f'{task}_mask'] = torch.tensor(value != -1, dtype=torch.bool)
        return item

# Notebook-wide tokenizer for the 'xlm-roberta-large' checkpoint; the data-loader
# cell passes it to every HateDataset.
tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-large')
print('Tokenizer loaded.')

In [None]:
# 4. MultiTaskXLMRRoberta Model
import torch.nn as nn
from transformers import XLMRobertaModel

class MultiTaskXLMRRoberta(nn.Module):
    """XLM-R encoder shared by three linear classification heads.

    The hidden state at sequence position 0 is passed through dropout and then
    fed to the hate-type, target-group and severity heads.
    """

    def __init__(self, model_name='xlm-roberta-large', dropout=0.2,
                 n_hate_type=6, n_target_group=4, n_severity=4):
        """Load the pretrained backbone and create one output layer per task."""
        super().__init__()
        self.backbone = XLMRobertaModel.from_pretrained(model_name)
        width = self.backbone.config.hidden_size

        self.dropout = nn.Dropout(dropout)
        # Heads are created in a fixed order so parameter initialisation and
        # the state_dict layout stay the same from run to run.
        self.hate_type_head = nn.Linear(width, n_hate_type)
        self.target_group_head = nn.Linear(width, n_target_group)
        self.severity_head = nn.Linear(width, n_severity)

    def forward(self, input_ids, attention_mask):
        """Return ``(hate_type_logits, target_group_logits, severity_logits)``."""
        encoded = self.backbone(input_ids=input_ids, attention_mask=attention_mask)
        # Representation of the first token of every sequence, after dropout.
        pooled = self.dropout(encoded.last_hidden_state[:, 0, :])
        heads = (self.hate_type_head, self.target_group_head, self.severity_head)
        return tuple(head(pooled) for head in heads)

# Global compute device: CUDA when available, otherwise CPU. Read by
# move_batch_to_device() and by the training / evaluation cells below.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f'Using device: {device}')

In [None]:
# 5. Loss and evaluation functions WITH CLASS WEIGHTS
import torch.nn.functional as F
from sklearn.metrics import f1_score, classification_report
import numpy as np

def compute_class_weights(df, column, n_classes, smoothing=0.1):
    """Smoothed inverse-frequency weights for the classes of ``column``.

    Rows labelled -1 are ignored. Classes ``0 .. n_classes - 1`` that never
    occur are given a count of 1. Every count is increased by
    ``smoothing * (number of labelled rows)`` before inversion, and the result
    is rescaled so the weights sum to ``n_classes``.

    Returns:
        float32 tensor of shape ``(n_classes,)``.
    """
    labelled = df.loc[df[column] != -1, column]
    per_class = (
        labelled.value_counts()
        .reindex(range(n_classes), fill_value=1)
        .values
    )
    # The additive term damps the gap between rare and frequent classes.
    inverse = 1.0 / (per_class + smoothing * len(labelled))
    rescaled = inverse / inverse.sum() * n_classes
    return torch.tensor(rescaled, dtype=torch.float32)

def multitask_loss(hate_type_logits, target_group_logits, severity_logits,
                   targets, masks, task_weights=(1.0, 1.0, 1.0),
                   ht_class_weights=None, tg_class_weights=None, sv_class_weights=None):
    """Masked cross-entropy averaged over the tasks that have labels in the batch.

    Args:
        hate_type_logits, target_group_logits, severity_logits: per-task logits;
            the first dimension is indexed by the matching mask.
        targets: dict with keys 'hate_type', 'target_group', 'severity' holding
            class-index tensors.
        masks: dict with the same keys; truthy entries mark samples whose label
            is valid for that task.
        task_weights: three multipliers, one per task, in the order above.
        ht_class_weights, tg_class_weights, sv_class_weights: optional per-class
            weights forwarded to ``F.cross_entropy``.

    Returns:
        Scalar tensor: the weighted per-task losses summed and divided by the
        number of tasks that had at least one valid sample. When no task has a
        valid sample the result is a zero tensor that is still connected to the
        logits, so ``.backward()`` and ``.item()`` work (gradients are zero).
    """
    heads = (
        ('hate_type', hate_type_logits, ht_class_weights),
        ('target_group', target_group_logits, tg_class_weights),
        ('severity', severity_logits, sv_class_weights),
    )

    total_loss = 0.0
    n_tasks = 0
    for position, (task, logits, class_weights) in enumerate(heads):
        valid = masks[task].bool()
        if valid.any():
            task_loss = F.cross_entropy(logits[valid], targets[task][valid], weight=class_weights)
            total_loss = total_loss + task_weights[position] * task_loss
            n_tasks += 1

    if n_tasks == 0:
        # Previously this path returned the Python float 0.0, which has neither
        # .backward() nor .item(). Return a zero that depends on the logits.
        return (hate_type_logits.sum() + target_group_logits.sum() + severity_logits.sum()) * 0.0

    return total_loss / n_tasks

def move_batch_to_device(batch):
    """Return a new dict in which every value of ``batch`` was moved to the global ``device``."""
    moved = {}
    for key, value in batch.items():
        moved[key] = value.to(device)
    return moved

def evaluate(model, data_loader, task_weights=(1.0, 1.0, 1.0),
             ht_class_weights=None, tg_class_weights=None, sv_class_weights=None, verbose=False):
    """Run ``model`` over ``data_loader`` and collect loss and F1 per task.

    Args:
        model: module returning ``(hate_type, target_group, severity)`` logits.
        data_loader: yields dict batches with 'input_ids', 'attention_mask',
            the three label tensors and their '<task>_mask' tensors.
        task_weights, ht_class_weights, tg_class_weights, sv_class_weights:
            forwarded unchanged to ``multitask_loss``.
        verbose: when True, print a classification report for every task that
            has labelled samples.

    Returns:
        dict with 'loss' (mean over the batches that contained at least one
        valid label) and, per task, '<task>_macro_f1' / '<task>_micro_f1'.
        Both F1 entries are None for a task without any valid label.
    """
    tasks = ['hate_type', 'target_group', 'severity']

    model.eval()
    total_loss = 0.0
    n_batches = 0
    all_preds = {task: [] for task in tasks}
    all_labels = {task: [] for task in tasks}
    all_masks = {task: [] for task in tasks}

    with torch.no_grad():
        for batch in data_loader:
            batch = move_batch_to_device(batch)
            logits = dict(zip(tasks, model(batch['input_ids'], batch['attention_mask'])))
            targets = {task: batch[task] for task in tasks}
            masks = {task: batch[f'{task}_mask'] for task in tasks}

            # A batch in which every label is masked carries no loss signal, so
            # it is left out of the average instead of crashing on `.item()` or
            # being counted as a zero loss.
            if any(bool(mask.any()) for mask in masks.values()):
                loss = multitask_loss(logits['hate_type'], logits['target_group'], logits['severity'],
                                      targets, masks, task_weights,
                                      ht_class_weights, tg_class_weights, sv_class_weights)
                total_loss += loss.item()
                n_batches += 1

            # Predictions are stored for every sample; masked ones are dropped
            # when the metrics are computed below.
            for task in tasks:
                all_preds[task].extend(logits[task].argmax(dim=1).cpu().numpy())
                all_labels[task].extend(targets[task].cpu().numpy())
                all_masks[task].extend(masks[task].cpu().numpy())

    metrics = {'loss': total_loss / max(1, n_batches)}
    for task in tasks:
        mask = np.array(all_masks[task]).astype(bool)
        if mask.sum() > 0:
            preds = np.array(all_preds[task])[mask]
            labels = np.array(all_labels[task])[mask]
            metrics[f'{task}_macro_f1'] = f1_score(labels, preds, average='macro', zero_division=0)
            metrics[f'{task}_micro_f1'] = f1_score(labels, preds, average='micro', zero_division=0)
            if verbose:
                print(f'\n{task.upper()} Classification Report:')
                print(classification_report(labels, preds, zero_division=0))
        else:
            # No labelled sample for this task anywhere in the loader.
            metrics[f'{task}_macro_f1'] = None
            metrics[f'{task}_micro_f1'] = None
    return metrics

# Confirmation that the loss / evaluation helpers of this cell are now defined.
print('‚úÖ Loss and evaluation functions defined with CLASS WEIGHTS support.')

In [None]:
# 6. Create data loaders + class weights
SEED = 1337
MAX_LENGTH = 160
BATCH_SIZE = 16

# Seed torch's CPU generator and, when a GPU is present, all CUDA generators.
torch.manual_seed(SEED)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(SEED)

# Already loaded in Cell 2
print(f'Splits: Train={len(train_df)} | Val={len(val_df)} | Test={len(test_df)}')

# Class weights come from the training split only and live on the training
# device (6 hate-type classes, 4 target-group classes, 4 severity classes).
ht_weights, tg_weights, sv_weights = (
    compute_class_weights(train_df, column, n_classes).to(device)
    for column, n_classes in (('hate_type', 6), ('target_group', 4), ('severity', 4))
)

print(f'\nüìä Class Weights:')
print(f'  hate_type:    {[f"{w:.2f}" for w in ht_weights.tolist()]}')
print(f'  target_group: {[f"{w:.2f}" for w in tg_weights.tolist()]}')
print(f'  severity:     {[f"{w:.2f}" for w in sv_weights.tolist()]}')

# One dataset per split, all sharing the tokenizer and sequence length.
train_dataset, val_dataset, test_dataset = (
    HateDataset(split_df, tokenizer, max_length=MAX_LENGTH)
    for split_df in (train_df, val_df, test_df)
)

# Only the training loader shuffles; validation and test keep file order.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=BATCH_SIZE,
                          num_workers=2, pin_memory=True)
val_loader, test_loader = (
    DataLoader(eval_dataset, shuffle=False, batch_size=BATCH_SIZE,
               num_workers=2, pin_memory=True)
    for eval_dataset in (val_dataset, test_dataset)
)

print(f'\n‚úÖ Data loaders ready: {len(train_loader)} train batches | {len(val_loader)} val batches')


In [None]:
# 7. Training function with SPACE-SAVING checkpoint strategy
import time
from tqdm import tqdm

def train_model(train_loader, val_loader, config, run_name='xlmr',
                use_wandb=False, resume_from=None,
                ht_class_weights=None, tg_class_weights=None, sv_class_weights=None):
    """Train the multi-task model with early stopping and space-saving checkpoints.

    Args:
        train_loader, val_loader: loaders yielding the dict batches produced by
            HateDataset.
        config: dict read for 'learning_rate', 'weight_decay', 'epochs',
            'warmup_ratio', 'task_weights', 'grad_clip', 'patience' and,
            optionally, 'dropout' (default 0.3).
        run_name: prefix of the checkpoint file names inside CHECKPOINT_DIR.
        use_wandb: log the per-epoch payload to Weights & Biases when True.
        resume_from: path of an epoch checkpoint written by this function; when
            it exists, model / optimizer / scheduler state and the bookkeeping
            values are restored and training continues with the next epoch.
        ht_class_weights, tg_class_weights, sv_class_weights: optional class
            weights forwarded to the loss.

    Returns:
        ``(best_ckpt_path, history)``: the path of the best model's state_dict
        (selected by mean validation macro-F1 over the three tasks) and the
        list of per-epoch log dicts.

    Checkpoint policy: after every epoch a full resume checkpoint is written and
    the previous epoch's one is removed, so at most one exists at a time; it is
    removed as well once training ends.
    """
    tasks = ('hate_type', 'target_group', 'severity')

    model = MultiTaskXLMRRoberta(dropout=config.get('dropout', 0.3)).to(device)
    optimizer = torch.optim.AdamW(model.parameters(), lr=config['learning_rate'], weight_decay=config['weight_decay'])

    total_steps = len(train_loader) * config['epochs']
    # At least one warm-up step: with total_iters=0 LinearLR would keep the
    # learning rate at start_factor * lr for the whole run.
    warmup_steps = max(1, int(total_steps * config['warmup_ratio']))
    scheduler = torch.optim.lr_scheduler.LinearLR(optimizer, start_factor=0.1, total_iters=warmup_steps)

    if use_wandb:
        import wandb
        wandb.init(project='multilingual-hate-detection', name=run_name, config=config)

    best_val_loss = float('inf')
    best_macro_f1 = 0.0
    patience_counter = 0
    history = []
    start_epoch = 1

    best_ckpt_path = os.path.join(CHECKPOINT_DIR, f'{run_name}_best.pt')

    if resume_from and os.path.exists(resume_from):
        print(f'Resuming from checkpoint: {resume_from}')
        checkpoint = torch.load(resume_from, map_location=device)
        model.load_state_dict(checkpoint['model_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
        start_epoch = checkpoint['epoch'] + 1
        best_val_loss = checkpoint.get('best_val_loss', float('inf'))
        best_macro_f1 = checkpoint.get('best_macro_f1', 0.0)
        patience_counter = checkpoint.get('patience_counter', 0)
        history = checkpoint.get('history', [])
        print(f'Resumed from epoch {checkpoint["epoch"]}. Starting epoch {start_epoch}.')
    elif resume_from:
        print(f'Checkpoint not found: {resume_from}. Starting from scratch.')

    # Path of the newest resume checkpoint written by this call. It stays None
    # when the epoch loop never runs (e.g. resuming an already finished run).
    last_epoch_ckpt = None

    for epoch in range(start_epoch, config['epochs'] + 1):
        model.train()
        start = time.time()
        running_loss = 0.0
        n_loss_batches = 0

        pbar = tqdm(train_loader, desc=f'Epoch {epoch}/{config["epochs"]}', leave=True)
        for batch in pbar:
            batch = move_batch_to_device(batch)
            targets = {k: batch[k] for k in tasks}
            masks = {k: batch[f'{k}_mask'] for k in tasks}
            if not any(bool(mask.any()) for mask in masks.values()):
                # No task has a valid label in this batch: there is nothing to
                # back-propagate, so skip it instead of failing on backward().
                continue

            optimizer.zero_grad()
            logits = model(batch['input_ids'], batch['attention_mask'])
            loss = multitask_loss(*logits, targets, masks, config['task_weights'],
                                  ht_class_weights, tg_class_weights, sv_class_weights)
            loss.backward()
            torch.nn.utils.clip_grad_norm_(model.parameters(), config['grad_clip'])
            optimizer.step()
            scheduler.step()

            running_loss += loss.item()
            n_loss_batches += 1
            pbar.set_postfix({'loss': f'{running_loss / n_loss_batches:.4f}'})

        train_loss = running_loss / max(1, n_loss_batches)

        print(f'Evaluating on validation set...')
        val_metrics = evaluate(model, val_loader, config['task_weights'],
                               ht_class_weights, tg_class_weights, sv_class_weights)
        # Store plain Python floats (the F1 scores may arrive as NumPy scalars)
        # so that history and checkpoints contain built-in types only.
        val_metrics = {name: (None if value is None else float(value))
                       for name, value in val_metrics.items()}
        val_loss = val_metrics['loss']

        # Mean macro-F1 over the tasks; a task without validation labels
        # (metric is None) counts as 0.
        macro_f1s = [val_metrics.get(f'{t}_macro_f1') or 0.0 for t in tasks]
        avg_macro_f1 = sum(macro_f1s) / len(macro_f1s)

        epoch_time = time.time() - start
        log_payload = {'epoch': epoch, 'train_loss': train_loss, 'val_loss': val_loss,
                       'avg_macro_f1': avg_macro_f1, 'epoch_time': epoch_time, **val_metrics}
        history.append(log_payload)
        if use_wandb: wandb.log(log_payload)

        print(f'Epoch {epoch}: train_loss={train_loss:.4f}, val_loss={val_loss:.4f}, avg_macro_f1={avg_macro_f1:.4f}, time={epoch_time:.1f}s')
        # Use the None-safe values: formatting None with ':.4f' raises TypeError.
        ht_f1, tg_f1, sv_f1 = macro_f1s
        print(f'  hate_type_macro_f1={ht_f1:.4f}, target_group_macro_f1={tg_f1:.4f}, severity_macro_f1={sv_f1:.4f}')

        # Update the best-model bookkeeping BEFORE writing the resume
        # checkpoint, so a resumed run starts from this epoch's values and
        # cannot overwrite the best model with a worse one.
        improved = avg_macro_f1 > best_macro_f1
        if improved:
            best_macro_f1 = avg_macro_f1
            best_val_loss = val_loss
            patience_counter = 0
            torch.save(model.state_dict(), best_ckpt_path)
            print(f'  ‚úì New best checkpoint saved! (avg_macro_f1={avg_macro_f1:.4f})')
        else:
            patience_counter += 1
            print(f'  No improvement. Patience: {patience_counter}/{config["patience"]}')

        # Full resume checkpoint for this epoch.
        epoch_ckpt_path = os.path.join(CHECKPOINT_DIR, f'{run_name}_epoch{epoch}.pt')
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'scheduler_state_dict': scheduler.state_dict(),
            'best_val_loss': best_val_loss,
            'best_macro_f1': best_macro_f1,
            'patience_counter': patience_counter,
            'history': history,
            'config': config
        }, epoch_ckpt_path)
        last_epoch_ckpt = epoch_ckpt_path
        print(f'  üíæ Epoch checkpoint saved to {epoch_ckpt_path}')

        # Keep only the most recent epoch checkpoint. The previous one is
        # removed after the new one is on disk, whether or not this epoch
        # improved, so old checkpoints never pile up.
        if epoch > 1:
            old_epoch_ckpt = os.path.join(CHECKPOINT_DIR, f'{run_name}_epoch{epoch-1}.pt')
            if os.path.exists(old_epoch_ckpt):
                os.remove(old_epoch_ckpt)
                print(f'  üóëÔ∏è Deleted old checkpoint: {old_epoch_ckpt}')

        if not improved and patience_counter >= config['patience']:
            print('Early stopping triggered.')
            break

    # Final cleanup: drop the last resume checkpoint, keep only the best model.
    if last_epoch_ckpt is not None and os.path.exists(last_epoch_ckpt):
        os.remove(last_epoch_ckpt)
        print(f'üóëÔ∏è Training complete. Deleted final epoch checkpoint. Only keeping: {best_ckpt_path}')

    if use_wandb: wandb.finish()
    return best_ckpt_path, history

# Summary of the checkpoint strategy shown when this cell is run; the size
# figures are fixed text, not measured values.
print('‚úÖ Training function defined with SPACE-SAVING checkpoint strategy!')
print('üíæ Saves: Best model (~2.5GB) + Latest epoch for resume (~7GB)')
print('üóëÔ∏è Auto-deletes old epoch checkpoints after each epoch')
print('üìä Total space needed: ~10GB max (vs 35GB for 5 epochs)')


In [None]:
# 8. START TRAINING (Run this cell to begin full 5-epoch training)
# =====================================================
# Training Configuration - Optimized for Enhanced Dataset
# =====================================================

full_training_config = dict(
    epochs=5,
    learning_rate=1e-5,           # Stable for auto-labeled data
    weight_decay=1e-2,
    warmup_ratio=0.1,
    grad_clip=1.0,
    patience=3,                   # Early stopping after 3 epochs no improvement
    dropout=0.3,
    task_weights=(1.0, 1.0, 1.0),
    use_class_weights=True,
)

print('üöÄ Starting training with ENHANCED dataset...')
print(f'üìç Device: {device}')
print(f'üìÅ Checkpoints save to: {CHECKPOINT_DIR}')
print(f'‚è±Ô∏è Estimated time: 45-60 min (T4 GPU)\n')

# Launch the run; the class weights come from the data-loader cell.
best_checkpoint_full, history_full = train_model(
    train_loader=train_loader,
    val_loader=val_loader,
    config=full_training_config,
    run_name='xlmr_enhanced',
    use_wandb=False,  # Set to True if you want W&B tracking
    ht_class_weights=ht_weights,
    tg_class_weights=tg_weights,
    sv_class_weights=sv_weights,
)

print('\n' + '=' * 60)
print('‚úÖ TRAINING COMPLETE!')
print(f'üìÅ Best model: {best_checkpoint_full}')
print(f'üìä Final metrics: {history_full[-1]}')
print('=' * 60)


In [None]:
# 10. DOWNLOAD MODEL - Download trained checkpoint to your computer
# NOTE: this cell is numbered 10 but sits before the cell numbered 9 (test-set
# evaluation); it only needs `best_checkpoint_full` from the training cell.
from google.colab import files

print(
    'üì• Preparing checkpoint for download...',
    f'File: {best_checkpoint_full}',
    f'Size: ~2.5GB',
    sep='\n',
)

# Download the best checkpoint
files.download(best_checkpoint_full)

print('''
‚úÖ Download started!

The model is also saved permanently in Google Drive at:
  My Drive/thesis_training/checkpoints_enhanced/xlmr_enhanced_best.pt

You can access it anytime!
''')


In [None]:
# 9. EVALUATE MODEL - Load best checkpoint and test
print('üìä Loading best model for evaluation...')

# The best checkpoint holds a bare state_dict, so build the architecture first
# and then load the weights into it.
best_model = MultiTaskXLMRRoberta(dropout=0.3).to(device)
best_model.load_state_dict(torch.load(best_checkpoint_full, map_location=device))

print('\nüîç Evaluating on TEST SET...')
test_results = evaluate(best_model, test_loader,
                        task_weights=(1.0, 1.0, 1.0),
                        ht_class_weights=ht_weights,
                        tg_class_weights=tg_weights,
                        sv_class_weights=sv_weights,
                        verbose=True)

print('\n' + '='*60)
print('üìä TEST SET RESULTS')
print('='*60)
print(f"Loss: {test_results['loss']:.4f}")
# evaluate() reports None for a task that has no labelled test samples;
# formatting None with ':.4f' would raise TypeError, so show 'n/a' instead.
for task_title, task_key in (('Hate Type', 'hate_type'),
                             ('Target Group', 'target_group'),
                             ('Severity', 'severity')):
    print(f"\n{task_title}:")
    for metric_title, metric_suffix in (('Macro F1', 'macro_f1'), ('Micro F1', 'micro_f1')):
        score = test_results[f'{task_key}_{metric_suffix}']
        shown = 'n/a' if score is None else f'{score:.4f}'
        print(f"  {metric_title}: {shown}")
print('='*60)
