# BanglaBERT Hate Speech Classification with Unsloth Optimization

This notebook uses Unsloth for faster and more memory-efficient fine-tuning of BanglaBERT (`csebuetnlp/banglabert`) for binary Bengali hate speech classification.

**Optimized Configuration Based on Previous Results:**
- Target Accuracy: 78%
- Optimal hyperparameters from successful runs
- Memory-efficient training with Unsloth

## 1. Install and Import Unsloth

In [None]:
# Install Unsloth for faster training
!pip install unsloth[colab-new] --quiet
!pip install --no-deps "xformers<0.0.27" "trl<0.9.0" peft accelerate bitsandbytes --quiet

In [None]:
# Import required libraries
import os
import pandas as pd
import numpy as np
import torch
from datasets import Dataset, DatasetDict
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score
from sklearn.utils.class_weight import compute_class_weight

# Unsloth imports
from unsloth import FastLanguageModel
from unsloth.chat_templates import get_chat_template
from transformers import TrainingArguments, AutoTokenizer
from trl import SFTTrainer
from unsloth import is_bfloat16_supported

# Turn off Weights & Biases logging for this session
os.environ.update({"WANDB_DISABLED": "true"})

# Report the runtime: library version and, when present, the first GPU
has_cuda = torch.cuda.is_available()
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {has_cuda}")
if has_cuda:
    gpu_props = torch.cuda.get_device_properties(0)
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {gpu_props.total_memory / 1024**3:.1f} GB")

## 2. Load and Prepare Dataset

In [None]:
# Read the three tab-separated splits (train / dev / dev-test)
train_df, dev_df, test_df = (
    pd.read_csv(tsv_path, sep='\t')
    for tsv_path in (
        'blp25_hatespeech_subtask_1A_train.tsv',
        'blp25_hatespeech_subtask_1A_dev.tsv',
        'blp25_hatespeech_subtask_1A_dev_test.tsv',
    )
)

for split_name, split_df in (("Train", train_df), ("Dev", dev_df), ("Test", test_df)):
    print(f"{split_name} dataset: {len(split_df)} samples")

# Raw (pre-conversion) label counts for the training split
print("\nClass distribution in training data:")
print(train_df['label'].value_counts())

# Character-length statistics of the training texts
train_lengths = train_df['text'].str.len()
print("\nText length stats:")
print(f"Mean: {train_lengths.mean():.1f}")
print(f"Max: {train_lengths.max()}")
print(f"95th percentile: {train_lengths.quantile(0.95):.1f}")
for char_limit in (128, 256):
    exceeds_limit = train_lengths > char_limit
    print(f"Samples > {char_limit} chars: {exceeds_limit.sum()} ({exceeds_limit.mean()*100:.1f}%)")

# Convert string labels to binary integers for hate speech detection
# Map all hate speech categories to 1, non-hate to 0
def convert_labels_to_binary(df):
    """Return a copy of ``df`` whose ``label`` column is binary (1 = hate, 0 = non-hate).

    Any label listed in ``hate_categories`` becomes 1; everything else
    (including missing values) becomes 0. Labels that are already the
    integer 1 stay 1, so calling this function again on its own output
    (e.g. when the notebook cell is re-run) does not wipe the positive class.

    Args:
        df: DataFrame with a ``label`` column.

    Returns:
        A new DataFrame; the input frame is left unmodified.
    """
    hate_categories = ['Abusive', 'Political Hate', 'Profane', 'Religious Hate', 'Sexism']
    converted = df.copy()  # never mutate the caller's frame
    labels = converted['label']
    # Second term keeps already-converted positives positive (idempotence).
    is_hate = labels.isin(hate_categories) | labels.isin([1])
    converted['label'] = is_hate.astype(int)
    return converted

# Binarize the labels of every split
train_df, dev_df, test_df = (
    convert_labels_to_binary(frame) for frame in (train_df, dev_df, test_df)
)

# Show how many examples fall into each binary class per split
print("\nBinary label distribution after conversion:")
for split_name, frame in (("Train", train_df), ("Dev", dev_df), ("Test", test_df)):
    non_hate_count = (frame['label'] == 0).sum()
    hate_count = (frame['label'] == 1).sum()
    print(f"{split_name} - Non-hate (0): {non_hate_count}, Hate (1): {hate_count}")

## 3. Load Model with Unsloth Optimization

In [None]:
# Checkpoint and loading options
model_name = "csebuetnlp/banglabert"  # Hugging Face checkpoint to fine-tune
max_seq_length = 256  # sequence-length budget handed to the loader
load_in_4bit = True  # request 4-bit quantized weights to save memory
dtype = None  # leave dtype selection to the loader

# NOTE(review): confirm that FastLanguageModel supports this checkpoint's
# architecture; the rest of the notebook assumes the returned model can be
# used for sequence classification.
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name=model_name,
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
)

print(
    "\n".join(
        [
            f"Model loaded: {model_name}",
            f"Max sequence length: {max_seq_length}",
            f"Model dtype: {model.dtype}",
        ]
    )
)

## 4. Configure Model for Classification

In [None]:
# Wrap the loaded model with LoRA adapters via Unsloth's PEFT helper.
model = FastLanguageModel.get_peft_model(
    model,
    r=16,  # LoRA rank
    # NOTE(review): these module names were originally described as DistilBERT
    # attention modules, but this notebook loads csebuetnlp/banglabert —
    # confirm the names match that checkpoint's layers.
    target_modules=["query", "key", "value", "dense"],
    lora_alpha=32,  # LoRA alpha
    lora_dropout=0.1,  # LoRA dropout
    bias="none",  # Bias setting
    use_gradient_checkpointing="unsloth",  # Memory optimization
    random_state=42,
    use_rslora=False,  # Rank stabilized LoRA
    loftq_config=None,  # LoftQ quantization
)

# Add classification head
# NOTE(review): AutoModelForSequenceClassification is imported but not used in this cell.
from transformers import AutoModelForSequenceClassification
import torch.nn as nn

# Get the model underneath the PEFT wrapper
base_model = model.get_base_model()

# Create a 2-way linear layer sized to the model's hidden size and store it
# on the base model as the `classifier` attribute.
# NOTE(review): assigning an attribute does not by itself change the model's
# forward pass — confirm that the `logits` used by the trainer actually come
# from this head. Also confirm the new layer ends up on the same device and
# dtype as the rest of the model.
num_labels = 2
classifier = nn.Linear(base_model.config.hidden_size, num_labels)
base_model.classifier = classifier

# Report trainable vs. total parameter counts
print("Model configured for classification with LoRA")
print(f"Number of trainable parameters: {sum(p.numel() for p in model.parameters() if p.requires_grad):,}")
print(f"Total parameters: {sum(p.numel() for p in model.parameters()):,}")

## 5. Prepare Data for Training

In [None]:
# Tokenization function
def tokenize_function(examples):
    """Tokenize the ``text`` field of a batch of examples.

    Sequences are truncated and padded to exactly ``max_seq_length`` tokens,
    and the tokenizer's plain (non-tensor) output is returned.
    """
    texts = examples['text']
    encoded = tokenizer(
        texts,
        max_length=max_seq_length,
        padding='max_length',
        truncation=True,
        return_tensors=None,
    )
    return encoded

# Columns exposed to PyTorch once a split has been tokenized
TORCH_COLUMNS = ['input_ids', 'attention_mask', 'label']


def _build_split(frame):
    """Turn a pandas split into a tokenized, torch-formatted HuggingFace Dataset."""
    split = Dataset.from_pandas(frame).map(tokenize_function, batched=True)
    split.set_format(type='torch', columns=TORCH_COLUMNS)
    return split


train_dataset = _build_split(train_df)
eval_dataset = _build_split(dev_df)
test_dataset = _build_split(test_df)

print("Tokenized datasets prepared")
print(f"Train: {len(train_dataset)}, Eval: {len(eval_dataset)}, Test: {len(test_dataset)}")

## 6. Compute Class Weights for Imbalanced Data

In [None]:
# Derive 'balanced' per-class weights from the training label frequencies
train_labels = train_df['label'].values
unique_labels = np.unique(train_labels)
class_weights = compute_class_weight('balanced', classes=unique_labels, y=train_labels)

# Keep the weights as a float32 tensor, on the GPU when one is available
class_weights_tensor = torch.tensor(class_weights, dtype=torch.float32)
if torch.cuda.is_available():
    class_weights_tensor = class_weights_tensor.to("cuda")

print(f"Class weights: {dict(zip(unique_labels, class_weights))}")
for class_index, class_name in ((0, "non-hate"), (1, "hate")):
    print(f"Class {class_index} ({class_name}): {class_weights[class_index]:.3f}")

## 7. Define Metrics and Custom Trainer

In [None]:
# Define metrics computation
def compute_metrics(eval_pred):
    """Compute classification metrics from a ``(predictions, labels)`` pair.

    Args:
        eval_pred: Pair of (model outputs, gold labels). The model outputs are
            either an array of per-class scores, or a tuple whose first
            element is that array.

    Returns:
        dict with ``accuracy``, ``f1_macro``, ``f1_weighted``,
        ``precision_macro`` and ``recall_macro`` as plain floats.
    """
    predictions, labels = eval_pred
    # When several outputs are returned as a tuple, the class scores come first.
    if isinstance(predictions, tuple):
        predictions = predictions[0]
    predicted_classes = np.argmax(predictions, axis=1)

    # zero_division=0 yields the same score (0) as the default for a class that
    # is never predicted, but without emitting a warning on every evaluation.
    return {
        'accuracy': float(accuracy_score(labels, predicted_classes)),
        'f1_macro': float(f1_score(labels, predicted_classes, average='macro', zero_division=0)),
        'f1_weighted': float(f1_score(labels, predicted_classes, average='weighted', zero_division=0)),
        'precision_macro': float(precision_score(labels, predicted_classes, average='macro', zero_division=0)),
        'recall_macro': float(recall_score(labels, predicted_classes, average='macro', zero_division=0)),
    }

# Custom Trainer with weighted loss
from transformers import Trainer
import torch.nn.functional as F

class WeightedTrainer(Trainer):
    """Trainer that uses class-weighted cross-entropy as its loss.

    Args:
        class_weights: Optional 1-D tensor with one weight per class. When
            ``None`` the loss is unweighted cross-entropy.
        *args, **kwargs: Forwarded unchanged to ``Trainer.__init__``.
    """

    def __init__(self, class_weights=None, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.class_weights = class_weights

    def compute_loss(self, model, inputs, return_outputs=False, num_items_in_batch=None, **kwargs):
        """Run the model and return the (optionally weighted) cross-entropy loss.

        ``num_items_in_batch`` and any further keyword arguments are accepted
        for compatibility with Trainer versions that pass them; they are not
        used here.

        Returns:
            ``loss`` or ``(loss, outputs)`` when ``return_outputs`` is true.

        Raises:
            ValueError: if the model outputs contain no ``logits``.
        """
        labels = inputs.get("labels")
        outputs = model(**inputs)
        logits = outputs.get('logits')
        if logits is None:
            raise ValueError("Model outputs do not contain 'logits'; cannot compute classification loss.")

        weight = self.class_weights
        if weight is not None:
            # Keep the weights on the same device/dtype as the logits.
            weight = weight.to(device=logits.device, dtype=logits.dtype)

        # Overriding compute_loss bypasses the Trainer's built-in label
        # smoothing, so apply the configured factor here instead.
        trainer_args = getattr(self, "args", None)
        smoothing = float(getattr(trainer_args, "label_smoothing_factor", 0.0) or 0.0)

        loss = F.cross_entropy(logits, labels, weight=weight, label_smoothing=smoothing)

        return (loss, outputs) if return_outputs else loss

print("Custom trainer with weighted loss defined")

## 8. Training Configuration - Optimized Hyperparameters

In [None]:
# Hyperparameters carried over from previous runs
use_bf16 = is_bfloat16_supported()  # prefer bf16 when supported, else fp16

training_args = TrainingArguments(
    output_dir="./results_unsloth",

    # Optimization schedule
    learning_rate=2e-5,
    num_train_epochs=6,
    warmup_ratio=0.06,
    lr_scheduler_type="cosine",
    weight_decay=0.01,
    label_smoothing_factor=0.05,  # For regularization

    # Batch size and gradient accumulation
    per_device_train_batch_size=16,
    per_device_eval_batch_size=32,
    gradient_accumulation_steps=2,

    # Evaluation and saving
    # NOTE(review): newer transformers releases name this argument
    # `eval_strategy`; rename it if the installed version rejects it.
    evaluation_strategy="steps",
    eval_steps=500,
    save_strategy="steps",
    save_steps=500,
    save_total_limit=3,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,

    # TrainingArguments has no early-stopping option (passing
    # `early_stopping_patience` here raises a TypeError). Early stopping is
    # configured with `EarlyStoppingCallback` on the Trainer instead.

    # Precision / memory
    fp16=not use_bf16,
    bf16=use_bf16,
    optim="adamw_8bit",  # Memory efficient optimizer
    gradient_checkpointing=True,
    dataloader_pin_memory=False,

    # Logging
    logging_dir="./logs_unsloth",
    logging_steps=100,
    report_to="none",  # the string "none" disables all reporting integrations

    # Reproducibility
    seed=42,
    data_seed=42,
)

print("Training arguments configured with optimal hyperparameters:")
print(f"Learning rate: {training_args.learning_rate}")
print(f"Epochs: {training_args.num_train_epochs}")
print(f"Warmup ratio: {training_args.warmup_ratio}")
print(f"LR scheduler: {training_args.lr_scheduler_type}")
print(f"Weight decay: {training_args.weight_decay}")
print(f"Label smoothing: {training_args.label_smoothing_factor}")

## 9. Initialize Trainer and Start Training

In [None]:
# Initialize the weighted-loss trainer.
from transformers import EarlyStoppingCallback

# Stop training when the tracked metric has not improved for 3 consecutive
# evaluations. This relies on training_args enabling periodic evaluation,
# `load_best_model_at_end` and `metric_for_best_model`.
early_stopping = EarlyStoppingCallback(early_stopping_patience=3)

trainer = WeightedTrainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    class_weights=class_weights_tensor,
    callbacks=[early_stopping],
)

print("Trainer initialized with:")
print(f"- Weighted loss for class imbalance")
print(f"- Unsloth memory optimizations")
print(f"- Optimal hyperparameters from previous runs")
print(f"- Target accuracy: 78%")

# Show memory usage before training
if torch.cuda.is_available():
    print(f"\nGPU Memory before training:")
    print(f"Allocated: {torch.cuda.memory_allocated()/1024**3:.2f} GB")
    print(f"Reserved: {torch.cuda.memory_reserved()/1024**3:.2f} GB")

In [None]:
# Announce the run, train, then mark completion
banner = "\n" + "="*50
for message in (
    "Starting training with Unsloth optimization...",
    "Expected improvements:",
    "- 2x faster training speed",
    "- 70% less VRAM usage",
    "- Better convergence with optimal hyperparameters",
    banner,
):
    print(message)

trainer.train()

print(banner)
print("Training completed!")

## 10. Evaluation and Results

In [None]:
# Score the model on the validation split
print("Evaluating on validation set...")
eval_results = trainer.evaluate()

print("\nValidation Results:")
print(f"Accuracy: {eval_results['eval_accuracy']:.4f} ({eval_results['eval_accuracy']*100:.2f}%)")
for metric_title, metric_key in (
    ("F1-macro", "eval_f1_macro"),
    ("F1-weighted", "eval_f1_weighted"),
    ("Precision-macro", "eval_precision_macro"),
    ("Recall-macro", "eval_recall_macro"),
):
    print(f"{metric_title}: {eval_results[metric_key]:.4f}")

# Compare the validation accuracy against the 78% goal
target_accuracy = 0.78
achieved_accuracy = eval_results['eval_accuracy']

if achieved_accuracy < target_accuracy:
    print(f"\n📈 Progress: {achieved_accuracy*100:.2f}% (Target: {target_accuracy*100:.1f}%)")
    gap = target_accuracy - achieved_accuracy
    print(f"Gap to target: {gap*100:.2f} percentage points")
else:
    print(f"\n🎉 SUCCESS! Target accuracy of {target_accuracy*100:.1f}% achieved!")
    print(f"Achieved: {achieved_accuracy*100:.2f}%")

In [None]:
# Score the model on the held-out test split
print("Evaluating on test set...")
test_results = trainer.evaluate(eval_dataset=test_dataset)

print("\nTest Results:")
test_accuracy = test_results['eval_accuracy']
print(f"Accuracy: {test_accuracy:.4f} ({test_accuracy*100:.2f}%)")
for metric_title, metric_key in (
    ("F1-macro", "eval_f1_macro"),
    ("F1-weighted", "eval_f1_weighted"),
    ("Precision-macro", "eval_precision_macro"),
    ("Recall-macro", "eval_recall_macro"),
):
    print(f"{metric_title}: {test_results[metric_key]:.4f}")

## 11. Save Model

In [None]:
import json

# Write the fine-tuned model and its tokenizer to the same directory
model_save_path = "./bangla_hate_speech_unsloth"
trainer.save_model(model_save_path)
tokenizer.save_pretrained(model_save_path)

print(f"Model saved to: {model_save_path}")

# Collect the run configuration and headline metrics in one record
results_summary = dict(
    model_name=model_name,
    optimization='Unsloth',
    max_seq_length=max_seq_length,
    learning_rate=training_args.learning_rate,
    num_epochs=training_args.num_train_epochs,
    warmup_ratio=training_args.warmup_ratio,
    lr_scheduler=training_args.lr_scheduler_type,
    weight_decay=training_args.weight_decay,
    label_smoothing=training_args.label_smoothing_factor,
    validation_accuracy=eval_results['eval_accuracy'],
    validation_f1_macro=eval_results['eval_f1_macro'],
    test_accuracy=test_results['eval_accuracy'],
    test_f1_macro=test_results['eval_f1_macro'],
    target_achieved=achieved_accuracy >= target_accuracy,
)

# Store the record as JSON next to the saved model
with open(f"{model_save_path}/training_results.json", 'w') as results_file:
    results_file.write(json.dumps(results_summary, indent=2))

print("Training results saved to training_results.json")

## 12. Memory Usage Summary

In [None]:
# Report GPU memory after the run
if torch.cuda.is_available():
    bytes_per_gb = 1024**3
    max_memory_gb = torch.cuda.max_memory_allocated()/bytes_per_gb

    print("Final GPU Memory Usage:")
    print(f"Allocated: {torch.cuda.memory_allocated()/bytes_per_gb:.2f} GB")
    print(f"Reserved: {torch.cuda.memory_reserved()/bytes_per_gb:.2f} GB")
    print(f"Max allocated during training: {max_memory_gb:.2f} GB")

    # The "standard training" figure is derived from a fixed 0.3 ratio, not
    # measured, so the reported saving is an assumption rather than a result.
    estimated_standard_memory = max_memory_gb / 0.3
    memory_saved = estimated_standard_memory - max_memory_gb
    saved_percent = (memory_saved/estimated_standard_memory)*100

    print("\nEstimated memory savings with Unsloth:")
    print(f"Standard training would use: ~{estimated_standard_memory:.1f} GB")
    print(f"Unsloth training used: {max_memory_gb:.1f} GB")
    print(f"Memory saved: ~{memory_saved:.1f} GB ({saved_percent:.0f}%)")

# Final one-screen recap of the run
rule = "="*60
target_status = '✅ ACHIEVED' if achieved_accuracy >= target_accuracy else '❌ NOT YET'

print("\n" + rule)
print("TRAINING SUMMARY")
print(rule)
print(f"Model: {model_name}")
print("Optimization: Unsloth + Optimal Hyperparameters")
print(f"Validation Accuracy: {eval_results['eval_accuracy']*100:.2f}%")
print(f"Test Accuracy: {test_results['eval_accuracy']*100:.2f}%")
print(f"Target (78%): {target_status}")
print(rule)