# WebSafety Multilingual Text Classifier - Training Notebook

Fine-tuning XLM-RoBERTa on 9,000 multilingual web safety samples

**Dataset**: WebSafety 9K (English, Hinglish, Telenglish)  
**Model**: XLM-RoBERTa-base  
**Task**: Multi-class text classification (7 categories)

## ⚠️ Important: Enable GPU!
Settings → Accelerator → **GPU T4 x2**

In [None]:
# Check GPU availability
# The leading `!` hands the command to the system shell from the notebook.
# nvidia-smi reports the NVIDIA GPUs visible to this session; presumably it
# fails when no GPU accelerator is attached - confirm on the target runtime.
!nvidia-smi

In [None]:
# Install dependencies
# -q keeps pip's output quiet. No versions are pinned, so whatever release pip
# resolves at run time is what gets installed.
# NOTE(review): sentencepiece and accelerate are not imported directly in this
# notebook; presumably they are runtime requirements of the tokenizer and the
# Trainer - confirm.
!pip install -q transformers datasets sentencepiece accelerate scikit-learn matplotlib seaborn

In [None]:
# Imports
import json
import torch
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.metrics import classification_report, confusion_matrix, f1_score
from transformers import (
    XLMRobertaTokenizer, 
    XLMRobertaForSequenceClassification,
    Trainer, 
    TrainingArguments,
    EarlyStoppingCallback
)
from datasets import Dataset
import matplotlib.pyplot as plt
import seaborn as sns

# Report the runtime environment: the PyTorch build and whether CUDA is usable.
print(f"PyTorch version: {torch.__version__}")
has_cuda = torch.cuda.is_available()
print(f"CUDA available: {has_cuda}")
if has_cuda:
    # Index 0 is the first CUDA device visible to this process.
    print(f"GPU: {torch.cuda.get_device_name(0)}")

In [None]:
# Configuration
# Class name -> integer id. A class's id is its position in the tuple below.
LABEL_MAP = {
    name: index
    for index, name in enumerate((
        "safe",
        "phishing",
        "malware",
        "hate_speech",
        "cyberbullying",
        "sexual_content",
        "violence",
    ))
}

# Inverse lookup (integer id -> class name), used to decode predicted ids.
ID_TO_LABEL = dict(zip(LABEL_MAP.values(), LABEL_MAP.keys()))

# File paths - UPDATE THESE to match your dataset!
# Input splits are JSON Lines files; OUTPUT_DIR receives the checkpoints,
# the confusion-matrix image, the saved model and the label mapping.
TRAIN_FILE = "/kaggle/input/websafety-9k/train_9k.jsonl"
VAL_FILE = "/kaggle/input/websafety-9k/validation_9k.jsonl"
TEST_FILE = "/kaggle/input/websafety-9k/test_9k.jsonl"
OUTPUT_DIR = "/kaggle/working/websafety-xlm-roberta"

In [None]:
# Dataset loader
def load_jsonl(file_path):
    """Read a JSON Lines file into a list of parsed records.

    Args:
        file_path: Path to a UTF-8 encoded file with one JSON document per line.

    Returns:
        A list holding one parsed object per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    with open(file_path, 'r', encoding='utf-8') as f:
        # Blank or whitespace-only lines (e.g. an empty line at the end of the
        # file) carry no record, and json.loads would raise on them - skip them.
        return [json.loads(line) for line in f if line.strip()]

def prepare_dataset(file_path, tokenizer, max_length=256):
    """Load one JSONL split and return it as a tokenized ``Dataset``.

    Each record must provide a 'text' field and a 'primary_label' field whose
    value is a key of ``LABEL_MAP``.

    Args:
        file_path: Path of the JSONL file to load.
        tokenizer: Callable tokenizer applied to batches of texts.
        max_length: Length every sequence is padded or truncated to.

    Returns:
        The dataset with 'text' and 'label' columns plus whatever columns the
        tokenizer produces.

    Raises:
        KeyError: If a record lacks 'text' or 'primary_label', or its label is
            not present in ``LABEL_MAP``.
    """
    print(f"Loading {file_path}...")
    records = load_jsonl(file_path)

    # Collect the raw strings first, then the integer class ids.
    texts = []
    for record in records:
        texts.append(record['text'])

    labels = []
    for record in records:
        labels.append(LABEL_MAP[record['primary_label']])

    dataset = Dataset.from_dict({'text': texts, 'label': labels})

    def encode_batch(batch):
        # Fixed-length padding makes every encoded row the same size.
        return tokenizer(
            batch['text'],
            padding='max_length',
            truncation=True,
            max_length=max_length
        )

    tokenized = dataset.map(encode_batch, batched=True)
    print(f"  ‚úì Loaded {len(tokenized)} samples")
    return tokenized

In [None]:
# Load tokenizer and prepare datasets
print("Loading XLM-RoBERTa tokenizer...")
tokenizer = XLMRobertaTokenizer.from_pretrained('xlm-roberta-base')

# Tokenize the splits in train / validation / test order with the same tokenizer.
train_dataset, val_dataset, test_dataset = [
    prepare_dataset(split_path, tokenizer)
    for split_path in (TRAIN_FILE, VAL_FILE, TEST_FILE)
]

# Summarize how many samples each split contributed.
print(f"\nDataset sizes:")
print(f"  Train: {len(train_dataset)}")
print(f"  Validation: {len(val_dataset)}")
print(f"  Test: {len(test_dataset)}")

In [None]:
# Load model
print("Loading XLM-RoBERTa model...")
# id2label / label2id are recorded in the model config so that the saved
# checkpoint reports the real class names rather than generic LABEL_<n>
# placeholders. Their size matches num_labels because both come from LABEL_MAP.
model = XLMRobertaForSequenceClassification.from_pretrained(
    'xlm-roberta-base',
    num_labels=len(LABEL_MAP),
    id2label=ID_TO_LABEL,
    label2id=LABEL_MAP,
    problem_type="single_label_classification"
)

# Total parameter count across all tensors of the model.
print(f"Model loaded with {sum(p.numel() for p in model.parameters()):,} parameters")

In [None]:
# Metrics function
def compute_metrics(eval_pred):
    """Score one evaluation pass with accuracy and two F1 averages.

    Args:
        eval_pred: A pair ``(scores, labels)``. ``scores`` is reduced with an
            argmax over axis 1 to obtain the predicted class id per row.

    Returns:
        A dict with the keys 'accuracy', 'f1_macro' and 'f1_weighted'.
    """
    scores, labels = eval_pred
    predicted_ids = np.argmax(scores, axis=1)

    return {
        # Fraction of rows whose predicted id equals the reference label.
        'accuracy': (predicted_ids == labels).mean(),
        'f1_macro': f1_score(labels, predicted_ids, average='macro'),
        'f1_weighted': f1_score(labels, predicted_ids, average='weighted'),
    }

In [None]:
# Training arguments
import inspect

# transformers renamed `evaluation_strategy` to `eval_strategy`: recent releases
# reject the old keyword and older ones do not know the new one. The install
# cell does not pin a version, so use whichever name the installed
# TrainingArguments accepts.
_eval_strategy_key = (
    "eval_strategy"
    if "eval_strategy" in inspect.signature(TrainingArguments).parameters
    else "evaluation_strategy"
)

# Evaluation and checkpointing both happen once per epoch; the checkpoint with
# the highest f1_macro is reloaded when training ends.
# NOTE(review): warmup_steps=500 is an absolute step count - confirm it is a
# sensible share of the total optimizer steps for this dataset size.
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=4,
    weight_decay=0.01,
    warmup_steps=500,
    logging_dir=f'{OUTPUT_DIR}/logs',
    logging_steps=100,
    load_best_model_at_end=True,
    metric_for_best_model='f1_macro',
    greater_is_better=True,
    save_total_limit=2,
    fp16=True,  # Mixed precision
    report_to='none',
    **{_eval_strategy_key: "epoch"},
)

# Trainer
# Early stopping is configured with a patience of 2.
early_stopping = EarlyStoppingCallback(early_stopping_patience=2)

# Train on the training split, evaluate on the validation split, and score
# each evaluation with compute_metrics.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
    compute_metrics=compute_metrics,
    callbacks=[early_stopping],
)

In [None]:
# Train!
# Runs the fine-tuning loop configured on `trainer` (model, arguments, datasets
# and callbacks were all supplied when the Trainer was constructed).
print("üèãÔ∏è Starting training...\n")
trainer.train()

In [None]:
# Evaluate on test set
print("\nüìä Evaluating on test set...")
test_results = trainer.evaluate(test_dataset)

# Print every returned metric with four decimal places.
print("\nTest Results:")
for metric_name in test_results:
    print(f"  {metric_name}: {test_results[metric_name]:.4f}")

In [None]:
# Detailed classification report
predictions = trainer.predict(test_dataset)
y_pred = np.argmax(predictions.predictions, axis=1)  # predicted class id per sample
y_true = predictions.label_ids

# Pin the full set of class ids explicitly. Without `labels`, scikit-learn
# derives the classes from the data and raises when their count differs from
# the number of target names (e.g. a class that never occurs in the test split
# or in the predictions).
report_label_ids = sorted(ID_TO_LABEL)

print("\nüìã Classification Report:")
print(classification_report(
    y_true,
    y_pred,
    labels=report_label_ids,
    target_names=[ID_TO_LABEL[label_id] for label_id in report_label_ids],
    digits=4,
    zero_division=0,  # report 0 for classes without support/predictions instead of warning
))

In [None]:
# Confusion matrix visualization
# Fix the row/column order to the full list of class ids so the matrix is always
# len(LABEL_MAP) x len(LABEL_MAP) and lines up with the tick labels, even when a
# class is absent from both the test labels and the predictions.
cm_label_ids = sorted(ID_TO_LABEL)
cm_label_names = [ID_TO_LABEL[label_id] for label_id in cm_label_ids]
cm = confusion_matrix(y_true, y_pred, labels=cm_label_ids)

plt.figure(figsize=(10, 8))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',  # cell values are integer counts
    cmap='Blues',
    xticklabels=cm_label_names,
    yticklabels=cm_label_names
)
plt.title('Confusion Matrix - WebSafety Classifier', fontsize=14, fontweight='bold')
plt.ylabel('True Label', fontsize=12)
plt.xlabel('Predicted Label', fontsize=12)
plt.tight_layout()

# Make sure the target directory exists before writing the image, so this cell
# does not depend on an earlier step having created it.
Path(OUTPUT_DIR).mkdir(parents=True, exist_ok=True)
plt.savefig(f'{OUTPUT_DIR}/confusion_matrix.png', dpi=300)
plt.show()

print("‚úì Confusion matrix saved")

In [None]:
# Save model and tokenizer
# Both are written into OUTPUT_DIR.
print("üíæ Saving model...")
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)

# Save label mapping
# Serialize the class-name -> id table next to the model files.
with open(f'{OUTPUT_DIR}/label_mapping.json', 'w') as mapping_file:
    mapping_file.write(json.dumps(LABEL_MAP, indent=2))

print(f"\n‚úÖ Training complete!")
print(f"üìÅ Model saved to: {OUTPUT_DIR}")
print("\nüéâ Ready for deployment!")

In [None]:
# Test predictions on sample texts
test_samples = [
    "Had a great time at the beach today!",
    "You're so ugly, nobody likes you",
    "Your account has been locked! Click here to verify",
    "Yaar, ye movie bahut acchi thi!",
    "Abbai, ee movie chala bagundi!"
]

# Switch to inference mode explicitly (disables dropout) instead of relying on
# an earlier cell having left the model in eval mode.
model.eval()

print("\nüß™ Testing on sample texts:\n")
for text in test_samples:
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=256)
    # Move the encoded tensors to the device the model lives on.
    inputs = {k: v.to(model.device) for k, v in inputs.items()}

    with torch.no_grad():
        outputs = model(**inputs)
        # Use a dedicated name so the global `predictions` produced by the
        # classification-report cell is not overwritten.
        probabilities = torch.nn.functional.softmax(outputs.logits, dim=-1)
        predicted_class = torch.argmax(probabilities, dim=1).item()
        confidence = probabilities[0][predicted_class].item()

    print(f"Text: {text}")
    print(f"Prediction: {ID_TO_LABEL[predicted_class]} (confidence: {confidence:.2%})\n")