# Toxic Language Detection with XLM-RoBERTa (Regression)

This notebook trains an XLM-RoBERTa model to detect toxic language in text, outputting a score from 0 to 1 indicating how unacceptable the text is.

In [None]:
# Install required packages
!pip install transformers datasets torch scikit-learn pandas numpy tqdm

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW
from transformers import AutoTokenizer, AutoModel, PreTrainedModel, AutoConfig, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
import pandas as pd
import numpy as np
from tqdm import tqdm
import warnings
# Silence Python warnings for the rest of the notebook to keep cell output short.
warnings.filterwarnings('ignore')

# Run on the GPU when CUDA is available, otherwise on the CPU.
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
print(f"Using device: {device}")

In [None]:
# Configuration
MODEL_NAME = "xlm-roberta-base"   # Hugging Face checkpoint used for tokenizer and encoder
MAX_LENGTH = 256                  # token length every sample is truncated/padded to
BATCH_SIZE = 16
EPOCHS = 3
LEARNING_RATE = 2e-5
WARMUP_STEPS = 500                # optimizer steps of linear LR warmup
WEIGHT_DECAY = 0.01
SEED = 42                         # single seed shared by every random number generator

# Seed the RNGs up front so weight initialisation, dropout, DataLoader shuffling
# and numpy/pandas sampling give the same results on every Restart & Run All.
torch.manual_seed(SEED)
np.random.seed(SEED)

# Load tokenizer
# Uses the tokenizer published with MODEL_NAME; the same object is reused for
# the training/validation datasets and for the prediction helper further down.
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
print(f"Tokenizer loaded: {MODEL_NAME}")

In [None]:
# Custom Dataset for Regression
class ToxicDataset(Dataset):
    """Dataset pairing each text with its float regression target.

    Texts are tokenized lazily, one sample at a time, inside ``__getitem__``.
    Every sample is truncated and padded to ``max_length`` tokens.

    Args:
        texts: indexable collection of texts (each item is passed through ``str``).
        scores: indexable collection of targets (each item is passed through ``float``).
        tokenizer: callable tokenizer returning ``input_ids`` and ``attention_mask``.
        max_length: token length used for truncation and padding.
    """

    def __init__(self, texts, scores, tokenizer, max_length):
        self.texts, self.scores = texts, scores
        self.tokenizer, self.max_length = tokenizer, max_length

    def __len__(self):
        """Number of samples."""
        return len(self.texts)

    def __getitem__(self, idx):
        """Return the tokenized sample at ``idx`` as a dict of tensors."""
        raw_text = str(self.texts[idx])
        target = float(self.scores[idx])

        encoded = self.tokenizer(
            raw_text,
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt',
        )

        # The tokenizer output carries a leading batch axis; collapse it to 1-D.
        sample = {
            field: encoded[field].reshape(-1)
            for field in ('input_ids', 'attention_mask')
        }
        sample['labels'] = torch.tensor(target, dtype=torch.float)
        return sample

In [None]:
class ToxicRegressor(PreTrainedModel):
    """Transformer encoder with a single-output regression head squashed into [0, 1].

    The encoder is created with ``AutoModel.from_config``, i.e. from the
    configuration alone; this constructor does not read any checkpoint weights.
    """

    def __init__(self, config):
        super().__init__(config)
        self.num_labels = 1  # one scalar target per sample
        self.bert = AutoModel.from_config(config)
        self.dropout = nn.Dropout(0.3)
        self.regressor = nn.Linear(config.hidden_size, 1)

        # Run the PreTrainedModel post-construction hook.
        self.post_init()

    def forward(self, input_ids=None, attention_mask=None, labels=None):
        """Score a batch.

        Returns:
            ``predictions`` (one value per sample, after a sigmoid) when
            ``labels`` is None, otherwise the tuple ``(loss, predictions)``
            where ``loss`` is the mean squared error against ``labels``.
        """
        encoder_out = self.bert(input_ids=input_ids, attention_mask=attention_mask)

        # Use the encoder's pooled output when it provides one; otherwise fall
        # back to the hidden state of the first token of each sequence.
        sentence_repr = getattr(encoder_out, "pooler_output", None)
        if sentence_repr is None:
            sentence_repr = encoder_out.last_hidden_state[:, 0]

        raw_scores = self.regressor(self.dropout(sentence_repr))

        # Sigmoid bounds the output to (0, 1); drop the trailing size-1 axis.
        predictions = torch.sigmoid(raw_scores).squeeze(-1)

        if labels is None:
            return predictions

        # Mean squared error between bounded predictions and targets.
        return F.mse_loss(predictions, labels), predictions

config = AutoConfig.from_pretrained(MODEL_NAME, num_labels=1)
model = ToxicRegressor(config)

# ToxicRegressor builds its encoder with AutoModel.from_config, which only
# creates the architecture with randomly initialised weights. Swap in the
# pretrained checkpoint so training fine-tunes XLM-RoBERTa instead of starting
# from scratch. The regression head stays freshly initialised.
model.bert = AutoModel.from_pretrained(MODEL_NAME, config=config)

model.to(device)
print(f"Model loaded on {device}")

In [None]:
import pandas as pd
import numpy as np

# Read the CSV file
df = pd.read_csv("score_train_new.csv")

# Drop duplicate rows, and rows missing the text or the score: a NaN score
# would turn the MSE loss (and then every weight) into NaN during training.
df_unique = df.drop_duplicates().dropna(subset=['label', 'score'])

# Shuffle the rows with a fixed seed so the row order, and everything derived
# from it, is identical on every run of the notebook.
shuffled_df = df_unique.sample(frac=1, random_state=42).reset_index(drop=True)

# Extract texts and scores
texts = shuffled_df['label'].values  # 'label' column contains the text
scores = shuffled_df['score'].values  # 'score' column contains the toxicity score

# Convert scores to float
scores = scores.astype(float)

print(f"Dataset loaded: {len(texts)} samples")
print(f"Score range: {scores.min():.4f} to {scores.max():.4f}")
print(f"Mean score: {scores.mean():.4f}")
print(f"Std score: {scores.std():.4f}")

In [None]:
# Split dataset
# 80% of the samples go to training, 20% to validation; the seed fixes the split.
train_texts, val_texts, train_scores, val_scores = train_test_split(
    texts,
    scores,
    random_state=42,
    test_size=0.2,
)

# Create datasets
train_dataset = ToxicDataset(
    texts=train_texts, scores=train_scores, tokenizer=tokenizer, max_length=MAX_LENGTH
)
val_dataset = ToxicDataset(
    texts=val_texts, scores=val_scores, tokenizer=tokenizer, max_length=MAX_LENGTH
)

# Create dataloaders
# Only the training batches are reshuffled each epoch; validation order is fixed.
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=BATCH_SIZE)
val_loader = DataLoader(val_dataset, shuffle=False, batch_size=BATCH_SIZE)

print(f"Train samples: {len(train_dataset)}")
print(f"Validation samples: {len(val_dataset)}")

In [None]:
# Training setup
# AdamW over every model parameter.
optimizer = AdamW(
    model.parameters(),
    lr=LEARNING_RATE,
    weight_decay=WEIGHT_DECAY,
)

# One scheduler step per training batch across all epochs.
total_steps = EPOCHS * len(train_loader)

# NOTE(review): if total_steps is smaller than WARMUP_STEPS the learning rate
# never reaches LEARNING_RATE — confirm against the dataset size.
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_training_steps=total_steps,
    num_warmup_steps=WARMUP_STEPS,
)

def train_epoch(model, dataloader, optimizer, scheduler, device):
    """Run one optimisation pass over ``dataloader``.

    For each batch: forward pass, backward pass, gradient-norm clipping at 1.0,
    then one optimizer step and one scheduler step.

    Returns:
        The mean of the per-batch losses.
    """
    model.train()
    batch_losses = []
    bar = tqdm(dataloader, desc="Training")

    for batch in bar:
        # Move the three tensors the model consumes onto the target device.
        inputs = {
            key: batch[key].to(device)
            for key in ('input_ids', 'attention_mask', 'labels')
        }

        optimizer.zero_grad()
        loss, _ = model(**inputs)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
        optimizer.step()
        scheduler.step()

        step_loss = loss.item()
        batch_losses.append(step_loss)
        bar.set_postfix({'loss': f'{step_loss:.4f}'})

    return sum(batch_losses) / len(dataloader)

def evaluate(model, dataloader, device):
    """Score ``model`` on ``dataloader`` without tracking gradients.

    Returns:
        A 4-tuple ``(mean per-batch loss, MSE, MAE, R²)``; the three metrics
        are computed over all samples collected from the loader.
    """
    model.eval()
    loss_sum = 0
    collected_preds, collected_targets = [], []

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Evaluating"):
            ids = batch['input_ids'].to(device)
            mask = batch['attention_mask'].to(device)
            targets = batch['labels'].to(device)

            batch_loss, batch_preds = model(
                input_ids=ids, attention_mask=mask, labels=targets
            )
            loss_sum += batch_loss.item()

            # Accumulate per-sample values on the CPU for the metric functions.
            collected_preds.extend(batch_preds.cpu().numpy())
            collected_targets.extend(targets.cpu().numpy())

    # Calculate regression metrics
    mse, mae, r2 = (
        metric(collected_targets, collected_preds)
        for metric in (mean_squared_error, mean_absolute_error, r2_score)
    )

    return loss_sum / len(dataloader), mse, mae, r2

In [None]:
# Training loop
# A checkpoint is written whenever the validation R² beats the best seen so far.
best_r2 = float('-inf')
training_history = []

for epoch in range(EPOCHS):
    print(f"\nEpoch {epoch + 1}/{EPOCHS}")
    print("-" * 50)

    # One pass over the training data, then score the validation split.
    train_loss = train_epoch(model, train_loader, optimizer, scheduler, device)
    val_loss, val_mse, val_mae, val_r2 = evaluate(model, val_loader, device)

    print(
        f"Train Loss: {train_loss:.4f}",
        f"Val Loss: {val_loss:.4f}",
        f"Val MSE: {val_mse:.4f}",
        f"Val MAE: {val_mae:.4f}",
        f"Val R²: {val_r2:.4f}",
        sep="\n",
    )

    # Keep the per-epoch numbers for later inspection.
    training_history.append(dict(
        epoch=epoch + 1,
        train_loss=train_loss,
        val_loss=val_loss,
        val_mse=val_mse,
        val_mae=val_mae,
        val_r2=val_r2,
    ))

    if not val_r2 > best_r2:
        continue

    best_r2 = val_r2
    torch.save(model.state_dict(), 'best_toxic_regression_model.pth')
    print("New best model saved!")

print("\nTraining completed!")

In [None]:
# Load best model
# map_location remaps the stored tensors onto the current device, so a
# checkpoint written during a GPU run still loads in a CPU-only session.
model.load_state_dict(
    torch.load('best_toxic_regression_model.pth', map_location=device)
)
model.eval()

# Final evaluation
# Re-score the validation split with the restored best-epoch weights.
val_loss, val_mse, val_mae, val_r2 = evaluate(model, val_loader, device)
print("\nFinal Model Performance:")
print(f"MSE: {val_mse:.4f}")
print(f"MAE: {val_mae:.4f}")
print(f"R² Score: {val_r2:.4f}")

In [None]:
# Prediction function for regression
def predict_toxic_score(text, model, tokenizer, device):
    """Return the model's score for a single ``text`` as a Python float.

    The text is tokenized with truncation and padding to ``MAX_LENGTH``, moved
    to ``device`` and scored with the model in eval mode, without gradients.
    """
    model.eval()

    tokenized = tokenizer(
        text,
        truncation=True,
        padding='max_length',
        max_length=MAX_LENGTH,
        return_tensors='pt',
    )
    ids = tokenized['input_ids'].to(device)
    mask = tokenized['attention_mask'].to(device)

    with torch.no_grad():
        # Called without labels, so the model returns only the predictions.
        return model(input_ids=ids, attention_mask=mask).item()

# Test predictions
# A small mix of harmless and hostile sentences to eyeball the model's scores.
test_texts = [
    "This is a normal message",
    "You are stupid and worthless",
    "Have a great day!",
    "I hate you so much",
    "Thanks for your help",
    "You're a complete idiot and I hope you fail",
    "The weather is nice today",
    "Go to hell you worthless piece of trash",
]

print("\nTest Predictions:")
for text in test_texts:
    score = predict_toxic_score(text, model, tokenizer, device)
    print(
        f"Text: '{text}'",
        f"Toxicity Score: {score:.4f}",
        "-" * 50,
        sep="\n",
    )

In [None]:
# Save the model
# Model and tokenizer are written to the same directory, which is the folder
# uploaded to the Hub at the end of the notebook.
export_dir = "./toxic_regression_model"
model.save_pretrained(export_dir)
tokenizer.save_pretrained(export_dir)

In [None]:
!pip install huggingface_hub

In [None]:
# Log in to the Hugging Face Hub from inside the notebook; the repo creation
# and folder upload in the following cells act on the Hub account.
from huggingface_hub import notebook_login
notebook_login()

In [None]:
from huggingface_hub import HfApi
api = HfApi()
# exist_ok=True makes this cell safe to re-run: without it create_repo raises
# once the repository has been created by an earlier run.
api.create_repo(repo_id="chalique/detox-regression", repo_type="model", exist_ok=True)

In [None]:
# Push the directory written by save_pretrained to the model repo on the Hub.
api.upload_folder(
    repo_id="chalique/detox-regression",
    repo_type="model",
    folder_path="./toxic_regression_model",
)