In [None]:
import random
import warnings
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
import torch.nn as nn
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_recall_fscore_support
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from transformers import (
    AutoTokenizer,
    AutoModel,
    get_linear_schedule_with_warmup
)
warnings.filterwarnings('ignore')

# Seed every random number generator the notebook draws from so reruns are
# reproducible. Python's `random` is included because
# show_sample_predictions() picks its examples with random.sample().
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

In [None]:
class ArabicTextDataset(Dataset):
    """Torch dataset that pairs each text with its emotion, offensive and hate labels.

    Every item is tokenized on access: the text is converted with ``str()``,
    truncated and padded to ``max_length``, and returned together with the
    three labels as ``torch.long`` tensors.
    """

    def __init__(self, texts, emotions, offensive, hate, tokenizer, max_length=512):
        # Indexable sequences of equal length, one entry per sample.
        self.texts = texts
        self.emotions, self.offensive, self.hate = emotions, offensive, hate
        # Tokenizer callable and the fixed length every sample is padded to.
        self.tokenizer, self.max_length = tokenizer, max_length

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        tokens = self.tokenizer(
            str(self.texts[idx]),
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )

        # The tokenizer output is flattened to 1-D so each item has no batch axis.
        sample = {
            'input_ids': tokens['input_ids'].flatten(),
            'attention_mask': tokens['attention_mask'].flatten(),
        }

        label_sources = (
            ('emotion', self.emotions),
            ('offensive', self.offensive),
            ('hate', self.hate),
        )
        for task_name, values in label_sources:
            sample[task_name] = torch.tensor(values[idx], dtype=torch.long)

        return sample

print("Dataset class defined successfully!")

Dataset class defined successfully!


In [None]:
class MultiTaskAraBERT(nn.Module):
    """Pretrained encoder shared by three linear classification heads.

    The encoder is loaded with ``AutoModel.from_pretrained(model_name)``; its
    ``pooler_output`` goes through dropout and is then fed to one
    ``nn.Linear`` head each for emotion, offensive and hate.
    """

    def __init__(self, model_name, num_emotions, num_offensive, num_hate, dropout=0.3):
        super().__init__()

        # Shared pretrained encoder and the dropout applied to its pooled output.
        self.bert = AutoModel.from_pretrained(model_name)
        self.dropout = nn.Dropout(dropout)

        # One head per task, all reading the same encoder feature size.
        encoder_dim = self.bert.config.hidden_size
        self.emotion_classifier = nn.Linear(encoder_dim, num_emotions)
        self.offensive_classifier = nn.Linear(encoder_dim, num_offensive)
        self.hate_classifier = nn.Linear(encoder_dim, num_hate)

    def forward(self, input_ids, attention_mask):
        """Return the tuple ``(emotion_logits, offensive_logits, hate_logits)``."""
        encoded = self.bert(input_ids=input_ids, attention_mask=attention_mask)
        features = self.dropout(encoded.pooler_output)

        task_heads = (
            self.emotion_classifier,
            self.offensive_classifier,
            self.hate_classifier,
        )
        return tuple(head(features) for head in task_heads)

print("Multi-task model class defined successfully!")

Multi-task model class defined successfully!


In [None]:
# Cell 4: Main Classifier Class
# =============================

class ArabicMultiTaskClassifier:
    """Pipeline helper for joint emotion / offensive / hate classification of Arabic text.

    Holds the tokenizer, one label encoder per task and the target device, and
    exposes the pipeline steps in the order they are meant to be called:
    ``load_data`` -> ``prepare_data`` -> ``create_model`` -> ``train_model`` ->
    ``predict_text``.
    """

    def __init__(self, model_name='aubmindlab/bert-base-arabertv2', max_length=512):
        """
        Args:
            model_name: Model id passed to ``AutoTokenizer.from_pretrained`` and,
                later, to ``MultiTaskAraBERT``.
            max_length: Token length every text is truncated/padded to.
        """
        self.model_name = model_name
        self.max_length = max_length
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

        # Initialize tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

        # Label encoders for each task (fitted in load_data)
        self.emotion_encoder = LabelEncoder()
        self.offensive_encoder = LabelEncoder()
        self.hate_encoder = LabelEncoder()

        print(f"Using device: {self.device}")

    def load_data(self, file_path):
        """Load a CSV/Excel file, clean it and fit the three label encoders.

        Rows without text are dropped, missing labels are replaced by fixed
        defaults, and an integer-encoded column is added for every task.

        Args:
            file_path: Path string ending in ``.csv``, ``.xlsx`` or ``.xls``.
                The file must provide 'text', 'Emotion', 'Offensive' and
                'Hate' columns.

        Returns:
            The cleaned DataFrame with the extra columns 'emotion_encoded',
            'offensive_encoded' and 'hate_encoded'.

        Raises:
            ValueError: If the file extension is not supported.
        """
        print("Loading data...")

        # Determine file type and read accordingly
        if file_path.endswith('.csv'):
            df = pd.read_csv(file_path)
        elif file_path.endswith(('.xlsx', '.xls')):
            df = pd.read_excel(file_path)
        else:
            raise ValueError("Unsupported file format. Please use CSV (.csv) or Excel (.xlsx, .xls) files.")

        # Basic data info
        print(f"Dataset shape: {df.shape}")
        print("\nColumn names:", df.columns.tolist())
        print("\nFirst few rows:")
        print(df.head())

        # Check for missing values
        print("\nMissing values:")
        print(df.isnull().sum())

        # Remove rows with missing text. The explicit copy guarantees that the
        # column assignments below act on an independent frame rather than on
        # a possible view of the frame that was read from disk.
        df = df.dropna(subset=['text']).copy()

        # Fill missing labels with a fixed default class per task.
        # NOTE(review): rows without a 'Hate' annotation are trained as
        # 'not_hate' -- confirm that "missing" really means "negative" in this
        # dataset.
        df['Emotion'] = df['Emotion'].fillna('neutral')
        df['Offensive'] = df['Offensive'].fillna('no')
        df['Hate'] = df['Hate'].fillna('not_hate')

        # Encode labels
        df['emotion_encoded'] = self.emotion_encoder.fit_transform(df['Emotion'])
        df['offensive_encoded'] = self.offensive_encoder.fit_transform(df['Offensive'])
        df['hate_encoded'] = self.hate_encoder.fit_transform(df['Hate'])

        # Print label distributions
        print("\nLabel distributions:")
        print("Emotions:", df['Emotion'].value_counts())
        print("Offensive:", df['Offensive'].value_counts())
        print("Hate:", df['Hate'].value_counts())

        return df

    def _build_dataset(self, frame):
        """Wrap one split's text and encoded-label columns in an ArabicTextDataset."""
        return ArabicTextDataset(
            frame['text'].values,
            frame['emotion_encoded'].values,
            frame['offensive_encoded'].values,
            frame['hate_encoded'].values,
            self.tokenizer,
            self.max_length
        )

    def prepare_data(self, df, test_size=0.2, val_size=0.1):
        """Split ``df`` into train/validation/test sets, stratified on 'Emotion'.

        Args:
            df: DataFrame produced by ``load_data``.
            test_size: Fraction of all rows held out for testing.
            val_size: Fraction of all rows held out for validation.

        Returns:
            ``(train_dataset, val_dataset, test_dataset, train_df, val_df, test_df)``.
        """
        print("Preparing datasets...")

        # Split data
        train_df, test_df = train_test_split(
            df, test_size=test_size, random_state=42, stratify=df['Emotion']
        )

        # val_size is a fraction of the full data, so it is rescaled to the
        # share of rows that remain after the test split.
        train_df, val_df = train_test_split(
            train_df, test_size=val_size/(1-test_size), random_state=42, stratify=train_df['Emotion']
        )

        print(f"Train size: {len(train_df)}")
        print(f"Validation size: {len(val_df)}")
        print(f"Test size: {len(test_df)}")

        # Create datasets
        train_dataset = self._build_dataset(train_df)
        val_dataset = self._build_dataset(val_df)
        test_dataset = self._build_dataset(test_df)

        return train_dataset, val_dataset, test_dataset, train_df, val_df, test_df

    def create_model(self, num_emotions, num_offensive, num_hate):
        """Create the multi-task model and move it to ``self.device``.

        Args:
            num_emotions: Number of emotion classes.
            num_offensive: Number of offensive classes.
            num_hate: Number of hate classes.

        Returns:
            A ``MultiTaskAraBERT`` instance placed on ``self.device``.
        """
        print("Creating model...")

        model = MultiTaskAraBERT(
            self.model_name,
            num_emotions,
            num_offensive,
            num_hate
        )

        model.to(self.device)
        return model

    def _batch_loss(self, model, batch, criterion):
        """Run one batch through ``model`` and return the sum of the three task losses."""
        input_ids = batch['input_ids'].to(self.device)
        attention_mask = batch['attention_mask'].to(self.device)

        emotion_logits, offensive_logits, hate_logits = model(input_ids, attention_mask)

        # Combined loss: plain (unweighted) sum of the per-task losses.
        return (
            criterion(emotion_logits, batch['emotion'].to(self.device))
            + criterion(offensive_logits, batch['offensive'].to(self.device))
            + criterion(hate_logits, batch['hate'].to(self.device))
        )

    def train_model(self, model, train_loader, val_loader, num_epochs=5, learning_rate=2e-5):
        """Train the multi-task model and track the loss per epoch.

        Args:
            model: Model returned by ``create_model``.
            train_loader: DataLoader yielding training batches.
            val_loader: DataLoader yielding validation batches.
            num_epochs: Number of passes over ``train_loader``.
            learning_rate: Initial AdamW learning rate (decayed linearly, no warmup).

        Returns:
            ``(train_losses, val_losses)``: two lists holding the average
            combined loss of every epoch.
        """
        print("Starting training...")

        # A default CrossEntropyLoss carries no per-task state, so one
        # instance serves all three heads.
        criterion = nn.CrossEntropyLoss()

        # Optimizer and scheduler
        optimizer = AdamW(model.parameters(), lr=learning_rate)
        total_steps = len(train_loader) * num_epochs
        scheduler = get_linear_schedule_with_warmup(
            optimizer,
            num_warmup_steps=0,
            num_training_steps=total_steps
        )

        # Training history
        train_losses = []
        val_losses = []

        for epoch in range(num_epochs):
            print(f"\nEpoch {epoch + 1}/{num_epochs}")

            # Training phase
            model.train()
            total_train_loss = 0

            for batch in tqdm(train_loader, desc="Training"):
                optimizer.zero_grad()

                total_loss = self._batch_loss(model, batch, criterion)

                # Backward pass with gradient clipping at norm 1.0
                total_loss.backward()
                torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)
                optimizer.step()
                scheduler.step()

                total_train_loss += total_loss.item()

            # max(..., 1) keeps an empty loader from raising ZeroDivisionError.
            avg_train_loss = total_train_loss / max(len(train_loader), 1)
            train_losses.append(avg_train_loss)

            # Validation phase
            model.eval()
            total_val_loss = 0

            with torch.no_grad():
                for batch in tqdm(val_loader, desc="Validation"):
                    total_val_loss += self._batch_loss(model, batch, criterion).item()

            avg_val_loss = total_val_loss / max(len(val_loader), 1)
            val_losses.append(avg_val_loss)

            print(f"Average training loss: {avg_train_loss:.4f}")
            print(f"Average validation loss: {avg_val_loss:.4f}")

        return train_losses, val_losses

    @staticmethod
    def _decode_prediction(logits, encoder):
        """Turn one row of logits into ``{'label', 'confidence'}`` using ``encoder``."""
        probabilities = torch.softmax(logits, dim=1)[0]
        predicted_index = torch.argmax(logits, dim=1).item()
        return {
            'label': encoder.inverse_transform([predicted_index])[0],
            'confidence': probabilities[predicted_index].item()
        }

    def predict_text(self, model, text):
        """Predict the three labels for a single text.

        Args:
            model: Trained model returning three logit tensors.
            text: Text to classify. Non-string values are converted with
                ``str()``, exactly as ``ArabicTextDataset`` does for training
                samples, so numeric or missing cells from a data file do not
                make the tokenizer fail.

        Returns:
            Dict with the keys 'emotion', 'offensive' and 'hate'; each value is
            a dict holding the decoded 'label' and its softmax 'confidence'.
        """
        model.eval()

        # Tokenize text
        encoding = self.tokenizer(
            str(text),
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )

        input_ids = encoding['input_ids'].to(self.device)
        attention_mask = encoding['attention_mask'].to(self.device)

        with torch.no_grad():
            emotion_logits, offensive_logits, hate_logits = model(input_ids, attention_mask)

            return {
                'emotion': self._decode_prediction(emotion_logits, self.emotion_encoder),
                'offensive': self._decode_prediction(offensive_logits, self.offensive_encoder),
                'hate': self._decode_prediction(hate_logits, self.hate_encoder)
            }

print("Main classifier class defined successfully!")

Main classifier class defined successfully!


In [None]:
# Cell 5: Training Function
# =========================

def train_arabic_classifier(data_file_path, num_epochs=1, batch_size=16, learning_rate=2e-5,
                            max_length=512, save_path='arabic_multitask_model.pth'):
    """
    Main training function for Arabic multi-task text classification

    Loads the data, splits it into train/validation/test sets, trains a
    MultiTaskAraBERT model and saves its weights (state_dict) to ``save_path``.

    Args:
        data_file_path: Path to training data (CSV or Excel)
        num_epochs: Number of training epochs
        batch_size: Batch size for training, validation and test loaders
        learning_rate: Learning rate for optimizer
        max_length: Token length every text is truncated/padded to
        save_path: File the trained model's state_dict is written to

    Returns:
        dict with the keys:
            'classifier': the ArabicMultiTaskClassifier (tokenizer + fitted label encoders)
            'model': the trained model
            'test_loader': DataLoader over the held-out test split
            'test_df': DataFrame of the held-out test split
            'full_df': the complete cleaned DataFrame
            'train_losses': average training loss per epoch
            'val_losses': average validation loss per epoch
    """

    print("="*60)
    print("ARABIC MULTI-TASK TEXT CLASSIFICATION TRAINING")
    print("="*60)

    # Initialize classifier
    classifier = ArabicMultiTaskClassifier(max_length=max_length)

    # Load data
    df = classifier.load_data(data_file_path)

    # Prepare datasets; only the test frame is needed later on.
    train_dataset, val_dataset, test_dataset, _, _, test_df = classifier.prepare_data(df)

    # Create data loaders (only the training data is shuffled)
    train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)
    test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

    # Get number of classes for each task
    num_emotions = len(classifier.emotion_encoder.classes_)
    num_offensive = len(classifier.offensive_encoder.classes_)
    num_hate = len(classifier.hate_encoder.classes_)

    print(f"\nNumber of emotion classes: {num_emotions}")
    print(f"Number of offensive classes: {num_offensive}")
    print(f"Number of hate classes: {num_hate}")

    # Create model
    model = classifier.create_model(num_emotions, num_offensive, num_hate)

    # Train model
    train_losses, val_losses = classifier.train_model(
        model, train_loader, val_loader, num_epochs=num_epochs, learning_rate=learning_rate
    )

    # Save model weights. The label encoders are NOT part of this file; they
    # only live on the returned classifier object.
    torch.save(model.state_dict(), save_path)
    print(f"\nModel saved as '{save_path}'")

    # Return everything needed for evaluation
    return {
        'classifier': classifier,
        'model': model,
        'test_loader': test_loader,
        'test_df': test_df,
        'full_df': df,
        'train_losses': train_losses,
        'val_losses': val_losses
    }

print("Training function defined successfully!")

Training function defined successfully!


In [None]:
# Cell 6: Evaluation Functions
# ============================

def calculate_detailed_metrics(emotion_true, emotion_pred, offensive_true, offensive_pred, hate_true, hate_pred):
    """Compute accuracy and weighted precision/recall/F1 for each task plus their mean.

    Returns a dict keyed by 'emotion', 'offensive', 'hate' and 'average'; each
    value is a dict with 'accuracy', 'precision', 'recall' and 'f1_score'.
    The 'average' entry is the unweighted mean over the three tasks.
    """
    task_inputs = (
        ('emotion', emotion_true, emotion_pred),
        ('offensive', offensive_true, offensive_pred),
        ('hate', hate_true, hate_pred),
    )

    metrics = {}
    for task, y_true, y_pred in task_inputs:
        accuracy = accuracy_score(y_true, y_pred)
        precision, recall, f1, _ = precision_recall_fscore_support(
            y_true, y_pred, average='weighted', zero_division=0
        )
        metrics[task] = {
            'accuracy': accuracy,
            'precision': precision,
            'recall': recall,
            'f1_score': f1
        }

    # Unweighted mean of each score across the three tasks.
    metrics['average'] = {
        score: (metrics['emotion'][score] + metrics['offensive'][score] + metrics['hate'][score]) / 3
        for score in ('accuracy', 'precision', 'recall', 'f1_score')
    }

    return metrics

In [None]:
def print_detailed_results(emotion_true, emotion_pred, offensive_true, offensive_pred, hate_true, hate_pred, metrics):
    """Print a classification report per task followed by a summary table.

    ``metrics`` is expected in the layout produced by
    ``calculate_detailed_metrics``: one entry per task plus 'average', each
    holding 'accuracy', 'precision', 'recall' and 'f1_score'.
    """
    wide_rule = "=" * 80
    narrow_rule = "=" * 50
    dash_rule = "-" * 80

    # (printed label, key in the metrics dict), in display order.
    score_columns = (
        ('Accuracy', 'accuracy'),
        ('Precision', 'precision'),
        ('Recall', 'recall'),
        ('F1-Score', 'f1_score'),
    )

    def table_row(name, scores):
        # One summary line: 20-wide name followed by four 12-wide scores.
        cells = ' '.join(f"{scores[key]:<12.4f}" for _, key in score_columns)
        return f"{name:<20} {cells}"

    print("\n" + wide_rule)
    print("DETAILED CLASSIFICATION RESULTS")
    print(wide_rule)

    # Individual task results
    sections = (
        ("EMOTION CLASSIFICATION", 'emotion', emotion_true, emotion_pred),
        ("OFFENSIVE LANGUAGE DETECTION", 'offensive', offensive_true, offensive_pred),
        ("HATE SPEECH DETECTION", 'hate', hate_true, hate_pred),
    )
    for heading, task, y_true, y_pred in sections:
        print("\n" + narrow_rule)
        print(heading)
        print(narrow_rule)
        print(classification_report(y_true, y_pred))
        for label, key in score_columns:
            print(f"{label}: {metrics[task][key]:.4f}")

    # Summary table
    print("\n" + wide_rule)
    print("METRICS SUMMARY TABLE")
    print(wide_rule)
    print(f"{'Task':<20} " + ' '.join(f"{label:<12}" for label, _ in score_columns))
    print(dash_rule)
    for name, task in (('Emotion', 'emotion'), ('Offensive', 'offensive'), ('Hate', 'hate')):
        print(table_row(name, metrics[task]))
    print(dash_rule)
    print(table_row('AVERAGE', metrics['average']))
    print(wide_rule)

In [None]:
def evaluate_model(classifier, model, test_loader, test_df):
    """Score the model on the test loader, print the metrics and export the predictions.

    Predictions are collected batch by batch, decoded back to their original
    labels with the classifier's encoders, attached to a copy of ``test_df``
    and written to 'test_predictions_with_original_data.xlsx'.

    Returns:
        dict with 'metrics' (output of calculate_detailed_metrics) and
        'results_df' (``test_df`` plus prediction and correctness columns).
    """
    print("Evaluating model...")

    task_names = ('emotion', 'offensive', 'hate')
    predicted_ids = {task: [] for task in task_names}
    true_ids = {task: [] for task in task_names}

    model.eval()
    with torch.no_grad():
        for batch in tqdm(test_loader, desc="Testing"):
            emotion_logits, offensive_logits, hate_logits = model(
                batch['input_ids'].to(classifier.device),
                batch['attention_mask'].to(classifier.device)
            )

            # Highest-scoring class per sample, collected together with the true ids.
            batch_logits = (emotion_logits, offensive_logits, hate_logits)
            for task, logits in zip(task_names, batch_logits):
                predicted_ids[task].extend(torch.argmax(logits, dim=1).cpu().numpy())
            for task in task_names:
                true_ids[task].extend(batch[task].numpy())

    # Convert encoded ids back to original labels
    encoders = {
        'emotion': classifier.emotion_encoder,
        'offensive': classifier.offensive_encoder,
        'hate': classifier.hate_encoder,
    }
    predicted = {task: encoders[task].inverse_transform(predicted_ids[task]) for task in task_names}
    actual = {task: encoders[task].inverse_transform(true_ids[task]) for task in task_names}

    # Same positional order for both helpers: (true, pred) per task.
    paired_labels = (
        actual['emotion'], predicted['emotion'],
        actual['offensive'], predicted['offensive'],
        actual['hate'], predicted['hate'],
    )
    metrics = calculate_detailed_metrics(*paired_labels)
    print_detailed_results(*paired_labels, metrics)

    # Attach predictions, then correctness flags, to a copy of the test frame.
    results_df = test_df.copy()
    results_df['emotion_predicted'] = predicted['emotion']
    results_df['offensive_predicted'] = predicted['offensive']
    results_df['hate_predicted'] = predicted['hate']

    column_triples = (
        ('emotion_correct', 'Emotion', 'emotion_predicted'),
        ('offensive_correct', 'Offensive', 'offensive_predicted'),
        ('hate_correct', 'Hate', 'hate_predicted'),
    )
    for flag_column, truth_column, predicted_column in column_triples:
        results_df[flag_column] = results_df[truth_column] == results_df[predicted_column]

    # Save results to file
    results_df.to_excel('test_predictions_with_original_data.xlsx', index=False)
    print("\nTest predictions saved to 'test_predictions_with_original_data.xlsx'")

    return {'metrics': metrics, 'results_df': results_df}

print("Evaluation functions defined successfully!")

Evaluation functions defined successfully!


In [None]:
# Cell 7: External Test File Prediction and Evaluation
# ===================================================

def predict_on_external_test_file(classifier, model, test_file_path):
    """
    Predict on an external test file and, when it carries labels, score the predictions

    The predictions (with confidences) are written to
    'external_test_predictions_<file name without extension>.xlsx' in the
    current working directory and a few random samples are printed.

    Args:
        classifier: Trained classifier object (tokenizer + fitted label encoders)
        model: Trained model
        test_file_path: Path string to external test CSV/Excel file

    Returns:
        results_df: DataFrame with predictions and evaluations

    Raises:
        ValueError: If the file format is unsupported or the 'text' column is missing
    """
    print(f"Loading external test file: {test_file_path}")

    # Load the external test file
    if test_file_path.endswith('.csv'):
        test_df = pd.read_csv(test_file_path)
    elif test_file_path.endswith(('.xlsx', '.xls')):
        test_df = pd.read_excel(test_file_path)
    else:
        raise ValueError("Unsupported file format. Please use CSV (.csv) or Excel (.xlsx, .xls) files.")

    print(f"External test file loaded with {len(test_df)} samples")
    print("Columns in test file:", test_df.columns.tolist())

    # Check if 'text' column exists
    if 'text' not in test_df.columns:
        raise ValueError("Test file must contain a 'text' column")

    # Defaults mirror ArabicMultiTaskClassifier.load_data(): the encoders were
    # fitted on labels whose missing values had been replaced this way.
    label_defaults = {'Emotion': 'neutral', 'Offensive': 'no', 'Hate': 'not_hate'}

    # Check if ground truth labels exist
    has_labels = all(col in test_df.columns for col in label_defaults)

    if has_labels:
        print("Ground truth labels found. Will calculate accuracy metrics.")

        # Without this step a blank label cell reaches the encoder as NaN, is
        # rejected as an unseen label, and every metric below is skipped.
        test_df = test_df.fillna(value=label_defaults)

        # Encode labels; all three must succeed before any column is added so
        # the frame never ends up with a partial set of encoded columns.
        try:
            encoded_columns = {
                'emotion_encoded': classifier.emotion_encoder.transform(test_df['Emotion']),
                'offensive_encoded': classifier.offensive_encoder.transform(test_df['Offensive']),
                'hate_encoded': classifier.hate_encoder.transform(test_df['Hate']),
            }
        except ValueError as e:
            print(f"Warning: Some labels in test file are not seen during training: {e}")
            print("Proceeding with prediction only...")
            has_labels = False
        else:
            test_df = test_df.assign(**encoded_columns)
    else:
        print("No ground truth labels found. Proceeding with prediction only...")

    # Get predictions for all samples
    model.eval()
    all_texts = test_df['text'].tolist()
    all_predictions = []

    print("Making predictions...")
    for i, text in enumerate(tqdm(all_texts, desc="Predicting")):
        try:
            prediction = classifier.predict_text(model, text)
            all_predictions.append(prediction)
        except Exception as e:
            # Best effort: one bad row must not abort the whole file, so it is
            # reported and recorded as 'unknown' to keep rows aligned.
            print(f"Error predicting text at index {i}: {e}")
            all_predictions.append({
                'emotion': {'label': 'unknown', 'confidence': 0.0},
                'offensive': {'label': 'unknown', 'confidence': 0.0},
                'hate': {'label': 'unknown', 'confidence': 0.0}
            })

    # Create results DataFrame
    results_df = test_df.copy()
    results_df['emotion_predicted'] = [pred['emotion']['label'] for pred in all_predictions]
    results_df['emotion_confidence'] = [pred['emotion']['confidence'] for pred in all_predictions]
    results_df['offensive_predicted'] = [pred['offensive']['label'] for pred in all_predictions]
    results_df['offensive_confidence'] = [pred['offensive']['confidence'] for pred in all_predictions]
    results_df['hate_predicted'] = [pred['hate']['label'] for pred in all_predictions]
    results_df['hate_confidence'] = [pred['hate']['confidence'] for pred in all_predictions]

    # Calculate accuracy if ground truth is available
    if has_labels:
        results_df['emotion_correct'] = results_df['Emotion'] == results_df['emotion_predicted']
        results_df['offensive_correct'] = results_df['Offensive'] == results_df['offensive_predicted']
        results_df['hate_correct'] = results_df['Hate'] == results_df['hate_predicted']

        # Calculate and print accuracies
        emotion_accuracy = (results_df['emotion_correct'].sum() / len(results_df)) * 100
        offensive_accuracy = (results_df['offensive_correct'].sum() / len(results_df)) * 100
        hate_accuracy = (results_df['hate_correct'].sum() / len(results_df)) * 100
        avg_accuracy = (emotion_accuracy + offensive_accuracy + hate_accuracy) / 3

        print("\nExternal Test File Accuracy:")
        print(f"Emotion: {emotion_accuracy:.2f}%")
        print(f"Offensive: {offensive_accuracy:.2f}%")
        print(f"Hate: {hate_accuracy:.2f}%")
        print(f"Average: {avg_accuracy:.2f}%")

        # Calculate detailed metrics if labels are available
        try:
            label_pairs = (
                results_df['Emotion'], results_df['emotion_predicted'],
                results_df['Offensive'], results_df['offensive_predicted'],
                results_df['Hate'], results_df['hate_predicted'],
            )
            metrics = calculate_detailed_metrics(*label_pairs)
            print_detailed_results(*label_pairs, metrics)
        except Exception as e:
            print(f"Error calculating detailed metrics: {e}")

    # Save results. Path(...).stem strips the directory and only the final
    # extension, independent of the path separator used.
    output_filename = f"external_test_predictions_{Path(test_file_path).stem}.xlsx"
    results_df.to_excel(output_filename, index=False)
    print(f"\nPredictions saved to '{output_filename}'")

    # Show sample predictions
    show_sample_predictions(results_df, has_labels, num_samples=5)

    return results_df

def show_sample_predictions(results_df, has_labels, num_samples=5):
    """Print up to ``num_samples`` randomly chosen rows of ``results_df``.

    Each example shows the text (cut after 100 characters), the predicted
    label and confidence per task and, when ``has_labels`` is true, the true
    labels and a per-task correctness mark.
    """
    banner = "=" * 80
    print("\n" + banner)
    print("SAMPLE PREDICTIONS FROM EXTERNAL TEST FILE")
    print(banner)

    # Draw distinct row positions; never more than the frame holds.
    row_count = len(results_df)
    chosen_positions = random.sample(range(row_count), min(num_samples, row_count))

    for example_no, position in enumerate(chosen_positions, start=1):
        record = results_df.iloc[position]
        full_text = str(record['text'])
        suffix = '...' if len(full_text) > 100 else ''

        print(f"\nExample {example_no}:")
        print(f"Text: {full_text[:100]}{suffix}")

        if has_labels:
            print(f"TRUE LABELS  -> Emotion: {record['Emotion']}, Offensive: {record['Offensive']}, Hate: {record['Hate']}")

        print(f"PREDICTIONS -> Emotion: {record['emotion_predicted']} (conf: {record['emotion_confidence']:.3f})")
        print(f"               Offensive: {record['offensive_predicted']} (conf: {record['offensive_confidence']:.3f})")
        print(f"               Hate: {record['hate_predicted']} (conf: {record['hate_confidence']:.3f})")

        if has_labels:
            column_pairs = (
                ('emotion_predicted', 'Emotion'),
                ('offensive_predicted', 'Offensive'),
                ('hate_predicted', 'Hate'),
            )
            emotion_mark, offensive_mark, hate_mark = (
                "✓" if record[predicted] == record[truth] else "✗"
                for predicted, truth in column_pairs
            )
            print(f"CORRECTNESS -> Emotion: {emotion_mark}, Offensive: {offensive_mark}, Hate: {hate_mark}")

        print("-" * 80)

print("External test file prediction functions defined successfully!")

External test file prediction functions defined successfully!


In [None]:
# Cell 8: Usage Examples
# ======================

# Training Example:

# To train the model:
# Run the full pipeline: train_arabic_classifier loads the file, splits it
# into train/validation/test sets, trains the model and saves its weights.
# It returns a dict; the entries needed for evaluation are unpacked below.
training_results = train_arabic_classifier(
    data_file_path='/content/sample_data/train.csv',  # Replace with your training file
    num_epochs=1,
    batch_size=16,
    learning_rate=2e-5
)

# Extract components.
# `classifier` and `model` are module-level names that the external
# prediction cell further down also relies on.
classifier = training_results['classifier']
model = training_results['model']
test_loader = training_results['test_loader']
test_df = training_results['test_df']

# Evaluate on the held-out test split: prints per-task metrics and writes the
# predictions to 'test_predictions_with_original_data.xlsx'.
evaluation_results = evaluate_model(classifier, model, test_loader, test_df)

ARABIC MULTI-TASK TEXT CLASSIFICATION TRAINING
Using device: cuda
Loading data...
Dataset shape: (5960, 5)

Column names: ['id', 'text', 'Emotion', 'Offensive', 'Hate']

First few rows:
     id                                               text       Emotion  \
0  2537  أحد التجار الشباب العمانيين يقول للاسف لما يكو...       neutral   
1  5579  @JALHARBISKY مجموعه القدرة الجنسيه👍<LF> <LF>بد...      optimism   
2  6092        @rwn4o حبيبييي والله اكثثثرر يارب امين🥺♥️♥️          love   
3  2540  #وصال_دوت_FM<LF>مع سميرة الفطيسية @Samira_Alfu...       neutral   
4  3159  من ينتزع ارواح اطفالنا من أجسادها بكل وحشية عل...  anticipation   

  Offensive Hate  
0        no  NaN  
1        no  NaN  
2        no  NaN  
3        no  NaN  
4        no  NaN  

Missing values:
id              0
text            0
Emotion         0
Offensive       0
Hate         4216
dtype: int64

Label distributions:
Emotions: Emotion
anger           1551
disgust          777
neutral          661
love             593

Training: 100%|██████████| 261/261 [06:22<00:00,  1.47s/it]
Validation: 100%|██████████| 38/38 [00:17<00:00,  2.14it/s]


Average training loss: 2.6509
Average validation loss: 2.4087

Model saved as 'arabic_multitask_model.pth'
Evaluating model...


Testing: 100%|██████████| 75/75 [00:35<00:00,  2.10it/s]



DETAILED CLASSIFICATION RESULTS

EMOTION CLASSIFICATION
              precision    recall  f1-score   support

       anger       0.49      0.92      0.64       310
anticipation       0.33      0.03      0.06        98
  confidence       0.00      0.00      0.00        42
     disgust       0.00      0.00      0.00       155
        fear       0.00      0.00      0.00        11
         joy       0.51      0.41      0.45       106
        love       0.31      0.78      0.45       119
     neutral       0.45      0.67      0.54       132
    optimism       0.45      0.06      0.11        84
   pessimism       0.00      0.00      0.00        39
     sadness       0.33      0.07      0.12        67
    surprise       0.00      0.00      0.00        29

    accuracy                           0.44      1192
   macro avg       0.24      0.25      0.20      1192
weighted avg       0.33      0.44      0.33      1192

Accuracy: 0.4388
Precision: 0.3331
Recall: 0.4388
F1-Score: 0.3304

OFFENSIV

In [None]:
# Predict on external test file.
# Requires `classifier` and `model` from the training cell above. The returned
# DataFrame holds the file's rows plus predicted labels and confidences, and
# is also saved as 'external_test_predictions_<file name>.xlsx'.
external_results = predict_on_external_test_file(
    classifier=classifier,
    model=model,
    test_file_path='/content/sample_data/validation.csv'
)

Loading external test file: /content/sample_data/validation.csv
External test file loaded with 1277 samples
Columns in test file: ['id', 'text', 'Emotion', 'Offensive', 'Hate']
Ground truth labels found. Will calculate accuracy metrics.
Proceeding with prediction only...
Making predictions...


Predicting: 100%|██████████| 1277/1277 [00:45<00:00, 28.35it/s]



Predictions saved to 'external_test_predictions_validation.xlsx'

SAMPLE PREDICTIONS FROM EXTERNAL TEST FILE

Example 1:
Text: RT @fayz35510671: طيب وين اللي متفاعلين نبي نكمل ٥٠٠
PREDICTIONS -> Emotion: love (conf: 0.200)
               Offensive: no (conf: 0.941)
               Hate: not_hate (conf: 0.985)
--------------------------------------------------------------------------------

Example 2:
Text: ياريت نوفر قوتنا واتحادنا في هاشتاج حق الخطيب ومين بيحمي الزلنطحي انتوا بتمثلوا علينا لسه الخطيب مخد...
PREDICTIONS -> Emotion: anger (conf: 0.616)
               Offensive: yes (conf: 0.741)
               Hate: not_hate (conf: 0.914)
--------------------------------------------------------------------------------

Example 3:
Text: لو الغـــلا يوخذ ويعطي على الكيف<LF>ترى الغـــــلا منـــــي لك اول هــديه<LF><LF>هرجي معك صادق ولا ا...
PREDICTIONS -> Emotion: love (conf: 0.157)
               Offensive: no (conf: 0.913)
               Hate: not_hate (conf: 0.982)
---------------------