# Detecting Toxic Content with Neural Network Architectures

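This notebook implements and compares two deep learning models for binary toxic-content classification: a GRU-based recurrent network and a fine-tuned ALBERT transformer.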

In [None]:
# Standard library imports
import re
import time
import random
import warnings
from collections import Counter
from itertools import combinations
warnings.filterwarnings("ignore")

# Third-party imports - Data manipulation
import numpy as np
import pandas as pd

# Third-party imports - Visualization
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

# Third-party imports - Machine Learning
from sklearn.model_selection import train_test_split
from sklearn.metrics import (
    accuracy_score, classification_report, confusion_matrix,
    roc_curve, auc, f1_score
)
from sklearn.utils.class_weight import compute_class_weight
from sklearn.utils import resample

# Third-party imports - Deep Learning
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from torch.optim import AdamW

# Third-party imports - Transformers
from transformers import (
    AlbertTokenizer, AlbertForSequenceClassification
)

# Third-party imports - Utilities
from tqdm import tqdm
from statsmodels.stats.contingency_tables import mcnemar
from statsmodels.stats.multitest import multipletests

# Device configuration: use the GPU when CUDA is available, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")

# Google Colab setup: mount Google Drive so that the Drive-based paths defined
# in the configuration cell (dataset CSV, ALBERT checkpoint) can be resolved.
# NOTE(review): google.colab is presumably only importable inside a Colab
# runtime, so this cell is expected to fail elsewhere - confirm.
from google.colab import drive
drive.mount('/content/drive/', force_remount=False)


In [None]:
# Configuration and Constants
RANDOM_SEED = 1234  # shared seed for the shuffle, the data splits and random.seed()
TEST_SIZE = 0.2  # fraction of all rows held out as the test set
VAL_SIZE = 0.25  # fraction of the non-test rows used for validation (0.25 * 0.8 = 0.2 of all rows)
DATA_PATH = '/content/drive/MyDrive/hatespeech/hatexplain_detailed.csv'  # CSV on the mounted Google Drive
MAX_SEQUENCE_LENGTH_ALBERT = 200  # tokenizer max_length (padding/truncation) for ALBERT inputs
MAX_SEQUENCE_LENGTH_GRU = 200  # maximum number of tokens kept per text for the GRU
VOCAB_SIZE_GRU = 10000  # number of embedding rows for the GRU; index 0 is shared by padding and unknown words
BATCH_SIZE_ALBERT = 4
BATCH_SIZE_GRU = 16
N_RANDOM_SEARCH_ITERATIONS_ALBERT = 10  # hyperparameter configurations tried in the ALBERT search
N_RANDOM_SEARCH_ITERATIONS_GRU = 10  # hyperparameter configurations tried in the GRU search
N_BOOTSTRAP_ITERATIONS = 1000  # bootstrap resamples behind each 95% confidence interval
ALBERT_MODEL_NAME = 'albert-base-v2'  # checkpoint name passed to from_pretrained()
MODEL_SAVE_PATH_ALBERT = '/content/drive/MyDrive/best_albert_binary_model_v1.pth'
MODEL_SAVE_PATH_GRU = 'GRU_Binary.pth'  # relative path: written to the current working directory


In [None]:
# Data Loading and Initial Processing
def load_and_prepare_data(file_path, random_seed=RANDOM_SEED):
    """Load the labelled CSV and return a shuffled two-column frame.

    Args:
        file_path: Path to a CSV whose first column is an index and which
            contains at least the columns ``text`` and ``text_type``.
        random_seed: Seed for the row shuffle, so the order is reproducible.

    Returns:
        DataFrame with the columns ``text`` and ``text_type``. ``text_type``
        is 0 for rows labelled ``'normal'`` and 1 for every other value
        (including missing labels). Rows are shuffled and re-indexed from 0.

    Raises:
        KeyError: If ``text`` or ``text_type`` is missing from the file.
    """
    raw_data = pd.read_csv(file_path, index_col=0)

    # Take an explicit copy of the column subset: assigning into a frame that
    # was sliced out of another one triggers pandas' SettingWithCopyWarning
    # (silenced in this notebook by the global warnings filter).
    processed_data = raw_data.reset_index()[['text', 'text_type']].copy()

    # Binary target, computed column-wise instead of with a per-row lambda:
    # everything that is not 'normal' belongs to the positive class.
    processed_data['text_type'] = (
        processed_data['text_type'] != 'normal'
    ).astype(int)

    # frac=1 draws every row exactly once, i.e. a seeded shuffle.
    processed_data = processed_data.sample(
        frac=1, random_state=random_seed
    ).reset_index(drop=True)

    return processed_data

# Load the dataset and print its shape plus summary statistics
# (describe() is transposed so that each column becomes one row).
dataframe_main = load_and_prepare_data(DATA_PATH)
print(f"Dataset shape: {dataframe_main.shape}")
print(f"\nDataset info:")
print(dataframe_main.describe().T)


In [None]:
# Data Splitting Function
def split_data_for_dl(features, labels, test_size=TEST_SIZE, val_size=VAL_SIZE, random_seed=RANDOM_SEED):
    """Partition features and labels into train, validation and test subsets.

    The test subset is carved out first (``test_size`` of all samples); the
    validation subset is then ``val_size`` of what remains. Both splits use
    ``random_seed``.

    Returns:
        Tuple ``(X_train, X_val, X_test, y_train, y_val, y_test)``.
    """
    # Stage 1: hold out the test set.
    remaining_x, held_out_x, remaining_y, held_out_y = train_test_split(
        features,
        labels,
        test_size=test_size,
        random_state=random_seed,
    )

    # Stage 2: divide the remainder into training and validation sets.
    fit_x, dev_x, fit_y, dev_y = train_test_split(
        remaining_x,
        remaining_y,
        test_size=val_size,
        random_state=random_seed,
    )

    return fit_x, dev_x, held_out_x, fit_y, dev_y, held_out_y


In [None]:
# Text Preprocessing for Deep Learning Models
class DLTextPreprocessor:
    """Normalise raw social-media text for the deep learning models.

    Both public methods apply the same pipeline: lower-case the text, replace
    URLs, @mentions and #hashtags with the placeholders ``<URL>``, ``<USER>``
    and ``<HASHTAG>``, drop every other punctuation character, and collapse
    runs of whitespace.
    """

    # A single alternation inserts the placeholders and removes punctuation in
    # one pass. re.sub never rescans replacement text, so the angle brackets
    # of the placeholders survive; stripping punctuation in a separate, later
    # pass would reduce "<URL>" to "URL".
    _TOKEN_PATTERN = re.compile(r"(http\S+)|(@\w+)|(#\w+)|[^\w\s]")
    # Placeholder for capture group 1, 2 and 3 of _TOKEN_PATTERN respectively.
    _PLACEHOLDERS = ("<URL>", "<USER>", "<HASHTAG>")
    _WHITESPACE_PATTERN = re.compile(r"\s+")

    @staticmethod
    def _replace_token(match):
        """Return the placeholder for a matched entity, or '' for punctuation."""
        if match.lastindex is None:
            # No capture group took part: a plain punctuation character.
            return ""
        return DLTextPreprocessor._PLACEHOLDERS[match.lastindex - 1]

    @staticmethod
    def _clean(text):
        """Run the shared cleaning pipeline on one text (any object is str()-ed)."""
        lowered = str(text).lower()
        replaced = DLTextPreprocessor._TOKEN_PATTERN.sub(
            DLTextPreprocessor._replace_token, lowered
        )
        return DLTextPreprocessor._WHITESPACE_PATTERN.sub(" ", replaced).strip()

    @staticmethod
    def clean_text_albert(text):
        """Clean text for ALBERT (minimal preprocessing)."""
        return DLTextPreprocessor._clean(text)

    @staticmethod
    def clean_text_gru(text):
        """Clean text for GRU (minimal preprocessing)."""
        return DLTextPreprocessor._clean(text)

dl_preprocessor = DLTextPreprocessor()


# Section 3: Neural Network Model Implementation

## 3.1 Recurrent Neural Network: GRU Architecture

In [None]:
# Prepare data for GRU
dataframe_gru = dataframe_main.copy()
dataframe_gru['text'] = [
    dl_preprocessor.clean_text_gru(raw_text) for raw_text in dataframe_gru['text']
]

# Split data
features_gru = list(dataframe_gru['text'])
labels_gru = list(dataframe_gru['text_type'])

# Parenthesised unpacking of the six-way split instead of line continuations.
(
    train_texts_gru, val_texts_gru, test_texts_gru,
    train_labels_gru, val_labels_gru, test_labels_gru,
) = split_data_for_dl(features_gru, labels_gru)

print(f"GRU - Training: {len(train_texts_gru)}, Validation: {len(val_texts_gru)}, Test: {len(test_texts_gru)}")


In [None]:
# Build Vocabulary and Convert to Sequences
def build_vocabulary(texts, vocab_size=VOCAB_SIZE_GRU):
    """Map the most frequent whitespace-separated words of ``texts`` to ids.

    At most ``vocab_size - 1`` words are kept. Ids start at 1 for the most
    frequent word, so id 0 is never assigned to a word.

    Returns:
        Dict from word to its integer id.
    """
    word_counts = Counter(word for text in texts for word in text.split())
    ranked_words = word_counts.most_common(vocab_size - 1)
    return {word: rank for rank, (word, _count) in enumerate(ranked_words, start=1)}

def convert_text_to_sequence_indices(text, word_to_index, max_len=MAX_SEQUENCE_LENGTH_GRU):
    """Encode ``text`` as a list of vocabulary ids, at most ``max_len`` long.

    Words missing from ``word_to_index`` are encoded as 0.
    """
    # Truncate first; looking up words that would be cut off anyway is wasted work.
    kept_words = text.split()[:max_len]
    return [word_to_index.get(word, 0) for word in kept_words]

# Build vocabulary from training data only
word_to_index_gru = build_vocabulary(train_texts_gru, VOCAB_SIZE_GRU)


def _encode_and_pad_gru(texts):
    """Encode ``texts`` with the GRU vocabulary and zero-pad them into one tensor.

    Returns:
        LongTensor of shape (len(texts), length of the longest encoded text).
    """
    encoded = [
        # dtype is explicit on purpose: a text with no tokens yields
        # torch.tensor([]), which is float32 by default. pad_sequence takes
        # the output dtype from the first sequence, so an empty text at the
        # head of a split would produce a float batch that nn.Embedding rejects.
        torch.tensor(
            convert_text_to_sequence_indices(text, word_to_index_gru),
            dtype=torch.long,
        )
        for text in texts
    ]
    return nn.utils.rnn.pad_sequence(encoded, batch_first=True, padding_value=0)


# Convert texts to padded index tensors (each split is padded to its own longest text)
train_sequences_gru = _encode_and_pad_gru(train_texts_gru)
val_sequences_gru = _encode_and_pad_gru(val_texts_gru)
test_sequences_gru = _encode_and_pad_gru(test_texts_gru)

# Convert labels to tensors
train_labels_gru = torch.tensor(train_labels_gru, dtype=torch.long)
val_labels_gru = torch.tensor(val_labels_gru, dtype=torch.long)
test_labels_gru = torch.tensor(test_labels_gru, dtype=torch.long)


In [None]:
# GRU Dataset Class
class GRUSentimentDataset(Dataset):
    """Pairs pre-encoded input sequences with their labels for the GRU model."""

    def __init__(self, sequences, labels):
        # Both containers are indexed with the same position in __getitem__.
        self.sequences, self.labels = sequences, labels

    def __len__(self):
        # The label container defines how many samples the dataset has.
        sample_count = len(self.labels)
        return sample_count

    def __getitem__(self, idx):
        sample = dict(
            input_ids=self.sequences[idx],
            label=self.labels[idx],
        )
        return sample

# Create datasets (one per split)
train_dataset_gru = GRUSentimentDataset(sequences=train_sequences_gru, labels=train_labels_gru)
val_dataset_gru = GRUSentimentDataset(sequences=val_sequences_gru, labels=val_labels_gru)
test_dataset_gru = GRUSentimentDataset(sequences=test_sequences_gru, labels=test_labels_gru)

# Create DataLoaders; only the training loader reshuffles between epochs
train_loader_gru = DataLoader(dataset=train_dataset_gru, batch_size=BATCH_SIZE_GRU, shuffle=True)
val_loader_gru = DataLoader(dataset=val_dataset_gru, batch_size=BATCH_SIZE_GRU)
test_loader_gru = DataLoader(dataset=test_dataset_gru, batch_size=BATCH_SIZE_GRU)


In [None]:
# GRU Model Definition
class GRUSentimentModel(nn.Module):
    """GRU-based sequence classifier: embedding -> GRU -> dropout -> linear.

    The classifier reads the GRU state at the last real token of each
    sequence. Trailing padding is skipped by packing the batch, so the logits
    do not depend on how far a sequence was padded.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_dim: Size of each embedding vector.
        hidden_dim: Size of the GRU hidden state.
        output_dim: Number of output logits (classes).
        dropout_rate: Dropout probability applied to the final hidden state.
    """

    # Index used for padding by this notebook; unknown words share it.
    PAD_INDEX = 0

    def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, dropout_rate=0.3):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.gru = nn.GRU(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        self.dropout = nn.Dropout(dropout_rate)

    def forward(self, input_ids):
        """Return class logits for a batch of index sequences.

        Args:
            input_ids: Integer tensor of shape (batch, seq_len), padded on the
                right with ``PAD_INDEX``. A single unbatched 1-D sequence is
                accepted as well.

        Returns:
            Tensor of shape (batch, output_dim), or (output_dim,) for 1-D input.
        """
        if input_ids.dim() == 1:
            # Unbatched input: run it as a batch of one and drop the batch axis.
            return self.forward(input_ids.unsqueeze(0)).squeeze(0)

        embedded = self.embedding(input_ids)

        # Effective length = 1-based position of the last non-pad token.
        # Index 0 in the middle of a text (an unknown word) is still processed;
        # only a trailing run of index 0 is treated as padding.
        positions = torch.arange(1, input_ids.size(1) + 1, device=input_ids.device)
        lengths = ((input_ids != self.PAD_INDEX) * positions).max(dim=1).values
        # Packing needs length >= 1, so an all-padding row keeps one step.
        lengths = lengths.clamp(min=1)

        packed = nn.utils.rnn.pack_padded_sequence(
            embedded,
            lengths.cpu(),  # pack_padded_sequence expects the lengths on the CPU
            batch_first=True,
            enforce_sorted=False,
        )
        # With packed input the returned hidden state is the state at each
        # sequence's own last step, in the original batch order.
        _, hidden = self.gru(packed)
        return self.fc(self.dropout(hidden[-1]))


In [None]:
# GRU Training and Evaluation Function
def train_and_evaluate_gru_model(hyperparams, train_loader, val_loader, vocab_size, output_dim, device):
    """Fit a fresh GRU classifier and score it on the validation loader.

    Args:
        hyperparams: Dict with the keys ``embedding_dim``, ``hidden_dim``
            (both truncated to int), ``lr`` and ``epochs``.
        train_loader: Loader yielding dicts with ``input_ids`` and ``label``.
        val_loader: Loader with the same batch layout, used for scoring.
        vocab_size: Number of embedding rows of the model.
        output_dim: Number of classes.
        device: Device the model and the batches are moved to.

    Returns:
        Tuple ``(f1, accuracy, model)`` with the weighted validation F1, the
        validation accuracy and the trained model.
    """
    embedding_dim = int(hyperparams['embedding_dim'])
    hidden_dim = int(hyperparams['hidden_dim'])
    learning_rate = hyperparams['lr']

    # Initialize model
    model = GRUSentimentModel(vocab_size, embedding_dim, hidden_dim, output_dim).to(device)

    # Balanced class weights, computed from one pass over the training loader
    observed_labels = []
    for batch in train_loader:
        observed_labels.extend(label.item() for label in batch['label'])
    balanced_weights = compute_class_weight(
        class_weight='balanced',
        classes=np.unique(observed_labels),
        y=observed_labels
    )
    weight_tensor = torch.tensor(balanced_weights, dtype=torch.float).to(device)

    criterion = nn.CrossEntropyLoss(weight=weight_tensor)
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

    # Training loop
    n_epochs = hyperparams['epochs']
    for epoch_number in range(1, n_epochs + 1):
        model.train()
        running_loss = 0
        progress = tqdm(train_loader, desc=f"Training Epoch {epoch_number}/{n_epochs}", leave=True, position=0)

        for batch in progress:
            input_ids = batch['input_ids'].to(device)
            labels = batch['label'].to(device)

            optimizer.zero_grad()
            loss = criterion(model(input_ids), labels)
            loss.backward()
            optimizer.step()

            batch_loss = loss.item()
            running_loss += batch_loss
            progress.set_postfix(loss=batch_loss)

        print(f"Epoch {epoch_number}/{n_epochs}, Loss: {running_loss / len(train_loader):.4f}")

    # Validation evaluation
    model.eval()
    predicted, expected = [], []
    validation_progress = tqdm(val_loader, desc="Validating", leave=True, position=0)

    with torch.no_grad():
        for batch in validation_progress:
            input_ids = batch['input_ids'].to(device)
            labels = batch['label'].to(device)

            batch_predictions = torch.argmax(model(input_ids), dim=1)
            predicted.extend(batch_predictions.cpu().numpy())
            expected.extend(labels.cpu().numpy())

    # Calculate metrics
    f1 = f1_score(expected, predicted, average='weighted')
    accuracy = accuracy_score(expected, predicted)
    print(f"Validation F-1 Score: {f1:.4f}, Accuracy: {accuracy:.4f}")
    return f1, accuracy, model


In [None]:
# GRU Hyperparameter Tuning
# The search samples with NumPy and the models are initialised (and the
# training batches shuffled) by PyTorch, so seeding only the stdlib `random`
# module leaves the search irreproducible. Seed all three generators.
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

def tune_gru_hyperparameters_random_search(train_loader, val_loader, vocab_size, output_dim, device, n_iter=N_RANDOM_SEARCH_ITERATIONS_GRU):
    """Random-search the GRU hyperparameters and keep the best validation model.

    Each iteration samples one configuration, trains a fresh model with
    ``train_and_evaluate_gru_model`` and compares its weighted validation F1
    with the best one so far. The best model's weights are written to
    ``MODEL_SAVE_PATH_GRU``.

    Args:
        train_loader: Training loader passed through to the training function.
        val_loader: Validation loader passed through to the training function.
        vocab_size: Number of embedding rows of each model.
        output_dim: Number of classes.
        device: Device the models are trained on.
        n_iter: Number of configurations to try.

    Returns:
        Tuple ``(best_params, best_f1, best_model)``; ``best_params`` and
        ``best_model`` are ``None`` when ``n_iter`` is 0.
    """
    param_space = {
        'embedding_dim': (150, 250),
        'hidden_dim': (256, 768),
        'lr': (np.log10(1e-4), np.log10(1e-3)),
        'epochs': (5, 10)
    }

    best_f1_score_gru = 0
    best_params_gru = None
    best_model_gru = None

    with tqdm(total=n_iter, desc="Random Search Tuning", leave=True, position=0) as pbar:
        for _ in range(n_iter):
            # Sample hyperparameters. Layer sizes are truncated to int here
            # (the training function truncates them anyway), so the reported
            # best hyperparameters are the sizes that were actually used.
            hyperparams = {
                'embedding_dim': int(np.random.uniform(*param_space['embedding_dim'])),
                'hidden_dim': int(np.random.uniform(*param_space['hidden_dim'])),
                # Log-uniform: sample the exponent, then exponentiate.
                'lr': float(10 ** np.random.uniform(*param_space['lr'])),
                # NOTE: np.random.randint excludes the upper bound, so the
                # number of epochs is drawn from 5..9.
                'epochs': int(np.random.randint(*param_space['epochs'])),
            }

            f1, accuracy, model = train_and_evaluate_gru_model(
                hyperparams, train_loader, val_loader, vocab_size, output_dim, device
            )

            pbar.set_postfix(f1=f1, accuracy=accuracy)
            pbar.update(1)

            # Always keep the first trial, so that a search in which every
            # configuration scores F1 == 0 still returns (and saves) a model.
            if best_model_gru is None or f1 > best_f1_score_gru:
                best_f1_score_gru = f1
                best_params_gru = hyperparams.copy()
                best_model_gru = model

    print(f"Best Validation F1 Score: {best_f1_score_gru:.4f}")
    print(f"Best Hyperparameters: {best_params_gru}")

    # Save best model
    if best_model_gru is not None:
        torch.save(best_model_gru.state_dict(), MODEL_SAVE_PATH_GRU)

    return best_params_gru, best_f1_score_gru, best_model_gru

# Run hyperparameter tuning
# Wall-clock timing of the whole GRU search; output_dim=2 because the task is binary.
start_time = time.time()
best_params_gru, best_f1_score_gru, best_model_gru = tune_gru_hyperparameters_random_search(
    train_loader_gru,
    val_loader_gru,
    vocab_size=VOCAB_SIZE_GRU,
    output_dim=2,
    device=device,
    n_iter=N_RANDOM_SEARCH_ITERATIONS_GRU
)
end_time = time.time()
print(f"Total Parameter Tuning Time: {end_time - start_time:.2f} seconds")


In [None]:
# Train final GRU model with best hyperparameters
# NOTE(review): these values are hard-coded instead of being read from
# best_params_gru; presumably they were copied from an earlier tuning run - confirm.
hyperparams_gru = dict(
    embedding_dim=156.02979350260784,
    hidden_dim=467.03628008471236,
    lr=0.000411395570516993,
    epochs=5
)

# Layer sizes have to be integers, so the float values are truncated.
embedding_dim = int(hyperparams_gru['embedding_dim'])
hidden_dim = int(hyperparams_gru['hidden_dim'])
lr = hyperparams_gru['lr']

# Initialize model
gru_model = GRUSentimentModel(VOCAB_SIZE_GRU, embedding_dim, hidden_dim, 2).to(device)

# Compute class weights
# This makes one full pass over the training loader just to collect its labels.
all_train_labels = [label.item() for batch in train_loader_gru for label in batch['label']]
class_weights = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(all_train_labels),
    y=all_train_labels
)
class_weights_tensor = torch.tensor(class_weights, dtype=torch.float).to(device)

# The loss is weighted per class with the balanced weights computed above.
criterion = nn.CrossEntropyLoss(weight=class_weights_tensor)
optimizer = torch.optim.Adam(gru_model.parameters(), lr=lr)

# Training loop
epoch_count = 0
while epoch_count < hyperparams_gru['epochs']:
    gru_model.train()
    total_loss = 0  # sum of the per-batch losses of this epoch
    train_loop = tqdm(train_loader_gru, desc=f"Training Epoch {epoch_count + 1}/{hyperparams_gru['epochs']}", leave=True, position=0)

    for batch in train_loop:
        input_ids = batch['input_ids'].to(device)
        labels = batch['label'].to(device)

        optimizer.zero_grad()
        outputs = gru_model(input_ids)
        loss = criterion(outputs, labels)
        total_loss += loss.item()
        loss.backward()
        optimizer.step()

        train_loop.set_postfix(loss=loss.item())

    # Mean of the per-batch losses over the epoch.
    print(f"Epoch {epoch_count + 1}/{hyperparams_gru['epochs']}, Loss: {total_loss / len(train_loader_gru):.4f}")
    epoch_count += 1

# Save model
# NOTE: this is the same path the tuning function writes its best model to,
# so this save replaces that checkpoint.
torch.save(gru_model.state_dict(), MODEL_SAVE_PATH_GRU)


In [None]:
# Evaluate GRU model
# Rebuild the architecture with the sizes from the training cell and reload the
# saved weights, so the evaluation runs on what was written to disk.
# NOTE: evaluate_deep_learning_model is defined in a later cell of this
# notebook; that cell has to be executed before this one.
gru_model = GRUSentimentModel(VOCAB_SIZE_GRU, embedding_dim, hidden_dim, 2).to(device)
gru_model.load_state_dict(torch.load(MODEL_SAVE_PATH_GRU, map_location=device))

# The GRU takes only input_ids, hence use_attention_mask=False.
evaluate_deep_learning_model(
    gru_model, 
    test_loader_gru, 
    device, 
    model_name="GRU", 
    num_classes=2, 
    use_attention_mask=False
)


## 3.2 Transformer-Based Model: ALBERT Architecture

In [None]:
# Prepare data for ALBERT
dataframe_albert = dataframe_main.copy()
dataframe_albert['text'] = [
    dl_preprocessor.clean_text_albert(raw_text) for raw_text in dataframe_albert['text']
]

# Split data
features_albert = list(dataframe_albert['text'])
labels_albert = list(dataframe_albert['text_type'])

# Parenthesised unpacking of the six-way split instead of line continuations.
(
    train_texts_albert, val_texts_albert, test_texts_albert,
    train_labels_albert, val_labels_albert, test_labels_albert,
) = split_data_for_dl(features_albert, labels_albert)

print(f"ALBERT - Training: {len(train_texts_albert)}, Validation: {len(val_texts_albert)}, Test: {len(test_texts_albert)}")


In [None]:
# ALBERT Dataset Class
class AlbertSentimentDataset(Dataset):
    """Tokenises one text per access for ALBERT sequence classification."""

    def __init__(self, texts, labels, tokenizer, max_len):
        # Materialise the texts so that positional indexing always works.
        self.texts = list(texts)
        self.labels = labels
        self.tokenizer = tokenizer
        self.max_len = max_len

    def __len__(self):
        sample_count = len(self.labels)
        return sample_count

    def __getitem__(self, idx):
        raw_text, raw_label = str(self.texts[idx]), self.labels[idx]

        # Every sample is padded/truncated to exactly max_len tokens.
        tokenizer_options = dict(
            truncation=True,
            padding='max_length',
            max_length=self.max_len,
            return_tensors="pt",
        )
        encoded = self.tokenizer(raw_text, **tokenizer_options)

        # The tokenizer output has a leading batch axis of size 1; drop it so
        # that the DataLoader can stack samples into (batch, max_len).
        sample = {
            name: encoded[name].squeeze(0)
            for name in ('input_ids', 'attention_mask')
        }
        sample['label'] = torch.tensor(raw_label, dtype=torch.long)
        return sample

# Initialize tokenizer and create datasets
albert_tokenizer = AlbertTokenizer.from_pretrained(ALBERT_MODEL_NAME)

train_dataset_albert = AlbertSentimentDataset(
    texts=train_texts_albert,
    labels=train_labels_albert,
    tokenizer=albert_tokenizer,
    max_len=MAX_SEQUENCE_LENGTH_ALBERT,
)
val_dataset_albert = AlbertSentimentDataset(
    texts=val_texts_albert,
    labels=val_labels_albert,
    tokenizer=albert_tokenizer,
    max_len=MAX_SEQUENCE_LENGTH_ALBERT,
)
test_dataset_albert = AlbertSentimentDataset(
    texts=test_texts_albert,
    labels=test_labels_albert,
    tokenizer=albert_tokenizer,
    max_len=MAX_SEQUENCE_LENGTH_ALBERT,
)

# Create DataLoaders; only the training loader reshuffles between epochs
train_loader_albert = DataLoader(dataset=train_dataset_albert, batch_size=BATCH_SIZE_ALBERT, shuffle=True)
val_loader_albert = DataLoader(dataset=val_dataset_albert, batch_size=BATCH_SIZE_ALBERT)
test_loader_albert = DataLoader(dataset=test_dataset_albert, batch_size=BATCH_SIZE_ALBERT)


In [None]:
# ALBERT Hyperparameter Tuning
# The search samples with NumPy and PyTorch shuffles the training batches, so
# seeding only the stdlib `random` module leaves the search irreproducible.
# Seed all three generators.
random.seed(RANDOM_SEED)
np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)

def tune_albert_hyperparameters_random_search(train_texts, train_labels, val_texts, val_labels, tokenizer, n_iter=N_RANDOM_SEARCH_ITERATIONS_ALBERT):
    """Random-search fine-tuning of ALBERT, keeping the best validation model.

    Each iteration samples a learning rate, an epoch count and a dropout
    probability, fine-tunes a fresh pretrained ALBERT classifier with a
    class-weighted cross-entropy loss and scores it with the weighted F1 on
    the validation data. Whenever a model beats the best score so far, its
    weights are written to ``best_albert_model.pth`` and to
    ``MODEL_SAVE_PATH_ALBERT``.

    Args:
        train_texts: Training texts.
        train_labels: Training labels; also used for the class weights.
        val_texts: Validation texts.
        val_labels: Validation labels.
        tokenizer: Tokenizer used to encode both text collections.
        n_iter: Number of configurations to try.

    Returns:
        Tuple ``(best_params, best_f1, best_model)``; ``best_params`` and
        ``best_model`` are ``None`` if no configuration scored above 0.
    """
    param_space = {
        'lr': (np.log10(1e-5), np.log10(1e-4)),
        'epochs': (3, 5),
        'dropout': (0.1, 0.5)
    }

    # Build the loaders from the arguments so the function trains on the data
    # it is given rather than on module-level loaders.
    train_loader = DataLoader(
        AlbertSentimentDataset(train_texts, train_labels, tokenizer, MAX_SEQUENCE_LENGTH_ALBERT),
        batch_size=BATCH_SIZE_ALBERT,
        shuffle=True,
    )
    val_loader = DataLoader(
        AlbertSentimentDataset(val_texts, val_labels, tokenizer, MAX_SEQUENCE_LENGTH_ALBERT),
        batch_size=BATCH_SIZE_ALBERT,
    )

    best_f1_score_albert = 0
    best_params_albert = None
    best_model_albert = None

    # Compute class weights
    class_weights = compute_class_weight(
        class_weight='balanced',
        classes=np.unique(train_labels),
        y=train_labels
    )
    class_weights = torch.tensor(class_weights, dtype=torch.float).to(device)

    with tqdm(total=n_iter, desc="Random Search Tuning", leave=True) as pbar:
        for _ in range(n_iter):
            # Randomly sample hyperparameters (cast to plain Python numbers so
            # the reported best configuration prints cleanly).
            hyperparams = {
                # Log-uniform: sample the exponent, then exponentiate.
                'lr': float(10 ** np.random.uniform(*param_space['lr'])),
                # NOTE: np.random.randint excludes the upper bound, so the
                # number of epochs is 3 or 4.
                'epochs': int(np.random.randint(*param_space['epochs'])),
                'dropout': float(np.random.uniform(*param_space['dropout'])),
            }

            # Initialize model
            model = AlbertForSequenceClassification.from_pretrained(
                ALBERT_MODEL_NAME,
                num_labels=2,
                hidden_dropout_prob=hyperparams['dropout']
            ).to(device)

            optimizer = AdamW(model.parameters(), lr=hyperparams['lr'])
            criterion = torch.nn.CrossEntropyLoss(weight=class_weights)

            # Training loop
            for _epoch in range(hyperparams['epochs']):
                model.train()
                for batch in train_loader:
                    input_ids = batch['input_ids'].to(device)
                    attention_mask = batch['attention_mask'].to(device)
                    labels = batch['label'].to(device)

                    optimizer.zero_grad()
                    outputs = model(input_ids, attention_mask=attention_mask)
                    # The loss is computed here (not inside the model) so that
                    # the class weights are applied.
                    loss = criterion(outputs.logits, labels)
                    loss.backward()
                    optimizer.step()

            # Validation evaluation
            model.eval()
            val_predictions, val_labels_batch = [], []
            with torch.no_grad():
                for batch in val_loader:
                    input_ids = batch['input_ids'].to(device)
                    attention_mask = batch['attention_mask'].to(device)
                    labels = batch['label'].to(device)

                    outputs = model(input_ids, attention_mask=attention_mask)
                    preds = torch.argmax(outputs.logits, dim=1)
                    val_predictions.extend(preds.cpu().numpy())
                    val_labels_batch.extend(labels.cpu().numpy())

            # Calculate F1-score
            f1 = f1_score(val_labels_batch, val_predictions, average='weighted')
            accuracy = accuracy_score(val_labels_batch, val_predictions)

            # Update best model
            if f1 > best_f1_score_albert:
                best_f1_score_albert = f1
                best_params_albert = hyperparams.copy()
                best_model_albert = model
                torch.save(model.state_dict(), "best_albert_model.pth")
                torch.save(model.state_dict(), MODEL_SAVE_PATH_ALBERT)

            pbar.set_postfix(f1=f1, accuracy=accuracy)
            pbar.update(1)

    print(f"Best F1 Score: {best_f1_score_albert:.4f}")
    print(f"Best Hyperparameters: {best_params_albert}")
    return best_params_albert, best_f1_score_albert, best_model_albert

# Run hyperparameter tuning
# Wall-clock timing of the whole ALBERT search.
start_time = time.time()
best_params_albert, best_f1_score_albert, best_model_albert = tune_albert_hyperparameters_random_search(
    train_texts=train_texts_albert,
    train_labels=train_labels_albert,
    val_texts=val_texts_albert,
    val_labels=val_labels_albert,
    tokenizer=albert_tokenizer,
    n_iter=N_RANDOM_SEARCH_ITERATIONS_ALBERT
)
end_time = time.time()
print(f"Total Parameter Tuning Time: {end_time - start_time:.2f} seconds")


In [None]:
# Load and evaluate ALBERT model
# Start from the pretrained checkpoint and overwrite its weights with the
# fine-tuned ones saved by the tuning function.
# NOTE: evaluate_deep_learning_model is defined in a later cell of this
# notebook; that cell has to be executed before this one.
albert_model = AlbertForSequenceClassification.from_pretrained(ALBERT_MODEL_NAME, num_labels=2)
albert_model.load_state_dict(torch.load(MODEL_SAVE_PATH_ALBERT, map_location=device))
albert_model.to(device)

# ALBERT batches carry an attention mask, hence use_attention_mask=True.
evaluate_deep_learning_model(
    albert_model, 
    test_loader_albert, 
    device, 
    model_name="ALBERT", 
    num_classes=2, 
    use_attention_mask=True
)


In [None]:
# Deep Learning Model Evaluation Functions
def _bootstrap_percentile_ci(metric_fn, y_true, y_other, n_iterations, rng):
    """Return the (2.5th, 97.5th) percentiles of a metric over bootstrap resamples.

    Resamples that contain a single class are redrawn. The caller must make
    sure ``y_true`` itself contains at least two classes, otherwise no valid
    resample exists and the loop would never finish.
    """
    sample_indices = np.arange(len(y_true))
    scores = []
    while len(scores) < n_iterations:
        idx = resample(sample_indices, random_state=rng)
        if len(np.unique(y_true[idx])) < 2:
            continue
        scores.append(metric_fn(y_true[idx], y_other[idx]))
    return np.percentile(scores, 2.5), np.percentile(scores, 97.5)


def evaluate_deep_learning_model(model, test_loader, device, model_name="DL Model", num_classes=2, use_attention_mask=True):
    """Evaluate a classifier on a test loader and report metrics with bootstrap CIs.

    Prints accuracy and weighted F1 (with a 95% bootstrap confidence interval),
    shows the confusion matrix and, for binary problems, prints the ROC AUC
    (with a 95% bootstrap confidence interval) and shows the ROC curve.

    Args:
        model: Model returning logits, either directly or as ``outputs.logits``.
        test_loader: Loader yielding dicts with ``input_ids``, ``label`` and,
            if ``use_attention_mask`` is true, ``attention_mask``.
        device: Device the model and the batches are moved to.
        model_name: Name used in the printed report and the plot titles.
        num_classes: Number of classes; ROC/AUC are only produced for 2.
        use_attention_mask: Whether to pass ``attention_mask`` to the model.

    Raises:
        ValueError: If ``test_loader`` yields no samples.
    """
    model.eval()
    model.to(device)

    all_preds = []
    all_probs = []
    all_labels = []

    with torch.no_grad():
        for batch in test_loader:
            input_ids = batch['input_ids'].to(device)
            labels = batch['label'].to(device)

            if use_attention_mask:
                attention_mask = batch['attention_mask'].to(device)
                outputs = model(input_ids, attention_mask=attention_mask)
            else:
                outputs = model(input_ids)

            # Transformer models wrap the logits in an output object; plain
            # nn.Modules such as the GRU return the logits tensor itself.
            logits = outputs.logits if hasattr(outputs, 'logits') else outputs
            probs = torch.softmax(logits, dim=1)
            preds = torch.argmax(probs, dim=1)

            all_preds.extend(preds.cpu().numpy())
            all_probs.extend(probs.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    if not all_labels:
        # Without this check the empty probability array fails further down
        # with an IndexError that does not point at the real problem.
        raise ValueError("test_loader produced no samples; nothing to evaluate.")

    all_preds = np.array(all_preds)
    all_labels = np.array(all_labels)
    all_probs = np.array(all_probs)

    # The bootstrap (and the ROC curve) need both classes among the true
    # labels; with one class no resample could ever be accepted.
    has_multiple_classes = len(np.unique(all_labels)) >= 2
    # One seeded generator for all resampling, so the intervals are reproducible.
    rng = np.random.RandomState(RANDOM_SEED)

    acc = accuracy_score(all_labels, all_preds)
    f1 = f1_score(all_labels, all_preds, average='weighted')
    print(f"\n--- Evaluation Report: {model_name} ---")
    print(f"Accuracy: {acc:.4f}")
    print(f"Weighted F1 Score: {f1:.4f}")

    # F1 confidence interval
    if has_multiple_classes:
        f1_low, f1_high = _bootstrap_percentile_ci(
            lambda y, p: f1_score(y, p, average='weighted'),
            all_labels, all_preds, N_BOOTSTRAP_ITERATIONS, rng,
        )
        print(f"95% CI for F1 Score: [{f1_low:.4f}, {f1_high:.4f}]")
    else:
        print("95% CI for F1 Score: not computed (test labels contain a single class)")

    # Confusion matrix
    conf_matrix = confusion_matrix(all_labels, all_preds)
    fig, ax = plt.subplots(figsize=(6, 6))
    sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', ax=ax)
    ax.set_xlabel("Predicted")
    ax.set_ylabel("Actual")
    ax.set_title(f"Confusion Matrix - {model_name}")
    plt.tight_layout()
    plt.show()

    # ROC curve and AUC (binary problems only)
    if num_classes == 2 and not has_multiple_classes:
        print("AUC: not computed (test labels contain a single class)")
    elif num_classes == 2:
        # Column 1 of the softmax output is the probability of class 1.
        positive_probs = all_probs[:, 1]

        def _auc_score(y, scores):
            fpr_bs, tpr_bs, _ = roc_curve(y, scores)
            return auc(fpr_bs, tpr_bs)

        fpr, tpr, _ = roc_curve(all_labels, positive_probs)
        roc_auc = auc(fpr, tpr)
        auc_low, auc_high = _bootstrap_percentile_ci(
            _auc_score, all_labels, positive_probs, N_BOOTSTRAP_ITERATIONS, rng,
        )

        print(f"AUC: {roc_auc:.4f}")
        print(f"95% CI for AUC: [{auc_low:.4f}, {auc_high:.4f}]")

        fig, ax = plt.subplots(figsize=(8, 6))
        ax.plot(fpr, tpr, label=f"AUC = {roc_auc:.2f}")
        ax.plot([0, 1], [0, 1], linestyle='--', color='gray')
        ax.set_xlabel("False Positive Rate")
        ax.set_ylabel("True Positive Rate")
        ax.set_title(f"ROC Curve - {model_name}")
        ax.legend(loc='lower right')
        plt.tight_layout()
        plt.show()


In [None]:
# Statistical Comparison of Models
def run_pairwise_mcnemar_test_for_all_models(y_true, model_preds, method='holm'):
    """Run McNemar's test for every pair of models and correct the p-values.

    Args:
        y_true: True labels (any array-like).
        model_preds: Dict mapping a model name to its predictions, in the same
            sample order as ``y_true``.
        method: Multiple-testing correction passed to ``multipletests``.

    Returns:
        DataFrame with one row per model pair and the columns ``Model_1``,
        ``Model_2``, ``Statistic``, ``p_value``, ``Winner``,
        ``Corrected_p_value`` and ``Significant``. It is empty when fewer than
        two models are given.

    Raises:
        ValueError: If a prediction array does not have the shape of ``y_true``.
    """
    # Coerce to arrays: comparing two plain lists with == yields a single
    # bool instead of an element-wise result.
    y_true = np.asarray(y_true)
    preds_by_model = {name: np.asarray(preds) for name, preds in model_preds.items()}

    # Explicit check rather than assert, which is stripped under `python -O`.
    for name, preds in preds_by_model.items():
        if preds.shape != y_true.shape:
            raise ValueError(
                f"Predictions for '{name}' have shape {preds.shape}; "
                f"expected {y_true.shape} to match y_true."
            )

    columns = ["Model_1", "Model_2", "Statistic", "p_value", "Winner",
               "Corrected_p_value", "Significant"]
    if len(preds_by_model) < 2:
        print("\nAt least two models are needed for a pairwise McNemar's test.")
        return pd.DataFrame(columns=columns)

    results = []

    # Run pairwise McNemar tests
    for m1, m2 in combinations(preds_by_model, 2):
        correct_1 = preds_by_model[m1] == y_true
        correct_2 = preds_by_model[m2] == y_true

        both_correct = int(np.sum(correct_1 & correct_2))
        model1_correct = int(np.sum(correct_1 & ~correct_2))  # only model 1 right
        model2_correct = int(np.sum(~correct_1 & correct_2))  # only model 2 right
        both_wrong = int(np.sum(~correct_1 & ~correct_2))

        if model1_correct + model2_correct == 0:
            # No discordant pairs: the chi-square statistic would divide by
            # zero. The models never disagree in correctness, so there is no
            # evidence of a difference.
            statistic, p_value = 0.0, 1.0
        else:
            # Rows: model 1 correct / wrong; columns: model 2 correct / wrong.
            # Only the off-diagonal (discordant) cells enter the test.
            table = [[both_correct, model1_correct],
                     [model2_correct, both_wrong]]
            result = mcnemar(table, exact=False, correction=True)
            statistic, p_value = result.statistic, result.pvalue

        if model1_correct > model2_correct:
            winner = m1
        elif model2_correct > model1_correct:
            winner = m2
        else:
            winner = "Tie"

        # Store results
        results.append(dict(
            Model_1=m1,
            Model_2=m2,
            Statistic=statistic,
            p_value=p_value,
            Winner=winner
        ))

    # Correct for multiple testing
    raw_pvals = [r["p_value"] for r in results]
    _, corrected_pvals, _, _ = multipletests(raw_pvals, method=method)

    for row, p_corr in zip(results, corrected_pvals):
        row["Corrected_p_value"] = p_corr
        row["Significant"] = "Yes" if p_corr < 0.05 else "No"

    # Print results
    df_results = pd.DataFrame(results)
    print("\n📊 Pairwise McNemar's Test Results (corrected using '{}'):\n".format(method))
    print(df_results[["Model_1", "Model_2", "Winner", "Statistic", "Corrected_p_value", "Significant"]])

    return df_results


In [None]:
# Generate predictions for comparison
# Both models were split with the same seed, so their test sets should line up
# sample by sample. McNemar's test is a paired test and is only valid if they
# do, so this is verified below instead of being assumed.

# ALBERT predictions
albert_model.eval()
test_predictions_albert = []
with torch.no_grad():
    for batch in test_loader_albert:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        outputs = albert_model(input_ids, attention_mask=attention_mask)
        preds = torch.argmax(outputs.logits, dim=1)
        test_predictions_albert.extend(preds.cpu().numpy())

# GRU predictions
gru_model.eval()
test_predictions_gru = []
with torch.no_grad():
    for batch in test_loader_gru:
        input_ids = batch['input_ids'].to(device)
        outputs = gru_model(input_ids)
        preds = torch.argmax(outputs, dim=1)
        test_predictions_gru.extend(preds.cpu().numpy())

# Ground truth for the comparison: the ALBERT test labels, checked against the
# GRU test labels so that a mismatch fails loudly.
test_labels_for_comparison = np.asarray(test_labels_albert)
if not np.array_equal(test_labels_for_comparison, np.asarray(test_labels_gru)):
    raise ValueError(
        "ALBERT and GRU test labels differ; McNemar's test needs both models "
        "evaluated on the same samples in the same order."
    )

model_predictions_dict = {
    "ALBERT": np.array(test_predictions_albert),
    "GRU": np.array(test_predictions_gru)
}

# Run comparison
run_pairwise_mcnemar_test_for_all_models(test_labels_for_comparison, model_predictions_dict)
