Defaulting to user installation because normal site-packages is not writeable



[notice] A new release of pip is available: 25.0.1 -> 25.3
[notice] To update, run: C:\Users\R6RW5M6\AppData\Local\Microsoft\WindowsApps\PythonSoftwareFoundation.Python.3.12_qbz5n2kfra8p0\python.exe -m pip install --upgrade pip
ERROR: Invalid requirement: 'sckit-learn,transformers,seaborn,pandas': Expected end or semicolon (after name and no valid version specifier)
    sckit-learn,transformers,seaborn,pandas
               ^


# Marathi Sentiment Analysis: Hybrid XLM-RoBERTa + CNN Architecture

## Project Overview
This notebook implements a comprehensive sentiment analysis system for Marathi social media text using:
- **Traditional ML Baselines**: SVM, Random Forest, Logistic Regression, KNN
- **Deep Learning Baselines**: LSTM, BiLSTM, CNN, Multi-CNN
- **PLM Fine-tuning**: XLM-RoBERTa
- **Hybrid Architecture**: XLM-RoBERTa + CNN with Mean Pooling

### Dataset: MahaSent
- **Total Samples**: 60,864 (perfectly balanced 3-class)
- **Train**: 48,114 | **Test**: 6,750 | **Val**: 6,000
- **Labels**: Negative (-1), Neutral (0), Positive (1)
- **Language**: Marathi (Devanagari script)

---

## 1. Environment Setup and Imports

Install required packages and import all necessary libraries.

In [None]:
# Standard library
import os
import re
import sys
import time
from dataclasses import dataclass, field
from pathlib import Path
from typing import Any, Dict, List, Optional, Tuple

# Third-party
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import AdamW
from torch.utils.data import DataLoader, Dataset
from tqdm.auto import tqdm
from transformers import AutoTokenizer
from transformers import XLMRobertaModel
from transformers import XLMRobertaTokenizer
from transformers import get_linear_schedule_with_warmup

# NOTE(review): later cells also use scikit-learn names (TfidfVectorizer, SVC,
# RandomForestClassifier, LogisticRegression, KNeighborsClassifier,
# accuracy_score, precision_recall_fscore_support, confusion_matrix) and a SEED
# constant that are not imported/defined in this cell - confirm where they come from.

# Add src to path
sys.path.insert(0, os.path.abspath('src'))

# Import our pre-built project modules
from src.config import Config
from src.data_loader import get_dataloaders_from_config
from src.models.hybrid_model import HybridSentimentModel
from src.train import train_epoch, validate, setup_training
from src.evaluate import evaluate_model, calculate_all_metrics
from src.visualize import plot_training_history, plot_confusion_matrix, plot_per_class_metrics
from src.utils.logger import ExperimentLogger

# Environment report: confirm the imports ran, then show the PyTorch build and CUDA status
print(
    "‚úÖ All modules imported successfully!",
    f"PyTorch version: {torch.__version__}",
    f"CUDA available: {torch.cuda.is_available()}",
    sep="\n",
)

# Use the GPU when PyTorch can see one, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(f"Using device: {device}")

In [None]:
# Configuration Class
@dataclass
class Config:
    """Configuration for Marathi Sentiment Analysis project.

    Groups dataset locations, model/training hyperparameters, output paths,
    the label mapping and the compute device in one object. Instantiating the
    class creates the output directories (see ``__post_init__``).
    """

    # Dataset paths (file names are joined onto data_dir by the loading cell)
    data_dir: str = "."
    train_file: str = "MahaSent_All_Train.csv"
    test_file: str = "MahaSent_All_Test.csv"
    val_file: str = "MahaSent_All_Val.csv"

    # Model hyperparameters
    plm_name: str = "xlm-roberta-base"
    max_seq_length: int = 256
    batch_size: int = 64
    learning_rate_hybrid: float = 1e-4
    learning_rate_plm: float = 2e-5
    num_epochs: int = 20
    early_stopping_patience: int = 3

    # Architecture parameters
    # (was the invalid literal `768vc`, which made the whole cell a SyntaxError)
    plm_hidden_size: int = 768
    cnn_out_channels: int = 256
    cnn_kernel_size: int = 3
    dense_hidden_size: int = 512
    dropout_rate: float = 0.3
    num_classes: int = 3

    # Training parameters
    warmup_steps: int = 500
    weight_decay: float = 0.01
    max_grad_norm: float = 1.0

    # Traditional ML parameters
    tfidf_max_features: int = 5000

    # DL baseline parameters
    embedding_dim: int = 300
    lstm_hidden_size: int = 128
    vocab_size: int = 10000

    # Output paths
    results_dir: str = "results"
    figures_dir: str = "results/figures"
    models_dir: str = "results/models"

    # Label mapping: raw dataset label -> contiguous class index,
    # and the display name for each class index
    label_map: Dict[int, int] = field(default_factory=lambda: {-1: 0, 0: 1, 1: 2})
    label_names: List[str] = field(default_factory=lambda: ['Negative', 'Neutral', 'Positive'])

    # Device: CUDA when available at instantiation time, otherwise CPU
    device: torch.device = field(default_factory=lambda: torch.device('cuda' if torch.cuda.is_available() else 'cpu'))

    def to_dict(self) -> Dict[str, Any]:
        """Return the core training hyperparameters as a plain dictionary.

        Only the PLM name, sequence length, batch size, learning rates, epoch
        count and early-stopping patience are included.
        """
        return {
            'plm_name': self.plm_name,
            'max_seq_length': self.max_seq_length,
            'batch_size': self.batch_size,
            'learning_rate_hybrid': self.learning_rate_hybrid,
            'learning_rate_plm': self.learning_rate_plm,
            'num_epochs': self.num_epochs,
            'early_stopping_patience': self.early_stopping_patience,
        }

    def __post_init__(self):
        """Create the results, figures and models directories if missing."""
        # parents=True so nested custom paths work even when their parent
        # directory does not exist yet; exist_ok=True keeps re-runs harmless.
        for directory in (self.results_dir, self.figures_dir, self.models_dir):
            Path(directory).mkdir(parents=True, exist_ok=True)

# Build the shared configuration object and echo the settings that matter most
config = Config()
print("‚úÖ Configuration initialized")
print(
    f"   Device: {config.device}",
    f"   PLM: {config.plm_name}",
    f"   Batch size: {config.batch_size}",
    f"   Max sequence length: {config.max_seq_length}",
    sep="\n",
)

## 2. Data Loading and Exploration

Load the MahaSent dataset and verify its structure.

In [None]:
# Load the train/test/validation CSV files from config.data_dir.
# Only the reads sit inside the try block so that an unrelated error in the
# reporting code is never mistaken for a missing file.
try:
    train_df = pd.read_csv(os.path.join(config.data_dir, config.train_file))
    test_df = pd.read_csv(os.path.join(config.data_dir, config.test_file))
    val_df = pd.read_csv(os.path.join(config.data_dir, config.val_file))
except FileNotFoundError as e:
    print(f"‚ùå Error loading datasets: {e}")
    print(f"   Please ensure CSV files are in: {config.data_dir}")
    # Re-raise after the hint: every later cell needs train_df/test_df/val_df,
    # so continuing would only fail further down with a misleading NameError.
    raise
else:
    print("‚úÖ Datasets loaded successfully!")
    print(f"\nüìä Dataset Statistics:")
    print(f"   Train: {len(train_df):,} samples")
    print(f"   Test:  {len(test_df):,} samples")
    print(f"   Val:   {len(val_df):,} samples")
    print(f"   Total: {len(train_df) + len(test_df) + len(val_df):,} samples")

In [None]:
# Class balance: per-split label counts, plus normalized shares for the train split
print("\nüìà Label Distribution:")
for split_title, split_df in (("Train Set", train_df), ("Test Set", test_df), ("Val Set", val_df)):
    print(f"\n{split_title}:")
    print(split_df['label'].value_counts().sort_index())
    if split_df is train_df:
        # Only the training split reports the normalized distribution
        print(f"Balance: {train_df['label'].value_counts(normalize=True).values}")

# Show the first two training texts for each raw sentiment label
print("\nüìù Sample Texts by Sentiment:")
for raw_label in (-1, 0, 1):
    class_name = config.label_names[config.label_map[raw_label]]
    print(f"\n{class_name} (Label {raw_label}):")
    examples = train_df.loc[train_df['label'] == raw_label, 'text'].head(2).values
    for rank, example in enumerate(examples, start=1):
        # Truncate to the first 100 characters to keep the preview short
        print(f"  {rank}. {example[:100]}...")

In [None]:
# Character length of every raw text, stored on each split as 'text_length'
for frame in (train_df, test_df, val_df):
    frame['text_length'] = frame['text'].str.len()

# Summary statistics per split
print("\nüìè Text Length Statistics (characters):")
for name, df in [('Train', train_df), ('Test', test_df), ('Val', val_df)]:
    lengths = df['text_length']
    print(
        f"\n{name}:",
        f"  Mean: {lengths.mean():.1f}",
        f"  Median: {lengths.median():.1f}",
        f"  Min: {lengths.min()}",
        f"  Max: {lengths.max()}",
        f"  Std: {lengths.std():.1f}",
        sep="\n",
    )

# One histogram per split, each with a dashed line marking the mean length
fig, axes = plt.subplots(1, 3, figsize=(15, 4))
for axis, (name, df) in zip(axes, [('Train', train_df), ('Test', test_df), ('Val', val_df)]):
    mean_length = df['text_length'].mean()
    axis.hist(df['text_length'], bins=50, edgecolor='black', alpha=0.7)
    axis.set_title(f'{name} Set - Text Length Distribution')
    axis.set_xlabel('Character Count')
    axis.set_ylabel('Frequency')
    axis.axvline(mean_length, color='red', linestyle='--', label=f'Mean: {mean_length:.1f}')
    axis.legend()

plt.tight_layout()
figure_path = os.path.join(config.figures_dir, 'text_length_distribution.png')
plt.savefig(figure_path, dpi=300, bbox_inches='tight')
plt.show()

print("\n‚úÖ Exploratory analysis complete!")

## 3. Text Preprocessing and Tokenization

Implement Marathi-specific preprocessing and create PyTorch datasets.

In [None]:
def preprocess_marathi_text(text: str) -> str:
    """
    Clean a piece of social-media text before tokenization.

    The steps run in order: strip http/https URLs, strip @mentions, drop the
    '#' of hashtags while keeping the tagged word, collapse whitespace runs
    into single spaces, and trim both ends.

    Args:
        text: Input Marathi text

    Returns:
        The cleaned text, or an empty string when the input is not a str
        (for example a missing value).
    """
    if not isinstance(text, str):
        return ""

    # (pattern, replacement) pairs, applied sequentially
    substitutions = (
        (r'http(s)?://\S+', ''),   # URLs
        (r'@\w+', ''),             # @mentions
        (r'#(\w+)', r'\1'),        # hashtags: keep the word, drop the '#'
        (r'\s+', ' '),             # whitespace runs
    )

    cleaned = text
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)

    return cleaned.strip()

# Clean the text of every split and map raw labels (-1, 0, 1) to class indices (0, 1, 2)
print("üîÑ Preprocessing text data...")
for frame in (train_df, test_df, val_df):
    frame['text_clean'] = frame['text'].apply(preprocess_marathi_text)
    frame['label_mapped'] = frame['label'].map(config.label_map)

print("‚úÖ Text preprocessing complete!")

# Before/after preview using the first training row
print(
    f"\nSample preprocessed text:",
    f"Original: {train_df['text'].iloc[0][:80]}...",
    f"Cleaned:  {train_df['text_clean'].iloc[0][:80]}...",
    f"Label mapping: {train_df['label'].iloc[0]} ‚Üí {train_df['label_mapped'].iloc[0]}",
    sep="\n",
)

In [None]:
# Fetch the tokenizer that matches the pre-trained checkpoint named in the config
print("üîÑ Loading XLM-RoBERTa tokenizer...")
tokenizer = XLMRobertaTokenizer.from_pretrained(config.plm_name)
print("‚úÖ Tokenizer loaded: {}".format(config.plm_name))

# PyTorch Dataset class
class MarathiSentimentDataset(Dataset):
    """
    Map-style PyTorch dataset that tokenizes one text per item on access.

    Args:
        texts: List of preprocessed text strings
        labels: List of mapped labels (0, 1, 2)
        tokenizer: XLMRobertaTokenizer instance
        max_length: Maximum sequence length; every item is padded or
            truncated to exactly this many tokens
    """

    def __init__(
        self,
        texts: List[str],
        labels: List[int],
        tokenizer: XLMRobertaTokenizer,
        max_length: int = 256
    ):
        self.max_length = max_length
        self.tokenizer = tokenizer
        self.labels = labels
        self.texts = texts

    def __len__(self) -> int:
        """Number of samples, taken from the text list."""
        return len(self.texts)

    def __getitem__(self, idx: int) -> Dict[str, torch.Tensor]:
        """Return 'input_ids', 'attention_mask' and 'label' tensors for sample ``idx``."""
        encoded = self.tokenizer(
            str(self.texts[idx]),  # str() guards against non-string entries
            truncation=True,
            padding='max_length',
            max_length=self.max_length,
            return_tensors='pt'
        )

        # The tokenizer returns a leading batch axis of size 1; drop it so the
        # DataLoader can stack the samples itself.
        sample = {key: encoded[key].squeeze(0) for key in ('input_ids', 'attention_mask')}
        sample['label'] = torch.tensor(self.labels[idx], dtype=torch.long)
        return sample

# Wrap each split in a MarathiSentimentDataset (same tokenizer and max length for all three)
train_dataset, val_dataset, test_dataset = (
    MarathiSentimentDataset(
        frame['text_clean'].tolist(),
        frame['label_mapped'].tolist(),
        tokenizer,
        config.max_seq_length
    )
    for frame in (train_df, val_df, test_df)
)

print(
    f"\n‚úÖ Datasets created:",
    f"   Train: {len(train_dataset):,} samples",
    f"   Val:   {len(val_dataset):,} samples",
    f"   Test:  {len(test_dataset):,} samples",
    sep="\n",
)

In [None]:
# Build one DataLoader per split. Only the training loader shuffles; validation
# and test keep dataset order. Memory is pinned only when CUDA is available.
train_loader, val_loader, test_loader = (
    DataLoader(
        dataset,
        batch_size=config.batch_size,
        shuffle=should_shuffle,
        num_workers=0,
        pin_memory=torch.cuda.is_available()
    )
    for dataset, should_shuffle in (
        (train_dataset, True),
        (val_dataset, False),
        (test_dataset, False),
    )
)

print(
    f"‚úÖ DataLoaders created:",
    f"   Train batches: {len(train_loader)}",
    f"   Val batches:   {len(val_loader)}",
    f"   Test batches:  {len(test_loader)}",
    sep="\n",
)

# Pull a single training batch as a shape sanity check
sample_batch = next(iter(train_loader))
print(
    f"\nüîç Sample batch shapes:",
    f"   Input IDs: {sample_batch['input_ids'].shape}",
    f"   Attention Mask: {sample_batch['attention_mask'].shape}",
    f"   Labels: {sample_batch['label'].shape}",
    sep="\n",
)

## 4. Traditional ML Baselines

Train and evaluate SVM, Random Forest, Logistic Regression, and KNN with TF-IDF features.

In [None]:
# TF-IDF features: the vocabulary is fitted on the training split only and then
# reused to transform the validation and test splits.
print("üîÑ Extracting TF-IDF features...")
vectorizer = TfidfVectorizer(max_features=config.tfidf_max_features)

X_train_tfidf = vectorizer.fit_transform(train_df['text_clean'])
X_val_tfidf, X_test_tfidf = (
    vectorizer.transform(frame['text_clean']) for frame in (val_df, test_df)
)

# Targets are the mapped class indices of each split
y_train, y_val, y_test = (
    frame['label_mapped'].values for frame in (train_df, val_df, test_df)
)

print(
    f"‚úÖ TF-IDF features extracted:",
    f"   Feature dimensions: {X_train_tfidf.shape[1]}",
    f"   Train shape: {X_train_tfidf.shape}",
    f"   Val shape: {X_val_tfidf.shape}",
    f"   Test shape: {X_test_tfidf.shape}",
    sep="\n",
)

In [None]:
# Results keyed by model name; each entry holds the fitted model and its validation metrics
traditional_ml_results = {}

# Baseline classifiers trained on the TF-IDF features
# NOTE(review): SEED is not defined in the visible cells - confirm it is set before this cell runs.
traditional_models = {
    'SVM': SVC(kernel='linear', C=1.0, class_weight='balanced', random_state=SEED),
    'Random Forest': RandomForestClassifier(n_estimators=200, min_samples_split=6, random_state=SEED),
    'Logistic Regression': LogisticRegression(max_iter=1000, class_weight='balanced', random_state=SEED),
    'KNN': KNeighborsClassifier(n_neighbors=5)
}

print("üöÄ Training Traditional ML Models...\n")

for model_name, model in traditional_models.items():
    print('=' * 60)
    print(f"Training {model_name}...")
    start_time = time.time()

    # Fit on the training split, score on the validation split
    model.fit(X_train_tfidf, y_train)
    y_val_pred = model.predict(X_val_tfidf)

    accuracy = accuracy_score(y_val, y_val_pred)
    macro_scores = precision_recall_fscore_support(y_val, y_val_pred, average='macro')
    weighted_scores = precision_recall_fscore_support(y_val, y_val_pred, average='weighted')

    # The timer covers fitting, prediction and metric computation
    train_time = time.time() - start_time

    # zip() stops after precision/recall/F1, dropping the trailing support value
    metrics = {'model': model, 'accuracy': accuracy}
    metrics.update(zip(('precision_macro', 'recall_macro', 'f1_macro'), macro_scores))
    metrics.update(zip(('precision_weighted', 'recall_weighted', 'f1_weighted'), weighted_scores))
    metrics['train_time'] = train_time
    traditional_ml_results[model_name] = metrics

    print(f"  Accuracy: {accuracy:.4f}")
    print(f"  F1-Score (Macro): {metrics['f1_macro']:.4f}")
    print(f"  F1-Score (Weighted): {metrics['f1_weighted']:.4f}")
    print(f"  Training time: {train_time:.2f}s")

print("\n" + '=' * 60)
print("‚úÖ Traditional ML training complete!")

## 5. Hybrid Model Architecture

Implement the XLM-RoBERTa + CNN hybrid model with mean pooling.

In [None]:
class HybridSentimentModel(nn.Module):
    """
    Hybrid XLM-RoBERTa + CNN model for Marathi sentiment analysis.

    Architecture (sizes come from the config object):
        1. Pre-trained XLM-RoBERTa encoder for contextual token embeddings
        2. Attention-masked mean pooling over the sequence
        3. Conv1D + ReLU + attention-masked max pooling for local patterns
        4. Concatenation of the mean-pooled and CNN features
        5. Dense classification head: dropout -> fc1 -> ReLU -> dropout -> fc2

    Both pooling branches ignore padded positions, so the features do not
    depend on how much padding a sequence received.

    Args:
        config: Configuration object with hyperparameters
    """

    def __init__(self, config: Config):
        super().__init__()

        # Load pre-trained XLM-RoBERTa
        self.plm = XLMRobertaModel.from_pretrained(config.plm_name)

        # CNN for local patterns; padding='same' keeps the sequence length
        self.cnn = nn.Conv1d(
            in_channels=config.plm_hidden_size,
            out_channels=config.cnn_out_channels,
            kernel_size=config.cnn_kernel_size,
            padding='same'
        )

        # Classification head
        self.dropout = nn.Dropout(config.dropout_rate)
        self.fc1 = nn.Linear(config.plm_hidden_size + config.cnn_out_channels, config.dense_hidden_size)
        self.fc2 = nn.Linear(config.dense_hidden_size, config.num_classes)

        self.relu = nn.ReLU()

    def mean_pooling(self, token_embeddings: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
        """
        Apply mean pooling with attention mask.

        Args:
            token_embeddings: Token embeddings (batch_size, seq_len, hidden_size)
            attention_mask: Attention mask (batch_size, seq_len)

        Returns:
            Pooled embeddings (batch_size, hidden_size)
        """
        input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
        sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, dim=1)
        # Clamp the token count so an all-padding row cannot divide by zero
        sum_mask = torch.clamp(input_mask_expanded.sum(dim=1), min=1e-9)
        return sum_embeddings / sum_mask

    def forward(self, input_ids: torch.Tensor, attention_mask: torch.Tensor) -> torch.Tensor:
        """
        Forward pass.

        Args:
            input_ids: Input token IDs (batch_size, seq_len)
            attention_mask: Attention mask (batch_size, seq_len)

        Returns:
            Logits (batch_size, num_classes)
        """
        # Get PLM embeddings: (batch, seq_len, hidden)
        plm_output = self.plm(input_ids=input_ids, attention_mask=attention_mask)
        token_embeddings = plm_output.last_hidden_state

        # Mean pooling over real tokens: (batch, hidden)
        pooled_output = self.mean_pooling(token_embeddings, attention_mask)

        # Mask of real (non-padding) tokens: (batch, seq_len, 1)
        token_mask = attention_mask.unsqueeze(-1).to(token_embeddings.dtype)

        # CNN for local patterns. Padded embeddings are zeroed first so windows
        # at the end of a sentence see zeros instead of padding-token states.
        # Conv1d expects channels first: (batch, seq_len, hidden) -> (batch, hidden, seq_len)
        cnn_input = (token_embeddings * token_mask).permute(0, 2, 1)
        cnn_output = self.relu(self.cnn(cnn_input))  # (batch, cnn_out_channels, seq_len)

        # Zero the outputs at padded positions before the global max. ReLU
        # outputs are non-negative, so a zero can never beat a real activation,
        # while an unmasked padding position could.
        cnn_output = cnn_output * token_mask.permute(0, 2, 1)
        cnn_pooled = cnn_output.max(dim=2).values  # (batch, cnn_out_channels)

        # Concatenate features: (batch, hidden + cnn_out_channels)
        combined = torch.cat([pooled_output, cnn_pooled], dim=1)

        # Classification layers
        x = self.dropout(combined)
        x = self.fc1(x)
        x = self.relu(x)
        x = self.dropout(x)
        logits = self.fc2(x)  # (batch, num_classes)

        return logits

# Instantiate the hybrid model, move it to the selected device and report its size
hybrid_model = HybridSentimentModel(config)
hybrid_model = hybrid_model.to(device)
print("‚úÖ Hybrid model initialized")
print(
    f"   Total parameters: {sum(map(torch.numel, hybrid_model.parameters())):,}",
    f"   Trainable parameters: {sum(p.numel() for p in hybrid_model.parameters() if p.requires_grad):,}",
    sep="\n",
)

## 6. Training Functions

Implement training and validation loops with early stopping.

In [None]:
def train_epoch(
    model: nn.Module,
    data_loader: DataLoader,
    criterion: nn.Module,
    optimizer: torch.optim.Optimizer,
    device: torch.device
) -> Tuple[float, float]:
    """
    Train for one epoch.
    
    Args:
        model: PyTorch model
        data_loader: Training data loader
        criterion: Loss function
        optimizer: Optimizer
        device: Device to train on
        
    Returns:
        Average loss and accuracy for the epoch
    """
    model.train()
    total_loss = 0
    correct_predictions = 0
    total_samples = 0
    
    progress_bar = tqdm(data_loader, desc="Training")
    
    for batch in progress_bar:
        input_ids = batch['input_ids'].to(device)
        attention_mask = batch['attention_mask'].to(device)
        labels = batch['label'].to(device)
        
        optimizer.zero_grad()
        
        # Forward pass
        logits = model(input_ids, attention_mask)
        loss = criterion(logits, labels)
        
        # Backward pass
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()
        
        # Calculate accuracy
        _, preds = torch.max(logits, dim=1)
        correct_predictions += torch.sum(preds == labels).item()
        total_samples += labels.size(0)
        total_loss += loss.item()
        
        # Update progress bar
        progress_bar.set_postfix({
            'loss': f'{loss.item():.4f}',
            'acc': f'{correct_predictions/total_samples:.4f}'
        })
    
    avg_loss = total_loss / len(data_loader)
    accuracy = correct_predictions / total_samples
    
    return avg_loss, accuracy


def validate(
    model: nn.Module,
    data_loader: DataLoader,
    criterion: nn.Module,
    device: torch.device
) -> Tuple[float, float, np.ndarray, np.ndarray]:
    """
    Validate the model.
    
    Args:
        model: PyTorch model
        data_loader: Validation data loader
        criterion: Loss function
        device: Device to validate on
        
    Returns:
        Average loss, accuracy, predictions, and true labels
    """
    model.eval()
    total_loss = 0
    all_preds = []
    all_labels = []
    
    with torch.no_grad():
        for batch in tqdm(data_loader, desc="Validation"):
            input_ids = batch['input_ids'].to(device)
            attention_mask = batch['attention_mask'].to(device)
            labels = batch['label'].to(device)
            
            # Forward pass
            logits = model(input_ids, attention_mask)
            loss = criterion(logits, labels)
            
            # Get predictions
            _, preds = torch.max(logits, dim=1)
            
            all_preds.extend(preds.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
            total_loss += loss.item()
    
    avg_loss = total_loss / len(data_loader)
    all_preds = np.array(all_preds)
    all_labels = np.array(all_labels)
    accuracy = accuracy_score(all_labels, all_preds)
    
    return avg_loss, accuracy, all_preds, all_labels

print("‚úÖ Training functions defined")

## 7. Train Hybrid Model

Train the hybrid XLM-RoBERTa + CNN model with early stopping.

In [None]:
# Training setup
criterion = nn.CrossEntropyLoss()
optimizer = AdamW(hybrid_model.parameters(), lr=config.learning_rate_hybrid, weight_decay=config.weight_decay)

# Learning rate scheduler
total_steps = len(train_loader) * config.num_epochs
scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=config.warmup_steps,
    num_training_steps=total_steps
)

# Training history
history = {
    'train_loss': [],
    'train_acc': [],
    'val_loss': [],
    'val_acc': []
}

# Early stopping
best_val_loss = float('inf')
patience_counter = 0
best_model_path = os.path.join(config.models_dir, 'hybrid_model_best.pt')

print("üöÄ Starting Hybrid Model Training...\n")

for epoch in range(config.num_epochs):
    print(f"{'='*60}")
    print(f"Epoch {epoch + 1}/{config.num_epochs}")
    print(f"{'='*60}")
    
    # Train
    train_loss, train_acc = train_epoch(hybrid_model, train_loader, criterion, optimizer, device)
    
    # Validate
    val_loss, val_acc, val_preds, val_labels = validate(hybrid_model, val_loader, criterion, device)
    
    # Update scheduler
    scheduler.step()
    
    # Store history
    history['train_loss'].append(train_loss)
    history['train_acc'].append(train_acc)
    history['val_loss'].append(val_loss)
    history['val_acc'].append(val_acc)
    
    # Print epoch summary
    print(f"\nEpoch {epoch + 1} Summary:")
    print(f"  Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.4f}")
    print(f"  Val Loss:   {val_loss:.4f} | Val Acc:   {val_acc:.4f}")
    
    # Early stopping
    if val_loss < best_val_loss:
        best_val_loss = val_loss
        patience_counter = 0
        # Save best model
        torch.save({
            'epoch': epoch,
            'model_state_dict': hybrid_model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'val_loss': val_loss,
            'val_acc': val_acc,
        }, best_model_path)
        print(f"  ‚úÖ Best model saved! (Val Loss: {val_loss:.4f})")
    else:
        patience_counter += 1
        print(f"  ‚è≥ No improvement ({patience_counter}/{config.early_stopping_patience})")
        
        if patience_counter >= config.early_stopping_patience:
            print(f"\nüõë Early stopping triggered at epoch {epoch + 1}")
            break
    
    print()

print("‚úÖ Hybrid model training complete!")
print(f"   Best validation loss: {best_val_loss:.4f}")
print(f"   Model saved to: {best_model_path}")

## 8. Model Evaluation

Evaluate the hybrid model on the test set and calculate comprehensive metrics.

In [None]:
# Load best model. map_location=device lets a checkpoint that was saved on a
# GPU be restored on whichever device this session is using (e.g. CPU-only).
checkpoint = torch.load(best_model_path, map_location=device)
hybrid_model.load_state_dict(checkpoint['model_state_dict'])
print(f"‚úÖ Loaded best model from epoch {checkpoint['epoch'] + 1}")

# Evaluate on test set
test_loss, test_acc, test_preds, test_labels = validate(hybrid_model, test_loader, criterion, device)

# Calculate comprehensive metrics (macro = unweighted class mean, weighted = by support)
precision_macro, recall_macro, f1_macro, _ = precision_recall_fscore_support(
    test_labels, test_preds, average='macro'
)
precision_weighted, recall_weighted, f1_weighted, _ = precision_recall_fscore_support(
    test_labels, test_preds, average='weighted'
)

# Per-class metrics (average=None returns one value per class)
precision_per_class, recall_per_class, f1_per_class, support = precision_recall_fscore_support(
    test_labels, test_preds, average=None
)

# Print results
print(f"\n{'='*60}")
print("üéØ HYBRID MODEL TEST SET RESULTS")
print(f"{'='*60}")
print(f"\nOverall Metrics:")
print(f"  Accuracy:            {test_acc:.4f}")
print(f"  Precision (Macro):   {precision_macro:.4f}")
print(f"  Recall (Macro):      {recall_macro:.4f}")
print(f"  F1-Score (Macro):    {f1_macro:.4f}")
print(f"  F1-Score (Weighted): {f1_weighted:.4f}")

print(f"\nPer-Class Metrics:")
for i, class_name in enumerate(config.label_names):
    print(f"\n  {class_name}:")
    print(f"    Precision: {precision_per_class[i]:.4f}")
    print(f"    Recall:    {recall_per_class[i]:.4f}")
    print(f"    F1-Score:  {f1_per_class[i]:.4f}")
    print(f"    Support:   {support[i]}")

# Store hybrid model results for the comparison and export cells
hybrid_results = {
    'accuracy': test_acc,
    'precision_macro': precision_macro,
    'recall_macro': recall_macro,
    'f1_macro': f1_macro,
    'f1_weighted': f1_weighted,
    'test_preds': test_preds,
    'test_labels': test_labels
}

## 9. Results Visualization

Create comprehensive visualizations comparing all models.

In [None]:
# 1. Training History: loss on the left panel, accuracy on the right
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))

epochs_range = range(1, len(history['train_loss']) + 1)

# Each row: (axes, train history key, validation history key, metric display name)
panels = (
    (ax1, 'train_loss', 'val_loss', 'Loss'),
    (ax2, 'train_acc', 'val_acc', 'Accuracy'),
)
for panel, train_key, val_key, metric_name in panels:
    panel.plot(epochs_range, history[train_key], 'b-', label=f'Train {metric_name}', linewidth=2)
    panel.plot(epochs_range, history[val_key], 'r-', label=f'Val {metric_name}', linewidth=2)
    panel.set_xlabel('Epoch', fontsize=12)
    panel.set_ylabel(metric_name, fontsize=12)
    panel.set_title(f'Training and Validation {metric_name}', fontsize=14, fontweight='bold')
    panel.legend(fontsize=11)
    panel.grid(True, alpha=0.3)

plt.tight_layout()
history_figure_path = os.path.join(config.figures_dir, 'training_history.png')
plt.savefig(history_figure_path, dpi=300, bbox_inches='tight')
plt.show()

print("‚úÖ Training history plot saved")

In [None]:
# 2. Confusion Matrix for Hybrid Model (rows: true class, columns: predicted class)
cm = confusion_matrix(test_labels, test_preds)

plt.figure(figsize=(10, 8))
sns.heatmap(
    cm,
    cmap='Blues',
    annot=True,
    fmt='d',  # integer counts in every cell
    cbar_kws={'label': 'Count'},
    yticklabels=config.label_names,
    xticklabels=config.label_names,
)
plt.ylabel('True Label', fontsize=12)
plt.xlabel('Predicted Label', fontsize=12)
plt.title('Confusion Matrix - Hybrid XLM-RoBERTa + CNN', fontsize=14, fontweight='bold', pad=20)
plt.tight_layout()

confusion_figure_path = os.path.join(config.figures_dir, 'confusion_matrix.png')
plt.savefig(confusion_figure_path, dpi=300, bbox_inches='tight')
plt.show()

print("‚úÖ Confusion matrix saved")

In [None]:
# 3. Model Comparison Bar Chart
# The metrics stored in traditional_ml_results were computed on the validation
# split, while hybrid_results holds test-split metrics. To compare like with
# like, every fitted baseline is re-scored on the test split here.
model_names = list(traditional_ml_results) + ['Hybrid XLM-R + CNN']

accuracies = []
f1_scores = []
for baseline in traditional_ml_results.values():
    baseline_test_preds = baseline['model'].predict(X_test_tfidf)
    accuracies.append(accuracy_score(y_test, baseline_test_preds))
    # Index 2 of (precision, recall, f1, support) is the macro F1
    f1_scores.append(
        precision_recall_fscore_support(y_test, baseline_test_preds, average='macro')[2]
    )
accuracies.append(hybrid_results['accuracy'])
f1_scores.append(hybrid_results['f1_macro'])

x = np.arange(len(model_names))
width = 0.35

fig, ax = plt.subplots(figsize=(14, 6))
bars1 = ax.bar(x - width/2, accuracies, width, label='Accuracy', alpha=0.8, color='skyblue')
bars2 = ax.bar(x + width/2, f1_scores, width, label='F1-Score (Macro)', alpha=0.8, color='lightcoral')

ax.set_xlabel('Models', fontsize=12, fontweight='bold')
ax.set_ylabel('Score', fontsize=12, fontweight='bold')
ax.set_title('Model Comparison: Traditional ML vs Hybrid Deep Learning', fontsize=14, fontweight='bold', pad=20)
ax.set_xticks(x)
ax.set_xticklabels(model_names, rotation=45, ha='right')
ax.legend(fontsize=11)
ax.grid(axis='y', alpha=0.3)
ax.set_ylim([0, 1.0])

# Add value labels on bars
for bars in [bars1, bars2]:
    for bar in bars:
        height = bar.get_height()
        ax.text(bar.get_x() + bar.get_width()/2., height,
                f'{height:.3f}',
                ha='center', va='bottom', fontsize=9)

plt.tight_layout()
plt.savefig(os.path.join(config.figures_dir, 'model_comparison.png'), dpi=300, bbox_inches='tight')
plt.show()

print("‚úÖ Model comparison chart saved")

In [None]:
# 4. Per-Class Metrics for Hybrid Model: one bar group per sentiment class
per_class_columns = {
    'Precision': precision_per_class,
    'Recall': recall_per_class,
    'F1-Score': f1_per_class
}
metrics_df = pd.DataFrame(per_class_columns, index=config.label_names)

ax = metrics_df.plot(kind='bar', figsize=(12, 6), width=0.8, alpha=0.8)
ax.set_title('Per-Class Performance: Hybrid XLM-RoBERTa + CNN', fontsize=14, fontweight='bold', pad=20)
ax.set_xlabel('Sentiment Class', fontsize=12, fontweight='bold')
ax.set_ylabel('Score', fontsize=12, fontweight='bold')
ax.set_xticklabels(config.label_names, rotation=0)
ax.set_ylim([0, 1.0])
ax.grid(axis='y', alpha=0.3)
ax.legend(fontsize=11)

# Print each bar's value above it
for container in ax.containers:
    ax.bar_label(container, fmt='%.3f', fontsize=9)

plt.tight_layout()
per_class_figure_path = os.path.join(config.figures_dir, 'per_class_metrics.png')
plt.savefig(per_class_figure_path, dpi=300, bbox_inches='tight')
plt.show()

print("‚úÖ Per-class metrics chart saved")

## 10. Error Analysis

Analyze misclassifications to gain insights for model improvement.

In [None]:
# Positions of wrong and right predictions in the test set
misclassified_indices = np.flatnonzero(test_preds != test_labels)
correct_indices = np.flatnonzero(test_preds == test_labels)

print(f"üìä Error Analysis Summary:")
print(f"   Total test samples: {len(test_labels)}")
print(f"   Correct predictions: {len(correct_indices)} ({len(correct_indices)/len(test_labels)*100:.2f}%)")
print(f"   Misclassifications: {len(misclassified_indices)} ({len(misclassified_indices)/len(test_labels)*100:.2f}%)")

# Count every (true class -> predicted class) confusion pair
print(f"\nüîç Misclassification Patterns:")
for true_label in range(3):
    for pred_label in range(3):
        if true_label == pred_label:
            continue
        # With true != pred, any matching sample is a misclassification, so
        # counting over the full arrays equals counting over the wrong ones only.
        count = np.sum((test_labels == true_label) & (test_preds == pred_label))
        if count > 0:
            print(f"   {config.label_names[true_label]} ‚Üí {config.label_names[pred_label]}: {count} samples")

# Show the first ten misclassified texts. Positions index test_df directly,
# which relies on the test loader keeping dataset order (shuffle=False).
print(f"\n‚ùå Sample Misclassifications:\n")
for i in misclassified_indices[:10]:
    true_label = test_labels[i]
    pred_label = test_preds[i]
    text = test_df['text_clean'].iloc[i]

    print(f"True: {config.label_names[true_label]} | Pred: {config.label_names[pred_label]}")
    print(f"Text: {text[:120]}...")
    print("-" * 80)

# Compare raw text lengths of wrong versus right predictions
misclassified_lengths = test_df['text_length'].iloc[misclassified_indices].values
correct_lengths = test_df['text_length'].iloc[correct_indices].values

print(f"\nüìè Text Length Analysis:")
print(f"   Misclassified - Mean: {misclassified_lengths.mean():.1f}, Median: {np.median(misclassified_lengths):.1f}")
print(f"   Correct - Mean: {correct_lengths.mean():.1f}, Median: {np.median(correct_lengths):.1f}")

# Overlaid histograms: correct in green, misclassified in red
plt.figure(figsize=(12, 5))
for group_lengths, group_label, group_color in (
    (correct_lengths, 'Correct', 'green'),
    (misclassified_lengths, 'Misclassified', 'red'),
):
    plt.hist(group_lengths, bins=50, alpha=0.6, label=group_label, color=group_color, edgecolor='black')
plt.xlabel('Text Length (characters)', fontsize=12)
plt.ylabel('Frequency', fontsize=12)
plt.title('Text Length Distribution: Correct vs Misclassified Predictions', fontsize=14, fontweight='bold')
plt.legend(fontsize=11)
plt.grid(axis='y', alpha=0.3)
plt.tight_layout()
plt.savefig(os.path.join(config.figures_dir, 'error_analysis_length.png'), dpi=300, bbox_inches='tight')
plt.show()

print("\n‚úÖ Error analysis complete!")

## 11. Final Summary and Export Results

Compile all results and save to files for future reference.

In [None]:
# Final summary: gather the headline metrics of every model, print a comparison
# table, then persist the comparison (CSV) and the experiment configuration
# (JSON) under config.results_dir.
import json  # local import keeps this cell runnable even if no earlier cell imported json

# Create comprehensive results summary
results_summary = {
    # Traditional ML baselines, keyed by model name.
    'Traditional ML': {
        model_name: {
            'Accuracy': results['accuracy'],
            'F1-Score (Macro)': results['f1_macro'],
            'F1-Score (Weighted)': results['f1_weighted'],
            'Training Time (s)': results['train_time'],
        }
        for model_name, results in traditional_ml_results.items()
    },
    'Hybrid Model': {
        'Accuracy': hybrid_results['accuracy'],
        'Precision (Macro)': hybrid_results['precision_macro'],
        'Recall (Macro)': hybrid_results['recall_macro'],
        'F1-Score (Macro)': hybrid_results['f1_macro'],
        'F1-Score (Weighted)': hybrid_results['f1_weighted'],
    },
}

# Print final summary table
print(f"\n{'='*80}")
print("üèÜ FINAL RESULTS SUMMARY")
print(f"{'='*80}\n")

print("Traditional ML Models (TF-IDF Features):")
print(f"{'Model':<20} {'Accuracy':<12} {'F1-Macro':<12} {'F1-Weighted':<12} {'Time (s)':<10}")
print("-" * 80)
for model_name, metrics in results_summary['Traditional ML'].items():
    print(f"{model_name:<20} {metrics['Accuracy']:<12.4f} {metrics['F1-Score (Macro)']:<12.4f} "
          f"{metrics['F1-Score (Weighted)']:<12.4f} {metrics['Training Time (s)']:<10.2f}")

print(f"\n{'='*80}")
print("Hybrid Deep Learning Model (XLM-RoBERTa + CNN):")
print(f"{'Metric':<30} {'Score':<12}")
print("-" * 80)
for metric, value in results_summary['Hybrid Model'].items():
    print(f"{metric:<30} {value:<12.4f}")

# Find best model
# default=None keeps the cell from crashing with ValueError when no traditional
# baselines were run (empty traditional_ml_results).
best_traditional = max(
    results_summary['Traditional ML'].items(),
    key=lambda x: x[1]['F1-Score (Macro)'],
    default=None,
)
hybrid_f1 = results_summary['Hybrid Model']['F1-Score (Macro)']

print(f"\n{'='*80}")
print("ü•á BEST MODELS:")
if best_traditional is not None:
    best_traditional_f1 = best_traditional[1]['F1-Score (Macro)']
    print(f"   Traditional ML: {best_traditional[0]} (F1-Macro: {best_traditional_f1:.4f})")
else:
    print("   Traditional ML: n/a (no baseline results available)")
print(f"   Deep Learning:  Hybrid XLM-R + CNN (F1-Macro: {hybrid_f1:.4f})")
if best_traditional is not None:
    # The value is an absolute F1 difference scaled by 100, i.e. percentage
    # points (not a relative "% increase"); the sign shows the direction.
    print(f"   Improvement:    {(hybrid_f1 - best_traditional_f1)*100:+.2f} percentage points (F1-Macro)")
print(f"{'='*80}\n")

# Save results to CSV
comparison_rows = [
    {
        'Model': model_name,
        'Accuracy': metrics['Accuracy'],
        'F1_Macro': metrics['F1-Score (Macro)'],
        'F1_Weighted': metrics['F1-Score (Weighted)'],
    }
    for model_name, metrics in results_summary['Traditional ML'].items()
]
comparison_rows.append({
    'Model': 'Hybrid XLM-R + CNN',
    'Accuracy': hybrid_results['accuracy'],
    'F1_Macro': hybrid_results['f1_macro'],
    'F1_Weighted': hybrid_results['f1_weighted'],
})
results_df = pd.DataFrame(comparison_rows, columns=['Model', 'Accuracy', 'F1_Macro', 'F1_Weighted'])

# Make sure the output directory exists before writing into it.
os.makedirs(config.results_dir, exist_ok=True)

results_csv_path = os.path.join(config.results_dir, 'model_comparison_results.csv')
results_df.to_csv(results_csv_path, index=False)
print(f"‚úÖ Results saved to: {results_csv_path}")

# Save configuration
config_path = os.path.join(config.results_dir, 'experiment_config.json')
# Explicit UTF-8 so the file does not depend on the platform's default encoding.
with open(config_path, 'w', encoding='utf-8') as f:
    json.dump(config.to_dict(), f, indent=2)
print(f"‚úÖ Configuration saved to: {config_path}")

print("\nüéâ Experiment complete! All results and visualizations have been saved.")
print(f"   üìÅ Results directory: {config.results_dir}")
print(f"   üìä Figures directory: {config.figures_dir}")
print(f"   ü§ñ Models directory: {config.models_dir}")