# Model Comparison for Sign Language Recognition

This notebook compares different model architectures for sign language recognition. We'll train and evaluate multiple models on the same dataset to determine which performs best.

## Setup

First, let's import the necessary libraries and set up our environment.

In [None]:
# Add parent directory to path for imports
import sys
import os
sys.path.append(os.path.dirname(os.getcwd()))

# Standard libraries
import json
import pickle
import random
import time
from pathlib import Path

# Data processing libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm.notebook import tqdm
from sklearn.metrics import classification_report, confusion_matrix

# Deep learning libraries
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader

# Import project modules
from datasets.data_loader import get_dataloader
from models.cnn_lstm import CNNLSTM, LandmarkCNNLSTM
from models.mediapipe_ml import MediaPipeML
from models.transformer import TransformerModel

# Set plotting style: use matplotlib's 'fivethirtyeight' style sheet and
# seaborn's 'viridis' palette for the figures drawn later in this notebook.
plt.style.use('fivethirtyeight')
sns.set_palette('viridis')
# IPython magic (not plain Python): render matplotlib figures inline.
%matplotlib inline

# Seed every random number generator used in this notebook so that
# repeated runs start from the same state.
seed = 42
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    _seed_fn(seed)

# Seed all CUDA devices too, but only when a GPU is actually present.
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(seed)

## Configuration

Define paths and parameters for the experiments.

In [None]:
# --- Dataset -----------------------------------------------------------
# Directory read by the dataset-loading cell (metadata.json + splits).
DATA_DIR = Path('..') / 'datasets' / 'processed' / 'wlasl'
BATCH_SIZE = 32
NUM_WORKERS = 4

# --- Training ----------------------------------------------------------
NUM_EPOCHS = 20
LEARNING_RATE = 1e-3
# Early stopping patience
PATIENCE = 5

# --- Models ------------------------------------------------------------
# Registry mapping a short model name to its class.
MODELS = dict(
    landmark_cnn_lstm=LandmarkCNNLSTM,
    mediapipe_ml=MediaPipeML,
    transformer=TransformerModel,
)

# --- Device ------------------------------------------------------------
# Prefer the GPU when one is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = torch.device('cuda')
else:
    DEVICE = torch.device('cpu')
print(f"Using device: {DEVICE}")

## Load Dataset

Load the dataset and prepare dataloaders for training, validation, and testing.

In [None]:
# Load metadata and build the train/val/test dataloaders, provided the
# preprocessed dataset directory is present.
if DATA_DIR.exists():
    # Dataset-level metadata written by the preprocessing step
    metadata_path = DATA_DIR / 'metadata.json'
    with metadata_path.open('r') as f:
        metadata = json.load(f)

    print(f"Number of classes: {metadata['num_classes']}")
    print(f"Total processed videos: {metadata['processed_videos']}")

    # Options shared by all three splits
    loader_kwargs = {'batch_size': BATCH_SIZE, 'num_workers': NUM_WORKERS}

    # One dataloader per split, created in train -> val -> test order
    train_dataloader = get_dataloader(DATA_DIR, split='train', **loader_kwargs)
    val_dataloader = get_dataloader(DATA_DIR, split='val', **loader_kwargs)
    test_dataloader = get_dataloader(DATA_DIR, split='test', **loader_kwargs)

    print(f"Train dataset size: {len(train_dataloader.dataset)}")
    print(f"Validation dataset size: {len(val_dataloader.dataset)}")
    print(f"Test dataset size: {len(test_dataloader.dataset)}")

    # Peek at one training batch; the size of the last axis of its
    # 'features' entry is used as the models' input dimension.
    sample_batch = next(iter(train_dataloader))
    input_dim = sample_batch['features'].shape[-1]
    print(f"Input dimension: {input_dim}")
else:
    print(f"Dataset directory {DATA_DIR} does not exist. Please run preprocessing scripts first.")

## Training Functions

Define functions for training and evaluating models.

In [None]:
def train_epoch(model, dataloader, criterion, optimizer, device):
    """Run a single training pass over ``dataloader``.

    The model is switched to training mode; every batch goes through a
    forward pass, a backward pass and one optimizer step.

    Args:
        model: Model to optimise (updated in place).
        dataloader: Iterable of batches; each batch is indexed with the
            keys ``'features'`` and ``'label'``.
        criterion: Loss callable invoked as ``criterion(outputs, labels)``.
        optimizer: Optimizer whose ``zero_grad``/``step`` are called per batch.
        device: Device the batch tensors are moved to.

    Returns:
        Tuple ``(epoch_loss, epoch_acc)``: the accumulated batch-size-weighted
        loss and the number of correct predictions, each divided by the
        number of samples seen.
    """
    model.train()

    loss_sum = 0.0
    num_correct = 0
    num_seen = 0

    for batch in tqdm(dataloader, desc="Training", leave=False):
        features, labels = batch['features'].to(device), batch['label'].to(device)

        # Standard optimisation step: reset grads, forward, backward, update
        optimizer.zero_grad()
        outputs = model(features)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        # Weight the batch loss by the batch size so the final division
        # yields a per-sample value (assumes the criterion returns a
        # batch mean -- TODO confirm for non-default criteria).
        loss_sum += loss.item() * features.size(0)

        # Predicted class = index of the largest output along dim 1
        predicted = torch.max(outputs.data, 1).indices
        num_seen += labels.size(0)
        num_correct += (predicted == labels).sum().item()

    return loss_sum / num_seen, num_correct / num_seen

def validate(model, dataloader, criterion, device):
    """Evaluate ``model`` on ``dataloader`` without updating its weights.

    The model is switched to evaluation mode and all batches are processed
    under ``torch.no_grad()``.

    Args:
        model: Model to evaluate.
        dataloader: Iterable of batches; each batch is indexed with the
            keys ``'features'`` and ``'label'``.
        criterion: Loss callable invoked as ``criterion(outputs, labels)``.
        device: Device the batch tensors are moved to.

    Returns:
        Tuple ``(val_loss, val_acc, all_preds, all_labels)`` where the first
        two are per-sample averages and the last two are flat lists of the
        predicted and true labels collected over every batch.
    """
    model.eval()

    loss_sum = 0.0
    num_correct = 0
    num_seen = 0
    all_preds, all_labels = [], []

    with torch.no_grad():
        for batch in tqdm(dataloader, desc="Validating", leave=False):
            features = batch['features'].to(device)
            labels = batch['label'].to(device)

            outputs = model(features)
            loss = criterion(outputs, labels)

            # Batch-size-weighted loss and running accuracy counters
            loss_sum += loss.item() * features.size(0)
            predicted = torch.max(outputs.data, 1).indices
            num_seen += labels.size(0)
            num_correct += (predicted == labels).sum().item()

            # Keep predictions and targets on the CPU for later analysis
            all_preds.extend(predicted.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())

    return loss_sum / num_seen, num_correct / num_seen, all_preds, all_labels

def train_model(model_name, model, dataloaders, criterion, optimizer, device, num_epochs=20, patience=5):
    """Train a model with early stopping on validation accuracy.

    After training, the model is restored to the weights of the epoch with
    the highest validation accuracy.

    Args:
        model_name: Identifier of the model. Kept for API compatibility;
            it is not used inside this function.
        model: Model to train; it is updated in place.
        dataloaders: ``(train_dataloader, val_dataloader)`` pair.
        criterion: Loss function forwarded to ``train_epoch``/``validate``.
        optimizer: Optimizer forwarded to ``train_epoch``.
        device: Device forwarded to ``train_epoch``/``validate``.
        num_epochs: Maximum number of epochs to run.
        patience: Stop once this many consecutive epochs have passed
            without a new best validation accuracy.

    Returns:
        Tuple ``(model, stats)``. ``stats`` is a dict with the per-epoch
        lists ``train_losses``, ``train_accs``, ``val_losses``, ``val_accs``
        plus ``best_val_acc``, ``best_epoch`` (0-based) and
        ``training_time`` (seconds).
    """
    # Local import keeps this definition self-contained.
    import copy

    train_dataloader, val_dataloader = dataloaders

    # Initialize variables for early stopping.
    # state_dict() returns tensors that share storage with the live
    # parameters, and dict.copy() is shallow, so a deep copy is required;
    # otherwise the "best" snapshot would silently follow later updates.
    best_val_acc = 0.0
    best_model_wts = copy.deepcopy(model.state_dict())
    best_epoch = 0
    no_improve_epochs = 0

    # Initialize lists for plotting
    train_losses = []
    train_accs = []
    val_losses = []
    val_accs = []

    # Training loop
    start_time = time.time()
    for epoch in range(num_epochs):
        print(f"Epoch {epoch+1}/{num_epochs}")

        # Train for one epoch
        train_loss, train_acc = train_epoch(model, train_dataloader, criterion, optimizer, device)

        # Validate
        val_loss, val_acc, _, _ = validate(model, val_dataloader, criterion, device)

        # Print statistics
        print(f"Train Loss: {train_loss:.4f} | Train Acc: {train_acc:.4f}")
        print(f"Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.4f}")

        # Save statistics for plotting
        train_losses.append(train_loss)
        train_accs.append(train_acc)
        val_losses.append(val_loss)
        val_accs.append(val_acc)

        # Check if this is the best model so far
        if val_acc > best_val_acc:
            best_val_acc = val_acc
            # Independent snapshot of the weights at the best epoch
            best_model_wts = copy.deepcopy(model.state_dict())
            best_epoch = epoch
            no_improve_epochs = 0
        else:
            no_improve_epochs += 1

        # Check if we should stop early
        if no_improve_epochs >= patience:
            print(f"Early stopping at epoch {epoch+1}")
            break

    # Calculate training time
    time_elapsed = time.time() - start_time
    print(f"Training complete in {time_elapsed // 60:.0f}m {time_elapsed % 60:.0f}s")
    print(f"Best validation accuracy: {best_val_acc:.4f} at epoch {best_epoch+1}")

    # Restore the weights captured at the best epoch
    model.load_state_dict(best_model_wts)

    return model, {
        'train_losses': train_losses,
        'train_accs': train_accs,
        'val_losses': val_losses,
        'val_accs': val_accs,
        'best_val_acc': best_val_acc,
        'best_epoch': best_epoch,
        'training_time': time_elapsed
    }

def evaluate_model(model, test_dataloader, criterion, device):
    """Score ``model`` on the test split and visualise its errors.

    Runs ``validate`` on ``test_dataloader``, prints the loss/accuracy and
    a scikit-learn classification report, then shows the confusion matrix
    as a heatmap.

    Args:
        model: Trained model to evaluate.
        test_dataloader: Dataloader for the test split.
        criterion: Loss function forwarded to ``validate``.
        device: Device forwarded to ``validate``.

    Returns:
        Tuple ``(test_loss, test_acc)``.
    """
    test_loss, test_acc, predictions, targets = validate(
        model, test_dataloader, criterion, device
    )
    print(f"Test Loss: {test_loss:.4f} | Test Acc: {test_acc:.4f}")

    # Per-class precision / recall / F1
    print("\nClassification Report:")
    print(classification_report(targets, predictions))

    # Confusion matrix (true labels first, predictions second), as a heatmap
    cm = confusion_matrix(targets, predictions)
    plt.figure(figsize=(10, 8))
    heatmap_ax = sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    heatmap_ax.set_title('Confusion Matrix')
    heatmap_ax.set_xlabel('Predicted Label')
    heatmap_ax.set_ylabel('True Label')
    plt.show()

    return test_loss, test_acc

## Train Models

Train different model architectures and compare their performance.

In [None]:
# Train and evaluate each architecture, but only when the dataset-loading
# cell has defined all three dataloaders.
if {'train_dataloader', 'val_dataloader', 'test_dataloader'}.issubset(locals()):
    # Per-model outcome: trained model, training history and test scores
    results = {}

    num_classes = metadata['num_classes']

    for model_name in ('landmark_cnn_lstm', 'transformer'):
        print(f"\n{'='*50}")
        print(f"Training {model_name} model")
        print(f"{'='*50}")

        # Look the class up in the MODELS registry instead of branching
        model_cls = MODELS.get(model_name)
        if model_cls is None:
            print(f"Model {model_name} not implemented for this notebook")
            continue

        # Build the model and place it on the selected device
        model = model_cls(input_dim=input_dim, num_classes=num_classes)
        model = model.to(DEVICE)

        # Fresh loss function and optimizer for every model
        criterion = nn.CrossEntropyLoss()
        optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

        # Fit with early stopping on the validation split
        trained_model, training_stats = train_model(
            model_name,
            model,
            (train_dataloader, val_dataloader),
            criterion,
            optimizer,
            DEVICE,
            num_epochs=NUM_EPOCHS,
            patience=PATIENCE,
        )

        # Held-out evaluation
        print(f"\nEvaluating {model_name} on test set:")
        test_loss, test_acc = evaluate_model(trained_model, test_dataloader, criterion, DEVICE)

        results[model_name] = {
            'model': trained_model,
            'training_stats': training_stats,
            'test_loss': test_loss,
            'test_acc': test_acc
        }
else:
    print("Dataset not loaded. Please run the dataset loading cell first.")

## Compare Training Curves

Plot the training and validation curves for each model.

In [None]:
# Draw loss and accuracy histories side by side, one line pair per model.
# Skipped when the training cell has not produced any results.
if 'results' in locals() and results:
    fig, axes = plt.subplots(1, 2, figsize=(16, 6))

    # (train-history key, val-history key, panel title, y-axis label)
    panels = (
        ('train_losses', 'val_losses', 'Loss Curves', 'Loss'),
        ('train_accs', 'val_accs', 'Accuracy Curves', 'Accuracy'),
    )

    for ax, (train_key, val_key, title, y_label) in zip(axes, panels):
        for model_name, result in results.items():
            stats = result['training_stats']
            # Epochs are shown 1-based on the x-axis
            epochs = range(1, len(stats[train_key]) + 1)
            ax.plot(epochs, stats[train_key], 'o-', label=f"{model_name} (train)")
            ax.plot(epochs, stats[val_key], 's--', label=f"{model_name} (val)")

        ax.set_title(title)
        ax.set_xlabel('Epoch')
        ax.set_ylabel(y_label)
        ax.legend()
        ax.grid(True)

    plt.tight_layout()
    plt.show()

## Compare Model Performance

Compare the performance of different models on the test set.

In [None]:
# Summarise test accuracy, best validation accuracy and training time for
# every trained model, as a table and as two bar charts.
# Skipped when the training cell has not produced any results.
if 'results' in locals() and results:
    # Prepare data for plotting
    model_names = list(results.keys())
    test_accs = [results[name]['test_acc'] for name in model_names]
    training_times = [results[name]['training_stats']['training_time'] / 60 for name in model_names]  # in minutes
    best_val_accs = [results[name]['training_stats']['best_val_acc'] for name in model_names]

    # Create a DataFrame for easy visualization
    df = pd.DataFrame({
        'Model': model_names,
        'Test Accuracy': test_accs,
        'Best Validation Accuracy': best_val_accs,
        'Training Time (min)': training_times
    })

    # Print the table
    print("Model Performance Comparison:")
    print(df.to_string(index=False))

    # Plot test accuracy
    plt.figure(figsize=(10, 6))
    sns.barplot(x='Model', y='Test Accuracy', data=df)
    plt.title('Test Accuracy by Model')
    plt.ylabel('Accuracy')
    plt.ylim(0, 1)
    # Annotate each bar with its value, slightly above the bar top
    for i, acc in enumerate(test_accs):
        plt.text(i, acc + 0.02, f"{acc:.4f}", ha='center')
    plt.grid(True, axis='y')
    plt.show()

    # Plot training time
    plt.figure(figsize=(10, 6))
    sns.barplot(x='Model', y='Training Time (min)', data=df)
    plt.title('Training Time by Model')
    plt.ylabel('Time (minutes)')
    # The loop variable must not be called `time`: at notebook scope that
    # would rebind the imported `time` module to a float and break
    # `time.time()` in train_model on any later re-run.
    for i, minutes in enumerate(training_times):
        plt.text(i, minutes + 0.2, f"{minutes:.2f}", ha='center')
    plt.grid(True, axis='y')
    plt.show()

## Save Best Model

Save the best-performing model (selected by test accuracy) for later use.

In [None]:
# Persist the top-scoring model. Skipped when there are no results.
if 'results' in locals() and results:
    # Winner = highest test accuracy among the trained models.
    # NOTE(review): selecting on test accuracy makes the reported test score
    # optimistic; consider selecting on validation accuracy instead.
    best_model_name = max(results, key=lambda name: results[name]['test_acc'])
    best_entry = results[best_model_name]
    best_model = best_entry['model']
    best_test_acc = best_entry['test_acc']

    print(f"Best model: {best_model_name} with test accuracy: {best_test_acc:.4f}")

    # Make sure the checkpoint directory exists
    save_dir = Path('../models/checkpoints')
    save_dir.mkdir(parents=True, exist_ok=True)

    # The file name encodes the model name and its test accuracy
    model_path = save_dir / f"{best_model_name}_acc{best_test_acc:.4f}.pth"

    # Use the model's own checkpoint method when it has one; otherwise
    # store the plain state dict.
    if not hasattr(best_model, 'save_checkpoint'):
        torch.save(best_model.state_dict(), model_path)
    else:
        best_model.save_checkpoint(model_path)

    print(f"Model saved to {model_path}")

## Conclusion

In this notebook, we compared different model architectures for sign language recognition. We trained and evaluated multiple models on the same dataset and analyzed their performance in terms of accuracy and training time.

Based on our experiments, we found that:

1. The LandmarkCNN-LSTM model achieves good accuracy with reasonable training time
2. The Transformer model provides competitive accuracy but requires more training time
3. [Add other insights based on your results]

For the LinguaSign project, we recommend using the [best model based on your results] as it provides the best balance of accuracy and efficiency.