### CRNN Model Training

In [None]:
import pickle

# Read the pickled feature bundle from disk.
# NOTE: pickle.load can execute arbitrary code, so only open bundles produced by
# a trusted preprocessing run.
with open('data/train_data.pkl', 'rb') as fh:
    data = pickle.load(fh)

# Unpack the bundle into module-level names, one train/test pair per line
X_train_mfcc, X_test_mfcc = data['X_train_mfcc'], data['X_test_mfcc']
X_train_prosodic, X_test_prosodic = data['X_train_prosodic'], data['X_test_prosodic']
y_train, y_test = data['y_train'], data['y_test']
encoder, feature_names = data['encoder'], data['feature_names']

print("Data successfully loaded!")

Data successfully loaded!


In [None]:
import pickle
import torch
from torch.utils.data import DataLoader, TensorDataset
import numpy as np
from sklearn.metrics import confusion_matrix, classification_report
import seaborn as sns

# Import necessary libraries
import torch.nn as nn
import torch.optim as optim
import matplotlib.pyplot as plt

# Pick the compute device: the GPU when CUDA is usable, otherwise the CPU
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print(f"Using device: {device}")



Using device: cuda


In [None]:
# Wrap the loaded feature arrays in float tensors, one train/test pair at a time
X_train_mfcc_tensor, X_test_mfcc_tensor = (
    torch.FloatTensor(X_train_mfcc),
    torch.FloatTensor(X_test_mfcc),
)
X_train_prosodic_tensor, X_test_prosodic_tensor = (
    torch.FloatTensor(X_train_prosodic),
    torch.FloatTensor(X_test_prosodic),
)

# Class indices are stored as long tensors
y_train_tensor, y_test_tensor = torch.LongTensor(y_train), torch.LongTensor(y_test)

print("Tensors created successfully!")
print(f"MFCC tensor shape: {X_train_mfcc_tensor.shape}")
print(f"Prosodic tensor shape: {X_train_prosodic_tensor.shape}")
print(f"Labels tensor shape: {y_train_tensor.shape}")

# Provisional datasets/loaders, built here only so shapes and batch counts can be inspected
from torch.utils.data import TensorDataset, DataLoader

train_dataset, test_dataset = (
    TensorDataset(X_train_mfcc_tensor, X_train_prosodic_tensor, y_train_tensor),
    TensorDataset(X_test_mfcc_tensor, X_test_prosodic_tensor, y_test_tensor),
)

train_loader = DataLoader(train_dataset, shuffle=True, batch_size=32)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=32)

In [None]:
# Sanity-check tensor shapes and loader lengths before building the model
for label, value in (
    ("MFCC shape:", X_train_mfcc_tensor.shape),
    ("Prosodic features shape:", X_train_prosodic_tensor.shape),
    ("Number of training batches:", len(train_loader)),
    ("Number of test batches:", len(test_loader)),
):
    print(label, value)

MFCC shape: torch.Size([1152, 1, 128, 345])
Prosodic features shape: torch.Size([1152, 9])
Number of training batches: 36
Number of test batches: 9


In [None]:
import torch.nn.functional as F
import math

class MultiBranchCRNN(nn.Module):
    """
    Multi-Branch CRNN for Speech Emotion Recognition
    - MFCC Branch: CNN feature extraction + RNN temporal modeling
    - Prosodic Branch: Dense layers for prosodic features
    - Prosody-Aware Attention: Uses prosodic features to guide attention

    Args:
        mfcc_freq_dim: Number of frequency bins of the MFCC input; only used to
            build the probe input that checks the CNN output width.
        prosody_dim: Length of the prosodic feature vector.
        num_classes: Number of output logits.
        cnn_channels: Output channels of the three convolution blocks (the
            first three entries are used).
        rnn_hidden_size: Width of the bidirectional LSTM output (both directions
            combined). Must be a positive multiple of the 8 attention heads.
        rnn_layers: Number of stacked LSTM layers.
        dropout: Base dropout probability; several layers use a scaled value.

    Raises:
        ValueError: If ``rnn_hidden_size`` is not a positive multiple of 8.
    """
    def __init__(self, mfcc_freq_dim, prosody_dim, num_classes,
                 cnn_channels=(64, 128, 256), rnn_hidden_size=256,
                 rnn_layers=2, dropout=0.3):
        super().__init__()

        print("Initializing Multi-Branch CRNN Architecture...")

        # Fail early with a clear message: the attention code reshapes the hidden
        # dimension into (heads, head_dim), and the bidirectional LSTM only
        # produces rnn_hidden_size features when that size is even. A multiple
        # of 8 satisfies both constraints.
        num_heads = 8
        if rnn_hidden_size <= 0 or rnn_hidden_size % num_heads != 0:
            raise ValueError(
                f"rnn_hidden_size must be a positive multiple of {num_heads}, "
                f"got {rnn_hidden_size}"
            )

        # MFCC Branch: CNN Feature Extractor
        # Processes spectral features with increasing channel depth
        self.mfcc_cnn = nn.Sequential(
            # First CNN block: Extract low-level spectral patterns
            nn.Conv2d(1, cnn_channels[0], kernel_size=3, padding=1),
            nn.BatchNorm2d(cnn_channels[0]),
            nn.ReLU(inplace=True),
            nn.Dropout2d(dropout * 0.5),

            # Second CNN block: Extract mid-level features with pooling
            nn.Conv2d(cnn_channels[0], cnn_channels[1], kernel_size=3, padding=1),
            nn.BatchNorm2d(cnn_channels[1]),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=(2, 2)),  # Reduce frequency and time dimensions
            nn.Dropout2d(dropout * 0.7),

            # Third CNN block: Extract high-level features
            nn.Conv2d(cnn_channels[1], cnn_channels[2], kernel_size=3, padding=1),
            nn.BatchNorm2d(cnn_channels[2]),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=(2, 1)),  # Reduce only frequency dimension
            nn.Dropout2d(dropout),

            # Adaptive pooling to standardize frequency dimension
            nn.AdaptiveAvgPool2d((1, None))  # Pool to 1 frequency bin, keep time
        )

        # Calculate CNN output dimension automatically.
        # The probe runs in eval mode so the all-zero dummy batch does not update
        # the BatchNorm running statistics (and dropout draws no random numbers);
        # training mode is restored right after.
        self.mfcc_cnn.eval()
        with torch.no_grad():
            dummy_input = torch.zeros(1, 1, mfcc_freq_dim, 100)
            cnn_output = self.mfcc_cnn(dummy_input)
            self.cnn_output_dim = cnn_output.size(1)
            print(f"CNN output dimension: {self.cnn_output_dim}")
        self.mfcc_cnn.train()

        # Bidirectional LSTM for temporal modeling
        # Captures both forward and backward temporal dependencies
        self.lstm = nn.LSTM(
            input_size=self.cnn_output_dim,
            hidden_size=rnn_hidden_size // 2,  # Divide by 2 for bidirectional
            num_layers=rnn_layers,
            batch_first=True,
            dropout=dropout if rnn_layers > 1 else 0,
            bidirectional=True
        )

        # Prosodic Branch: Dense layers for prosodic features
        # Processes global acoustic features like pitch, energy, rhythm
        self.prosody_branch = nn.Sequential(
            nn.Linear(prosody_dim, 128),
            nn.ReLU(inplace=True),
            nn.Dropout(dropout),
            nn.Linear(128, 64),
            nn.ReLU(inplace=True),
            nn.Dropout(dropout * 0.5)
        )

        # Multi-head Attention configuration
        self.attention_heads = num_heads
        self.head_dim = rnn_hidden_size // self.attention_heads

        # Attention projection layers
        self.query_proj = nn.Linear(rnn_hidden_size, rnn_hidden_size)
        self.key_proj = nn.Linear(rnn_hidden_size, rnn_hidden_size)
        self.value_proj = nn.Linear(rnn_hidden_size, rnn_hidden_size)
        self.attention_out = nn.Linear(rnn_hidden_size, rnn_hidden_size)

        # Prosody-aware attention weights
        # Maps [prosodic embedding, mean LSTM output] to one gate per head;
        # the final softmax makes the gates of a time step sum to 1 across heads.
        self.prosody_attention = nn.Sequential(
            nn.Linear(64 + rnn_hidden_size, rnn_hidden_size // 2),
            nn.ReLU(inplace=True),
            nn.Dropout(dropout),
            nn.Linear(rnn_hidden_size // 2, self.attention_heads),
            nn.Softmax(dim=-1)
        )

        # Final classifier: Combines MFCC and prosodic features
        self.classifier = nn.Sequential(
            nn.Linear(rnn_hidden_size + 64, rnn_hidden_size),
            nn.LayerNorm(rnn_hidden_size),
            nn.ReLU(inplace=True),
            nn.Dropout(dropout),

            nn.Linear(rnn_hidden_size, rnn_hidden_size // 2),
            nn.ReLU(inplace=True),
            nn.Dropout(dropout * 0.5),

            nn.Linear(rnn_hidden_size // 2, num_classes)
        )

        self._initialize_weights()
        print("Multi-Branch CRNN initialized successfully!")

    def _initialize_weights(self):
        """Initialize model weights for stable training.

        Conv2d weights use Kaiming-normal init, Linear and LSTM weights use
        Xavier-uniform init, and Linear/LSTM biases are set to zero.
        """
        for module in self.modules():
            if isinstance(module, nn.Conv2d):
                nn.init.kaiming_normal_(module.weight, mode='fan_out', nonlinearity='relu')
            elif isinstance(module, nn.Linear):
                nn.init.xavier_uniform_(module.weight)
                # A Linear layer built with bias=False has no bias tensor to reset
                if module.bias is not None:
                    nn.init.constant_(module.bias, 0)
            elif isinstance(module, nn.LSTM):
                for name, param in module.named_parameters():
                    if 'weight' in name:
                        nn.init.xavier_uniform_(param)
                    elif 'bias' in name:
                        nn.init.constant_(param, 0)

    def prosody_aware_attention(self, lstm_output, prosodic_features):
        """
        Prosody-Aware Multi-Head Attention Mechanism
        Uses prosodic features to guide attention weights on LSTM output

        Args:
            lstm_output: Tensor of shape [B, T, hidden_size].
            prosodic_features: Tensor of shape [B, 64] (output of the prosodic branch).

        Returns:
            Tensor of shape [B, T, hidden_size]: ``lstm_output`` plus the
            projected, prosody-gated attention output.
        """
        batch_size, seq_len, hidden_size = lstm_output.shape

        # Multi-head attention projections, each reshaped to [B, heads, T, head_dim]
        Q = self.query_proj(lstm_output).view(batch_size, seq_len, self.attention_heads, self.head_dim).transpose(1, 2)
        K = self.key_proj(lstm_output).view(batch_size, seq_len, self.attention_heads, self.head_dim).transpose(1, 2)
        V = self.value_proj(lstm_output).view(batch_size, seq_len, self.attention_heads, self.head_dim).transpose(1, 2)

        # Scaled dot-product attention; each row is a distribution over key positions
        attention_scores = torch.matmul(Q, K.transpose(-2, -1)) / math.sqrt(self.head_dim)
        attention_weights = F.softmax(attention_scores, dim=-1)

        # Prosody-aware modulation
        # Combine prosodic features with mean LSTM output for context
        lstm_mean = lstm_output.mean(dim=1, keepdim=True)  # Global context
        prosody_lstm_combined = torch.cat([
            prosodic_features.unsqueeze(1).expand(-1, seq_len, -1),
            lstm_mean.expand(-1, seq_len, -1)
        ], dim=-1)

        # Generate prosody-aware attention modulation weights
        prosody_weights = self.prosody_attention(prosody_lstm_combined)  # [B, T, num_heads]
        prosody_weights = prosody_weights.transpose(1, 2).unsqueeze(-1)  # [B, num_heads, T, 1]

        # Apply prosody modulation to attention weights.
        # The gate is constant along the key axis, so this scales each head's
        # attended values by its gate. The product is deliberately NOT passed
        # through a second softmax: its entries already lie in [0, 1], and a
        # softmax over such a narrow range is almost uniform, which would discard
        # both the learned attention pattern and the prosody gate.
        attention_weights = attention_weights * prosody_weights

        # Apply attention to values and merge the heads back to [B, T, hidden_size]
        attended_output = torch.matmul(attention_weights, V)
        attended_output = attended_output.transpose(1, 2).contiguous().view(batch_size, seq_len, hidden_size)

        # Residual connection and output projection
        attended_output = self.attention_out(attended_output)
        return lstm_output + attended_output  # Residual connection

    def forward(self, mfcc_features, prosodic_features):
        """
        Forward pass through Multi-Branch CRNN

        Args:
            mfcc_features: Tensor of shape [B, 1, F, T], or [B, F, T] (a channel
                axis is added automatically).
            prosodic_features: Tensor of shape [B, prosody_dim].

        Returns:
            Tensor of shape [B, num_classes] with unnormalized class logits.
        """
        # Ensure MFCC features have channel dimension for CNN
        if mfcc_features.dim() == 3:
            mfcc_features = mfcc_features.unsqueeze(1)  # Add channel dimension [B, 1, F, T]

        # MFCC Branch: CNN feature extraction
        cnn_features = self.mfcc_cnn(mfcc_features)  # [B, C, 1, T]
        cnn_features = cnn_features.squeeze(2).transpose(1, 2)  # [B, T, C]

        # LSTM temporal modeling
        lstm_output, _ = self.lstm(cnn_features)  # [B, T, hidden_size]

        # Prosodic Branch: Dense feature processing
        prosodic_processed = self.prosody_branch(prosodic_features)  # [B, 64]

        # Prosody-aware attention mechanism
        attended_output = self.prosody_aware_attention(lstm_output, prosodic_processed)

        # Global average pooling over time dimension
        final_mfcc = attended_output.mean(dim=1)  # [B, hidden_size]

        # Feature fusion: Combine MFCC and prosodic features
        combined_features = torch.cat([final_mfcc, prosodic_processed], dim=1)

        # Classification
        output = self.classifier(combined_features)
        return output

# Model configuration
# The frequency axis is the second-to-last one for both MFCC layouts the model's
# forward pass accepts ([N, 1, F, T] and [N, F, T]); indexing it as shape[-2]
# avoids picking up the time axis when the channel dimension is absent.
mfcc_freq_dim = X_train_mfcc.shape[-2] if len(X_train_mfcc.shape) > 2 else X_train_mfcc.shape[1]
prosody_dim = X_train_prosodic.shape[1]
num_classes = len(np.unique(y_train))

print(f"\nModel Configuration:")
print(f"MFCC frequency dimension: {mfcc_freq_dim}")
print(f"Prosodic features: {prosody_dim}")
print(f"Number of emotion classes: {num_classes}")

# Initialize the Multi-Branch CRNN model and move it to the selected device
model = MultiBranchCRNN(
    mfcc_freq_dim=mfcc_freq_dim,
    prosody_dim=prosody_dim,
    num_classes=num_classes,
    cnn_channels=[64, 128, 256],  # Progressive CNN channel increase
    rnn_hidden_size=256,          # LSTM hidden size
    rnn_layers=2,                 # Number of LSTM layers
    dropout=0.3                   # Dropout for regularization
).to(device)

# Count model parameters (all vs. those that receive gradients)
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f"\nModel Statistics:")
print(f"Total parameters: {total_params:,} (~{total_params/1e6:.1f}M)")
print(f"Trainable parameters: {trainable_params:,}")
# Size estimate assumes 4 bytes (float32) per parameter
print(f"Model size estimate: ~{total_params * 4 / 1024**2:.1f} MB")
print(f"Device: {next(model.parameters()).device}")

In [None]:
from sklearn.model_selection import train_test_split
import time

# Training Configuration for RTX 4070 Laptop
print("Setting up training configuration for RTX 4070 laptop...")

# Hyperparameters (tunable knobs for the whole training run)
BATCH_SIZE = 32              # Balanced for 8GB VRAM
LEARNING_RATE = 1e-3         # Initial learning rate
EPOCHS = 100                 # Maximum epochs for thorough training
PATIENCE = 15                # Early stopping patience (increased for longer training)
WEIGHT_DECAY = 1e-4          # L2 regularization
GRADIENT_CLIP = 1.0          # Gradient clipping for stability
WARMUP_EPOCHS = 5            # Learning rate warmup

# Hold out a stratified 15% of the training data for validation
print("Creating train/validation split...")
(
    X_train_mfcc_split, X_val_mfcc,
    X_train_prosodic_split, X_val_prosodic,
    y_train_split, y_val,
) = train_test_split(
    X_train_mfcc, X_train_prosodic, y_train,
    stratify=y_train,
    test_size=0.15,
    random_state=42,
)

# Tensor views of each split: MFCC maps, prosodic vectors, integer labels
train_mfcc_tensor, train_prosodic_tensor, train_labels_tensor = (
    torch.FloatTensor(X_train_mfcc_split),
    torch.FloatTensor(X_train_prosodic_split),
    torch.LongTensor(y_train_split),
)
val_mfcc_tensor, val_prosodic_tensor, val_labels_tensor = (
    torch.FloatTensor(X_val_mfcc),
    torch.FloatTensor(X_val_prosodic),
    torch.LongTensor(y_val),
)
test_mfcc_tensor, test_prosodic_tensor, test_labels_tensor = (
    torch.FloatTensor(X_test_mfcc),
    torch.FloatTensor(X_test_prosodic),
    torch.LongTensor(y_test),
)

# One dataset per split, each yielding (mfcc, prosodic, label) triples
train_dataset = TensorDataset(train_mfcc_tensor, train_prosodic_tensor, train_labels_tensor)
val_dataset = TensorDataset(val_mfcc_tensor, val_prosodic_tensor, val_labels_tensor)
test_dataset = TensorDataset(test_mfcc_tensor, test_prosodic_tensor, test_labels_tensor)

# Loader settings shared by all splits; only the training loader shuffles
_loader_kwargs = dict(
    batch_size=BATCH_SIZE,
    num_workers=4,            # Optimized for modern CPUs
    pin_memory=True,          # Faster GPU transfer
    persistent_workers=True,  # Reuse workers
)
train_loader = DataLoader(train_dataset, shuffle=True, **_loader_kwargs)
val_loader = DataLoader(val_dataset, shuffle=False, **_loader_kwargs)
test_loader = DataLoader(test_dataset, shuffle=False, **_loader_kwargs)

# AdamW: Adam with decoupled weight decay for regularization
optimizer = torch.optim.AdamW(
    model.parameters(),
    lr=LEARNING_RATE,
    betas=(0.9, 0.999),
    eps=1e-8,
    weight_decay=WEIGHT_DECAY,
)

# Learning rate scheduler with warmup and decay
def get_lr_scheduler(optimizer, warmup_epochs, total_epochs):
    """Build a LambdaLR schedule: linear warmup followed by cosine annealing.

    Args:
        optimizer: Optimizer whose learning rate is scheduled.
        warmup_epochs: Number of warmup epochs. The multiplier ramps linearly
            from ``1 / warmup_epochs`` (epoch 0) to 1.0 (last warmup epoch).
        total_epochs: Epoch index at which the cosine decay reaches 0.

    Returns:
        torch.optim.lr_scheduler.LambdaLR wrapping ``optimizer``.
    """
    # Guard against a zero-length decay phase (total_epochs <= warmup_epochs)
    decay_epochs = max(1, total_epochs - warmup_epochs)

    def lr_lambda(epoch):
        if epoch < warmup_epochs:
            # Linear warmup. The +1 keeps the multiplier strictly positive at
            # epoch 0; LambdaLR applies lr_lambda(0) on construction, so a zero
            # here would make the whole first epoch train with lr = 0.
            return (epoch + 1) / warmup_epochs
        # Cosine annealing after warmup; progress is clamped so the rate stays at
        # its floor instead of rising again if stepped past total_epochs.
        progress = min(1.0, (epoch - warmup_epochs) / decay_epochs)
        return 0.5 * (1 + math.cos(math.pi * progress))
    return torch.optim.lr_scheduler.LambdaLR(optimizer, lr_lambda)

# Warmup + cosine schedule driven once per epoch
scheduler = get_lr_scheduler(optimizer, WARMUP_EPOCHS, EPOCHS)

# Cross-entropy with label smoothing for better generalization
criterion = nn.CrossEntropyLoss(label_smoothing=0.1)

# Gradient scaler used by the mixed-precision training step
scaler = torch.cuda.amp.GradScaler()

# Report split sizes and batch counts
print("\n".join([
    f"\nDataset Statistics:",
    f"Training samples: {len(train_dataset)}",
    f"Validation samples: {len(val_dataset)}",
    f"Test samples: {len(test_dataset)}",
    f"Batch size: {BATCH_SIZE}",
    f"Training batches per epoch: {len(train_loader)}",
    f"Validation batches: {len(val_loader)}",
]))

# Report the optimisation settings chosen above
print("\n".join([
    f"\nTraining Configuration:",
    f"Learning rate: {LEARNING_RATE}",
    f"Weight decay: {WEIGHT_DECAY}",
    f"Max epochs: {EPOCHS}",
    f"Early stopping patience: {PATIENCE}",
    f"Gradient clipping: {GRADIENT_CLIP}",
    f"Mixed precision: Enabled",
    "Configuration complete!",
]))

In [None]:
class ProgressTracker:
    """Per-epoch metric history plus best-validation / early-stopping bookkeeping."""

    def __init__(self):
        self.reset()

    def reset(self):
        """Discard all recorded history and best-model state."""
        self.train_losses, self.train_accuracies = [], []
        self.val_losses, self.val_accuracies = [], []
        self.learning_rates = []
        self.best_val_acc, self.best_epoch = 0.0, 0
        self.patience_counter = 0

    def update(self, train_loss, train_acc, val_loss, val_acc, lr, epoch):
        """Record one epoch's metrics.

        Returns True when ``val_acc`` strictly beats the best value seen so far
        (the patience counter is then reset); otherwise the patience counter is
        incremented and False is returned.
        """
        for history, value in (
            (self.train_losses, train_loss),
            (self.train_accuracies, train_acc),
            (self.val_losses, val_loss),
            (self.val_accuracies, val_acc),
            (self.learning_rates, lr),
        ):
            history.append(value)

        # No improvement: one more epoch counted against the patience budget
        if not (val_acc > self.best_val_acc):
            self.patience_counter += 1
            return False

        # New best model
        self.best_val_acc, self.best_epoch = val_acc, epoch
        self.patience_counter = 0
        return True

    def should_stop(self, patience):
        """True once the number of epochs without improvement reaches ``patience``."""
        return self.patience_counter >= patience

def train_epoch(model, train_loader, criterion, optimizer, scaler, device, grad_clip=None):
    """Train model for one epoch with progress tracking.

    Args:
        model: Network called as ``model(mfcc, prosodic)``.
        train_loader: Sized iterable of ``(mfcc, prosodic, labels)`` batches.
        criterion: Loss returning the mean over the batch (the per-batch value
            is re-weighted by batch size below).
        optimizer: Optimizer updating ``model``'s parameters.
        scaler: Gradient scaler providing scale/unscale_/step/update.
        device: Device (or device string) the batches are moved to.
        grad_clip: Maximum gradient norm; defaults to the module-level
            ``GRADIENT_CLIP`` when omitted.

    Returns:
        (epoch_loss, epoch_acc): sample-weighted mean loss and accuracy in percent.

    Raises:
        ValueError: If the loader yields no samples.
    """
    if grad_clip is None:
        grad_clip = GRADIENT_CLIP
    # Mixed precision only makes sense on CUDA; skip it (and its warning) elsewhere
    use_amp = torch.device(device).type == "cuda"

    model.train()
    running_loss = 0.0
    correct = 0
    total = 0

    for batch_idx, (mfcc, prosodic, labels) in enumerate(train_loader):
        # Move data to the target device
        mfcc, prosodic, labels = mfcc.to(device), prosodic.to(device), labels.to(device)

        # Zero gradients
        optimizer.zero_grad()

        # Forward pass with mixed precision
        with torch.cuda.amp.autocast(enabled=use_amp):
            outputs = model(mfcc, prosodic)
            loss = criterion(outputs, labels)

        # Backward pass with gradient scaling
        scaler.scale(loss).backward()

        # Gradient clipping for stability (unscale first so the norm is in real units)
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), grad_clip)

        # Optimizer step
        scaler.step(optimizer)
        scaler.update()

        # Statistics: weight each batch loss by its size so a short final batch
        # does not skew the epoch average
        batch_samples = labels.size(0)
        running_loss += loss.item() * batch_samples
        _, predicted = outputs.max(1)
        total += batch_samples
        correct += predicted.eq(labels).sum().item()

        # Progress update every 10 batches
        if batch_idx % 10 == 0:
            current_acc = 100. * correct / total
            gpu_mem_gb = torch.cuda.memory_allocated() / 1024**3 if torch.cuda.is_available() else 0.0
            print(f'  Batch {batch_idx:3d}/{len(train_loader)} | '
                  f'Loss: {loss.item():.4f} | Acc: {current_acc:5.2f}% | '
                  f'GPU Mem: {gpu_mem_gb:.1f}GB', end='\r')

    if total == 0:
        raise ValueError("train_loader produced no samples")

    epoch_loss = running_loss / total
    epoch_acc = 100. * correct / total
    return epoch_loss, epoch_acc

def validate_epoch(model, val_loader, criterion, device):
    """Validate model performance.

    Args:
        model: Network called as ``model(mfcc, prosodic)``.
        val_loader: Iterable of ``(mfcc, prosodic, labels)`` batches.
        criterion: Loss returning the mean over the batch (the per-batch value
            is re-weighted by batch size below).
        device: Device (or device string) the batches are moved to.

    Returns:
        (epoch_loss, epoch_acc): sample-weighted mean loss and accuracy in percent.

    Raises:
        ValueError: If the loader yields no samples.
    """
    # Mixed precision only makes sense on CUDA; skip it (and its warning) elsewhere
    use_amp = torch.device(device).type == "cuda"

    model.eval()
    running_loss = 0.0
    correct = 0
    total = 0

    with torch.no_grad():
        for mfcc, prosodic, labels in val_loader:
            mfcc, prosodic, labels = mfcc.to(device), prosodic.to(device), labels.to(device)

            with torch.cuda.amp.autocast(enabled=use_amp):
                outputs = model(mfcc, prosodic)
                loss = criterion(outputs, labels)

            # Weight each batch loss by its size so a short final batch does not
            # skew the epoch average
            batch_samples = labels.size(0)
            running_loss += loss.item() * batch_samples
            _, predicted = outputs.max(1)
            total += batch_samples
            correct += predicted.eq(labels).sum().item()

    if total == 0:
        raise ValueError("val_loader produced no samples")

    epoch_loss = running_loss / total
    epoch_acc = 100. * correct / total
    return epoch_loss, epoch_acc

# Initialize progress tracker
tracker = ProgressTracker()
best_model_state = None

print("Starting Multi-Branch CRNN Training...")
print("=" * 70)
print(f"Target: >80% accuracy on speech emotion recognition")
print(f"Device: {device}")
print(f"Model: Multi-Branch CRNN with Prosody-Aware Attention")
print("=" * 70)

start_time = time.time()

for epoch in range(EPOCHS):
    epoch_start = time.time()
    
    print(f'\nEpoch {epoch+1:2d}/{EPOCHS}:')
    
    # Training phase
    print('  Training...')
    train_loss, train_acc = train_epoch(model, train_loader, criterion, optimizer, scaler, device)
    
    # Validation phase
    print('\n  Validating...')
    val_loss, val_acc = validate_epoch(model, val_loader, criterion, device)
    
    # Record the learning rate this epoch actually trained with, then advance
    # the schedule for the next epoch
    current_lr = optimizer.param_groups[0]['lr']
    scheduler.step()
    
    # Update progress tracker
    is_best = tracker.update(train_loss, train_acc, val_loss, val_acc, current_lr, epoch)
    
    # Save best model.
    # state_dict() hands back tensors that share storage with the live
    # parameters, and dict.copy() is shallow, so each tensor is cloned here;
    # otherwise the snapshot would keep changing as training continues and
    # restoring it afterwards would have no effect.
    if is_best:
        best_model_state = {
            name: tensor.detach().clone()
            for name, tensor in model.state_dict().items()
        }
        improvement = "NEW BEST!"
    else:
        improvement = f"({tracker.patience_counter}/{PATIENCE})"
    
    # Epoch summary
    epoch_time = time.time() - epoch_start
    print(f'\n  Results:')
    print(f'    Train Loss: {train_loss:.4f} | Train Acc: {train_acc:5.2f}%')
    print(f'    Val Loss:   {val_loss:.4f} | Val Acc:   {val_acc:5.2f}% {improvement}')
    print(f'    LR: {current_lr:.2e} | Time: {epoch_time:.1f}s')
    
    # Progress bar simulation
    progress = (epoch + 1) / EPOCHS
    bar_length = 30
    filled_length = int(bar_length * progress)
    bar = '=' * filled_length + '-' * (bar_length - filled_length)
    print(f'  Progress: [{bar}] {progress*100:.1f}%')
    
    # Early stopping check
    if tracker.should_stop(PATIENCE):
        print(f'\nEarly stopping triggered at epoch {epoch+1}')
        print(f'   Best validation accuracy: {tracker.best_val_acc:.2f}% (epoch {tracker.best_epoch+1})')
        break
    
    # Memory cleanup
    torch.cuda.empty_cache()

# Restore the weights of the best validation epoch (None if no epoch ever
# improved on the tracker's initial best accuracy)
if best_model_state is not None:
    model.load_state_dict(best_model_state)
    print(f'\nLoaded best model from epoch {tracker.best_epoch+1}')

total_time = time.time() - start_time
print(f'\nTraining completed in {total_time/60:.1f} minutes')
print(f'Best validation accuracy: {tracker.best_val_acc:.2f}%')

# Training summary
print(f'\n{"="*50}')
print("TRAINING SUMMARY")
print(f'{"="*50}')
print(f'Best Validation Accuracy: {tracker.best_val_acc:.2f}%')
print(f'Target Achievement: {"ACHIEVED" if tracker.best_val_acc >= 80 else "NOT YET"}')
print(f'Total Training Time: {total_time/60:.1f} minutes')
print(f'Best Epoch: {tracker.best_epoch+1}')
print(f'Model Parameters: {total_params:,}')
print(f'{"="*50}')

In [None]:
def evaluate_model(model, test_loader, criterion, device):
    """Comprehensive model evaluation on test set.

    Args:
        model: Network called as ``model(mfcc, prosodic)``.
        test_loader: Sized iterable of ``(mfcc, prosodic, labels)`` batches.
        criterion: Loss returning the mean over the batch (the per-batch value
            is re-weighted by batch size below).
        device: Device (or device string) the batches are moved to.

    Returns:
        (test_loss, test_acc, all_predictions, all_labels, all_probabilities):
        sample-weighted mean loss, accuracy in percent, and per-sample lists of
        predicted class, true class and softmax probability vector.

    Raises:
        ValueError: If the loader yields no samples.
    """
    # Mixed precision only makes sense on CUDA; skip it (and its warning) elsewhere
    use_amp = torch.device(device).type == "cuda"

    model.eval()
    test_loss = 0.0
    correct = 0
    total = 0
    all_predictions = []
    all_labels = []
    all_probabilities = []

    print("Evaluating model on test set...")

    with torch.no_grad():
        for batch_idx, (mfcc, prosodic, labels) in enumerate(test_loader):
            mfcc, prosodic, labels = mfcc.to(device), prosodic.to(device), labels.to(device)

            with torch.cuda.amp.autocast(enabled=use_amp):
                outputs = model(mfcc, prosodic)
                loss = criterion(outputs, labels)
                probabilities = F.softmax(outputs, dim=1)

            # Weight each batch loss by its size so a short final batch does not
            # skew the reported average
            batch_samples = labels.size(0)
            test_loss += loss.item() * batch_samples
            _, predicted = outputs.max(1)
            total += batch_samples
            correct += predicted.eq(labels).sum().item()

            # Collect per-sample results on the CPU for the reports that follow
            all_predictions.extend(predicted.cpu().numpy())
            all_labels.extend(labels.cpu().numpy())
            all_probabilities.extend(probabilities.cpu().numpy())

            # Progress indicator
            if batch_idx % 5 == 0:
                print(f'  Test batch {batch_idx:2d}/{len(test_loader)}', end='\r')

    if total == 0:
        raise ValueError("test_loader produced no samples")

    test_loss = test_loss / total
    test_acc = 100. * correct / total

    return test_loss, test_acc, all_predictions, all_labels, all_probabilities

# Evaluate the trained model
print("\n" + "="*60)
print("MODEL EVALUATION")
print("="*60)

test_loss, test_acc, test_predictions, test_labels, test_probabilities = evaluate_model(
    model, test_loader, criterion, device
)

# Emotion class names.
# The integer labels were produced upstream by `encoder` (loaded with the data),
# so index i must be named after the encoder's i-th class, not after a fixed
# list. NOTE(review): this presumes `encoder` exposes `classes_` in label-index
# order (as a scikit-learn LabelEncoder does) - confirm against the
# preprocessing notebook. The RAVDESS list is kept as the fallback whenever the
# encoder classes are unavailable, not name-like, or do not match num_classes.
_default_emotion_names = [
    'Neutral', 'Calm', 'Happy', 'Sad', 
    'Angry', 'Fearful', 'Disgust', 'Surprised'
]
_encoder_classes = getattr(encoder, 'classes_', None)
if (
    _encoder_classes is not None
    and len(_encoder_classes) == num_classes
    and all(isinstance(c, str) and c.isalpha() for c in _encoder_classes)
):
    emotion_names = [str(c).capitalize() for c in _encoder_classes]
else:
    emotion_names = _default_emotion_names

print(f"\nTest Set Results:")
print(f"   Test Loss: {test_loss:.4f}")
print(f"   Test Accuracy: {test_acc:.2f}%")

# Detailed classification report.
# Passing the full label range keeps the report aligned with emotion_names even
# when a class is missing from the test labels and predictions (otherwise the
# target_names length check raises); zero_division=0 silences the warning for
# such empty classes.
print(f"\nDetailed Classification Report:")
from sklearn.metrics import classification_report, confusion_matrix
print(classification_report(
    test_labels, test_predictions, 
    labels=list(range(len(emotion_names))),
    target_names=emotion_names, 
    digits=3,
    zero_division=0
))

# Per-class accuracy analysis (classes without test samples are skipped)
print(f"\nPer-Class Accuracy:")
class_accuracies = {}
test_labels_array = np.array(test_labels)
test_predictions_array = np.array(test_predictions)
for i, emotion in enumerate(emotion_names):
    mask = test_labels_array == i
    if mask.sum() > 0:
        class_acc = (test_predictions_array[mask] == i).mean() * 100
        class_accuracies[emotion] = class_acc
        status = "GOOD" if class_acc >= 70 else "FAIR" if class_acc >= 50 else "POOR"
        print(f"   {emotion:10s}: {class_acc:5.1f}% {status}")

# Target achievement check
print(f"\nTARGET ACHIEVEMENT:")
if test_acc >= 80.0:
    print(f"   SUCCESS! Test accuracy {test_acc:.2f}% >= 80% target")
    achievement = "ACHIEVED"
else:
    print(f"   Progress: {test_acc:.2f}% (target: 80%)")
    print(f"   Gap to target: {80.0 - test_acc:.2f}%")
    achievement = "IN PROGRESS"

# Model performance summary
print(f"\nPerformance Summary:")
print(f"   Best Validation Accuracy: {tracker.best_val_acc:.2f}%")
print(f"   Final Test Accuracy: {test_acc:.2f}%")
print(f"   Generalization Gap: {tracker.best_val_acc - test_acc:.2f}%")
print(f"   Model Complexity: {total_params:,} parameters")
print(f"   Training Efficiency: {total_time/60:.1f} minutes")
print(f"   Target Status: {achievement}")

# Architecture effectiveness analysis
print(f"\nArchitecture Analysis:")
print(f"   Multi-Branch Design: MFCC CNN + Prosodic Dense")
print(f"   Attention Mechanism: Prosody-Aware Multi-Head")
print(f"   Temporal Modeling: Bidirectional LSTM")
print(f"   Regularization: Dropout + Label Smoothing + Weight Decay")
print(f"   Optimization: Mixed Precision + Gradient Clipping")

print("="*60)

In [None]:
# Results Visualization and Model Saving
import os

print("Creating visualizations...")

# Create visualization figure: 2x2 grid (loss, accuracy, confusion matrix, per-class accuracy)
fig, ((ax1, ax2), (ax3, ax4)) = plt.subplots(2, 2, figsize=(16, 12))
fig.suptitle('Multi-Branch CRNN Training Results', fontsize=16, fontweight='bold')

# 1. Training and Validation Loss
ax1.plot(tracker.train_losses, label='Training Loss', color='#2E86AB', linewidth=2)
ax1.plot(tracker.val_losses, label='Validation Loss', color='#A23B72', linewidth=2)
ax1.set_title('Model Loss Over Time', fontweight='bold')
ax1.set_xlabel('Epoch')
ax1.set_ylabel('Loss')
ax1.legend()
ax1.grid(True, alpha=0.3)
ax1.set_ylim(bottom=0)

# 2. Training and Validation Accuracy
ax2.plot(tracker.train_accuracies, label='Training Accuracy', color='#F18F01', linewidth=2)
ax2.plot(tracker.val_accuracies, label='Validation Accuracy', color='#C73E1D', linewidth=2)
ax2.axhline(y=80, color='green', linestyle='--', alpha=0.7, label='Target (80%)')
ax2.set_title('Model Accuracy Over Time', fontweight='bold')
ax2.set_xlabel('Epoch')
ax2.set_ylabel('Accuracy (%)')
ax2.legend()
ax2.grid(True, alpha=0.3)
ax2.set_ylim(0, 100)

# 3. Confusion Matrix
# Fix the label set explicitly so the matrix always has one row/column per
# entry of emotion_names; without it, a class absent from both the test labels
# and the predictions shrinks the matrix and the tick labels below would be
# attached to the wrong rows/columns.
cm = confusion_matrix(test_labels, test_predictions, labels=np.arange(len(emotion_names)))
im = ax3.imshow(cm, interpolation='nearest', cmap='Blues')
ax3.set_title('Test Set Confusion Matrix', fontweight='bold')
ax3.set_xlabel('Predicted Emotion')
ax3.set_ylabel('True Emotion')

# Add text annotations to confusion matrix (white text on dark cells, black on light)
thresh = cm.max() / 2.
for i in range(cm.shape[0]):
    for j in range(cm.shape[1]):
        ax3.text(j, i, format(cm[i, j], 'd'),
                ha="center", va="center",
                color="white" if cm[i, j] > thresh else "black",
                fontweight='bold')

# Set tick labels (names abbreviated to four characters to fit)
tick_marks = np.arange(len(emotion_names))
ax3.set_xticks(tick_marks)
ax3.set_yticks(tick_marks)
ax3.set_xticklabels([name[:4] for name in emotion_names], rotation=45)
ax3.set_yticklabels([name[:4] for name in emotion_names])

# 4. Per-Class Accuracy Bar Chart (green >= 70, amber >= 50, red otherwise)
emotions = list(class_accuracies.keys())
accuracies = list(class_accuracies.values())
colors = ['#28a745' if acc >= 70 else '#ffc107' if acc >= 50 else '#dc3545' for acc in accuracies]

bars = ax4.bar(emotions, accuracies, color=colors, alpha=0.8, edgecolor='black', linewidth=0.5)
ax4.axhline(y=80, color='green', linestyle='--', alpha=0.7, label='Target (80%)')
ax4.set_title('Per-Class Test Accuracy', fontweight='bold')
ax4.set_xlabel('Emotion Class')
ax4.set_ylabel('Accuracy (%)')
ax4.set_ylim(0, 100)
ax4.tick_params(axis='x', rotation=45)
ax4.grid(True, alpha=0.3, axis='y')

# Add value labels on bars
for bar, acc in zip(bars, accuracies):
    height = bar.get_height()
    ax4.text(bar.get_x() + bar.get_width()/2., height + 1,
             f'{acc:.1f}%', ha='center', va='bottom', fontweight='bold', fontsize=9)

plt.tight_layout()
plt.show()

# Save the trained model
print("\nSaving trained model...")
os.makedirs('models', exist_ok=True)
model_path = 'models/multibranch_crnn_emotion_model.pth'

# Comprehensive model save with all important information.
# Per-sample predictions/labels and per-class accuracies are NumPy scalars at
# this point; they are converted to built-in int/float so the checkpoint holds
# only tensors and plain Python objects and stays readable by torch.load's
# restricted (weights_only) unpickler.
save_dict = {
    'model_state_dict': model.state_dict(),
    'model_config': {
        'mfcc_freq_dim': mfcc_freq_dim,
        'prosody_dim': prosody_dim,
        'num_classes': num_classes,
        # These four values mirror the arguments used when the model was
        # constructed; keep them in sync if that call changes.
        'cnn_channels': [64, 128, 256],
        'rnn_hidden_size': 256,
        'rnn_layers': 2,
        'dropout': 0.3
    },
    'training_config': {
        'batch_size': BATCH_SIZE,
        'learning_rate': LEARNING_RATE,
        'weight_decay': WEIGHT_DECAY,
        'epochs_trained': len(tracker.train_losses),
        'early_stopping_patience': PATIENCE
    },
    'results': {
        'best_val_accuracy': tracker.best_val_acc,
        'best_epoch': tracker.best_epoch,
        'test_accuracy': test_acc,
        'test_loss': test_loss,
        'total_parameters': total_params,
        'training_time_minutes': total_time/60
    },
    'training_history': {
        'train_losses': tracker.train_losses,
        'val_losses': tracker.val_losses,
        'train_accuracies': tracker.train_accuracies,
        'val_accuracies': tracker.val_accuracies,
        'learning_rates': tracker.learning_rates
    },
    'evaluation': {
        'emotion_names': emotion_names,
        'class_accuracies': {name: float(acc) for name, acc in class_accuracies.items()},
        'confusion_matrix': cm.tolist(),
        'test_predictions': [int(pred) for pred in test_predictions],
        'test_labels': [int(label) for label in test_labels]
    },
    'metadata': {
        'pytorch_version': torch.__version__,
        'device': str(device),
        'timestamp': time.strftime('%Y-%m-%d %H:%M:%S'),
        'architecture': 'Multi-Branch CRNN with Prosody-Aware Attention'
    }
}

torch.save(save_dict, model_path)

model_size_mb = os.path.getsize(model_path) / (1024**2)
print(f"Model saved successfully!")
print(f"   Path: {model_path}")
print(f"   Size: {model_size_mb:.1f} MB")

# Final Results Summary
print(f"\n{'='*70}")
print("MULTI-BRANCH CRNN TRAINING COMPLETE")
print(f"{'='*70}")
print(f"Final Test Accuracy: {test_acc:.2f}%")
print(f"Best Validation Accuracy: {tracker.best_val_acc:.2f}%")
print(f"Target Achievement: {'ACHIEVED!' if test_acc >= 80 else 'In Progress'}")
print(f"Architecture: Multi-Branch CRNN + Prosody-Aware Attention")
print(f"Training Time: {total_time/60:.1f} minutes")
print(f"Model Parameters: {total_params:,}")
print(f"Model Saved: {model_path}")
print(f"Optimized for: RTX 4070 Laptop")
print(f"{'='*70}")

# Closing message depends on whether the 80% test-accuracy target was met
if test_acc >= 80:
    print("Congratulations! Your model has achieved the target accuracy!")
    print("Ready for deployment in speech emotion recognition applications.")
else:
    print("To improve performance, consider:")
    print("   • Increasing model complexity or training epochs")
    print("   • Fine-tuning hyperparameters")
    print("   • Data augmentation techniques")
    print("   • Ensemble methods")
    
print("\nModel Architecture Summary:")
print("   MFCC Branch: CNN (64→128→256) + Bidirectional LSTM")
print("   Prosodic Branch: Dense layers (128→64)")
print("   Attention: Prosody-Aware Multi-Head (8 heads)")
print("   Fusion: Feature concatenation + Dense classifier")
print("   Optimization: Mixed precision + Gradient clipping + Early stopping")