In [51]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import pandas as pd
import os

In [52]:
class CNNModel(nn.Module):
    """1-D convolutional classifier: four Conv1d + BatchNorm stages and two dense layers.

    Args:
        input_size: Accepted for interface compatibility; no layer below reads it.
        num_classes: Width of the final linear layer (number of output logits).

    Note:
        ``fc1`` expects exactly 512 flattened features. With the layer settings
        below, a single-channel input of length 128 produces 64 channels x 8
        steps = 512; other input lengths will not match ``fc1``.
    """

    def __init__(self, input_size, num_classes):
        super().__init__()
        # Attribute names double as state_dict keys, so they must not change.
        self.conv1 = nn.Conv1d(in_channels=1, out_channels=64, kernel_size=8)
        self.bn1 = nn.BatchNorm1d(num_features=64)
        self.conv2 = nn.Conv1d(in_channels=64, out_channels=128, kernel_size=8)
        self.bn2 = nn.BatchNorm1d(num_features=128)
        self.pool1 = nn.MaxPool1d(kernel_size=4, padding=2)
        self.dropout1 = nn.Dropout(p=0.5)
        self.conv3 = nn.Conv1d(in_channels=128, out_channels=128, kernel_size=8)
        self.bn3 = nn.BatchNorm1d(num_features=128)
        self.pool2 = nn.MaxPool1d(kernel_size=4, padding=2)
        self.dropout2 = nn.Dropout(p=0.5)
        self.conv4 = nn.Conv1d(in_channels=128, out_channels=64, kernel_size=3, padding=2)
        self.bn4 = nn.BatchNorm1d(num_features=64)
        self.fc1 = nn.Linear(in_features=512, out_features=256)
        self.dropout3 = nn.Dropout(p=0.5)
        self.fc2 = nn.Linear(in_features=256, out_features=num_classes)

    def forward(self, x):
        """Return class logits for ``x`` (the first conv layer expects one input channel)."""
        features = x
        # Two conv -> batch-norm -> ReLU stages before the first down-sampling.
        for conv, norm in ((self.conv1, self.bn1), (self.conv2, self.bn2)):
            features = F.relu(norm(conv(features)))
        features = self.dropout1(self.pool1(features))

        features = F.relu(self.bn3(self.conv3(features)))
        features = self.dropout2(self.pool2(features))

        features = F.relu(self.bn4(self.conv4(features)))

        # Flatten everything except the batch dimension for the dense head.
        flat = features.view(features.size(0), -1)
        hidden = self.dropout3(F.relu(self.fc1(flat)))
        return self.fc2(hidden)

In [53]:
class EmotionLSTM(nn.Module):
    """Single bidirectional LSTM followed by a small dense classification head.

    Args:
        input_size: Number of features per time step fed to the LSTM.
        hidden_size: LSTM hidden width per direction (the LSTM output is twice this).
        num_classes: Number of output logits.
        dropout_rate: Probability used by the one dropout module, which is
            applied both after batch-norm and after the first dense layer.
    """

    def __init__(self, input_size, hidden_size=192, num_classes=8, dropout_rate=0.3):
        super().__init__()
        lstm_features = hidden_size * 2  # forward + backward directions concatenated

        # Attribute names double as state_dict keys, so they must not change.
        self.lstm = nn.LSTM(
            input_size=input_size,
            hidden_size=hidden_size,
            batch_first=True,
            bidirectional=True,
        )
        self.dropout = nn.Dropout(dropout_rate)

        self.bn = nn.BatchNorm1d(lstm_features)
        self.fc1 = nn.Linear(lstm_features, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Return class logits for a batch-first ``(batch, seq, features)`` input."""
        sequence_out, _ = self.lstm(x)

        # Keep only the final time step. For a length-1 sequence this is the
        # same tensor that squeezing the time dimension would give.
        # NOTE(review): with a bidirectional LSTM the reverse-direction half of
        # the last step has only processed one time step - confirm this is
        # intended for sequences longer than 1.
        last_step = sequence_out[:, -1, :]

        normalised = self.dropout(self.bn(last_step))

        hidden = self.dropout(self.relu(self.fc1(normalised)))
        return self.fc2(hidden)

In [None]:
# Emotion classes for the RAVDESS dataset. The list is used as `target_names`
# for reports and as tick labels, so position i names integer class id i.
# NOTE(review): confirm this order matches the classes of the label encoder
# used to produce the integer labels.
EMOTION_LABELS = ['neutral', 'calm', 'happy', 'sad', 'angry', 'fearful', 'disgust', 'surprised']
NUM_EMOTIONS = len(EMOTION_LABELS)  # 8

# Soft-voting ensemble of the speech and facial models
class VotingEnsemble(nn.Module):
    """Combine two classifiers by a learnable weighted average of their softmax outputs.

    Both wrapped models are frozen in place (``requires_grad = False`` on every
    parameter); only the two scalar gate weights and the (currently unused)
    fusion layer remain trainable.

    Args:
        speech_model: Module mapping ``speech_input`` to class logits.
        facial_model: Module mapping ``facial_input`` to class logits.
    """

    def __init__(self, speech_model, facial_model):
        super().__init__()
        self.speech_model = speech_model
        self.facial_model = facial_model

        # Freeze both base models; this mutates the modules that were passed in.
        for frozen_model in (self.speech_model, self.facial_model):
            for param in frozen_model.parameters():
                param.requires_grad = False

        # One learnable scalar per modality; squashed through a sigmoid in forward().
        self.speech_weight = nn.Parameter(torch.tensor(0.5))
        self.facial_weight = nn.Parameter(torch.tensor(0.5))

        # Layers for an alternative logit-concatenation fusion. forward() does
        # not use them; they are kept so the module's parameters are unchanged.
        self.fusion_layer = nn.Linear(NUM_EMOTIONS * 2, NUM_EMOTIONS)
        self.dropout = nn.Dropout(0.3)

    def forward(self, speech_input, facial_input):
        """Return the gate-weighted mean of both models' class probabilities."""
        speech_logits = self.speech_model(speech_input)
        facial_logits = self.facial_model(facial_input)

        speech_gate = torch.sigmoid(self.speech_weight)
        facial_gate = torch.sigmoid(self.facial_weight)

        # Scale each probability vector by its gate, then renormalise by the
        # gate sum so every output row still sums to 1.
        gated_speech = torch.softmax(speech_logits, dim=1) * speech_gate
        gated_facial = torch.softmax(facial_logits, dim=1) * facial_gate
        return (gated_speech + gated_facial) / (speech_gate + facial_gate)

class MaxConfidenceEnsemble(nn.Module):
    """Per-sample selection between two classifiers based on their top-class confidence.

    For each sample the facial model's probability vector is returned only when
    its highest class probability exceeds the speech model's by more than
    ``margin``; otherwise the speech model's probabilities are returned.

    Both wrapped models are frozen in place (``requires_grad = False`` on every
    parameter).

    Args:
        speech_model: Module mapping ``speech_input`` to class logits.
        facial_model: Module mapping ``facial_input`` to class logits.
        margin: Minimum confidence lead the facial model needs to be selected.
        verbose: When True, print how many samples each model was selected for
            on every forward pass.
    """

    def __init__(self, speech_model, facial_model, margin=0.05, verbose=True):
        super().__init__()
        self.speech_model = speech_model
        self.facial_model = facial_model
        self.margin = margin
        self.verbose = verbose

        # Freeze both base models; this mutates the modules that were passed in.
        for param in self.speech_model.parameters():
            param.requires_grad = False
        for param in self.facial_model.parameters():
            param.requires_grad = False

    def forward(self, speech_input, facial_input):
        """Return, per sample, the probability vector of the selected model."""
        speech_probs = torch.softmax(self.speech_model(speech_input), dim=1)
        facial_probs = torch.softmax(self.facial_model(facial_input), dim=1)

        # Confidence = highest class probability of each model, per sample.
        speech_max_conf, _ = speech_probs.max(dim=1)
        facial_max_conf, _ = facial_probs.max(dim=1)

        # Single source of truth for the decision: the same mask drives both
        # the reported counts and the returned probabilities, so the printed
        # numbers always describe what was actually selected.
        use_facial = (facial_max_conf - speech_max_conf) > self.margin

        if self.verbose:
            facial_selected = int(use_facial.sum().item())
            speech_selected = use_facial.numel() - facial_selected
            print(f"Speech selected: {speech_selected}, Facial selected: {facial_selected}")

        # Broadcast the per-sample mask across the class dimension.
        return torch.where(use_facial.unsqueeze(1), facial_probs, speech_probs)

# Evaluation functions
def evaluate_model(model, speech_data, facial_data, labels, device='cuda'):
    """
    Evaluate the ensemble model and return metrics

    Args:
        model: The ensemble model, called as ``model(speech_data, facial_data)``
        speech_data: Speech features (tensor)
        facial_data: Facial features (tensor)
        labels: Ground truth labels (tensor of integer class ids)
        device: Device to run evaluation on ('cuda' or 'cpu')

    Returns:
        dict: Dictionary with the keys
            'accuracy': overall accuracy,
            'per_class_accuracy': the *weighted-average precision* from the
                classification report (the key name is historical),
            'classification_report': sklearn report as a dict,
            'confusion_matrix': square matrix with one row/column per entry
                of EMOTION_LABELS.
    """
    model.eval()

    # Move inputs to the evaluation device (the model is expected to be there already).
    speech_data = speech_data.to(device)
    facial_data = facial_data.to(device)

    with torch.no_grad():
        outputs = model(speech_data, facial_data)
        _, predicted = torch.max(outputs, 1)

    # Convert to numpy for sklearn metrics
    predicted_np = predicted.cpu().numpy()
    labels_np = labels.cpu().numpy()

    # Pin the label set to every known class. Without this, a batch in which
    # some emotion never occurs makes classification_report raise (its class
    # count no longer matches target_names) and makes the confusion matrix
    # smaller than the list of class names.
    class_ids = list(range(len(EMOTION_LABELS)))

    acc = accuracy_score(labels_np, predicted_np)
    report = classification_report(
        labels_np, predicted_np,
        labels=class_ids, target_names=EMOTION_LABELS,
        output_dict=True, zero_division=0,
    )
    conf_matrix = confusion_matrix(labels_np, predicted_np, labels=class_ids)

    # NOTE(review): despite the key name this is weighted-average precision,
    # kept unchanged so existing consumers of the dict see the same value.
    per_class_acc = report['weighted avg']['precision']

    return {
        'accuracy': acc,
        'per_class_accuracy': per_class_acc,
        'classification_report': report,
        'confusion_matrix': conf_matrix
    }

def visualize_confusion_matrix(conf_matrix, class_names=EMOTION_LABELS):
    """
    Draw a confusion matrix as an annotated heatmap, save it, and show it.

    The figure is written to 'confusion_matrix.png' in the current working
    directory before being displayed.

    Args:
        conf_matrix: Confusion matrix of integer counts (rows = true class,
            columns = predicted class, as labelled on the axes)
        class_names: Tick labels for both axes
    """
    fig, ax = plt.subplots(figsize=(10, 8))
    sns.heatmap(
        conf_matrix,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=class_names,
        yticklabels=class_names,
        ax=ax,
    )
    ax.set_xlabel('Predicted')
    ax.set_ylabel('True')
    ax.set_title('Confusion Matrix')
    fig.tight_layout()
    fig.savefig('confusion_matrix.png')
    plt.show()

def _summarize_predictions(labels_np, preds_np):
    """Return accuracy, classification report and confusion matrix for one set of predictions."""
    # Pin the label set to every known class. Without this, a batch in which
    # some emotion never occurs makes classification_report raise (its class
    # count no longer matches target_names) and makes the confusion matrix
    # smaller than the list of class names.
    class_ids = list(range(len(EMOTION_LABELS)))
    return {
        'accuracy': accuracy_score(labels_np, preds_np),
        'classification_report': classification_report(
            labels_np, preds_np,
            labels=class_ids, target_names=EMOTION_LABELS,
            output_dict=True, zero_division=0,
        ),
        'confusion_matrix': confusion_matrix(labels_np, preds_np, labels=class_ids),
    }


def evaluate_individual_vs_ensemble(speech_model, facial_model, ensemble_model,
                                   speech_data, facial_data, labels, device='cuda'):
    """
    Compare individual models vs ensemble

    Args:
        speech_model: Speech emotion model, called as ``speech_model(speech_data)``
        facial_model: Facial emotion model, called as ``facial_model(facial_data)``
        ensemble_model: Ensemble model, called as ``ensemble_model(speech_data, facial_data)``
        speech_data: Speech features (tensor)
        facial_data: Facial features (tensor)
        labels: Ground truth labels (tensor of integer class ids)
        device: Device to run evaluation on ('cuda' or 'cpu')

    Returns:
        dict: Keys 'speech', 'facial' and 'ensemble', each mapping to a dict
        with 'accuracy', 'classification_report' and 'confusion_matrix'.
    """
    # Move inputs to the evaluation device (the models are expected to be there already).
    speech_data = speech_data.to(device)
    facial_data = facial_data.to(device)

    # Evaluation mode: disables dropout and uses running batch-norm statistics.
    speech_model.eval()
    facial_model.eval()
    ensemble_model.eval()

    with torch.no_grad():
        model_outputs = {
            'speech': speech_model(speech_data),
            'facial': facial_model(facial_data),
            'ensemble': ensemble_model(speech_data, facial_data),
        }

    labels_np = labels.cpu().numpy()

    results = {}
    for model_name, outputs in model_outputs.items():
        # Predicted class = index of the largest score along the class dimension.
        _, preds = torch.max(outputs, 1)
        results[model_name] = _summarize_predictions(labels_np, preds.cpu().numpy())
    return results

def plot_comparison(results):
    """
    Draw a bar chart of speech, facial and ensemble accuracy, save it, and show it.

    The figure is written to 'model_comparison.png' in the current working
    directory before being displayed.

    Args:
        results: Results dictionary from evaluate_individual_vs_ensemble; only
            results[<model>]['accuracy'] for 'speech', 'facial' and 'ensemble'
            is read.
    """
    models = ['Speech Model', 'Facial Model', 'Ensemble Model']
    # Accuracies are fractions in the results dict; plot them as percentages.
    accuracies = [results[key]['accuracy'] * 100 for key in ('speech', 'facial', 'ensemble')]

    fig, ax = plt.subplots(figsize=(10, 6))
    bars = ax.bar(models, accuracies, color=['blue', 'green', 'red'])

    # Write each bar's value just above its top edge.
    for bar in bars:
        height = bar.get_height()
        label_x = bar.get_x() + bar.get_width()/2.
        ax.text(label_x, height + 0.5, f'{height:.2f}%', ha='center', va='bottom')

    ax.set_ylabel('Accuracy (%)')
    ax.set_title('Model Comparison')
    ax.set_ylim(0, 100)  # fixed 0-100 scale
    ax.grid(axis='y', linestyle='--', alpha=0.7)
    fig.tight_layout()
    fig.savefig('model_comparison.png')
    plt.show()

In [57]:
def aggregate_facial_features(facial_data, video_ids):
    """
    Average facial feature rows that belong to the same video.

    Args:
        facial_data: Tensor of facial features, indexed along dim 0 by frame
        video_ids: Video ID for each row of facial_data, in the same order

    Returns:
        tuple: (aggregated_features, unique_video_ids). aggregated_features has
        one row per video (the mean over that video's rows, with dim 0 kept),
        ordered by first appearance of each ID. Returns (None, []) when
        video_ids is empty.
    """
    # video ID -> row positions, in first-seen order (dicts preserve insertion order)
    frames_by_video = {}
    for frame_idx, video_id in enumerate(video_ids):
        frames_by_video.setdefault(video_id, []).append(frame_idx)

    if not frames_by_video:
        return None, []

    unique_video_ids = list(frames_by_video)
    per_video_means = [
        facial_data[frames_by_video[video_id]].mean(dim=0, keepdim=True)
        for video_id in unique_video_ids
    ]
    return torch.cat(per_video_means, dim=0), unique_video_ids

In [None]:
def _align_video_indices(speech_video_ids, facial_video_ids):
    """Return (speech_indices, facial_indices) pairing rows that share a video ID.

    ``speech_indices`` lists, in order, every position in ``speech_video_ids``
    whose ID also occurs in ``facial_video_ids``. ``facial_indices[k]`` is the
    position *within ``facial_video_ids``* of the same ID, so both lists can be
    used directly to index tensors laid out in those two orders.
    """
    facial_position = {}
    for pos, vid in enumerate(facial_video_ids):
        facial_position.setdefault(vid, pos)  # first occurrence wins

    speech_indices = [i for i, vid in enumerate(speech_video_ids) if vid in facial_position]
    facial_indices = [facial_position[speech_video_ids[i]] for i in speech_indices]
    return speech_indices, facial_indices


def main():
    """Load test data and trained models, align both modalities by video, and report results.

    Steps: read the speech CSV and facial Excel features, cache them (and the
    encoded labels) as .pt files, average facial rows per video, align speech
    and facial rows on video ID, load both checkpoints, evaluate each model and
    the ensemble, then print and plot the results.

    Raises:
        ValueError: If the facial feature file yields no rows to aggregate.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # --- Speech features -------------------------------------------------
    df_csv = pd.read_csv("C:/Nini/Capstone/CSV_Files/Speech data/Speech_Test_preprocessed_new.csv")
    speech_features = df_csv.drop(['Unnamed: 0','Path','Emotion'], axis=1).to_numpy()
    # unsqueeze(1) inserts a size-1 dimension between samples and features.
    speech_test_data = torch.tensor(speech_features, dtype=torch.float32).unsqueeze(1)
    torch.save(speech_test_data, 'C:/Nini/Capstone/CSV_Files/Speech data/speech_test_data.pt')

    # --- Facial features -------------------------------------------------
    df_excel = pd.read_excel("C:/Nini/Capstone/CSV_Files/Facial data/FacialFeatures_Test.xlsx")
    facial_features = df_excel.drop(['Unnamed: 0','emotion','video_name','BaseName'], axis=1).to_numpy()
    facial_test_data = torch.tensor(facial_features, dtype=torch.float32).unsqueeze(1)
    torch.save(facial_test_data, 'C:/Nini/Capstone/CSV_Files/Facial data/face_test_data.pt')

    # --- Labels ----------------------------------------------------------
    # joblib.load unpickles the file, which can execute arbitrary code; only
    # load encoder files that come from a trusted source.
    lb = joblib.load("C:/Nini/Capstone/src/Model_training/label_encoder.pkl")
    encoded_labels = lb.transform(df_csv['Emotion'])
    test_labels = torch.tensor(encoded_labels)
    torch.save(test_labels, 'C:/Nini/Capstone/CSV_Files/Speech data/test_labels.pt')

    # The tensors saved above are used directly from memory; reading the
    # just-written files back would yield the same values.
    print(f"Original speech data shape: {speech_test_data.shape}")
    print(f"Original facial data shape: {facial_test_data.shape}")
    print(f"Original labels shape: {test_labels.shape}")

    # --- Video IDs: file name without directory or extension --------------
    speech_video_ids = [os.path.splitext(os.path.basename(f))[0] for f in df_csv['Path'].values]
    facial_video_ids = [os.path.splitext(os.path.basename(f))[0] for f in df_excel['video_name'].values]

    # Aggregate facial features per video
    aggregated_facial_data, unique_facial_video_ids = aggregate_facial_features(
        facial_test_data, facial_video_ids
    )
    if aggregated_facial_data is None:
        raise ValueError("No facial feature rows found; cannot evaluate the ensemble")

    print(f"Aggregated facial data shape: {aggregated_facial_data.shape}")
    print(f"Number of unique facial videos: {len(unique_facial_video_ids)}")

    # Find common videos between datasets
    common_video_ids = set(speech_video_ids).intersection(set(unique_facial_video_ids))
    print(f"Found {len(common_video_ids)} common videos between datasets")

    if len(common_video_ids) == 0:
        print("ERROR: No common videos found between datasets!")
        # Best-effort fallback: truncate to a common length. Rows are then
        # paired by position only, so the pairing is not by video.
        min_samples = min(speech_test_data.shape[0], aggregated_facial_data.shape[0])
        aligned_speech_data = speech_test_data[:min_samples]
        aligned_facial_data = aggregated_facial_data[:min_samples]
        aligned_labels = test_labels[:min_samples]
        print(f"Using first {min_samples} samples from each dataset as fallback")
    else:
        # facial_indices are positions in unique_facial_video_ids, i.e. valid
        # row indices into aggregated_facial_data even when some facial videos
        # have no speech counterpart.
        speech_indices, facial_indices = _align_video_indices(
            speech_video_ids, unique_facial_video_ids
        )
        aligned_speech_data = speech_test_data[speech_indices]
        aligned_facial_data = aggregated_facial_data[facial_indices]
        aligned_labels = test_labels[speech_indices]

    print(f"Aligned speech data shape: {aligned_speech_data.shape}")
    print(f"Aligned facial data shape: {aligned_facial_data.shape}")
    print(f"Aligned labels shape: {aligned_labels.shape}")

    print("Speech")
    print(aligned_speech_data)
    print(aligned_facial_data)

    # --- Models ----------------------------------------------------------
    speech_model = CNNModel(input_size=speech_features.shape[1], num_classes=8).to(device)
    facial_model = EmotionLSTM(input_size=facial_features.shape[1]).to(device)

    # map_location lets checkpoints saved on a GPU load on a CPU-only machine.
    speech_model.load_state_dict(torch.load(
        'C:/Nini/Capstone/Models/DataAugmentation_cnn_model_new_final.pth',
        map_location=device, weights_only=True))
    facial_model.load_state_dict(torch.load(
        'C:/Nini/Capstone/src/Facial Models/emotion_lstm_model.pth',
        map_location=device, weights_only=True))

    # Alternative ensemble: VotingEnsemble(speech_model, facial_model).to(device)
    ensemble_model = MaxConfidenceEnsemble(speech_model, facial_model).to(device)

    # --- Evaluation ------------------------------------------------------
    results = evaluate_individual_vs_ensemble(
        speech_model, facial_model, ensemble_model,
        aligned_speech_data, aligned_facial_data, aligned_labels, device
    )

    print(f"Speech Model Accuracy: {results['speech']['accuracy']:.4f}")
    print(f"Facial Model Accuracy: {results['facial']['accuracy']:.4f}")
    print(f"Ensemble Model Accuracy: {results['ensemble']['accuracy']:.4f}")

    plot_comparison(results)

    visualize_confusion_matrix(results['ensemble']['confusion_matrix'])

    # Only dict-valued report entries carry precision/recall/f1; scalar
    # entries are skipped.
    print("\nEnsemble Classification Report:")
    for emotion, metrics in results['ensemble']['classification_report'].items():
        if isinstance(metrics, dict):
            print(f"{emotion}: precision={metrics['precision']:.2f}, recall={metrics['recall']:.2f}, f1-score={metrics['f1-score']:.2f}")

# Run the full evaluation pipeline when this code is executed directly
# (as a script or as a notebook cell), but not when it is imported.
if __name__ == "__main__":
    main()

Original speech data shape: torch.Size([565, 1, 128])
Original facial data shape: torch.Size([5681, 1, 21])
Original labels shape: torch.Size([565])
Aggregated facial data shape: torch.Size([52, 1, 21])
Number of unique facial videos: 52
Found 52 common videos between datasets
Aligned speech data shape: torch.Size([52, 1, 128])
Aligned facial data shape: torch.Size([52, 1, 21])
Aligned labels shape: torch.Size([52])
Speech
tensor([[[ 0.7161,  0.9442,  0.6214,  ...,  0.6145,  0.7294,  0.9747]],

        [[ 0.2393,  0.2322,  1.0326,  ..., -1.0124, -1.0052, -0.9658]],

        [[ 0.5010,  0.5907, -0.0187,  ..., -0.6323, -0.8343, -0.9241]],

        ...,

        [[ 2.0672,  1.7239,  1.4174,  ...,  1.8156,  1.7393,  1.7581]],

        [[ 1.2390,  0.8291,  0.9826,  ...,  0.0247,  0.0819,  0.1359]],

        [[-0.0484,  0.3097,  0.4756,  ...,  0.3339,  0.3580,  0.4293]]])
tensor([[[3.3740e-01, 1.5618e-01, 3.4763e-01,  ..., 2.0077e+00,
          1.2318e+02, 3.0090e+03]],

        [[3.5752e-01