In [13]:
import torch
import torch.nn as nn
import numpy as np
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

In [14]:
class CNNModel(nn.Module):
    """1-D convolutional classifier: four conv/batch-norm stages and two dense layers.

    Args:
        input_size: Accepted so existing call sites keep working; none of the
            layers below read it (``fc1`` is hard-wired to 512 input features).
        num_classes: Number of logits produced by the final layer ``fc2``.

    Note:
        ``fc1`` needs 64 channels x 8 time steps coming out of ``conv4``. With
        the kernel sizes and paddings used here that is the case for
        single-channel inputs whose length is between 118 and 133 (128 works);
        any other length fails with a shape mismatch at ``fc1``.
    """

    def __init__(self, input_size, num_classes):
        super(CNNModel, self).__init__()
        # Attribute names are part of the checkpoint format (state_dict keys),
        # so they must stay exactly as they are.
        self.conv1 = nn.Conv1d(in_channels=1, out_channels=64, kernel_size=8)
        self.bn1 = nn.BatchNorm1d(64)
        self.conv2 = nn.Conv1d(64, 128, kernel_size=8)
        self.bn2 = nn.BatchNorm1d(128)
        self.pool1 = nn.MaxPool1d(kernel_size=4, padding=2)
        self.dropout1 = nn.Dropout(0.5)
        self.conv3 = nn.Conv1d(128, 128, kernel_size=8)
        self.bn3 = nn.BatchNorm1d(128)
        self.pool2 = nn.MaxPool1d(kernel_size=4, padding=2)
        self.dropout2 = nn.Dropout(0.5)
        self.conv4 = nn.Conv1d(128, 64, kernel_size=3, padding=2)
        self.bn4 = nn.BatchNorm1d(64)
        self.fc1 = nn.Linear(512, 256)  # 512 = 64 channels * 8 remaining time steps
        self.dropout3 = nn.Dropout(0.5)
        self.fc2 = nn.Linear(256, num_classes)

    def forward(self, x):
        """Return raw class logits of shape ``(batch, num_classes)``.

        Args:
            x: Tensor of shape ``(batch, 1, length)`` (``conv1`` takes a single
                input channel).
        """
        # The notebook never imports ``torch.nn.functional as F``; reach the
        # functional API through the already-imported ``nn`` instead so the
        # forward pass does not raise NameError.
        relu = nn.functional.relu

        x = relu(self.bn1(self.conv1(x)))
        x = relu(self.bn2(self.conv2(x)))
        x = self.pool1(x)
        x = self.dropout1(x)
        x = relu(self.bn3(self.conv3(x)))
        x = self.pool2(x)
        x = self.dropout2(x)
        x = relu(self.bn4(self.conv4(x)))
        # Flatten (batch, channels, steps) -> (batch, channels * steps) for the dense head
        x = x.view(x.size(0), -1)
        x = relu(self.fc1(x))
        x = self.dropout3(x)
        x = self.fc2(x)
        return x

In [15]:
class EmotionLSTM(nn.Module):
    """Single bidirectional LSTM followed by a small dense classification head.

    Args:
        input_size: Number of features per time step fed to the LSTM.
        hidden_size: LSTM hidden width per direction (also the width of ``fc1``).
        num_classes: Number of logits produced by ``fc2``.
        dropout_rate: Probability used by the shared dropout layer.
    """

    def __init__(self, input_size, hidden_size=192, num_classes=8, dropout_rate=0.3):
        super().__init__()
        # Forward and backward directions are concatenated by the LSTM
        bidirectional_width = hidden_size * 2

        # Recurrent encoder
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True, bidirectional=True)
        self.dropout = nn.Dropout(dropout_rate)

        # Dense head
        self.bn = nn.BatchNorm1d(bidirectional_width)
        self.fc1 = nn.Linear(bidirectional_width, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Return class logits computed from the LSTM output at the final time step.

        Args:
            x: Batch-first sequence tensor, ``(batch, steps, input_size)``.
        """
        sequence_out, _ = self.lstm(x)

        # Index -1 on the time axis picks the final step; for a length-1
        # sequence this is the same tensor that squeezing the axis would give.
        final_step = sequence_out[:, -1, :]

        # Normalise, then regularise, the pooled features
        features = self.dropout(self.bn(final_step))

        # Dense layers (the same dropout module is reused after the activation)
        hidden = self.dropout(self.relu(self.fc1(features)))
        return self.fc2(hidden)

In [16]:
# speech_features = pd.read_csv('path/to/speech_features.csv')

# # Load facial features (8 Excel files for different emotions)
# emotions = ['neutral', 'calm', 'happy', 'sad', 'angry', 'fearful', 'disgust', 'surprised']
# facial_features_by_emotion = {}

# for emotion in emotions:
#     facial_features_by_emotion[emotion] = pd.read_excel(f'path/to/facial_features_{emotion}.xlsx')

# # Now we need to combine these into appropriate tensors
# # This depends on your exact data format, but here's a general approach:

# # 1. Extract features and labels from your data
# X_speech = speech_features.drop('E', axis=1).values  # Assuming there's a label column
# y = speech_features['emotion_label'].values  # Assuming labels are in the speech CSV

# # 2. Combine facial features (this depends on your exact format)
# # Method 1: If each Excel has the same samples in the same order
# # X_facial = []
# # for i, sample in enumerate(X_speech):
# #     # Get the emotion label for this sample
# #     emotion = emotions[y[i]]
# #     # Get the corresponding facial features
# #     facial_sample = facial_features_by_emotion[emotion].iloc[i].values
# #     X_facial.append(facial_sample)
# # X_facial = np.array(X_facial)

# # Alternative Method 2: If Excel files are organized by emotion and have some ID column
# # This assumes each file only has data for one emotion and has an ID column to match with speech data
# X_facial = np.zeros((len(X_speech), facial_features_dimensionality))
# for i, row in speech_features.iterrows():
#     sample_id = row['Emotion']  # Assuming there's an ID column
#     emotion = emotions[row['emotion_label']]
#     # Find the matching facial features using the ID
#     matching_facial = facial_features_by_emotion[emotion][
#         facial_features_by_emotion[emotion]['Emotion'] == sample_id]
#     if not matching_facial.empty:
#         X_facial[i] = matching_facial.drop('Emotion', axis=1).values[0]

# # 3. Create train/test split (or train/val/test if training the ensemble)
# X_speech_train, X_speech_test, X_facial_train, X_facial_test, y_train, y_test = train_test_split(
#     X_speech, X_facial, y, test_size=0.2, random_state=42, stratify=y)

In [17]:
# NUM_EMOTIONS = 8  # For RAVDESS dataset: neutral, calm, happy, sad, angry, fearful, disgust, surprised

# # Voting Ensemble Model
# class VotingEnsemble(nn.Module):
#     def __init__(self, speech_model, facial_model):
#         super(VotingEnsemble, self).__init__()
#         self.speech_model = speech_model
#         self.facial_model = facial_model
        
#         # Freeze base models (optional - you can set to True if you want to keep original models fixed)
#         for param in self.speech_model.parameters():
#             param.requires_grad = False
#         for param in self.facial_model.parameters():
#             param.requires_grad = False
            
#         # Learnable weights for each model
#         self.speech_weight = nn.Parameter(torch.tensor(0.5))
#         self.facial_weight = nn.Parameter(torch.tensor(0.5))
        
#         # Additional fusion layer for feature-level combination
#         self.fusion_layer = nn.Linear(NUM_EMOTIONS * 2, NUM_EMOTIONS)
#         self.dropout = nn.Dropout(0.3)
        
#     def forward(self, speech_input, facial_input):
#         # Get predictions from individual models
#         speech_out = self.speech_model(speech_input)
#         facial_out = self.facial_model(facial_input)
        
#         # Method 1: Weighted average of probabilities
#         speech_probs = torch.softmax(speech_out, dim=1) * torch.sigmoid(self.speech_weight)
#         facial_probs = torch.softmax(facial_out, dim=1) * torch.sigmoid(self.facial_weight)
#         weighted_avg = (speech_probs + facial_probs) / (torch.sigmoid(self.speech_weight) + torch.sigmoid(self.facial_weight))
        
#         # Method 2: Feature-level fusion
#         concat_features = torch.cat((speech_out, facial_out), dim=1)
#         fused_output = self.fusion_layer(self.dropout(concat_features))
        
#         # You can choose which method to use based on performance
#         # return weighted_avg  # Method 1
#         return fused_output  # Method 2

# # Alternative: Simple majority voting ensemble (no training required)
# class MajorityVotingEnsemble(nn.Module):
#     def __init__(self, speech_model, facial_model):
#         super(MajorityVotingEnsemble, self).__init__()
#         self.speech_model = speech_model
#         self.facial_model = facial_model
        
#         # Freeze base models
#         for param in self.speech_model.parameters():
#             param.requires_grad = False
#         for param in self.facial_model.parameters():
#             param.requires_grad = False
    
#     def forward(self, speech_input, facial_input):
#         # Get predictions from individual models
#         speech_out = self.speech_model(speech_input)
#         facial_out = self.facial_model(facial_input)
        
#         # Simple average of probabilities
#         speech_probs = torch.softmax(speech_out, dim=1)
#         facial_probs = torch.softmax(facial_out, dim=1)
        
#         return (speech_probs + facial_probs) / 2

# # Boosting-inspired ensemble that learns to correct errors
# class BoostingEnsemble(nn.Module):
#     def __init__(self, speech_model, facial_model):
#         super(BoostingEnsemble, self).__init__()
#         self.speech_model = speech_model
#         self.facial_model = facial_model
        
#         # Freeze base models
#         for param in self.speech_model.parameters():
#             param.requires_grad = False
#         for param in self.facial_model.parameters():
#             param.requires_grad = False
        
#         # Sequential combination - first model feeds into correction network
#         hidden_size = 128
#         self.error_correction = nn.Sequential(
#             nn.Linear(NUM_EMOTIONS * 2, hidden_size),
#             nn.ReLU(),
#             nn.Dropout(0.3),
#             nn.Linear(hidden_size, NUM_EMOTIONS)
#         )
    
#     def forward(self, speech_input, facial_input):
#         # Get base predictions
#         speech_logits = self.speech_model(speech_input)
#         facial_logits = self.facial_model(facial_input)
        
#         # Combine logits for error correction
#         combined = torch.cat((speech_logits, facial_logits), dim=1)
        
#         # Apply correction network
#         final_output = self.error_correction(combined)
        
#         return final_output

In [None]:
# RAVDESS emotion categories; this list is passed as ``target_names`` to the
# metric reports and used as tick labels for the confusion-matrix plot.
EMOTION_LABELS = ['neutral', 'calm', 'happy', 'sad', 'angry', 'fearful', 'disgust', 'surprised']
NUM_EMOTIONS = len(EMOTION_LABELS)  # 8 classes

# Voting Ensemble Model
class VotingEnsemble(nn.Module):
    """Combine a frozen speech model and a frozen facial model into one classifier.

    Two combination strategies are available and selected with ``method``:

    * ``"fusion"`` (default, the behaviour this class always had): concatenate
      both models' logits, apply dropout, and map them to class logits with a
      trainable linear layer.
    * ``"weighted"``: weighted average of both models' softmax probabilities
      using two trainable scalars squashed through a sigmoid.

    The parameters of both strategies are always created, so the state_dict
    layout does not depend on ``method``. They are freshly initialised here,
    so the ensemble output is only meaningful after these parameters have been
    trained (or loaded).

    Args:
        speech_model: Module mapping the speech input to class logits.
        facial_model: Module mapping the facial input to class logits.
        num_classes: Number of logits each base model emits. ``None`` falls
            back to the module-level ``NUM_EMOTIONS``.
        method: ``"fusion"`` or ``"weighted"``.

    Raises:
        ValueError: If ``method`` is not a supported strategy name.
    """

    METHODS = ("fusion", "weighted")

    def __init__(self, speech_model, facial_model, num_classes=None, method="fusion"):
        super(VotingEnsemble, self).__init__()
        if method not in self.METHODS:
            raise ValueError(f"method must be one of {self.METHODS}, got {method!r}")
        if num_classes is None:
            num_classes = NUM_EMOTIONS
        self.method = method

        self.speech_model = speech_model
        self.facial_model = facial_model

        # Freeze base models: only the combination parameters below are trainable
        for param in self.speech_model.parameters():
            param.requires_grad = False
        for param in self.facial_model.parameters():
            param.requires_grad = False

        # Learnable weights for each model (used by the "weighted" strategy)
        self.speech_weight = nn.Parameter(torch.tensor(0.5))
        self.facial_weight = nn.Parameter(torch.tensor(0.5))

        # Fusion layer over the concatenated logits (used by the "fusion" strategy)
        self.fusion_layer = nn.Linear(num_classes * 2, num_classes)
        self.dropout = nn.Dropout(0.3)

    def train(self, mode=True):
        """Switch the ensemble's own layers to ``mode`` but keep the base models in eval mode.

        ``requires_grad = False`` only stops gradient updates; without this
        override ``ensemble.train()`` would still re-enable dropout and
        BatchNorm statistic updates inside the frozen base models.
        """
        super().train(mode)
        self.speech_model.eval()
        self.facial_model.eval()
        return self

    def forward(self, speech_input, facial_input):
        """Run both base models and combine their outputs.

        Returns:
            Class logits for ``method="fusion"``; class probabilities (rows
            sum to 1) for ``method="weighted"``.
        """
        # Get predictions from individual models
        speech_out = self.speech_model(speech_input)
        facial_out = self.facial_model(facial_input)

        if self.method == "weighted":
            # Weighted average of probabilities; sigmoid keeps both weights positive
            speech_w = torch.sigmoid(self.speech_weight)
            facial_w = torch.sigmoid(self.facial_weight)
            speech_probs = torch.softmax(speech_out, dim=1) * speech_w
            facial_probs = torch.softmax(facial_out, dim=1) * facial_w
            return (speech_probs + facial_probs) / (speech_w + facial_w)

        # Feature-level fusion of the two logit vectors
        concat_features = torch.cat((speech_out, facial_out), dim=1)
        return self.fusion_layer(self.dropout(concat_features))

# Alternative: simple soft-voting ensemble - averages the two models' class probabilities (no training required)
class MajorityVotingEnsemble(nn.Module):
    """Parameter-free ensemble that averages the class probabilities of two base models.

    Args:
        speech_model: Module mapping the speech input to class logits.
        facial_model: Module mapping the facial input to class logits.
    """

    def __init__(self, speech_model, facial_model):
        super().__init__()
        self.speech_model = speech_model
        self.facial_model = facial_model

        # Neither base model is meant to be updated through this wrapper
        for base_model in (self.speech_model, self.facial_model):
            for weight in base_model.parameters():
                weight.requires_grad = False

    def forward(self, speech_input, facial_input):
        """Return the unweighted mean of both models' softmax outputs."""
        # Logits from each modality
        speech_logits = self.speech_model(speech_input)
        facial_logits = self.facial_model(facial_input)

        # Turn logits into probabilities over the class axis, then average
        summed_probs = torch.softmax(speech_logits, dim=1) + torch.softmax(facial_logits, dim=1)
        return summed_probs / 2

# Boosting-inspired ensemble that learns to correct errors
class BoostingEnsemble(nn.Module):
    """Ensemble that feeds both frozen models' logits through a small correction network.

    Args:
        speech_model: Module mapping the speech input to class logits.
        facial_model: Module mapping the facial input to class logits.
        num_classes: Number of logits each base model emits. ``None`` falls
            back to the module-level ``NUM_EMOTIONS``.
        hidden_size: Width of the correction network's hidden layer.
        dropout_rate: Dropout probability inside the correction network.
    """

    def __init__(self, speech_model, facial_model, num_classes=None,
                 hidden_size=128, dropout_rate=0.3):
        super(BoostingEnsemble, self).__init__()
        if num_classes is None:
            num_classes = NUM_EMOTIONS

        self.speech_model = speech_model
        self.facial_model = facial_model

        # Freeze base models: only the correction network is trainable
        for param in self.speech_model.parameters():
            param.requires_grad = False
        for param in self.facial_model.parameters():
            param.requires_grad = False

        # Correction network over the concatenated logits. The Sequential
        # layout (indices 0..3) is kept so existing checkpoints still load.
        self.error_correction = nn.Sequential(
            nn.Linear(num_classes * 2, hidden_size),
            nn.ReLU(),
            nn.Dropout(dropout_rate),
            nn.Linear(hidden_size, num_classes)
        )

    def train(self, mode=True):
        """Switch the correction network to ``mode`` but keep the base models in eval mode.

        ``requires_grad = False`` only stops gradient updates; without this
        override ``ensemble.train()`` would still re-enable dropout and
        BatchNorm statistic updates inside the frozen base models.
        """
        super().train(mode)
        self.speech_model.eval()
        self.facial_model.eval()
        return self

    def forward(self, speech_input, facial_input):
        """Return class logits produced by the correction network."""
        # Get base predictions
        speech_logits = self.speech_model(speech_input)
        facial_logits = self.facial_model(facial_input)

        # Combine logits along the class axis and apply the correction network
        combined = torch.cat((speech_logits, facial_logits), dim=1)
        return self.error_correction(combined)

# Evaluation functions
def evaluate_model(model, speech_data, facial_data, labels, device='cuda'):
    """
    Evaluate the ensemble model and return metrics

    Args:
        model: The ensemble model, called as ``model(speech_data, facial_data)``
        speech_data: Speech features (tensor)
        facial_data: Facial features (tensor)
        labels: Ground truth labels (tensor); assumed to be integer class ids
            indexing ``EMOTION_LABELS`` - TODO confirm against the data loader
        device: Device to run evaluation on ('cuda' or 'cpu'). A CUDA request
            falls back to the CPU when CUDA is not available. The model itself
            is not moved; it must already live on the resulting device.

    Returns:
        dict: ``accuracy``, ``weighted_precision``, ``classification_report``
        (sklearn dict form) and ``confusion_matrix`` (one row/column per entry
        of ``EMOTION_LABELS``). ``per_class_accuracy`` is kept for backward
        compatibility; despite its name it holds the weighted-average precision.
    """
    # Avoid a hard failure on CPU-only hosts when the default device is used
    if str(device).startswith('cuda') and not torch.cuda.is_available():
        device = 'cpu'

    model.eval()

    # Move data to device
    speech_data = speech_data.to(device)
    facial_data = facial_data.to(device)
    labels = labels.to(device)

    with torch.no_grad():
        # Forward pass; the arg-max over the class axis is the predicted label
        outputs = model(speech_data, facial_data)
        _, predicted = torch.max(outputs, 1)

    # Convert to numpy for sklearn metrics
    predicted_np = predicted.cpu().numpy()
    labels_np = labels.cpu().numpy()

    # Pin the label set explicitly: without it sklearn infers the classes from
    # the data, so a batch missing any emotion makes ``target_names`` mismatch
    # (ValueError) and shrinks the confusion matrix.
    class_ids = list(range(len(EMOTION_LABELS)))

    acc = accuracy_score(labels_np, predicted_np)
    report = classification_report(labels_np, predicted_np, labels=class_ids,
                                   target_names=EMOTION_LABELS, output_dict=True,
                                   zero_division=0)
    conf_matrix = confusion_matrix(labels_np, predicted_np, labels=class_ids)

    weighted_precision = report['weighted avg']['precision']

    return {
        'accuracy': acc,
        # Legacy key: this has always been the weighted-average precision
        'per_class_accuracy': weighted_precision,
        'weighted_precision': weighted_precision,
        'classification_report': report,
        'confusion_matrix': conf_matrix
    }

def visualize_confusion_matrix(conf_matrix, class_names=EMOTION_LABELS):
    """
    Draw a confusion matrix as an annotated heatmap, save it and show it.

    The figure is written to 'confusion_matrix.png' in the current working
    directory before being displayed.

    Args:
        conf_matrix: Confusion matrix of integer counts (cells are formatted with 'd')
        class_names: Tick labels used for both axes
    """
    fig, ax = plt.subplots(figsize=(10, 8))

    heatmap_options = {
        'annot': True,
        'fmt': 'd',
        'cmap': 'Blues',
        'xticklabels': class_names,
        'yticklabels': class_names,
    }
    sns.heatmap(conf_matrix, ax=ax, **heatmap_options)

    # Columns are predictions, rows are ground truth
    ax.set_xlabel('Predicted')
    ax.set_ylabel('True')
    ax.set_title('Confusion Matrix')

    fig.tight_layout()
    fig.savefig('confusion_matrix.png')
    plt.show()

def _summarize_predictions(labels_np, preds_np):
    """Return accuracy, classification report and confusion matrix for one model's predictions."""
    # Pin the label set explicitly: without it sklearn infers the classes from
    # the data, so a batch missing any emotion makes ``target_names`` mismatch
    # (ValueError) and shrinks the confusion matrix.
    class_ids = list(range(len(EMOTION_LABELS)))
    return {
        'accuracy': accuracy_score(labels_np, preds_np),
        'classification_report': classification_report(
            labels_np, preds_np, labels=class_ids,
            target_names=EMOTION_LABELS, output_dict=True, zero_division=0),
        'confusion_matrix': confusion_matrix(labels_np, preds_np, labels=class_ids)
    }


def evaluate_individual_vs_ensemble(speech_model, facial_model, ensemble_model, 
                                   speech_data, facial_data, labels, device='cuda'):
    """
    Compare individual models vs ensemble

    Args:
        speech_model: Speech emotion model, called as ``speech_model(speech_data)``
        facial_model: Facial emotion model, called as ``facial_model(facial_data)``
        ensemble_model: Ensemble model, called as ``ensemble_model(speech_data, facial_data)``
        speech_data: Speech features (tensor)
        facial_data: Facial features (tensor)
        labels: Ground truth labels (tensor); assumed to be integer class ids
            indexing ``EMOTION_LABELS`` - TODO confirm against the data loader
        device: Device to run evaluation on ('cuda' or 'cpu'). A CUDA request
            falls back to the CPU when CUDA is not available. The models are
            not moved; they must already live on the resulting device.

    Returns:
        dict: Keys 'speech', 'facial' and 'ensemble', each mapping to a dict
        with 'accuracy', 'classification_report' and 'confusion_matrix'.
    """
    # Avoid a hard failure on CPU-only hosts when the default device is used
    if str(device).startswith('cuda') and not torch.cuda.is_available():
        device = 'cpu'

    # Move data to device
    speech_data = speech_data.to(device)
    facial_data = facial_data.to(device)
    labels = labels.to(device)

    # Set models to evaluation mode
    speech_model.eval()
    facial_model.eval()
    ensemble_model.eval()

    with torch.no_grad():
        # Raw outputs of each model, keyed the same way as the result dict
        outputs_by_model = {
            'speech': speech_model(speech_data),
            'facial': facial_model(facial_data),
            'ensemble': ensemble_model(speech_data, facial_data)
        }

    labels_np = labels.cpu().numpy()

    results = {}
    for model_key, outputs in outputs_by_model.items():
        # Predicted class = index of the largest score along the class axis
        _, preds = torch.max(outputs, 1)
        results[model_key] = _summarize_predictions(labels_np, preds.cpu().numpy())
    return results

def plot_comparison(results):
    """
    Draw a bar chart of the three models' accuracies, save it and show it.

    The figure is written to 'model_comparison.png' in the current working
    directory before being displayed.

    Args:
        results: Results dictionary from evaluate_individual_vs_ensemble; only
            the 'accuracy' entry under 'speech', 'facial' and 'ensemble' is read
    """
    # (bar label, key in ``results``, bar colour), listed left to right
    series = (
        ('Speech Model', 'speech', 'blue'),
        ('Facial Model', 'facial', 'green'),
        ('Ensemble Model', 'ensemble', 'red'),
    )
    models = [label for label, _, _ in series]
    accuracies = [results[key]['accuracy'] * 100 for _, key, _ in series]
    colours = [colour for _, _, colour in series]

    fig, ax = plt.subplots(figsize=(10, 6))
    bars = ax.bar(models, accuracies, color=colours)

    # Print each percentage just above its bar
    for bar in bars:
        height = bar.get_height()
        centre_x = bar.get_x() + bar.get_width()/2.
        ax.text(centre_x, height + 0.5, f'{height:.2f}%', ha='center', va='bottom')

    ax.set_ylabel('Accuracy (%)')
    ax.set_title('Model Comparison')
    ax.set_ylim(0, 100)  # fixed percentage scale
    ax.grid(axis='y', linestyle='--', alpha=0.7)
    fig.tight_layout()
    fig.savefig('model_comparison.png')
    plt.show()

# Example usage
def main():
    """Load both pre-trained models, wrap them in a VotingEnsemble and report test metrics.

    Prints the three accuracies, plots the accuracy comparison and the
    ensemble confusion matrix, then prints the per-class ensemble report.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    # NOTE(review): these are machine-specific absolute paths (one Windows, one
    # Kaggle) and the test-data paths below are placeholders - point them at
    # real files before running.
    speech_weights_path = 'C:/Nini/Capstone/Models/DataAugmentation_cnn_model_new.pth'
    facial_weights_path = '/kaggle/input/face-emotion/pytorch/default/1/emotion_lstm_model-2.pth'

    # Initialize your models
    speech_model = CNNModel(input_size=torch.Size([1, 128]), num_classes=8).to(device)
    facial_model = EmotionLSTM(input_size=21).to(device)

    # Load pre-trained weights. ``map_location`` remaps the stored tensors onto
    # the active device, so checkpoints saved on a GPU also load on CPU-only hosts.
    speech_model.load_state_dict(torch.load(speech_weights_path, map_location=device, weights_only=True))
    facial_model.load_state_dict(torch.load(facial_weights_path, map_location=device, weights_only=True))

    # Create ensemble model
    # NOTE(review): VotingEnsemble's fusion layer is freshly initialised here
    # and never trained or loaded in this function, so the ensemble accuracy
    # reported below is not meaningful until that is done (or a parameter-free
    # ensemble such as MajorityVotingEnsemble is used instead).
    ensemble_model = VotingEnsemble(speech_model, facial_model).to(device)

    # Load test data. ``weights_only=True`` restricts unpickling to tensors and
    # other plain containers, matching how the checkpoints above are loaded.
    speech_test_data = torch.load('path/to/speech_test_data.pt', map_location=device, weights_only=True)
    facial_test_data = torch.load('path/to/facial_test_data.pt', map_location=device, weights_only=True)
    test_labels = torch.load('path/to/test_labels.pt', map_location=device, weights_only=True)

    # Evaluate models
    results = evaluate_individual_vs_ensemble(
        speech_model, facial_model, ensemble_model,
        speech_test_data, facial_test_data, test_labels, device
    )

    # Print results
    print(f"Speech Model Accuracy: {results['speech']['accuracy']:.4f}")
    print(f"Facial Model Accuracy: {results['facial']['accuracy']:.4f}")
    print(f"Ensemble Model Accuracy: {results['ensemble']['accuracy']:.4f}")

    # Plot comparison
    plot_comparison(results)

    # Visualize confusion matrix for the ensemble
    visualize_confusion_matrix(results['ensemble']['confusion_matrix'])

    # Print detailed report for ensemble; scalar entries (e.g. overall
    # accuracy) are skipped because they carry no precision/recall/f1 fields
    print("\nEnsemble Classification Report:")
    for emotion, metrics in results['ensemble']['classification_report'].items():
        if isinstance(metrics, dict):
            print(f"{emotion}: precision={metrics['precision']:.2f}, recall={metrics['recall']:.2f}, f1-score={metrics['f1-score']:.2f}")

if __name__ == "__main__":
    main()

In [None]:
# # Initialize your pre-trained models
# device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# speech_model = CNNModel(input_size=torch.Size([1, 128]), num_classes=8).to(device)

# facial_model = EmotionLSTM(input_size=21)

# # Load your pre-trained weights
# speech_model.load_state_dict(torch.load('/kaggle/input/speech-emotion-detection/pytorch/default/1/DataAugmentation_cnn_model_new.pth', weights_only=True))
# facial_model.load_state_dict(torch.load('/kaggle/input/face-emotion/pytorch/default/1/emotion_lstm_model-2.pth', weights_only=True))

# # Create ensemble
# ensemble = VotingEnsemble(speech_model, facial_model)  # Or any other ensemble variant

# # Use the ensemble for prediction
# # speech_features = torch.FloatTensor(your_speech_features)
# # facial_features = torch.FloatTensor(your_facial_features)
# # predictions = ensemble(speech_features, facial_features)