In [None]:
pip install -U scikit-learn==1.6.1

In [11]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
import pandas as pd
import os
import pickle
import IPython.display as ipd
from moviepy.video.io.VideoFileClip import VideoFileClip
import librosa
import librosa.display
import pandas as pd
import numpy as np
import os
import soxr
import torch.optim as optim
import pickle
import os
import cv2
import mediapipe as mp
from sklearn.preprocessing import RobustScaler
from tqdm import tqdm

In [22]:
# Handle to the MediaPipe Face Mesh solution; the index lists below address
# landmarks in the array built from its output.
mp_face_mesh = mp.solutions.face_mesh
# Module-level FaceMesh instance (static_image_mode=False, one face at most).
# NOTE(review): no visible cell reads `face_mesh` -- extract_landmarks_from_video
# opens its own FaceMesh -- so this instance looks unused; confirm before removing.
face_mesh = mp_face_mesh.FaceMesh(
    static_image_mode=False,
    max_num_faces=1,
    min_detection_confidence=0.5,
    min_tracking_confidence=0.5
)

# Define key landmark indices for emotion-relevant features
# The ORDER of each list matters: the geometry helpers address points by their
# position inside these lists (e.g. eye[0]/eye[4], outer[3]/outer[9]), not by
# landmark id, so reordering changes the computed features.
LEFT_EYE = [33, 133, 160, 159, 158, 144, 153, 154, 155, 173, 157, 163]
RIGHT_EYE = [362, 385, 387, 388, 466, 263, 249, 390, 373, 374, 380, 381]
LEFT_EYEBROW = [70, 63, 105, 66, 107]
RIGHT_EYEBROW = [336, 296, 334, 293, 300]
MOUTH_OUTER = [61, 185, 40, 39, 37, 0, 267, 269, 270, 409, 291, 375, 321, 405, 314, 17, 84, 181, 91, 146]
MOUTH_INNER = [78, 191, 80, 81, 82, 13, 312, 311, 310, 415, 308, 324, 318, 402, 317, 14, 87, 178, 88, 95]
# Indices 2, 4 and 5 appear twice, so those points are counted twice in the
# z-spread ("nose_wrinkle") computed by extract_features_from_frame.
# NOTE(review): presumably the saved model was trained with these duplicates --
# do not de-duplicate without retraining / confirming.
NOSE = [1, 2, 3, 4, 5, 6, 168, 197, 195, 5, 4, 98, 97, 2, 326, 327]

In [23]:
class CNNModel(nn.Module):
    """1-D convolutional classifier over a single-channel feature vector.

    Pipeline: conv -> conv -> max-pool -> conv -> max-pool -> conv, each
    convolution followed by batch norm and ReLU, then two fully connected
    layers producing ``num_classes`` logits.

    Args:
        input_size: Accepted for interface compatibility; it is not used when
            building the layers.
        num_classes: Number of output logits.

    Note:
        ``fc1`` expects exactly 512 flattened features (64 channels x 8
        positions), which is what a ``(batch, 1, 128)`` input produces with
        the kernel/padding settings below. Other input lengths will fail at
        ``fc1``.
    """

    def __init__(self, input_size, num_classes):
        super().__init__()

        # Stage 1: two stacked convolutions, then down-sampling + dropout.
        self.conv1 = nn.Conv1d(in_channels=1, out_channels=64, kernel_size=8)
        self.bn1 = nn.BatchNorm1d(64)
        self.conv2 = nn.Conv1d(64, 128, kernel_size=8)
        self.bn2 = nn.BatchNorm1d(128)
        self.pool1 = nn.MaxPool1d(kernel_size=4, padding=2)
        self.dropout1 = nn.Dropout(0.5)

        # Stage 2: one convolution, then down-sampling + dropout.
        self.conv3 = nn.Conv1d(128, 128, kernel_size=8)
        self.bn3 = nn.BatchNorm1d(128)
        self.pool2 = nn.MaxPool1d(kernel_size=4, padding=2)
        self.dropout2 = nn.Dropout(0.5)

        # Stage 3: channel-reducing convolution feeding the classifier head.
        self.conv4 = nn.Conv1d(128, 64, kernel_size=3, padding=2)
        self.bn4 = nn.BatchNorm1d(64)

        # Classifier head.
        self.fc1 = nn.Linear(512, 256)
        self.dropout3 = nn.Dropout(0.5)
        self.fc2 = nn.Linear(256, num_classes)

    def forward(self, x):
        """Return class logits of shape ``(batch, num_classes)`` for ``x`` of shape ``(batch, 1, length)``."""
        h = F.relu(self.bn1(self.conv1(x)))
        h = F.relu(self.bn2(self.conv2(h)))
        h = self.dropout1(self.pool1(h))

        h = F.relu(self.bn3(self.conv3(h)))
        h = self.dropout2(self.pool2(h))

        h = F.relu(self.bn4(self.conv4(h)))

        # Collapse (channels, positions) into one feature axis per sample.
        flat = h.flatten(start_dim=1)
        hidden = self.dropout3(F.relu(self.fc1(flat)))
        return self.fc2(hidden)

In [24]:
class EmotionLSTM(nn.Module):
    """Bidirectional LSTM classifier that reads out the last time step.

    Args:
        input_size: Number of features per time step.
        hidden_size: LSTM hidden size per direction.
        num_classes: Number of output logits.
        dropout_rate: Dropout probability, applied after batch norm and again
            after the first fully connected layer (same ``Dropout`` module).
    """

    def __init__(self, input_size, hidden_size=128, num_classes=8, dropout_rate=0.3):
        super().__init__()

        # Forward and backward directions are concatenated by the LSTM.
        bidirectional_width = hidden_size * 2

        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True, bidirectional=True)
        self.dropout = nn.Dropout(dropout_rate)

        self.bn = nn.BatchNorm1d(bidirectional_width)
        self.fc1 = nn.Linear(bidirectional_width, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, num_classes)

    def forward(self, x):
        """Return logits ``(batch, num_classes)`` for ``x`` of shape ``(batch, seq_len, input_size)``."""
        sequence_out, _ = self.lstm(x)

        # Selecting index -1 on the time axis also covers seq_len == 1, where
        # it yields the same tensor as squeezing that axis.
        last_step = sequence_out[:, -1, :]

        normalized = self.dropout(self.bn(last_step))
        hidden = self.dropout(self.relu(self.fc1(normalized)))
        return self.fc2(hidden)

In [25]:
# Number of emotion classes; equals len(EMOTION_LABELS).
NUM_EMOTIONS = 8 
# Display names used as `target_names` / tick labels by the evaluation helpers.
# NOTE(review): position i is treated as the name of integer class i, so this
# order must match the ids produced by the pickled label encoder used in main()
# -- that encoder's class order is not visible here; confirm.
EMOTION_LABELS = ['neutral', 'calm', 'happy', 'sad', 'angry', 'fearful', 'disgust', 'surprised']

class VotingEnsemble(nn.Module):
    """Soft-voting ensemble of a speech model and a facial model.

    Both wrapped models are frozen (``requires_grad = False`` on every
    parameter). The only trainable parameters are two scalars whose sigmoids
    act as the per-modality voting weights.

    Args:
        speech_model: Module mapping the speech input to class logits.
        facial_model: Module mapping the facial input to class logits.
    """

    def __init__(self, speech_model, facial_model):
        super().__init__()
        self.speech_model = speech_model
        self.facial_model = facial_model

        # Freeze both base models; only the mixing weights below can train.
        for base_model in (self.speech_model, self.facial_model):
            for weight in base_model.parameters():
                weight.requires_grad = False

        # Raw (pre-sigmoid) mixing weights, both starting at 0.5.
        self.speech_weight = nn.Parameter(torch.tensor(0.5))
        self.facial_weight = nn.Parameter(torch.tensor(0.5))

    def forward(self, speech_input, facial_input):
        """Return the weighted average of the two models' softmax probabilities.

        The result has the same shape as each model's logits and each row
        sums to 1, because the weights are normalized by their sum.
        """
        speech_logits = self.speech_model(speech_input)
        facial_logits = self.facial_model(facial_input)

        speech_gate = torch.sigmoid(self.speech_weight)
        facial_gate = torch.sigmoid(self.facial_weight)

        weighted_speech = torch.softmax(speech_logits, dim=1) * speech_gate
        weighted_facial = torch.softmax(facial_logits, dim=1) * facial_gate

        return (weighted_speech + weighted_facial) / (speech_gate + facial_gate)

In [26]:
def evaluate_model(model, speech_data, facial_data, labels, device='cuda'):
    """Evaluate a two-input model on one batch and collect standard metrics.

    Args:
        model: Module called as ``model(speech_data, facial_data)`` and
            returning per-class scores of shape ``(batch, num_classes)``.
        speech_data: Tensor fed to the model as first input.
        facial_data: Tensor fed to the model as second input.
        labels: Tensor of integer class ids, one per sample.
        device: Device the three tensors are moved to. The model itself is
            NOT moved; it must already live on ``device``.

    Returns:
        dict with keys ``'accuracy'``, ``'per_class_accuracy'``,
        ``'classification_report'`` (sklearn ``output_dict`` form) and
        ``'confusion_matrix'`` (``len(EMOTION_LABELS)`` square).
    """
    model.eval()

    speech_data = speech_data.to(device)
    facial_data = facial_data.to(device)
    labels = labels.to(device)

    with torch.no_grad():
        outputs = model(speech_data, facial_data)
        _, predicted = torch.max(outputs, 1)

    predicted_np = predicted.cpu().numpy()
    labels_np = labels.cpu().numpy()

    # Pin the label set explicitly. Without `labels=`, classification_report
    # raises ValueError whenever the batch does not contain every class
    # (number of observed classes != len(target_names)), and confusion_matrix
    # would shrink below 8x8 and no longer line up with EMOTION_LABELS.
    class_ids = list(range(len(EMOTION_LABELS)))

    acc = accuracy_score(labels_np, predicted_np)
    report = classification_report(
        labels_np, predicted_np,
        labels=class_ids,
        target_names=EMOTION_LABELS,
        output_dict=True,
        zero_division=0,  # same value as the default, without the warning
    )
    conf_matrix = confusion_matrix(labels_np, predicted_np, labels=class_ids)

    # NOTE: despite the key name, this is the support-weighted average
    # precision (a single float). Kept as-is so existing consumers of the
    # result dict keep working; true per-class figures are available in
    # `classification_report`.
    per_class_acc = report['weighted avg']['precision']

    return {
        'accuracy': acc,
        'per_class_accuracy': per_class_acc,
        'classification_report': report,
        'confusion_matrix': conf_matrix
    }

def visualize_confusion_matrix(conf_matrix, class_names=EMOTION_LABELS):
    """Draw a confusion matrix as an annotated heatmap, save it and show it.

    Args:
        conf_matrix: Square matrix of integer counts (rows = true class,
            columns = predicted class).
        class_names: Tick labels for both axes.

    Side effects:
        Writes ``confusion_matrix.png`` to the current working directory.
    """
    fig, ax = plt.subplots(figsize=(10, 8))
    sns.heatmap(
        conf_matrix,
        annot=True,
        fmt='d',
        cmap='Blues',
        xticklabels=class_names,
        yticklabels=class_names,
        ax=ax,
    )
    ax.set_xlabel('Predicted')
    ax.set_ylabel('True')
    ax.set_title('Confusion Matrix')
    fig.tight_layout()
    fig.savefig('confusion_matrix.png')
    plt.show()

def evaluate_individual_vs_ensemble(speech_model, facial_model, ensemble_model,
                                   speech_data, facial_data, device='cuda', labels=None):
    """Run the speech, facial and ensemble models on the same inputs.

    Args:
        speech_model: Module called as ``speech_model(speech_data)``.
        facial_model: Module called as ``facial_model(facial_data)``.
        ensemble_model: Module called as
            ``ensemble_model(speech_data, facial_data)``.
        speech_data: Tensor input for the speech branch.
        facial_data: Tensor input for the facial branch.
        device: Device the two input tensors are moved to. The models are not
            moved; they must already live on ``device``.
        labels: Optional integer class ids (tensor or array-like), one per
            sample. When omitted, only predictions are returned.

    Returns:
        * ``labels is None`` (default): a tuple
          ``(speech_preds, facial_preds, ensemble_preds)`` of NumPy arrays
          holding the arg-max class id per sample.
        * ``labels`` given: a dict keyed ``'speech'`` / ``'facial'`` /
          ``'ensemble'``, each mapping to
          ``{'accuracy', 'classification_report', 'confusion_matrix'}`` --
          the structure ``plot_comparison`` consumes.
    """
    speech_data = speech_data.to(device)
    facial_data = facial_data.to(device)

    speech_model.eval()
    facial_model.eval()
    ensemble_model.eval()

    with torch.no_grad():
        speech_outputs = speech_model(speech_data)
        facial_outputs = facial_model(facial_data)
        ensemble_outputs = ensemble_model(speech_data, facial_data)

        _, speech_preds = torch.max(speech_outputs, 1)
        _, facial_preds = torch.max(facial_outputs, 1)
        _, ensemble_preds = torch.max(ensemble_outputs, 1)

    speech_preds_np = speech_preds.cpu().numpy()
    facial_preds_np = facial_preds.cpu().numpy()
    ensemble_preds_np = ensemble_preds.cpu().numpy()

    if labels is None:
        return speech_preds_np, facial_preds_np, ensemble_preds_np

    labels_np = torch.as_tensor(labels).cpu().numpy()
    # Fix the label set so reports/matrices always cover every emotion, even
    # when some classes are absent from this batch (otherwise sklearn raises
    # on the target_names length mismatch).
    class_ids = list(range(len(EMOTION_LABELS)))

    def _metrics(preds_np):
        # Accuracy, per-class report and confusion matrix for one model.
        return {
            'accuracy': accuracy_score(labels_np, preds_np),
            'classification_report': classification_report(
                labels_np, preds_np,
                labels=class_ids,
                target_names=EMOTION_LABELS,
                output_dict=True,
                zero_division=0,
            ),
            'confusion_matrix': confusion_matrix(labels_np, preds_np, labels=class_ids),
        }

    return {
        'speech': _metrics(speech_preds_np),
        'facial': _metrics(facial_preds_np),
        'ensemble': _metrics(ensemble_preds_np),
    }

def plot_comparison(results):
    """Bar-chart the accuracy of the speech, facial and ensemble models.

    Args:
        results: Mapping with keys ``'speech'``, ``'facial'`` and
            ``'ensemble'``; each value must expose an ``'accuracy'`` entry
            in the 0-1 range (it is multiplied by 100 for display).

    Side effects:
        Writes ``model_comparison.png`` to the current working directory and
        shows the figure.
    """
    model_names = ['Speech Model', 'Facial Model', 'Ensemble Model']
    accuracy_pct = [results[key]['accuracy'] * 100 for key in ('speech', 'facial', 'ensemble')]

    fig, ax = plt.subplots(figsize=(10, 6))
    bar_container = ax.bar(model_names, accuracy_pct, color=['blue', 'green', 'red'])

    # Print each bar's value just above its top edge.
    for rect in bar_container:
        top = rect.get_height()
        ax.text(rect.get_x() + rect.get_width()/2., top + 0.5,
                f'{top:.2f}%', ha='center', va='bottom')

    ax.set_ylabel('Accuracy (%)')
    ax.set_title('Model Comparison')
    ax.set_ylim(0, 100)
    ax.grid(axis='y', linestyle='--', alpha=0.7)
    fig.tight_layout()
    fig.savefig('model_comparison.png')
    plt.show()

In [27]:
def extract_features_from_audio(video_path,
                                mean_path="C:/Nini/Capstone/src/Data Preprocessing/mean.npy",
                                std_path="C:/Nini/Capstone/src/Data Preprocessing/std.npy"):
    """Turn a video's audio track into a normalized log-mel feature tensor.

    Steps: extract the audio to a temporary WAV, load up to 3 s starting at
    0.5 s (44.1 kHz), resample to 16 kHz, compute a 128-band mel spectrogram
    (fmax 8 kHz), convert to dB, average over time, then standardize with the
    saved mean/std arrays.

    Args:
        video_path: Path of the video file to read.
        mean_path: ``.npy`` file with the normalization mean.
        std_path: ``.npy`` file with the normalization standard deviation.
            (assumes both broadcast against a length-128 vector -- TODO confirm)

    Returns:
        ``torch.float32`` tensor with two leading singleton axes added, i.e.
        ``(1, 1, 128)`` when the saved statistics are length-128 vectors.

    Raises:
        ValueError: If the video has no audio track.
    """
    import tempfile  # local: only this function needs it

    # Write the intermediate WAV into a private temp directory instead of a
    # fixed "dummy.wav" in the working directory, so concurrent/previous runs
    # cannot clobber each other and nothing is left behind.
    with tempfile.TemporaryDirectory() as tmp_dir:
        audio_path = os.path.join(tmp_dir, "audio.wav")

        video_clip = VideoFileClip(video_path)
        try:
            audio_clip = video_clip.audio
            if audio_clip is None:
                raise ValueError(f"No audio track found in video: {video_path}")
            audio_clip.write_audiofile(audio_path)
        finally:
            # Release the underlying reader processes / file handles.
            video_clip.close()

        X, sample_rate = librosa.load(audio_path, res_type='kaiser_fast', duration=3, sr=44100, offset=0.5)

    audio_resampled = soxr.resample(X, sample_rate, 16000)
    spectrogram = librosa.feature.melspectrogram(y=audio_resampled, sr=16000, n_mels=128, fmax=8000)
    db_spec = librosa.power_to_db(spectrogram)
    # Collapse the time axis: one mean dB value per mel band.
    log_spectrogram = np.mean(db_spec, axis=1)

    mean = np.load(mean_path)
    std = np.load(std_path)

    mean_tensor = torch.from_numpy(mean).float()
    std_tensor = torch.from_numpy(std).float()
    log_spectrogram = torch.from_numpy(log_spectrogram).float()
    log_spectrogram = (log_spectrogram - mean_tensor) / std_tensor

    # Add batch and channel axes for the Conv1d-based speech model.
    log_spectrogram = log_spectrogram.unsqueeze(0).unsqueeze(1).float()
    return log_spectrogram

In [28]:
def calculate_distance(p1, p2):
    """Return the norm of ``p1 - p2`` (the Euclidean distance for 1-D coordinate arrays)."""
    offset = p1 - p2
    return np.linalg.norm(offset)

def calculate_eye_aspect_ratio(eye):
    """Return the eye aspect ratio: mean of three vertical spans over the horizontal span.

    ``eye`` is indexed by position: the vertical spans are measured between
    entries (1, 7), (2, 6) and (3, 5); the horizontal span between entries
    0 and 4.
    """
    vertical_pairs = ((1, 7), (2, 6), (3, 5))
    vertical_total = sum(calculate_distance(eye[a], eye[b]) for a, b in vertical_pairs)
    horizontal = calculate_distance(eye[0], eye[4])
    return vertical_total / (3.0 * horizontal)

def calculate_mouth_aspect_ratio(outer, inner):
    """Return ``(mouth_aspect_ratio, openness)`` for the outer/inner lip contours.

    * mouth_aspect_ratio = outer span (entries 3-9) / outer span (entries 0-6)
    * openness           = inner span (entries 3-9) / outer span (entries 3-9)
    """
    outer_gap = calculate_distance(outer[3], outer[9])
    inner_gap = calculate_distance(inner[3], inner[9])
    mouth_width = calculate_distance(outer[0], outer[6])

    aspect_ratio = outer_gap / mouth_width
    openness = inner_gap / outer_gap
    return aspect_ratio, openness

def calculate_eyebrow_position(eyebrow, eye):
    """Return mean eye y minus mean eyebrow y (positive when the eyebrow's mean y is smaller)."""
    eye_mean_y = np.mean([point[1] for point in eye])
    eyebrow_mean_y = np.mean([point[1] for point in eyebrow])
    return eye_mean_y - eyebrow_mean_y

def extract_features_from_frame(frame):
    """Compute the ten geometric expression features for one frame.

    Args:
        frame: Array of landmark coordinates, indexed as ``frame[i] = (x, y, z)``
            and addressed with the module-level landmark index lists.

    Returns:
        list of 10 values, in this order: left eye aspect ratio, right eye
        aspect ratio, mouth aspect ratio, mouth openness, left eyebrow
        position, right eyebrow position, smile ratio, nose z-spread,
        eye asymmetry, eyebrow asymmetry. Position-like features are divided
        by the face's y-extent.
    """
    # Gather the landmark groups.
    left_eye, right_eye = frame[LEFT_EYE], frame[RIGHT_EYE]
    left_brow, right_brow = frame[LEFT_EYEBROW], frame[RIGHT_EYEBROW]
    lips_outer, lips_inner = frame[MOUTH_OUTER], frame[MOUTH_INNER]
    nose_points = frame[NOSE]

    # Vertical extent of all landmarks, used as the normalization scale.
    y_coords = frame[:, 1]
    face_height = np.max(y_coords) - np.min(y_coords)

    # Eyes and mouth.
    left_ear = calculate_eye_aspect_ratio(left_eye)
    right_ear = calculate_eye_aspect_ratio(right_eye)
    mar, openness = calculate_mouth_aspect_ratio(lips_outer, lips_inner)

    # Eyebrow height relative to the eye, scaled by face height.
    left_brow_pos = calculate_eyebrow_position(left_brow, left_eye) / face_height
    right_brow_pos = calculate_eyebrow_position(right_brow, right_eye) / face_height

    # Smile: how far the two reference lip points (entries 0 and 6) sit above
    # the mid-height of entries 3 and 9, averaged and scaled by face height.
    lip_mid_y = (lips_outer[3][1] + lips_outer[9][1]) / 2
    smile = ((lip_mid_y - lips_outer[0][1]) + (lip_mid_y - lips_outer[6][1])) / (2 * face_height)

    # Spread of the nose landmarks along z.
    nose_wrinkle = np.std([point[2] for point in nose_points])

    # Left/right asymmetry measures.
    eye_sym = abs(left_ear - right_ear)
    brow_sym = abs(left_brow_pos - right_brow_pos)

    return [
        left_ear, right_ear, mar, openness,
        left_brow_pos, right_brow_pos, smile, nose_wrinkle,
        eye_sym, brow_sym,
    ]

def summarize_video_features(features):
    """Collapse per-frame features into per-feature summary statistics.

    Args:
        features: 2-D array-like, one row per frame and one column per
            feature.

    Returns:
        Flat list with six values per feature column, in column order:
        mean, std, min, max, range (max - min) and delta (last - first frame).

    Raises:
        ValueError: If ``features`` contains no frames (e.g. no face was
            detected in the video) or is not at least 2-D. Previously this
            surfaced as an opaque ``IndexError: tuple index out of range``.
    """
    features = np.array(features)
    if features.ndim < 2 or features.shape[0] == 0:
        raise ValueError(
            "summarize_video_features needs at least one frame of features "
            f"(got array with shape {features.shape})"
        )

    summary = []
    for i in range(features.shape[1]):
        f = features[:, i]
        lo, hi = np.min(f), np.max(f)
        summary.extend([np.mean(f), np.std(f), lo, hi, hi - lo, f[-1] - f[0]])
    return summary

def extract_landmarks_from_video(path):
    """Extract summarized facial-expression features from a video file.

    Every frame is run through MediaPipe Face Mesh; for frames where a face
    is found, the landmark coordinates are converted to the per-frame feature
    vector, and all vectors are finally summarized per feature.

    Args:
        path: Path of the video file.

    Returns:
        The flat summary list produced by ``summarize_video_features``.

    Raises:
        IOError: If the video cannot be opened.
        ValueError: If no frame yielded usable face features.
    """
    cap = cv2.VideoCapture(path)
    if not cap.isOpened():
        cap.release()
        raise IOError(f"Could not open video: {path}")

    features = []
    try:
        with mp_face_mesh.FaceMesh(
            static_image_mode=False,
            max_num_faces=1,
            min_detection_confidence=0.5,
            min_tracking_confidence=0.5) as fm:
            while cap.isOpened():
                ret, img = cap.read()
                if not ret:
                    break
                # OpenCV decodes to BGR; convert before handing to MediaPipe.
                img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
                result = fm.process(img_rgb)
                if not result.multi_face_landmarks:
                    continue
                lm = result.multi_face_landmarks[0]
                points = np.array([[p.x, p.y, p.z] for p in lm.landmark])
                try:
                    features.append(extract_features_from_frame(points))
                except Exception:
                    # Best effort: skip frames whose features cannot be
                    # computed. `Exception` (not a bare except) so that
                    # KeyboardInterrupt / SystemExit still propagate.
                    continue
    finally:
        # Always free the capture, even if processing raised.
        cap.release()

    if not features:
        raise ValueError(f"No face landmarks could be extracted from video: {path}")
    return summarize_video_features(features)

In [29]:
def extract_features_from_face(
        video_path,
        scaler_path="C:/Nini/Capstone/CSV_Files/Facial data/New Facial Data/robust_scaler-2.pkl"):
    """Build the scaled facial-feature tensor for one video.

    Args:
        video_path: Path of the video file to analyze.
        scaler_path: Pickle file holding the fitted scaler whose
            ``transform`` is applied to the summary features.

    Returns:
        ``(tensor_face, face_features)`` where ``face_features`` is the
        unscaled ``(1, 60)`` NumPy array (10 features x 6 statistics) and
        ``tensor_face`` is the scaled ``float32`` tensor reshaped to
        ``(1, 1, 60)``.
    """
    video_summary = extract_landmarks_from_video(video_path)

    feature_names = [
        'left_eye_ar', 'right_eye_ar', 'mouth_ar', 'mouth_openness',
        'left_eyebrow_pos', 'right_eyebrow_pos', 'smile_ratio', 'nose_wrinkle',
        'eye_symmetry', 'eyebrow_symmetry'
    ]
    stats = ['mean', 'std', 'min', 'max', 'range', 'delta']
    # Column order mirrors summarize_video_features: all stats of feature 0,
    # then all stats of feature 1, ...
    colnames = [f"{f}_{s}" for f in feature_names for s in stats]

    # Single-row frame; also validates that the summary has the expected width.
    feats = pd.DataFrame([video_summary], columns=colnames)
    face_features = feats.values

    # SECURITY: pickle.load executes arbitrary code from the file -- only
    # point `scaler_path` at trusted, locally produced artifacts.
    with open(scaler_path, 'rb') as scaler_file:
        scaler = pickle.load(scaler_file)
    face_features_scaled = scaler.transform(face_features)

    n_samples, n_features = face_features_scaled.shape
    # Insert a length-1 sequence axis: (batch, seq_len=1, features).
    tensor_face = torch.tensor(face_features_scaled, dtype=torch.float32).reshape((n_samples, 1, n_features))
    return tensor_face, face_features

In [31]:
def main(video_path="C:/Nini/Capstone/Test/1001_TSI_ANG_XX.mp4"):
    """Predict the emotion in one video with the speech, facial and ensemble models.

    Extracts audio and facial features from ``video_path``, loads the two
    pretrained models, wraps them in a ``VotingEnsemble`` and prints the class
    each of the three models predicts.

    Args:
        video_path: Video file to classify.
    """
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    speech_features = extract_features_from_audio(video_path)
    face_features, X_face = extract_features_from_face(video_path)

    # NOTE(review): `speech_features.shape[1]` is the channel axis (size 1);
    # CNNModel currently ignores `input_size`, so this has no effect.
    speech_model = CNNModel(input_size=speech_features.shape[1], num_classes=NUM_EMOTIONS).to(device)
    facial_model = EmotionLSTM(input_size=X_face.shape[1]).to(device)

    # map_location keeps loading working on CPU-only machines even if the
    # checkpoints were saved from CUDA tensors (the device above may fall
    # back to 'cpu').
    speech_model.load_state_dict(torch.load(
        'C:/Nini/Capstone/Models/DataAugmentation_cnn_model_new_final_1.pth',
        map_location=device, weights_only=True))
    facial_model.load_state_dict(torch.load(
        'C:/Nini/Capstone/Models/emotion_lstm_model-7.pth',
        map_location=device, weights_only=True))

    ensemble_model = VotingEnsemble(speech_model, facial_model).to(device)

    speech_results, facial_results, ensemble_results = evaluate_individual_vs_ensemble(
        speech_model, facial_model, ensemble_model,
        speech_features, face_features, device
    )

    # SECURITY: pickle.load executes arbitrary code from the file -- only load
    # trusted, locally produced artifacts.
    with open("C:/Nini/Capstone/src/Model_training/label_encoder.pkl", 'rb') as f:
        label_encoder = pickle.load(f)

    # Map integer class ids back to emotion names.
    speech_emotion = label_encoder.inverse_transform(speech_results)
    facial_emotion = label_encoder.inverse_transform(facial_results)
    ensemble_emotion = label_encoder.inverse_transform(ensemble_results)
    print(f'The class predicted by the speech model is: {speech_emotion}')
    print(f'The class predicted by the facial model is: {facial_emotion}')
    print(f'The class predicted by the ensemble model is: {ensemble_emotion}')

# Run the single-video demo when this cell/script is executed directly
# (the guard keeps it from running if the code is imported as a module).
if __name__ == "__main__":
    main()

{'video_found': True, 'audio_found': True, 'metadata': {'major_brand': 'isom', 'minor_version': '512', 'compatible_brands': 'isomiso2avc1mp41', 'encoder': 'Lavf61.7.100'}, 'inputs': [{'streams': [{'input_number': 0, 'stream_number': 0, 'stream_type': 'video', 'language': None, 'default': True, 'size': [480, 360], 'bitrate': 233, 'fps': 29.97002997002997, 'codec_name': 'h264', 'profile': '(High)', 'metadata': {'Metadata': '', 'handler_name': 'VideoHandler', 'vendor_id': '[0][0][0][0]', 'encoder': 'Lavc61.19.100 libx264'}}, {'input_number': 0, 'stream_number': 1, 'stream_type': 'audio', 'language': None, 'default': True, 'fps': 44100, 'bitrate': 130, 'metadata': {'Metadata': '', 'handler_name': 'SoundHandler', 'vendor_id': '[0][0][0][0]'}}], 'input_number': 0}], 'duration': 2.5, 'bitrate': 375, 'start': 0.0, 'default_video_input_number': 0, 'default_video_stream_number': 0, 'video_codec_name': 'h264', 'video_profile': '(High)', 'video_size': [480, 360], 'video_bitrate': 233, 'video_fps':

                                                       

MoviePy - Done.
The class predicted by the speech model is: ['angry']
The class predicted by the facial model is: ['disgust']
The class predicted by the ensemble model is: ['disgust']
