In [22]:
import IPython.display as ipd
from moviepy.video.io.VideoFileClip import VideoFileClip
import librosa
import librosa.display
import pandas as pd
import numpy as np
import os
import soxr
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import pickle
import os
import cv2
import mediapipe as mp
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import RobustScaler
import matplotlib.pyplot as plt
from tqdm import tqdm

In [23]:
class CNNModel(nn.Module):
    def __init__(self, input_size, num_classes):
        super(CNNModel, self).__init__()
        self.conv1 = nn.Conv1d(in_channels=1, out_channels=64, kernel_size=8)
        self.bn1 = nn.BatchNorm1d(64)
        self.conv2 = nn.Conv1d(64, 128, kernel_size=8)
        self.bn2 = nn.BatchNorm1d(128)
        self.pool1 = nn.MaxPool1d(kernel_size=4, padding=2)
        self.dropout1 = nn.Dropout(0.5)
        self.conv3 = nn.Conv1d(128, 128, kernel_size=8)
        self.bn3 = nn.BatchNorm1d(128)
        self.pool2 = nn.MaxPool1d(kernel_size=4, padding=2)
        self.dropout2 = nn.Dropout(0.5)
        self.conv4 = nn.Conv1d(128, 64, kernel_size=3, padding=2)
        self.bn4 = nn.BatchNorm1d(64)
        self.fc1 = nn.Linear(512, 256)
        self.dropout3 = nn.Dropout(0.5)
        self.fc2 = nn.Linear(256, num_classes)


    def forward(self, x):
        x = F.relu(self.bn1(self.conv1(x)))
        x = F.relu(self.bn2(self.conv2(x)))
        x = self.pool1(x)
        x = self.dropout1(x)
        x = F.relu(self.bn3(self.conv3(x)))
        x = self.pool2(x)
        x = self.dropout2(x)
        x = F.relu(self.bn4(self.conv4(x)))
        x = x.view(x.size(0), -1)
        x = F.relu(self.fc1(x))
        x = self.dropout3(x)
        x = self.fc2(x)
        return x

In [24]:
class EmotionLSTM(nn.Module):
    def __init__(self, input_size, hidden_size=128, num_classes=8, dropout_rate=0.3):
        super(EmotionLSTM, self).__init__()
        
        # Simplified architecture - one bidirectional LSTM
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True, bidirectional=True)
        self.dropout = nn.Dropout(dropout_rate)
        
        # Simple dense layers
        self.bn = nn.BatchNorm1d(hidden_size*2)
        self.fc1 = nn.Linear(hidden_size*2, hidden_size)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_size, num_classes)
        
    def forward(self, x):
        # LSTM layer
        lstm_out, _ = self.lstm(x)
        
        # Get last time step or squeeze
        if lstm_out.size(1) == 1:
            lstm_out = lstm_out.squeeze(1)
        else:
            lstm_out = lstm_out[:, -1, :]
        
        # Apply batch normalization
        lstm_out = self.bn(lstm_out)
        lstm_out = self.dropout(lstm_out)
        
        # Dense layers
        x = self.fc1(lstm_out)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.fc2(x)
        
        return x

In [25]:
video_path = "C:/Nini/Data/Data/Actor_03/01-01-05-02-02-01-03.mp4"

In [26]:
video_clip = VideoFileClip(video_path)
audio_clip = video_clip.audio
audio_path = f"C:/Nini/Capstone/Test/{video_path.split('/')[5].split('.')[0]}.wav"
audio_clip.write_audiofile(audio_path)

{'video_found': True, 'audio_found': True, 'metadata': {'major_brand': 'M4V', 'minor_version': '1', 'compatible_brands': 'M4V mp42isom', 'creation_time': '2013-03-21T00:12:56.000000Z'}, 'inputs': [{'streams': [{'input_number': 0, 'stream_number': 0, 'stream_type': 'video', 'language': 'eng', 'default': True, 'size': [1280, 720], 'bitrate': 9986, 'fps': 29.97002997002997, 'codec_name': 'h264', 'profile': '(High)', 'metadata': {'Metadata': '', 'creation_time': '2013-03-21T00:12:56.000000Z', 'handler_name': 'Mainconcept MP4 Video Media Handler', 'vendor_id': '[0][0][0][0]', 'encoder': 'AVC Coding'}}, {'input_number': 0, 'stream_number': 1, 'stream_type': 'audio', 'language': 'eng', 'default': True, 'fps': 48000, 'bitrate': 189, 'metadata': {'Metadata': '', 'creation_time': '2013-03-21T00:12:56.000000Z', 'handler_name': 'Mainconcept MP4 Sound Media Handler', 'vendor_id': '[0][0][0][0]'}}], 'input_number': 0}], 'duration': 4.24, 'bitrate': 10232, 'start': 0.0, 'default_video_input_number': 

                                                       

MoviePy - Done.




In [27]:
x, sr = librosa.load(audio_path)
ipd.Audio(audio_path)

In [28]:
X, sample_rate = librosa.load(audio_path,res_type='kaiser_fast',duration=3,sr=44100,offset=0.5)
audio_resampled = soxr.resample(X, sample_rate, 16000)
spectrogram = librosa.feature.melspectrogram(y=audio_resampled,sr=16000,n_mels=128,fmax=8000)
db_spec = librosa.power_to_db(spectrogram)
log_spectrogram = np.mean(db_spec,axis=1)

In [29]:
mean = np.load("C:/Nini/Capstone/src/Data Preprocessing/mean.npy")
std = np.load("C:/Nini/Capstone/src/Data Preprocessing/std.npy")

In [30]:
mean_tensor = torch.from_numpy(mean).float()
std_tensor = torch.from_numpy(std).float()

In [31]:
# mean[0]

In [32]:
# log_spectrogram = (log_spectrogram - mean[0])/std[0]

In [33]:
# log_spectrogram = np.array(log_spectrogram)

In [34]:
type(log_spectrogram)

numpy.ndarray

In [35]:
log_spectrogram = torch.from_numpy(log_spectrogram).float()
log_spectrogram = (log_spectrogram - mean_tensor) / std_tensor

In [36]:
print("log_spectrogram shape:", log_spectrogram.shape)
print("mean_tensor shape:", mean_tensor.shape)
print("std_tensor shape:", std_tensor.shape)

log_spectrogram shape: torch.Size([128])
mean_tensor shape: torch.Size([128])
std_tensor shape: torch.Size([128])


In [37]:
std_tensor

tensor([10.5004,  8.1579,  7.5787,  8.7747,  9.7786,  9.1531,  8.6085,  7.8184,
         7.6558,  7.9041,  8.3458,  8.8826,  9.0084,  9.0105,  8.7831,  8.4749,
         8.4480,  8.4919,  8.6689,  8.8820,  8.9406,  9.0338,  9.2556,  9.3027,
         9.1692,  9.2803,  9.3588,  9.3791,  9.5350,  9.5976,  9.5871,  9.5901,
         9.4579,  9.5016,  9.5009,  9.4823,  9.5634,  9.7926,  9.9066,  9.9346,
        10.0990,  9.9851,  9.9400, 10.0021, 10.1168, 10.2085, 10.3214, 10.1942,
        10.0825, 10.0040, 10.0280, 10.0928, 10.1213, 10.1145, 10.1310, 10.0923,
        10.0881, 10.1279, 10.0797, 10.0679, 10.0969, 10.1493, 10.1873, 10.2158,
        10.2929, 10.3212, 10.3677, 10.3788, 10.4219, 10.3231, 10.2738, 10.3355,
        10.3254, 10.2910, 10.2751, 10.3326, 10.3542, 10.4280, 10.4436, 10.4775,
        10.5535, 10.6880, 10.7805, 10.8352, 10.8247, 10.8508, 10.8958, 10.8945,
        10.9203, 10.8739, 10.8958, 10.8087, 10.8048, 10.8234, 10.7753, 10.7433,
        10.6784, 10.6398, 10.6093, 10.56

In [38]:
log_spectrogram

tensor([2.4728, 1.9737, 2.4532, 2.0629, 1.6781, 1.9771, 2.3649, 2.8323, 3.3906,
        3.3443, 3.2443, 3.1908, 2.8184, 2.6653, 2.5085, 2.5703, 2.7404, 2.8281,
        2.7760, 2.6967, 2.7529, 2.6522, 2.7785, 2.8131, 2.5531, 2.5534, 2.8503,
        2.7366, 2.6551, 2.6157, 2.7405, 2.8119, 3.0640, 3.0309, 2.9365, 2.7861,
        2.6752, 2.7367, 2.6131, 2.5528, 2.5663, 2.7521, 2.6987, 2.6508, 2.6040,
        2.7374, 2.6271, 2.5725, 2.7359, 2.6779, 2.7443, 2.7300, 2.6985, 2.6706,
        2.4988, 2.5296, 2.6124, 2.5700, 2.5608, 2.5293, 2.5356, 2.4988, 2.4304,
        2.3286, 2.2575, 2.2451, 2.3606, 2.3855, 2.4035, 2.3990, 2.2771, 2.3406,
        2.4321, 2.4760, 2.5632, 2.4876, 2.3854, 2.3706, 2.3436, 2.3744, 2.2946,
        2.2545, 2.1881, 2.0584, 2.1657, 2.1668, 2.1765, 2.2207, 2.3439, 2.2796,
        2.4288, 2.4096, 2.4304, 2.4362, 2.3717, 2.3920, 2.3370, 2.3566, 2.3520,
        2.3868, 2.4504, 2.5838, 2.6362, 2.6408, 2.6069, 2.7070, 2.7159, 2.8198,
        2.8896, 2.7947, 2.7451, 2.6962, 

In [39]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
num_classes = 8  # Update this based on your dataset
model = CNNModel(input_size=128, num_classes=num_classes)  # Update input_size accordingly
model.load_state_dict(torch.load("C:/Nini/Capstone/Models/DataAugmentation_cnn_model_new_final_1.pth", map_location=device))  # Load saved weights
model.to(device)
model.eval()  # Set to evaluation mode
print("Model Loaded Successfully")

Model Loaded Successfully


In [40]:
def predict(model, input_tensor, device):
    model.eval()  # Set model to evaluation mode
    input_tensor = input_tensor.unsqueeze(0).unsqueeze(1).float().to(device)
    
    with torch.no_grad():  # Disable gradient calculations for inference
        output = model(input_tensor)
        predicted_class = torch.argmax(output, dim=1)  # Get predicted class
        
    return predicted_class.cpu().numpy()

In [41]:
predictions = predict(model, log_spectrogram, device)
print("Predicted Labels:", predictions)

Predicted Labels: [0]


In [42]:
with open("C:/Nini/Capstone/src/Model_training/label_encoder.pkl", 'rb') as f:
        label_encoder = pickle.load(f)

In [43]:
predicted_class = label_encoder.inverse_transform(predictions)

In [44]:
predicted_class

array(['angry'], dtype=object)

In [45]:
mp_face_mesh = mp.solutions.face_mesh
face_mesh = mp_face_mesh.FaceMesh(
    static_image_mode=False,
    max_num_faces=1,
    min_detection_confidence=0.5,
    min_tracking_confidence=0.5
)

# Define key landmark indices for emotion-relevant features
LEFT_EYE = [33, 133, 160, 159, 158, 144, 153, 154, 155, 173, 157, 163]
RIGHT_EYE = [362, 385, 387, 388, 466, 263, 249, 390, 373, 374, 380, 381]
LEFT_EYEBROW = [70, 63, 105, 66, 107]
RIGHT_EYEBROW = [336, 296, 334, 293, 300]
MOUTH_OUTER = [61, 185, 40, 39, 37, 0, 267, 269, 270, 409, 291, 375, 321, 405, 314, 17, 84, 181, 91, 146]
MOUTH_INNER = [78, 191, 80, 81, 82, 13, 312, 311, 310, 415, 308, 324, 318, 402, 317, 14, 87, 178, 88, 95]
NOSE = [1, 2, 3, 4, 5, 6, 168, 197, 195, 5, 4, 98, 97, 2, 326, 327]

In [46]:
def calculate_distance(p1, p2):
    return np.linalg.norm(p1 - p2)

def calculate_eye_aspect_ratio(eye):
    v1 = calculate_distance(eye[1], eye[7])
    v2 = calculate_distance(eye[2], eye[6])
    v3 = calculate_distance(eye[3], eye[5])
    h = calculate_distance(eye[0], eye[4])
    return (v1 + v2 + v3) / (3.0 * h)

def calculate_mouth_aspect_ratio(outer, inner):
    outer_v = calculate_distance(outer[3], outer[9])
    inner_v = calculate_distance(inner[3], inner[9])
    h = calculate_distance(outer[0], outer[6])
    return outer_v / h, inner_v / outer_v

def calculate_eyebrow_position(eyebrow, eye):
    return np.mean([p[1] for p in eye]) - np.mean([p[1] for p in eyebrow])

def extract_features_from_frame(frame):
    left_eye = frame[LEFT_EYE]
    right_eye = frame[RIGHT_EYE]
    left_eyebrow = frame[LEFT_EYEBROW]
    right_eyebrow = frame[RIGHT_EYEBROW]
    mouth_outer = frame[MOUTH_OUTER]
    mouth_inner = frame[MOUTH_INNER]
    nose = frame[NOSE]

    width = np.max(frame[:, 0]) - np.min(frame[:, 0])
    height = np.max(frame[:, 1]) - np.min(frame[:, 1])

    left_ear = calculate_eye_aspect_ratio(left_eye)
    right_ear = calculate_eye_aspect_ratio(right_eye)
    mar, openness = calculate_mouth_aspect_ratio(mouth_outer, mouth_inner)
    l_eyebrow_pos = calculate_eyebrow_position(left_eyebrow, left_eye) / height
    r_eyebrow_pos = calculate_eyebrow_position(right_eyebrow, right_eye) / height
    mouth_center_y = (mouth_outer[3][1] + mouth_outer[9][1]) / 2
    smile = ((mouth_center_y - mouth_outer[0][1]) + (mouth_center_y - mouth_outer[6][1])) / (2 * height)
    nose_wrinkle = np.std([p[2] for p in nose])
    eye_sym = abs(left_ear - right_ear)
    brow_sym = abs(l_eyebrow_pos - r_eyebrow_pos)

    return [left_ear, right_ear, mar, openness, l_eyebrow_pos, r_eyebrow_pos, smile, nose_wrinkle, eye_sym, brow_sym]

def summarize_video_features(features):
    features = np.array(features)
    summary = []
    for i in range(features.shape[1]):
        f = features[:, i]
        summary.extend([np.mean(f), np.std(f), np.min(f), np.max(f), np.max(f)-np.min(f), f[-1] - f[0]])
    return summary

def extract_landmarks_from_video(path):
    cap = cv2.VideoCapture(path)
    features = []
    with mp_face_mesh.FaceMesh(
        static_image_mode=False,
        max_num_faces=1,
        min_detection_confidence=0.5,
        min_tracking_confidence=0.5) as fm:
        while cap.isOpened():
            ret, img = cap.read()
            if not ret: break
            img_rgb = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
            result = fm.process(img_rgb)
            if result.multi_face_landmarks:
                lm = result.multi_face_landmarks[0]
                points = np.array([[p.x, p.y, p.z] for p in lm.landmark])
                try:
                    features.append(extract_features_from_frame(points))
                except:
                    continue
    cap.release()
    return summarize_video_features(features)

# def process_dataset(path):
#     df = []
#     emo_map = {'01': 'neutral', '02': 'calm', '03': 'happy', '04': 'sad', '05': 'angry', '06': 'fearful', '07': 'disgust', '08': 'surprised'}
#     for root, _, files in os.walk(path):
#         for f in files:
#             if f.endswith(".mp4"):
#                 emo_id = f.split("-")[2]
#                 label = emo_map.get(emo_id, 'unknown')
#                 file_path = os.path.join(root, f)
#                 print(f"Processing {f} → {label}")
#                 try:
#                     feats = extract_landmarks_from_video(file_path)
#                     df.append(feats + [file_path, label])
#                 except:
#                     print(f"Error in {f}")
#     columns = [
#         'left_eye_ar', 'right_eye_ar', 'mouth_ar', 'mouth_openness',
#         'left_eyebrow_pos', 'right_eyebrow_pos', 'smile_ratio', 'nose_wrinkle',
#         'eye_symmetry', 'eyebrow_symmetry'
#     ]
#     stats = ['mean', 'std', 'min', 'max', 'range', 'delta']
#     colnames = [f"{f}_{s}" for f in columns for s in stats] + ['video_path', 'emotion']
#     return pd.DataFrame(df, columns=colnames)

In [47]:
df = []
landmarks = extract_landmarks_from_video(video_path)
df.append(landmarks)
columns = [
        'left_eye_ar', 'right_eye_ar', 'mouth_ar', 'mouth_openness',
        'left_eyebrow_pos', 'right_eyebrow_pos', 'smile_ratio', 'nose_wrinkle',
        'eye_symmetry', 'eyebrow_symmetry'
    ]
stats = ['mean', 'std', 'min', 'max', 'range', 'delta']
colnames = [f"{f}_{s}" for f in columns for s in stats]
feats = pd.DataFrame(df,columns=colnames)

In [49]:
print(landmarks)

[0.605584340763062, 0.02261802498773898, 0.5597053352415211, 0.6427292081911665, 0.08302387294964542, 0.01394572323560117, 0.4582474914944238, 0.02061480836915212, 0.4165953069494294, 0.49134117761763546, 0.07474587066820604, 0.012320292374403019, 0.9407622010172192, 0.06511001892958869, 0.781777007501517, 1.0438612925773703, 0.26208428507585324, -0.013917939266933632, 0.7345717334428171, 0.02857223523082702, 0.6772546840518582, 0.8018708805108553, 0.12461619645899713, -0.019125688370075666, 0.09659519615047737, 0.0032454663854460473, 0.08861369621841413, 0.10670736301326389, 0.018093666794849764, 0.004329691043851183, 0.09210688740929522, 0.0032821644894732746, 0.08590865283430123, 0.10569473517052076, 0.019786082336219535, -0.0008366001134981343, -0.004104558093250184, 0.0055638064108529525, -0.012108494048790433, 0.00417486005298086, 0.016283354101771293, 0.0014505949796739701, 0.019571816332341726, 0.000883350052003349, 0.01783318654274165, 0.021322640205341952, 0.00348945366260030

In [48]:
feats

Unnamed: 0,left_eye_ar_mean,left_eye_ar_std,left_eye_ar_min,left_eye_ar_max,left_eye_ar_range,left_eye_ar_delta,right_eye_ar_mean,right_eye_ar_std,right_eye_ar_min,right_eye_ar_max,...,eye_symmetry_min,eye_symmetry_max,eye_symmetry_range,eye_symmetry_delta,eyebrow_symmetry_mean,eyebrow_symmetry_std,eyebrow_symmetry_min,eyebrow_symmetry_max,eyebrow_symmetry_range,eyebrow_symmetry_delta
0,0.605584,0.022618,0.559705,0.642729,0.083024,0.013946,0.458247,0.020615,0.416595,0.491341,...,0.12746,0.185234,0.057774,0.001625,0.004488,0.002521,0.000465,0.008777,0.008312,0.005166


In [83]:
face_featues = feats.values

In [None]:
with open("C:/Nini/Capstone/CSV_Files/Facial data/New Facial Data/robust_scaler-2.pkl", 'rb') as r:
        scaler = pickle.load(r)

In [85]:
scaler = RobustScaler()

In [86]:
face_features_scaled = scaler.fit_transform(face_featues)
tensor_face = torch.tensor(face_features_scaled, dtype=torch.float32).reshape((face_features_scaled.shape[0], 1, face_features_scaled.shape[1]))

In [87]:
tensor_face.shape

torch.Size([1, 1, 60])

In [88]:
tensor_face

tensor([[[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
          0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.]]])

In [89]:
facial_model = EmotionLSTM(input_size=face_featues.shape[1]).to(device)
facial_model.load_state_dict(torch.load('C:/Nini/Capstone/Models/emotion_lstm_model-7.pth', weights_only=True))

<All keys matched successfully>

In [90]:
def predictfacial(model, input_tensor, device):
    model.eval()  # Set model to evaluation mode
    input_tensor = input_tensor.to(device)
    
    with torch.no_grad():  # Disable gradient calculations for inference
        output = model(input_tensor)
        predicted_class = torch.argmax(output, dim=1)  # Get predicted class
        
    return predicted_class.cpu().numpy()

In [91]:
predictions = predictfacial(facial_model, tensor_face, device)
print("Predicted Labels:", predictions)

Predicted Labels: [3]


In [92]:
with open("C:/Nini/Capstone/src/Model_training/label_encoder.pkl", 'rb') as f:
        label_encoder = pickle.load(f)
predicted_class = label_encoder.inverse_transform(predictions)
predicted_class

array(['fearful'], dtype=object)