### Pair the Emotion Prediction Model with Musical Logic

In [None]:
# System and file handling
import os

# Numerical and Data Handling Libraries
import numpy as np
import pandas as pd

# Machine Learning Libraries
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier
from sklearn.manifold import TSNE
import umap.umap_ as umap
from sklearn.model_selection import train_test_split
import joblib

# Deep Learning Libraries
from tensorflow.keras.models import load_model
import tensorflow as tf

# Audio Processing Libraries
import librosa
import soundfile as sf
import librosa.display

# Computer Vision Libraries
import cv2
import dlib

# Data Visualization Libraries
import seaborn as sns
import matplotlib.pyplot as plt

# Real-time Multimedia Handling Libraries
import pygame

# Collections for Data Structures
from collections import deque

from IPython.display import Audio as ipd
from IPython.display import Audio

import librosa
import librosa.display
def show_audio(y, sr, normalise=True):
    """Plot the waveform of an audio signal and return a notebook player for it.

    Args:
        y: Audio samples, passed unchanged to ``librosa.display.waveshow``
            and to ``IPython.display.Audio``.
        sr: Sampling rate of ``y``.
        normalise: Forwarded to ``Audio`` as its ``normalize`` argument.

    Returns:
        An ``IPython.display.Audio`` widget created with ``autoplay=True``.
    """
    # Draw the waveform on a dedicated 10x4 inch figure.
    _, waveform_axes = plt.subplots(figsize=(10, 4))
    librosa.display.waveshow(y, sr=sr, ax=waveform_axes)
    waveform_axes.set_title('Waveform')
    plt.show()

    # Hand the player back so the notebook renders it as the cell output.
    player = Audio(y, rate=sr, normalize=normalise, autoplay=True)
    return player

In [None]:
# Manipulate song choice
# Initialize pygame mixer
pygame.mixer.init()

# Folder holding the trained models and detector data. Every asset path below
# is built from this one constant, so moving the project needs a single edit.
PROJECT_DIR = '/Users/nixi/Desktop/Final-thesis-folder-24/AI-for-Media-project-23-24'

# Load pre-trained models
# NOTE: joblib.load unpickles its input, so only point it at trusted files.
rf_model = joblib.load(os.path.join(PROJECT_DIR, 'random_forest_model.pkl'))
scaler = joblib.load(os.path.join(PROJECT_DIR, 'scaler.pkl'))
cnn_model = load_model(os.path.join(PROJECT_DIR, 'model.h5'))

# Load Haar Cascade for face detection
face_cascade = cv2.CascadeClassifier(
    os.path.join(PROJECT_DIR, 'haar', 'haarcascade_frontalface_alt2.xml')
)
if face_cascade.empty():
    # CascadeClassifier does not raise on a bad path; it just stays empty.
    raise IOError("Failed to load Haar Cascade file. Please check the file path.")

# Load Dlib's shape predictor for facial landmarks
shape_predictor = dlib.shape_predictor(
    os.path.join(PROJECT_DIR, 'shape_predictor_68_face_landmarks.dat')
)

# Define emotions map (class index -> label shown on screen and used for music)
emotion_map = {0: 'angry', 1: 'happy', 2: 'sad', 3: 'neutral'}

# Emotion buffer for smoothing predictions
emotion_buffer = deque(maxlen=20)  # Keep last 20 predictions

# Path to neutral song (relative to the current working directory)
neutral_song_path = 'Music/neutral/aklim_hep_sende.mp3'

# Load and play the initial song
pygame.mixer.music.load(neutral_song_path)
pygame.mixer.music.play()

# Change the song based on the emotion using code adapted from: https://librosa.org/doc/main/generated/librosa.effects.time_stretch.html
def change_music_based_on_emotion(current_emotion):
    """Re-render the neutral song for an emotion and start playing it.

    The neutral song is loaded with librosa, time-stretched according to the
    emotion ('happy' 1.2x, 'sad' 0.8x, 'angry' 1.3x plus a pitch shift of
    5 steps), written to ``modified_song.wav`` and played through
    ``pygame.mixer.music``. Any other emotion value (e.g. 'neutral') leaves
    the audio unmodified.

    Args:
        current_emotion: Emotion label selecting the transformation.
    """
    # Tempo multiplier per emotion; labels not listed keep the original tempo.
    tempo_rates = {'happy': 1.2, 'sad': 0.8, 'angry': 1.3}

    y, sr = librosa.load(neutral_song_path)
    rate = tempo_rates.get(current_emotion)
    if rate is not None:
        y = librosa.effects.time_stretch(y, rate=rate)
    if current_emotion == 'angry':
        y = librosa.effects.pitch_shift(y, sr=sr, n_steps=5)

    # Save and play the modified song using documentation from: https://www.pygame.org/docs/ref/music.html
    # Stop and release the current track BEFORE writing: after the first
    # emotion change the mixer is streaming from 'modified_song.wav' itself,
    # and overwriting a file that is still loaded can glitch playback or fail.
    pygame.mixer.music.fadeout(1000)  # Fade out current music
    # music.unload() only exists in pygame >= 2.0.0, so look it up defensively.
    unload_music = getattr(pygame.mixer.music, 'unload', None)
    if unload_music is not None:
        unload_music()

    sf.write('modified_song.wav', y, sr)
    pygame.mixer.music.load('modified_song.wav')  # Load new song
    pygame.mixer.music.play()

def _landmark_distance_features(landmarks):
    """Return the 1x9 row of landmark distances fed to the Random Forest.

    Args:
        landmarks: Result of calling ``shape_predictor`` on a face; only its
            ``part(i).x`` / ``part(i).y`` accessors are used.

    Returns:
        A ``(1, 9)`` NumPy array in the order: eye_distance, nose_to_mouth,
        nose_to_left_eye, nose_to_right_eye, mouth_width, nose_to_chin,
        left_eye_to_chin, right_eye_to_chin, mouth_to_chin.
    """
    def point(index):
        part = landmarks.part(index)
        return np.array([part.x, part.y])

    left_eye = point(36)
    right_eye = point(45)
    nose_tip = point(30)
    mouth_left = point(48)
    mouth_right = point(54)
    chin = point(8)
    mouth_center = (mouth_left + mouth_right) / 2

    # Order matters: it must match the feature order used during training.
    return np.array([[
        np.linalg.norm(left_eye - right_eye),      # eye_distance
        np.linalg.norm(nose_tip - mouth_center),   # nose_to_mouth
        np.linalg.norm(nose_tip - left_eye),       # nose_to_left_eye
        np.linalg.norm(nose_tip - right_eye),      # nose_to_right_eye
        np.linalg.norm(mouth_left - mouth_right),  # mouth_width
        np.linalg.norm(nose_tip - chin),           # nose_to_chin
        np.linalg.norm(left_eye - chin),           # left_eye_to_chin
        np.linalg.norm(right_eye - chin),          # right_eye_to_chin
        np.linalg.norm(mouth_center - chin),       # mouth_to_chin
    ]])


# Initialize video capture
cap = cv2.VideoCapture(0)

current_emotion = 'neutral'
prev_emotion = 'neutral'

# debugged with gpt, original code was adapted from; https://roboflow.com/use-opencv/read-video-streams-with-cv2-videocapture
# The try/finally guarantees the webcam, the OpenCV window and the mixer are
# released even when a frame fails mid-processing or the cell is interrupted.
try:
    if not cap.isOpened():
        # Fail loudly, like the Haar cascade check, instead of exiting silently.
        raise IOError("Failed to open the webcam. Please check the camera connection.")

    while True:
        ret, frame = cap.read()
        if not ret:
            break

        # Convert to grayscale for face detection
        gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
        faces = face_cascade.detectMultiScale(gray, scaleFactor=1.1, minNeighbors=5)

        # NOTE(review): every detected face feeds the same emotion buffer, so
        # several faces in view are smoothed together -- confirm this is intended.
        for face in faces:
            # detectMultiScale yields NumPy integers; dlib.rectangle expects
            # plain Python ints, so convert once up front.
            x, y, w, h = (int(value) for value in face)

            # Draw bounding box around the face
            cv2.rectangle(frame, (x, y), (x + w, y + h), (36, 255, 12), 2)

            # Get the landmarks
            dlib_rect = dlib.rectangle(x, y, x + w, y + h)
            landmarks = shape_predictor(gray, dlib_rect)

            # Prepare the feature vector for Random Forest (9 features used during training)
            features = _landmark_distance_features(landmarks)
            features_scaled = scaler.transform(features)

            # Predict the emotion with Random Forest
            rf_emotion_label = rf_model.predict(features_scaled)[0]
            rf_emotion_text = emotion_map.get(rf_emotion_label, "Unknown")

            # Prepare the image for CNN (150x150 as per your earlier configuration)
            face_img = cv2.resize(frame[y:y + h, x:x + w], (150, 150))
            face_img = face_img / 255.0  # Normalize to [0, 1]
            face_img = np.expand_dims(face_img, axis=0)  # Add batch dimension

            # Predict the emotion with CNN; verbose=0 stops Keras printing a
            # progress bar for every single frame.
            cnn_predictions = cnn_model.predict(face_img, verbose=0)
            cnn_emotion_label = np.argmax(cnn_predictions, axis=1)[0]
            cnn_emotion_text = emotion_map.get(cnn_emotion_label, "Unknown")

            # Combine the predictions (simple voting mechanism)
            if rf_emotion_text == cnn_emotion_text:
                final_emotion_text = rf_emotion_text
            else:
                # Prioritize CNN prediction if different from Random Forest
                final_emotion_text = cnn_emotion_text

            # Display the emotion on the frame
            cv2.putText(frame, final_emotion_text, (x, y - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.9, (36, 255, 12), 2)

            # Add the predicted emotion to the buffer
            emotion_buffer.append(final_emotion_text)

            # Get the most common emotion in the buffer
            common_emotion = max(set(emotion_buffer), key=emotion_buffer.count)

            # Change the music only if the emotion has stabilized (debugged with gpt):
            # it must differ from the current one and fill more than 15 of the
            # (at most 20) buffered predictions.
            if common_emotion != current_emotion and emotion_buffer.count(common_emotion) > 15:
                current_emotion = common_emotion
                change_music_based_on_emotion(current_emotion)

        # Display the frame with face and emotion overlay
        cv2.imshow('Emotion Recognition', frame)

        # Press 'q' in the video window to quit
        if cv2.waitKey(1) & 0xFF == ord('q'):
            break
finally:
    # Release the capture and close all windows
    cap.release()
    cv2.destroyAllWindows()
    pygame.mixer.quit()
