### Pair the Emoton Prediction Model with Musical Logic
- Same code from '7-Thesis-song-choice-demo.ipynb' but debugged using GPT when implementing softmax and buffering, when mismatch between the expected input shape of your CNN model prevented the camera working

In [5]:
import numpy as np
import pandas as pd
import cv2
import dlib
import joblib
import pygame
from collections import deque
from keras.models import load_model
from scipy.special import softmax
import librosa
import soundfile as sf

In [7]:
# Initialize pygame mixer
pygame.mixer.init()

# Load pre-trained models
rf_model = joblib.load('/Users/nixi/Desktop/Final-thesis-folder-24/AI-for-Media-project-23-24/random_forest_model.pkl')
scaler = joblib.load('/Users/nixi/Desktop/Final-thesis-folder-24/AI-for-Media-project-23-24/scaler.pkl')
cnn_model = load_model('/Users/nixi/Desktop/Final-thesis-folder-24/AI-for-Media-project-23-24/model.h5')

# Load Haar Cascade for face detection
face_cascade = cv2.CascadeClassifier('/Users/nixi/Desktop/Final-thesis-folder-24/AI-for-Media-project-23-24/haar/haarcascade_frontalface_alt2.xml')
if face_cascade.empty():
    raise IOError("Failed to load Haar Cascade file. Please check the file path.")

# Load Dlib's shape predictor for facial landmarks
shape_predictor = dlib.shape_predictor('/Users/nixi/Desktop/Final-thesis-folder-24/AI-for-Media-project-23-24/shape_predictor_68_face_landmarks.dat')

# Define emotions map
emotion_map = {0: 'angry', 1: 'happy', 2: 'sad', 3: 'neutral'}

# Emotion buffer for smoothing predictions
emotion_buffer = deque(maxlen=20)  # Keep last 20 frames

# Path to neutral song
neutral_song_path = 'Music/neutral/aklim_hep_sende.mp3'

# Load and play the initial song
pygame.mixer.music.load(neutral_song_path)
pygame.mixer.music.play()

# Function to manipulate audio based on emotion
def manipulate_audio(file_path, emotion):
    y, sr = librosa.load(file_path)
    
    # Adjust tempo based on the emotional states
    if emotion == 'happy':
        y_fast = librosa.effects.time_stretch(y, rate=1.5)
    elif emotion == 'sad':
        y_fast = librosa.effects.time_stretch(y, rate=0.9)
    elif emotion == 'angry':
        y_fast = librosa.effects.time_stretch(y, rate=1.5)
    else:  # Neutral
        y_fast = librosa.effects.time_stretch(y, rate=1.0)

    # Adjust pitch and apply additional effects for "angry"
    if emotion == 'angry':
        y_shifted = librosa.effects.pitch_shift(y_fast, sr=sr, n_steps=5)
        y_shifted = np.tanh(y_shifted * 15)
        y_shifted = librosa.effects.preemphasis(y_shifted, coef=1)
    elif emotion == 'sad':
        y_shifted = librosa.effects.pitch_shift(y_fast, sr=sr, n_steps=-4)
    else:
        y_shifted = y_fast

    return y_shifted, sr

# Initialize video capture
cap = cv2.VideoCapture(0)  # Try changing the index if you have multiple cameras

if not cap.isOpened():
    print("Error: Could not open video capture.")
    exit()

current_emotion = 'neutral'
prev_emotion = 'neutral'

while True:
    ret, frame = cap.read()
    if not ret:
        print("Error: Could not read frame.")
        break

    # Convert to grayscale for face detection
    gray = cv2.cvtColor(frame, cv2.COLOR_BGR2GRAY)
    faces = face_cascade.detectMultiScale(gray, scaleFactor=1.1, minNeighbors=5)

    for (x, y, w, h) in faces:
        # Draw bounding box around the face
        cv2.rectangle(frame, (x, y), (x + w, y + h), (36, 255, 12), 2)

        # Get the landmarks
        dlib_rect = dlib.rectangle(x, y, x + w, y + h)
        landmarks = shape_predictor(gray, dlib_rect)

        # Extract facial features for Random Forest
        left_eye = np.array([landmarks.part(36).x, landmarks.part(36).y])
        right_eye = np.array([landmarks.part(45).x, landmarks.part(45).y])
        nose_tip = np.array([landmarks.part(30).x, landmarks.part(30).y])
        mouth_left = np.array([landmarks.part(48).x, landmarks.part(48).y])
        mouth_right = np.array([landmarks.part(54).x, landmarks.part(54).y])
        chin = np.array([landmarks.part(8).x, landmarks.part(8).y])
        left_eyebrow = np.array([landmarks.part(21).x, landmarks.part(21).y])
        right_eyebrow = np.array([landmarks.part(22).x, landmarks.part(22).y])
        
        # Calculate distances and angles (features used during training)
        total_face_area = w * h
        left_eyebrow_angle = np.arctan2(left_eyebrow[1] - left_eye[1], left_eyebrow[0] - left_eye[0])
        right_eyebrow_angle = np.arctan2(right_eyebrow[1] - right_eye[1], right_eyebrow[0] - right_eye[0])
        eyebrow_distance = np.linalg.norm(left_eyebrow - right_eyebrow)
        eyes_dist = np.linalg.norm(left_eye - right_eye)
        eyes_to_nose_dist = np.linalg.norm((left_eye + right_eye) / 2 - nose_tip)
        nose_to_mouth_dist = np.linalg.norm(nose_tip - (mouth_left + mouth_right) / 2)
        mouth_angle = np.arctan2(mouth_right[1] - mouth_left[1], mouth_right[0] - mouth_left[0])
        nose_angle = np.arctan2(nose_tip[1] - chin[1], nose_tip[0] - chin[0])

        # Prepare the feature vector for Random Forest
        features = np.array([[total_face_area, left_eyebrow_angle, right_eyebrow_angle, eyebrow_distance, 
                              eyes_dist, eyes_to_nose_dist, nose_to_mouth_dist, mouth_angle, nose_angle]])

        # Use the original feature names used during fitting
        original_feature_names = ['TotalFaceArea', 'LeftEyebrowAngle', 'RightEyebrowAngle', 'EyebrowDistance', 
                                  'EyesDist', 'EyesToNoseDist', 'NoseToMouthDist', 'MouthAngle', 'NoseAngle']
        features_df = pd.DataFrame(features, columns=original_feature_names)

        # Scale the features
        features_scaled = scaler.transform(features_df)

        # Predict the emotion with Random Forest
        rf_emotion_label = rf_model.predict(features_scaled)[0]
        rf_emotion_text = emotion_map.get(rf_emotion_label, "Unknown")

        # Prepare the image for CNN (150x150 as per your earlier configuration)
        face_img = cv2.resize(frame[y:y+h, x:x+w], (150, 150))
        face_img = cv2.cvtColor(face_img, cv2.COLOR_BGR2GRAY)  # Convert to grayscale
        face_img = face_img / 255.0  # Normalize to [0, 1]
        face_img = np.expand_dims(face_img, axis=-1)  # Add channel dimension for grayscale
        face_img = np.expand_dims(face_img, axis=0)  # Add batch dimension

        # Predict the emotion with CNN
        cnn_predictions = cnn_model.predict(face_img)
        print("CNN Predictions:", cnn_predictions)  # Debugging: Print raw predictions
        cnn_probabilities = softmax(cnn_predictions, axis=1)  # Apply softmax to get probabilities
        print("CNN Probabilities:", cnn_probabilities)  # Debugging: Print probabilities
        cnn_emotion_label = np.argmax(cnn_probabilities, axis=1)[0]
        cnn_emotion_text = emotion_map.get(cnn_emotion_label, "Unknown")

        # Combine the predictions using weighted voting
        if rf_emotion_text == cnn_emotion_text:
            final_emotion_text = rf_emotion_text
        else:
            # Use softmax probabilities to weigh the decision
            rf_weight = 0.5  # Example weight for Random Forest
            cnn_weight = cnn_probabilities[0][cnn_emotion_label]  # Use the probability of the predicted class
            if cnn_weight > rf_weight:
                final_emotion_text = cnn_emotion_text
            else:
                final_emotion_text = rf_emotion_text

        # Display the emotion on the frame
        cv2.putText(frame, final_emotion_text, (x, y - 10), cv2.FONT_HERSHEY_SIMPLEX, 0.9, (36, 255, 12), 2)

        # Add the predicted emotion to the buffer
        emotion_buffer.append(final_emotion_text)

        # Get the most common emotion in the buffer
        common_emotion = max(set(emotion_buffer), key=emotion_buffer.count)

        # Change the music only if the emotion has stabilized
        if common_emotion != current_emotion and emotion_buffer.count(common_emotion) > 15:
            current_emotion = common_emotion
            y_shifted, sr = manipulate_audio(neutral_song_path, current_emotion)
            sf.write('modified_song.wav', y_shifted, sr)
            pygame.mixer.music.fadeout(1000)  # Fade out current music
            pygame.mixer.music.load('modified_song.wav')  # Load new song
            pygame.mixer.music.play()

    # Display the frame with face and emotion overlay
    cv2.imshow('Emotion Recognition', frame)

    if cv2.waitKey(1) & 0xFF == ord('q'):
        break

# Release the capture and close all windows
cap.release()
cv2.destroyAllWindows()
pygame.mixer.quit()



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
CNN Predictions: [[0.00254196 0.0012075  0.08095461 0.9152959 ]]
CNN Probabilities: [[0.17948781 0.17924845 0.19412844 0.4471353 ]]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
CNN Predictions: [[2.1227135e-03 5.3287687e-04 1.7405245e-02 9.7993910e-01]]
CNN Probabilities: [[0.17629048 0.17601043 0.17900534 0.46869373]]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step
CNN Predictions: [[0.00707293 0.00363335 0.26454356 0.7247501 ]]
CNN Probabilities: [[0.18726988 0.18662687 0.24226241 0.38384083]]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
CNN Predictions: [[0.00152109 0.00344019 0.9498646  0.04517412]]
CNN Probabilities: [[0.17768393 0.17802526 0.45867866 0.18561217]]
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step
CNN Predictions: [[0.00297935 0.00500751 0.7836868  0.20832628]]
CNN Probabilities: [[0.18474053 0.185

: 