In [1]:
import os
import cv2
import numpy as np
import pandas as pd
import librosa
import torch
from PIL import Image
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.models import Sequential, load_model
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout, LSTM
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
from transformers import BertTokenizer, BertForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import gradio as gr
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

In [2]:
def load_fer2013(data_dir):
    """Load face images from ``data_dir`` and return a seeded train/test split.

    The directory is expected to be laid out as
    ``data_dir/<subset>/<emotion>/*.jpg`` with ``subset`` in
    ``('train', 'test')`` and ``emotion`` in the list below. Both subsets are
    pooled and then re-split 80/20 with ``random_state=42``.

    Args:
        data_dir: Root directory of the image dataset.

    Returns:
        The list produced by ``train_test_split``:
        ``[X_train, X_test, y_train, y_test]``. ``X_*`` have shape
        ``(n, 48, 48, 1)`` scaled to ``[0, 1]``; ``y_*`` hold the integer
        index of the emotion in ``emotions``.

    Raises:
        FileNotFoundError: If an expected ``<subset>/<emotion>`` folder is missing.
        ValueError: If no readable ``.jpg`` image is found.
    """
    emotions = ['angry', 'disgust', 'fear', 'happy', 'neutral', 'sad', 'surprise']
    images, labels = [], []
    for subset in ('train', 'test'):
        for label, emotion in enumerate(emotions):
            folder = os.path.join(data_dir, subset, emotion)
            # os.listdir() returns entries in arbitrary order; sorting keeps the
            # sample order (and therefore the seeded split) reproducible.
            for img_file in sorted(os.listdir(folder)):
                # Accept ".JPG" etc. as well as ".jpg".
                if not img_file.lower().endswith('.jpg'):
                    continue
                img = cv2.imread(os.path.join(folder, img_file), cv2.IMREAD_GRAYSCALE)
                if img is None:
                    # Unreadable or corrupt file: skip it.
                    continue
                images.append(cv2.resize(img, (48, 48)))
                labels.append(label)
    if not images:
        raise ValueError(f"No readable .jpg images found under {data_dir!r}")
    X = np.array(images).reshape(-1, 48, 48, 1) / 255.0
    y = np.array(labels)
    return train_test_split(X, y, test_size=0.2, random_state=42)

In [5]:
def load_goemotions(data_dir):
    """Load every CSV in ``data_dir`` and return a seeded train/test split.

    All ``*.csv`` files directly inside ``data_dir`` are concatenated. The
    ``text`` column is the input; the label of each row is the *name* of the
    column holding the row-wise maximum among all remaining columns.

    NOTE(review): every column other than ``text`` and ``id`` is treated as an
    emotion score column. This assumes the CSVs contain no other metadata
    columns -- confirm against the actual files.

    Args:
        data_dir: Directory containing the CSV files.

    Returns:
        The list produced by ``train_test_split``:
        ``[X_train, X_test, y_train, y_test]`` (80/20, ``random_state=42``).

    Raises:
        ValueError: If ``data_dir`` contains no ``.csv`` file.
    """
    # Sort the listing: os.listdir() order is arbitrary, and the row order of
    # the concatenated frame determines what the seeded split selects.
    emotion_files = sorted(
        os.path.join(data_dir, f) for f in os.listdir(data_dir) if f.endswith('.csv')
    )
    if not emotion_files:
        raise ValueError(f"No .csv files found in {data_dir!r}")
    df = pd.concat([pd.read_csv(f) for f in emotion_files], ignore_index=True)
    emotion_cols = [col for col in df.columns if col not in ['text', 'id']]
    X, y = df['text'], df[emotion_cols].idxmax(axis=1)
    return train_test_split(X, y, test_size=0.2, random_state=42)

def extract_mfcc(file_path):
    """Return a 40-element MFCC summary vector for one audio file.

    The file is loaded with ``sr=None`` and 40 MFCCs are computed; the
    coefficients are then averaged over the time axis so the result has one
    value per coefficient.

    Args:
        file_path: Path to the audio file to analyse.

    Returns:
        1-D array with the mean of each of the 40 MFCCs.
    """
    signal, sample_rate = librosa.load(file_path, sr=None)
    coefficients = librosa.feature.mfcc(y=signal, sr=sample_rate, n_mfcc=40)
    # Transpose so that axis 0 runs over frames, then average the frames away.
    per_frame = coefficients.T
    return np.mean(per_frame, axis=0)

In [7]:
def load_ravdess(data_dir):
    """Walk ``data_dir``, extract MFCC features per ``.wav`` and split them.

    The emotion of a file is taken from the third dash-separated field of its
    name (``file.split('-')[2]``) and translated through ``emotion_map``.
    Files whose name has fewer than three fields, or whose code is not in the
    map, are skipped.

    Args:
        data_dir: Root directory that is searched recursively for ``.wav`` files.

    Returns:
        A 5-tuple ``(X_train, X_test, y_train, y_test, classes)`` where the
        first four come from ``train_test_split`` (80/20, ``random_state=42``),
        ``y_*`` are integer labels from ``LabelEncoder`` and ``classes`` is
        ``LabelEncoder.classes_`` (index -> emotion name).

    Raises:
        ValueError: If no usable ``.wav`` file is found.
    """
    emotion_map = {
        '01': 'neutral', '02': 'calm', '03': 'happy', '04': 'sad',
        '05': 'angry', '06': 'fearful', '07': 'disgust', '08': 'surprised'
    }
    features, labels = [], []
    for root, dirs, files in os.walk(data_dir):
        # Sorting dirs in place fixes the traversal order of os.walk(); together
        # with sorted(files) this makes the seeded split reproducible.
        dirs.sort()
        for file in sorted(files):
            if not file.endswith('.wav'):
                continue
            parts = file.split('-')
            if len(parts) < 3:
                # Name does not follow the dash-separated scheme; previously
                # this raised IndexError and aborted the whole load.
                continue
            emotion = emotion_map.get(parts[2])
            if emotion is None:
                continue
            features.append(extract_mfcc(os.path.join(root, file)))
            labels.append(emotion)
    if not features:
        raise ValueError(f"No usable .wav files found under {data_dir!r}")
    X = np.array(features)
    le = LabelEncoder()
    y = le.fit_transform(labels)
    return (*train_test_split(X, y, test_size=0.2, random_state=42), le.classes_)

In [9]:
def build_fer_model():
    """Build and compile the CNN used for facial-expression classification.

    Architecture: two Conv2D(3x3, relu) + MaxPooling2D(2x2) stages with 32 and
    64 filters, then Flatten -> Dense(128, relu) -> Dropout(0.5) ->
    Dense(7, softmax). Input shape is ``(48, 48, 1)``.

    Returns:
        A compiled ``Sequential`` model (Adam optimizer,
        ``sparse_categorical_crossentropy`` loss, accuracy metric). The sparse
        loss takes integer class labels rather than one-hot vectors.
    """
    net = Sequential()

    # Convolutional feature extractor.
    net.add(Conv2D(32, (3, 3), activation='relu', input_shape=(48, 48, 1)))
    net.add(MaxPooling2D((2, 2)))
    net.add(Conv2D(64, (3, 3), activation='relu'))
    net.add(MaxPooling2D((2, 2)))

    # Classification head over 7 classes.
    net.add(Flatten())
    net.add(Dense(128, activation='relu'))
    net.add(Dropout(0.5))
    net.add(Dense(7, activation='softmax'))

    net.compile(
        optimizer='adam',
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy'],
    )
    return net

In [11]:
def build_lstm_model(input_shape, num_classes):
    """Build and compile the LSTM classifier.

    Architecture: LSTM(64, last output only) -> Dropout(0.3) ->
    Dense(64, relu) -> Dropout(0.3) -> Dense(num_classes, softmax).

    Args:
        input_shape: Shape of one sample passed to the LSTM layer,
            as accepted by the ``input_shape`` argument.
        num_classes: Number of units in the softmax output layer.

    Returns:
        A compiled ``Sequential`` model using Adam (learning rate 1e-3),
        ``categorical_crossentropy`` loss and the accuracy metric.

    NOTE(review): ``categorical_crossentropy`` takes one-hot targets, while
    ``load_ravdess`` returns integer labels -- presumably the training code
    one-hot encodes them first; confirm.
    """
    optimizer = Adam(1e-3)

    net = Sequential()
    net.add(LSTM(64, input_shape=input_shape, return_sequences=False))
    net.add(Dropout(0.3))
    net.add(Dense(64, activation='relu'))
    net.add(Dropout(0.3))
    net.add(Dense(num_classes, activation='softmax'))

    net.compile(
        optimizer=optimizer,
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )
    return net

In [13]:
def get_image_emotion(image, model):
    """Predict the emotion class index for one image.

    The array is cast to uint8, converted to 8-bit grayscale, resized to
    48x48, scaled to ``[0, 1]`` and given a batch and a channel axis, i.e.
    shape ``(1, 48, 48, 1)``, before being passed to ``model.predict``.

    Args:
        image: Image as a NumPy array.
        model: Object exposing ``predict``; its output is arg-maxed.

    Returns:
        Index of the highest score in the model output (NumPy integer).
    """
    grayscale = Image.fromarray(image.astype('uint8')).convert('L')
    pixels = np.array(grayscale.resize((48, 48)))
    # Add a leading batch axis and a trailing channel axis.
    batch = np.expand_dims(pixels, axis=(0, -1)) / 255.0
    scores = model.predict(batch)
    return np.argmax(scores)

In [15]:
def get_text_emotion(text, model, tokenizer):
    """Predict the emotion class index for a piece of text.

    The text is tokenized to PyTorch tensors (padded, truncated to at most
    128 tokens). When CUDA is available both the model and the inputs are
    moved to the GPU. Inference runs with gradients disabled.

    Args:
        text: Text to classify.
        model: Callable accepting the tokenizer output as keyword arguments
            and returning an object with a ``logits`` attribute.
        tokenizer: Callable producing a mapping of tensors for ``text``.

    Returns:
        The arg-max over dimension 1 of the logits, as a Python ``int``.
    """
    encoded = tokenizer(
        text,
        return_tensors='pt',
        padding=True,
        truncation=True,
        max_length=128,
    )

    use_gpu = torch.cuda.is_available()
    if use_gpu:
        model.to('cuda')
        encoded = {name: tensor.to('cuda') for name, tensor in encoded.items()}

    with torch.no_grad():
        result = model(**encoded)

    predicted = torch.argmax(result.logits, dim=1)
    return predicted.item()

In [17]:
def get_audio_emotion(audio_path, model):
    """Predict the emotion class index for one audio file.

    The MFCC vector from ``extract_mfcc`` is reshaped to
    ``(1, 1, n_features)`` and passed to ``model.predict``.

    Args:
        audio_path: Path to the audio file.
        model: Object exposing ``predict``; its output is arg-maxed.

    Returns:
        Index of the highest score in the model output (NumPy integer).
    """
    features = extract_mfcc(audio_path)
    # One sample, one time step, all coefficients as features.
    batch = features.reshape(1, 1, -1)
    scores = model.predict(batch)
    return np.argmax(scores)

In [19]:
def fused_emotion(image, text, audio_path, image_model, text_model, tokenizer, audio_model):
    """Combine the image, text and audio predictions by majority vote.

    Each modality contributes one class index. The index that occurs most
    often wins. On a tie (e.g. all three disagree) the earliest modality in
    the order image, text, audio wins, so the outcome no longer depends on
    ``set`` iteration order.

    NOTE(review): the vote compares raw class indices across models. The
    image model has 7 outputs and the RAVDESS label map has 8 entries, so the
    index spaces may not line up -- confirm all three models share one label
    order, or map each prediction to a common label set before voting.

    Args:
        image: Image array forwarded to ``get_image_emotion``.
        text: Text forwarded to ``get_text_emotion``.
        audio_path: Audio file path forwarded to ``get_audio_emotion``.
        image_model: Model used for the image prediction.
        text_model: Model used for the text prediction.
        tokenizer: Tokenizer used for the text prediction.
        audio_model: Model used for the audio prediction.

    Returns:
        The winning class index as a plain ``int``.
    """
    # Normalise to built-in ints so NumPy scalars and Python ints compare and
    # count uniformly, and callers get one consistent return type.
    preds = [
        int(get_image_emotion(image, image_model)),
        int(get_text_emotion(text, text_model, tokenizer)),
        int(get_audio_emotion(audio_path, audio_model)),
    ]
    # max() returns the first maximal element it meets; iterating the list
    # (not a set) makes ties resolve in modality order.
    return max(preds, key=preds.count)

In [21]:
# print("Valid Spotify genres:", sp.recommendation_genre_seeds())

In [23]:
# emotion_to_music = {
#     0: {"mood": "calm", "valence": 0.2},
#     1: {"mood": "joy", "valence": 0.8},
#     2: {"mood": "angry", "valence": 0.1},
#     3: {"mood": "fear", "valence": 0.1},
#     4: {"mood": "surprise", "valence": 0.6},
#     5: {"mood": "neutral", "valence": 0.5},
#     6: {"mood": "disgust", "valence": 0.3}
# }

# Maps an emotion class index (0-6) to the Spotify genre seed stored under
# "mood" and the target valence used when requesting recommendations.
# NOTE(review): "mood" now holds a genre name, yet emotion_music_app displays
# it as the detected emotion -- confirm this is intended.
emotion_to_music = {
    0: {"mood": "chill", "valence": 0.2},
    1: {"mood": "pop", "valence": 0.8},
    2: {"mood": "rock", "valence": 0.1},          # was metal → now valid
    3: {"mood": "ambient", "valence": 0.1},
    4: {"mood": "electronic", "valence": 0.6},
    5: {"mood": "acoustic", "valence": 0.5},
    6: {"mood": "blues", "valence": 0.3}
}

# Credentials are read from the environment so that real secrets never have to
# be written into the notebook. The original placeholder values remain as
# fallbacks, so behaviour is unchanged when the variables are not set.
sp = spotipy.Spotify(auth_manager=spotipy.SpotifyOAuth(
    client_id=os.environ.get("SPOTIPY_CLIENT_ID", "YOUR_SPOTIFY_CLIENT_ID"),
    client_secret=os.environ.get("SPOTIPY_CLIENT_SECRET", "YOUR_SPOTIFY_CLIENT_SECRET"),
    redirect_uri=os.environ.get("SPOTIPY_REDIRECT_URI", "http://localhost:8888/callback"),
    scope="user-read-private user-read-email"
))


# Alternative: app-only authentication (no user login).
# SECURITY: the client id/secret previously pasted here have been redacted;
# treat them as leaked and rotate them in the Spotify developer dashboard.
#sp = spotipy.Spotify(auth_manager=SpotifyClientCredentials(
    #client_id="<REDACTED>", client_secret="<REDACTED>"))

In [25]:
def recommend_music(emotion_label):
    """Fetch five Spotify recommendations for an emotion class index.

    The genre seed and target valence are looked up in ``emotion_to_music``
    and passed to ``sp.recommendations``.

    NOTE(review): Spotify announced in November 2024 that the recommendations
    endpoint is not available to newly registered apps -- confirm this app
    still has access.

    Args:
        emotion_label: Key into ``emotion_to_music``.

    Returns:
        List of strings formatted as ``"<track> by <first artist>: <url>"``.

    Raises:
        KeyError: If ``emotion_label`` is not a key of ``emotion_to_music``.
    """
    genre = emotion_to_music[emotion_label]['mood']
    target = emotion_to_music[emotion_label]['valence']
    response = sp.recommendations(seed_genres=[genre], target_valence=target, limit=5)

    lines = []
    for track in response['tracks']:
        title = track['name']
        artist = track['artists'][0]['name']  # only the first listed artist
        url = track['external_urls']['spotify']
        lines.append(f"{title} by {artist}: {url}")
    return lines

In [27]:
# def emotion_music_app(image, text, audio_file):
#     emotion = fused_emotion(image, text, audio_file, model_fer, model_text, tokenizer, model_audio)
#     mood = emotion_to_music[emotion]['mood']
#     recommendations = recommend_music(emotion)
#     return f"Detected Emotion: {mood.capitalize()}\n\nTop Songs:\n" + "\n".join(recommendations)

In [29]:
def emotion_music_app(image, text, audio_file):
    """Gradio callback: detect the fused emotion and list matching songs.

    Uses the module-level ``model_fer``, ``model_text``, ``tokenizer`` and
    ``model_audio`` objects. Any exception is caught: the full traceback is
    printed to the notebook output and a short error message is returned
    instead, so the UI always receives a string.

    Args:
        image: Image input from the interface.
        text: Text input from the interface.
        audio_file: Audio file path from the interface.

    Returns:
        Either the "Detected Emotion ... Top Songs" report or an error string.
    """
    try:
        emotion = fused_emotion(
            image, text, audio_file,
            model_fer, model_text, tokenizer, model_audio,
        )
        mood = emotion_to_music[emotion]['mood']
        songs = recommend_music(emotion)
        header = f"Detected Emotion: {mood.capitalize()}\n\nTop Songs:\n"
        return header + "\n".join(songs)
    except Exception as err:
        # Imported lazily: only needed on the failure path.
        from traceback import format_exc
        print(format_exc())  # full traceback goes to the Jupyter output
        return "❌ Error occurred:\n" + str(err)

In [31]:
# Gradio front end: three inputs (face image, free text, audio file path) are
# passed to emotion_music_app, whose string result is shown as plain text.
gui = gr.Interface(
    title="Multimodal Emotion-Based Music Recommender",
    description="Upload a face image, type text, and record voice to receive emotion-aware music recommendations.",
    fn=emotion_music_app,
    inputs=[
        gr.Image(label="Facial Image"),
        gr.Textbox(label="Text Input"),
        # type="filepath" hands the callback a path rather than raw samples.
        gr.Audio(type="filepath", label="Audio Input (WAV)"),
    ],
    outputs="text",
)

In [33]:
# Load the trained models that emotion_music_app reads as module-level globals.
# This cell must have run before the interface handles its first request.

# Text model and its tokenizer are stored in the same directory.
tokenizer = BertTokenizer.from_pretrained("./saved_model/bert_goemotions")
model_text = BertForSequenceClassification.from_pretrained("./saved_model/bert_goemotions")

# Keras models for the image and audio modalities.
model_audio = load_model("best_audio_lstm_model.h5")
model_fer = load_model("fer_model.h5")



In [51]:
# Serve the interface on the loopback address only, on a fixed port.
gui.launch(server_port=7862, server_name="127.0.0.1")

Rerunning server... use `close()` to stop if you need to change `launch()` parameters.
----
* To create a public link, set `share=True` in `launch()`.


