In [20]:
import librosa
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Conv2D, MaxPooling2D, Flatten, Dense, Dropout
import sounddevice as sd
from pathlib import Path

# RAVDESS emotion codes mapped to names. Each key is the zero-padded, 1-based
# position of the name in the sequence below ('01' -> 'neutral', ...).
emotion_labels = {
    f'{code:02d}': name
    for code, name in enumerate(
        (
            'neutral',
            'calm',
            'happy',
            'sad',
            'angry',
            'fearful',
            'disgust',
            'surprised',
        ),
        start=1,
    )
}

In [21]:
# Load and preprocess RAVDESS dataset
def load_data(dataset_path, sr=22050, n_mfcc=40, max_len=100):
    """Load ``Actor_*/*.wav`` files under ``dataset_path`` as MFCC tensors.

    The emotion code is read from the third dash-separated field of each
    filename (e.g. ``03-01-03-01-01-01-01.wav`` -> ``'03'``). Files whose
    code is not a key of ``emotion_labels``, or whose name has fewer than
    three fields, are skipped.

    Args:
        dataset_path: Directory containing the ``Actor_*`` sub-folders.
        sr: Sample rate passed to ``librosa.load``.
        n_mfcc: Number of MFCC coefficients to extract per frame.
        max_len: Number of time frames every sample is padded/truncated to.

    Returns:
        ``(features, labels)`` where ``features`` has shape
        ``(n_samples, n_mfcc, max_len, 1)`` and ``labels`` has shape
        ``(n_samples,)`` holding 0-based class indices (emotion code - 1).
        When nothing is found, both arrays are empty but keep those shapes.
    """
    dataset_path = Path(dataset_path)
    features, labels = [], []

    # Path.glob yields entries in a filesystem-dependent order. Sorting makes
    # the sample order (and any positional train/validation split applied
    # later) reproducible across machines and runs.
    for actor_folder in sorted(dataset_path.glob('Actor_*')):
        for file in sorted(actor_folder.glob('*.wav')):
            fields = file.name.split('-')
            if len(fields) < 3:
                # Not a RAVDESS-style name; indexing field 2 would raise.
                continue
            emotion = fields[2]
            if emotion not in emotion_labels:
                continue

            # Load audio and extract MFCC features.
            y, loaded_sr = librosa.load(file, sr=sr)
            mfcc = librosa.feature.mfcc(y=y, sr=loaded_sr, n_mfcc=n_mfcc)

            # Standardize the time axis: truncate first, then zero-pad
            # whatever is still missing.
            mfcc = mfcc[:, :max_len]
            missing = max_len - mfcc.shape[1]
            if missing > 0:
                mfcc = np.pad(mfcc, ((0, 0), (0, missing)), mode='constant')

            # Add the trailing channel axis expected by Conv2D.
            features.append(mfcc[:, :, np.newaxis])
            labels.append(int(emotion) - 1)  # Convert to 0-based index

    if not features:
        # np.array([]) would collapse to shape (0,), which breaks callers
        # that index shape[1]/shape[2]; keep the documented rank instead.
        return np.empty((0, n_mfcc, max_len, 1)), np.empty((0,), dtype=int)

    return np.array(features), np.array(labels)

In [22]:
# Build CNN model
def build_model(input_shape, num_classes):
    """Build and compile the CNN classifier.

    The network is three Conv2D(3x3, relu) + MaxPooling2D(2x2) stages with
    32, 64 and 128 filters, followed by Flatten, Dense(128, relu),
    Dropout(0.5) and a softmax output layer.

    Args:
        input_shape: Shape of one sample without the batch axis, e.g.
            ``(n_mfcc, time_steps, 1)``.
        num_classes: Number of units in the softmax output layer.

    Returns:
        The compiled ``Sequential`` model (Adam optimizer, sparse categorical
        cross-entropy loss, accuracy metric).
    """
    model = Sequential([
        # Declare the input with an explicit Input object. Keras 3 warns when
        # `input_shape` is passed to a layer inside a Sequential model.
        tf.keras.Input(shape=input_shape),
        Conv2D(32, (3, 3), activation='relu'),
        MaxPooling2D((2, 2)),
        Conv2D(64, (3, 3), activation='relu'),
        MaxPooling2D((2, 2)),
        Conv2D(128, (3, 3), activation='relu'),
        MaxPooling2D((2, 2)),
        Flatten(),
        Dense(128, activation='relu'),
        Dropout(0.5),
        Dense(num_classes, activation='softmax')
    ])
    # The sparse loss takes integer class indices as targets, so labels do
    # not need to be one-hot encoded.
    model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
    return model

In [23]:
# Record live audio
def record_audio(duration=3, sr=22050):
    """Record a single-channel clip with ``sd.rec`` and return it flattened.

    Args:
        duration: Length of the recording in seconds.
        sr: Sample rate in Hz used for the recording.

    Returns:
        ``(samples, sr)`` where ``samples`` is the 1-D flattened recording
        and ``sr`` is the sample rate that was passed in.
    """
    print("Recording... Speak now.")
    frame_count = int(duration * sr)
    recording = sd.rec(frame_count, samplerate=sr, channels=1)
    # sd.rec returns before the buffer is filled; block until it is done.
    sd.wait()
    return recording.flatten(), sr

# Preprocess live audio
def preprocess_audio(audio, sr, n_mfcc=40, max_len=100):
    """Turn a raw audio signal into a single-sample MFCC batch for the CNN.

    ``n_mfcc`` and ``max_len`` must match the values used to build the
    training features, otherwise the tensor will not fit the model input.

    Args:
        audio: Audio samples passed to ``librosa.feature.mfcc`` as ``y``.
        sr: Sample rate of ``audio`` in Hz.
        n_mfcc: Number of MFCC coefficients to extract per frame.
        max_len: Number of time frames the result is padded/truncated to.

    Returns:
        Array of shape ``(1, n_mfcc, max_len, 1)``.
    """
    mfcc = librosa.feature.mfcc(y=audio, sr=sr, n_mfcc=n_mfcc)

    # Standardize the time axis: truncate first, then zero-pad whatever is
    # still missing.
    mfcc = mfcc[:, :max_len]
    shortfall = max_len - mfcc.shape[1]
    if shortfall > 0:
        mfcc = np.pad(mfcc, ((0, 0), (0, shortfall)), mode='constant')

    # Add batch (leading) and channel (trailing) axes for the CNN.
    return mfcc[np.newaxis, :, :, np.newaxis]

In [24]:
# Main execution
if __name__ == "__main__":
    # Seed Python, NumPy and TensorFlow so weight initialisation, dropout and
    # batch shuffling are repeatable from one run to the next.
    tf.keras.utils.set_random_seed(42)

    # Load dataset
    dataset_path = './presentations/008-emmotion-classifier/dataset'  # Path to your RAVDESS dataset folder
    X, y = load_data(dataset_path)

    # Check data shapes
    print(f"Features shape: {X.shape}, Labels shape: {y.shape}")

    # Fail early with an actionable message. Without this, a wrong path only
    # surfaces as an IndexError when X.shape[1] is read below.
    if len(X) == 0:
        raise FileNotFoundError(
            f"No usable .wav files found under {dataset_path!r}; "
            "expected Actor_*/<name>.wav files."
        )

    # Train model
    input_shape = (X.shape[1], X.shape[2], 1)  # (n_mfcc, time_steps, channels)
    num_classes = len(emotion_labels)
    model = build_model(input_shape, num_classes)
    model.fit(X, y, epochs=20, batch_size=32, validation_split=0.2, verbose=1)

    # Save model (optional)
    # NOTE(review): '.h5' is the legacy HDF5 format; newer Keras versions
    # recommend '.keras'. Left unchanged in case other code loads this file.
    model.save('emotion_recognition_cnn_model.h5')

    # Record and classify live audio
    audio, sr = record_audio(duration=3)
    features = preprocess_audio(audio, sr)
    prediction = model.predict(features)
    emotion_idx = np.argmax(prediction, axis=1)[0]
    # Class index i corresponds to emotion code i + 1, zero-padded to 2 digits.
    print(f"Detected emotion: {emotion_labels[f'{emotion_idx + 1:02d}']}")

Features shape: (1440, 40, 100, 1), Labels shape: (1440,)
Epoch 1/20
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 57ms/step - accuracy: 0.1538 - loss: 4.5755 - val_accuracy: 0.1250 - val_loss: 1.9768
Epoch 2/20
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 55ms/step - accuracy: 0.1949 - loss: 1.9962 - val_accuracy: 0.1875 - val_loss: 1.9528
Epoch 3/20
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 55ms/step - accuracy: 0.2259 - loss: 1.9781 - val_accuracy: 0.2604 - val_loss: 1.9018
Epoch 4/20
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 51ms/step - accuracy: 0.2124 - loss: 1.9231 - val_accuracy: 0.2361 - val_loss: 1.8905
Epoch 5/20
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 52ms/step - accuracy: 0.2113 - loss: 1.9590 - val_accuracy: 0.2535 - val_loss: 1.8525
Epoch 6/20
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 52ms/step - accuracy: 0.2132 - loss: 1.8988 - val_accuracy: 0.



Recording... Speak now.
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 76ms/step
Detected emotion: sad


In [25]:
import tkinter as tk
from PIL import Image, ImageTk
import os

def create_gui(model, emotion_labels):
    """Open a Tk window that records audio and shows the predicted emotion.

    Blocks in ``root.mainloop()`` until the window is closed.

    Args:
        model: Object whose ``predict`` is called on the features returned
            by ``preprocess_audio``; ``np.argmax`` over axis 1 of the result
            selects the class.
        emotion_labels: Mapping from zero-padded 1-based class code
            (``'01'``, ``'02'``, ...) to the emotion name to display.
    """
    root = tk.Tk()
    root.title("Voice Emotion Recognizer")
    root.geometry("400x300")

    # Label to display emotion
    emotion_var = tk.StringVar(value="Press Record to start")
    emotion_label = tk.Label(root, textvariable=emotion_var, font=("Arial", 16))
    emotion_label.pack(pady=20)

    # Image display for emotion icon
    image_label = tk.Label(root)
    image_label.pack(pady=10)

    def record_and_predict():
        """Record 5 seconds of audio, classify it and refresh the widgets."""
        audio, sr = record_audio(duration=5)
        features = preprocess_audio(audio, sr)
        prediction = model.predict(features)
        emotion_idx = np.argmax(prediction, axis=1)[0]
        emotion = emotion_labels[f'{emotion_idx + 1:02d}']
        emotion_var.set(f"Detected: {emotion}")

        # Update icon (assumes you have emotion icons in ./icons/)
        icon_path = f"./icons/{emotion}.png"
        if os.path.exists(icon_path):
            # Close the file handle deterministically; resize() returns an
            # independent in-memory image.
            with Image.open(icon_path) as source_img:
                img = source_img.resize((100, 100))
            photo = ImageTk.PhotoImage(img)
            image_label.config(image=photo)
        else:
            # No icon for this emotion: clear the widget so the icon from a
            # previous prediction is not shown next to the new label.
            photo = None
            image_label.config(image='')
        # Keep a reference on the widget so the PhotoImage is not garbage
        # collected while it is displayed.
        image_label.image = photo

    # Record button
    record_button = tk.Button(root, text="Record", command=record_and_predict, font=("Arial", 14))
    record_button.pack(pady=20)

    root.mainloop()

# Call GUI after training.
# NOTE: relies on the module-level `model` assigned by the training cell
# above, so that cell must have been run first in the same kernel. The call
# blocks in create_gui's `root.mainloop()` until the window is closed.
create_gui(model, emotion_labels)

ModuleNotFoundError: No module named '_tkinter'