In [None]:
import glob
import os

import joblib
import librosa
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
from tensorflow.keras.layers import (
    BatchNormalization,
    Conv1D,
    Dense,
    Dropout,
    GlobalAveragePooling1D,
    Input,
    MaxPooling1D,
)
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam

In [None]:
# --- Constants ---
# Tunable settings shared by the feature-extraction, training and saving cells.
DATASET_PATH = "dataset"  # Root folder handed to load_data(); expected to contain the RAVDESS Actor_* subfolders
SAMPLE_RATE = 22050  # Sampling rate (Hz) requested from librosa.load for every clip
N_MFCC = 40  # Number of MFCC coefficients extracted per frame
MAX_PAD_LEN = 174  # Fixed number of MFCC frames per clip (chosen empirically); shorter clips are zero-padded, longer ones truncated
MODEL_FILE = "ser_model.h5"  # File the ModelCheckpoint callback writes the best model to
ENCODER_FILE = "label_encoder.pkl"  # File the fitted LabelEncoder is dumped to with joblib

In [None]:
# --- RAVDESS Emotion Mapping ---
# RAVDESS filenames are dash-separated and the 3rd field is a two-digit
# emotion code (e.g., 03-01-04-... -> "04"). The codes run consecutively from
# "01" to "08", so the lookup table is generated from the ordered label list.
emotion_map = {
    f"{index:02d}": label
    for index, label in enumerate(
        (
            "neutral",    # 01
            "calm",       # 02
            "happy",      # 03
            "sad",        # 04
            "angry",      # 05
            "fearful",    # 06
            "disgust",    # 07
            "surprised",  # 08
        ),
        start=1,
    )
}
# All 8 emotions are kept as classification targets
observed_emotions = emotion_map.values()

In [None]:
def extract_features(file_path, n_mfcc=N_MFCC, max_pad_len=MAX_PAD_LEN):
    """
    Computes a fixed-size MFCC matrix for one audio file.

    The clip is loaded at SAMPLE_RATE, converted to `n_mfcc` MFCC rows, and
    the frame axis is forced to exactly `max_pad_len` columns: longer
    sequences are cut off at the end, shorter ones are zero-padded on the
    right.

    Returns:
        Array of shape (n_mfcc, max_pad_len), or None if anything went wrong
        while processing the file (the error is printed, not raised).
    """
    try:
        # Decode and resample the clip
        signal, sr = librosa.load(file_path, sr=SAMPLE_RATE, res_type="kaiser_fast")

        # Rows are coefficients, columns are time frames
        mfccs = librosa.feature.mfcc(y=signal, sr=sr, n_mfcc=n_mfcc)
        n_frames = mfccs.shape[1]

        # Too long: keep only the leading frames
        if n_frames > max_pad_len:
            return mfccs[:, :max_pad_len]

        # Too short (or exact): append zero frames up to the target length
        missing = max_pad_len - n_frames
        return np.pad(mfccs, ((0, 0), (0, missing)), mode="constant")

    except Exception as e:
        print(f"Error processing {file_path}: {e}")
        return None

In [None]:
def load_data(dataset_path):
    """
    Loads MFCC features and emotion labels from a RAVDESS dataset directory.

    Assumes dataset_path contains folders like 'Actor_01', 'Actor_02', ...
    each holding .wav files named like '03-01-04-02-01-01-01.wav', where the
    3rd dash-separated field is the emotion code looked up in `emotion_map`.

    Args:
        dataset_path: Root directory of the dataset.

    Returns:
        (features, labels) as NumPy arrays with one entry per successfully
        processed file, in sorted file-path order; or (None, None) when no
        .wav files match the expected layout. Files whose emotion code is
        unknown, or whose features could not be extracted, are skipped.
    """
    print("Loading data...")
    features = []
    labels = []

    # Find all .wav files one level below the Actor_* folders.
    file_pattern = os.path.join(dataset_path, "Actor_*", "*.wav")
    # glob yields matches in arbitrary (filesystem-dependent) order. Sorting
    # keeps the sample order stable across machines and runs, which any seeded
    # split of the returned arrays needs in order to be reproducible.
    audio_files = sorted(glob.glob(file_pattern))

    if not audio_files:
        print(f"No audio files found in {dataset_path}.")
        print("Please make sure you have downloaded the RAVDESS dataset")
        print("and placed it in a folder named 'dataset' in the same directory.")
        return None, None

    for file_path in audio_files:
        # Filename example: 03-01-04-02-01-01-01.wav
        # The 3rd part (e.g., '04') is the emotion.
        try:
            filename = os.path.basename(file_path)
            emotion_code = filename.split("-")[2]
            emotion = emotion_map.get(emotion_code)

            # Unknown codes map to None, which is never an observed emotion.
            if emotion not in observed_emotions:
                continue

            mfccs = extract_features(file_path)
            if mfccs is not None:
                features.append(mfccs)
                labels.append(emotion)
        except Exception as e:
            # Best effort: a malformed filename must not abort the whole load.
            print(f"Error parsing filename {file_path}: {e}")

    print(f"Loaded {len(features)} audio files.")
    return np.array(features), np.array(labels)

In [None]:
def build_model(input_shape, num_classes):
    """
    Builds and compiles a 1D CNN model for speech emotion recognition.

    Architecture: three Conv1D blocks (128, 256, 512 filters, kernel size 5),
    each followed by batch normalization, max pooling (pool size 2) and
    dropout, then global average pooling, a 256-unit dense layer and a
    softmax output.

    Args:
        input_shape: Shape of a single sample without the batch axis, as
            (time_steps, n_features).
        num_classes: Number of target classes (units in the softmax layer).

    Returns:
        A compiled Sequential model using Adam (learning rate 0.0005) and
        sparse categorical cross-entropy, so targets must be integer class
        indices rather than one-hot vectors.
    """
    model = Sequential()

    # Declare the input with an explicit Input layer: passing `input_shape=`
    # to the first layer of a Sequential model is deprecated in Keras 3 and
    # emits a UserWarning.
    model.add(Input(shape=input_shape))

    # Layer 1
    model.add(Conv1D(128, 5, padding="same", activation="relu"))
    model.add(BatchNormalization())
    model.add(MaxPooling1D(pool_size=2))
    model.add(Dropout(0.2))

    # Layer 2
    model.add(Conv1D(256, 5, padding="same", activation="relu"))
    model.add(BatchNormalization())
    model.add(MaxPooling1D(pool_size=2))
    model.add(Dropout(0.2))

    # Layer 3
    model.add(Conv1D(512, 5, padding="same", activation="relu"))
    model.add(BatchNormalization())
    model.add(MaxPooling1D(pool_size=2))
    model.add(Dropout(0.3))

    # Global Pooling: collapse the time axis to one vector per sample
    model.add(GlobalAveragePooling1D())

    # Dense Layer
    model.add(Dense(256, activation="relu"))
    model.add(Dropout(0.3))

    # Output Layer
    model.add(Dense(num_classes, activation="softmax"))

    # Compile
    model.compile(
        optimizer=Adam(learning_rate=0.0005),
        loss="sparse_categorical_crossentropy",
        metrics=["accuracy"],
    )

    return model

In [None]:
# Load data
X, y = load_data(DATASET_PATH)
# Fail fast: every later cell needs real samples, so stop here with a clear
# message instead of printing a placeholder and crashing further down.
if X is None or y is None or len(X) == 0:
    raise RuntimeError(
        f"No usable audio samples were loaded from '{DATASET_PATH}'; "
        "check the dataset location and layout (Actor_*/*.wav)."
    )

Loading data...
Loaded 1440 audio files.


In [None]:
# Map the string emotion labels to integer class ids
le = LabelEncoder()
le.fit(y)
y_encoded = le.transform(y)
# One output unit is needed per distinct label seen during fitting
num_classes = len(le.classes_)
print(f"Emotions to classify: {le.classes_}")


Emotions to classify: ['angry' 'calm' 'disgust' 'fearful' 'happy' 'neutral' 'sad' 'surprised']


In [None]:
# Conv1D convolves along axis 1, so the frame (time) axis must come before the
# coefficient axis:
#   before: (num_samples, n_mfcc, max_pad_len)
#   after:  (num_samples, max_pad_len, n_mfcc)
X_transposed = X.transpose(0, 2, 1)

In [None]:
# Hold out 20% of the samples, stratified so every emotion keeps the same
# share in both partitions; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X_transposed,
    y_encoded,
    stratify=y_encoded,
    test_size=0.2,
    random_state=42,
)
print(f"Training data shape: {X_train.shape}")
print(f"Test data shape: {X_test.shape}")

Training data shape: (1152, 174, 40)
Test data shape: (288, 174, 40)


In [None]:
# Instantiate the network for the per-sample shape of the training data
input_shape = X_train.shape[1:]  # everything after the batch axis: (MAX_PAD_LEN, N_MFCC)
model = build_model(input_shape, num_classes)
model.summary()

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [None]:
# Training callbacks, both driven by validation accuracy.
# Stop once it has not improved for 20 epochs and roll back to the best weights.
early_stop = EarlyStopping(
    monitor="val_accuracy",
    restore_best_weights=True,
    patience=20,
)
# Write the model to MODEL_FILE only on epochs that set a new best score.
model_checkpoint = ModelCheckpoint(
    filepath=MODEL_FILE,
    save_best_only=True,
    monitor="val_accuracy",
    verbose=1,
)

In [None]:
# Train the model
# Early stopping and checkpointing pick the best epoch by validation accuracy.
# Validating on the test set would therefore tune the model on it and inflate
# the final test score, so a stratified validation subset is carved out of the
# training data and the test set stays untouched until evaluation.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train,
    y_train,
    test_size=0.2,
    random_state=42,
    stratify=y_train,
)

print("Starting model training...")
history = model.fit(
    X_fit,
    y_fit,
    epochs=150,
    batch_size=32,
    validation_data=(X_val, y_val),
    callbacks=[early_stop, model_checkpoint],
    verbose=1,
)

Starting model training...
Epoch 1/150
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 78ms/step - accuracy: 0.2860 - loss: 1.8349
Epoch 1: val_accuracy improved from None to 0.11111, saving model to ser_model.h5

Epoch 1: val_accuracy improved from None to 0.11111, saving model to ser_model.h5




[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m8s[0m 105ms/step - accuracy: 0.3498 - loss: 1.6834 - val_accuracy: 0.1111 - val_loss: 5.1143
Epoch 2/150
Epoch 2/150
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 80ms/step - accuracy: 0.4810 - loss: 1.4321
Epoch 2: val_accuracy improved from 0.11111 to 0.14583, saving model to ser_model.h5

Epoch 2: val_accuracy improved from 0.11111 to 0.14583, saving model to ser_model.h5




[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 93ms/step - accuracy: 0.4878 - loss: 1.4011 - val_accuracy: 0.1458 - val_loss: 2.9101
Epoch 3/150
Epoch 3/150
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 76ms/step - accuracy: 0.5851 - loss: 1.1588
Epoch 3: val_accuracy improved from 0.14583 to 0.34722, saving model to ser_model.h5

Epoch 3: val_accuracy improved from 0.14583 to 0.34722, saving model to ser_model.h5




[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 89ms/step - accuracy: 0.5720 - loss: 1.1780 - val_accuracy: 0.3472 - val_loss: 1.7992
Epoch 4/150
Epoch 4/150
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 72ms/step - accuracy: 0.6300 - loss: 1.0550
Epoch 4: val_accuracy did not improve from 0.34722
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 81ms/step - accuracy: 0.6311 - loss: 1.0129 - val_accuracy: 0.2847 - val_loss: 2.1231
Epoch 5/150

Epoch 4: val_accuracy did not improve from 0.34722
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 81ms/step - accuracy: 0.6311 - loss: 1.0129 - val_accuracy: 0.2847 - val_loss: 2.1231
Epoch 5/150
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 75ms/step - accuracy: 0.6780 - loss: 0.8945
Epoch 5: val_accuracy improved from 0.34722 to 0.48611, saving model to ser_model.h5

Epoch 5: val_accuracy improved from 0.34722 to 0.48611, saving model to ser_model.h5




[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 87ms/step - accuracy: 0.6658 - loss: 0.9224 - val_accuracy: 0.4861 - val_loss: 1.2676
Epoch 6/150
Epoch 6/150
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 73ms/step - accuracy: 0.7012 - loss: 0.8285
Epoch 6: val_accuracy did not improve from 0.48611
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 83ms/step - accuracy: 0.6988 - loss: 0.8203 - val_accuracy: 0.4514 - val_loss: 1.4755
Epoch 7/150

Epoch 6: val_accuracy did not improve from 0.48611
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 83ms/step - accuracy: 0.6988 - loss: 0.8203 - val_accuracy: 0.4514 - val_loss: 1.4755
Epoch 7/150
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 75ms/step - accuracy: 0.7369 - loss: 0.7448
Epoch 7: val_accuracy improved from 0.48611 to 0.49306, saving model to ser_model.h5

Epoch 7: val_accuracy improved from 0.48611 to 0.49306, saving model to ser_model.h5




[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 88ms/step - accuracy: 0.7378 - loss: 0.7499 - val_accuracy: 0.4931 - val_loss: 1.6424
Epoch 8/150
Epoch 8/150
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 76ms/step - accuracy: 0.7881 - loss: 0.6206
Epoch 8: val_accuracy improved from 0.49306 to 0.53472, saving model to ser_model.h5

Epoch 8: val_accuracy improved from 0.49306 to 0.53472, saving model to ser_model.h5




[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 88ms/step - accuracy: 0.7769 - loss: 0.6298 - val_accuracy: 0.5347 - val_loss: 1.4234
Epoch 9/150
Epoch 9/150
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 73ms/step - accuracy: 0.8143 - loss: 0.5557
Epoch 9: val_accuracy improved from 0.53472 to 0.56944, saving model to ser_model.h5

Epoch 9: val_accuracy improved from 0.53472 to 0.56944, saving model to ser_model.h5




[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 85ms/step - accuracy: 0.8108 - loss: 0.5547 - val_accuracy: 0.5694 - val_loss: 1.4247
Epoch 10/150
Epoch 10/150
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 77ms/step - accuracy: 0.8043 - loss: 0.5280
Epoch 10: val_accuracy did not improve from 0.56944
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 87ms/step - accuracy: 0.8142 - loss: 0.5106 - val_accuracy: 0.4410 - val_loss: 1.8947
Epoch 11/150

Epoch 10: val_accuracy did not improve from 0.56944
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 87ms/step - accuracy: 0.8142 - loss: 0.5106 - val_accuracy: 0.4410 - val_loss: 1.8947
Epoch 11/150
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 72ms/step - accuracy: 0.8455 - loss: 0.4208
Epoch 11: val_accuracy did not improve from 0.56944
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 81ms/step - accuracy: 0.8342 - loss: 0.4481 - val_accuracy: 0



[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 85ms/step - accuracy: 0.8802 - loss: 0.3522 - val_accuracy: 0.6146 - val_loss: 1.2826
Epoch 15/150
Epoch 15/150
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 72ms/step - accuracy: 0.9140 - loss: 0.2469
Epoch 15: val_accuracy did not improve from 0.61458
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 82ms/step - accuracy: 0.9071 - loss: 0.2667 - val_accuracy: 0.5174 - val_loss: 1.8086
Epoch 16/150

Epoch 15: val_accuracy did not improve from 0.61458
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 82ms/step - accuracy: 0.9071 - loss: 0.2667 - val_accuracy: 0.5174 - val_loss: 1.8086
Epoch 16/150
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 76ms/step - accuracy: 0.9374 - loss: 0.2114
Epoch 16: val_accuracy did not improve from 0.61458
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 85ms/step - accuracy: 0.9280 - loss: 0.2346 - val_accuracy: 0



[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 86ms/step - accuracy: 0.9280 - loss: 0.2219 - val_accuracy: 0.6181 - val_loss: 1.6228
Epoch 20/150
Epoch 20/150
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 75ms/step - accuracy: 0.9426 - loss: 0.1636
Epoch 20: val_accuracy improved from 0.61806 to 0.66319, saving model to ser_model.h5

Epoch 20: val_accuracy improved from 0.61806 to 0.66319, saving model to ser_model.h5




[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 87ms/step - accuracy: 0.9479 - loss: 0.1561 - val_accuracy: 0.6632 - val_loss: 1.3443
Epoch 21/150
Epoch 21/150
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 81ms/step - accuracy: 0.9506 - loss: 0.1376
Epoch 21: val_accuracy improved from 0.66319 to 0.75694, saving model to ser_model.h5

Epoch 21: val_accuracy improved from 0.66319 to 0.75694, saving model to ser_model.h5




[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 94ms/step - accuracy: 0.9488 - loss: 0.1465 - val_accuracy: 0.7569 - val_loss: 0.9442
Epoch 22/150
Epoch 22/150
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 77ms/step - accuracy: 0.9562 - loss: 0.1450
Epoch 22: val_accuracy did not improve from 0.75694
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 86ms/step - accuracy: 0.9531 - loss: 0.1499 - val_accuracy: 0.6389 - val_loss: 1.5750
Epoch 23/150

Epoch 22: val_accuracy did not improve from 0.75694
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 86ms/step - accuracy: 0.9531 - loss: 0.1499 - val_accuracy: 0.6389 - val_loss: 1.5750
Epoch 23/150
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 78ms/step - accuracy: 0.9444 - loss: 0.1620
Epoch 23: val_accuracy did not improve from 0.75694
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 87ms/step - accuracy: 0.9453 - loss: 0.1644 - val_accuracy: 0



[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 88ms/step - accuracy: 0.9826 - loss: 0.0548 - val_accuracy: 0.7917 - val_loss: 0.9604
Epoch 41/150
Epoch 41/150
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 73ms/step - accuracy: 0.9858 - loss: 0.0560
Epoch 41: val_accuracy did not improve from 0.79167
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 82ms/step - accuracy: 0.9826 - loss: 0.0672 - val_accuracy: 0.6840 - val_loss: 1.7901
Epoch 42/150
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 74ms/step - accuracy: 0.9709 - loss: 0.0830
Epoch 42: val_accuracy did not improve from 0.79167
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 82ms/step - accuracy: 0.9757 - loss: 0.0709 - val_accuracy: 0.7292 - val_loss: 1.2538
Epoch 43/150
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 72ms/step - accuracy: 0.9816 - loss: 0.0612
Epoch 43: val_accuracy did not improve from 0.79167
[1m36/36[0m [32

In [None]:
# Score the trained network on the test split
print("\nEvaluating the best model...")
# EarlyStopping was configured with restore_best_weights=True, so the in-memory
# model is used directly as the best model.
best_model = model
evaluation = best_model.evaluate(X_test, y_test, verbose=0)
test_loss, test_accuracy = evaluation  # loss first, then the compiled accuracy metric
print(f"Test Loss: {test_loss:.4f}")
print(f"Test Accuracy: {test_accuracy*100:.2f}%")


Evaluating the best model...
Test Loss: 0.9604
Test Accuracy: 79.17%


In [None]:
# Persist the fitted label encoder so predictions can be mapped back to
# emotion names later; the model file itself was written by the checkpoint
# callback during training.
try:
    joblib.dump(le, ENCODER_FILE)
except Exception as e:
    print(f"Error saving label encoder: {e}")
else:
    print(f"Label encoder saved to {ENCODER_FILE}")
    print(f"Model saved to {MODEL_FILE}")
    print("\nTraining complete!")

Label encoder saved to label_encoder.pkl
Model saved to ser_model.h5

Training complete!
