In [18]:
# IMPORT MODULES
import pandas as pd
import numpy as np
import os
import librosa
import seaborn as sns
import matplotlib.pyplot as plt
import librosa.display
from sklearn.preprocessing import OneHotEncoder
from keras.models import Sequential
from keras.layers import Dense, LSTM, Dropout
from keras.regularizers import l2
from keras.callbacks import EarlyStopping
import warnings
warnings.filterwarnings('ignore')

In [19]:
'''
# Get the current directory of the script
script_dir = os.path.dirname(os.path.abspath(__file__))

# Specify the relative paths to the dataset directories
tess_dataset_relative_path = 'datasets/TESS'
cremad_dataset_relative_path = 'datasets/CREMA-D/AudioWAV'
ravdess_dataset_relative_path = 'datasets/RAVDESS'
savee_dataset_relative_path = 'datasets/SAVEE'

# Construct the absolute paths to the dataset directories
tess_dataset_path = os.path.join(script_dir, tess_dataset_relative_path)
cremad_dataset_path = os.path.join(script_dir, cremad_dataset_relative_path)
ravdess_dataset_path = os.path.join(script_dir, ravdess_dataset_relative_path)
savee_dataset_path = os.path.join(script_dir, savee_dataset_relative_path)
'''

cremad_dataset_path = '/kaggle/input/speech-emotion-recognition-en/Crema'
ravdess_dataset_path = '/kaggle/input/speech-emotion-recognition-en/Ravdess'
savee_dataset_path = '/kaggle/input/speech-emotion-recognition-en/Savee'
tess_dataset_path = '/kaggle/input/speech-emotion-recognition-en/Tess'

# Create the paths and labels lists
paths = []
labels = []

# LOAD THE TESS DATASET
for dirname, _, filenames in os.walk(tess_dataset_path):
    for filename in filenames:
        paths.append(os.path.join(dirname, filename))
        label = filename.split('_')[-1]
        label = label.split('.')[0]
        if label == 'ps':
            label = 'surprise'
        labels.append(label)
    if len(paths) == 2800:
        break
print('TESS dataset is loaded.')

# Dictionary for mapping the CREMA-D dataset to the same labels as the TESS dataset.
cremad_emotion_dict = {
    "ANG": "angry",
    "DIS": "disgust",
    "FEA": "fear",
    "HAP": "happy",
    "NEU": "neutral",
    "SAD": "sad"
}

# LOAD THE CREMA-D DATASET
for dirname, _, filenames in os.walk(cremad_dataset_path):
    for filename in filenames:
        paths.append(os.path.join(dirname, filename))
        label = filename.split('_')[2]
        label = cremad_emotion_dict[label]
        labels.append(label)
    if len(paths) == 7442:
        break
print('CREMA-D dataset is loaded.')

# Dictionary for mapping the RAVDESS dataset to the same labels as the TESS dataset.
ravdess_emotion_dict = {
    "01": "neutral",
    "02": "neutral",  # originally "calm"
    "03": "happy",
    "04": "sad",
    "05": "angry",
    "06": "fear",
    "07": "disgust",
    "08": "surprise"
}

# LOAD THE RAVDESS DATASET
for dirname, _, filenames in os.walk(ravdess_dataset_path):
    for filename in filenames:
        paths.append(os.path.join(dirname, filename))
        label = filename.split('-')[2]
        label = ravdess_emotion_dict[label]
        labels.append(label)
    if len(paths) == 1440:
        break
print('RAVDESS dataset is loaded.')

# Dictionary for mapping the RAVDESS dataset to the same labels as the TESS dataset.
savee_emotion_dict = {
    "KL": "angry",
    "JK": "happy",
    "JE": "sad",
    "DC": "neutral"
}

# LOAD THE SAVEE DATASET
for dirname, _, filenames in os.walk(savee_dataset_path):
    for filename in filenames:
        paths.append(os.path.join(dirname, filename))
        label = filename.split('_')[0]
        label = savee_emotion_dict[label]
        labels.append(label)
    if len(paths) == 480:
        break
print('SAVEE dataset is loaded.')

print(len(paths))
print(len(labels))

## Create a dataframe
df = pd.DataFrame()
df['speech'] = paths
df['label'] = labels
print(df.head())

print(df['label'].value_counts())

TESS dataset is loaded.
CREMA-D dataset is loaded.
RAVDESS dataset is loaded.
SAVEE dataset is loaded.
12162
12162
                                              speech label
0  /kaggle/input/speech-emotion-recognition-en/Te...  fear
1  /kaggle/input/speech-emotion-recognition-en/Te...  fear
2  /kaggle/input/speech-emotion-recognition-en/Te...  fear
3  /kaggle/input/speech-emotion-recognition-en/Te...  fear
4  /kaggle/input/speech-emotion-recognition-en/Te...  fear
label
angry       1983
sad         1983
happy       1983
neutral     1895
fear        1863
disgust     1863
surprise     592
Name: count, dtype: int64


In [20]:
## Feature Extraction
def extract_mfcc(filename):
    y, sr = librosa.load(filename, duration=3, offset=0.5)
    mfcc = np.mean(librosa.feature.mfcc(y=y, sr=sr, n_mfcc=40).T, axis=0)
    return mfcc

X_mfcc = df['speech'].apply(lambda x: extract_mfcc(x))

X = [x for x in X_mfcc]
X = np.array(X)
print(X.shape)

(12162, 40)


In [21]:
## Input Split
X = np.expand_dims(X, -1)
print(X.shape)

enc = OneHotEncoder()
y = enc.fit_transform(df[['label']])
y = y.toarray()
print(y.shape)

(12162, 40, 1)
(12162, 7)


In [69]:
## Create the LSTM Model

model = Sequential([
    LSTM(512, return_sequences=True, input_shape=(40,1)),
    Dropout(0.3),  # Reduced dropout rate
    LSTM(256, return_sequences=True),
    Dropout(0.3),  # Reduced dropout rate
    LSTM(128, return_sequences=False),
    Dropout(0.2),  # Reduced dropout rate
    Dense(256, activation='relu'),
    Dropout(0.2),  # Reduced dropout rate
    Dense(128, activation='relu'),
    Dropout(0.1),  # Reduced dropout rate
    Dense(7, activation='softmax')
])


model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

Model: "sequential_17"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_49 (LSTM)              (None, 40, 512)           1052672   
                                                                 
 dropout_83 (Dropout)        (None, 40, 512)           0         
                                                                 
 lstm_50 (LSTM)              (None, 40, 256)           787456    
                                                                 
 dropout_84 (Dropout)        (None, 40, 256)           0         
                                                                 
 lstm_51 (LSTM)              (None, 128)               197120    
                                                                 
 dropout_85 (Dropout)        (None, 128)               0         
                                                                 
 dense_51 (Dense)            (None, 256)             

In [70]:
## Train the model
early_stop = EarlyStopping(monitor='val_loss', patience=20)  # Early stopping

history = model.fit(X, y, validation_split=0.2, epochs=100, batch_size=64, callbacks=[early_stop])

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100


In [None]:
## Result plotting

# Accuracy
epochs = list(range(len(history.history['accuracy'])))
acc = history.history['accuracy']
val_acc = history.history['val_accuracy']

plt.plot(epochs, acc, label='train accuracy')
plt.plot(epochs, val_acc, label='val accuracy')
plt.xlabel('epochs')
plt.ylabel('accuracy')
plt.legend()
plt.show()

# Loss
loss = history.history['loss']
val_loss = history.history['val_loss']

plt.plot(epochs, loss, label='train loss')
plt.plot(epochs, val_loss, label='val loss')
plt.xlabel('epochs')
plt.ylabel('loss')
plt.legend()
plt.show()

In [41]:
model.save('model_output2.keras')  # The file needs to end with the .keras extension