In [1]:
import librosa
import tensorflow as tf
from tensorflow.keras.utils import to_categorical
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
import sklearn.metrics
import os
from glob import glob
import sklearn
import numpy as np
from tools.voiceActivityDetector import VAD

In [None]:
# Local folder that holds one sub-folder of .wav files per emotion class.
# glob() and os.listdir() only read the local filesystem, so the GitHub URL
# https://github.com/pdadial/Speech_Emotion_Recognition_CNN-LSTM/tree/main/Database
# cannot be used directly; clone the repository and point at its Database folder.
# NOTE(review): assumes the notebook runs from the repository root - adjust if not.
database_path = 'Database'

if not os.path.isdir(database_path):
    raise FileNotFoundError(
        "Dataset folder '{}' not found; clone the repository or update database_path".format(database_path)
    )

# Sorted so that sample order (and therefore the seeded train/val/test split)
# is identical on every machine; separators are normalised to '/'.
audio_paths = sorted(
    p.replace(os.sep, '/')
    for p in glob('{}/**/*.wav'.format(database_path), recursive=True)
)
if not audio_paths:
    raise FileNotFoundError("No .wav files found under '{}'".format(database_path))

# One class per sub-folder. Sorting gives a stable label id for each class
# instead of depending on the arbitrary order returned by os.listdir().
classes = sorted(
    d for d in os.listdir(database_path)
    if os.path.isdir(os.path.join(database_path, d))
)
label_encode = {name: idx for idx, name in enumerate(classes)}

# The label of a clip is the name of the folder that directly contains it.
labels = [label_encode[os.path.basename(os.path.dirname(p))] for p in audio_paths]

In [None]:
sample_rate = 16000
FRAME_LENGTH = int(0.025*sample_rate)   # analysis window: 25 ms worth of samples
HOP_LENGTH = int(0.25*FRAME_LENGTH)     # hop: a quarter of the window
N_FFT = 1024
MAX_SAMPLES = int(sample_rate*5)        # every clip is zero-padded / cut to 5 s

n_classes = len(classes)
n_mfccs = 19
n_audio = len(audio_paths)

# `db` (the database id handed to VAD) is not assigned anywhere in this
# notebook, so a fresh kernel would stop with NameError. A value defined
# earlier is kept; otherwise fall back to 1 for an 8-class corpus, matching the
# class-name selection in the confusion-matrix cell.
# NOTE(review): the fallback id 2 for other corpora is a guess - confirm the
# ids that tools.voiceActivityDetector.VAD expects.
try:
    db
except NameError:
    db = 1 if n_classes == 8 else 2


def extract_features(path):
    """Compute the feature vector of one audio file.

    Parameters
    ----------
    path : str
        Audio file to load.

    Returns
    -------
    numpy.ndarray
        1-D array of length ``n_mfccs + 4``: the time-averaged MFCCs followed
        by mean RMS energy, mean spectral centroid, mean zero-crossing rate and
        mean pitch, in that order.
    """
    audio, _ = librosa.load(path, sr=sample_rate, res_type='fft', offset=0.5)

    # Cut to the buffer length: assigning a longer waveform into the fixed-size
    # buffer below would raise a shape-mismatch ValueError.
    waveform = np.asarray(VAD(audio, sample_rate, int(db)))[:MAX_SAMPLES]
    waveform_pad = np.zeros(MAX_SAMPLES)
    waveform_pad[:len(waveform)] = waveform

    stft_kwargs = dict(n_fft=N_FFT, hop_length=HOP_LENGTH, win_length=FRAME_LENGTH, window='hamming')
    frame_kwargs = dict(frame_length=FRAME_LENGTH, hop_length=HOP_LENGTH)

    # MFCC, ZCR and RMS are computed on the padded, fixed-length signal.
    mfccs = librosa.feature.mfcc(
        y=waveform_pad, sr=sample_rate, n_mfcc=n_mfccs, n_mels=128,
        fmax=sample_rate/2, **stft_kwargs
    ).mean(axis=1)
    zcr = np.mean(librosa.feature.zero_crossing_rate(y=waveform_pad, **frame_kwargs))
    rmse = np.mean(librosa.feature.rms(y=waveform_pad, **frame_kwargs))

    # Pitch and spectral centroid are computed on the unpadded signal.
    pitches, magnitudes = librosa.piptrack(y=waveform, sr=sample_rate, **stft_kwargs)
    voiced = pitches[magnitudes > 0]
    # The mean of an empty selection is NaN, which would propagate through
    # scaling into training; use 0.0 when no bin has positive magnitude.
    pitch = voiced.mean() if voiced.size else 0.0
    centroid = np.mean(librosa.feature.spectral_centroid(y=waveform, sr=sample_rate, **stft_kwargs))

    return np.concatenate([mfccs, [rmse, centroid, zcr, pitch]])


X = np.empty((n_audio, n_mfccs + 4), dtype=np.float32)
for i, path in enumerate(audio_paths):
    X[i] = extract_features(path)

# One-hot targets, one row per clip.
Y = to_categorical(labels, num_classes=n_classes).astype(np.uint8)

In [None]:
scaler = StandardScaler()
X = scaler.fit_transform(X)
X = X[..., np.newaxis]

In [None]:
X_train, X_val, Y_train, Y_val = train_test_split(X, Y, test_size=0.05, random_state=42)
X_train, X_test, Y_train, Y_test = train_test_split(X_train, Y_train, test_size=0.025, random_state=42)

In [None]:
def _conv_block(filters, kernel_size, **conv_kwargs):
    """Return the layers of one block: Conv1D -> BatchNorm -> ELU -> MaxPool -> Dropout.

    Extra keyword arguments are forwarded to the Conv1D layer (used to pass
    ``input_shape`` to the first block).
    """
    return [
        tf.keras.layers.Conv1D(filters=filters, kernel_size=kernel_size, strides=1, padding='same', **conv_kwargs),
        tf.keras.layers.BatchNormalization(),
        tf.keras.layers.Activation('elu'),
        tf.keras.layers.MaxPool1D(pool_size=2, strides=2),
        tf.keras.layers.Dropout(0.25),
    ]


# Three convolutional blocks with growing filter count and shrinking kernel.
conv_layers = (
    _conv_block(32, 9, input_shape=(n_mfccs + 4, 1))
    + _conv_block(64, 7)
    + _conv_block(128, 5)
)

# Recurrent layer over the pooled sequence, flattened into a softmax classifier.
head_layers = [
    tf.keras.layers.LSTM(32, return_sequences=True),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(units=n_classes, activation='softmax'),
]

model = tf.keras.models.Sequential(conv_layers + head_layers)

model.summary()

In [None]:
# The `decay` keyword of SGD is rejected by current Keras optimizers. An
# InverseTimeDecay schedule with decay_steps=1 yields the same learning rate,
# lr = 0.01 / (1 + 1e-3 * step), and is also accepted by older TF 2.x releases.
lr_schedule = tf.keras.optimizers.schedules.InverseTimeDecay(
    initial_learning_rate=0.01,
    decay_steps=1,
    decay_rate=1e-3,
)
opt = tf.keras.optimizers.SGD(learning_rate=lr_schedule, momentum=0.8)

In [None]:
# Configure training: categorical cross-entropy on the one-hot targets,
# tracking categorical accuracy.
model.compile(
    optimizer=opt,
    loss='categorical_crossentropy',
    metrics=['categorical_accuracy'],
)

# Train and keep the per-epoch metrics for the plots below.
history = model.fit(
    X_train,
    Y_train,
    validation_data=(X_val, Y_val),
    epochs=1500,
    batch_size=64,
)

In [None]:
# Plotting setup: render inline figures as SVG.
import matplotlib.pyplot as plt
# NOTE(review): IPython.display.set_matplotlib_formats is deprecated since
# IPython 7.23 in favour of matplotlib_inline.backend_inline.set_matplotlib_formats
# (or `%config InlineBackend.figure_formats = ['svg']`) - consider migrating.
from IPython.display import set_matplotlib_formats
set_matplotlib_formats('svg')

In [None]:
# Training vs. validation loss per epoch.
fig, ax = plt.subplots()
for curve_key in ('loss', 'val_loss'):
    ax.plot(history.history[curve_key])
ax.set_title('model loss')
ax.set_ylabel('loss')
ax.set_xlabel('epoch')
ax.legend(['train', 'validation'], loc='upper left')
plt.show()

In [None]:
# Training vs. validation accuracy per epoch.
fig, ax = plt.subplots()
for curve_key in ('categorical_accuracy', 'val_categorical_accuracy'):
    ax.plot(history.history[curve_key])
ax.set_title('model accuracy')
ax.set_ylabel('acc')
ax.set_xlabel('epoch')
ax.legend(['train', 'validation'], loc='upper left')
plt.show()

In [None]:
# Score the model on the held-out test split and report accuracy in percent.
test_metrics = model.evaluate(X_test, Y_test)
loss, acc = test_metrics
accuracy_percent = 100*acc
print("Model Accuracy: {:5.2f}%".format(accuracy_percent))

In [None]:
# Class indices: ground truth from the one-hot targets, prediction from the
# highest softmax output.
y_true = np.argmax(Y_test, axis=1)
y_pred = np.argmax(model.predict(X_test), axis=-1)

# Display names are chosen by the number of classes (8 or 7) rather than by
# `db`, which is not defined in this notebook; any other class count falls
# back to the raw class folder names.
# NOTE(review): the hard-coded names assume label ids follow the alphabetical
# order of the class folders - confirm against the dataset layout.
class_names_by_count = {
    8: ['anger', 'calm', 'disgust', 'fear', 'happiness', 'neutral', 'sadness', 'surprised'],
    7: ['anger', 'disgust', 'fear', 'happiness', 'neutral', 'sadness', 'surprised'],
}
class_names = class_names_by_count.get(n_classes, list(classes))

# labels=... keeps the matrix n_classes x n_classes even when the small test
# split lacks some class (otherwise it no longer matches display_labels).
# normalize='true' divides each row by its own total; dividing by
# `cm.sum(axis=1)` without keepdims broadcasts over columns and normalises the
# wrong axis. Rows with no samples become 0 instead of NaN.
confusion_matrix = sklearn.metrics.confusion_matrix(
    y_true,
    y_pred,
    labels=np.arange(n_classes),
    normalize='true',
)

disp = sklearn.metrics.ConfusionMatrixDisplay(confusion_matrix, display_labels=class_names)
disp.plot(cmap=plt.cm.Blues, xticks_rotation='vertical')
plt.show()