In [1]:
!pip install resampy

Collecting resampy
  Downloading resampy-0.4.3-py3-none-any.whl.metadata (3.0 kB)
Downloading resampy-0.4.3-py3-none-any.whl (3.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m24.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: resampy
Successfully installed resampy-0.4.3


In [2]:
import sys
print(sys.executable)

/usr/bin/python3


In [3]:
!kaggle datasets download -d uwrfkaggler/ravdess-emotional-speech-audio

Dataset URL: https://www.kaggle.com/datasets/uwrfkaggler/ravdess-emotional-speech-audio
License(s): CC-BY-NC-SA-4.0
Downloading ravdess-emotional-speech-audio.zip to /content
 99% 425M/429M [00:04<00:00, 84.1MB/s]
100% 429M/429M [00:04<00:00, 106MB/s] 


In [4]:
!unzip /content/ravdess-emotional-speech-audio.zip

Archive:  /content/ravdess-emotional-speech-audio.zip
  inflating: Actor_01/03-01-01-01-01-01-01.wav  
  inflating: Actor_01/03-01-01-01-01-02-01.wav  
  inflating: Actor_01/03-01-01-01-02-01-01.wav  
  inflating: Actor_01/03-01-01-01-02-02-01.wav  
  inflating: Actor_01/03-01-02-01-01-01-01.wav  
  inflating: Actor_01/03-01-02-01-01-02-01.wav  
  inflating: Actor_01/03-01-02-01-02-01-01.wav  
  inflating: Actor_01/03-01-02-01-02-02-01.wav  
  inflating: Actor_01/03-01-02-02-01-01-01.wav  
  inflating: Actor_01/03-01-02-02-01-02-01.wav  
  inflating: Actor_01/03-01-02-02-02-01-01.wav  
  inflating: Actor_01/03-01-02-02-02-02-01.wav  
  inflating: Actor_01/03-01-03-01-01-01-01.wav  
  inflating: Actor_01/03-01-03-01-01-02-01.wav  
  inflating: Actor_01/03-01-03-01-02-01-01.wav  
  inflating: Actor_01/03-01-03-01-02-02-01.wav  
  inflating: Actor_01/03-01-03-02-01-01-01.wav  
  inflating: Actor_01/03-01-03-02-01-02-01.wav  
  inflating: Actor_01/03-01-03-02-02-01-01.wav  
  inflating: Ac

In [6]:
import librosa
import os, glob
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import GRU, Dense, Dropout
from tensorflow.keras.callbacks import EarlyStopping


In [5]:
emotions = {
    '01': 'neutral',
    '02': 'calm',
    '03': 'happy',
    '04': 'sad',
    '05': 'angry',
    '06': 'fearful',
    '07': 'disgust',
    '08': 'surprised'
}
observed_emotions = ['neutral', 'calm', 'happy', 'sad', 'angry', 'fearful', 'disgust', 'surprised']


In [7]:
def extract_feature(file_name, mfcc, chroma, mel):
    X, sample_rate = librosa.load(os.path.join(file_name), res_type='kaiser_fast')
    if chroma:
        stft = np.abs(librosa.stft(X))
    result = np.array([])
    if mfcc:
        mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T, axis=0)
        result = np.hstack((result, mfccs))
    if chroma:
        chroma = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T, axis=0)
        result = np.hstack((result, chroma))
    if mel:
        mel_spectrogram = np.mean(librosa.feature.melspectrogram(y=X, sr=sample_rate).T, axis=0)
        result = np.hstack((result, mel_spectrogram))
    return result

In [8]:
def load_data(test_size=0.2):
    x, y = [], []
    for file in glob.glob('/content/audio_speech_actors_01-24/Actor_*/*.wav'):
        file_name = os.path.basename(file)
        emotion = emotions[file_name.split("-")[2]]
        if emotion not in observed_emotions:
            continue
        feature = extract_feature(file, mfcc=True, chroma=True, mel=True)
        x.append(feature)
        y.append(emotion)
    # Encode labels
    label_encoder = LabelEncoder()
    y = label_encoder.fit_transform(y)
    # Split data into training and testing sets
    return train_test_split(np.array(x), y, test_size=test_size, random_state=9)


In [20]:
x_train, x_test, y_train, y_test = load_data(test_size=0.25)


In [21]:
x_train = np.expand_dims(x_train, axis=-1)
x_test = np.expand_dims(x_test, axis=-1)

In [22]:
print(f'Training samples: {x_train.shape[0]}, Testing samples: {x_test.shape[0]}')
print(f'Features extracted: {x_train.shape[1]}')

Training samples: 1080, Testing samples: 360
Features extracted: 180


In [23]:
model = Sequential()
model.add(GRU(128, return_sequences=True, input_shape=(x_train.shape[1], 1)))
model.add(Dropout(0.3))
model.add(GRU(64))
model.add(Dropout(0.3))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.3))
model.add(Dense(len(observed_emotions), activation='softmax'))

  super().__init__(**kwargs)


In [24]:
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])


In [25]:
history = model.fit(x_train, y_train, epochs=150, batch_size=32, validation_split=0.2, verbose=2)


Epoch 1/150
27/27 - 14s - 514ms/step - accuracy: 0.1285 - loss: 2.0756 - val_accuracy: 0.1204 - val_loss: 2.0797
Epoch 2/150
27/27 - 7s - 253ms/step - accuracy: 0.1458 - loss: 2.0632 - val_accuracy: 0.1204 - val_loss: 2.0874
Epoch 3/150
27/27 - 10s - 359ms/step - accuracy: 0.1262 - loss: 2.0671 - val_accuracy: 0.1204 - val_loss: 2.0832
Epoch 4/150
27/27 - 7s - 274ms/step - accuracy: 0.1134 - loss: 2.0633 - val_accuracy: 0.1343 - val_loss: 2.0911
Epoch 5/150
27/27 - 12s - 426ms/step - accuracy: 0.1273 - loss: 2.0630 - val_accuracy: 0.1204 - val_loss: 2.0962
Epoch 6/150
27/27 - 8s - 302ms/step - accuracy: 0.1215 - loss: 2.0633 - val_accuracy: 0.1204 - val_loss: 2.0865
Epoch 7/150
27/27 - 9s - 329ms/step - accuracy: 0.1354 - loss: 2.0622 - val_accuracy: 0.1204 - val_loss: 2.0913
Epoch 8/150
27/27 - 12s - 442ms/step - accuracy: 0.1447 - loss: 2.0632 - val_accuracy: 0.1204 - val_loss: 2.0930
Epoch 9/150
27/27 - 8s - 287ms/step - accuracy: 0.1655 - loss: 2.0587 - val_accuracy: 0.1019 - val_l

In [26]:
y_pred = np.argmax(model.predict(x_test), axis=1)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)
print("Accuracy: {:.2f}%".format(accuracy * 100))

[1m12/12[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 93ms/step
Accuracy: 36.11%
