In [1]:
!pip install resampy

Collecting resampy
  Downloading resampy-0.4.3-py3-none-any.whl.metadata (3.0 kB)
Downloading resampy-0.4.3-py3-none-any.whl (3.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m22.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: resampy
Successfully installed resampy-0.4.3


In [2]:
# Sanity check: show which Python interpreter the kernel is running on.
import sys
print(sys.executable)

/usr/bin/python3


In [3]:
# Download the RAVDESS emotional speech audio archive from Kaggle
# (saved as ravdess-emotional-speech-audio.zip in the working directory).
!kaggle datasets download -d uwrfkaggler/ravdess-emotional-speech-audio

Dataset URL: https://www.kaggle.com/datasets/uwrfkaggler/ravdess-emotional-speech-audio
License(s): CC-BY-NC-SA-4.0
Downloading ravdess-emotional-speech-audio.zip to /content
 99% 426M/429M [00:02<00:00, 173MB/s]
100% 429M/429M [00:02<00:00, 175MB/s]


In [4]:
!unzip /content/ravdess-emotional-speech-audio.zip

Archive:  /content/ravdess-emotional-speech-audio.zip
  inflating: Actor_01/03-01-01-01-01-01-01.wav  
  inflating: Actor_01/03-01-01-01-01-02-01.wav  
  inflating: Actor_01/03-01-01-01-02-01-01.wav  
  inflating: Actor_01/03-01-01-01-02-02-01.wav  
  inflating: Actor_01/03-01-02-01-01-01-01.wav  
  inflating: Actor_01/03-01-02-01-01-02-01.wav  
  inflating: Actor_01/03-01-02-01-02-01-01.wav  
  inflating: Actor_01/03-01-02-01-02-02-01.wav  
  inflating: Actor_01/03-01-02-02-01-01-01.wav  
  inflating: Actor_01/03-01-02-02-01-02-01.wav  
  inflating: Actor_01/03-01-02-02-02-01-01.wav  
  inflating: Actor_01/03-01-02-02-02-02-01.wav  
  inflating: Actor_01/03-01-03-01-01-01-01.wav  
  inflating: Actor_01/03-01-03-01-01-02-01.wav  
  inflating: Actor_01/03-01-03-01-02-01-01.wav  
  inflating: Actor_01/03-01-03-01-02-02-01.wav  
  inflating: Actor_01/03-01-03-02-01-01-01.wav  
  inflating: Actor_01/03-01-03-02-01-02-01.wav  
  inflating: Actor_01/03-01-03-02-02-01-01.wav  
  inflating: Ac

In [17]:
import numpy as np
import librosa
import os
import glob
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, LSTM, GRU, Bidirectional

In [18]:
def extract_feature(file_name, mfcc=True, chroma=True, mel=True):
    """Build one fixed-length feature vector for an audio file.

    The file is loaded with ``librosa.load(..., res_type='kaiser_fast')``. Each
    enabled feature matrix is transposed and averaged over its first axis, and
    the resulting vectors are concatenated in the order MFCC, chroma, mel.

    Args:
        file_name: Path of the audio file to load.
        mfcc: Include the mean of 40 MFCC coefficients.
        chroma: Include the mean chroma vector computed from the magnitude STFT.
        mel: Include the mean mel-spectrogram vector.

    Returns:
        1-D ``numpy.ndarray`` holding the selected features back to back; an
        empty array when every flag is False.
    """
    X, sample_rate = librosa.load(file_name, res_type='kaiser_fast')

    # Seed with an empty array so the all-flags-off case still returns an
    # (empty) ndarray and the stacked result keeps the same dtype promotion
    # as stacking onto np.array([]).
    parts = [np.array([])]

    # Extract MFCC features
    if mfcc:
        mfcc_matrix = librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40)
        parts.append(np.mean(mfcc_matrix.T, axis=0))

    # Extract Chroma features
    if chroma:
        # Only the chroma branch consumes the magnitude STFT, so it is computed
        # here rather than whenever `mel` is requested.
        stft = np.abs(librosa.stft(X))
        chroma_matrix = librosa.feature.chroma_stft(S=stft, sr=sample_rate)
        parts.append(np.mean(chroma_matrix.T, axis=0))

    # Extract Mel-spectrogram features
    if mel:
        mel_matrix = librosa.feature.melspectrogram(y=X, sr=sample_rate)
        parts.append(np.mean(mel_matrix.T, axis=0))

    return np.hstack(parts)

In [19]:
# Maps the two-digit emotion code (third dash-separated field of a recording's
# file name) to a human-readable label.
emotions = dict(
    zip(
        ('01', '02', '03', '04', '05', '06', '07', '08'),
        ('neutral', 'calm', 'happy', 'sad', 'angry', 'fearful', 'disgust', 'surprised'),
    )
)

# Labels kept for training; recordings whose emotion is not listed are skipped.
observed_emotions = [
    'neutral',
    'calm',
    'happy',
    'sad',
    'angry',
    'fearful',
    'disgust',
    'surprised',
]


In [20]:
def load_data(test_size=0.2, pattern='/content/audio_speech_actors_01-24/Actor_*/*.wav'):
    """Extract features for every matching recording and split into train/test.

    The emotion label is looked up in ``emotions`` from the third
    dash-separated field of each file name; files whose label is not in
    ``observed_emotions`` are skipped.

    Args:
        test_size: Fraction of samples placed in the test split.
        pattern: Glob pattern selecting the ``.wav`` files to load.

    Returns:
        The ``train_test_split`` result ``(x_train, x_test, y_train, y_test)``,
        where the ``x`` parts are 2-D feature arrays and the ``y`` parts are
        lists of emotion label strings.

    Raises:
        ValueError: If no usable audio file matches ``pattern``.
        KeyError: If a file name carries an emotion code missing from ``emotions``.
    """
    x, y = [], []
    # glob returns paths in filesystem order; sorting keeps the sample order
    # (and therefore the seeded split below) identical across machines and runs.
    for file in sorted(glob.glob(pattern)):
        file_name = os.path.basename(file)
        emotion = emotions[file_name.split("-")[2]]
        if emotion not in observed_emotions:
            continue
        feature = extract_feature(file, mfcc=True, chroma=True, mel=True)
        x.append(feature)
        y.append(emotion)

    if not x:
        # Fail with an actionable message rather than the generic empty-input
        # error train_test_split would produce.
        raise ValueError(
            "No audio files matched {!r}; check that the dataset was extracted.".format(pattern)
        )

    # NOTE(review): the split is per file, so recordings from the same Actor_*
    # folder can end up in both the train and the test set.
    return train_test_split(np.array(x), y, test_size=test_size, random_state=5)


In [21]:
# Extract features for every recording and split them with load_data's
# default test_size (0.2). This walks the whole dataset, so it is slow.
x_train, x_test, y_train, y_test = load_data()


In [22]:
# Append a trailing axis of length 1 so each feature vector is presented to the
# recurrent layers as a (n_features, 1) sequence.
x_train = np.reshape(np.array(x_train), (*x_train.shape[:2], 1))
x_test = np.reshape(np.array(x_test), (*x_test.shape[:2], 1))

In [23]:
# Map emotion strings to integer ids (fitted on the training labels only), then
# one-hot encode both splits.
encoder = LabelEncoder()
y_train_ids = encoder.fit_transform(y_train)
y_test_ids = encoder.transform(y_test)

# Fix the one-hot width explicitly: by default to_categorical infers it from
# the largest id present, so a split missing the highest class would otherwise
# produce a narrower matrix than the other split.
num_classes = len(encoder.classes_)
y_train = tf.keras.utils.to_categorical(y_train_ids, num_classes=num_classes)
y_test = tf.keras.utils.to_categorical(y_test_ids, num_classes=num_classes)

In [24]:
# Stacked recurrent classifier: BiLSTM -> GRU -> BiLSTM -> dense head, with
# dropout after every stage and one softmax unit per observed emotion.
# The input shape is declared with an explicit Input layer instead of passing
# input_shape to the first layer, which current Keras versions warn against
# for Sequential models.
model = Sequential([
    tf.keras.Input(shape=(x_train.shape[1], 1)),
    Bidirectional(LSTM(128, return_sequences=True)),
    Dropout(0.3),
    GRU(128, return_sequences=True),
    Dropout(0.3),
    Bidirectional(LSTM(64)),  # last recurrent layer returns only its final state
    Dropout(0.3),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(len(observed_emotions), activation='softmax'),
])


  super().__init__(**kwargs)


In [25]:
# Categorical cross-entropy matches the one-hot encoded labels; accuracy is
# tracked as the reported metric.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)



In [26]:
# Keep the returned History so the per-epoch loss/accuracy curves can be
# inspected or plotted later instead of being discarded.
# NOTE(review): the test split doubles as validation data here, so the
# val_* numbers and the final evaluation are measured on the same samples.
history = model.fit(
    x_train,
    y_train,
    batch_size=32,
    epochs=100,
    validation_data=(x_test, y_test),
    verbose=1,
)

Epoch 1/100
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m47s[0m 1s/step - accuracy: 0.1648 - loss: 2.0576 - val_accuracy: 0.1944 - val_loss: 1.9984
Epoch 2/100
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 1s/step - accuracy: 0.2102 - loss: 1.9876 - val_accuracy: 0.2153 - val_loss: 1.9297
Epoch 3/100
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m42s[0m 1s/step - accuracy: 0.2322 - loss: 1.9400 - val_accuracy: 0.2500 - val_loss: 1.8866
Epoch 4/100
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 1s/step - accuracy: 0.2007 - loss: 1.9573 - val_accuracy: 0.2083 - val_loss: 1.9633
Epoch 5/100
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m38s[0m 1s/step - accuracy: 0.1925 - loss: 1.9750 - val_accuracy: 0.2708 - val_loss: 1.8793
Epoch 6/100
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m44s[0m 1s/step - accuracy: 0.2308 - loss: 1.9035 - val_accuracy: 0.2882 - val_loss: 1.8444
Epoch 7/100
[1m36/36[0m [32m━━━

<keras.src.callbacks.history.History at 0x7dcd74594df0>

In [27]:
# Score the trained model on the test split. evaluate() yields the loss
# followed by the single 'accuracy' metric registered in compile().
loss, accuracy = model.evaluate(x_test, y_test, verbose=0)
summary_line = "BiLSTM + GRU Model Accuracy: {:.2f}%".format(accuracy * 100)
print(summary_line)


BiLSTM + GRU Model Accuracy: 56.94%
