In [1]:
!pip install resampy

Collecting resampy
  Downloading resampy-0.4.3-py3-none-any.whl.metadata (3.0 kB)
Downloading resampy-0.4.3-py3-none-any.whl (3.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m13.1 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: resampy
Successfully installed resampy-0.4.3


In [2]:
import sys
print(sys.executable)

/usr/bin/python3


In [3]:
!kaggle datasets download -d uwrfkaggler/ravdess-emotional-speech-audio

Dataset URL: https://www.kaggle.com/datasets/uwrfkaggler/ravdess-emotional-speech-audio
License(s): CC-BY-NC-SA-4.0
Downloading ravdess-emotional-speech-audio.zip to /content
 99% 425M/429M [00:05<00:00, 61.9MB/s]
100% 429M/429M [00:05<00:00, 77.8MB/s]


In [4]:
!unzip /content/ravdess-emotional-speech-audio.zip

Archive:  /content/ravdess-emotional-speech-audio.zip
  inflating: Actor_01/03-01-01-01-01-01-01.wav  
  inflating: Actor_01/03-01-01-01-01-02-01.wav  
  inflating: Actor_01/03-01-01-01-02-01-01.wav  
  inflating: Actor_01/03-01-01-01-02-02-01.wav  
  inflating: Actor_01/03-01-02-01-01-01-01.wav  
  inflating: Actor_01/03-01-02-01-01-02-01.wav  
  inflating: Actor_01/03-01-02-01-02-01-01.wav  
  inflating: Actor_01/03-01-02-01-02-02-01.wav  
  inflating: Actor_01/03-01-02-02-01-01-01.wav  
  inflating: Actor_01/03-01-02-02-01-02-01.wav  
  inflating: Actor_01/03-01-02-02-02-01-01.wav  
  inflating: Actor_01/03-01-02-02-02-02-01.wav  
  inflating: Actor_01/03-01-03-01-01-01-01.wav  
  inflating: Actor_01/03-01-03-01-01-02-01.wav  
  inflating: Actor_01/03-01-03-01-02-01-01.wav  
  inflating: Actor_01/03-01-03-01-02-02-01.wav  
  inflating: Actor_01/03-01-03-02-01-01-01.wav  
  inflating: Actor_01/03-01-03-02-01-02-01.wav  
  inflating: Actor_01/03-01-03-02-02-01-01.wav  
  inflating: Ac

In [5]:
import numpy as np
import librosa
import os
import glob
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Flatten, Conv1D, MaxPooling1D, LSTM, Bidirectional


In [6]:
emotions = {
    '01': 'neutral',
    '02': 'calm',
    '03': 'happy',
    '04': 'sad',
    '05': 'angry',
    '06': 'fearful',
    '07': 'disgust',
    '08': 'surprised'
}
observed_emotions = ['neutral', 'calm', 'happy', 'sad', 'angry', 'fearful', 'disgust', 'surprised']

In [7]:
def extract_feature(file_name, mfcc=True, chroma=True, mel=True):
    X, sample_rate = librosa.load(file_name, res_type='kaiser_fast')
    result = np.array([])

    if chroma or mel:
        stft = np.abs(librosa.stft(X))

    # Extract MFCC features
    if mfcc:
        mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T, axis=0)
        result = np.hstack((result, mfccs))

    # Extract Chroma features
    if chroma:
        chroma_stft = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T, axis=0)
        result = np.hstack((result, chroma_stft))

    # Extract Mel-spectrogram features
    if mel:
        mel_spectrogram = np.mean(librosa.feature.melspectrogram(y=X, sr=sample_rate).T, axis=0)
        result = np.hstack((result, mel_spectrogram))

    return result

In [8]:
def load_data(test_size=0.2):
    x, y = [], []
    for file in glob.glob('/content/audio_speech_actors_01-24/Actor_*/*.wav'):
        file_name = os.path.basename(file)
        emotion = emotions[file_name.split("-")[2]]
        if emotion not in observed_emotions:
            continue
        feature = extract_feature(file, mfcc=True, chroma=True, mel=True)
        x.append(feature)
        y.append(emotion)

    return train_test_split(np.array(x), y, test_size=test_size, random_state=5)


In [9]:
x_train, x_test, y_train, y_test = load_data()

In [10]:
x_train = np.array(x_train).reshape(x_train.shape[0], x_train.shape[1], 1)
x_test = np.array(x_test).reshape(x_test.shape[0], x_test.shape[1], 1)

In [11]:
encoder = LabelEncoder()
y_train = tf.keras.utils.to_categorical(encoder.fit_transform(y_train))
y_test = tf.keras.utils.to_categorical(encoder.transform(y_test))

In [12]:
model = Sequential()

# 1D Convolutional layer to extract local features from the input data
model.add(Conv1D(64, kernel_size=3, activation='relu', input_shape=(x_train.shape[1], 1)))
model.add(MaxPooling1D(pool_size=2))
model.add(Dropout(0.3))

# Adding a Bidirectional LSTM layer to capture long-term dependencies
model.add(Bidirectional(LSTM(128, return_sequences=True)))
model.add(Dropout(0.3))

# Flatten the output before passing it to the dense layer
model.add(Flatten())

# Fully connected dense layer
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.3))

# Output layer for emotion classification
model.add(Dense(len(observed_emotions), activation='softmax'))




  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [13]:
# Compile the model
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])


In [14]:

# Train the model
model.fit(x_train, y_train, batch_size=32, epochs=50, validation_data=(x_test, y_test), verbose=1)

Epoch 1/50
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m15s[0m 256ms/step - accuracy: 0.2105 - loss: 1.9995 - val_accuracy: 0.3333 - val_loss: 1.7765
Epoch 2/50
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 325ms/step - accuracy: 0.3237 - loss: 1.7616 - val_accuracy: 0.3194 - val_loss: 1.7800
Epoch 3/50
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m17s[0m 239ms/step - accuracy: 0.3771 - loss: 1.6787 - val_accuracy: 0.3715 - val_loss: 1.6541
Epoch 4/50
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 326ms/step - accuracy: 0.4227 - loss: 1.5676 - val_accuracy: 0.3785 - val_loss: 1.6338
Epoch 5/50
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m18s[0m 245ms/step - accuracy: 0.4235 - loss: 1.5398 - val_accuracy: 0.4792 - val_loss: 1.4946
Epoch 6/50
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m13s[0m 308ms/step - accuracy: 0.4724 - loss: 1.4315 - val_accuracy: 0.4965 - val_loss: 1.4738
Epoch 7/50
[1m36/36[

<keras.src.callbacks.history.History at 0x7dfd67b30d30>

In [15]:
# Evaluate the model
loss, accuracy = model.evaluate(x_test, y_test, verbose=0)
print("CNN + BiLSTM Model Accuracy: {:.2f}%".format(accuracy * 100))


CNN + BiLSTM Model Accuracy: 69.79%


In [16]:
model.save('speech_emotion_recognition_model.h5')




In [17]:
from keras.models import load_model

In [19]:
model = load_model('speech_emotion_recognition_model.h5')



In [21]:
def predict_emotion(file_name):
    # Extract features from the input audio file
    features = extract_feature(file_name)

    # Reshape the input to match the model's expected input shape
    features = features.reshape(1, -1, 1)  # Reshape for Conv1D input

    # Predict the emotion
    prediction = model.predict(features)

    # Decode the prediction to get the corresponding emotion label
    emotion_labels = ['neutral', 'calm', 'happy', 'sad', 'angry', 'fearful', 'disgust', 'surprised']  # Adjust based on your dataset
    predicted_emotion = emotion_labels[np.argmax(prediction)]

    return predicted_emotion

In [40]:
audio = "/content/Some days it’s really hard ☹️ #shorts #depression #sad.wav"

In [41]:
emotion = predict_emotion(audio)
print(f"Predicted emotion: {emotion}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 58ms/step
Predicted emotion: neutral
