In [1]:
!pip install resampy

Collecting resampy
  Downloading resampy-0.4.3-py3-none-any.whl.metadata (3.0 kB)
Downloading resampy-0.4.3-py3-none-any.whl (3.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m36.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: resampy
Successfully installed resampy-0.4.3


In [2]:
import sys

# Report which Python interpreter is backing this kernel.
kernel_python = sys.executable
print(kernel_python)

/usr/bin/python3


In [3]:
# Fetch the RAVDESS emotional speech audio archive with the Kaggle CLI; the zip
# lands in the current working directory.
# NOTE(review): presumably needs Kaggle API credentials configured in this runtime — confirm.
!kaggle datasets download -d uwrfkaggler/ravdess-emotional-speech-audio

Dataset URL: https://www.kaggle.com/datasets/uwrfkaggler/ravdess-emotional-speech-audio
License(s): CC-BY-NC-SA-4.0
Downloading ravdess-emotional-speech-audio.zip to /content
100% 428M/429M [00:12<00:00, 39.7MB/s]
100% 429M/429M [00:12<00:00, 35.4MB/s]


In [5]:
!unzip /content/ravdess-emotional-speech-audio.zip

Archive:  /content/ravdess-emotional-speech-audio.zip
  inflating: Actor_01/03-01-01-01-01-01-01.wav  
  inflating: Actor_01/03-01-01-01-01-02-01.wav  
  inflating: Actor_01/03-01-01-01-02-01-01.wav  
  inflating: Actor_01/03-01-01-01-02-02-01.wav  
  inflating: Actor_01/03-01-02-01-01-01-01.wav  
  inflating: Actor_01/03-01-02-01-01-02-01.wav  
  inflating: Actor_01/03-01-02-01-02-01-01.wav  
  inflating: Actor_01/03-01-02-01-02-02-01.wav  
  inflating: Actor_01/03-01-02-02-01-01-01.wav  
  inflating: Actor_01/03-01-02-02-01-02-01.wav  
  inflating: Actor_01/03-01-02-02-02-01-01.wav  
  inflating: Actor_01/03-01-02-02-02-02-01.wav  
  inflating: Actor_01/03-01-03-01-01-01-01.wav  
  inflating: Actor_01/03-01-03-01-01-02-01.wav  
  inflating: Actor_01/03-01-03-01-02-01-01.wav  
  inflating: Actor_01/03-01-03-01-02-02-01.wav  
  inflating: Actor_01/03-01-03-02-01-01-01.wav  
  inflating: Actor_01/03-01-03-02-01-02-01.wav  
  inflating: Actor_01/03-01-03-02-02-01-01.wav  
  inflating: Ac

In [7]:
import numpy as np
import librosa
import os
import glob
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Dense, Dropout, LayerNormalization, MultiHeadAttention, GlobalAveragePooling1D, Embedding


In [6]:
def extract_feature(file_name, mfcc=True, chroma=True, mel=True):
    """Summarise one audio file as a single 1-D feature vector.

    The file is loaded with librosa, every enabled feature family is computed,
    and each family is averaged along its frame axis so the result length does
    not depend on how many frames the clip produced.

    Parameters
    ----------
    file_name : str or path-like
        Audio file, passed straight to ``librosa.load``.
    mfcc : bool
        Include the averaged MFCCs (``n_mfcc=40``).
    chroma : bool
        Include the averaged chroma values computed from the STFT magnitude.
    mel : bool
        Include the averaged mel-spectrogram bands.

    Returns
    -------
    numpy.ndarray
        The enabled families concatenated in the fixed order MFCC, chroma,
        mel. An empty array when all three flags are False.
    """
    X, sample_rate = librosa.load(file_name, res_type='kaiser_fast')

    # Start from an empty default-dtype array, exactly like the accumulator this
    # replaces, so the stacked result keeps the same dtype and the "nothing
    # enabled" case still returns an empty array.
    parts = [np.array([])]

    # Extract MFCC features
    if mfcc:
        mfccs = librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40)
        parts.append(np.mean(mfccs.T, axis=0))

    # Extract Chroma features
    if chroma:
        # Chroma is the only consumer of the STFT (the mel-spectrogram below is
        # computed from the waveform), so the transform is skipped otherwise.
        stft = np.abs(librosa.stft(X))
        # NOTE(review): librosa documents `S` as a power spectrogram; the plain
        # magnitude is passed here to keep the existing features unchanged —
        # confirm whether that is intended.
        chroma_stft = librosa.feature.chroma_stft(S=stft, sr=sample_rate)
        parts.append(np.mean(chroma_stft.T, axis=0))

    # Extract Mel-spectrogram features
    if mel:
        mel_spectrogram = librosa.feature.melspectrogram(y=X, sr=sample_rate)
        parts.append(np.mean(mel_spectrogram.T, axis=0))

    return np.hstack(parts)

In [8]:
# The third dash-separated field of each audio file name is a two-digit emotion
# code; this table turns that code into a readable label.
emotions = {
    "01": "neutral",
    "02": "calm",
    "03": "happy",
    "04": "sad",
    "05": "angry",
    "06": "fearful",
    "07": "disgust",
    "08": "surprised",
}

# Labels kept for the experiment; clips carrying any other label are skipped
# when the data is loaded.
observed_emotions = [
    "neutral",
    "calm",
    "happy",
    "sad",
    "angry",
    "fearful",
    "disgust",
    "surprised",
]

In [9]:
def load_data(test_size=0.2, pattern='/content/audio_speech_actors_01-24/Actor_*/*.wav'):
    """Extract features for every matching clip and split them into train/test sets.

    Parameters
    ----------
    test_size : float
        Forwarded to ``train_test_split`` as the held-out share.
    pattern : str
        Glob pattern selecting the audio files. Defaults to the location the
        notebook unpacks the dataset to.

    Returns
    -------
    list
        ``[x_train, x_test, y_train, y_test]`` as produced by
        ``train_test_split`` with ``random_state=5``; the features are a 2-D
        array with one row per clip, the labels are emotion names.

    Raises
    ------
    ValueError
        If no file matching ``pattern`` carries an observed emotion.
    KeyError
        If a file name's emotion code is missing from ``emotions``.
    """
    x, y = [], []
    # glob yields paths in arbitrary order; sorting pins the sample order so the
    # seeded split below selects the same clips on every run and machine.
    for file in sorted(glob.glob(pattern)):
        file_name = os.path.basename(file)
        # The emotion code is the third dash-separated field of the file name.
        emotion = emotions[file_name.split("-")[2]]
        if emotion not in observed_emotions:
            continue
        feature = extract_feature(file, mfcc=True, chroma=True, mel=True)
        x.append(feature)
        y.append(emotion)

    if not x:
        # Fail with the actual cause (nothing found) instead of letting the
        # split complain about an empty array.
        raise ValueError(
            f"No audio files with an observed emotion matched {pattern!r}; "
            "check that the dataset has been downloaded and extracted."
        )

    return train_test_split(np.array(x), y, test_size=test_size, random_state=5)


In [10]:
# Extract features for every clip and hold out 20% of them for testing.
x_train, x_test, y_train, y_test = load_data(test_size=0.2)


In [11]:
# Reshape data for the Transformer input: give every sample an explicit trailing
# axis of size 1 so the arrays match the (n_features, 1) input shape the model is
# built with, instead of leaving that rank adjustment implicit.
# The target shape is spelled out from the first two dimensions, so re-running
# this cell on arrays that are already 3-D leaves them unchanged.
x_train = np.asarray(x_train)
x_test = np.asarray(x_test)
x_train = x_train.reshape(x_train.shape[0], x_train.shape[1], 1)
x_test = x_test.reshape(x_test.shape[0], x_test.shape[1], 1)


In [12]:
# Encode the labels: emotion names -> integer ids -> one-hot rows.
encoder = LabelEncoder()
y_train_ids = encoder.fit_transform(y_train)
y_test_ids = encoder.transform(y_test)

# Take the one-hot width from the fitted encoder. Left unset, to_categorical
# infers it from the largest id present, so a split that happens to lack the
# last class would come out narrower than the other one.
n_label_classes = len(encoder.classes_)
y_train = tf.keras.utils.to_categorical(y_train_ids, num_classes=n_label_classes)
y_test = tf.keras.utils.to_categorical(y_test_ids, num_classes=n_label_classes)

In [13]:
def transformer_model(input_shape, num_classes):
    """Build and compile a small single-block self-attention classifier.

    The input is projected to 128 units, passed through one multi-head
    self-attention layer with a residual connection and layer normalisation,
    then through a dense layer with dropout, globally average-pooled, and
    finally mapped to a softmax over ``num_classes``.

    Args:
        input_shape: Per-sample input shape handed to ``Input``.
        num_classes: Number of units in the softmax output layer.

    Returns:
        A Keras ``Model`` compiled with the Adam optimizer, categorical
        cross-entropy loss and an accuracy metric.
    """
    features_in = Input(shape=input_shape)

    # Dense projection acting as the embedding.
    project = Dense(128, activation='relu')
    embedded = project(features_in)

    # Self-attention: the embedded sequence is passed as both arguments.
    self_attention = MultiHeadAttention(num_heads=4, key_dim=128)
    attended = self_attention(embedded, embedded)

    # Residual connection around the attention layer, then normalisation.
    normalised = LayerNormalization(epsilon=1e-6)(attended + embedded)

    # Feed-forward layer with dropout for regularisation.
    feed_forward = Dense(128, activation='relu')
    hidden = feed_forward(normalised)
    regularise = Dropout(0.3)
    hidden = regularise(hidden)

    # Collapse the sequence axis into one vector per sample.
    pool = GlobalAveragePooling1D()
    pooled = pool(hidden)

    # Class probabilities.
    classify = Dense(num_classes, activation='softmax')
    probabilities = classify(pooled)

    classifier = Model(inputs=features_in, outputs=probabilities)
    classifier.compile(
        optimizer='adam',
        loss='categorical_crossentropy',
        metrics=['accuracy'],
    )
    return classifier

In [14]:
# Model dimensions: the input is (number of feature values, 1) per sample, and
# the output has one unit per observed emotion.
n_features = x_train.shape[1]
input_shape = (n_features, 1)
num_classes = len(observed_emotions)

In [15]:
# Build the compiled classifier for this input shape and label count.
model = transformer_model(input_shape=input_shape, num_classes=num_classes)

In [16]:
# Train the model.
# The returned History is kept so the per-epoch metrics remain available after
# training, and so the cell no longer ends by echoing the object's repr.
# NOTE(review): the held-out test split doubles as validation data here and is
# also what the evaluation cell scores — consider carving out a separate
# validation split if these curves are ever used for model selection.
history = model.fit(
    x_train,
    y_train,
    batch_size=32,
    epochs=100,
    validation_data=(x_test, y_test),
    verbose=1,
)

Epoch 1/100
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 749ms/step - accuracy: 0.1561 - loss: 2.3220 - val_accuracy: 0.1736 - val_loss: 1.9735
Epoch 2/100
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 763ms/step - accuracy: 0.2128 - loss: 1.9072 - val_accuracy: 0.2604 - val_loss: 1.8678
Epoch 3/100
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 631ms/step - accuracy: 0.2691 - loss: 1.8135 - val_accuracy: 0.2778 - val_loss: 1.8131
Epoch 4/100
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m41s[0m 626ms/step - accuracy: 0.2537 - loss: 1.7961 - val_accuracy: 0.2292 - val_loss: 1.8457
Epoch 5/100
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 689ms/step - accuracy: 0.3016 - loss: 1.7735 - val_accuracy: 0.2604 - val_loss: 1.8191
Epoch 6/100
[1m36/36[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m40s[0m 662ms/step - accuracy: 0.2983 - loss: 1.7451 - val_accuracy: 0.2917 - val_loss: 1.8488
Epoch 7/100
[1m

<keras.src.callbacks.history.History at 0x7a9531e9d9f0>

In [17]:
# Score the trained model on the held-out split and report accuracy as a percentage.
loss, accuracy = model.evaluate(x_test, y_test, verbose=0)
accuracy_pct = accuracy * 100
print(f"Transformer Model Accuracy: {accuracy_pct:.2f}%")


Transformer Model Accuracy: 34.38%
