In [1]:
from google.colab import userdata
import os

os.environ["KAGGLE_KEY"] = userdata.get('KAGGLE_KEY')
os.environ["KAGGLE_USERNAME"] = userdata.get('KAGGLE_USERNAME')

In [2]:
!pip install pandas numpy librosa torch scikit-learn -q

In [3]:
!find /content -mindepth 1 ! -name 'best_model.bin' -exec rm -rf {} +

In [4]:
!kaggle datasets download -d uwrfkaggler/ravdess-emotional-speech-audio

Dataset URL: https://www.kaggle.com/datasets/uwrfkaggler/ravdess-emotional-speech-audio
License(s): CC-BY-NC-SA-4.0
Downloading ravdess-emotional-speech-audio.zip to /content
 99% 427M/429M [00:20<00:00, 23.4MB/s]
100% 429M/429M [00:20<00:00, 22.4MB/s]


In [5]:
!mkdir -p audio_data
!unzip ravdess-emotional-speech-audio.zip -d audio_data > /dev/null 2>&1

In [6]:
import os
import shutil

main_folder = '/content/audio_data'

for i in range(1, 25):
    actor_folder = os.path.join(main_folder, f'Actor_{i:02d}')

    if os.path.exists(actor_folder):
        for file_name in os.listdir(actor_folder):
            file_path = os.path.join(actor_folder, file_name)

            if os.path.isfile(file_path):
                shutil.move(file_path, main_folder)

        os.rmdir(actor_folder)

shutil.rmtree('/content/audio_data/audio_speech_actors_01-24')
print("All files moved to audio_data")


All files moved to audio_data


File naming convention

Each of the 1440 files has a unique filename. The filename consists of a 7-part numerical identifier (e.g., 03-01-06-01-02-01-12.wav). These identifiers define the stimulus characteristics:

Filename identifiers

Modality (01 = full-AV, 02 = video-only, 03 = audio-only).

Vocal channel (01 = speech, 02 = song).

Emotion (01 = neutral, 02 = calm, 03 = happy, 04 = sad, 05 = angry, 06 = fearful, 07 = disgust, 08 = surprised).

Emotional intensity (01 = normal, 02 = strong). NOTE: There is no strong intensity for the 'neutral' emotion.

Statement (01 = "Kids are talking by the door", 02 = "Dogs are sitting by the door").

Repetition (01 = 1st repetition, 02 = 2nd repetition).

Actor (01 to 24. Odd numbered actors are male, even numbered actors are female).

In [7]:
import os
import pandas as pd

audio_files = os.listdir(main_folder)

data = []

for file_name in audio_files:
    if file_name.endswith('.wav'):
        file_path = os.path.join(main_folder, file_name)
        file_name = file_name.removesuffix('.wav')
        data.append([*file_name.split('-'), file_path])

df = pd.DataFrame(data, columns=["Modality", "Vocal Channel", "Emotion", "Intensity", "Statement", "Repetition", "Actor", "File Path"])

In [8]:
df.head()

Unnamed: 0,Modality,Vocal Channel,Emotion,Intensity,Statement,Repetition,Actor,File Path
0,3,1,1,1,1,1,16,/content/audio_data/03-01-01-01-01-01-16.wav
1,3,1,6,1,2,2,19,/content/audio_data/03-01-06-01-02-02-19.wav
2,3,1,4,2,1,2,7,/content/audio_data/03-01-04-02-01-02-07.wav
3,3,1,8,1,1,2,24,/content/audio_data/03-01-08-01-01-02-24.wav
4,3,1,7,2,1,1,18,/content/audio_data/03-01-07-02-01-01-18.wav


In [9]:
emotion_mapping = {
    1: 'neutral',
    2: 'calm',
    3: 'happy',
    4: 'sad',
    5: 'angry',
    6: 'fearful',
    7: 'disgust',
    8: 'surprised'
}
def get_gender_female(actor_id):
    return 0 if int(actor_id) % 2 != 0 else 1

df['Emotion'] = df['Emotion'].astype(int).map(emotion_mapping)
df['Gender_female'] = df['Actor'].apply(get_gender_female)

df = df.drop(columns=['Actor', 'Modality', 'Vocal Channel', 'Intensity', 'Statement', 'Repetition'], axis=1)

One-hot Encoding

In [10]:
def encode_and_bind(original_dataframe, feature_to_encode):
    dummies = pd.get_dummies(original_dataframe[[feature_to_encode]])
    res = pd.concat([original_dataframe, dummies], axis=1)
    res = res.drop([feature_to_encode], axis=1)
    return(res)

df = encode_and_bind(df, 'Emotion')

In [11]:
df.head()

Unnamed: 0,File Path,Gender_female,Emotion_angry,Emotion_calm,Emotion_disgust,Emotion_fearful,Emotion_happy,Emotion_neutral,Emotion_sad,Emotion_surprised
0,/content/audio_data/03-01-01-01-01-01-16.wav,1,False,False,False,False,False,True,False,False
1,/content/audio_data/03-01-06-01-02-02-19.wav,0,False,False,False,True,False,False,False,False
2,/content/audio_data/03-01-04-02-01-02-07.wav,0,False,False,False,False,False,False,True,False
3,/content/audio_data/03-01-08-01-01-02-24.wav,1,False,False,False,False,False,False,False,True
4,/content/audio_data/03-01-07-02-01-01-18.wav,1,False,False,True,False,False,False,False,False


Audio Preprocessing

In [12]:
from preprocessing import extract_audio_features

In [17]:
df = df.dropna().reset_index()
# df['Features'] = df['File Path'].apply(extract_audio_features)
df['Features'] = None
for i in range(len(df)):
    feature = extract_audio_features(df.at[i, 'File Path'])
    df.at[i, 'Features'] = feature

expanded_features = pd.DataFrame(df['Features'].apply(lambda x: x.flatten().tolist()).tolist(), columns=[f'Feature_{i}' for i in range(195)])
df = pd.concat([expanded_features, df], axis=1)
df = df.drop(['Features'], axis=1)

In [25]:
df.head()

Unnamed: 0,Feature_0,Feature_1,Feature_2,Feature_3,Feature_4,Feature_5,Feature_6,Feature_7,Feature_8,Feature_9,...,File Path,Gender_female,Emotion_angry,Emotion_calm,Emotion_disgust,Emotion_fearful,Emotion_happy,Emotion_neutral,Emotion_sad,Emotion_surprised
0,0.002649,0.004524,0.006663,0.004973,0.003573,0.006293,0.008953,0.012256,0.016009,0.014712,...,/content/audio_data/03-01-01-01-01-01-16.wav,1.0,False,False,False,False,False,True,False,False
1,0.016146,0.021302,0.064698,0.075362,0.031158,0.01002,0.007401,0.009803,0.022766,0.042859,...,/content/audio_data/03-01-06-01-02-02-19.wav,0.0,False,False,False,True,False,False,False,False
2,0.482037,0.408529,0.140093,0.029696,0.016697,0.008209,0.003913,0.006465,0.009578,0.007661,...,/content/audio_data/03-01-04-02-01-02-07.wav,0.0,False,False,False,False,False,False,True,False
3,0.002818,0.005157,0.008752,0.010877,0.008597,0.011328,0.010402,0.009984,0.015409,0.013044,...,/content/audio_data/03-01-08-01-01-02-24.wav,1.0,False,False,False,False,False,False,False,True
4,0.003064,0.005582,0.006788,0.006653,0.006806,0.007056,0.008817,0.009312,0.007859,0.007971,...,/content/audio_data/03-01-07-02-01-01-18.wav,1.0,False,False,True,False,False,False,False,False


In [19]:
emotion_cols = [i for i in df.columns if i.startswith('Emotion')]

In [20]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report

X = df.drop(columns=["File Path", *emotion_cols, "Gender_female", "index"])
y = df["Gender_female"].astype(int)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

gender_model = RandomForestClassifier(n_estimators=100, random_state=42)
gender_model.fit(X_train, y_train)

y_pred = gender_model.predict(X_test)
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      0.95      0.97       130
           1       0.96      1.00      0.98       158

    accuracy                           0.98       288
   macro avg       0.98      0.97      0.98       288
weighted avg       0.98      0.98      0.98       288



In [22]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

from models.models import AudioNet

df['Gender_female'] = df['Gender_female'].astype(float)

class AudioDataset(Dataset):
    def __init__(self, df):
        self.features = df.drop(columns=["File Path", *emotion_cols, 'index']).values
        self.labels = df[emotion_cols].values.argmax(axis=1).astype('int64')  # Emotion labels as class indices

    def __len__(self):
        return len(self.features)

    def __getitem__(self, idx):
        x = torch.tensor(self.features[idx], dtype=torch.float32)  # Convert to float32 tensor
        y = torch.tensor(self.labels[idx], dtype=torch.long)  # Convert to long integer tensor

        # Reshape x to have shape (channels=196, sequence_length=1)
        x = x.view(196, 1)
        return x, y

In [23]:
from sklearn.model_selection import train_test_split

batch_size = 16
train_df, test_df = train_test_split(df, test_size=0.4, random_state=42)
val_df, test_df = train_test_split(test_df, test_size=0.5, random_state=42)

train_dataset = AudioDataset(train_df)
test_dataset = AudioDataset(test_df)
val_dataset = AudioDataset(val_df)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

In [27]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import torch.optim as optim
import numpy as np


def train_model(model, train_loader, val_loader, num_epochs=81, device='cuda'):
    device = torch.device(device if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}")

    model = model.to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.AdamW(model.parameters(), lr=1e-4, weight_decay=0.01)
    scheduler = optim.lr_scheduler.OneCycleLR(
        optimizer,
        max_lr=1e-3,
        epochs=num_epochs,
        steps_per_epoch=len(train_loader),
        pct_start=0.3
    )

    best_val_loss = float('inf')
    best_model_state = None
    patience_counter = 0

    for epoch in range(num_epochs):
        # Training phase
        model.train()
        train_loss = 0.0
        train_correct = 0
        train_total = 0

        for batch_idx, (inputs, labels) in enumerate(train_loader):
            inputs, labels = inputs.to(device), labels.to(device)

            if torch.isnan(inputs).any() or torch.isinf(inputs).any():
                continue

            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, labels)

            if torch.isnan(loss):
                continue

            loss.backward()

            # Clip gradients
            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)

            optimizer.step()
            scheduler.step()

            train_loss += loss.item()
            _, predicted = torch.max(outputs.data, 1)
            train_total += labels.size(0)
            train_correct += (predicted == labels).sum().item()

        avg_train_loss = train_loss / len(train_loader)
        train_acc = 100 * train_correct / train_total if train_total > 0 else 0

        # Validation phase
        model.eval()
        val_loss = 0.0
        val_correct = 0
        val_total = 0

        with torch.no_grad():
            for inputs, labels in val_loader:
                inputs, labels = inputs.to(device), labels.to(device)
                outputs = model(inputs)
                loss = criterion(outputs, labels)

                val_loss += loss.item()
                _, predicted = torch.max(outputs.data, 1)
                val_total += labels.size(0)
                val_correct += (predicted == labels).sum().item()

        avg_val_loss = val_loss / len(val_loader)
        val_acc = 100 * val_correct / val_total if val_total > 0 else 0

        print(f'Epoch [{epoch+1}/{num_epochs}]')
        print(f'Train Loss: {avg_train_loss:.4f}, Train Acc: {train_acc:.2f}%')
        print(f'Val Loss: {avg_val_loss:.4f}, Val Acc: {val_acc:.2f}%')
        print('-' * 60)

        # Save best model
        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            best_model_state = model.state_dict().copy()
            patience_counter = 0
        else:
            patience_counter += 1

    if best_model_state is not None:
        model.load_state_dict(best_model_state)

    return model



model = AudioNet()
trained_model = train_model(model, train_loader, val_loader, num_epochs = 200)


Using device: cuda
Epoch [1/200]
Train Loss: 2.0002, Train Acc: 14.42%
Val Loss: 2.0770, Val Acc: 13.19%
------------------------------------------------------------
Epoch [2/200]
Train Loss: 1.9981, Train Acc: 14.30%
Val Loss: 2.0750, Val Acc: 14.58%
------------------------------------------------------------
Epoch [3/200]
Train Loss: 1.9955, Train Acc: 15.87%
Val Loss: 2.0732, Val Acc: 17.71%
------------------------------------------------------------
Epoch [4/200]
Train Loss: 1.9953, Train Acc: 15.26%
Val Loss: 2.0704, Val Acc: 17.36%
------------------------------------------------------------
Epoch [5/200]
Train Loss: 1.9898, Train Acc: 15.87%
Val Loss: 2.0666, Val Acc: 17.36%
------------------------------------------------------------
Epoch [6/200]
Train Loss: 1.9854, Train Acc: 18.75%
Val Loss: 2.0613, Val Acc: 21.88%
------------------------------------------------------------
Epoch [7/200]
Train Loss: 1.9770, Train Acc: 21.03%
Val Loss: 2.0558, Val Acc: 24.31%
-------------

In [32]:
from joblib import dump

dump(gender_model, 'gender_model.joblib')

['gender_model.joblib']

In [33]:
torch.save(trained_model.state_dict(), 'audio_model.pth')