#  <center> Speech Emotion Recognition (Ravdess Dataset) </center>
##   <center> (With Cross Validation) </center>

### Importing Libraries

In [1]:
import pandas as pd
import numpy as np

import os
import sys

# librosa is a Python library for analyzing audio and music. It can be used to extract the data from the audio files we will see it later.
import librosa
import librosa.display
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split

# to play the audio files
from IPython.display import Audio

import keras
from keras.callbacks import ReduceLROnPlateau
from keras.models import Sequential
from keras.layers import Dense, Conv1D, MaxPooling1D, Flatten, Dropout, BatchNormalization
from keras.utils import np_utils, to_categorical
from keras.callbacks import ModelCheckpoint
from tensorflow.keras.callbacks import EarlyStopping
import tensorflow as tf
from tensorflow.keras.models import Model

import warnings
# Silence every warning unless the interpreter was started with explicit
# warning options (sys.warnoptions is non-empty in that case).
if not sys.warnoptions:
    warnings.simplefilter("ignore")
# Additionally ignore DeprecationWarning regardless of the check above.
warnings.filterwarnings("ignore", category=DeprecationWarning) 



##  <center> 1. Ravdess Dataframe </center>
The filename identifiers as per the official RAVDESS website:

* Modality (01 = full-AV, 02 = video-only, 03 = audio-only).
* Vocal channel (01 = speech, 02 = song).
* Emotion (01 = neutral, 02 = calm, 03 = happy, 04 = sad, 05 = angry, 06 = fearful, 07 = disgust, 08 = surprised).
* Emotional intensity (01 = normal, 02 = strong). NOTE: There is no strong intensity for the 'neutral' emotion.
* Statement (01 = "Kids are talking by the door", 02 = "Dogs are sitting by the door").
* Repetition (01 = 1st repetition, 02 = 2nd repetition).
* Actor (01 to 24. Odd numbered actors are male, even numbered actors are female).

### Read Ravdess audio data

In [2]:
# Root directory of the RAVDESS speech audio; the next cell lists it and reads
# one sub-directory per actor from it.
# NOTE(review): absolute Kaggle input path — adjust when running elsewhere.
Ravdess = "/kaggle/input/ravdess-emotional-speech-audio/audio_speech_actors_01-24/"

In [3]:
# Each sub-directory of `Ravdess` holds the recordings of one actor (24 actors in total).
ravdess_directory_list = os.listdir(Ravdess)

file_emotion = []
file_path = []
for actor_dir in ravdess_directory_list:
    actor_path = os.path.join(Ravdess, actor_dir)
    if not os.path.isdir(actor_path):
        # Skip stray files that sit next to the actor folders; os.listdir would fail on them.
        continue
    for file_name in os.listdir(actor_path):
        # File names look like "03-01-08-01-01-01-02.wav"; the third dash-separated
        # field is the emotion code.
        fields = file_name.split('.')[0].split('-')
        file_emotion.append(int(fields[2]))
        file_path.append(os.path.join(actor_path, file_name))

# dataframe for emotion of files
emotion_df = pd.DataFrame(file_emotion, columns=['Emotions'])

# dataframe for path of files.
path_df = pd.DataFrame(file_path, columns=['Path'])
Ravdess_df = pd.concat([emotion_df, path_df], axis=1)

# Changing integers to actual emotions. The result is assigned back to the column
# instead of calling replace(..., inplace=True) on `Ravdess_df.Emotions`: that chained
# in-place form operates on a temporary object and does not update the frame when
# pandas Copy-on-Write is active.
emotion_names = {1: 'neutral', 2: 'calm', 3: 'happy', 4: 'sad',
                 5: 'angry', 6: 'fear', 7: 'disgust', 8: 'surprise'}
Ravdess_df['Emotions'] = Ravdess_df['Emotions'].replace(emotion_names)

# The actor id is the last dash-separated field of the file name (before the extension).
Ravdess_df['Actor'] = (
    Ravdess_df['Path'].str.split('-').str[-1].str.split('.').str[0].astype(int)
)
# Odd-numbered actors are male, even-numbered actors are female.
Ravdess_df['Gender'] = np.where(Ravdess_df['Actor'] % 2 == 1, 'Male', 'Female')


pd.set_option('display.max_colwidth', None)
Ravdess_df.head()

Unnamed: 0,Emotions,Path,Actor,Gender
0,surprise,/kaggle/input/ravdess-emotional-speech-audio/audio_speech_actors_01-24/Actor_02/03-01-08-01-01-01-02.wav,2,Female
1,neutral,/kaggle/input/ravdess-emotional-speech-audio/audio_speech_actors_01-24/Actor_02/03-01-01-01-01-01-02.wav,2,Female
2,disgust,/kaggle/input/ravdess-emotional-speech-audio/audio_speech_actors_01-24/Actor_02/03-01-07-02-01-02-02.wav,2,Female
3,disgust,/kaggle/input/ravdess-emotional-speech-audio/audio_speech_actors_01-24/Actor_02/03-01-07-01-01-02-02.wav,2,Female
4,neutral,/kaggle/input/ravdess-emotional-speech-audio/audio_speech_actors_01-24/Actor_02/03-01-01-01-02-01-02.wav,2,Female


### Class Imbalance

In [7]:
# Drop every recording labelled 'surprise' or 'calm'; the remaining emotion
# classes are the ones used for modelling. The stated reason is that these
# classes have few samples — NOTE(review): confirm against the class counts.
excluded_emotions = ['surprise', 'calm']
data_path_rem = Ravdess_df[~Ravdess_df['Emotions'].isin(excluded_emotions)]

## Data Augmentation

- Data augmentation is the process of creating new synthetic data samples by adding small perturbations to the initial training set.
- To generate synthetic data for audio, we can apply noise injection, time shifting, and changes of pitch and speed.

In [8]:
# NOTE(review): `rate` is not referenced by any code visible in this notebook;
# stretch() uses its own rate of 2.0 rather than this value. Confirm which
# stretch factor was intended.
rate=0.8

In [9]:
sampling_rate = 22050


def noise(data):
    """Return `data` with zero-mean Gaussian noise added.

    The noise amplitude is a random fraction (at most 3.5%) of the maximum
    value of `data`. Both the fraction and the noise samples are drawn from
    NumPy's global random state, so results vary between calls unless that
    state is seeded.
    """
    peak = np.amax(data)
    noise_amp = 0.035 * np.random.uniform() * peak
    gaussian = np.random.normal(size=data.shape[0])
    return data + noise_amp * gaussian

def stretch(data, rate=2.0):
    """Time-stretch an audio signal with librosa.

    Args:
        data: Audio time series.
        rate: Stretch factor forwarded to ``librosa.effects.time_stretch``.
            Values above 1 speed the audio up (shorter output); values below 1
            slow it down. Defaults to 2.0, the value previously hard-coded here.

    Returns:
        The stretched audio time series.
    """
    return librosa.effects.time_stretch(data, rate=rate)

def pitch(data, n_steps=10, sr=None):
    """Pitch-shift an audio signal with librosa.

    Args:
        data: Audio time series.
        n_steps: Number of steps to shift by, forwarded to
            ``librosa.effects.pitch_shift``. Defaults to 10, the value
            previously hard-coded here.
        sr: Sample rate of `data`. Defaults to the notebook-level
            ``sampling_rate``.

    Returns:
        The pitch-shifted audio time series.
    """
    if sr is None:
        sr = sampling_rate
    return librosa.effects.pitch_shift(data, sr=sr, n_steps=n_steps)

import scipy.signal as signal

def low_pass_filter(data, cutoff_freq=5000, numtaps=10, sr=None):
    """Apply an FIR low-pass filter to an audio signal.

    Args:
        data: 1-D audio time series.
        cutoff_freq: Cutoff frequency in Hz (default 5 kHz).
        numtaps: Number of filter taps, i.e. the length of the FIR filter.
        sr: Sample rate of `data` in Hz. Defaults to the notebook-level
            ``sampling_rate``.

    Returns:
        The filtered signal, the same length as `data` (``mode='same'``).
    """
    if sr is None:
        sr = sampling_rate

    # firwin expects the cutoff as a fraction of the Nyquist frequency.
    nyquist_freq = 0.5 * sr
    taps = signal.firwin(numtaps, cutoff_freq / nyquist_freq)

    # Convolve the signal with the filter taps.
    return signal.convolve(data, taps, mode='same')

### Feature Extraction - Statistical and acoustic
Extracting 5 acoustic features to train our model:
- Zero Crossing Rate
- Chroma_stft
- MFCC
- RMS (root mean square) value
- Mel spectrogram

The statistical features that are extracted are:
- Mean
- Variance
- Skewness
- Kurtosis
- Audio_rms
- Spectral centroids
- Spectral bandwidth
- Mean of the MFCCs (13 coefficients)

In [10]:
# Sample rate passed as `sr` to the librosa feature calls in extract_features().
sample_rate=22050
import os, glob
import librosa
import numpy as np
import scipy
def extract_features(data):
    """Build the acoustic feature vector for one audio signal.

    The vector concatenates, in this order, the frame-averaged zero crossing
    rate, chroma STFT, MFCC, RMS value and mel spectrogram of `data`. The
    notebook-level ``sample_rate`` is used wherever librosa needs a sample rate.

    Returns:
        A 1-D float array holding all averaged features.
    """
    def _frame_average(frames):
        # Transpose and average over the first axis, i.e. over the last axis
        # of what librosa returned (presumably the frame axis).
        return np.mean(frames.T, axis=0)

    # Magnitude spectrogram, needed only by the chroma feature.
    magnitude = np.abs(librosa.stft(data))

    pieces = [
        np.array([]),  # empty float64 seed, keeps the output dtype as before
        _frame_average(librosa.feature.zero_crossing_rate(y=data)),            # ZCR
        _frame_average(librosa.feature.chroma_stft(S=magnitude, sr=sample_rate)),  # Chroma_stft
        _frame_average(librosa.feature.mfcc(y=data, sr=sample_rate)),          # MFCC
        _frame_average(librosa.feature.rms(y=data)),                           # RMS value
        _frame_average(librosa.feature.melspectrogram(y=data, sr=sample_rate)),  # Mel spectrogram
    ]
    return np.hstack(pieces)

def statistical_features(data):
    """Compute eight summary statistics of one audio signal.

    The returned 1-D array holds, in this order: mean, variance, skewness,
    kurtosis, mean RMS energy, mean spectral centroid, mean spectral bandwidth
    and the mean over all values of a 13-coefficient MFCC matrix. The
    notebook-level ``sampling_rate`` is used wherever librosa needs a sample rate.

    Args:
        data: Audio time series.

    Returns:
        A float array with the eight statistics.
    """
    # Import the submodule explicitly: a bare `import scipy` does not guarantee
    # that `scipy.stats` is loaded on every SciPy version.
    from scipy import stats

    audio_rms = librosa.feature.rms(y=data)
    spectral_centroids = librosa.feature.spectral_centroid(y=data, sr=sampling_rate)[0]
    spectral_bandwidth = librosa.feature.spectral_bandwidth(y=data, sr=sampling_rate)[0]
    mfccs = librosa.feature.mfcc(y=data, sr=sampling_rate, n_mfcc=13)

    values = [
        np.array([]),  # empty float64 seed, keeps the output dtype as before
        np.mean(data),
        np.var(data),
        stats.skew(data),
        stats.kurtosis(data),
        audio_rms.mean(),
        spectral_centroids.mean(),
        spectral_bandwidth.mean(),
        mfccs.mean(),
    ]
    return np.hstack(values)

def get_features(path):
    """Load one audio file and return feature rows for it and its augmentations.

    The clip is loaded with librosa (2.5 s starting at 0.6 s) and five variants
    are featurised: the original signal, then the noise-injected,
    time-stretched, low-pass-filtered and pitch-shifted versions. Each row is
    the acoustic feature vector followed by the statistical features.

    The noise-augmented row used to be computed but was then overwritten and
    never returned; it is now part of the result, so this returns five rows
    per file.

    Args:
        path: Path of the audio file to load.

    Returns:
        A 2-D array with one row per variant, in the order listed above.
    """
    # duration and offset skip the silence at the start and end of each recording.
    data, _ = librosa.load(path, duration=2.5, offset=0.6)

    def _feature_row(audio):
        # Acoustic features followed by statistical features for one signal.
        return np.hstack((extract_features(audio), statistical_features(audio)))

    variants = (
        data,                    # without augmentation
        noise(data),             # data with noise
        stretch(data),           # data with stretch
        low_pass_filter(data),   # data with LPF
        pitch(data),             # data with pitch
    )
    return np.vstack([_feature_row(audio) for audio in variants])  # stacking vertically


### Creating a sequence of features and target

In [11]:
X_all, Y, gender_list_all = [], [], []
count = 1
for path, emotion, gender in zip(data_path_rem.Path, data_path_rem.Emotions, data_path_rem.Gender):
    feature = get_features(path)
    # get_features returns one row per variant of the clip; every row inherits
    # the emotion label and the gender of its source file.
    n_rows = len(feature)
    X_all.extend(feature)
    Y.extend([emotion] * n_rows)
    gender_list_all.extend([gender] * n_rows)
    # Progress message every 500 files.
    if count % 500 == 0:
        print('Done Reading '+str(count)+'/'+str(len(data_path_rem))+' files')
    count += 1

Done Reading 500/1056 files
Done Reading 1000/1056 files


In [14]:
# X_all holds several feature rows per audio file (the original clip plus its
# augmented variants, appended consecutively), so each file's actor id must be
# repeated once per row. Otherwise the row indices derived from `actors` in the
# cross-validation do not line up with the rows of X.
rows_per_file = len(X_all) // max(len(data_path_rem), 1)
actors = np.repeat(data_path_rem['Actor'].to_numpy(), rows_per_file).tolist()
assert len(actors) == len(X_all), "actors must have exactly one entry per feature row"

### Gender Features

In [15]:
# For the train and validation set:
# encode gender as a binary feature, 1 for 'Male' and 0 for anything else.
gender = [1 if label == 'Male' else 0 for label in gender_list_all]

In [16]:
# Turn the gender flags into a column vector and append it as the last
# column of the feature matrix.
X_gender = np.stack(gender).reshape(-1, 1)
X = np.concatenate((np.asarray(X_all), X_gender), axis=1)

## Data Preparation

In [17]:
def preprocess_data_and_labels(X, Y):
    """One-hot encode the labels and standardise the features.

    Args:
        X: 2-D feature matrix, one row per sample.
        Y: Sequence of class labels, one per row of X.

    Returns:
        A tuple ``(X_scaled, Y_onehot)``: the features after StandardScaler and
        the labels as a dense one-hot array.

    NOTE(review): both transformers are fitted on every row passed in, so
    calling this before a train/validation split lets validation rows
    influence the scaling statistics.
    """
    label_column = np.array(Y).reshape(-1, 1)
    Y = OneHotEncoder().fit_transform(label_column).toarray()

    X = StandardScaler().fit_transform(X)

    return X, Y

def split_acoustic_statistical_gender(X, n_acoustic=162, n_statistical=8):
    """Split the combined feature matrix into its three column groups.

    Args:
        X: 2-D array whose columns are laid out as
            ``[acoustic | statistical | ... | gender]``.
        n_acoustic: Number of leading acoustic feature columns (default 162).
        n_statistical: Number of statistical feature columns that follow
            (default 8, matching the eight values statistical_features returns).

    Returns:
        A tuple ``(X_acoustic, X_statistical, X_gender)`` where the first two
        are 2-D column slices and the last is the final column as a 1-D array.
    """
    stat_end = n_acoustic + n_statistical
    return X[:, :n_acoustic], X[:, n_acoustic:stat_end], X[:, -1]

### Deep Feature Extraction

In [18]:
# Number of emotion classes; used as the size of the softmax output layer
# when the models are built.
num_classes=6

In [19]:
def get_feature_extraction_model(input_shape, num_classes):
    """Build the (uncompiled) 1-D CNN used for deep feature extraction.

    Architecture: two Conv1D(256, kernel 5) + MaxPooling1D blocks (the second
    followed by dropout), then Flatten, Dense(32), Dense(12) with dropout in
    between, and a softmax output with `num_classes` units.

    Args:
        input_shape: Shape of one input sample, passed to the first Conv1D layer.
        num_classes: Number of units in the softmax output layer.

    Returns:
        A Keras ``Sequential`` model.
    """
    first_conv_block = [
        Conv1D(256, kernel_size=5, strides=1, padding='same', activation='relu', input_shape=input_shape),
        MaxPooling1D(pool_size=5, strides=2, padding='same'),
    ]
    second_conv_block = [
        Conv1D(256, kernel_size=5, strides=1, padding='same', activation='relu'),
        MaxPooling1D(pool_size=5, strides=2, padding='same'),
        Dropout(0.2),
    ]
    dense_head = [
        Flatten(),
        Dense(units=32, activation='relu'),
        Dropout(0.2),
        Dense(units=12, activation='relu'),
        Dropout(0.3),
        Dense(units=num_classes, activation='softmax'),
    ]
    return Sequential(first_conv_block + second_conv_block + dense_head)

In [20]:
def fit_evaluate_return_FE_model(model, X_train, y_train, X_test, y_test,
                                 epochs=5, batch_size=128, feature_layer_index=-5):
    """Train the CNN classifier and return a truncated model that emits deep features.

    The model is compiled with Adam and categorical cross-entropy, trained with
    ``(X_test, y_test)`` as validation data, and then cut at
    ``model.layers[feature_layer_index]``. For a model built by
    get_feature_extraction_model the default index -5 selects the 32-unit
    Dense layer.

    Args:
        model: Uncompiled Keras model to train.
        X_train, y_train: Training inputs and one-hot labels.
        X_test, y_test: Validation inputs and one-hot labels.
        epochs: Number of training epochs (default 5, as previously hard-coded).
        batch_size: Training batch size (default 128, as previously hard-coded).
        feature_layer_index: Index into ``model.layers`` of the layer whose
            output is used as the deep feature vector.

    Returns:
        A Keras ``Model`` mapping the original input to the chosen layer's output.

    Note:
        With the default ``epochs=5`` neither callback can trigger, because
        their patience values (6 and 8) exceed the number of epochs. Pass a
        larger `epochs` to make early stopping and LR reduction effective.
    """
    model.compile(optimizer='adam',
                  loss='categorical_crossentropy',
                  metrics=['accuracy', tf.keras.metrics.Precision(), tf.keras.metrics.Recall()])
    early_stopping_callback = EarlyStopping(monitor='val_loss', patience=8)
    rlrp = ReduceLROnPlateau(monitor='loss', factor=0.4, verbose=0, patience=6, min_lr=0.0000001)
    callback_list = [rlrp, early_stopping_callback]

    model.fit(X_train, y_train,
              batch_size=batch_size,
              epochs=epochs,
              validation_data=(X_test, y_test),
              callbacks=callback_list,
              verbose=0)

    # Re-use the trained layers up to the feature layer as a feature extractor.
    conv1d_feature_model = Model(inputs=model.input,
                                 outputs=model.layers[feature_layer_index].output)
    return conv1d_feature_model

### MLP - Dense Network Classifier

In [21]:
def get_classification_model(input_shape, num_classes):
    """Build the (uncompiled) MLP classifier.

    Two ReLU hidden layers (12 and 10 units) followed by a softmax output
    with `num_classes` units.

    Args:
        input_shape: Shape of one input sample, passed to the first Dense layer.
        num_classes: Number of units in the softmax output layer.

    Returns:
        A Keras ``Sequential`` model.
    """
    layers = [
        Dense(12, input_shape=input_shape, activation='relu'),
        Dense(10, activation='relu'),
        Dense(num_classes, activation='softmax'),
    ]
    return Sequential(layers)

## Actor-wise Cross Validation

### Folds split based on actors

In [22]:
import random

# Six folds of four consecutive actor ids each: fold k covers actors 4k+1 .. 4k+4.
actor_folds = [list(range(4 * fold + 1, 4 * fold + 5)) for fold in range(6)]
# For fold k, train on every other fold index and test on fold k itself.
train_folds = [[other for other in range(6) if other != fold] for fold in range(6)]
test_fold = list(range(6))

In [23]:
def get_indices_of_actors(actors_list, list_of_actors):
    """Return the positions in `actors_list` whose actor is in `list_of_actors`.

    Args:
        actors_list: Sequence of actor ids, one per sample.
        list_of_actors: Collection of actor ids to select.

    Returns:
        A list of integer indices into `actors_list`, in ascending order.
    """
    return [
        position
        for position, actor in enumerate(actors_list)
        if actor in list_of_actors
    ]

In [24]:
num_classes = 6
def actor_wise_cross_validation(X, Y, actors):
    """Run actor-wise cross validation and return the per-fold validation metrics.

    For every fold defined by the notebook-level ``train_folds`` and
    ``actor_folds``: a CNN is trained on the acoustic columns, its deep
    features are concatenated with the statistical and gender columns, and an
    MLP classifier is trained and evaluated on the held-out actors.

    Args:
        X: 2-D feature matrix laid out as expected by
            split_acoustic_statistical_gender.
        Y: Sequence of class labels, one per row of X.
        actors: Actor id for each row of X. If it instead holds one id per
            source file and X has a whole number of consecutive rows per file,
            each id is repeated so that it lines up with the rows of X.

    Returns:
        A list with one entry per fold: the value returned by
        ``model.evaluate`` on that fold's validation data.

    Raises:
        ValueError: If `actors` cannot be aligned with the rows of X.

    NOTE(review): preprocess_data_and_labels fits the scaler on all rows before
    the folds are split, so validation rows influence the scaling statistics.
    """
    X, Y = preprocess_data_and_labels(X, Y)
    X_ac, X_stat, X_gender = split_acoustic_statistical_gender(X)

    # Row indices are derived from `actors`, so it must describe every row of X.
    actors = list(actors)
    n_rows = X.shape[0]
    if len(actors) != n_rows:
        if not actors or n_rows % len(actors) != 0:
            raise ValueError(
                "actors has %d entries but X has %d rows" % (len(actors), n_rows)
            )
        rows_per_entry = n_rows // len(actors)
        actors = [actor for actor in actors for _ in range(rows_per_entry)]

    fold_metrics = []
    for i in range(len(train_folds)):
        print('Fold '+str(i+1)+'/'+str(len(train_folds)))

        # Get the indices for Train and Test Folds
        train_fold_actors = []
        for j in train_folds[i]:
            train_fold_actors.extend(actor_folds[j])
        test_fold_actors = actor_folds[i]
        train_actor_indices = get_indices_of_actors(actors, train_fold_actors)
        test_actor_indices = get_indices_of_actors(actors, test_fold_actors)

        # Separate acoustic features array for deep feature extraction;
        # Conv1D needs a trailing channel axis.
        X_ac_train = np.expand_dims(X_ac[train_actor_indices], -1)
        X_ac_val = np.expand_dims(X_ac[test_actor_indices], -1)

        # Separate statistical and gender features
        X_stat_train = X_stat[train_actor_indices]
        X_stat_val = X_stat[test_actor_indices]
        X_gender_train = np.expand_dims(X_gender[train_actor_indices], -1)
        X_gender_val = np.expand_dims(X_gender[test_actor_indices], -1)
        y_train = Y[train_actor_indices]
        y_val = Y[test_actor_indices]

        # Get deep feature extraction model
        df_model = get_feature_extraction_model((X_ac_train.shape[1], 1), num_classes)

        # Train the model
        print("Extracting Deep features")
        conv1d_feature_model = fit_evaluate_return_FE_model(df_model, X_ac_train, y_train, X_ac_val, y_val)

        # Transform the acoustic features
        df_train = conv1d_feature_model.predict(X_ac_train)
        df_val = conv1d_feature_model.predict(X_ac_val)

        X_train = np.concatenate([df_train, X_stat_train, X_gender_train], axis=1)
        X_val = np.concatenate([df_val, X_stat_val, X_gender_val], axis=1)

        # Create Classification model
        model = get_classification_model((X_train.shape[1],), num_classes)
        print("Training Model")
        model.compile(optimizer = 'adam' ,
                       loss = 'categorical_crossentropy',
                       metrics=['accuracy', tf.keras.metrics.Precision(), tf.keras.metrics.Recall()])
        model.fit(X_train,y_train,validation_data=(X_val, y_val),epochs=50,batch_size=64, verbose=0)
        # Keep the validation metrics instead of discarding them.
        fold_metrics.append(model.evaluate(X_val, y_val))

        print('Done')

    return fold_metrics

In [25]:
# Run the actor-wise cross validation on the combined feature matrix
# (acoustic + statistical + gender columns).
actor_wise_cross_validation(X, Y, actors)

Fold 1/6
Extracting Deep features
Training Model
Done
Fold 2/6
Extracting Deep features
Training Model
Done
Fold 3/6
Extracting Deep features
Training Model
Done
Fold 4/6
Extracting Deep features
Training Model
Done
Fold 5/6
Extracting Deep features
Training Model
Done
Fold 6/6
Extracting Deep features
Training Model
Done
