#  <center> Speech Emotion Recognition (SER Superset) </center>
##   <center> (With Cross Validation) </center>

### Importing Libraries

In [3]:
import pandas as pd
import numpy as np

import os
import sys

# librosa is a Python library for analyzing audio and music; it is used below to load the audio files and extract features from them.
import librosa
import librosa.display
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import confusion_matrix, classification_report
from sklearn.model_selection import train_test_split

# to play the audio files
from IPython.display import Audio

import keras
from keras.callbacks import ReduceLROnPlateau
from keras.models import Sequential
from keras.layers import Dense, Conv1D, MaxPooling1D, Flatten, Dropout, BatchNormalization
from keras.utils import np_utils, to_categorical
from keras.callbacks import ModelCheckpoint
from tensorflow.keras.callbacks import EarlyStopping
import tensorflow as tf
from tensorflow.keras.models import Model

import warnings
# Silence every warning category, but only when the interpreter was started without
# explicit -W options (sys.warnoptions is empty in that case).
if not sys.warnoptions:
    warnings.simplefilter("ignore")
# DeprecationWarning is ignored regardless of how the interpreter was started.
warnings.filterwarnings("ignore", category=DeprecationWarning) 

### Data Preparation
* As we are working with four different datasets, we will build one dataframe per dataset that stores the emotion label of every audio file together with its path.
* We will use this dataframe to extract features for our model training.

### Read Datasets

In [4]:
# Root directories of the four source corpora under /kaggle/input.
# Every path ends with "/" because the cells below build file paths by plain
# string concatenation (e.g. `Crema + file`).
Ravdess = "/kaggle/input/ravdess-emotional-speech-audio/audio_speech_actors_01-24/"
Crema = "/kaggle/input/cremad/AudioWAV/"
Tess = "/kaggle/input/toronto-emotional-speech-set-tess/tess toronto emotional speech set data/TESS Toronto emotional speech set data/"
Savee = "/kaggle/input/surrey-audiovisual-expressed-emotion-savee/ALL/"

##  <center> 1. Ravdess Dataframe </center>
Here are the filename identifiers as per the official RAVDESS website:

* Modality (01 = full-AV, 02 = video-only, 03 = audio-only).
* Vocal channel (01 = speech, 02 = song).
* Emotion (01 = neutral, 02 = calm, 03 = happy, 04 = sad, 05 = angry, 06 = fearful, 07 = disgust, 08 = surprised).
* Emotional intensity (01 = normal, 02 = strong). NOTE: There is no strong intensity for the 'neutral' emotion.
* Statement (01 = "Kids are talking by the door", 02 = "Dogs are sitting by the door").
* Repetition (01 = 1st repetition, 02 = 2nd repetition).
* Actor (01 to 24. Odd numbered actors are male, even numbered actors are female).

In [5]:
ravdess_directory_list = os.listdir(Ravdess)

# RAVDESS emotion code (third dash-separated field of the file name) -> text label.
ravdess_emotion_labels = {1: 'neutral', 2: 'calm', 3: 'happy', 4: 'sad',
                          5: 'angry', 6: 'fear', 7: 'disgust', 8: 'surprise'}

file_emotion = []
file_actor = []
file_path = []
for actor_dir in ravdess_directory_list:
    # Each entry of the root directory is one actor's folder; walk the files inside it.
    for file in os.listdir(Ravdess + actor_dir):
        # File names look like "03-01-08-01-01-01-02.wav": dash-separated numeric fields.
        part = file.split('.')[0].split('-')
        # Third field is the emotion code, last field is the actor number.
        file_emotion.append(int(part[2]))
        file_actor.append(int(part[-1]))
        file_path.append(Ravdess + actor_dir + '/' + file)

Ravdess_df = pd.DataFrame({'Emotions': file_emotion, 'Path': file_path})

# Assign the mapped column back instead of calling replace(..., inplace=True) on
# `Ravdess_df.Emotions`: an in-place edit through attribute access is a chained
# assignment and leaves the frame unchanged when pandas Copy-on-Write is active.
Ravdess_df['Emotions'] = Ravdess_df['Emotions'].replace(ravdess_emotion_labels)
Ravdess_df['Actor'] = file_actor
# Odd-numbered actors are male, even-numbered actors are female.
Ravdess_df['Gender'] = np.where(Ravdess_df['Actor'] % 2 == 1, 'Male', 'Female')

# Show full paths in the preview instead of truncating them.
pd.set_option('display.max_colwidth', None)
Ravdess_df.head()

Unnamed: 0,Emotions,Path,Actor,Gender
0,surprise,/kaggle/input/ravdess-emotional-speech-audio/audio_speech_actors_01-24/Actor_02/03-01-08-01-01-01-02.wav,2,Female
1,neutral,/kaggle/input/ravdess-emotional-speech-audio/audio_speech_actors_01-24/Actor_02/03-01-01-01-01-01-02.wav,2,Female
2,disgust,/kaggle/input/ravdess-emotional-speech-audio/audio_speech_actors_01-24/Actor_02/03-01-07-02-01-02-02.wav,2,Female
3,disgust,/kaggle/input/ravdess-emotional-speech-audio/audio_speech_actors_01-24/Actor_02/03-01-07-01-01-02-02.wav,2,Female
4,neutral,/kaggle/input/ravdess-emotional-speech-audio/audio_speech_actors_01-24/Actor_02/03-01-01-01-02-01-02.wav,2,Female


## <center>2. Crema DataFrame</center>

In [6]:
crema_directory_list = os.listdir(Crema)

# Third underscore-separated field of a CREMA-D file name -> emotion label.
# Any code not listed here is labelled 'Unknown'.
crema_emotion_codes = {
    'SAD': 'sad',
    'ANG': 'angry',
    'DIS': 'disgust',
    'FEA': 'fear',
    'HAP': 'happy',
    'NEU': 'neutral',
}

# One path and one emotion per file, in directory-listing order.
file_path = [Crema + file for file in crema_directory_list]
file_emotion = [
    crema_emotion_codes.get(file.split('_')[2], 'Unknown')
    for file in crema_directory_list
]

# Emotion label and path side by side, one row per audio file.
Crema_df = pd.DataFrame({'Emotions': file_emotion, 'Path': file_path})

# Actor metadata: keep only the actor id and the sex column (renamed to 'Gender').
crema_gender = (
    pd.read_csv("/kaggle/input/gender-for-crema/actor-genderid - actor-genderid.csv")
    .rename(columns={'Sex': 'Gender'})
    .drop(columns=['Age', 'Race', 'Ethnicity'])
)
crema_gender.head()

Unnamed: 0,ActorID,Gender
0,1001,Male
1,1002,Female
2,1003,Female
3,1004,Female
4,1005,Male


In [7]:
# The actor id is the run of digits that starts the file name
# (e.g. "1028" in ".../1028_TSI_DIS_XX.wav"). The pattern is anchored to the last
# path component so digits in a directory name can never be picked up instead.
Crema_df['match_actorID'] = Crema_df['Path'].str.extract(r'(?:^|/)(\d+)_[^/]*$', expand=False)

# The ids extracted from the paths are strings, so compare against a string copy of ActorID.
crema_gender['match_actorID'] = crema_gender['ActorID'].astype(str)

# Left merge keeps every audio file; a file whose id has no entry in the gender
# table ends up with missing Actor/Gender values rather than being dropped.
merged_df_crema = (
    pd.merge(Crema_df, crema_gender, on='match_actorID', how='left')
    .drop(columns=['match_actorID'])
    # 'Gender_y' only exists if both inputs carried a 'Gender' column; kept for safety.
    .rename(columns={'Gender_y': 'Gender', 'ActorID': 'Actor'})
)

In [8]:
# Distinct CREMA-D actor ids in ascending order.
actor_list = sorted(merged_df_crema['Actor'].unique())

# Renumber the CREMA-D actors into the block 31..121 so they do not collide with the
# ids used for the other corpora (RAVDESS file names carry 1..24, TESS is mapped to
# 25-26 and SAVEE to 27-30).
corrected_actor_id = list(range(31,122))
actor_dict = {
    original_id: corrected_actor_id[position]
    for position, original_id in enumerate(actor_list)
}

merged_df_crema['Actor'] = merged_df_crema['Actor'].replace(actor_dict)
merged_df_crema.head()

Unnamed: 0,Emotions,Path,Actor,Gender
0,disgust,/kaggle/input/cremad/AudioWAV/1028_TSI_DIS_XX.wav,58,Female
1,happy,/kaggle/input/cremad/AudioWAV/1075_IEO_HAP_LO.wav,105,Female
2,happy,/kaggle/input/cremad/AudioWAV/1084_ITS_HAP_XX.wav,114,Female
3,disgust,/kaggle/input/cremad/AudioWAV/1067_IWW_DIS_XX.wav,97,Male
4,disgust,/kaggle/input/cremad/AudioWAV/1066_TIE_DIS_XX.wav,96,Male


##  <center> 3. TESS dataset </center>

In [9]:
tess_directory_list = os.listdir(Tess)

file_emotion = []
file_path = []

for speaker_dir in tess_directory_list:
    for file in os.listdir(Tess + speaker_dir):
        # File names look like "YAF_home_fear.wav": the third underscore-separated
        # field (extension removed) is the emotion; 'ps' is stored as 'surprise'.
        label = file.split('.')[0].split('_')[2]
        file_emotion.append('surprise' if label == 'ps' else label)
        file_path.append(Tess + speaker_dir + '/' + file)

# Emotion label and path side by side, one row per audio file.
Tess_df = pd.DataFrame({'Emotions': file_emotion, 'Path': file_path})
Tess_df['Gender'] = 'Female'
pd.set_option('display.max_colwidth', None)

# Speaker prefix found in the path ('OAF' or 'YAF') -> numeric actor id.
actor_mapping = {'OAF': 25, 'YAF': 26}
Tess_df['Actor'] = (
    Tess_df['Path']
    .str.extract(r'/(OAF|YAF)_', expand=False)
    .map(actor_mapping)
)
Tess_df.head()

Unnamed: 0,Emotions,Path,Gender,Actor
0,fear,/kaggle/input/toronto-emotional-speech-set-tess/tess toronto emotional speech set data/TESS Toronto emotional speech set data/YAF_fear/YAF_home_fear.wav,Female,26
1,fear,/kaggle/input/toronto-emotional-speech-set-tess/tess toronto emotional speech set data/TESS Toronto emotional speech set data/YAF_fear/YAF_youth_fear.wav,Female,26
2,fear,/kaggle/input/toronto-emotional-speech-set-tess/tess toronto emotional speech set data/TESS Toronto emotional speech set data/YAF_fear/YAF_near_fear.wav,Female,26
3,fear,/kaggle/input/toronto-emotional-speech-set-tess/tess toronto emotional speech set data/TESS Toronto emotional speech set data/YAF_fear/YAF_search_fear.wav,Female,26
4,fear,/kaggle/input/toronto-emotional-speech-set-tess/tess toronto emotional speech set data/TESS Toronto emotional speech set data/YAF_fear/YAF_pick_fear.wav,Female,26


##  <center> 4. SAVEE dataset </center>
The audio files in this dataset are named in such a way that the prefix letters describe the emotion classes as follows:

* 'a' = 'anger'
* 'd' = 'disgust'
* 'f' = 'fear'
* 'h' = 'happiness'
* 'n' = 'neutral'
* 'sa' = 'sadness'
* 'su' = 'surprise'

In [10]:
savee_directory_list = os.listdir(Savee)

# Emotion prefix of the second underscore-separated field (what is left after
# dropping its last six characters, e.g. "09.wav") -> emotion label.
# Any prefix not listed here is labelled 'surprise'.
savee_emotion_codes = {
    'a': 'angry',
    'd': 'disgust',
    'f': 'fear',
    'h': 'happy',
    'n': 'neutral',
    'sa': 'sad',
}

# One path and one emotion per file, in directory-listing order.
file_path = [Savee + file for file in savee_directory_list]
file_emotion = [
    savee_emotion_codes.get(file.split('_')[1][:-6], 'surprise')
    for file in savee_directory_list
]

# Emotion label and path side by side, one row per audio file.
Savee_df = pd.DataFrame({'Emotions': file_emotion, 'Path': file_path})
Savee_df['Gender'] = 'Male'

# Speaker initials found in the path -> numeric actor id.
actor_mapping = {'DC': 27, 'JE': 28, 'JK': 29, 'KL': 30}
Savee_df['Actor'] = (
    Savee_df['Path']
    .str.extract(r'/(DC|JE|JK|KL)_', expand=False)
    .map(actor_mapping)
)
Savee_df.head()

Unnamed: 0,Emotions,Path,Gender,Actor
0,happy,/kaggle/input/surrey-audiovisual-expressed-emotion-savee/ALL/JE_h09.wav,Male,28
1,fear,/kaggle/input/surrey-audiovisual-expressed-emotion-savee/ALL/KL_f12.wav,Male,30
2,happy,/kaggle/input/surrey-audiovisual-expressed-emotion-savee/ALL/DC_h03.wav,Male,27
3,disgust,/kaggle/input/surrey-audiovisual-expressed-emotion-savee/ALL/DC_d04.wav,Male,27
4,angry,/kaggle/input/surrey-audiovisual-expressed-emotion-savee/ALL/KL_a14.wav,Male,30


In [11]:
# Stack the four per-corpus frames into one table (columns are aligned by name).
# ignore_index=True gives every row a unique 0..N-1 label; without it each corpus
# keeps its own 0-based index and the combined frame has duplicate row labels.
data_path = pd.concat([Ravdess_df, merged_df_crema, Tess_df, Savee_df], axis=0, ignore_index=True)
# Persist the combined table (row labels are not written).
data_path.to_csv("data_path.csv", index=False)
data_path.head()

Unnamed: 0,Emotions,Path,Actor,Gender
0,surprise,/kaggle/input/ravdess-emotional-speech-audio/audio_speech_actors_01-24/Actor_02/03-01-08-01-01-01-02.wav,2,Female
1,neutral,/kaggle/input/ravdess-emotional-speech-audio/audio_speech_actors_01-24/Actor_02/03-01-01-01-01-01-02.wav,2,Female
2,disgust,/kaggle/input/ravdess-emotional-speech-audio/audio_speech_actors_01-24/Actor_02/03-01-07-02-01-02-02.wav,2,Female
3,disgust,/kaggle/input/ravdess-emotional-speech-audio/audio_speech_actors_01-24/Actor_02/03-01-07-01-01-02-02.wav,2,Female
4,neutral,/kaggle/input/ravdess-emotional-speech-audio/audio_speech_actors_01-24/Actor_02/03-01-01-01-02-01-02.wav,2,Female


In [12]:
# (rows, columns) of the combined table.
data_path.shape

(12162, 4)

In [13]:
# Number of audio files per gender label across all four corpora.
gender_distribution = data_path['Gender'].value_counts()
gender_distribution

Female    7032
Male      5130
Name: Gender, dtype: int64

### Handling Class Imbalance

In [14]:
# Drop the 'surprise' and 'calm' rows (the least represented emotion classes);
# every other row is kept unchanged.
excluded_emotions = ['surprise', 'calm']
data_path_rem = data_path[~data_path['Emotions'].isin(excluded_emotions)]

## Data Augmentation

- Data augmentation is the process by which we create new synthetic data samples by adding small perturbations on our initial training set.
- To generate synthetic data for audio, we can apply noise injection, time shifting, and changes of pitch and speed.
- The objective is to make our model invariant to those perturbations and enhance its ability to generalize.

In [20]:
# NOTE(review): `rate` is not read by the augmentation helpers shown below —
# stretch() hard-codes rate=2.0. Confirm whether this value was meant to be used.
rate=0.8

In [21]:
sampling_rate=22050
def noise(data):
    """Return `data` with additive Gaussian noise of random amplitude.

    The amplitude is 0.035 * U * max(data), where U is one draw from
    np.random.uniform(); one standard-normal sample is drawn per element of
    `data` (length taken from data.shape[0]). Uses NumPy's global random state.
    """
    amplitude = 0.035 * np.random.uniform() * np.amax(data)
    gaussian = np.random.normal(size=data.shape[0])
    return data + amplitude * gaussian

def stretch(data, rate=2.0):
    """Time-stretch `data` with librosa.effects.time_stretch.

    Parameters
    ----------
    data : audio signal passed straight to librosa.
    rate : stretch factor forwarded to librosa; defaults to 2.0, the value that
        was previously hard-coded, so existing one-argument calls are unchanged.

    Returns
    -------
    The array returned by librosa.effects.time_stretch.
    """
    return librosa.effects.time_stretch(data, rate=rate)

def pitch(data, sr=None, n_steps=10):
    """Pitch-shift `data` with librosa.effects.pitch_shift.

    Parameters
    ----------
    data : audio signal passed straight to librosa.
    sr : sampling rate of `data`; when None the module-level `sampling_rate`
        is used, which is what the one-argument form has always done.
    n_steps : shift amount forwarded to librosa; defaults to 10, the value that
        was previously hard-coded.

    Returns
    -------
    The array returned by librosa.effects.pitch_shift.
    """
    if sr is None:
        sr = sampling_rate
    return librosa.effects.pitch_shift(data, sr=sr, n_steps=n_steps)

import scipy.signal as signal

def low_pass_filter(data, sr=None, cutoff_freq=5000, numtaps=10):
    """Low-pass filter `data` with a windowed FIR filter.

    Parameters
    ----------
    data : 1-D audio signal.
    sr : sampling rate of `data` in Hz; when None the module-level
        `sampling_rate` is used (the previous fixed behaviour).
    cutoff_freq : cutoff frequency in Hz (default 5000, i.e. 5 kHz).
    numtaps : number of filter taps, i.e. the filter length (default 10).

    Returns
    -------
    The filtered signal, same length as `data` (mode='same' convolution).
    """
    if sr is None:
        sr = sampling_rate

    # firwin expects the cutoff as a fraction of the Nyquist frequency.
    nyquist_freq = 0.5 * sr
    taps = signal.firwin(numtaps, cutoff_freq / nyquist_freq)

    # mode='same' keeps the output aligned with, and as long as, the input.
    return signal.convolve(data, taps, mode='same')

### Feature Extraction - Statistical and acoustic

Extracting 5 acoustic features to train our model:
- Zero Crossing Rate
- Chroma_stft
- MFCC
- RMS (root mean square) value
- Mel spectrogram

The statistical features that are extracted are:
- Mean
- Variance
- Skewness
- Kurtosis
- Audio_rms
- Spectral centroids
- Spectral bandwidth
- MFCC (overall mean of 13 coefficients)

In [22]:
# Sampling rate handed to the librosa calls inside extract_features().
# Same value as `sampling_rate`, which the augmentation helpers and statistical_features() read.
sample_rate=22050
import os, glob
import librosa
import numpy as np
import scipy
def extract_features(data):
    """Return one flat vector of time-averaged acoustic descriptors for `data`.

    The pieces are concatenated in this order: zero-crossing rate, chroma (from
    the STFT magnitude), MFCC, RMS energy and mel spectrogram. Each descriptor
    is averaged over its frames. The librosa calls use the module-level
    `sample_rate`.
    NOTE(review): with librosa's default sizes the vector presumably has 162
    entries, matching the `:162` slice used when the features are split — confirm.
    """
    def _frame_mean(frames):
        # Frames run along the last axis; average them away.
        return np.mean(frames.T, axis=0)

    zcr = _frame_mean(librosa.feature.zero_crossing_rate(y=data))

    # Chroma is computed from the magnitude of the short-time Fourier transform.
    magnitude = np.abs(librosa.stft(data))
    chroma_stft = _frame_mean(librosa.feature.chroma_stft(S=magnitude, sr=sample_rate))

    mfcc = _frame_mean(librosa.feature.mfcc(y=data, sr=sample_rate))
    rms = _frame_mean(librosa.feature.rms(y=data))
    mel = _frame_mean(librosa.feature.melspectrogram(y=data, sr=sample_rate))

    # The leading empty float64 array keeps the result dtype the same as when the
    # vector is grown piece by piece from an empty array.
    return np.hstack((np.array([]), zcr, chroma_stft, mfcc, rms, mel))

def statistical_features(data):
    """Return eight summary statistics of `data` as a flat float vector.

    Order of the entries: mean, variance, skewness, kurtosis, mean RMS energy,
    mean spectral centroid, mean spectral bandwidth, and the overall mean of
    13 MFCCs. The librosa calls use the module-level `sampling_rate`.
    """
    # Imported explicitly: the notebook only does `import scipy`, which does not
    # by itself guarantee that the `scipy.stats` submodule has been loaded.
    from scipy import stats

    values = [
        np.mean(data),
        np.var(data),
        stats.skew(data),
        stats.kurtosis(data),
        librosa.feature.rms(y=data).mean(),
        librosa.feature.spectral_centroid(y=data, sr=sampling_rate)[0].mean(),
        librosa.feature.spectral_bandwidth(y=data, sr=sampling_rate)[0].mean(),
        librosa.feature.mfcc(y=data, sr=sampling_rate, n_mfcc=13).mean(),
    ]

    # The leading empty float64 array fixes the result dtype, as in piecewise stacking.
    return np.hstack([np.array([])] + values)

def _feature_row(audio):
    """Concatenate the acoustic and statistical descriptors of one signal into a single row."""
    return np.hstack((extract_features(audio), statistical_features(audio)))


def get_features(path, include_noise=False):
    """Load one audio file and return a feature matrix with one row per variant.

    Parameters
    ----------
    path : audio file to load.
    include_noise : when False (default) the rows are, in order: original,
        time-stretched, low-pass filtered, pitch-shifted — the same four rows
        this function has always returned. When True, a noise-augmented row is
        inserted right after the original row (five rows in total).

    Returns
    -------
    2-D array; every row is extract_features(...) followed by
    statistical_features(...) for one variant of the signal.

    Notes
    -----
    Earlier, the noise variant was always computed but never reached the
    returned matrix (its row was overwritten by the stretched one). It is now
    only computed on request, so the default path no longer draws from NumPy's
    random state.
    """
    # offset/duration skip the start of the file and cap the clip length.
    # The rate returned by librosa.load is not used; the feature helpers read
    # module-level rate constants instead.
    data, _ = librosa.load(path, duration=2.5, offset=0.6)

    variants = [data]
    if include_noise:
        variants.append(noise(data))
    variants.extend([stretch(data), low_pass_filter(data), pitch(data)])

    return np.vstack([_feature_row(variant) for variant in variants])


### Creating a sequence of features and target

In [63]:
X_all, Y, gender_list_all = [], [], []
total_files = len(data_path_rem)
count = 1
for path, emotion, gender in zip(data_path_rem.Path, data_path_rem.Emotions, data_path_rem.Gender):
    rows = get_features(path)
    # get_features returns one row per variant of the file (original plus
    # augmentations); the emotion and gender are repeated once for every row so
    # the three lists stay aligned.
    X_all.extend(rows)
    Y.extend([emotion] * len(rows))
    gender_list_all.extend([gender] * len(rows))
    # Progress message every 500 files.
    if not count % 500:
        print(f'Done Reading {count}/{total_files} files')
    count += 1

500/11318
1000/11318
1500/11318
2000/11318
2500/11318
3000/11318
3500/11318
4000/11318
4500/11318
5000/11318
5500/11318
6000/11318
6500/11318
7000/11318
7500/11318
8000/11318
8500/11318
9000/11318
9500/11318
10000/11318
10500/11318
11000/11318


In [74]:
# Load precomputed features and labels from a Kaggle input dataset.
# NOTE(review): this overwrites the X_all and Y built by the extraction loop, while
# gender_list_all is not reloaded; the cells below assume these arrays are
# row-aligned with gender_list_all — confirm the .npy files came from the same loop.
X_all = np.load('/kaggle/input/features/X_all.npy')
Y = np.load('/kaggle/input/features/Y_all.npy')

In [75]:
# One actor id per FEATURE ROW, not per audio file.
# The extraction loop appends several consecutive rows per file (original plus
# augmentations), so X_all has a whole multiple of len(data_path_rem) rows. A plain
# per-file list would be shorter than X_all and the cross-validation would pick rows
# by the wrong actor. Repeating each file's actor keeps the list aligned with the
# rows (assumes X_all keeps the file order of data_path_rem — confirm for the .npy).
_file_actors = data_path_rem['Actor'].to_numpy()
if len(_file_actors) == 0:
    raise ValueError("data_path_rem is empty; cannot derive per-row actor ids")
_rows_per_file, _leftover_rows = divmod(len(X_all), len(_file_actors))
if _rows_per_file < 1 or _leftover_rows:
    raise ValueError(
        "X_all has " + str(len(X_all)) + " rows, which is not a whole multiple of the "
        + str(len(_file_actors)) + " files in data_path_rem"
    )
actors = np.repeat(_file_actors, _rows_per_file).tolist()

### Extracting Gender Features

In [46]:
# Binary gender flag per feature row (train and validation alike):
# 1 for 'Male', 0 for every other value.
gender = [1 if label == 'Male' else 0 for label in gender_list_all]

In [76]:
# Turn the gender flags into a single column and append it as the last feature column.
X_gender = np.stack(gender)[..., np.newaxis]
X = np.concatenate((X_all, X_gender), axis=1)

## Data Preparation

- Now that the features have been extracted, we need to normalize the data and split it for training and testing.

In [77]:
def preprocess_data_and_labels(X, Y):
    """One-hot encode the labels and standardise the feature matrix.

    Parameters
    ----------
    X : 2-D feature matrix.
    Y : sequence of class labels, one per row of X.

    Returns
    -------
    (X, Y) : X transformed by a StandardScaler fitted on all rows passed in,
        and Y as a dense one-hot array.

    Note: both the encoder and the scaler are fitted on everything passed in,
    so a caller that splits the data afterwards gets scaling statistics
    computed over all of its splits.
    """
    label_column = np.array(Y).reshape(-1, 1)
    Y = OneHotEncoder().fit_transform(label_column).toarray()
    X = StandardScaler().fit_transform(X)
    return X, Y

def split_acoustic_statistical_gender(X):
    """Split a feature matrix column-wise into its three groups.

    Returns a tuple of:
      - columns 0..161   (acoustic block, 2-D),
      - columns 162..169 (statistical block, 2-D),
      - the last column  (gender, 1-D).
    """
    acoustic_end, statistical_end = 162, 170
    acoustic = X[:, :acoustic_end]
    statistical = X[:, acoustic_end:statistical_end]
    gender_column = X[:, -1]
    return acoustic, statistical, gender_column

### Deep Feature Extraction

In [78]:
# Width of the softmax output layers.
# NOTE(review): presumably the six emotions left after dropping 'surprise' and
# 'calm' — confirm against the width of the one-hot encoded labels.
num_classes=6

In [79]:
def get_feature_extraction_model(input_shape, num_classes):
    """Build (without compiling) the 1-D CNN used for deep feature extraction.

    Two Conv1D + MaxPooling1D stages are followed by a flatten step, Dense
    layers of 32 and 12 units (each followed by dropout) and a softmax layer
    with `num_classes` outputs.

    Parameters
    ----------
    input_shape : shape of one sample, passed to the first Conv1D layer.
    num_classes : number of units of the final softmax layer.
    """
    layer_stack = [
        # Convolutional stage 1
        Conv1D(256, kernel_size=5, strides=1, padding='same', activation='relu', input_shape=input_shape),
        MaxPooling1D(pool_size=5, strides=2, padding='same'),
        # Convolutional stage 2
        Conv1D(256, kernel_size=5, strides=1, padding='same', activation='relu'),
        MaxPooling1D(pool_size=5, strides=2, padding='same'),
        Dropout(0.2),
        # Dense head
        Flatten(),
        Dense(units=32, activation='relu'),
        Dropout(0.2),
        Dense(units=12, activation='relu'),
        Dropout(0.3),
        Dense(units=num_classes, activation='softmax'),
    ]
    return Sequential(layer_stack)

In [86]:
def fit_evaluate_return_FE_model(model, X_train, y_train, X_test, y_test):
    """Compile and train `model`, then return a sub-model that outputs deep features.

    Parameters
    ----------
    model : uncompiled Keras model; it is compiled and fitted in place.
    X_train, y_train : training inputs and one-hot targets.
    X_test, y_test : data used as validation set during fitting.

    Returns
    -------
    A Keras Model that maps the input of `model` to the output of its
    fifth-from-last layer. For the network built by
    get_feature_extraction_model that layer is the Dense(32) layer.

    NOTE(review): training runs for 5 epochs while the callbacks use patience 6
    and 8, so they appear unable to trigger within a run — confirm this is intended.
    """
    tracked_metrics = ['accuracy', tf.keras.metrics.Precision(), tf.keras.metrics.Recall()]
    model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=tracked_metrics)

    callback_list = [
        ReduceLROnPlateau(monitor='loss', factor=0.4, verbose=0, patience=6, min_lr=0.0000001),
        EarlyStopping(monitor='val_loss', patience=8),
    ]

    model.fit(
        X_train,
        y_train,
        batch_size=128,
        epochs=5,
        validation_data=(X_test, y_test),
        callbacks=callback_list,
        verbose=0,
    )

    # Cut the trained network at its fifth-from-last layer.
    feature_layer = model.layers[-5]
    return Model(inputs=model.input, outputs=feature_layer.output)

### MLP - Dense Network Classifier

In [81]:
def get_classification_model(input_shape, num_classes):
    """Build (without compiling) the small dense classifier.

    Two hidden ReLU layers (12 and 10 units) feed a softmax layer with
    `num_classes` outputs.

    Parameters
    ----------
    input_shape : shape of one input sample, passed to the first Dense layer.
    num_classes : number of units of the final softmax layer.
    """
    return Sequential([
        Dense(12, input_shape=input_shape, activation='relu'),
        Dense(10, activation='relu'),
        Dense(num_classes, activation='softmax'),
    ])

## Actor-wise Cross Validation

In [82]:
import random

random.seed(42)  # fixed seed so the shuffled actor order is reproducible

# Distinct actor ids in random order.
actor_list = data_path_rem['Actor'].unique().tolist()
random.shuffle(actor_list)

# 11 folds, each holding 11 consecutive actors of the shuffled list
# (only the first 121 entries of the list are used).
actor_folds = [actor_list[start:start + 11] for start in range(0, 121, 11)]

# For fold k: train on every other fold, test on fold k.
train_folds = [
    [fold for fold in range(11) if fold != held_out]
    for held_out in range(11)
]
test_fold = list(range(11))

In [89]:
# Actor ids assigned to each of the folds.
print(actor_folds)

[[28, 100, 78, 102, 92, 37, 54, 62, 77, 98, 57], [64, 67, 74, 27, 112, 40, 87, 84, 45, 33, 11], [52, 115, 31, 81, 14, 75, 58, 83, 12, 10, 9], [44, 96, 73, 109, 43, 51, 103, 59, 36, 86, 23], [25, 60, 4, 55, 110, 79, 13, 120, 18, 22, 121], [20, 104, 117, 90, 108, 41, 116, 68, 35, 114, 50], [47, 29, 42, 82, 107, 48, 113, 19, 2, 16, 72], [1, 8, 56, 49, 61, 65, 21, 32, 30, 6, 5], [15, 71, 39, 111, 106, 38, 119, 118, 91, 80, 101], [24, 66, 63, 7, 26, 95, 89, 69, 76, 105, 97], [94, 46, 34, 70, 53, 3, 99, 85, 88, 93, 17]]


In [90]:
# For each split, the indices of the folds used for training.
print(train_folds)

[[1, 2, 3, 4, 5, 6, 7, 8, 9, 10], [0, 2, 3, 4, 5, 6, 7, 8, 9, 10], [0, 1, 3, 4, 5, 6, 7, 8, 9, 10], [0, 1, 2, 4, 5, 6, 7, 8, 9, 10], [0, 1, 2, 3, 5, 6, 7, 8, 9, 10], [0, 1, 2, 3, 4, 6, 7, 8, 9, 10], [0, 1, 2, 3, 4, 5, 7, 8, 9, 10], [0, 1, 2, 3, 4, 5, 6, 8, 9, 10], [0, 1, 2, 3, 4, 5, 6, 7, 9, 10], [0, 1, 2, 3, 4, 5, 6, 7, 8, 10], [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]]


In [92]:
# For each split, the index of the held-out fold.
print(test_fold)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10]


In [83]:
def get_indices_of_actors(actors_list, list_of_actors):
    """Return the positions in `actors_list` whose actor appears in `list_of_actors`.

    Positions are returned in ascending order; the result is an empty list when
    nothing matches.
    """
    return [
        position
        for position, actor in enumerate(actors_list)
        if actor in list_of_actors
    ]

In [87]:
num_classes = 6
def actor_wise_cross_validation(X, Y, actors):
    """Run actor-wise cross-validation of the two-stage model.

    For every split defined by the module-level `train_folds` / `actor_folds`:
    rows are assigned to train or validation by their actor, a CNN is trained on
    the acoustic columns and cut to produce deep features, and a dense
    classifier is trained on [deep features, statistical columns, gender].

    Parameters
    ----------
    X : 2-D feature matrix (acoustic columns, statistical columns, gender last).
    Y : class labels, one per row of X.
    actors : actor id per row, used to look up which rows belong to which fold.

    Returns
    -------
    List with one entry per split: the value returned by `model.evaluate` on the
    validation rows (loss followed by the compiled metrics: accuracy, precision,
    recall). Previously these values were computed and thrown away.
    """
    X, Y = preprocess_data_and_labels(X, Y)
    # Output width taken from the encoded labels, so the softmax layers always
    # match the targets (equal to the module-level num_classes for six classes).
    n_classes = Y.shape[1]
    fold_scores = []

    for i in range(len(train_folds)):
        print('Fold '+str(i+1)+'/'+str(len(train_folds)))

        # Row positions of the training and validation actors for this split
        train_fold_actors = [actor for j in train_folds[i] for actor in actor_folds[j]]
        test_fold_actors = actor_folds[i]
        train_actor_indices = get_indices_of_actors(actors, train_fold_actors)
        test_actor_indices = get_indices_of_actors(actors, test_fold_actors)

        # Re-standardise with statistics of the TRAINING rows only.
        # preprocess_data_and_labels scaled X using every row, validation actors
        # included. Standardising again on the training rows replaces those
        # statistics (standardisation is unaffected by the earlier per-column
        # rescaling), so nothing about the held-out actors leaks into the inputs.
        fold_scaler = StandardScaler().fit(X[train_actor_indices])
        X_train_scaled = fold_scaler.transform(X[train_actor_indices])
        X_val_scaled = fold_scaler.transform(X[test_actor_indices])

        X_ac_train, X_stat_train, X_gender_train = split_acoustic_statistical_gender(X_train_scaled)
        X_ac_val, X_stat_val, X_gender_val = split_acoustic_statistical_gender(X_val_scaled)

        # Conv1D expects a trailing channel axis
        X_ac_train = np.expand_dims(X_ac_train, -1)
        X_ac_val = np.expand_dims(X_ac_val, -1)
        # Gender comes back 1-D; make it a column so it can be concatenated
        X_gender_train = np.expand_dims(X_gender_train, -1)
        X_gender_val = np.expand_dims(X_gender_val, -1)
        y_train = Y[train_actor_indices]
        y_val = Y[test_actor_indices]

        # Train the CNN on the acoustic columns and cut it into a feature extractor
        df_model = get_feature_extraction_model((X_ac_train.shape[1], 1), n_classes)
        print("Extracting Deep features")
        conv1d_feature_model = fit_evaluate_return_FE_model(df_model, X_ac_train, y_train, X_ac_val, y_val)

        # Replace the acoustic columns by their deep representation
        df_train = conv1d_feature_model.predict(X_ac_train)
        df_val = conv1d_feature_model.predict(X_ac_val)

        X_train = np.concatenate([df_train, X_stat_train, X_gender_train], axis=1)
        X_val = np.concatenate([df_val, X_stat_val, X_gender_val], axis=1)

        # Dense classifier on the combined features
        model = get_classification_model((X_train.shape[1],), n_classes)
        print("Training Model")
        model.compile(optimizer = 'adam' ,
                       loss = 'categorical_crossentropy',
                       metrics=['accuracy', tf.keras.metrics.Precision(), tf.keras.metrics.Recall()])
        model.fit(X_train,y_train,validation_data=(X_val, y_val),epochs=50,batch_size=64, verbose=0)
        fold_scores.append(model.evaluate(X_val, y_val))

        print('Done')

    return fold_scores

In [88]:
# Run the actor-wise cross-validation over every fold defined above.
actor_wise_cross_validation(X, Y, actors)

Fold 1/11
Extracting Deep features
Training Model
Done
Fold 2/11
Extracting Deep features
Training Model
Done
Fold 3/11
Extracting Deep features
Training Model
Done
Fold 4/11
Extracting Deep features
Training Model
Done
Fold 5/11
Extracting Deep features
Training Model
Done
Fold 6/11
Extracting Deep features
Training Model
Done
Fold 7/11
Extracting Deep features
Training Model
Done
Fold 8/11
Extracting Deep features
Training Model
Done
Fold 9/11
Extracting Deep features
Training Model
Done
Fold 10/11
Extracting Deep features
Training Model
Done
Fold 11/11
Extracting Deep features
Training Model
Done
