Download dataset and unzip it

In [0]:
%%capture

fileid='1FUZJECbq2GSXnzoBXpptz0-3IapCXUKx'
filename='emotion.zip'

#https://drive.google.com/open?id=1FUZJECbq2GSXnzoBXpptz0-3IapCXUKx

! wget --save-cookies cookies.txt 'https://docs.google.com/uc?export=download&id={fileid}' -O- | sed -rn 's/.*confirm=([0-9A-Za-z_]+).*/\1/p' > confirm.txt

! wget --load-cookies cookies.txt -O {filename} 'https://docs.google.com/uc?export=download&id='{fileid}'&confirm='$(<confirm.txt)

! unzip -q -n emotion.zip

# pip install librosa

In [0]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [0]:
import librosa
from fastai.vision import *
from IPython.display import Audio
import librosa.display

import re

from keras import backend as K
from keras.models import Model, load_model

from keras.layers import Input, Dense, Embedding, Conv2D, MaxPool2D, Lambda, LSTM, TimeDistributed, Masking, Bidirectional
from keras import losses, optimizers
from keras.layers.normalization import BatchNormalization

from sklearn.metrics import confusion_matrix, precision_recall_fscore_support
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import StandardScaler
from sklearn.multiclass import OneVsRestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

Function to extract features from sound

In [0]:
def feature_extractor3(folder_name):
    df = pd.DataFrame(columns=['feature', 'label'])
    audio_files_dir = Path('meld')/folder_name
    bookmark=0
    duration = 12
    for label in ['neutral','happy','disgust','sad','fear']:
        label_str = label
        audio_files = list(Path(audio_files_dir/label).glob('*.wav'))
        print(label_str)

        for audio_file in audio_files:
            samples, sr = librosa.load(audio_file, res_type='kaiser_fast',duration=duration,sr=16000*2)
            # print(samples.shape, sample_rate)
            samples = np.concatenate([samples, np.zeros(duration*sr-samples.shape[0])])
            # print(samples.shape)

            mfcc = librosa.feature.mfcc(y=samples, sr=sr, n_mfcc=50, hop_length=256)
            
            
            
            # print(mfcc.shape)
            # print(spectral_center.shape)
            # print(chroma.shape)
            # print(spectral_contrast.shape)
            # print(spectral_bandwidth.shape)
            # print(flatness.shape)
            # print(zero_crossing.shape)
            # print(fourier_tempogram.shape, "\n---\n")
            
            # data = np.concatenate([mfcc, spectral_center, chroma, spectral_contrast, spectral_bandwidth, flatness, zero_crossing], axis = 0)
            data = np.concatenate([mfcc], axis = 0)

            df.loc[bookmark] = [data, label_str]
            bookmark=bookmark+1

    return df

#Generating the features for train and val

# df = feature_extractor3('train')
# df.to_pickle('drive/My Drive/data/features_4_train.csv')
# df = feature_extractor3('val')
# df.to_pickle('drive/My Drive/data/features_4_test.csv')

Function to get one hot encoded vector according to label int

In [0]:
def get_one_hot(label,num_classes=5):
    label_arr = [0]*num_classes
    label_arr[label]=1
    return label_arr

Function to get a dictionary having dialogueId_utteranceId as the key

In [0]:
def feature_df_convertor(folder_name, df):
    # df = pd.DataFrame(columns=['feature', 'label'])
    audio_files_dir = Path('meld')/folder_name
    bookmark=0

    index_list = []
    for label in ['neutral','happy','disgust','sad','fear']:
        label_str = label
        audio_files = list(Path(audio_files_dir/label).glob('*.wav'))
        # print(label_str)

        for audio_file in audio_files:
            audio_file_name = audio_file.as_posix().split("/")[3]
            x = re.findall("\d+", audio_file_name)
            feature_id = x[0]+"_"+x[1]
            index_list.append(feature_id)
    
    df.index = index_list
    return df

Function to get a dictionary having dialogues, utterances ids as keys and features as outputs. Also, returns the maximum number of utterances in the dataset

In [0]:
def get_max_utts(folder_name):
    audio_files_dir = Path('meld')/folder_name

    index_list = {}
    max_l=0
    for label in ['neutral','happy','disgust','sad','fear']:
        label_str = label
        audio_files = list(Path(audio_files_dir/label).glob('*.wav'))
        # print(label_str)

        for audio_file in audio_files:
            audio_file_name = audio_file.as_posix().split("/")[3]
            x = re.findall("\d+", audio_file_name)

            key = (int)(x[0])
            x[1] = (int)(x[1])
            try:
                index_list[key].append(x[1])
            except:
                index_list[key] = [x[1]]
            
            max_l = max(max_l, len(index_list[key]))
    
    return index_list, max_l

index_list, max_utts = get_max_utts('train')

Function to return an array of embeddings of the sound arranged according to dialogues_utterances. 

Also, returns the no. of utterances in it, labels of the data and a weight matrix which would be passed to the neural network which would tell which utterances should be considered while training/testing it.

In [0]:
def get_embeddings(df_utt,dialogue_list):
    embedding_size = np.mean(df_utt.iloc[0].feature, axis=0).shape
    embeddings = []
    lengths = []
    labels = []
    weights = []

    for dialogue in dialogue_list.keys():
        dialogue_embeddings = []
        dialogue_labels = []
        weights_local = np.zeros((max_utts))

        dialogue_list[dialogue].sort()
        for utt_id in dialogue_list[dialogue]:
            d = str(dialogue)
            ui = str(utt_id)
            dialogue_embeddings.append(np.mean(df_utt.loc[d+'_'+ui].feature[:30,:], axis=0).T)
            dialogue_labels.append(get_one_hot(df_utt.loc[d+'_'+ui].label2))
        
        weights_local[:len(dialogue_list[dialogue])] = 1.0
        ##Do something about missings utterances

        for k in range(max_utts-len(dialogue_embeddings)):
            dialogue_embeddings.append(np.zeros(embedding_size).T)
            dialogue_labels.append(get_one_hot(3))
        # print(dialogue_embeddings)
        embeddings.append(dialogue_embeddings)
        lengths.append(len(dialogue_embeddings))
        labels.append(dialogue_labels)
        weights.append(weights_local)

    return np.array(embeddings), lengths, np.array(labels), np.array(weights)

A pipeline function which runs the above functions in the required order

In [0]:
def pipeline(folder_name, input_df):
    df = feature_df_convertor(folder_name, input_df)
    
    df.sort_index(inplace=True)
    label_df = pd.DataFrame(df['label'])
    label_df['label'] = pd.Categorical(label_df['label'])
    df['label2'] = label_df['label'].cat.codes
    print(dict( enumerate(label_df['label'].cat.categories ) ))

    id_list, max_utts = get_max_utts(folder_name)

    embeddings, lengths, labels, weights = get_embeddings(df, id_list)
    return embeddings, lengths, labels, weights

In [10]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/drive


Reading the features, after running the function for extraction

In [0]:
df_train = pd.read_pickle('drive/My Drive/data/features_5_train.csv')
df_test = pd.read_pickle('drive/My Drive/data/features_5_test.csv')

In [14]:
embeddings_train, lengths_train, labels_train, weights_train  = pipeline('train', df_train)
embeddings_train.shape, len(lengths_train), labels_train.shape, weights_train.shape

{0: 'disgust', 1: 'fear', 2: 'happy', 3: 'neutral', 4: 'sad'}


((931, 21, 1251), 931, (931, 21, 5), (931, 21))

In [15]:
embeddings_val, lengths_val, labels_val, weights_val = pipeline('val', df_test)
embeddings_val.shape, len(lengths_val), labels_val.shape

{0: 'disgust', 1: 'fear', 2: 'happy', 3: 'neutral', 4: 'sad'}


((93, 21, 1251), 93, (93, 21, 5))

Here, we can see that the data is heavily imbalanced. We'll be training using weighted loss functions

In [16]:
df_train['label'].value_counts()

neutral    4592
happy      1609
sad         705
disgust     232
fear        216
Name: label, dtype: int64

In [0]:
def weighted_categorical_crossentropy(weights):
    weights = K.variable(weights)
        
    def loss(y_true, y_pred):
        # scale predictions so that the class probas of each sample sum to 1
        y_pred /= K.sum(y_pred, axis=-1, keepdims=True)
        # clip to prevent NaN's and Inf's
        y_pred = K.clip(y_pred, K.epsilon(), 1 - K.epsilon())
        # calc
        loss = y_true * K.log(y_pred) * weights
        loss = -K.sum(loss, -1)
        return loss
    
    return loss

Loading a model which was trained in experiments

The model is trained on a sequence of Bidirectional LSTMs. LSTMs seemed a good fit because they usually work very well when given the context (*which in this case were the past and future utterances in the dialogue*). 


In [0]:
weights = [14.79310344827586,15.25925925925926,2.853946550652579,1.1,6.513475177304964]
weights_2 = [0.38493166911477*5, 0.41344512608623446*5, 0.055502888275094246*5, 0.019447767254927403*5, 0.12667254926897395*5]

loss = weighted_categorical_crossentropy(weights)

adadelta = optimizers.Adadelta(decay = 0.1)

model = load_model('drive/My Drive/data/models_ker/weights_try_2.hdf5',
                        custom_objects={'loss': loss})

Code to train the neural network

In [0]:
from keras.callbacks import ModelCheckpoint
epochs = 100
batch_size = 64

checkpoint = ModelCheckpoint('drive/My Drive/data/models_ker/weights_try_5.hdf5', monitor='val_loss', verbose=1, save_best_only=True, mode='auto')
history = model.fit(embeddings_train, labels_train,
                epochs=epochs,
                batch_size=batch_size,
                sample_weight = weights_train,
                shuffle=True, 
                callbacks=[checkpoint],
                validation_data=(embeddings_val, labels_val, weights_val))

In [53]:
gc.collect()

899

In [49]:
confusion_matrix(np.argmax(labels_val, axis=2).flatten(), np.argmax(model.predict(embeddings_val), axis=2).flatten())

array([[  14,    0,    0,   11,    3],
       [  11,    3,    0,   10,    1],
       [  89,    2,   10,   77,    3],
       [1344,    7,   14,  260,   15],
       [  33,    0,    2,   40,    4]])

In [63]:
true_label = np.argmax(labels_val, axis=2).flatten()
predicted_label = np.argmax(model.predict(embeddings_val), axis=2).flatten()
print('Weighted F-Score: ', precision_recall_fscore_support(true_label, predicted_label, average='weighted'))

Weighted F-Score:  (0.5347761750039339, 0.2263184843830005, 0.31235768336112757, None)


  'precision', 'predicted', average, warn_for)


In [66]:
val_acc = accuracy_score(true_label, predicted_label)
val_acc

0.2263184843830005

However, this model wasn't able to converge well, maybe because of the imbalance of data, or maybe because of bad features. So, I tried running it on some standard sklearn classifiers, which gave better results.

In [0]:
df_train = pd.read_pickle('drive/My Drive/data/features_5_train.csv')
df_test = pd.read_pickle('drive/My Drive/data/features_5_test.csv')

In [0]:
x_train = np.array(df_train.feature.values.tolist())
x_train = np.mean(x_train, axis=1)
y_train = np.array(df_train.label.tolist())

x_test = np.array(df_test.feature.values.tolist())
x_test = np.mean(x_test, axis=1)
y_test = np.array(df_test.label.tolist())

scaler = StandardScaler().fit(x_train)
x_train = scaler.transform(x_train)
x_test = scaler.transform(x_test)

In [67]:
clf = OneVsRestClassifier(KNeighborsClassifier(n_neighbors=7))

clf.fit(x_train, y_train)

y_pred=clf.predict(x_train)
print("Train accuracy    :",accuracy_score(y_true=y_train, y_pred=y_pred))

y_pred=clf.predict(x_test)
print("Validation accuracy: ",accuracy_score(y_true=y_test, y_pred=y_pred))
print('Weighted FScore: ', precision_recall_fscore_support(y_test, y_pred, average='weighted'))

Train accuracy    : 0.6418275768289367
Validation accuracy:  0.6060240963855422
Weighted FScore:  (0.4797828748147671, 0.6060240963855422, 0.5129072559047266, None)


  'precision', 'predicted', average, warn_for)
