In [2]:
# Keras
import keras
from keras import regularizers
from keras.preprocessing import sequence
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential, Model, model_from_json
from keras.layers import Dense, Embedding, LSTM
from keras.layers import Input, Flatten, Dropout, Activation, BatchNormalization
from keras.layers import Conv1D, MaxPooling1D, AveragePooling1D
from keras.utils import np_utils, to_categorical
from keras.callbacks import ModelCheckpoint

# sklearn
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Other  
import librosa
import librosa.display
import json
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from matplotlib.pyplot import specgram
import pandas as pd
import seaborn as sns
import glob 
import os
import pickle
import IPython.display as ipd  # To play sound in the notebook

In [3]:
# Root directory of the dataset; every entry directly beneath it is treated
# as one actor's folder by the cells that follow.
audio = "F:/audio_speech_actors_01-24/"
# Directory entries in alphabetical order.
actor_folders = sorted(os.listdir(audio))
actor_folders[:5]

['Actor_01', 'Actor_02', 'Actor_03', 'Actor_04', 'Actor_05']

In [4]:
# Walk every actor folder and decode the metadata packed into each file name.
# The stem (text before the first '.') is split on '-' and read positionally:
# field 2 is stored as the emotion code and field 6 as the actor id.
emotion = []
gender = []
actor = []
file_path = []
for i in actor_folders:
    # Sorted so the row order does not depend on the file system's listing order.
    for f in sorted(os.listdir(audio + i)):
        part = f.split('.')[0].split('-')
        # Skip entries that do not follow the naming scheme (e.g. Thumbs.db,
        # .DS_Store); they would otherwise crash the integer parsing below.
        if len(part) < 7 or not (part[2].isdigit() and part[6].isdigit()):
            continue
        actor_id = int(part[6])
        emotion.append(int(part[2]))
        actor.append(actor_id)
        # Even actor ids are labelled female, odd ids male.
        gender.append("female" if actor_id % 2 == 0 else "male")
        file_path.append(audio + i + '/' + f)

In [5]:
# Names for the integer emotion codes parsed from the file names.
emotion_names = {1:'neutral', 2:'calm', 3:'happy', 4:'sad', 5:'angry', 6:'fear', 7:'disgust', 8:'surprise'}

# One row per recording: who spoke, which emotion, and where the file lives.
audio_df = pd.DataFrame({
    'gender': gender,
    'emotion': pd.Series(emotion).replace(emotion_names),
    'actor': actor,
    'path': file_path,
})
audio_df

Unnamed: 0,gender,emotion,actor,path
0,male,neutral,1,F:/audio_speech_actors_01-24/Actor_01/03-01-01...
1,male,neutral,1,F:/audio_speech_actors_01-24/Actor_01/03-01-01...
2,male,neutral,1,F:/audio_speech_actors_01-24/Actor_01/03-01-01...
3,male,neutral,1,F:/audio_speech_actors_01-24/Actor_01/03-01-01...
4,male,calm,1,F:/audio_speech_actors_01-24/Actor_01/03-01-02...
...,...,...,...,...
1435,female,surprise,24,F:/audio_speech_actors_01-24/Actor_24/03-01-08...
1436,female,surprise,24,F:/audio_speech_actors_01-24/Actor_24/03-01-08...
1437,female,surprise,24,F:/audio_speech_actors_01-24/Actor_24/03-01-08...
1438,female,surprise,24,F:/audio_speech_actors_01-24/Actor_24/03-01-08...


In [6]:
# Loop feature extraction over the entire dataset.
# Each recording is summarised by 40 MFCC values: the MFCC matrix is
# transposed and averaged along axis 0, leaving one mean per coefficient.
# The vectors are collected in a list and the frame is built once at the end,
# which avoids re-allocating the DataFrame on every inserted row.
features = []
for path in audio_df.path:
    X, sample_rate = librosa.load(path, res_type='kaiser_fast')
    mfccs = np.mean(librosa.feature.mfcc(y=X,sr=sample_rate,n_mfcc=40).T,axis=0)
    features.append(mfccs)

# Single object column; each cell holds one feature vector.
df = pd.DataFrame({'feature': features})

# Check the first few recordings
print(len(df))
df.head()

1440


Unnamed: 0,feature
0,"[-700.3989, 58.63021, -3.025852, 16.040241, 4...."
1,"[-695.55786, 59.240154, -5.3727765, 19.776367,..."
2,"[-694.00433, 61.49651, -3.2627435, 16.971298, ..."
3,"[-687.51337, 59.44154, -0.703714, 16.645708, 3..."
4,"[-729.98016, 66.51589, -0.9419841, 19.070974, ..."


In [7]:
# Spread each per-file feature vector over its own numeric columns, then
# attach those columns side by side to the metadata table.
mfcc_columns = pd.DataFrame(df['feature'].values.tolist())
df = pd.concat([audio_df, mfcc_columns], axis=1)
df.head()

Unnamed: 0,gender,emotion,actor,path,0,1,2,3,4,5,...,30,31,32,33,34,35,36,37,38,39
0,male,neutral,1,F:/audio_speech_actors_01-24/Actor_01/03-01-01...,-700.398926,58.630211,-3.025852,16.040241,4.248529,3.869935,...,-1.411359,-2.769772,-2.042009,-2.522663,-2.507448,-2.250499,-0.381507,-2.481059,-2.791023,-2.244865
1,male,neutral,1,F:/audio_speech_actors_01-24/Actor_01/03-01-01...,-695.557861,59.240154,-5.372777,19.776367,5.200387,3.32463,...,-1.055913,-3.102514,-1.69788,-2.922661,-2.544465,-1.289832,-0.797254,-3.586074,-2.706395,-2.812933
2,male,neutral,1,F:/audio_speech_actors_01-24/Actor_01/03-01-01...,-694.004333,61.49651,-3.262743,16.971298,2.142968,4.266798,...,-1.382858,-2.590943,-1.929074,-2.390322,-2.269381,-2.486079,-0.589257,-3.248326,-2.979813,-2.769281
3,male,neutral,1,F:/audio_speech_actors_01-24/Actor_01/03-01-01...,-687.513367,59.44154,-0.703714,16.645708,3.730826,6.181194,...,-2.024173,-2.58619,-2.420421,-3.243219,-2.762588,-1.960003,-0.45389,-2.976706,-2.914763,-3.909605
4,male,calm,1,F:/audio_speech_actors_01-24/Actor_01/03-01-02...,-729.980164,66.515892,-0.941984,19.070974,4.29708,5.635082,...,-1.519869,-1.797158,-1.296814,-3.051891,-1.131349,-1.063672,-1.141021,-2.373389,-3.204345,-3.363193


In [8]:
# Replace any missing entries with zero before the data is split.
df = df.fillna(value=0)

print(df.shape)
df.head()

(1440, 44)


Unnamed: 0,gender,emotion,actor,path,0,1,2,3,4,5,...,30,31,32,33,34,35,36,37,38,39
0,male,neutral,1,F:/audio_speech_actors_01-24/Actor_01/03-01-01...,-700.398926,58.630211,-3.025852,16.040241,4.248529,3.869935,...,-1.411359,-2.769772,-2.042009,-2.522663,-2.507448,-2.250499,-0.381507,-2.481059,-2.791023,-2.244865
1,male,neutral,1,F:/audio_speech_actors_01-24/Actor_01/03-01-01...,-695.557861,59.240154,-5.372777,19.776367,5.200387,3.32463,...,-1.055913,-3.102514,-1.69788,-2.922661,-2.544465,-1.289832,-0.797254,-3.586074,-2.706395,-2.812933
2,male,neutral,1,F:/audio_speech_actors_01-24/Actor_01/03-01-01...,-694.004333,61.49651,-3.262743,16.971298,2.142968,4.266798,...,-1.382858,-2.590943,-1.929074,-2.390322,-2.269381,-2.486079,-0.589257,-3.248326,-2.979813,-2.769281
3,male,neutral,1,F:/audio_speech_actors_01-24/Actor_01/03-01-01...,-687.513367,59.44154,-0.703714,16.645708,3.730826,6.181194,...,-2.024173,-2.58619,-2.420421,-3.243219,-2.762588,-1.960003,-0.45389,-2.976706,-2.914763,-3.909605
4,male,calm,1,F:/audio_speech_actors_01-24/Actor_01/03-01-02...,-729.980164,66.515892,-0.941984,19.070974,4.29708,5.635082,...,-1.519869,-1.797158,-1.296814,-3.051891,-1.131349,-1.063672,-1.141021,-2.373389,-3.204345,-3.363193


In [None]:
# Inputs are the numeric feature columns only; the target is the emotion label.
feature_columns = df.drop(columns=['path', 'emotion', 'gender', 'actor'])

# Shuffled 75/25 split with a fixed seed so the partition is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    feature_columns,
    df['emotion'],
    test_size=0.25,
    shuffle=True,
    random_state=42,
)

# Lets see how the data present itself before normalisation
X_train[150:160]

In [None]:
# Count how many training samples carry each emotion label.
y_train.value_counts()

In [None]:
# Per-column statistics are taken from the training split only and then
# applied to both splits, so the test data does not influence the scaling.
mean = np.mean(X_train, axis=0)
std = np.std(X_train, axis=0)

X_train, X_test = ((split - mean) / std for split in (X_train, X_test))

# Check the dataset now
X_train.head()

In [None]:
# Convert the splits to plain arrays for Keras.
X_train = np.array(X_train)
y_train = np.array(y_train)
X_test = np.array(X_test)
y_test = np.array(y_test)

# One hot encode the target.
# The encoder is fitted on the training labels only; the test labels are
# mapped with that same fitted encoder. Refitting on the test labels could
# assign different integer codes whenever the two splits do not contain the
# same set of classes, and would leave `lb` describing the test set.
# `transform` raises ValueError for a label that never occurred in training.
lb = LabelEncoder()
y_train = np_utils.to_categorical(lb.fit_transform(y_train))
# num_classes keeps the test matrix as wide as the training one even when the
# highest-numbered class is absent from the test split.
y_test = np_utils.to_categorical(lb.transform(y_test), num_classes=len(lb.classes_))

print(X_train.shape)
print(lb.classes_)

# Pickle the lb object for future use; the context manager closes the file
# even if dumping fails.
filename = 'labels'
with open(filename, 'wb') as outfile:
    pickle.dump(lb, outfile)

In [None]:
# Append a trailing axis of length 1 so each sample has shape (n_features, 1),
# matching the input_shape the first Conv1D layer is built with.
X_train = X_train[:, :, np.newaxis]
X_test = X_test[:, :, np.newaxis]
X_train.shape

In [None]:
# 1-D convolutional classifier over the feature vector, declared as a single
# ordered list of layers.
model = Sequential([
    # First conv stage: conv -> batch norm -> ReLU -> pool -> dropout.
    Conv1D(32, 5, padding='same', input_shape=(X_train.shape[1], 1)),  # X_train.shape[1] = No. of Columns
    BatchNormalization(),
    Activation('relu'),
    MaxPooling1D(pool_size=5),
    Dropout(0.2),
    # Second conv stage: no normalisation or activation before pooling.
    Conv1D(64, 5, padding='same'),
    MaxPooling1D(pool_size=5),
    # Third conv stage: conv -> batch norm -> ReLU.
    Conv1D(64, 5, padding='same'),
    BatchNormalization(),
    Activation('relu'),
    # Classification head.
    Flatten(),
    Dense(8),  # Target class number
    Activation('softmax'),
])

model.summary()

In [None]:
# Multi-class cross-entropy on the one-hot targets, optimised with RMSprop.
model.compile(
    loss='categorical_crossentropy',
    optimizer='RMSprop',
    metrics=['accuracy'],
)

# The test split doubles as the validation set that is monitored every epoch.
model_history = model.fit(
    X_train,
    y_train,
    batch_size=16,
    epochs=40,
    validation_data=(X_test, y_test),
)

In [None]:
# Loss per epoch: training curve first, then the validation (test-split) curve.
for curve in ('loss', 'val_loss'):
    plt.plot(model_history.history[curve])
plt.title('model loss')
plt.ylabel('loss')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()

In [None]:
# Accuracy per epoch: training curve first, then the validation (test-split) curve.
for curve in ('accuracy', 'val_accuracy'):
    plt.plot(model_history.history[curve])
plt.title('model accuracy')
plt.ylabel('acc')
plt.xlabel('epoch')
plt.legend(['train', 'test'], loc='upper left')
plt.show()


In [None]:
# Score the test split once and reuse the result; element 0 is the loss and
# element 1 the accuracy metric the model was compiled with. The original
# ran the full evaluation twice just to read the two values.
scores = model.evaluate(X_test, y_test)
print("Loss of the model is - " , scores[0])
print("Accuracy of the model is - " , scores[1]*100 , "%")

In [None]:
# PREDICTIONS
# Most probable class index per sample, mapped back to its emotion name.
predicted_idx = model.predict(X_test).argmax(axis=1).astype(int).flatten()
pred = pd.DataFrame({'Predicted Values': lb.inverse_transform(predicted_idx)})

# ACTUAL LABELS
# Undo the one-hot encoding the same way to recover the true emotion names.
actual_idx = y_test.argmax(axis=1).astype(int).flatten()
y = pd.DataFrame({'Actual Values': lb.inverse_transform(actual_idx)})

# COMBINE BOTH
finaldf = y.join(pred)
finaldf[140:150]

In [None]:
# Take both the label set and the display names from the fitted encoder rather
# than a hand-typed list, so the report rows cannot drift out of order or
# count with the encoding actually used for `y` and `pred`.
print(classification_report(y, pred, labels=lb.classes_, target_names=list(lb.classes_)))

In [None]:
# confusion_matrix is already imported with the other sklearn names at the top
# of the notebook, so it is not re-imported here.
# The encoder's classes are passed explicitly so the matrix rows/columns are
# in the same order as the tick labels applied below.
cm = confusion_matrix(y, pred, labels=lb.classes_)
plt.figure(figsize = (12, 10))
cm = pd.DataFrame(cm , index = list(lb.classes_) , columns = list(lb.classes_))
ax = sns.heatmap(cm, linecolor='white', cmap='Purples', linewidth=1, annot=True, fmt='')
# Widen the y-limits by half a cell on each side.
# NOTE(review): presumably a workaround for heatmap rows being clipped in some
# matplotlib releases - confirm it is still needed.
bottom, top = ax.get_ylim()
ax.set_ylim(bottom + 0.5, top - 0.5)
plt.title('Confusion Matrix', size=20)
plt.xlabel('Predicted Labels', size=14)
plt.ylabel('Actual Labels', size=14)
plt.savefig('Initial_Model_Confusion_Matrix.png')
plt.show()

In [None]:
# Save model and weights
model_name = 'Audio_Emotion_Model.h5'
save_dir = os.path.join(os.getcwd(), 'saved_models')

# Create the output folder on first use; an existing folder is left alone.
os.makedirs(save_dir, exist_ok=True)
model_path = os.path.join(save_dir, model_name)
model.save(model_path)
print('Save model and weights at %s ' % model_path)

# Save the architecture on its own as JSON in the working directory
model_json = model.to_json()
with open("model_json.json", "w") as json_file:
    json_file.write(model_json)

In [None]:
# Rebuild the architecture from its JSON description; the context manager
# closes the file even if reading fails.
with open('model_json.json', 'r') as json_file:
    loaded_model_json = json_file.read()
loaded_model = model_from_json(loaded_model_json)

# load weights into new model
loaded_model.load_weights("saved_models/Audio_Emotion_Model.h5")
print("Loaded model from disk")

# Keras optimiser
# Use the RMSprop class itself: the lowercase `rmsprop` alias is not available
# in every Keras release, while the class name is.
opt = keras.optimizers.RMSprop(lr=0.00001, decay=1e-6)
# The reloaded model is compiled before it is evaluated on the test split.
loaded_model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['accuracy'])
score = loaded_model.evaluate(X_test, y_test, verbose=0)
print("%s: %.2f%%" % (loaded_model.metrics_names[1], score[1]*100))

In [None]:
# Draw the trained network's layer diagram.
# NOTE(review): plot_model is imported at its point of use here rather than
# with the other imports at the top of the notebook.
from keras.utils.vis_utils import plot_model
plot_model(model)