<a href="https://colab.research.google.com/github/nikhildr22/Speech-Emotion-Recognition/blob/master/master.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
%tensorflow_version 1.x

TensorFlow 1.x selected.


In [0]:
!ls

sample_data


In [0]:
!wget https://zenodo.org/record/1188976/files/Audio_Speech_Actors_01-24.zip

--2020-04-16 15:40:25--  https://zenodo.org/record/1188976/files/Audio_Speech_Actors_01-24.zip
Resolving zenodo.org (zenodo.org)... 188.184.117.155
Connecting to zenodo.org (zenodo.org)|188.184.117.155|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 208468073 (199M) [application/octet-stream]
Saving to: ‘Audio_Speech_Actors_01-24.zip’


2020-04-16 15:40:28 (80.5 MB/s) - ‘Audio_Speech_Actors_01-24.zip’ saved [208468073/208468073]



In [0]:
!mkdir dataset
!ls

Audio_Speech_Actors_01-24.zip  dataset	sample_data


In [0]:
!unzip -q Audio_Speech_Actors_01-24.zip -d dataset

In [0]:
!ls

Audio_Speech_Actors_01-24.zip  dataset	sample_data


In [0]:
!pip install soundfile

Collecting soundfile
  Downloading https://files.pythonhosted.org/packages/eb/f2/3cbbbf3b96fb9fa91582c438b574cff3f45b29c772f94c400e2c99ef5db9/SoundFile-0.10.3.post1-py2.py3-none-any.whl
Installing collected packages: soundfile
Successfully installed soundfile-0.10.3.post1


In [0]:
import librosa
from tqdm import tqdm
import soundfile
from scipy.io import wavfile
import os, glob, pickle
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score

In [0]:
''' directory to keep clean files '''
!mkdir clean 

In [0]:
''' downsampling audio files to 16KHz and storing them in /clean directory ''' 
for f in tqdm(glob.glob("dataset/*/*")):
    signal, rate = librosa.load(path=f, sr=16000)
    wavfile.write(filename='clean/'+f[-24:],rate=rate,data=signal)

100%|██████████| 1440/1440 [05:49<00:00,  4.12it/s]


In [0]:
#Extract features (mfcc, chroma, mel) from a sound file
def extract_feature(file_name, mfcc, chroma, mel):
    with soundfile.SoundFile(file_name) as sound_file:
        X = sound_file.read(dtype="float32")
        sample_rate=sound_file.samplerate
        if chroma:
            stft=np.abs(librosa.stft(X))
        result=np.array([])
        if mfcc:
            mfccs=np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T, axis=0)
            result=np.hstack((result, mfccs))
        if chroma:
            chroma=np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T,axis=0)
            result=np.hstack((result, chroma))
        if mel:
            mel=np.mean(librosa.feature.melspectrogram(X, sr=sample_rate).T,axis=0)
            result=np.hstack((result, mel))
    return result

In [0]:
# Emotions in the RAVDESS dataset
emotions={
  '01':'neutral',
  '02':'calm',
  '03':'happy',
  '04':'sad',
  '05':'angry',
  '06':'fearful',
  '07':'disgust',
  '08':'surprised'
}
# Emotions to observe (remove the prefix "/" to include that emotion)
observed_emotions=['sad','angry','happy','neutral','/calm','/fearful','/disgust','/surprised']

In [0]:
# Load the data and extract features for each sound file

def load_data(test_size=0.2):
    x,y=[],[] 
    for file in tqdm(glob.glob("clean/*")):
        file_name=os.path.basename(file[-24:])
#         print(file_name)
        emotion=emotions[file_name.split("-")[2]]
        if emotion not in observed_emotions:
            continue
        try:
            feature=extract_feature(file, mfcc=True, chroma=True, mel=True)
            x.append(feature)
            y.append(int(file_name.split("-")[2]))
        except:continue
    return train_test_split(np.array(x), np.array(y), test_size=test_size, random_state=2)

In [0]:
#Split the dataset
x_train,x_test,y_train,y_test=load_data(test_size=0.3)

100%|██████████| 1440/1440 [00:33<00:00, 42.74it/s]


In [0]:
#Get the shape of the training and testing datasets
print((x_train.shape[0], x_test.shape[0]))

(470, 202)


In [0]:
# Get the number of features extracted
print(f'Features extracted: {x_train.shape[1]}')

Features extracted: 180


In [0]:
#Initialize the Multi Layer Perceptron Classifier
model=MLPClassifier(alpha=0.01, batch_size=256, epsilon=1e-08, hidden_layer_sizes=(300,), learning_rate='adaptive', max_iter=500,verbose=2)

In [0]:
#DataFlair - Train the model
model.fit(x_train,y_train)

Iteration 1, loss = 22.87191766
Iteration 2, loss = 11.22955960
Iteration 3, loss = 4.57241991
Iteration 4, loss = 8.94152890
Iteration 5, loss = 7.09372804
Iteration 6, loss = 5.64829204
Iteration 7, loss = 2.38699633
Iteration 8, loss = 3.73084042
Iteration 9, loss = 3.69471201
Iteration 10, loss = 3.52600292
Iteration 11, loss = 3.03349813
Iteration 12, loss = 1.85571044
Iteration 13, loss = 2.22516522
Iteration 14, loss = 2.21240703
Iteration 15, loss = 1.89134670
Iteration 16, loss = 1.47478099
Iteration 17, loss = 1.71333958
Iteration 18, loss = 1.18655212
Iteration 19, loss = 1.44503522
Iteration 20, loss = 1.29782545
Iteration 21, loss = 1.09038538
Iteration 22, loss = 1.22090763
Iteration 23, loss = 1.07245173
Iteration 24, loss = 1.06101409
Iteration 25, loss = 1.02308693
Iteration 26, loss = 0.96864344
Iteration 27, loss = 0.95421722
Iteration 28, loss = 0.89689903
Iteration 29, loss = 0.90674557
Iteration 30, loss = 0.86290837
Iteration 31, loss = 0.86180472
Iteration 32, l

MLPClassifier(activation='relu', alpha=0.01, batch_size=256, beta_1=0.9,
              beta_2=0.999, early_stopping=False, epsilon=1e-08,
              hidden_layer_sizes=(300,), learning_rate='adaptive',
              learning_rate_init=0.001, max_fun=15000, max_iter=500,
              momentum=0.9, n_iter_no_change=10, nesterovs_momentum=True,
              power_t=0.5, random_state=None, shuffle=True, solver='adam',
              tol=0.0001, validation_fraction=0.1, verbose=2, warm_start=False)

In [0]:
#Predict for the test set
y_pred=model.predict(x_test)

In [0]:
#Calculate the accuracy of our model
accuracy=accuracy_score(y_true=y_test, y_pred=y_pred)
#Print the accuracy
print("Accuracy: {:.2f}%".format(accuracy*100))

Accuracy: 73.27%


In [0]:
x_train.shape

(470, 180)

In [0]:
y_train.shape

(470,)

In [0]:
import keras
from keras.models import Sequential
from keras.layers import Dense,Conv1D,Flatten# Neural network
input_shape = (180,1)
num_files = x_train.shape[0]
x_train = x_train.reshape(num_files,180,1)
y_train = y_train.reshape(num_files)
model0 = Sequential()
model0.add(Conv1D(32, kernel_size=(3), input_shape=input_shape))
model0.add(Conv1D(64, kernel_size=(3)))
model0.add(Conv1D(128, kernel_size=(3)))
model0.add(Conv1D(64, kernel_size=(3)))
model0.add(Conv1D(32, kernel_size=(3)))
model0.add(Flatten())
model0.add(Dense(128, activation='relu'))
model0.add(Dense(12, activation='relu'))
model0.add(Dense(9, activation='softmax'))

Using TensorFlow backend.


Instructions for updating:
If using Keras pass *_constraint arguments to layers.


In [0]:
model0.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model0.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv1d_1 (Conv1D)            (None, 178, 32)           128       
_________________________________________________________________
conv1d_2 (Conv1D)            (None, 176, 64)           6208      
_________________________________________________________________
conv1d_3 (Conv1D)            (None, 174, 128)          24704     
_________________________________________________________________
conv1d_4 (Conv1D)            (None, 172, 64)           24640     
_________________________________________________________________
conv1d_5 (Conv1D)            (None, 170, 32)           6176      
_________________________________________________________________
flatten_1 (Flatten)          (None, 5440)              0         
_________________________________________________________________
dense_1 (Dense)              (None, 128)              

In [0]:
model0.fit(x_train, y_train, epochs=85,verbose=1, batch_size=10)


Epoch 1/85
Epoch 2/85
Epoch 3/85
Epoch 4/85
Epoch 5/85
Epoch 6/85
Epoch 7/85
Epoch 8/85
Epoch 9/85
Epoch 10/85
Epoch 11/85
Epoch 12/85
Epoch 13/85
Epoch 14/85
Epoch 15/85
Epoch 16/85
Epoch 17/85
Epoch 18/85
Epoch 19/85
Epoch 20/85
Epoch 21/85
Epoch 22/85
Epoch 23/85
Epoch 24/85
Epoch 25/85
Epoch 26/85
Epoch 27/85
Epoch 28/85
Epoch 29/85
Epoch 30/85
Epoch 31/85
Epoch 32/85
Epoch 33/85
Epoch 34/85
Epoch 35/85
Epoch 36/85
Epoch 37/85
Epoch 38/85
Epoch 39/85
Epoch 40/85
Epoch 41/85
Epoch 42/85
Epoch 43/85
Epoch 44/85
Epoch 45/85
Epoch 46/85
Epoch 47/85
Epoch 48/85
Epoch 49/85
Epoch 50/85
Epoch 51/85
Epoch 52/85
Epoch 53/85
Epoch 54/85
Epoch 55/85
Epoch 56/85
Epoch 57/85
Epoch 58/85
Epoch 59/85
Epoch 60/85
Epoch 61/85
Epoch 62/85
Epoch 63/85
Epoch 64/85
Epoch 65/85
Epoch 66/85
Epoch 67/85
Epoch 68/85
Epoch 69/85
Epoch 70/85
Epoch 71/85
Epoch 72/85
Epoch 73/85
Epoch 74/85
Epoch 75/85
Epoch 76/85
Epoch 77/85
Epoch 78/85
Epoch 79/85
Epoch 80/85
Epoch 81/85
Epoch 82/85
Epoch 83/85
Epoch 84/85


<keras.callbacks.callbacks.History at 0x7f3e623055c0>

In [0]:
num_testfiles = y_test.shape[0]

In [0]:
x_test = x_test.reshape(num_testfiles,180,1)
y_test = y_test.reshape(num_testfiles)

In [0]:
model0.evaluate(x_test, y_test)



[4.006552399975239, 0.6534653306007385]

In [0]:
model0.predict(x_test[3].reshape(1,180,1)).argmax()

4