# 1. Imports

## 1.1. Libraries

In [1]:
import librosa
import soundfile
import os, glob, pickle
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix
import matplotlib.pyplot as plt

## 1.2. Import files

In [2]:
notebook_path = os.path.abspath("Speech_emotion_recognition.ipynb")
notebook_folder = os.path.dirname(notebook_path)
datasets_1_path = os.path.join(os.path.dirname(notebook_path), "Audio_Speech_Actors_01-24/")
datasets_2_path = os.path.join(os.path.dirname(notebook_path), "Audio_Song_Actors_01-24/")

# 2. Data preparation

In [3]:
# Defining a function, taking as inputs: a sound file, and booleans that indicates information to extract from the file, if needed.

def extract_feature(filename,mfcc,chroma,mel):
    """ Extract features from a soundfile - including Mel-frequency cepstral coefficients (MFCCs),/
    chromagram, mel-scaled spectrogram"""
    
    with soundfile.SoundFile(filename) as sound_file:
        X = sound_file.read(dtype = "float32")
        #sound_file = AudioSegment.from_wav(BOSpeeches)
        #sound_file = sound_file.set_channels(1)

        sample_rate = sound_file.samplerate
        
        if chroma:
            short_term_fourier_transform = np.abs(librosa.stft(X))
        result = np.array([])
        if mfcc:
            res1 = np.mean(librosa.feature.mfcc(y = X, sr = sample_rate, n_mfcc = 40).T, axis = 0)
            result = np.hstack((result,res1))
        if chroma:
            res2 = np.mean(librosa.feature.chroma_stft(S = short_term_fourier_transform, sr = sample_rate).T,axis = 0)
            result = np.hstack((result, res2))
        if mel:
            res3 = np.mean(librosa.feature.melspectrogram(y = X, sr = sample_rate).T,axis = 0)
            result = np.hstack((result, res3))
        return result

In [4]:
# Emotions correspondance with the name of the files. E.g., neutral emotions would be linked with the code "01".
emotions = {
    "01":"neutral",
    "02":"calm",
    "03":"happy",
    "04":"sad",
    "05":"angry",
    "06":"fearful",
    "07":"disgust",
    "08":"surprised"
}

# Emotions to observe - useful to only select a few
observed_emotions = ["neutral","calm","happy","sad","angry","fearful","disgust","surprised"]

In [5]:
# Load the data and extract features for each sound file

def load_data(test_size=0.2):
    """ Load data from speech and songs files playing the specified emotions - and return train / test datasets,/
    based on the specified test size."""
    x,y=[],[]
    
    for file in glob.glob(datasets_1_path+"/Actor_*/*.wav"): # Speeches
        file_name = os.path.basename(file)
        emotion = emotions[file_name.split("-")[2]]
        if emotion not in observed_emotions:
            continue
        try:
            feature = extract_feature(file, mfcc = True, chroma = True, mel = True)
            x.append(feature)
            y.append(emotion)
        except:
            print(file_name)
    
    for file in glob.glob(datasets_2_path+"/Actor_*/*.wav"): # Songs
        file_name = os.path.basename(file)
        emotion = emotions[file_name.split("-")[2]]
        if emotion not in observed_emotions:
            continue
        try:
            feature = extract_feature(file, mfcc = True, chroma = True, mel = True)
            x.append(feature)
            y.append(emotion)
        except:
            print(file_name)
        
    return train_test_split(np.array(x), y, test_size=test_size, random_state=0)

In [6]:
# Split the dataset
x_train,x_test,y_train,y_test=load_data(test_size=0.25)

03-01-06-01-01-02-20.wav
03-01-03-01-02-01-20.wav
03-01-02-01-02-02-05.wav
03-01-02-01-01-02-01.wav
03-01-08-01-02-02-01.wav
03-02-01-01-01-01-24.wav


### Sanity checks

In [7]:
x_train.shape

(1834, 180)

In [8]:
len(y_train)

1834

# 3. Model building

## 3.1. Finding the best parameters

In [9]:
model_test = MLPClassifier(max_iter=500)

parameter_space = {
    "hidden_layer_sizes": [(300,),(100,),(300,200,400),(180,),(360,180,360)],
    #"activation": ["tanh", "relu","logistic","identity"],
    "solver": ["adam"],
    #"alpha": [1e-08, 1e-09, 1e-07],
    "batch_size": [128,256,512],
    #"learning_rate": ["constant" ,"adaptive","invscaling"],
    "learning_rate": ["constant"],
}

from sklearn.model_selection import GridSearchCV

clf = GridSearchCV(model, parameter_space, n_jobs=-1, cv=5)

clf.fit(x_train, y_train)

print('Best parameters found:\n', clf.best_params_)

NameError: name 'model' is not defined

## 3.2. Setting the model

In [None]:
model = MLPClassifier(batch_size = 128, solver = "adam", alpha = 1e-08, 
                      hidden_layer_sizes = (360,180,360), learning_rate = 'constant', max_iter = 1000)

In [None]:
model.fit(x_train,y_train)

In [None]:
y_pred = model.predict(x_test)

## 3.3. Visualizing results

In [None]:
# Calculate the accuracy of our model
accuracy = accuracy_score(y_true = y_test, y_pred = y_pred)
# Print the accuracy
print("Accuracy: {:.3f}%".format(accuracy*100))

In [None]:
disp = plot_confusion_matrix(model, x_test, y_test,
                            display_labels = observed_emotions,
                            cmap = plt.cm.Blues,
                            normalize = "true")
plt.xticks(rotation=90)
plt.show()

## 3.4. Exporting the model

In [None]:
# save the model to disk
filename = 'finalized_model.sav'
pickle.dump(model, open(filename, 'wb'))

# 4. Using the trained model to make predictions on new data.

In [None]:
# load the model from disk
loaded_model = pickle.load(open(filename, 'rb'))
result = loaded_model.score(x_test, y_test)
print(result)

In [None]:
def test_model(folder):
    """ Will test the model on clips present inside the specified folder."""
    x=[]
    for file in glob.glob(folder+"/chunk*.wav"):
        file_name = os.path.basename(file)
        try:
            feature = extract_feature(file, mfcc=True, chroma=True, mel=True)
            x.append(feature)
        except:
            print(file_name)
    return x

In [None]:
x_bospeech = test_model(notebook_folder)

In [None]:
y_bospeech=loaded_model.predict(x_bospeech)

In [None]:
y_bospeech

In [None]:
plt.hist(y_bospeech)