# Feature Extraction
 
 

Before building a model that recognises emotion from audio files, we need to extract features from them.
Here are some of the features that can be extracted from audio data. These features are used to predict, for example, the age, gender and accent of the speaker.



1. MFCC
2. mel 
3. Chroma

In [30]:
# Array operations and useful analysis functionalities
import pandas as pd
import numpy as np



import librosa
import soundfile
import os, glob, pickle
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import accuracy_score

In [32]:
# Load the per-file metadata table (path, emotion, actor gender) produced in part one
ref = pd.read_csv("data_path.csv")
# Preview five randomly chosen rows as a quick sanity check
ref.sample(n=5)

Unnamed: 0,path,emotion,actor_gender
7492,data/Crema/AudioWAV/1012_IOM_HAP_XX.wav,happy,female
3948,data/Crema/AudioWAV/1022_MTI_DIS_XX.wav,disgust,male
1449,data/TESS Toronto emotional speech set data/TE...,angry,female
9983,data/Crema/AudioWAV/1035_IWW_SAD_XX.wav,sad,male
443,data/Crema/AudioWAV/1008_IEO_ANG_LO.wav,angry,female


So we've already seen the shape of an MFCC output for each file: it's a 2D matrix of the number of bands by time.
In order to optimise space and memory, we're going to read each audio file, average its MFCC values across all bands at each time step, and keep only the extracted features, discarding the raw audio data.

In [3]:
# Note: this takes a while (~10 mins) as we're iterating over 4 datasets.

# Collect one feature vector per file and build the DataFrame once at the end;
# growing a DataFrame row by row with df.loc[...] reallocates on every insert.
features = []

# loop feature extraction over the entire dataset
for path in ref.path:
    # Load 2.5 s of audio starting 0.5 s into the clip, resampled to 44.1 kHz.
    X, sample_rate = librosa.load(path,
                                  res_type='kaiser_fast',
                                  duration=2.5,
                                  sr=44100,
                                  offset=0.5)

    # The MFCC output is a (bands x time) matrix. Taking the mean over axis 0
    # collapses the 13 bands and keeps one value per time frame, so the vector
    # length varies with the clip duration.
    # Mean as the feature. Could do min and max etc. as well.
    mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=13),
                    axis=0)
    features.append(mfccs)

# One row per file; dtype=object keeps each (variable-length) vector in a single cell.
df = pd.DataFrame({'feature': pd.Series(features, dtype=object)})

# Check a few records to make sure it's processed successfully
df.head()

12162


Unnamed: 0,feature
0,"[-10.582805, -11.897231, -13.121095, -12.33489..."
1,"[-11.052746, -13.703402, -17.22484, -17.253716..."
2,"[-11.002943, -13.060469, -15.322118, -13.94199..."
3,"[-11.964613, -8.894431, -9.93376, -10.790269, ..."
4,"[-11.0525875, -11.965088, -13.133052, -13.0424..."


In [4]:
# Spread every per-file MFCC vector over its own numbered columns;
# vectors shorter than the longest one are padded with NaN.
mfcc_columns = pd.DataFrame(df['feature'].tolist())
# Put the metadata and the expanded feature columns side by side (aligned on the row index)
df = pd.concat([ref, mfcc_columns], axis=1)
df.head()

Unnamed: 0,path,emotion,actor_gender,0,1,2,3,4,5,6,...,206,207,208,209,210,211,212,213,214,215
0,data/Crema/AudioWAV/1082_IEO_ANG_MD.wav,angry,female,-10.582805,-11.897231,-13.121095,-12.334898,-10.539809,-10.11869,-11.975557,...,,,,,,,,,,
1,data/Crema/AudioWAV/1025_IEO_ANG_LO.wav,angry,female,-11.052746,-13.703402,-17.22484,-17.253716,-16.179312,-16.374655,-15.442269,...,,,,,,,,,,
2,data/Crema/AudioWAV/1025_IEO_ANG_MD.wav,angry,female,-11.002943,-13.060469,-15.322118,-13.941996,-15.831221,-16.475147,-16.201067,...,-16.881512,-16.517555,-17.443207,-16.626356,-17.677702,-18.296568,-17.951214,-15.782665,-15.628963,-17.136906
3,data/Crema/AudioWAV/1081_MTI_ANG_XX.wav,angry,male,-11.964613,-8.894431,-9.93376,-10.790269,-10.494925,-12.386085,-12.196976,...,,,,,,,,,,
4,data/Crema/AudioWAV/1025_IOM_ANG_XX.wav,angry,female,-11.052588,-11.965088,-13.133052,-13.04248,-13.006114,-12.806181,-15.715652,...,,,,,,,,,,


In [21]:
# Zero-fill the gaps left behind by the shorter feature vectors
df = df.fillna(value=0)
print(df.shape)
df.head(5)

(12162, 219)


Unnamed: 0,path,emotion,actor_gender,0,1,2,3,4,5,6,...,206,207,208,209,210,211,212,213,214,215
0,data/Crema/AudioWAV/1082_IEO_ANG_MD.wav,angry,female,-10.582805,-11.897231,-13.121095,-12.334898,-10.539809,-10.11869,-11.975557,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,data/Crema/AudioWAV/1025_IEO_ANG_LO.wav,angry,female,-11.052746,-13.703402,-17.22484,-17.253716,-16.179312,-16.374655,-15.442269,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,data/Crema/AudioWAV/1025_IEO_ANG_MD.wav,angry,female,-11.002943,-13.060469,-15.322118,-13.941996,-15.831221,-16.475147,-16.201067,...,-16.881512,-16.517555,-17.443207,-16.626356,-17.677702,-18.296568,-17.951214,-15.782665,-15.628963,-17.136906
3,data/Crema/AudioWAV/1081_MTI_ANG_XX.wav,angry,male,-11.964613,-8.894431,-9.93376,-10.790269,-10.494925,-12.386085,-12.196976,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,data/Crema/AudioWAV/1025_IOM_ANG_XX.wav,angry,female,-11.052588,-11.965088,-13.133052,-13.04248,-13.006114,-12.806181,-15.715652,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [33]:
def extract_feature(data):
    """Extract spectral features from a single audio file.

    Parameters
    ----------
    data : str or path-like
        Location of the audio file; passed straight to ``librosa.load``.

    Returns
    -------
    tuple
        ``(stft, mfccs, chroma, mel)`` where ``stft`` is the magnitude of the
        short-time Fourier transform, and ``mfccs`` (40 coefficients),
        ``chroma`` and ``mel`` are the respective librosa features averaged
        over the time frames.
    """
    X, sample_rate = librosa.load(data)

    # stft magnitude; reused below as the spectrogram input for chroma
    stft = np.abs(librosa.stft(X))

    # mfcc
    mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T, axis=0)

    # chroma
    chroma = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T, axis=0)

    # melspectrogram (the signal must be passed by keyword as ``y=``)
    mel = np.mean(librosa.feature.melspectrogram(y=X, sr=sample_rate).T, axis=0)

    return stft, mfccs, chroma, mel



In [34]:
# Build the model inputs (X) and labels (Y), pairing every extracted feature
# row with the emotion label of the file it came from.
# NOTE(review): get_features is not defined in the cells shown here, and the
# recorded run of this cell ended in an UnboundLocalError - confirm that
# get_features is defined (and working) before this cell runs on a fresh kernel.
X, Y = [], []
for path, emotion in zip(ref.path, ref.emotion):
    feature = get_features(path)
    for ele in feature:
        X.append(ele)
        # repeat the label once for every feature row returned for this file
        Y.append(emotion)

UnboundLocalError: local variable 'result' referenced before assignment