## Extract features (speaker-dependent)

In [None]:
# import libraries
import glob
import os
import librosa
import numpy as np

In [None]:
# clone dataset from github
!git clone https://github.com/miftanurfarid/speech_emotion_recognition/

Cloning into 'speech_emotion_recognition'...
remote: Enumerating objects: 6970, done.[K
remote: Counting objects: 100% (109/109), done.[K
remote: Compressing objects: 100% (50/50), done.[K
remote: Total 6970 (delta 66), reused 101 (delta 59), pack-reused 6861[K
Receiving objects: 100% (6970/6970), 745.22 MiB | 26.12 MiB/s, done.
Resolving deltas: 100% (386/386), done.
Checking out files: 100% (5306/5306), done.


In [None]:
cd speech_emotion_recognition

/content/speech_emotion_recognition


In [None]:
# Root folder of the audio data. The trailing slash is kept because later
# cells build file names by plain concatenation (data_path + 'x.npy').
data_path = 'data/speech/'
# Match every .wav file exactly one directory level below data_path.
# Passing the components to os.path.join avoids the doubled separator that
# data_path + '/*/' produced; the matched paths are the same.
files = sorted(glob.glob(os.path.join(data_path, '*', '*.wav')))
# Show the first match as a quick sanity check (raises IndexError if nothing
# was found).
files[0]

'data/speech/Actor_01/03-01-01-01-01-01-01.wav'

In [None]:
# function to extract feature
def _mean_and_std(feature):
    """Return (mean, std) of a 2-D feature matrix, reduced along its second axis.

    The reduction is written as ``np.mean(feature.T, axis=0)`` so the result
    has one entry per row of ``feature``.
    """
    transposed = feature.T
    return np.mean(transposed, axis=0), np.std(transposed, axis=0)


def extract_feature(file_name):
    """Extract per-file summary statistics of several spectral features.

    The audio is loaded at its native sampling rate (``sr=None``). MFCC
    (40 coefficients), chroma, mel spectrogram, spectral contrast and tonnetz
    (on the harmonic component) are each computed once and summarized by
    their mean and standard deviation.

    Parameters
    ----------
    file_name : str
        Path of the audio file to load with ``librosa.load``.

    Returns
    -------
    tuple of 10 numpy arrays
        ``(mfcc, chroma, mel, contrast, tonnetz,
        mfcc_std, chroma_std, mel_std, contrast_std, tonnetz_std)``.
        The mfcc arrays have 40 entries; the other lengths follow the
        librosa defaults of the respective feature functions.
    """
    X, sample_rate = librosa.load(file_name, sr=None)
    # Magnitude spectrogram shared by the chroma and spectral-contrast features.
    stft = np.abs(librosa.stft(X))
    # Harmonic component is computed once and reused for tonnetz.
    harmonic = librosa.effects.harmonic(X)

    mfcc, mfcc_std = _mean_and_std(
        librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40))
    chroma, chroma_std = _mean_and_std(
        librosa.feature.chroma_stft(S=stft, sr=sample_rate))
    # The audio must be passed as y=...: librosa >= 0.10 made the arguments of
    # melspectrogram keyword-only, so a positional X raises TypeError there.
    mel, mel_std = _mean_and_std(
        librosa.feature.melspectrogram(y=X, sr=sample_rate))
    contrast, contrast_std = _mean_and_std(
        librosa.feature.spectral_contrast(S=stft, sr=sample_rate))
    tonnetz, tonnetz_std = _mean_and_std(
        librosa.feature.tonnetz(y=harmonic, sr=sample_rate))

    return (mfcc, chroma, mel, contrast, tonnetz,
            mfcc_std, chroma_std, mel_std, contrast_std, tonnetz_std)

In [None]:
# start with empty accumulators: one for feature vectors, one for labels
feat, lab = [], []

In [None]:
# iterate over all files
# Reset the accumulators inside this cell so that re-running it rebuilds the
# lists from scratch instead of appending a second copy of every sample to
# whatever a previous run left behind.
feat = []
lab = []
for file in files:
    print("Extracting features from ", file)
    # Concatenate the ten per-feature arrays into one flat feature vector.
    feat_i = np.hstack(extract_feature(file))
    # The label is the third dash-separated field of the file name.
    lab_i = os.path.basename(file).split('-')[2]
    feat.append(feat_i)
    lab.append(int(lab_i) - 1)  # make labels start from 0

Extracting features from  data/speech/Actor_01/03-01-01-01-01-01-01.wav
Extracting features from  data/speech/Actor_01/03-01-01-01-01-02-01.wav
Extracting features from  data/speech/Actor_01/03-01-01-01-02-01-01.wav
Extracting features from  data/speech/Actor_01/03-01-01-01-02-02-01.wav
Extracting features from  data/speech/Actor_01/03-01-02-01-01-01-01.wav
Extracting features from  data/speech/Actor_01/03-01-02-01-01-02-01.wav
Extracting features from  data/speech/Actor_01/03-01-02-01-02-01-01.wav
Extracting features from  data/speech/Actor_01/03-01-02-01-02-02-01.wav
Extracting features from  data/speech/Actor_01/03-01-02-02-01-01-01.wav
Extracting features from  data/speech/Actor_01/03-01-02-02-01-02-01.wav
Extracting features from  data/speech/Actor_01/03-01-02-02-02-01-01.wav
Extracting features from  data/speech/Actor_01/03-01-02-02-02-02-01.wav
Extracting features from  data/speech/Actor_01/03-01-03-01-01-01-01.wav
Extracting features from  data/speech/Actor_01/03-01-03-01-01-02

In [None]:
# Write the collected features (x.npy) and labels (y.npy) into data_path.
for npy_name, values in (('x.npy', feat), ('y.npy', lab)):
    np.save(data_path + npy_name, values)