In [1]:
# import libraries
import glob
import os
import librosa
import numpy as np

In [2]:
# Clone the dataset repository from GitHub into ./speech_emotion_recognition.
# The recorded run transferred about 742 MiB, so expect this cell to take a while.
# NOTE(review): git will not clone into an existing non-empty directory, so this
# cell is expected to fail if it is run a second time in the same workspace.
!git clone https://github.com/miftanurfarid/speech_emotion_recognition/

Cloning into 'speech_emotion_recognition'...
remote: Enumerating objects: 6952, done.[K
remote: Counting objects: 100% (91/91), done.[K
remote: Compressing objects: 100% (39/39), done.[K
remote: Total 6952 (delta 52), reused 91 (delta 52), pack-reused 6861[K
Receiving objects: 100% (6952/6952), 742.23 MiB | 24.84 MiB/s, done.
Resolving deltas: 100% (372/372), done.
Checking out files: 100% (5297/5297), done.


In [3]:
cd speech_emotion_recognition

/content/speech_emotion_recognition


In [4]:
data_path = 'data/speech/'
# Recordings live one directory level below data_path (e.g. Actor_01/).
# The pattern components are joined separately so that no doubled separator
# is produced: data_path already ends with '/'.
files = sorted(glob.glob(os.path.join(data_path, '*', '*.wav')))
# Fail here with a clear message rather than with a bare IndexError on the
# next line (or silently empty feature arrays further down).
assert files, "no .wav files found under " + repr(data_path)
files[0]

'data/speech/Actor_01/03-01-01-01-01-01-01.wav'

In [5]:
# function to extract feature
def extract_feature(file_name):
    """Compute per-file summary statistics of several spectral features.

    The audio is loaded at its native sampling rate (``sr=None``). Five
    frame-level feature matrices are computed once each -- MFCC (40
    coefficients), chroma, mel spectrogram, spectral contrast and tonnetz
    (on the harmonic component) -- and every matrix is reduced over its
    frame axis to a mean vector and a standard-deviation vector.

    Parameters
    ----------
    file_name : str
        Path of the audio file, passed straight to ``librosa.load``.

    Returns
    -------
    tuple of 10 numpy.ndarray
        ``(mfcc, chroma, mel, contrast, tonnetz,
        mfcc_std, chroma_std, mel_std, contrast_std, tonnetz_std)``,
        i.e. the five means followed by the five standard deviations,
        each a 1-D vector with one entry per feature dimension.
    """
    X, sample_rate = librosa.load(file_name, sr=None)
    # Magnitude spectrogram shared by the chroma and contrast features.
    stft = np.abs(librosa.stft(X))

    # Each feature matrix is laid out as (n_features, n_frames). Compute every
    # one exactly once; the harmonic separation in particular is expensive.
    frame_features = (
        librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40),
        librosa.feature.chroma_stft(S=stft, sr=sample_rate),
        # The audio must be passed as ``y=``: recent librosa releases accept
        # it as a keyword argument only.
        librosa.feature.melspectrogram(y=X, sr=sample_rate),
        librosa.feature.spectral_contrast(S=stft, sr=sample_rate),
        librosa.feature.tonnetz(
            y=librosa.effects.harmonic(X), sr=sample_rate),
    )

    # Reduce over the frame axis: one value per feature dimension.
    means = tuple(np.mean(feat, axis=1) for feat in frame_features)
    stds = tuple(np.std(feat, axis=1) for feat in frame_features)
    return means + stds

In [6]:
# Containers filled by the extraction loop: one feature vector and one
# integer label per file, kept separately for the train and the test split.
feat_train, feat_test = [], []
lab_train, lab_test = [], []

In [7]:
# Start from empty lists so that re-running this cell does not append every
# file a second time.
feat_train, feat_test = [], []
lab_train, lab_test = [], []

# Speaker-independent split: recordings whose speaker number is above this
# threshold go to the test set, all others to the training set.
last_train_actor = 20

# iterate over all files
for file in files:
    print("Extracting features from ", file)
    # Concatenate the ten statistic vectors into one flat feature vector.
    feat_i = np.hstack(extract_feature(file))
    # File names are dash-separated numeric fields,
    # e.g. 03-01-01-01-01-01-01.wav
    fields = os.path.splitext(os.path.basename(file))[0].split('-')
    lab_i = fields[2]  # third field is used as the class label
    # The last field selects the split (previously read with file[-6:-4]);
    # presumably it is the actor number matching the Actor_XX folder.
    actor = int(fields[-1])
    if actor > last_train_actor:
        feat_test.append(feat_i)
        lab_test.append(int(lab_i) - 1)  # make labels start from 0
    else:
        feat_train.append(feat_i)
        lab_train.append(int(lab_i) - 1)  # make labels start from 0

Extracting features from  data/speech/Actor_01/03-01-01-01-01-01-01.wav
Extracting features from  data/speech/Actor_01/03-01-01-01-01-02-01.wav
Extracting features from  data/speech/Actor_01/03-01-01-01-02-01-01.wav
Extracting features from  data/speech/Actor_01/03-01-01-01-02-02-01.wav
Extracting features from  data/speech/Actor_01/03-01-02-01-01-01-01.wav
Extracting features from  data/speech/Actor_01/03-01-02-01-01-02-01.wav
Extracting features from  data/speech/Actor_01/03-01-02-01-02-01-01.wav
Extracting features from  data/speech/Actor_01/03-01-02-01-02-02-01.wav
Extracting features from  data/speech/Actor_01/03-01-02-02-01-01-01.wav
Extracting features from  data/speech/Actor_01/03-01-02-02-01-02-01.wav
Extracting features from  data/speech/Actor_01/03-01-02-02-02-01-01.wav
Extracting features from  data/speech/Actor_01/03-01-02-02-02-02-01.wav
Extracting features from  data/speech/Actor_01/03-01-03-01-01-01-01.wav
Extracting features from  data/speech/Actor_01/03-01-03-01-01-02

In [8]:
# Persist both splits next to the audio data as .npy files; the file name
# maps to the list that is written into it.
npy_outputs = {
    'x_train.npy': feat_train,
    'x_test.npy': feat_test,
    'y_train.npy': lab_train,
    'y_test.npy': lab_test,
}
for npy_name, values in npy_outputs.items():
    np.save(data_path + npy_name, values)