# Speech Feature Extraction

Compute MFCC features from an audio signal.

Parameters:	
signal – the audio signal from which to compute features. Should be an N*1 array

samplerate – the samplerate of the signal we are working with.

winlen – the length of the analysis window in seconds. Default is 0.025s (25 milliseconds)

winstep – the step between successive windows in seconds. Default is 0.01s (10 milliseconds)

numcep – the number of cepstral coefficients to return. Default is 13.

nfilt – the number of filters in the filterbank, default 26.

nfft – the FFT size. Default is 512.

lowfreq – lowest band edge of mel filters. In Hz, default is 0.

highfreq – highest band edge of mel filters. In Hz, default is samplerate/2

preemph – apply preemphasis filter with preemph as coefficient. 0 is no filter. Default is 0.97.

ceplifter – apply a lifter to final cepstral coefficients. 0 is no lifter. Default is 22.

appendEnergy – if this is true, the zeroth cepstral coefficient is replaced with the log of the total frame energy.

winfunc – the analysis window to apply to each frame. By default no window is applied. You can use numpy window functions here e.g. winfunc=numpy.hamming



# Returns:	
A numpy array of size (NUMFRAMES by numcep) containing features. Each row holds 1 feature vector.

# Extracting features from speech (1)

#This code extracts features from audio files and saves those features as an n x m array.
#Speech features are extracted using the python_speech_features library: https://github.com/jameslyons/python_speech_features

In [30]:
from python_speech_features import mfcc, fbank, logfbank, ssc
import scipy.io.wavfile as wav
import numpy as np
from python_speech_features import mfcc
from python_speech_features import logfbank
import scipy.io.wavfile as wav
import glob
import librosa
import librosa.display
import numpy as np
import _pickle as pickle
import pandas as pd
# from features.base import ssc

In [31]:
# Number of MFCC coefficients requested from mfcc() for every frame.
cepCount = 13

# Number of summary statistics computed for each coefficient track
# (the feature matrix width is a multiple of this value).
nfeatures = 7

# NOTE(review): not referenced by the code in this section; purpose to be
# confirmed before changing or removing it.
elcount = 6

In [32]:
def _column_stats(frames):
    """Summarise every column of a (frames x coefficients) matrix.

    For each column the following seven values are produced, in this order:
    mean, variance, maximum, minimum, variance of the frame-to-frame
    gradient, mean of the first half of the frames, mean of the second half.

    Parameters:
        frames: 2-D numpy array with one row per analysis frame.

    Returns:
        1-D numpy array of length ``7 * frames.shape[1]``; the seven values
        of column 0 come first, then those of column 1, and so on.

    Raises:
        ValueError: if fewer than two frames are supplied (a gradient cannot
            be computed from a single frame).
    """
    n_frames = frames.shape[0]
    if n_frames < 2:
        raise ValueError(
            "need at least 2 frames to compute statistics, got %d" % n_frames)

    # Integer division: slice bounds must be ints on Python 3.
    half = n_frames // 2
    delta = np.gradient(frames, axis=0)
    per_stat = np.array([
        np.mean(frames, axis=0),
        np.var(frames, axis=0),
        np.amax(frames, axis=0),
        np.amin(frames, axis=0),
        np.var(delta, axis=0),
        np.mean(frames[:half], axis=0),
        np.mean(frames[half:], axis=0),
    ])
    # per_stat is (7, n_columns); transpose so each column's seven values are
    # contiguous in the flattened result.
    return per_stat.T.flatten()


def audioread(datafs,gender_flag):
    """Extract a fixed-length feature vector from one WAV file.

    MFCC and spectral-subband-centroid (SSC) features are computed for the
    file, then every coefficient track is reduced to seven summary statistics
    (see ``_column_stats``).

    Parameters:
        datafs: path of the WAV file to read.
        gender_flag: accepted for compatibility with existing callers; it is
            not used by the extraction.

    Returns:
        1-D numpy array holding the MFCC statistics followed by the SSC
        statistics, i.e. ``(cepCount + 26) * 7`` values.

    Raises:
        ValueError: if the signal yields fewer than two analysis frames.
    """
    # scipy returns (sample rate, samples) -- in that order.
    rate, signal = wav.read(datafs)

    winlen = 0.025
    winstep = 0.01
    # Use an FFT at least as long as one analysis window so that frames are
    # not truncated for sample rates above 16 kHz; 512 is kept as the minimum,
    # which is what a 16 kHz signal needs.
    nfft = 512
    while nfft < winlen * rate:
        nfft *= 2

    ceps = mfcc(signal, samplerate=rate, winlen=winlen, winstep=winstep,
                numcep=cepCount, nfft=nfft)
    feat2 = ssc(signal, samplerate=rate, winlen=winlen, winstep=winstep,
                nfilt=26, nfft=nfft, lowfreq=0, highfreq=None, preemph=0.97)

    return np.append(_column_stats(ceps), _column_stats(feat2))

In [33]:
def load_data(female_path=None):
    """Build the feature matrix and label vector for the female recordings.

    Every combination of recording number (01-15), folder index (1-10) and
    emotion is read with ``audioread`` and stored as one row.

    Parameters:
        female_path: directory prefix that the per-file path is appended to.
            It must end with a path separator. Defaults to the project's
            ``train_sounds`` directory.

    Returns:
        (X_female, y_female): ``X_female`` has one row of
        ``(cepCount + 26) * nfeatures`` features per file; ``y_female`` holds
        the index of the file's emotion in the ``emotions`` list.
    """
    emotions = ['anger','disgust','fear','happy','neutral','sadness','sarcastic','surprise']

    if female_path is None:
        # Raw string so the backslashes of the Windows path are kept verbatim.
        female_path = r'E:\Research Project\srp\speech_emotion_recognition-master/train_sounds/'

    recording_numbers = range(1, 16)
    folder_indices = range(1, 11)
    n_samples = len(recording_numbers) * len(folder_indices) * len(emotions)

    # 26 is the number of SSC filterbank channels requested in audioread().
    X_female = np.empty(shape=(n_samples, (cepCount + 26) * nfeatures))
    y_female = np.empty(n_samples)
    fcount = 0
    print ("Loop Started....")

    # filename style: 3.2.anger-01.wav
    # NOTE(review): the example above starts with "3." while the code builds
    # names starting with "4." -- confirm which prefix the data set uses.
    for j in recording_numbers:
        jstring = str(j).zfill(2)  # two-digit, zero-padded recording number
        for i in folder_indices:
            for label, emo in enumerate(emotions):
                x = female_path + str(i) + '/' + emo + '/' + '4.' + str(i) + '.' + emo + '-' + jstring + '.wav'
                X_female[fcount] = audioread(x, gender_flag='female')
                y_female[fcount] = label
                fcount += 1

    return X_female, y_female


if __name__ == '__main__':
    print ("Start loading...")
    X_female, y_female = load_data()
    print ("Start dumping...")

    # ndarray.dump() pickles the arrays; load them back with numpy.load(...,
    # allow_pickle=True) and only from trusted files.
    X_female.dump("X_female.dat")

    y_female.dump("y_female.dat")
    print ("Done.")

# Extracting features from speech (2)

This feature extraction is based on the python_speech_features example

given on its documentation website.

In [None]:
import glob

from python_speech_features import mfcc
from python_speech_features import logfbank
import scipy.io.wavfile as wav

# wav.read() opens exactly one file and does not expand wildcards, so the
# pattern is expanded with glob first and every matching file is processed.
# (A single file such as r"E:\Research Project\Speech_data_set\sad.wav" can be
# read directly instead.)
wav_pattern = r'E:\Research Project\srp\speech_emotion_recognition-master/train_sounds/*.wav'

for wav_path in sorted(glob.glob(wav_pattern)):
    (rate, sig) = wav.read(wav_path)
    mfcc_feat = mfcc(sig, rate)
    fbank_feat = logfbank(sig, rate)

    # Show the log filterbank energies of frames 1 and 2 of this file.
    print(fbank_feat[1:3,:])

In [18]:


from python_speech_features import mfcc
from python_speech_features import logfbank
import scipy.io.wavfile as wav

# Raw string: the Windows path contains backslash sequences (\R, \S, \h) that
# are not valid escapes; a raw literal keeps the same value without relying on
# deprecated escape handling.
(rate, sig) = wav.read(r"E:\Research Project\Speech_data_set\happy.wav")
mfcc_feat = mfcc(sig, rate)
fbank_feat = logfbank(sig, rate)

# Show the log filterbank energies of frames 1 and 2 (26 values per frame).
print(fbank_feat[1:3,:])



[[ 3.98925002  1.55628669  1.02290869  1.31898838  0.10848723 -1.21632936
   0.22212411  0.26025452  0.73097947  0.67905076  2.23369275  2.50297406
   0.98005822  2.11773165  2.61645334  2.27871831  2.91823917  3.70747811
   3.88890932  4.18140709  3.71037332  4.37614944  4.48221616  4.62118497
   5.01414063  4.97182261]
 [ 4.83645032  3.22506597  2.26641494  1.21201116  0.65638843  0.51337736
   0.87283654  1.30207716  0.88341416  1.69492847  4.42419862  5.53619654
   4.99265266  4.72749986  3.9100239   4.20551209  7.06144029  8.97861143
   7.29488082  4.82054291  5.81333404  5.96198238  5.51754492  5.53133111
   5.20259873  5.44707537]]
