In [1]:

# Import libraries
import sys
import numpy as np
import pandas as pd
import sklearn as sk
from os import listdir
from os.path import isfile, join
from timeit import default_timer as timer

from sklearn.preprocessing import OneHotEncoder, LabelEncoder
from six.moves import cPickle as pickle
from six.moves import range

import librosa
import soundfile as sf
from python_speech_features import mfcc
from python_speech_features import logfbank

from scipy.stats import kurtosis, skew

In [2]:
raw_sound = pd.read_csv('UrbanSound8K/metadata/UrbanSound8K.csv')
raw_sound.head()

Unnamed: 0,slice_file_name,fsID,start,end,salience,fold,classID,class
0,100032-3-0-0.wav,100032,0.0,0.317551,1,5,3,dog_bark
1,100263-2-0-117.wav,100263,58.5,62.5,1,5,2,children_playing
2,100263-2-0-121.wav,100263,60.5,64.5,1,5,2,children_playing
3,100263-2-0-126.wav,100263,63.0,67.0,1,5,2,children_playing
4,100263-2-0-137.wav,100263,68.5,72.5,1,5,2,children_playing


In [3]:
data, samplerate = sf.read('UrbanSound8K/audio/fold1/102106-3-0-0.wav')
print(data)
print(samplerate)

[[-0.01174927  0.03039551]
 [-0.01153564  0.02471924]
 [-0.01644897  0.01794434]
 ...
 [-0.00588989  0.00012207]
 [ 0.00314331  0.00585938]
 [ 0.00540161  0.00689697]]
44100


In [4]:
fold_list = ['fold1', 'fold2', 'fold3', 'fold4', 'fold5', 'fold6', 'fold7', 'fold8', 'fold9', 'fold10']

In [5]:
def extract_feature(file_name: str) -> tuple:
    """
    Extracts 193 chromatographic features from sound file. 
    including: MFCC's, Chroma_StFt, Melspectrogram, Spectral Contrast, and Tonnetz
    NOTE: this extraction technique changes the time series nature of the data
    """
    X, sample_rate = librosa.load(file_name)
    stft = np.abs(librosa.stft(X))
    mfccs = np.mean(librosa.feature.mfcc(y=X, sr=sample_rate, n_mfcc=40).T,axis=0)
    chroma = np.mean(librosa.feature.chroma_stft(S=stft, sr=sample_rate).T,axis=0)
    mel = np.mean(librosa.feature.melspectrogram(X, sr=sample_rate).T,axis=0)
    contrast = np.mean(librosa.feature.spectral_contrast(S=stft, sr=sample_rate).T,axis=0)
    tonnetz = np.mean(librosa.feature.tonnetz(y=librosa.effects.harmonic(X), sr=sample_rate).T,axis=0)
    return mfccs,chroma,mel,contrast,tonnetz

In [6]:
start_time = timer()
mfcc,chroma,mel,contrast,tonnetz = extract_feature('Sound/fold1/102106-3-0-0.wav')
end_time = timer()
print('time to extract features from one file: {:.3f}sec'.format((end_time-start_time)/60))

time to extract features from one file: 0.051sec


In [7]:
print(mfcc.shape,chroma.shape,mel.shape,contrast.shape,tonnetz.shape)
print(mfcc[0])
print(40+12+128+13)

(40,) (12,) (128,) (7,) (6,)
-253.1975508663975
193


In [8]:
mfcc_data = []
exception_count = 0

start_time = timer()
for i in range(10):
    # get file names
    mypath = 'UrbanSound8K/audio/'+ fold_list[i] + '/'
    files = [mypath + f for f in listdir(mypath) if isfile(join(mypath, f))]
    
    for fn in files:
        try: # extract features
            mfccs_feature,chroma_feature,mel_feature,contrast_feature,tonnetz_feature = extract_feature(fn)
            features = np.empty((0,193))
            ext_features = np.hstack([mfccs_feature,chroma_feature,mel_feature,contrast_feature,tonnetz_feature])
            features = np.vstack([features,ext_features])
            
        except: # else exception (.ds_store files are part of mac file systems)
            print(fn)
            exception_count += 1
            continue
            
        l_row = raw_sound.loc[raw_sound['slice_file_name']==fn.split('/')[-1]].values.tolist()
        label = l_row[0][-1]
        fold = i+1
    
        mfcc_data.append([features, features.shape, label, fold])
        
            #print(f,old_samplerate,ss)
        
print("Exceptions: ", exception_count)
end_time = timer()
print(print("time taken: {0} minutes {1:.1f} seconds".format((end_time - start_time)//60, (end_time - start_time)%60)))




UrbanSound8K/audio/fold1/.DS_Store


  if np.any(X < 0) or np.any(X_ref < 0):
  Z = np.maximum(X, X_ref).astype(dtype)
  bad_idx = (Z < np.finfo(dtype).tiny)


UrbanSound8K/audio/fold1/87275-1-1-0.wav
UrbanSound8K/audio/fold2/.DS_Store




UrbanSound8K/audio/fold3/.DS_Store
UrbanSound8K/audio/fold4/.DS_Store




UrbanSound8K/audio/fold5/.DS_Store




UrbanSound8K/audio/fold6/.DS_Store




UrbanSound8K/audio/fold7/.DS_Store




UrbanSound8K/audio/fold8/.DS_Store
UrbanSound8K/audio/fold9/.DS_Store
UrbanSound8K/audio/fold10/.DS_Store
Exceptions:  11
time taken: 157.0 minutes 40.3 seconds
None


In [9]:

cols=["features", "shape","label", "fold"]
mfcc_pd = pd.DataFrame(data = mfcc_data, columns=cols)
mfcc_pd.head(1)

Unnamed: 0,features,shape,label,fold
0,"[[-360.65090359535833, 102.50213616533911, -49...","(1, 193)",children_playing,1


In [10]:

# Convert label to class number
le = LabelEncoder()
label_num = le.fit_transform(mfcc_pd["label"])

# one hot encode
ohe = OneHotEncoder()
onehot = ohe.fit_transform(label_num.reshape(-1, 1))

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [11]:
for i in range(10):
    mfcc_pd[le.classes_[i]] = onehot[:,i].toarray()

In [12]:

mfcc_pd.head(1)

Unnamed: 0,features,shape,label,fold,air_conditioner,car_horn,children_playing,dog_bark,drilling,engine_idling,gun_shot,jackhammer,siren,street_music
0,"[[-360.65090359535833, 102.50213616533911, -49...","(1, 193)",children_playing,1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [13]:
ll = [mfcc_pd['features'][i].ravel() for i in range(mfcc_pd.shape[0])]
mfcc_pd['sample'] = pd.Series(ll, index=mfcc_pd.index)
del mfcc_pd['features']

In [14]:

mfcc_pd.head(1)

Unnamed: 0,shape,label,fold,air_conditioner,car_horn,children_playing,dog_bark,drilling,engine_idling,gun_shot,jackhammer,siren,street_music,sample
0,"(1, 193)",children_playing,1,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,"[-360.65090359535833, 102.50213616533911, -49...."


In [15]:

# for use in Networks with 193 features.ipynb
pickle.dump(mfcc_pd, open('193_features.p','wb'))