In [1]:
import numpy as np
import librosa
import math
import re
import os
from keras.models import Sequential
from keras.layers.recurrent import LSTM
from keras.layers import Dense
from sklearn.preprocessing import LabelEncoder
from keras.utils.np_utils import to_categorical 
from sklearn.model_selection import train_test_split

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
class AudioDataGenerator:
    """Turn spoken-digit audio files into fixed-size feature tensors and labels.

    Every clip is described by its first ``timeseries_length`` analysis frames.
    Each frame is a 33-value vector laid out as:

    * columns 0-12  : 13 MFCC coefficients
    * column  13    : spectral centroid
    * columns 14-25 : chroma (STFT based)
    * columns 26-32 : spectral contrast

    The class label of a training file is the first character of its file
    name (e.g. ``3_theo_12.wav`` -> ``"3"``).
    """

    dir_train = "./digit_dataset/train"
    num_classes = 10 #  Digits [0-9]

    # Number of leading frames kept per clip; shorter clips are zero-padded.
    timeseries_length = 6
    # Width of one frame vector: 13 + 1 + 12 + 7 (see class docstring).
    num_features = 33

    def __init__(self):
        self.hop_length = 512
        self.timeseries_length_list = []

    @staticmethod
    def _label_from_path(path):
        """Return the label of ``path``: the first character of its file name.

        Working on the base name (instead of a fixed character offset into the
        full path) keeps label extraction correct when the dataset directory
        is renamed or moved.
        """
        return os.path.basename(path)[0]

    @staticmethod
    def _pack_frames(feature_matrices, timeseries_length, num_features):
        """Lay feature matrices side by side in a (timeseries_length, num_features) array.

        Parameters
        ----------
        feature_matrices : sequence of 2-D arrays shaped (n_values, n_frames)
            Matrices are transposed and written left to right, in order.
        timeseries_length : int
            Number of frames (rows) in the result. Longer inputs are
            truncated; shorter inputs leave the remaining rows at zero.
        num_features : int
            Expected total number of columns.

        Raises
        ------
        ValueError
            If the matrices do not add up to exactly ``num_features`` columns.
        """
        packed = np.zeros((timeseries_length, num_features), dtype=np.float64)
        column = 0
        for matrix in feature_matrices:
            frames = matrix.T[:timeseries_length, :]
            width = frames.shape[1]
            if column + width > num_features:
                raise ValueError("feature matrices are wider than %d columns" % num_features)
            # Only the rows that actually exist are written, so a clip with
            # fewer than `timeseries_length` frames is zero-padded instead of
            # failing with a shape mismatch.
            packed[:frames.shape[0], column:column + width] = frames
            column += width
        if column != num_features:
            raise ValueError("expected %d feature columns, got %d" % (num_features, column))
        return packed

    def _extract_features(self, file):
        """Load one audio file and return its (timeseries_length, num_features) matrix."""
        y, sr = librosa.load(file)
        hop_length = self.hop_length
        feature_matrices = (
            librosa.feature.mfcc(y=y, sr=sr, hop_length=hop_length, n_mfcc=13),
            librosa.feature.spectral_centroid(y=y, sr=sr, hop_length=hop_length),
            librosa.feature.chroma_stft(y=y, sr=sr, hop_length=hop_length),
            librosa.feature.spectral_contrast(y=y, sr=sr, hop_length=hop_length),
        )
        return self._pack_frames(feature_matrices, self.timeseries_length, self.num_features)

    def preprocessSingleFile(self, file):
        """Return the features of a single file as a (1, timeseries_length, 33) batch."""
        return self._extract_features(file)[np.newaxis, ...]

    def oneHotEncode(self,num_classes):
        """Replace ``self.train_Y`` (raw labels) with a one-hot matrix.

        Labels are first mapped to integer indices with a ``LabelEncoder`` and
        then expanded to ``num_classes`` columns. The fitted encoder is kept
        on ``self.label_encoder`` so class indices can be mapped back to the
        original labels later.
        """
        encoder = LabelEncoder()
        # LabelEncoder expects a 1-D array; train_Y is stored as an (n, 1)
        # column, so flatten it to avoid the column-vector conversion warning.
        class_indices = encoder.fit_transform(np.ravel(self.train_Y))
        self.train_Y = to_categorical(class_indices, num_classes)
        self.label_encoder = encoder

    def path_to_audiofiles(self, dir_folder):
        """Return the paths of the ``.au`` / ``.wav`` / ``.wma`` files in ``dir_folder``.

        Entries are returned in sorted name order so the resulting dataset
        order does not depend on the order in which the OS lists the directory.
        """
        return [
            os.path.join(dir_folder, name)
            for name in sorted(os.listdir(dir_folder))
            if name.endswith((".au", ".wav", ".wma"))
        ]

    def completePreprocessAudio(self):
        """Extract features and one-hot labels for every file in ``dir_train``.

        Populates ``self.train_X`` and ``self.train_Y``.
        """
        trainfiles_list = self.path_to_audiofiles(self.dir_train)

        # Whole dataset.
        self.train_X, self.train_Y = self.extractAudioFeatures(trainfiles_list)

        self.oneHotEncode(self.num_classes)

    def extractAudioFeatures(self, list_of_audiofiles):
        """Extract features and labels for a list of audio file paths.

        Returns
        -------
        data : float64 array, shape (n_files, timeseries_length, 33)
        target : array of single-character labels, shape (n_files, 1)
        """
        data = np.zeros(
            (len(list_of_audiofiles), self.timeseries_length, self.num_features),
            dtype=np.float64,
        )
        target = []

        for i, file in enumerate(list_of_audiofiles):
            data[i] = self._extract_features(file)
            target.append(self._label_from_path(file))
            print("File:",file," ",i + 1," of ", len(list_of_audiofiles))

        print("All dataset features have been extracted.")
        return data, np.expand_dims(np.asarray(target), axis=1)

    

In [3]:
# Build the training set: features end up in datagen.train_X and one-hot
# labels in datagen.train_Y. One progress line is printed per audio file.
datagen = AudioDataGenerator()
datagen.completePreprocessAudio()

File: ./digit_dataset/train/0_jackson_0.wav   1  of  1500
File: ./digit_dataset/train/0_jackson_1.wav   2  of  1500
File: ./digit_dataset/train/0_jackson_10.wav   3  of  1500
File: ./digit_dataset/train/0_jackson_11.wav   4  of  1500
File: ./digit_dataset/train/0_jackson_12.wav   5  of  1500
File: ./digit_dataset/train/0_jackson_13.wav   6  of  1500
File: ./digit_dataset/train/0_jackson_14.wav   7  of  1500
File: ./digit_dataset/train/0_jackson_15.wav   8  of  1500
File: ./digit_dataset/train/0_jackson_16.wav   9  of  1500
File: ./digit_dataset/train/0_jackson_17.wav   10  of  1500
File: ./digit_dataset/train/0_jackson_18.wav   11  of  1500
File: ./digit_dataset/train/0_jackson_19.wav   12  of  1500
File: ./digit_dataset/train/0_jackson_2.wav   13  of  1500
File: ./digit_dataset/train/0_jackson_20.wav   14  of  1500
File: ./digit_dataset/train/0_jackson_21.wav   15  of  1500
File: ./digit_dataset/train/0_jackson_22.wav   16  of  1500
File: ./digit_dataset/train/0_jackson_23.wav   17  o

File: ./digit_dataset/train/0_theo_44.wav   140  of  1500
File: ./digit_dataset/train/0_theo_45.wav   141  of  1500
File: ./digit_dataset/train/0_theo_46.wav   142  of  1500
File: ./digit_dataset/train/0_theo_47.wav   143  of  1500
File: ./digit_dataset/train/0_theo_48.wav   144  of  1500
File: ./digit_dataset/train/0_theo_49.wav   145  of  1500
File: ./digit_dataset/train/0_theo_5.wav   146  of  1500
File: ./digit_dataset/train/0_theo_6.wav   147  of  1500
File: ./digit_dataset/train/0_theo_7.wav   148  of  1500
File: ./digit_dataset/train/0_theo_8.wav   149  of  1500
File: ./digit_dataset/train/0_theo_9.wav   150  of  1500
File: ./digit_dataset/train/1_jackson_0.wav   151  of  1500
File: ./digit_dataset/train/1_jackson_1.wav   152  of  1500
File: ./digit_dataset/train/1_jackson_10.wav   153  of  1500
File: ./digit_dataset/train/1_jackson_11.wav   154  of  1500
File: ./digit_dataset/train/1_jackson_12.wav   155  of  1500
File: ./digit_dataset/train/1_jackson_13.wav   156  of  1500
Fil

File: ./digit_dataset/train/1_theo_33.wav   278  of  1500
File: ./digit_dataset/train/1_theo_34.wav   279  of  1500
File: ./digit_dataset/train/1_theo_35.wav   280  of  1500
File: ./digit_dataset/train/1_theo_36.wav   281  of  1500
File: ./digit_dataset/train/1_theo_37.wav   282  of  1500
File: ./digit_dataset/train/1_theo_38.wav   283  of  1500
File: ./digit_dataset/train/1_theo_39.wav   284  of  1500
File: ./digit_dataset/train/1_theo_4.wav   285  of  1500
File: ./digit_dataset/train/1_theo_40.wav   286  of  1500
File: ./digit_dataset/train/1_theo_41.wav   287  of  1500
File: ./digit_dataset/train/1_theo_42.wav   288  of  1500
File: ./digit_dataset/train/1_theo_43.wav   289  of  1500
File: ./digit_dataset/train/1_theo_44.wav   290  of  1500
File: ./digit_dataset/train/1_theo_45.wav   291  of  1500
File: ./digit_dataset/train/1_theo_46.wav   292  of  1500
File: ./digit_dataset/train/1_theo_47.wav   293  of  1500
File: ./digit_dataset/train/1_theo_48.wav   294  of  1500
File: ./digit_d

File: ./digit_dataset/train/2_theo_24.wav   418  of  1500
File: ./digit_dataset/train/2_theo_25.wav   419  of  1500
File: ./digit_dataset/train/2_theo_26.wav   420  of  1500
File: ./digit_dataset/train/2_theo_27.wav   421  of  1500
File: ./digit_dataset/train/2_theo_28.wav   422  of  1500
File: ./digit_dataset/train/2_theo_29.wav   423  of  1500
File: ./digit_dataset/train/2_theo_3.wav   424  of  1500
File: ./digit_dataset/train/2_theo_30.wav   425  of  1500
File: ./digit_dataset/train/2_theo_31.wav   426  of  1500
File: ./digit_dataset/train/2_theo_32.wav   427  of  1500
File: ./digit_dataset/train/2_theo_33.wav   428  of  1500
File: ./digit_dataset/train/2_theo_34.wav   429  of  1500
File: ./digit_dataset/train/2_theo_35.wav   430  of  1500
File: ./digit_dataset/train/2_theo_36.wav   431  of  1500
File: ./digit_dataset/train/2_theo_37.wav   432  of  1500
File: ./digit_dataset/train/2_theo_38.wav   433  of  1500
File: ./digit_dataset/train/2_theo_39.wav   434  of  1500
File: ./digit_d

File: ./digit_dataset/train/3_theo_14.wav   557  of  1500
File: ./digit_dataset/train/3_theo_15.wav   558  of  1500
File: ./digit_dataset/train/3_theo_16.wav   559  of  1500
File: ./digit_dataset/train/3_theo_17.wav   560  of  1500
File: ./digit_dataset/train/3_theo_18.wav   561  of  1500
File: ./digit_dataset/train/3_theo_19.wav   562  of  1500
File: ./digit_dataset/train/3_theo_2.wav   563  of  1500
File: ./digit_dataset/train/3_theo_20.wav   564  of  1500
File: ./digit_dataset/train/3_theo_21.wav   565  of  1500
File: ./digit_dataset/train/3_theo_22.wav   566  of  1500
File: ./digit_dataset/train/3_theo_23.wav   567  of  1500
File: ./digit_dataset/train/3_theo_24.wav   568  of  1500
File: ./digit_dataset/train/3_theo_25.wav   569  of  1500
File: ./digit_dataset/train/3_theo_26.wav   570  of  1500
File: ./digit_dataset/train/3_theo_27.wav   571  of  1500
File: ./digit_dataset/train/3_theo_28.wav   572  of  1500
File: ./digit_dataset/train/3_theo_29.wav   573  of  1500
File: ./digit_d

File: ./digit_dataset/train/4_nicolas_5.wav   696  of  1500
File: ./digit_dataset/train/4_nicolas_6.wav   697  of  1500
File: ./digit_dataset/train/4_nicolas_7.wav   698  of  1500
File: ./digit_dataset/train/4_nicolas_8.wav   699  of  1500
File: ./digit_dataset/train/4_nicolas_9.wav   700  of  1500
File: ./digit_dataset/train/4_theo_0.wav   701  of  1500
File: ./digit_dataset/train/4_theo_1.wav   702  of  1500
File: ./digit_dataset/train/4_theo_10.wav   703  of  1500
File: ./digit_dataset/train/4_theo_11.wav   704  of  1500
File: ./digit_dataset/train/4_theo_12.wav   705  of  1500
File: ./digit_dataset/train/4_theo_13.wav   706  of  1500
File: ./digit_dataset/train/4_theo_14.wav   707  of  1500
File: ./digit_dataset/train/4_theo_15.wav   708  of  1500
File: ./digit_dataset/train/4_theo_16.wav   709  of  1500
File: ./digit_dataset/train/4_theo_17.wav   710  of  1500
File: ./digit_dataset/train/4_theo_18.wav   711  of  1500
File: ./digit_dataset/train/4_theo_19.wav   712  of  1500
File: 

File: ./digit_dataset/train/5_nicolas_44.wav   840  of  1500
File: ./digit_dataset/train/5_nicolas_45.wav   841  of  1500
File: ./digit_dataset/train/5_nicolas_46.wav   842  of  1500
File: ./digit_dataset/train/5_nicolas_47.wav   843  of  1500
File: ./digit_dataset/train/5_nicolas_48.wav   844  of  1500
File: ./digit_dataset/train/5_nicolas_49.wav   845  of  1500
File: ./digit_dataset/train/5_nicolas_5.wav   846  of  1500
File: ./digit_dataset/train/5_nicolas_6.wav   847  of  1500
File: ./digit_dataset/train/5_nicolas_7.wav   848  of  1500
File: ./digit_dataset/train/5_nicolas_8.wav   849  of  1500
File: ./digit_dataset/train/5_nicolas_9.wav   850  of  1500
File: ./digit_dataset/train/5_theo_0.wav   851  of  1500
File: ./digit_dataset/train/5_theo_1.wav   852  of  1500
File: ./digit_dataset/train/5_theo_10.wav   853  of  1500
File: ./digit_dataset/train/5_theo_11.wav   854  of  1500
File: ./digit_dataset/train/5_theo_12.wav   855  of  1500
File: ./digit_dataset/train/5_theo_13.wav   85

File: ./digit_dataset/train/6_nicolas_38.wav   983  of  1500
File: ./digit_dataset/train/6_nicolas_39.wav   984  of  1500
File: ./digit_dataset/train/6_nicolas_4.wav   985  of  1500
File: ./digit_dataset/train/6_nicolas_40.wav   986  of  1500
File: ./digit_dataset/train/6_nicolas_41.wav   987  of  1500
File: ./digit_dataset/train/6_nicolas_42.wav   988  of  1500
File: ./digit_dataset/train/6_nicolas_43.wav   989  of  1500
File: ./digit_dataset/train/6_nicolas_44.wav   990  of  1500
File: ./digit_dataset/train/6_nicolas_45.wav   991  of  1500
File: ./digit_dataset/train/6_nicolas_46.wav   992  of  1500
File: ./digit_dataset/train/6_nicolas_47.wav   993  of  1500
File: ./digit_dataset/train/6_nicolas_48.wav   994  of  1500
File: ./digit_dataset/train/6_nicolas_49.wav   995  of  1500
File: ./digit_dataset/train/6_nicolas_5.wav   996  of  1500
File: ./digit_dataset/train/6_nicolas_6.wav   997  of  1500
File: ./digit_dataset/train/6_nicolas_7.wav   998  of  1500
File: ./digit_dataset/train/

File: ./digit_dataset/train/7_nicolas_25.wav   1119  of  1500
File: ./digit_dataset/train/7_nicolas_26.wav   1120  of  1500
File: ./digit_dataset/train/7_nicolas_27.wav   1121  of  1500
File: ./digit_dataset/train/7_nicolas_28.wav   1122  of  1500
File: ./digit_dataset/train/7_nicolas_29.wav   1123  of  1500
File: ./digit_dataset/train/7_nicolas_3.wav   1124  of  1500
File: ./digit_dataset/train/7_nicolas_30.wav   1125  of  1500
File: ./digit_dataset/train/7_nicolas_31.wav   1126  of  1500
File: ./digit_dataset/train/7_nicolas_32.wav   1127  of  1500
File: ./digit_dataset/train/7_nicolas_33.wav   1128  of  1500
File: ./digit_dataset/train/7_nicolas_34.wav   1129  of  1500
File: ./digit_dataset/train/7_nicolas_35.wav   1130  of  1500
File: ./digit_dataset/train/7_nicolas_36.wav   1131  of  1500
File: ./digit_dataset/train/7_nicolas_37.wav   1132  of  1500
File: ./digit_dataset/train/7_nicolas_38.wav   1133  of  1500
File: ./digit_dataset/train/7_nicolas_39.wav   1134  of  1500
File: ./d

File: ./digit_dataset/train/8_nicolas_15.wav   1258  of  1500
File: ./digit_dataset/train/8_nicolas_16.wav   1259  of  1500
File: ./digit_dataset/train/8_nicolas_17.wav   1260  of  1500
File: ./digit_dataset/train/8_nicolas_18.wav   1261  of  1500
File: ./digit_dataset/train/8_nicolas_19.wav   1262  of  1500
File: ./digit_dataset/train/8_nicolas_2.wav   1263  of  1500
File: ./digit_dataset/train/8_nicolas_20.wav   1264  of  1500
File: ./digit_dataset/train/8_nicolas_21.wav   1265  of  1500
File: ./digit_dataset/train/8_nicolas_22.wav   1266  of  1500
File: ./digit_dataset/train/8_nicolas_23.wav   1267  of  1500
File: ./digit_dataset/train/8_nicolas_24.wav   1268  of  1500
File: ./digit_dataset/train/8_nicolas_25.wav   1269  of  1500
File: ./digit_dataset/train/8_nicolas_26.wav   1270  of  1500
File: ./digit_dataset/train/8_nicolas_27.wav   1271  of  1500
File: ./digit_dataset/train/8_nicolas_28.wav   1272  of  1500
File: ./digit_dataset/train/8_nicolas_29.wav   1273  of  1500
File: ./d

File: ./digit_dataset/train/9_jackson_49.wav   1395  of  1500
File: ./digit_dataset/train/9_jackson_5.wav   1396  of  1500
File: ./digit_dataset/train/9_jackson_6.wav   1397  of  1500
File: ./digit_dataset/train/9_jackson_7.wav   1398  of  1500
File: ./digit_dataset/train/9_jackson_8.wav   1399  of  1500
File: ./digit_dataset/train/9_jackson_9.wav   1400  of  1500
File: ./digit_dataset/train/9_nicolas_0.wav   1401  of  1500
File: ./digit_dataset/train/9_nicolas_1.wav   1402  of  1500
File: ./digit_dataset/train/9_nicolas_10.wav   1403  of  1500
File: ./digit_dataset/train/9_nicolas_11.wav   1404  of  1500
File: ./digit_dataset/train/9_nicolas_12.wav   1405  of  1500
File: ./digit_dataset/train/9_nicolas_13.wav   1406  of  1500
File: ./digit_dataset/train/9_nicolas_14.wav   1407  of  1500
File: ./digit_dataset/train/9_nicolas_15.wav   1408  of  1500
File: ./digit_dataset/train/9_nicolas_16.wav   1409  of  1500
File: ./digit_dataset/train/9_nicolas_17.wav   1410  of  1500
File: ./digit_d

  y = column_or_1d(y, warn=True)


In [9]:
# Hold out 20% of the samples for validation. The split is seeded so that
# re-running the notebook yields the same train/validation partition, and it
# always starts from the untouched arrays on `datagen`, so the cell can be
# re-run safely.
SPLIT_SEED = 42

train_X, valid_X, train_Y, valid_Y = train_test_split(
    datagen.train_X,
    datagen.train_Y,
    test_size = 0.2,
    random_state = SPLIT_SEED
)

In [10]:
# Two stacked LSTM layers followed by a small fully connected classifier.
# The first LSTM returns the full sequence so the second one can consume it;
# the second returns only its last output, which feeds the dense layers.
# The input shape is (frames, features_per_frame), taken from train_X.
model = Sequential([
    LSTM(
        units=128,
        dropout=0.01,
        recurrent_dropout=0.35,
        return_sequences=True,
        input_shape=train_X.shape[1:],
    ),
    LSTM(units=32, dropout=0.01, recurrent_dropout=0.35, return_sequences=False),
    Dense(units=256, activation='relu'),
    Dense(units=128, activation='relu'),
    Dense(units=64, activation='relu'),
    # One softmax output per class.
    Dense(units=10, activation='softmax'),
])

In [11]:
# Categorical cross-entropy matches the one-hot encoded labels and the
# softmax output layer; accuracy is reported during training.
model.compile(optimizer = "adam", loss = "categorical_crossentropy", metrics = ["accuracy"])

In [13]:
# Train on the in-memory arrays in mini-batches of 16 samples.
#
# The earlier `steps_per_epoch = 26922 // 16` is replaced by `batch_size`:
# 26922 is not the size of this dataset (the extraction log reports 1500
# files, 80% of which are used for training), and with NumPy inputs the number
# of steps per epoch follows from the array length and the batch size.
# The held-out split is passed as validation data so every epoch reports
# validation loss/accuracy as well.
history = model.fit(
        train_X,
        train_Y,
        batch_size = 16,
        epochs = 20,
        validation_data = (valid_X, valid_Y)
)

Epoch 1/20
Epoch 2/20

KeyboardInterrupt: 

In [212]:
# Persist the model to an HDF5 file in the current working directory.
model.save("digits_spoken.h5")
print("Model saved&ready")

Model saved&ready


In [12]:
# Extract features for one unseen recording. preprocessSingleFile returns a
# batch of size one, i.e. an array shaped (1, frames, features_per_frame),
# which is the input layout the model was built for.
test_file_path = "./digit_dataset/test/3_lukasz.wav"
test_X = AudioDataGenerator().preprocessSingleFile(test_file_path)
# NOTE(review): this displays the whole feature array; test_X.shape would be a
# more compact sanity check.
test_X

array([[[-4.47053150e+02,  1.90021621e+02, -2.73052835e+01,
         -5.37567322e+00,  3.07618111e+00,  2.71011216e+01,
         -2.39545980e+00, -2.58920524e+00, -1.42352031e+01,
         -2.22032945e+01, -2.17246606e+01, -3.97334638e+01,
         -3.69143796e+00,  8.02314951e+02,  1.73705659e-01,
          8.87109898e-02,  2.48424438e-01,  5.53451784e-01,
          4.36127649e-01,  1.00000000e+00,  5.46225540e-01,
          2.13360538e-01,  1.00092874e-01,  5.72113169e-02,
          4.75720589e-02,  9.88044305e-02,  3.55600077e+01,
          3.12887067e+01,  2.40110754e+01,  2.47616459e+01,
          3.79179969e+01,  2.25720360e+01,  1.93892781e+01],
        [-4.27316841e+02,  2.14075246e+02, -2.28026925e+01,
         -1.52494769e+01, -6.15745352e+00,  2.54344092e+01,
         -3.93632670e+00, -8.90940235e+00, -1.06663081e+01,
         -2.28204892e+01, -2.86568235e+01, -4.72200504e+01,
         -9.18515229e+00,  6.66589097e+02,  2.90391766e-01,
          3.99425957e-02,  9.83560419e-

In [331]:
# Run the network on the single-file batch; the result has one row per input
# sample and one column per output unit of the final softmax layer.
predictions = model.predict(test_X)

In [332]:
# Show the softmax outputs. predictions.argmax(axis=1) gives the index of the
# most probable class.
# NOTE(review): class indices follow the LabelEncoder ordering of the training
# labels - presumably index k corresponds to digit k; confirm.
predictions

array([[1.5539719e-03, 1.6671412e-07, 2.5102361e-05, 9.5862466e-01,
        1.2686050e-07, 3.4277420e-02, 3.0887662e-10, 4.8169372e-06,
        2.2578424e-10, 5.5138553e-03]], dtype=float32)