In [255]:
import glob
import os
import random

import librosa
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelBinarizer

import keras
from keras.layers import LSTM, Dense, Dropout, Flatten, Conv1D, MaxPooling2D
from keras.models import Sequential
from keras.optimizers import Adam
from keras.callbacks import EarlyStopping, ModelCheckpoint

In [256]:
# The data set can be downloaded from http://pannous.net/spoken_numbers.tar; extract it so the .wav files end up in DATA_DIR.

In [257]:
SEED = 2017
DATA_DIR = 'Data/spoken_numbers_pcm/'

# Seed the global RNGs so that the random file selection done by the batch
# generators (they use the `random` module) and any NumPy randomness repeat
# between runs. This does not seed Keras/TensorFlow weight initialisation.
random.seed(SEED)
np.random.seed(SEED)

In [258]:
# Sort the matches: glob returns files in a file-system dependent order, and
# the seeded split below is only reproducible for a fixed input order.
files = sorted(glob.glob(os.path.join(DATA_DIR, "*.wav")))
if not files:
    # Fail here with a clear message instead of deep inside train_test_split.
    raise FileNotFoundError('No .wav files found in {!r}'.format(DATA_DIR))

# 80/20 split of the file paths; the audio itself is loaded lazily later on.
X_train, X_val = train_test_split(files, test_size=0.2, random_state=SEED)

print('# Training examples: {}'.format(len(X_train)))
print('# Validation examples: {}'.format(len(X_val)))

# Training examples: 1200
# Validation examples: 300


In [259]:
# Build the label vocabulary from every file (not only the training split) so
# a label that happens to land only in the validation split is still known to
# the encoder. The label is the second-to-last '_'-separated field of the file
# name; os.path.basename keeps the parsing independent of the path separator.
# Sorting makes the order deterministic.
labels = sorted({os.path.basename(path).split('_')[-2] for path in files})
print(labels)

['jackson', 'nicolas', 'theo']


In [260]:
# Fit the encoder once on the label vocabulary collected above.
label_binarizer = LabelBinarizer().fit(labels)


def one_hot_encode(x):
    """Encode the labels in `x` with the fitted `label_binarizer`."""
    return label_binarizer.transform(x)

In [261]:
n_classes = len(labels)  # one softmax output unit per distinct label
n_features = 20          # first dimension of the model's input_shape
max_length = 100         # every MFCC matrix is zero-padded to this many columns

In [262]:
def batch_generator(data, batch_size=16):
    """Yield random (features, one-hot labels) batches built from wav files.

    Every batch draws `batch_size` distinct paths from `data`, loads each file
    with librosa, computes its MFCC matrix and forces that matrix to exactly
    `max_length` columns so the batch stacks into a single array.

    Args:
        data: list of wav file paths. The label is read from the file name as
            the second-to-last '_'-separated field.
        batch_size: number of files per batch. A falsy value (0) ends the
            generator without yielding anything.

    Yields:
        (X, y): X is an array of shape (batch_size, n_features, max_length);
        y is the result of `one_hot_encode` for the batch labels.

    Raises:
        ValueError: raised by `random.sample` if `batch_size` is larger than
            `len(data)`.
    """
    while batch_size:
        # Sample instead of shuffling `data` in place: the batch is still a
        # random draw without repeats, but the caller's list keeps its order.
        batch_files = random.sample(data, batch_size)
        X, y = [], []
        for wav in batch_files:
            wave, sr = librosa.load(wav, mono=True)
            y.append(os.path.basename(wav).split('_')[-2])
            # Tie the number of coefficients to n_features so the batch always
            # matches the model's input_shape.
            mfcc = librosa.feature.mfcc(y=wave, sr=sr, n_mfcc=n_features)
            # Clip long recordings first: a negative pad width makes np.pad raise.
            mfcc = mfcc[:, :max_length]
            mfcc = np.pad(mfcc, ((0, 0), (0, max_length - mfcc.shape[1])),
                          mode='constant', constant_values=0)
            X.append(mfcc)
        yield np.array(X), np.array(one_hot_encode(y))

In [263]:
def batch_generator2(data, batch_size=16):
    """Endlessly yield random (features, one-hot labels) batches from wav files.

    Every batch draws `batch_size` distinct paths from `data`, loads each file
    with librosa, computes its MFCC matrix and forces that matrix to exactly
    `max_length` columns so the batch stacks into a single array.

    Args:
        data: list of wav file paths. The label is read from the file name as
            the second-to-last '_'-separated field.
        batch_size: number of files per batch.

    Yields:
        (X, y): X is an array of shape (batch_size, n_features, max_length);
        y is the result of `one_hot_encode` for the batch labels.

    Raises:
        ValueError: raised by `random.sample` if `batch_size` is larger than
            `len(data)`.
    """
    while True:
        # Sample instead of shuffling `data` in place: the batch is still a
        # random draw without repeats, but the caller's list keeps its order.
        batch_files = random.sample(data, batch_size)
        X, y = [], []
        for wav in batch_files:
            wave, sr = librosa.load(wav, mono=True)
            y.append(os.path.basename(wav).split('_')[-2])
            # Tie the number of coefficients to n_features so the batch always
            # matches the model's input_shape.
            mfcc = librosa.feature.mfcc(y=wave, sr=sr, n_mfcc=n_features)
            # Clip long recordings first: a negative pad width makes np.pad raise.
            mfcc = mfcc[:, :max_length]
            mfcc = np.pad(mfcc, ((0, 0), (0, max_length - mfcc.shape[1])),
                          mode='constant', constant_values=0)
            X.append(mfcc)
        yield np.array(X), np.array(one_hot_encode(y))

In [264]:
# Optimisation settings.
learning_rate = 0.001   # passed to the Adam optimizer
n_epochs = 50
batch_size = 64         # training batch size handed to batch_generator
steps_per_epoch = 50    # number of generator batches counted as one epoch

# Regularisation and input geometry.
dropout = 0.5           # used both inside the LSTM and in the Dropout layer
input_shape = (n_features, max_length)

In [265]:
# LSTM -> Flatten -> Dense(relu) -> Dropout -> softmax over the n_classes labels.
# NOTE(review): with input_shape = (n_features, max_length) the LSTM treats the
# n_features axis as its sequence axis (the summary reports (None, 20, 256)) —
# confirm this is intended rather than stepping over the max_length frames.
model = Sequential([
    LSTM(256, return_sequences=True, input_shape=input_shape, dropout=dropout),
    Flatten(),
    Dense(128, activation='relu'),
    Dropout(dropout),
    Dense(n_classes, activation='softmax'),
])

In [266]:
# Adam with the configured learning rate; categorical cross-entropy pairs with
# the softmax output layer and the encoded targets from the generators.
opt = Adam(learning_rate=learning_rate)
model.compile(
    optimizer=opt,
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)
model.summary()

Model: "sequential_19"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 lstm_17 (LSTM)              (None, 20, 256)           365568    
                                                                 
 flatten_17 (Flatten)        (None, 5120)              0         
                                                                 
 dense_34 (Dense)            (None, 128)               655488    
                                                                 
 dropout_17 (Dropout)        (None, 128)               0         
                                                                 
 dense_35 (Dense)            (None, 3)                 387       
                                                                 
Total params: 1,021,443
Trainable params: 1,021,443
Non-trainable params: 0
_________________________________________________________________


In [267]:
# Make sure the checkpoint folder exists so saving cannot fail on a missing
# directory.
os.makedirs('checkpoints', exist_ok=True)

callbacks = [
    # Writes a file tagged with the epoch number each time the monitored
    # metric improves.
    ModelCheckpoint('checkpoints/voice_recognition_best_model_{epoch:02d}.hdf5', save_best_only=True),
    # patience has to be smaller than n_epochs, otherwise the run always ends
    # before early stopping can trigger (the previous value of 50 equalled
    # n_epochs, which made this callback a no-op).
    EarlyStopping(restore_best_weights=True, patience=max(1, n_epochs // 5)),
]

In [268]:
# Train from the streaming generators: each epoch consumes `steps_per_epoch`
# training batches and is evaluated on 5 validation batches of 32 files.
history = model.fit(
    batch_generator(X_train, batch_size),
    validation_data=batch_generator2(X_val, 32),
    steps_per_epoch=steps_per_epoch,
    validation_steps=5,
    epochs=n_epochs,
    callbacks=callbacks,
    verbose=1,
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


In [269]:
# Sanity check: the label parsed from the first training file name.
X_train[0].rsplit('/', 1)[-1].split('_')[-2]

'jackson'