Dataset: "Warden P. Speech Commands: A public dataset for single-word speech recognition, 2017. Available from http://download.tensorflow.org/data/speech_commands_v0.01.tar.gz".

Example code: https://github.com/tensorflow/tensorflow/tree/master/tensorflow/examples/speech_commands

In [1]:
# Define Imports
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import os.path
import csv

import numpy as np
import tensorflow as tf
import tflearn

# Modified from https://github.com/tensorflow/tensorflow/tree/master/tensorflow/examples/speech_commands
import input_data_ as input_data_
import models_ as models_

from mcfly import modelgen, find_architecture, storage
from keras.models import load_model
from keras.preprocessing.image import ImageDataGenerator
from keras.callbacks import ModelCheckpoint, Callback, EarlyStopping, TensorBoard
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense, Bidirectional, Conv2D
from keras.layers import Activation, Dropout, BatchNormalization, Flatten
from keras.layers import MaxPooling2D, GlobalAveragePooling2D
from keras.optimizers import RMSprop, Adam
from keras.preprocessing.text import one_hot
import pandas as pd

Using TensorFlow backend.


In [2]:
# Notebook-wide configuration, grouped by concern.

# --- Label vocabulary and audio front end ---
wanted_words = 'yes,no,up,down,left,right,on,off,stop,go'
sample_rate = 16000
clip_duration_ms = 1000
window_size_ms = 30.0
window_stride_ms = 10.0
dct_coefficient_count = 40
time_shift_ms = 200.0

# --- Dataset locations ---
# Set data_url to the archive below to download the data;
# None avoids downloading the data again.
#data_url = 'http://download.tensorflow.org/data/speech_commands_v0.01.tar.gz'
data_url = None
data_dir = 'speech_dataset/'
data_dir_kaggle = 'data/test/'

# --- Dataset composition and splits ---
silence_percentage = 30.0
unknown_percentage = 30.0
validation_percentage = 10
testing_percentage = 10
# The Kaggle data is only used for prediction, so nothing is held out from it.
validation_percentage_kaggle = 0
testing_percentage_kaggle = 0

# --- Optimisation ---
learning_rate = 0.001
batch_size = 1000
n_epoch = 100

# --- Output locations ---
tensorboard_dir='tensorboard/'
model_dir = 'models/'

# Checkpoints are saved automatically once the validation accuracy
# exceeds best_val_accuracy.
best_checkpoint_path = 'best_checkpoints/'
best_val_accuracy = 0.95

In [3]:
# Function to get a batch generator
def get_batches(x, y, batch_size=100):
    """Yield the complete ``(x, y)`` pair exactly once.

    Despite its name this generator does not slice the data into
    mini-batches: callers in this notebook use ``next(get_batches(...))``
    to obtain the whole data set and leave the actual batching to
    ``model.fit`` / ``model.evaluate``. It therefore only checks that at
    least one full batch of samples is available.

    Args:
        x: Sized collection of samples (anything supporting ``len``).
        y: Labels belonging to ``x``; passed through untouched.
        batch_size: Minimum number of samples ``x`` must contain.

    Yields:
        The tuple ``(x, y)``, unchanged.

    Raises:
        ValueError: On the first ``next()`` if ``x`` holds fewer than
            ``batch_size`` samples. The previous implementation spun in an
            endless ``while`` loop in that case, and also on every
            ``next()`` after the first yield.
    """
    if len(x) < batch_size:
        raise ValueError(
            'Need at least %d samples to form a batch, got %d' % (batch_size, len(x)))
    # A single yield: the generator finishes afterwards instead of busy-looping.
    yield x, y

In [4]:
# Data settings
# Log at INFO level so TensorFlow's progress messages show up in the notebook.
tf.logging.set_verbosity(tf.logging.INFO)

# Start a new TensorFlow session; `sess` is handed to every `get_data` call
# made further down in this notebook.
sess = tf.InteractiveSession()

# Derive the model/feature settings from the audio parameters defined above.
# The first argument is the length of the list that
# `input_data_.prepare_words_list` returns for the comma-separated
# `wanted_words`. Nothing is downloaded here; `data_url` is only passed to
# `input_data_.AudioProcessor` later on.
model_settings = models_.prepare_model_settings(
    len(input_data_.prepare_words_list(wanted_words.split(','))),
    sample_rate, clip_duration_ms, window_size_ms,
    window_stride_ms, dct_coefficient_count)

# Pull out the individual settings. `input_time_size` and
# `input_frequency_size` are used below as the model's input dimensions and
# as the target shape when reshaping the fingerprints.
fingerprint_size = model_settings['fingerprint_size']
label_count = model_settings['label_count']
input_frequency_size = model_settings['dct_coefficient_count']
input_time_size = model_settings['spectrogram_length']

In [5]:
# Model hyperparameters
model_name = 'birnn_gru'
cell_units = 1024
activation = 'tanh'
dropout = 0.1
batch_norm = True
optimizer = 'adam'
loss = 'categorical_crossentropy'

# Unused CTC loss experiment, kept for reference:
#def ctc_loss(y_pred, y_true):
#    with tf.name_scope(None):
#        a_t = tf.constant(y_pred)
#        idx = tf.where(tf.not_equal(a_t, 0))
#        sparse = tf.SparseTensor(idx, tf.gather_nd(a_t, idx), a_t.get_shape())
#        sequence_length = input_frequency_size * input_time_size
#        loss = tf.nn.ctc_loss(sparse, y_true, sequence_length)
#        return tf.reduce_mean(loss)

#loss = ctc_loss

# Identifier of this run, built from the hyperparameters; it is used as the
# tflearn run_id and in the saved model / submission file names.
# With the values set in this notebook it reads
# 'birnn_gru-1024_tanh_dp-0-1_bs-1000_lr-0-001_ep-100_bn-True_unkn-30_sil-30_ts-200'
# (dots are replaced by dashes).
run_id = '_'.join((
    '{}-{}'.format(model_name, cell_units),
    activation,
    'dp-' + str(dropout).replace('.', '-'),
    'bs-{}'.format(batch_size),
    'lr-' + str(learning_rate).replace('.', '-'),
    'ep-{}'.format(n_epoch),
    'bn-{}'.format(batch_norm),
    'unkn-{}'.format(int(unknown_percentage)),
    'sil-{}'.format(int(silence_percentage)),
    'ts-{}'.format(int(time_shift_ms)),
))

In [6]:
# tflearn models
def lstm(input_time_size, input_frequency_size, label_count,
         cell_units=128, activation='leaky_relu', dropout=0.5, batch_norm=False):
    """Assemble a single-layer LSTM classifier out of tflearn layers.

    Args:
        input_time_size: Second dimension of the input placeholder.
        input_frequency_size: Third dimension of the input placeholder.
        label_count: Unit count of the final softmax layer.
        cell_units: Unit count handed to ``tflearn.lstm``.
        activation: Activation handed to ``tflearn.lstm``.
        dropout: Forwarded unchanged as ``tflearn.lstm``'s ``dropout`` argument.
        batch_norm: When true, batch normalization is inserted between the
            recurrent layer and the output layer.

    Returns:
        The output of the final fully connected softmax layer.
    """
    layers = tflearn.input_data([None, input_time_size, input_frequency_size])
    layers = tflearn.lstm(layers, cell_units, activation=activation, dropout=dropout)
    # Batch normalization is optional; otherwise the recurrent output is used as is.
    layers = tflearn.batch_normalization(layers) if batch_norm else layers
    return tflearn.fully_connected(layers, label_count, activation='softmax')

def gru(input_time_size, input_frequency_size, label_count,
        cell_units=128, activation='leaky_relu', dropout=0.5, batch_norm=False):
    """Assemble a single-layer GRU classifier out of tflearn layers.

    Args:
        input_time_size: Second dimension of the input placeholder.
        input_frequency_size: Third dimension of the input placeholder.
        label_count: Unit count of the final softmax layer.
        cell_units: Unit count handed to ``tflearn.gru``.
        activation: Activation handed to ``tflearn.gru``.
        dropout: Forwarded unchanged as ``tflearn.gru``'s ``dropout`` argument.
        batch_norm: When true, batch normalization is inserted between the
            recurrent layer and the output layer.

    Returns:
        The output of the final fully connected softmax layer.
    """
    layers = tflearn.input_data([None, input_time_size, input_frequency_size])
    layers = tflearn.gru(layers, cell_units, activation=activation, dropout=dropout)
    # Batch normalization is optional; otherwise the recurrent output is used as is.
    layers = tflearn.batch_normalization(layers) if batch_norm else layers
    return tflearn.fully_connected(layers, label_count, activation='softmax')

def birnn_lstm(input_time_size, input_frequency_size, label_count,
               cell_units=128, activation='tanh', dropout=0.5, batch_norm=False):
    """Assemble a bidirectional LSTM classifier out of tflearn layers.

    Args:
        input_time_size: Second dimension of the input placeholder.
        input_frequency_size: Third dimension of the input placeholder.
        label_count: Unit count of the final softmax layer.
        cell_units: Unit count of each of the two LSTM cells.
        activation: Activation of both LSTM cells.
        dropout: Second argument of ``tflearn.dropout``.
            NOTE(review): tflearn documents that argument as a keep
            probability - confirm this is the intended meaning.
        batch_norm: Passed to both ``BasicLSTMCell`` constructors.

    Returns:
        The output of the final fully connected softmax layer.
    """
    inputs = tflearn.input_data([None, input_time_size, input_frequency_size])
    # One cell per direction, configured identically.
    forward_cell = tflearn.BasicLSTMCell(cell_units, activation=activation, batch_norm=batch_norm)
    backward_cell = tflearn.BasicLSTMCell(cell_units, activation=activation, batch_norm=batch_norm)
    recurrent = tflearn.bidirectional_rnn(inputs, forward_cell, backward_cell)
    regularized = tflearn.dropout(recurrent, dropout)
    return tflearn.fully_connected(regularized, label_count, activation='softmax')

def birnn_gru(input_time_size, input_frequency_size, label_count,
              cell_units=128, activation='tanh', dropout=0.5, batch_norm=False):
    """Assemble a bidirectional GRU classifier out of tflearn layers.

    Args:
        input_time_size: Second dimension of the input placeholder.
        input_frequency_size: Third dimension of the input placeholder.
        label_count: Unit count of the final softmax layer.
        cell_units: Unit count of each of the two GRU cells.
        activation: Activation of both GRU cells.
        dropout: Second argument of ``tflearn.dropout``.
            NOTE(review): tflearn documents that argument as a keep
            probability - confirm this is the intended meaning.
        batch_norm: When true, batch normalization is applied to the
            bidirectional output before dropout.

    Returns:
        The output of the final fully connected softmax layer.
    """
    inputs = tflearn.input_data([None, input_time_size, input_frequency_size])
    # One cell per direction, configured identically.
    forward_cell = tflearn.GRUCell(cell_units, activation=activation)
    backward_cell = tflearn.GRUCell(cell_units, activation=activation)
    features = tflearn.bidirectional_rnn(inputs, forward_cell, backward_cell)
    if batch_norm:
        features = tflearn.batch_normalization(features)
    features = tflearn.dropout(features, dropout)
    return tflearn.fully_connected(features, label_count, activation='softmax')

def cnn_birnn_gru(input_time_size, input_frequency_size, label_count, 
         cell_units=128, activation='tanh', dropout=0.5, batch_norm=False):
    """Build a CNN front end followed by a bidirectional GRU classifier.

    Layer order: input -> conv(32) -> max-pool -> [batch norm] -> dropout ->
    conv(64) -> conv(64) -> max-pool -> [batch norm] -> dropout ->
    bidirectional GRU -> [batch norm] -> dropout -> dense(cell_units, relu) ->
    dropout -> dense(label_count, softmax).

    Args:
        input_time_size: Second dimension of the 4-D input placeholder.
        input_frequency_size: Third dimension of the 4-D input placeholder
            (the fourth dimension is fixed to 1).
        label_count: Unit count of the final softmax layer.
        cell_units: Unit count of each GRU cell and of the hidden dense layer.
        activation: Activation of both GRU cells.
        dropout: Second argument of every ``tflearn.dropout`` call.
            NOTE(review): tflearn documents that argument as a keep
            probability - confirm this is the intended meaning.
        batch_norm: When true, batch normalization is applied after each
            pooling stage and after the recurrent layer.

    Returns:
        The output of the final fully connected softmax layer.
    """
    net = tflearn.input_data([None, input_time_size, input_frequency_size, 1])
    net = tflearn.conv_2d(net, 32, 3, activation='relu')
    net = tflearn.max_pool_2d(net, 2)
    if batch_norm:
        net = tflearn.batch_normalization(net)
    net = tflearn.dropout(net, dropout)
    net = tflearn.conv_2d(net, 64, 3, activation='relu')
    net = tflearn.conv_2d(net, 64, 3, activation='relu')
    net = tflearn.max_pool_2d(net, 2)
    if batch_norm:
        net = tflearn.batch_normalization(net)
    net = tflearn.dropout(net, dropout)
    # NOTE(review): `net` is still the 4-D convolution output here, whereas the
    # other models in this notebook feed bidirectional_rnn a 3-D tensor; a
    # reshape to (batch, time, features) is probably required - confirm by
    # running this model.
    net = tflearn.bidirectional_rnn(net,
        tflearn.GRUCell(cell_units, activation=activation),
        tflearn.GRUCell(cell_units, activation=activation))
    if batch_norm:
        net = tflearn.batch_normalization(net)
    net = tflearn.dropout(net, dropout)
    # Fixed: these two layers previously referenced the undefined names
    # `network` and `fully_connected` and called the float parameter `dropout`
    # as a function, so the model could never be built.
    net = tflearn.fully_connected(net, cell_units, activation='relu')
    net = tflearn.dropout(net, dropout)
    net = tflearn.fully_connected(net, label_count, activation='softmax')
    return net

def build_model(input_time_size, input_frequency_size, label_count, 
         model='birnn', cell_units=128, activation='tanh', dropout=0.5, batch_norm=False):
    """Build the network whose name is contained in ``model``.

    ``model`` is matched by substring, so it may carry extra text around the
    architecture name. Recognised names are 'cnn_birnn_gru', 'birnn_lstm',
    'birnn_gru', 'lstm' and 'gru'.

    Args:
        input_time_size: Forwarded to the selected builder.
        input_frequency_size: Forwarded to the selected builder.
        label_count: Forwarded to the selected builder.
        model: String containing one of the recognised names. Note that the
            default 'birnn' contains none of them and therefore raises.
        cell_units: Forwarded to the selected builder.
        activation: Forwarded to the selected builder.
        dropout: Forwarded to the selected builder.
        batch_norm: Forwarded to the selected builder.

    Returns:
        Whatever the selected builder returns.

    Raises:
        ValueError: If ``model`` contains none of the recognised names.
    """
    # Order matters because of substring matching: 'cnn_birnn_gru' contains
    # 'birnn_gru', which in turn contains 'gru' (likewise 'birnn_lstm' and
    # 'lstm'). The most specific name has to be tested first; previously the
    # 'cnn_birnn_gru' check came last and could never be reached.
    builders = (
        ('cnn_birnn_gru', cnn_birnn_gru),
        ('birnn_lstm', birnn_lstm),
        ('birnn_gru', birnn_gru),
        ('lstm', lstm),
        ('gru', gru),
    )
    for name, builder in builders:
        if name in model:
            return builder(input_time_size, input_frequency_size, label_count,
                           cell_units, activation, dropout, batch_norm)
    raise ValueError('Please provide a valid model')

# Build the architecture selected by `model_name` with the hyperparameters
# defined in the previous cell.
net = build_model(input_time_size, input_frequency_size, label_count, 
         model_name, cell_units, activation, dropout, batch_norm)
# Attach the training op (optimizer, learning rate and loss) to the network.
net = tflearn.regression(net, optimizer=optimizer, learning_rate=learning_rate, loss=loss)

# Wrap the network in a tflearn model. TensorBoard logs go to `tensorboard_dir`;
# `best_checkpoint_path` / `best_val_accuracy` configure the automatic saving of
# checkpoints once the validation accuracy exceeds the threshold.
model = tflearn.DNN(net, tensorboard_verbose=0, tensorboard_dir=tensorboard_dir,
            best_checkpoint_path=best_checkpoint_path, best_val_accuracy=best_val_accuracy)

In [11]:
# Prepare training and validation data
# One AudioProcessor over `data_dir` serves the training, validation and
# testing splits used in this notebook.
audio_processor_training = input_data_.AudioProcessor(
    data_url, data_dir, silence_percentage,
    unknown_percentage,
    wanted_words.split(','), validation_percentage,
    testing_percentage, model_settings)

# Get validation data
# NOTE(review): presumably -1 requests all samples and the 0.0 / 0.0 / 0
# arguments switch off background noise and time shifting, as in the upstream
# TensorFlow example - confirm against the modified input_data_ module.
validation_fingerprints, validation_ground_truth = audio_processor_training.get_data(-1, 0, model_settings, 0.0,
                             0.0, 0, 'validation', sess)
# get_batches yields the complete (x, y) pair as its first item, so this keeps
# the whole validation set; it only requires at least `batch_size` samples.
validation_fingerprints, validation_ground_truth = next(get_batches(validation_fingerprints, validation_ground_truth, batch_size))

# Reshape validation data to match the model input
validation_fingerprints = np.reshape(validation_fingerprints, (-1, input_time_size, input_frequency_size))

def get_training_data(audio_processor_training, model_settings, sess):
    """Fetch the training split with freshly drawn augmentation parameters.

    Each call draws a new time shift, background frequency and background
    volume, so repeated calls return differently augmented data. Reads the
    notebook globals ``time_shift_ms``, ``sample_rate``, ``batch_size``,
    ``input_time_size`` and ``input_frequency_size``.

    Args:
        audio_processor_training: Object whose ``get_data`` supplies the samples.
        model_settings: Passed through to ``get_data``.
        sess: TensorFlow session, passed through to ``get_data``.

    Returns:
        Tuple ``(fingerprints, ground_truth)``; the fingerprints are reshaped
        to ``(-1, input_time_size, input_frequency_size)``.
    """
    # The three draws are kept in this order so that a seeded run stays reproducible.
    base_shift_samples = int((time_shift_ms * sample_rate) / 1000)
    # NOTE(review): the scaled shift is a float; confirm get_data accepts a
    # non-integer sample count.
    time_shift_samples = np.random.uniform(0.5, 1.5) * base_shift_samples
    background_frequency = np.random.uniform(0.5, 1.0)
    background_volume = np.random.uniform(0.0, 0.5)

    # Pull the audio samples used for training.
    fingerprints, ground_truth = audio_processor_training.get_data(
        -1, 0, model_settings, background_frequency,
        background_volume, time_shift_samples, 'training', sess)
    # get_batches hands back the full pair as its first item.
    fingerprints, ground_truth = next(get_batches(fingerprints, ground_truth, batch_size))
    # Match the layout expected by the model input.
    fingerprints = np.reshape(fingerprints, (-1, input_time_size, input_frequency_size))
    return fingerprints, ground_truth

In [8]:
# Training tflearn
# Put tflearn into training mode.
tflearn.is_training(True)
# Run `n_epoch` single-epoch fits instead of one multi-epoch fit: the training
# data is fetched again before every epoch, and get_training_data draws new
# random time-shift / background-noise parameters on each call.
for i in range(n_epoch):
    train_fingerprints, train_ground_truth = get_training_data(audio_processor_training,
        model_settings, sess)
    # The same validation set and run_id are reused for every epoch.
    model.fit(train_fingerprints, train_ground_truth, n_epoch=1,
              validation_set=(validation_fingerprints, validation_ground_truth),
              show_metric=True, batch_size=batch_size, run_id=run_id)
    
# Save the model after the last epoch under models/<run_id>.
model.save(model_dir + run_id)

Training Step: 2011  | total loss: 0.27628 | time: 17.717s
| Adam | epoch: 068 | loss: 0.27628 - acc: 0.9381 -- iter: 01000/29662


KeyboardInterrupt: 

In [10]:
# Evaluate on test set
# Size of the testing split; only used for the log message at the end.
set_size = audio_processor_training.set_size('testing')
total_accuracy = 0
# NOTE(review): total_conf_matrix is never used in the cells shown here.
total_conf_matrix = None

# Fetch the testing split with the same positional arguments as the
# validation split (apart from the split name).
test_fingerprints, test_ground_truth = audio_processor_training.get_data(
        -1, 0, model_settings, 0.0, 0.0, 0, 'testing', sess)
# get_batches yields the complete (x, y) pair as its first item.
test_fingerprints, test_ground_truth = next(get_batches(test_fingerprints, test_ground_truth, batch_size))
# Reshape to match the model input.
test_fingerprints = np.reshape(test_fingerprints, (-1, input_time_size, input_frequency_size))

# Switch tflearn to inference mode before evaluating.
tflearn.is_training(False)
#model.load(model_dir + run_id)
# Load a specific automatically saved best checkpoint; '9535' is a hard-coded
# file name inside `best_checkpoint_path` and must exist for this cell to run.
model.load(best_checkpoint_path + '9535')
#model = load_model(model_dir + run_id)
# The first element of the result of model.evaluate is reported as the accuracy.
total_accuracy = model.evaluate(test_fingerprints, test_ground_truth, batch_size=batch_size)
tf.logging.info('Final test accuracy = %.1f%% (N=%d)' % (total_accuracy[0] * 100,
                                                       set_size))

INFO:tensorflow:Restoring parameters from /home/sladomic/Projects/TensorFlow Speech Recognition Challenge/best_checkpoints/9535
INFO:tensorflow:Final test accuracy = 95.8% (N=4109)


In [None]:
# Predict on the Kaggle test set
# NOTE: this cell was never executed (In [None]); the same data loading code
# appears again further down, followed by a cell that predicts in chunks of
# 1000 samples from the best checkpoint.
# A second AudioProcessor over `data_dir_kaggle`. Both Kaggle split
# percentages are 0, so presumably every file ends up in the 'training'
# split - confirm against the modified input_data_ module.
audio_processor_predict = input_data_.AudioProcessor(
    data_url, data_dir_kaggle, silence_percentage,
    unknown_percentage,
    wanted_words.split(','), validation_percentage_kaggle,
    testing_percentage_kaggle, model_settings)

# No augmentation (0.0 / 0.0 / 0). The trailing False is an extra argument of
# the modified get_data whose meaning is not visible in this notebook.
predict_fingerprints, predict_ground_truth = audio_processor_predict.get_data(
        -1, 0, model_settings, 0.0, 0.0, 0, 'training', sess, False)
# get_batches yields the complete (x, y) pair as its first item.
predict_fingerprints, predict_ground_truth = next(get_batches(predict_fingerprints, predict_ground_truth, batch_size))
# Reshape to match the model input.
predict_fingerprints = np.reshape(predict_fingerprints, (-1, input_time_size, input_frequency_size))

# Switch to inference mode, restore the model saved under models/<run_id>
# and predict all samples in a single call.
tflearn.is_training(False)
model.load(model_dir + run_id)
y_ = model.predict(predict_fingerprints)

In [10]:
# Export predictions to csv for submission
# Requires `y_` (the per-sample class scores), `audio_processor_training` and
# `audio_processor_predict` to exist in the kernel.
words_list = audio_processor_training.words_list
word_to_index = audio_processor_training.word_to_index
silence_idx = word_to_index['_silence_']

# Invert word -> index. Words outside `words_list` are reported as 'unknown'
# and the silence index as 'silence'.
index_to_word = {}
for word, index in word_to_index.items():
    index_to_word[index] = word if word in words_list else 'unknown'
index_to_word[silence_idx] = 'silence'

# Highest-scoring class per sample, then its label.
y_labels = [np.argmax(scores) for scores in y_]
y_names = [index_to_word[label_index] for label_index in y_labels]

data_index = audio_processor_predict.data_index
with open('submissions/submission_' + run_id + '.csv', 'w') as csvfile:
    spamwriter = csv.writer(csvfile, delimiter=',')
    spamwriter.writerow(['fname', 'label'])
    # Row i pairs the i-th file of the 'training' split with the i-th prediction.
    for i, label in enumerate(y_names):
        fname = os.path.basename(data_index['training'][i]['file'])
        spamwriter.writerow([fname, label])
print('Finished')

NameError: name 'audio_processor_training' is not defined

### Ideas

- [ ] seq2seq
- [x] bidirectional RNN
- [x] more units
- [ ] https://arxiv.org/pdf/1710.04515.pdf
- [ ] Capsule network
- [ ] softmax_categorical_crossentropy
- [ ] AdaGrad

In [None]:
# Save the current model under models/<run_id>; this is the same call that
# ends the training cell, which did not reach it because training was interrupted.
model.save(model_dir + run_id)

In [7]:
# Load the Kaggle test set (same code as the unexecuted "Predict on the Kaggle
# test set" cell above, without the prediction step).
# Both Kaggle split percentages are 0, so presumably every file ends up in the
# 'training' split - confirm against the modified input_data_ module.
audio_processor_predict = input_data_.AudioProcessor(
    data_url, data_dir_kaggle, silence_percentage,
    unknown_percentage,
    wanted_words.split(','), validation_percentage_kaggle,
    testing_percentage_kaggle, model_settings)

# No augmentation (0.0 / 0.0 / 0). The trailing False is an extra argument of
# the modified get_data whose meaning is not visible in this notebook.
predict_fingerprints, predict_ground_truth = audio_processor_predict.get_data(
        -1, 0, model_settings, 0.0, 0.0, 0, 'training', sess, False)
# get_batches yields the complete (x, y) pair as its first item.
predict_fingerprints, predict_ground_truth = next(get_batches(predict_fingerprints, predict_ground_truth, batch_size))
# Reshape to match the model input.
predict_fingerprints = np.reshape(predict_fingerprints, (-1, input_time_size, input_frequency_size))

In [8]:
# Predict the Kaggle samples from the best checkpoint, 1000 samples per call.
tflearn.is_training(False)
model.load(best_checkpoint_path + '9535')

# One prediction result per chunk of 1000 fingerprints.
y_ = [model.predict(predict_fingerprints[start:start+1000])
      for start in range(0, len(predict_fingerprints), 1000)]

# Flatten the per-chunk results into one list with an entry per sample.
y2 = []
for chunk in y_:
    y2.extend(chunk)
print(np.array(y2).shape)

INFO:tensorflow:Restoring parameters from /home/sladomic/Projects/TensorFlow Speech Recognition Challenge/best_checkpoints/9535
(158538, 12)


In [9]:
# Make the flattened per-sample predictions available as `y_`, the name the
# CSV export cell reads.
y_ = y2
print("Ready")

Ready
