In [1]:
import argparse
import os.path
import sys

import numpy as np
from six.moves import xrange  # pylint: disable=redefined-builtin
import tensorflow as tf

from speech_commands import input_data
from speech_commands import models
from tensorflow.python.platform import gfile

FLAGS = None

  from ._conv import register_converters as _register_converters


In [2]:
data_url = 'http://download.tensorflow.org/data/speech_commands_v0.01.tar.gz' # Location of the speech training data archive on the web.
data_dir = 'dataset/' # Where to download the speech training data to.
background_volume = 0.1 # How loud the background noise should be, between 0 and 1.
background_frequency = 0.8 # Fraction of the training samples that have background noise mixed in.
silence_percentage = 10.0 # How much of the training data should be silence.
unknown_percentage = 10.0 # How much of the training data should be unknown words.
time_shift_ms = 100.0 # Range (in milliseconds) to randomly shift the training audio by in time.
testing_percentage = 10 # What percentage of wavs to use as a test set.
validation_percentage = 10 # What percentage of wavs to use as a validation set.
sample_rate = 16000 # Expected sample rate of the wavs.
clip_duration_ms = 1000 # Expected duration in milliseconds of the wavs.
window_size_ms = 30.0 # How long each spectrogram timeslice is, in milliseconds.
window_stride_ms = 10.0 # How far to move in time between spectrogram timeslices, in milliseconds.
dct_coefficient_count = 40 # How many bins to use for the MFCC fingerprint.
how_many_training_steps = '15000,3000'# Comma-separated number of training steps per phase; must have as many entries as learning_rate.
eval_step_interval = 400 # How often (in training steps) to evaluate on the validation set.
learning_rate = '0.001,0.0001' # Comma-separated learning rate for each training phase.
batch_size = 100 # How many items to train with at once.
summaries_dir = 'log/retrain_logs' # Where to save summary logs for TensorBoard.
wanted_words = 'yes,no,up,down,left,right,on,off,stop,go' # Comma-separated words to use (others will be added to an unknown label).
train_dir = 'log/train_dir' # Directory for the graph definition, the label list and model checkpoints.
save_step_interval = 100 # Save a model checkpoint every this many training steps.
start_checkpoint = '' # If non-empty, restore this pretrained model before any training.
model_architecture = 'conv' # Which model architecture to use; also used as the file-name prefix for saved artifacts.
check_nans = False # Whether to check for invalid numbers during processing.

In [3]:
# Build the TensorFlow graph for training: model, loss, optimizer, metrics,
# summaries and checkpointing. Statement order matters here because ops and
# variables are registered in the default graph as they are created.

# We want to see all the logging messages for this tutorial.
tf.logging.set_verbosity(tf.logging.INFO)

# Start a new TensorFlow session.
sess = tf.InteractiveSession()

# Derive the model settings from the configuration cell and create the audio
# processor, which is handed `data_url` and `data_dir` for the training data.
# NOTE(review): the script this notebook was adapted from says an empty
# `data_url` avoids downloading -- confirm against `input_data.AudioProcessor`.
model_settings = models.prepare_model_settings(
  len(input_data.prepare_words_list(wanted_words.split(','))),
  sample_rate, clip_duration_ms, window_size_ms,
  window_stride_ms, dct_coefficient_count)
audio_processor = input_data.AudioProcessor(
  data_url, data_dir, silence_percentage,
  unknown_percentage,
  wanted_words.split(','), validation_percentage,
  testing_percentage, model_settings)
fingerprint_size = model_settings['fingerprint_size']
label_count = model_settings['label_count']
# Convert the time-shift range from milliseconds to a number of samples.
time_shift_samples = int((time_shift_ms * sample_rate) / 1000)
# Figure out the learning rates for each training phase. Since it's often
# effective to have high learning rates at the start of training, followed by
# lower levels towards the end, the number of steps and learning rates are
# specified as comma-separated strings to define the rate at each stage. For
# example how_many_training_steps='10000,3000' with learning_rate='0.001,0.0001'
# will run 13,000 training loops in total, with a rate of 0.001 for the first
# 10,000, and 0.0001 for the final 3,000.
training_steps_list = list(map(int, how_many_training_steps.split(',')))
learning_rates_list = list(map(float, learning_rate.split(',')))
if len(training_steps_list) != len(learning_rates_list):
    raise Exception('--how_many_training_steps and --learning_rate must be equal length '
                    'lists, but are %d and %d long instead' % (len(training_steps_list),
                    len(learning_rates_list)))

# Input placeholder: one row of `fingerprint_size` floats per example.
fingerprint_input = tf.placeholder(
  tf.float32, [None, fingerprint_size], name='fingerprint_input')

# `dropout_prob` is fed on every run: 0.5 in the training cell and 1.0 in the
# validation/testing code.
logits, dropout_prob = models.create_model(fingerprint_input, model_settings, model_architecture, is_training=True)

# Define loss and optimizer
ground_truth_input = tf.placeholder(tf.int64, [None], name='groundtruth_input')

# Optionally we can add runtime checks to spot when NaNs or other symptoms of
# numerical errors start occurring during training.
control_dependencies = []
if check_nans:
    checks = tf.add_check_numerics_ops()
    control_dependencies = [checks]

# Create the back propagation and training evaluation machinery in the graph.
with tf.name_scope('cross_entropy'):
    cross_entropy_mean = tf.losses.sparse_softmax_cross_entropy(labels=ground_truth_input, logits=logits)
tf.summary.scalar('cross_entropy', cross_entropy_mean)
with tf.name_scope('train'), tf.control_dependencies(control_dependencies):
    # The learning rate is a placeholder so it can change between training phases.
    learning_rate_input = tf.placeholder(tf.float32, [], name='learning_rate_input')
    train_step = tf.train.GradientDescentOptimizer(learning_rate_input).minimize(cross_entropy_mean)
predicted_indices = tf.argmax(logits, 1)
correct_prediction = tf.equal(predicted_indices, ground_truth_input)
confusion_matrix = tf.confusion_matrix(ground_truth_input, predicted_indices, num_classes=label_count)
# Accuracy of the fed batch: mean of the per-example correctness flags.
evaluation_step = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
tf.summary.scalar('accuracy', evaluation_step)

# Global step counter, incremented explicitly once per training iteration.
global_step = tf.train.get_or_create_global_step()
increment_global_step = tf.assign(global_step, global_step + 1)

saver = tf.train.Saver(tf.global_variables())

# Merge all the summaries and write them out under `summaries_dir`
# ('train' and 'validation' sub-directories).
merged_summaries = tf.summary.merge_all()
train_writer = tf.summary.FileWriter(summaries_dir + '/train', sess.graph)
validation_writer = tf.summary.FileWriter(summaries_dir + '/validation')

tf.global_variables_initializer().run()

# Training starts at step 1 unless a checkpoint is restored below.
start_step = 1

if start_checkpoint:
    models.load_variables_from_checkpoint(sess, start_checkpoint)
    # Resume from the step counter stored in the restored checkpoint.
    start_step = global_step.eval(session=sess)

tf.logging.info('Training from step: %d ', start_step)

# Save graph.pbtxt.
tf.train.write_graph(sess.graph_def, train_dir,  model_architecture + '.pbtxt')

# Save list of words.
with gfile.GFile(os.path.join(train_dir, model_architecture + '_labels.txt'), 'w') as f:
    f.write('\n'.join(audio_processor.words_list))



>> Downloading speech_commands_v0.01.tar.gz 99.9%

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



INFO:tensorflow:Training from step: 1 


In [4]:
# Training loop.
#
# Runs from `start_step` through the sum of all phase lengths in
# `training_steps_list`. Each iteration trains on one batch, logs the batch
# accuracy and loss, evaluates on the validation set every
# `eval_step_interval` steps (and on the last step), and saves a checkpoint
# every `save_step_interval` steps (and on the last step).
training_steps_max = np.sum(training_steps_list)
for training_step in xrange(start_step, training_steps_max + 1):
    # Figure out what the current learning rate is: walk the phases until the
    # cumulative step count reaches the current step.
    training_steps_sum = 0
    for i in range(len(training_steps_list)):
        training_steps_sum += training_steps_list[i]
        if training_step <= training_steps_sum:
            learning_rate_value = learning_rates_list[i]
            break
    # Pull the audio samples we'll use for training.
    train_fingerprints, train_ground_truth = audio_processor.get_data(
        batch_size, 0, model_settings, background_frequency,
        background_volume, time_shift_samples, 'training', sess)
    # Run the graph with this batch of training data.
    parameters = [merged_summaries, evaluation_step, cross_entropy_mean,
                  train_step, increment_global_step]
    feed_dict = {
        fingerprint_input: train_fingerprints,
        ground_truth_input: train_ground_truth,
        learning_rate_input: learning_rate_value,
        dropout_prob: 0.5,
    }
    train_summary, train_accuracy, cross_entropy_value, _, _ = sess.run(
        parameters, feed_dict)
    train_writer.add_summary(train_summary, training_step)
    tf.logging.info('Step #%d: rate %f, accuracy %.1f%%, cross entropy %f' %
                    (training_step, learning_rate_value, train_accuracy * 100,
                     cross_entropy_value))
    is_last_step = (training_step == training_steps_max)
    if (training_step % eval_step_interval) == 0 or is_last_step:
        set_size = audio_processor.set_size('validation')
        total_accuracy = 0
        total_conf_matrix = None
        for i in xrange(0, set_size, batch_size):
            validation_fingerprints, validation_ground_truth = (
                audio_processor.get_data(batch_size, i, model_settings, 0.0,
                                         0.0, 0, 'validation', sess))
            # Run a validation step and capture training summaries for TensorBoard
            # with the `merged` op.
            parameters = [merged_summaries, evaluation_step, confusion_matrix]
            feed_dict = {
                fingerprint_input: validation_fingerprints,
                ground_truth_input: validation_ground_truth,
                dropout_prob: 1.0,
            }
            validation_summary, validation_accuracy, conf_matrix = sess.run(
                parameters, feed_dict)
            validation_writer.add_summary(validation_summary, training_step)
            # Size of this (possibly partial, final) batch. Kept in a local
            # name on purpose: assigning it back to `batch_size` would shrink
            # the batch size used by every later training step and evaluation.
            validation_batch_size = min(batch_size, set_size - i)
            # Weight each batch's accuracy by its share of the whole set.
            total_accuracy += (validation_accuracy * validation_batch_size) / set_size
            if total_conf_matrix is None:
                total_conf_matrix = conf_matrix
            else:
                total_conf_matrix += conf_matrix
        tf.logging.info('Confusion Matrix:\n %s' % (total_conf_matrix))
        tf.logging.info('Step %d: Validation accuracy = %.1f%% (N=%d)' %
                        (training_step, total_accuracy * 100, set_size))

    # Save the model checkpoint periodically.
    if (training_step % save_step_interval == 0 or
            training_step == training_steps_max):
        checkpoint_path = os.path.join(train_dir, model_architecture + '.ckpt')
        tf.logging.info('Saving to "%s-%d"', checkpoint_path, training_step)
        saver.save(sess, checkpoint_path, global_step=training_step)



INFO:tensorflow:Step #1: rate 0.001000, accuracy 8.0%, cross entropy 2.555859
INFO:tensorflow:Step #2: rate 0.001000, accuracy 7.0%, cross entropy 2.568493
INFO:tensorflow:Step #3: rate 0.001000, accuracy 4.0%, cross entropy 2.548818
INFO:tensorflow:Step #4: rate 0.001000, accuracy 11.0%, cross entropy 2.525087
INFO:tensorflow:Step #5: rate 0.001000, accuracy 10.0%, cross entropy 2.510934
INFO:tensorflow:Step #6: rate 0.001000, accuracy 9.0%, cross entropy 2.569896
INFO:tensorflow:Step #7: rate 0.001000, accuracy 4.0%, cross entropy 2.551708
INFO:tensorflow:Step #8: rate 0.001000, accuracy 8.0%, cross entropy 2.492843
INFO:tensorflow:Step #9: rate 0.001000, accuracy 6.0%, cross entropy 2.555687
INFO:tensorflow:Step #10: rate 0.001000, accuracy 6.0%, cross entropy 2.531054
INFO:tensorflow:Step #11: rate 0.001000, accuracy 11.0%, cross entropy 2.478223
INFO:tensorflow:Step #12: rate 0.001000, accuracy 12.0%, cross entropy 2.510865
INFO:tensorflow:Step #13: rate 0.001000, accuracy 11.0%, 

INFO:tensorflow:Step #104: rate 0.001000, accuracy 15.0%, cross entropy 2.459380
INFO:tensorflow:Step #105: rate 0.001000, accuracy 10.0%, cross entropy 2.395886
INFO:tensorflow:Step #106: rate 0.001000, accuracy 13.0%, cross entropy 2.454031
INFO:tensorflow:Step #107: rate 0.001000, accuracy 16.0%, cross entropy 2.406504
INFO:tensorflow:Step #108: rate 0.001000, accuracy 12.0%, cross entropy 2.450357
INFO:tensorflow:Step #109: rate 0.001000, accuracy 15.0%, cross entropy 2.405913
INFO:tensorflow:Step #110: rate 0.001000, accuracy 10.0%, cross entropy 2.467561


KeyboardInterrupt: 

In [5]:
# Final evaluation on the held-out testing set.
#
# Walks the testing set in chunks of `batch_size`, accumulating a
# size-weighted accuracy and a summed confusion matrix, then logs both.
set_size = audio_processor.set_size('testing')
tf.logging.info('set_size=%d', set_size)
total_accuracy = 0
total_conf_matrix = None
for i in xrange(0, set_size, batch_size):
    test_fingerprints, test_ground_truth = audio_processor.get_data(
        batch_size, i, model_settings, 0.0, 0.0, 0, 'testing', sess)
    test_accuracy, conf_matrix = sess.run(
        [evaluation_step, confusion_matrix],
        feed_dict={
            fingerprint_input: test_fingerprints,
            ground_truth_input: test_ground_truth,
            dropout_prob: 1.0
        })
    # Size of this (possibly partial, final) batch. Kept in a local name so the
    # notebook-wide `batch_size` is not overwritten; otherwise re-running this
    # cell, or resuming training afterwards, would use the shrunken value.
    test_batch_size = min(batch_size, set_size - i)
    # Weight each batch's accuracy by its share of the whole set.
    total_accuracy += (test_accuracy * test_batch_size) / set_size
    if total_conf_matrix is None:
        total_conf_matrix = conf_matrix
    else:
        total_conf_matrix += conf_matrix
tf.logging.info('Confusion Matrix:\n %s' % (total_conf_matrix))
tf.logging.info('Final test accuracy = %.1f%% (N=%d)' % (total_accuracy * 100,
                                                         set_size))

INFO:tensorflow:set_size=3081
INFO:tensorflow:Confusion Matrix:
 [[ 99   5  14   0  10   0   4   4 103  10   8   0]
 [  2  11   4  11  20  25  13  14 152   1   1   3]
 [  4   9   4  12  31  14  30  27 120   1   3   1]
 [  2   8   3  17  29  32  10  10 139   0   0   2]
 [  4   6   0   3  32   6  23   7 171   1  19   0]
 [  5   9   1  14  12  46   8  15 138   0   3   2]
 [  3   6   5  14  20  16  33  29 137   0   4   0]
 [  3   6   5  13  22  22  31  37 116   0   3   1]
 [  2   9   2  13  13  33   5   5 159   1   3   1]
 [  4   4   0   2  24  12  17  19 164   1  15   0]
 [  7  16   2   7  22   8  17   8 143   0  17   2]
 [  8   8   1  20  20  30   9  15 136   0   1   3]]
INFO:tensorflow:Final test accuracy = 14.9% (N=3081)
