In [None]:
import os
import util
import shutil
import random
import pathlib
import numpy as np
import tensorflow as tf
import tensorflow_io as tfio
import scipy.io.wavfile as wav

from CNN import CNN
from tensorflow.keras import models, layers, callbacks
from tensorflow.keras.layers.experimental import preprocessing

In [None]:
# Project Paths
_ROOT_DIR = os.path.dirname(pathlib.Path.cwd())
_DATASET_DIRECTORY_PATH = _ROOT_DIR+'/data/speech_commands'
_PATH_TO_RESULTS = _ROOT_DIR+'/results'
_PATH_TO_MODEL = _PATH_TO_RESULTS+'/model/'
_PATH_TO_NOISE_SRC = [os.path.join(_DATASET_DIRECTORY_PATH+'/_background_noise_', name)
                            for name in os.listdir(_DATASET_DIRECTORY_PATH+'/_background_noise_') if name.endswith('.wav')]

# Settings
_SAMPLE_RATE = 16000
_DATASET_SPLIT = (0.7, 0.2, 0.1)
_WORDS = ['unknown', 'silence', 'yes', 'no', 'up',
          'down', 'left', 'right', 'on', 'off', 'stop', 'go']

_BATCH_SIZE = 1
_EPOCHS = 10
_LEARNING_RATE = 1e-3
_DROPOUT = 0.2

In [None]:
# Check if dataset directory already exist, otherwise download, extract and remove the archive
if not os.path.isdir(_DATASET_DIRECTORY_PATH):
    print('Downloading from ' + _DOWNLOAD_URL)
    util.download_file(_DOWNLOAD_URL, _ROOT_DIR+'/data/speech_commands.tar.gz')
    print("Extracting archive...")
    shutil.unpack_archive(
        _ROOT_DIR+'/data/speech_commands.tar.gz', _DATASET_DIRECTORY_PATH)
    os.remove('data/speech_commands.tar.gz')
    print("Done.")

    # Create samples for 'silence' category using the _background_noises_ recordings
    silence_samples = random.randint(1500, 4000)
    util.generateSilenceSamples(silence_samples, _DATASET_DIRECTORY_PATH)

In [None]:
data_dir = pathlib.Path(_DATASET_DIRECTORY_PATH)
filenames = tf.io.gfile.glob(str(data_dir) + '/*/*')
filenames = tf.random.shuffle(filenames)

train_samples = int(len(filenames) * _DATASET_SPLIT[0])
val_samples = int(len(filenames) * _DATASET_SPLIT[1])

train, remainder = filenames[:train_samples], filenames[train_samples:]
val, test = remainder[:val_samples], remainder[val_samples:]



In [280]:
def decode_audio(audio_binary):
    audio, _ = tf.audio.decode_wav(audio_binary)
    return tf.squeeze(audio, axis=-1)

def get_signal_and_label(file_path):
    label = tf.strings.split(file_path, os.path.sep)[-2]
    audio_binary = tf.io.read_file(file_path)
    signal = decode_audio(audio_binary)

    # Add padding in case the recording has less than _SAMPLE_RATE samples
    zero_padding = tf.zeros([_SAMPLE_RATE] - tf.shape(signal), dtype=tf.float32)
    signal = tf.cast(signal, tf.float32)
    padded_signal = tf.concat([signal, zero_padding], 0)

    return padded_signal, label

def add_noise(padded_signal, label):
    noise_source = random.choice(_PATH_TO_NOISE_SRC)
    audio_binary = tf.io.read_file(noise_source)
    noise = decode_audio(audio_binary)
    
    # Choose random section of the noise file
    offset = tf.random.uniform((), minval=0, maxval=tf.shape(noise)[0] - _SAMPLE_RATE, dtype=tf.int32)
    slice_indices = tf.range(0, _SAMPLE_RATE, dtype=tf.int32)
    noise_segment = tf.gather(noise, slice_indices + offset, axis=0)
    return tf.math.add(padded_signal, noise_segment), label

def power_to_db(S, amin=1e-16, top_db=80.0):
    """Convert a power-spectrogram (magnitude squared) to decibel (dB) units.
    Computes the scaling ``10 * log10(S / max(S))`` in a numerically
    stable way.
    Based on:
    https://librosa.github.io/librosa/generated/librosa.core.power_to_db.html
    """
    def _tf_log10(x):
        numerator = tf.math.log(x)
        denominator = tf.math.log(tf.constant(10, dtype=numerator.dtype))
        return numerator / denominator
    
    # Scale magnitude relative to maximum value in S. Zeros in the output 
    # correspond to positions where S == ref.
    ref = tf.reduce_max(S)

    log_spec = 10.0 * _tf_log10(tf.maximum(amin, S))
    log_spec -= 10.0 * _tf_log10(tf.maximum(amin, ref))

    log_spec = tf.maximum(log_spec, tf.reduce_max(log_spec) - top_db)

    return log_spec

def generate_log_mel_spectrograms(signal, label):

    # Compute short time fourier transform 
    spectrogram = tf.signal.stft(
      signal, frame_length=255, frame_step=1600)
    
    # Compute magnitudes to avoid complex values
    magnitude_spectrogram = tf.abs(spectrogram)    
    
    # Instantiate 
    mel_filterbank = tf.signal.linear_to_mel_weight_matrix()

    # Warp the linear-scale magnitude-spectrograms to mel-scale
    mel_power_spectrograms = tf.matmul(tf.square(magnitude_spectrogram),
                                mel_filterbank)
    
    # Transform magnitudes to log-scale
    log_magnitude_mel_spectrograms = power_to_db(mel_power_spectrograms)

    log_magnitude_mel_spectrograms = tf.expand_dims(log_magnitude_mel_spectrograms, axis=-1)

    # Create one-hot encoding of label
    label_id = tf.argmax(label == _WORDS)
    one_hot_label = tf.one_hot(label_id, len(_WORDS))
    one_hot_label = tf.reshape(one_hot_label, [1, 12])

    return log_magnitude_mel_spectrograms, one_hot_label


def configure_data_stream(files, batch_size=1, data_augmentation=False):
    ds = tf.data.Dataset.from_tensor_slices(files)
    ds = ds.map(get_signal_and_label, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    if data_augmentation:
        ds = ds.map(add_noise, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    ds = ds.map(generate_log_mel_spectrograms, num_parallel_calls=tf.data.experimental.AUTOTUNE)
    ds.batch(batch_size)
    ds.prefetch(tf.data.experimental.AUTOTUNE)
    return ds

# Configure generators for datasets
train_ds = configure_data_stream(train, _BATCH_SIZE, data_augmentation=True)
test_ds = configure_data_stream(test)

# Cache validation set since it wont change due to data augmentation
val_ds = configure_data_stream(val)
val_ds.cache()

for feature, label in train_ds.take(1):
    input_shape = feature.shape
    label_shape = label.shape

print('Input shape:', input_shape)
print('Label shape:', label_shape)

model = models.Sequential([
    layers.Input(shape=input_shape),
    preprocessing.Resizing(32,32),
    layers.Conv2D(32, 3, activation='relu'),
    layers.Conv2D(64, 3, activation='relu'),
    layers.MaxPooling2D(),
    layers.Dropout(_DROPOUT),
    layers.Flatten(),
    layers.Dense(128, activation='sigmoid'),
    layers.Dropout(_DROPOUT),
    layers.Dense(len(_WORDS), activation='softmax'),
])

model.summary()

model.compile(optimizer=tf.keras.optimizers.Adam(),
              loss=tf.keras.losses.CategoricalCrossentropy(from_logits=True),
              metrics=[tf.keras.metrics.SparseCategoricalAccuracy()])


# Callbacks
checkpoint = callbacks.ModelCheckpoint(
    _PATH_TO_MODEL, save_best_only=True, monitor='val_loss', mode='min')

reduce_lr = callbacks.ReduceLROnPlateau(
    monitor='val_loss', factor=0.1, patience=2, verbose=1, min_lr=1e-5, mode='min')

early_stopping = callbacks.EarlyStopping(verbose=1, patience=3)

tensorboard = callbacks.TensorBoard(
    log_dir=_PATH_TO_RESULTS+'/logs', histogram_freq=1)


history = model.fit(
    train_ds, 
    validation_data=val_ds,  
    epochs=_EPOCHS,
    callbacks=[checkpoint, reduce_lr, early_stopping, tensorboard])

Input shape: (10, 20, 1)
Label shape: (1, 12)
Model: "sequential_43"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
resizing_36 (Resizing)       (None, 32, 32, 1)         0         
_________________________________________________________________
conv2d_121 (Conv2D)          (None, 30, 30, 32)        320       
_________________________________________________________________
conv2d_122 (Conv2D)          (None, 28, 28, 64)        18496     
_________________________________________________________________
max_pooling2d_59 (MaxPooling (None, 14, 14, 64)        0         
_________________________________________________________________
dropout_93 (Dropout)         (None, 14, 14, 64)        0         
_________________________________________________________________
flatten_49 (Flatten)         (None, 12544)             0         
_________________________________________________________________
dense_1

ValueError: in user code:

    /home/marcus/Documents/Projects/Speech-Recognition/env/lib/python3.8/site-packages/tensorflow/python/keras/engine/training.py:806 train_function  *
        return step_function(self, iterator)
    /home/marcus/Documents/Projects/Speech-Recognition/env/lib/python3.8/site-packages/tensorflow/python/keras/engine/training.py:796 step_function  **
        outputs = model.distribute_strategy.run(run_step, args=(data,))
    /home/marcus/Documents/Projects/Speech-Recognition/env/lib/python3.8/site-packages/tensorflow/python/distribute/distribute_lib.py:1211 run
        return self._extended.call_for_each_replica(fn, args=args, kwargs=kwargs)
    /home/marcus/Documents/Projects/Speech-Recognition/env/lib/python3.8/site-packages/tensorflow/python/distribute/distribute_lib.py:2585 call_for_each_replica
        return self._call_for_each_replica(fn, args, kwargs)
    /home/marcus/Documents/Projects/Speech-Recognition/env/lib/python3.8/site-packages/tensorflow/python/distribute/distribute_lib.py:2945 _call_for_each_replica
        return fn(*args, **kwargs)
    /home/marcus/Documents/Projects/Speech-Recognition/env/lib/python3.8/site-packages/tensorflow/python/keras/engine/training.py:789 run_step  **
        outputs = model.train_step(data)
    /home/marcus/Documents/Projects/Speech-Recognition/env/lib/python3.8/site-packages/tensorflow/python/keras/engine/training.py:748 train_step
        loss = self.compiled_loss(
    /home/marcus/Documents/Projects/Speech-Recognition/env/lib/python3.8/site-packages/tensorflow/python/keras/engine/compile_utils.py:204 __call__
        loss_value = loss_obj(y_t, y_p, sample_weight=sw)
    /home/marcus/Documents/Projects/Speech-Recognition/env/lib/python3.8/site-packages/tensorflow/python/keras/losses.py:149 __call__
        losses = ag_call(y_true, y_pred)
    /home/marcus/Documents/Projects/Speech-Recognition/env/lib/python3.8/site-packages/tensorflow/python/keras/losses.py:253 call  **
        return ag_fn(y_true, y_pred, **self._fn_kwargs)
    /home/marcus/Documents/Projects/Speech-Recognition/env/lib/python3.8/site-packages/tensorflow/python/util/dispatch.py:201 wrapper
        return target(*args, **kwargs)
    /home/marcus/Documents/Projects/Speech-Recognition/env/lib/python3.8/site-packages/tensorflow/python/keras/losses.py:1535 categorical_crossentropy
        return K.categorical_crossentropy(y_true, y_pred, from_logits=from_logits)
    /home/marcus/Documents/Projects/Speech-Recognition/env/lib/python3.8/site-packages/tensorflow/python/util/dispatch.py:201 wrapper
        return target(*args, **kwargs)
    /home/marcus/Documents/Projects/Speech-Recognition/env/lib/python3.8/site-packages/tensorflow/python/keras/backend.py:4687 categorical_crossentropy
        target.shape.assert_is_compatible_with(output.shape)
    /home/marcus/Documents/Projects/Speech-Recognition/env/lib/python3.8/site-packages/tensorflow/python/framework/tensor_shape.py:1134 assert_is_compatible_with
        raise ValueError("Shapes %s and %s are incompatible" % (self, other))

    ValueError: Shapes (1, 12) and (10, 12) are incompatible


In [None]:
tf.saved_model.save(model, _PATH_TO_MODEL)


# Plot results
plt.figure(figsize=(20.0, 10.0))
plt.suptitle('{}'.format(model.name))
plt.subplot(1, 2, 1, label='Loss plot')
plt.plot(np.arange(1, len(history.history['loss'])+1), history.history['loss'])
plt.plot(
    np.arange(1, len(history.history['val_loss'])+1), history.history['val_loss'])
plt.ylabel('Loss')
plt.xlabel('Epoch')
plt.legend(['Training set', 'Validation set'], loc='upper left')

plt.subplot(1, 2, 2, label='Accuracy plot')
plt.plot(np.arange(
    1, len(history.history['acc'])+1), history.history['acc'])
plt.plot(np.arange(
    1, len(history.history['val_acc'])+1), history.history['val_acc'])
plt.ylabel('Accuracy')
plt.xlabel('Epoch')
plt.legend(['Training set', 'Validation set'], loc='upper left')
plt.savefig(_PATH_TO_RESULTS+'/images/training_process.png')


print('\nEvaluating model on test set...')
predictions = model.predict(
    test_ds, epochs=1, batch_size=1, verbose=1)
