# Voice User Interfaces: Speech Recognition with Neural Networks


In [26]:
from data_generator import vis_train_features

# extract label and audio features for a single training example
vis_text, vis_raw_audio, vis_mfcc_feature, vis_spectrogram_feature, vis_audio_path = vis_train_features()

There are 2023 total training examples.


In [2]:
from keras import backend as K
from keras.models import Model, Sequential
from keras.layers import BatchNormalization, Conv1D, Dense, Input, Dropout, TimeDistributed, Activation, Bidirectional, SimpleRNN, GRU, LSTM

Using TensorFlow backend.


In [3]:
def simple_rnn_model(input_dim, 
                     output_dim = 29):

    input_data = Input(shape = (None, input_dim),
                       name = 'input')
    
    all_hidden = GRU(output_dim, 
                     return_sequences = True, 
                     implementation = 2,       # for hardware application
                     name = 'rnn')(input_data)
    
    y_pred = Activation('softmax', 
                        name ='softmax')(all_hidden)
    
    model = Model(inputs = input_data, 
                  outputs = y_pred)
    
    model.output_length = lambda x: x
    
    print(model.summary())
    
    return model


model_0 = simple_rnn_model(input_dim = 161)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
the_input (InputLayer)       (None, None, 161)         0         
_________________________________________________________________
rnn (GRU)                    (None, None, 29)          16617     
_________________________________________________________________
softmax (Activation)         (None, None, 29)          0         
Total params: 16,617
Trainable params: 16,617
Non-trainable params: 0
_________________________________________________________________
None


In [4]:
from train_utils import train_model

train_model(input_to_softmax = model_0, 
            pickle_path = 'model_0.pickle', 
            save_model_path = 'model_0.h5',
            spectrogram = True) # change to False if you would like to use MFCC features

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


## Model 1: RNN + TimeDistributed Dense

<img src="images/rnn_model.png" width="50%">


<img src="images/rnn_model_unrolled.png" width="50%">

In [1]:
def rnn_model(input_dim, 
              units, 
              activation, 
              output_dim = 29):
    
    input_data = Input(name = 'input', 
                       shape = (None, input_dim))
    
    all_hidden = GRU(units = units, 
                     activation = activation,
                     return_sequences = True, 
                     implementation = 2, 
                     name = 'rnn')(input_data)
    
    bn_all_hidden = BatchNormalization()(all_hidden)

    logits = TimeDistributed(Dense(output_dim),
                             name = "dense_layer")(bn_all_hidden)
    
    y_pred = Activation('softmax', name='softmax')(logits)
    
    model = Model(inputs = input_data, 
                  outputs = y_pred)
    
    model.output_length = lambda x: x
    
    print(model.summary())
    
    return model


model_1 = rnn_model(input_dim = 161, # change to 13 if you would like to use MFCC features
                    units = 200,
                    activation='relu')

Using TensorFlow backend.


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
the_input (InputLayer)       (None, None, 161)         0         
_________________________________________________________________
rnn (GRU)                    (None, None, 200)         217200    
_________________________________________________________________
batch_normalization_1 (Batch (None, None, 200)         800       
_________________________________________________________________
dense_layer (TimeDistributed (None, None, 29)          5829      
_________________________________________________________________
softmax (Activation)         (None, None, 29)          0         
Total params: 223,829
Trainable params: 223,429
Non-trainable params: 400
_________________________________________________________________
None


In [2]:
from train_utils import train_model

train_model(input_to_softmax = model_1, 
            pickle_path = 'model_1.pickle', 
            save_model_path = 'model_1.h5',
            spectrogram = True) # change to False if you would like to use MFCC features

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


## Model 2: 1D CNN (over temporal dimension) + RNN + TimeDistributed Dense

<img src="images/cnn_rnn_model.png" width="100%">

In [14]:
def cnn_rnn_model(input_dim, 
                  filters, 
                  kernel_size, 
                  conv_stride,
                  conv_border_mode, 
                  units, 
                  output_dim=29):
    
    input_data = Input(name='input', shape=(None, input_dim))
    
    conv_1d = Conv1D(filters, 
                     kernel_size, 
                     strides = conv_stride, 
                     padding = conv_border_mode,
                     activation = 'relu',
                     name = 'conv1d')(input_data)
    
    bn_cnn = BatchNormalization(name='bn_conv_1d')(conv_1d)
    
    simp_rnn = GRU(units, 
                   activation='relu',
                   return_sequences=True, 
                   implementation=2, 
                   name='rnn')(bn_cnn)
    
    bn_rnn = BatchNormalization()(simp_rnn)
    
    time_dense = TimeDistributed(Dense(output_dim))(bn_rnn)
    
    y_pred = Activation('softmax', 
                        name='softmax')(time_dense)
    
    model = Model(inputs=input_data, outputs=y_pred)
    
    model.output_length = lambda x: cnn_output_length(x, 
                                                      kernel_size, 
                                                      conv_border_mode, 
                                                      conv_stride)
    
    print(model.summary())
    
    return model


def cnn_output_length(input_length, 
                      filter_size, 
                      border_mode, 
                      stride,
                      dilation = 1):
    """ Computes the length of the output sequence after 1D convolution along time
    Params:
        input_length (int): Length of the input sequence.
        filter_size (int): Width of the convolution kernel.
        border_mode (str): Only support `same` or `valid`.
        stride (int): Stride size used in 1D convolution.
        dilation (int)
    """
    if input_length is None:
        return None
    
    assert border_mode in {'same', 'valid'}
    
    dilated_filter_size = filter_size + (filter_size - 1) * (dilation - 1)
    
    if border_mode == 'same':
        output_length = input_length
    
    elif border_mode == 'valid':
        output_length = input_length - dilated_filter_size + 1

    return (output_length + stride - 1) // stride


model_2 = cnn_rnn_model(input_dim=161, # change to 13 if you would like to use MFCC features
                        filters=200,
                        kernel_size=11, 
                        conv_stride=2,
                        conv_border_mode='valid',
                        units=200)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
the_input (InputLayer)       (None, None, 161)         0         
_________________________________________________________________
conv1d (Conv1D)              (None, None, 200)         354400    
_________________________________________________________________
bn_conv_1d (BatchNormalizati (None, None, 200)         800       
_________________________________________________________________
rnn (GRU)                    (None, None, 200)         240600    
_________________________________________________________________
batch_normalization_1 (Batch (None, None, 200)         800       
_________________________________________________________________
time_distributed_1 (TimeDist (None, None, 29)          5829      
_________________________________________________________________
softmax (Activation)         (None, None, 29)          0         
Total para

In [5]:
from train_utils import train_model

train_model(input_to_softmax=model_2, 
            pickle_path='model_2.pickle', 
            save_model_path='model_2.h5', 
            spectrogram=True) # change to False if you would like to use MFCC features

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


## Model 3: Deeper RNN + TimeDistributed Dense

<img src="images/deep_rnn_model.png" width="80%">

In [36]:
def deep_rnn_model(input_dim, 
                   units, 
                   recur_layers, 
                   output_dim = 29):

    input_data = Input(name='the_input', 
                       shape=(None, input_dim))
    
    rnn_model = Sequential()
    
    for i in range(recur_layers):
        rnn_model.add(GRU(units,
                          return_sequences = True,
                          implementation = 2,
                          name = "rnn_" + str(i+1),
                          input_shape=(None, input_dim)))
        
        rnn_model.add(BatchNormalization())
        
    all_hidden = rnn_model(input_data)
    
    time_dense = TimeDistributed(Dense(output_dim))(all_hidden)
    
    y_pred = Activation('softmax', 
                        name='softmax')(time_dense)
    
    model = Model(inputs=input_data, outputs=y_pred)
    
    model.output_length = lambda x: x
    
    print(model.summary())
    
    return model


model_3 = deep_rnn_model(input_dim = 161, # change to 13 if you would like to use MFCC features
                         units = 200,
                         recur_layers = 2) 

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
the_input (InputLayer)       (None, None, 161)         0         
_________________________________________________________________
sequential_1 (Sequential)    (None, None, 200)         459400    
_________________________________________________________________
time_distributed_9 (TimeDist (None, None, 29)          5829      
_________________________________________________________________
softmax (Activation)         (None, None, 29)          0         
Total params: 465,229
Trainable params: 464,429
Non-trainable params: 800
_________________________________________________________________
None


In [3]:
from train_utils import train_model

train_model(input_to_softmax=model_3, 
            pickle_path='model_3.pickle', 
            save_model_path='model_3.h5', 
            spectrogram=True) # change to False if you would like to use MFCC features

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


## Model 4: Bidirectional RNN + TimeDistributed Dense

<img src="images/bidirectional_rnn_model.png" width="80%">

In [4]:
def bidirectional_rnn_model(input_dim, 
                            units, 
                            output_dim = 29):
    
    input_data = Input(name = 'the_input', 
                       shape = (None, input_dim))
    
    bidir_rnn = Bidirectional(GRU(units,
                                  return_sequences = True,
                                  implementation = 2,
                                  name = 'rnn'))(input_data)
    
    time_dense = TimeDistributed(Dense(output_dim))(bidir_rnn)
    
    y_pred = Activation('softmax', name='softmax')(time_dense)
    
    model = Model(inputs=input_data, outputs=y_pred)
    
    model.output_length = lambda x: x
    
    print(model.summary())
    
    return model


model_4 = bidirectional_rnn_model(input_dim=161, # change to 13 if you would like to use MFCC features
                                  units=200)

Using TensorFlow backend.


_________________________________________________________________
Layer (type)                 Output Shape              Param #   
the_input (InputLayer)       (None, None, 161)         0         
_________________________________________________________________
bidirectional_1 (Bidirection (None, None, 400)         434400    
_________________________________________________________________
time_distributed_1 (TimeDist (None, None, 29)          11629     
_________________________________________________________________
softmax (Activation)         (None, None, 29)          0         
Total params: 446,029
Trainable params: 446,029
Non-trainable params: 0
_________________________________________________________________
None


In [6]:
from train_utils import train_model

train_model(input_to_softmax=model_4, 
            pickle_path='model_4.pickle', 
            save_model_path='model_4.h5', 
            spectrogram=True) # change to False if you would like to use MFCC features

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


# Comments 

Model_2 **(CNN + BDRNN)** has the **Lowest TRAINING Loss**. This explains that 1D Temporal Convolution really helped to extract existing patterns in the audio.

Model_3 **(Deep RNN + TimeDistributed Dense)** has the **Lowest VALIDATION Loss**. This exaplains that having a stack of RNNs with Batch Normalization in between greatly improves the learning of temporal patterns. Considering that Model_4 **(Bi-directional RNN + TimeDistributed Dense)** performed quite well too, it would be a good idea to stack multiple bi-directional RNNs together for the final model. 

# Final Model

# CNNs  ->  Bidirectional LSTMs  ->  FCs
I will try 2 versions of the final model, and use the one with better results for prediction.
## (version 1) for CNNs: Batch Normalization -> Dropout -> Activation

In [4]:
from ASR_model import ASR_network

# If you are running the notebook not as the author, you have to adjsut the code in ASR_model to changed
# the order of CNNs as Batch Normalization -> Dropout -> Activation
model_end = ASR_network(n_input_channels = 161,
                        n_cnn_filters = 200,
                        kernel_size = 11, 
                        stride = 2, 
                        padding_mode = 'valid',
                        dilation = 1,
                        cnn_dropout = 0.3,
                        n_bdrnn_layers = 2,
                        n_hidden_rnn = 200,
                        input_dropout = 0.3,      # dropout values referenced from: 
                        recurrent_dropout = 0.1,  # https://machinelearningmastery.com/use-dropout-lstm-networks-time-series-forecasting/
                        rnn_merge_mode = 'sum',
                        fc_n_hiddens = [200],
                        fc_dropout = 0.3,
                        output_dim = 29)

# CNN: Batch Normalization -> Dropout -> Activation

______________________________________________________________________________________________________________
Layer (type)                                     Output Shape                                Param #          
the_input (InputLayer)                           (None, None, 161)                           0                
______________________________________________________________________________________________________________
cnn (Conv1D)                                     (None, None, 200)                           354400           
______________________________________________________________________________________________________________
dropout_cnn (Dropout)                            (None, None, 200)                           0                
______________________________________________________________________________________________________________
bn_cnn (BatchNormalization)                      (None, None, 200)                           800              
_

Please execute the code cell below to train the neural network you specified in `input_to_softmax`.  After the model has finished training, the model is [saved](https://keras.io/getting-started/faq/#how-can-i-save-a-keras-model) in the HDF5 file `model_final.h5`.  The loss history is [saved](https://wiki.python.org/moin/UsingPickle) in `model_final.pickle`.  You are welcome to tweak any of the optional parameters while calling the `train_model` function, but this is not required.

In [4]:
from train_utils import train_model

train_model(input_to_softmax = model_end, 
            n_epochs = 20,
            pickle_path = 'model_final.pickle', 
            save_model_path = 'model_final.h5',
            spectrogram = True) # change to False if you would like to use MFCC features

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


## (version 2) for CNNs: Activation -> Dropout -> Batch Normalization

In [14]:
from sample_models import *
# specify the model
model_end = final_model(n_input_channels = 161,
                        n_cnn_filters = 200,
                        kernel_size = 11, 
                        stride = 2, 
                        padding_mode = 'valid',
                        dilation = 1,
                        cnn_dropout = 0.3,
                        n_bdrnn_layers = 2,
                        n_hidden_rnn = 200,
                        input_dropout = 0.3,      # dropout values referenced from: 
                        recurrent_dropout = 0.1,  # https://machinelearningmastery.com/use-dropout-lstm-networks-time-series-forecasting/
                        rnn_merge_mode = 'sum',
                        fc_n_hiddens = [200],
                        fc_dropout = 0.3,
                        output_dim = 29)

# CNN: Activation -> Dropout -> Batch Normalization

______________________________________________________________________________________________________________
Layer (type)                                     Output Shape                                Param #          
the_input (InputLayer)                           (None, None, 161)                           0                
______________________________________________________________________________________________________________
cnn (Conv1D)                                     (None, None, 200)                           354400           
______________________________________________________________________________________________________________
dropout_cnn (Dropout)                            (None, None, 200)                           0                
______________________________________________________________________________________________________________
bn_cnn (BatchNormalization)                      (None, None, 200)                           800              
_

In [2]:
from train_utils import train_model

train_model(input_to_softmax = model_end, 
            n_epochs = 20,
            pickle_path = 'model_final_20_epochs.pickle', 
            save_model_path = 'model_final_20_epochs.h5',
            spectrogram = True) # change to False if you would like to use MFCC features

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


**It seems like version 2 performed slightly better. Thus, I will train it for 10 more epochs.**

## Train for 10 more epochs:

In [2]:
model_end.load_weights('results/model_final_20_epochs.h5')

In [3]:
from train_utils import train_model

train_model(input_to_softmax = model_end, 
            n_epochs = 10,
            pickle_path = 'model_final_30_epochs.pickle', 
            save_model_path = 'model_final_30_epochs.h5',
            spectrogram = True)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


**Try training a little more with a smaller learning rate.**

In [15]:
model_end.load_weights('results/model_final_30_epochs.h5')

train_model(input_to_softmax = model_end, 
            n_epochs = 10,
            pickle_path = 'model_final_40_epochs.pickle', 
            save_model_path = 'model_final_40_epochs.h5',
            spectrogram = True,
            lr = 0.01)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


**Try training a little more with even smaller learning rate.**

In [17]:
from train_utils import train_model
model_end.load_weights('results/model_final_40_epochs.h5')

train_model(input_to_softmax = model_end, 
            n_epochs = 3,
            pickle_path = 'model_final_43_epochs.pickle', 
            save_model_path = 'model_final_43_epochs.h5',
            spectrogram = True,
            lr = 0.003)

Epoch 1/3
Epoch 2/3
Epoch 3/3


Maybe this is enough since we don't want to overfit, and I am observing that the training loss continues to decrease steadily while the validation loss is going up and down even with decreased learning rate. 

# Comments

My final model has the following architecture:

1. **Input**: Spectrograms 


2. **One 1-D CNN Layer + Batch Normalization**:
    * number of filters : 200
    * activation : ReLU
    * kernel_size : 11
    * stride : 2
    * dilation : 1
    * dropout : 30%
    * order: Activation -> Dropout -> Batch Normalization
    
    
3. **Two Bi-directional LSTM Layers + Batch Normalizations**:
    * number of hidden nodes : 200
    * activation : Tanh
    * input dropout : 30%
    * recurrent cells dropout: 10%
    * order: Activation -> Dropout -> Batch Normalization
    
    
4. **First FC Layer**:
    * number of hidden nodes: 200
    * activation : ReLU
    * dropout: 30%
    
    
5. **Second FC Layer (for Output)**:
    * number of hidden nodes: 29
    * activation : Softmax


Notes:
* I used Spectrograms instead of MFCC as inputs to give 1-D CNN layer more features to work with.

* I used Dropout for both CNN and RNN layers in order to prevent overfitting.

* I only used one CNN layer and two BD-RNN layers considering the relatively small dataset size. 

Several things I learnt while building the model:
* For RNNs' Activation Function, **Tanh** must be used and NOT ReLU. When I used ReLU, the training & validation loss was much higher for Epoch 1 and 2, and barely decreased after Epoch 2.

* After CNN and RNN layers, **Batch Normalization** MUST be used in order to prevent exploding gradient and the losses being displayed as "nan".

* Although my model dimensions were quite humble (only one CNN layer & two BD RNN layers, all with 200 hidden nodes), the whole training process of 30 epochs almost took 4 hours. 

<a id='step3'></a>
## STEP 3: Obtain Predictions

We have written a function for you to decode the predictions of your acoustic model.  To use the function, please execute the code cell below.

In [112]:
import numpy as np
from data_generator import AudioGenerator
from keras import backend as K
from utils import int_sequence_to_text
from IPython.display import Audio

def get_predictions(index, partition, input_to_softmax = model_end, model_path = 'results/model_final_43_epochs.h5'):
    """ Print a model's decoded predictions
    Params:
        index (int): The example you would like to visualize
        partition (str): One of 'train' or 'validation'
        input_to_softmax (Model): The acoustic model
        model_path (str): Path to saved acoustic model's weights
    """
    # load the train and test data
    data_gen = AudioGenerator()
    data_gen.load_train_data()
    data_gen.load_validation_data()
    
    # obtain the true transcription and the audio features 
    if partition == 'validation':
        transcr = data_gen.valid_texts[index]
        audio_path = data_gen.valid_audio_paths[index]
        data_point = data_gen.normalize(data_gen.featurize(audio_path))
    elif partition == 'train':
        transcr = data_gen.train_texts[index]
        audio_path = data_gen.train_audio_paths[index]
        data_point = data_gen.normalize(data_gen.featurize(audio_path))
    else:
        raise Exception('Invalid partition!  Must be "train" or "validation"')
        
    # obtain and decode the acoustic model's predictions
    input_to_softmax.load_weights(model_path)
    prediction = input_to_softmax.predict(np.expand_dims(data_point, axis=0))
    output_length = [input_to_softmax.output_length(data_point.shape[0])] 
    pred_ints = (K.eval(K.ctc_decode(
                prediction, output_length)[0][0])+1).flatten().tolist()
    
    # play the audio file, and display the true and predicted transcriptions
    print('-'*80)
    Audio(audio_path)
    print('True transcription:\n' + '\n' + transcr)
    print('-'*80)
    print('Predicted transcription:\n' + '\n' + ''.join(int_sequence_to_text(pred_ints)))
    print('-'*80)

Use the code cell below to obtain the transcription predicted by your final model for the first example in the training dataset.

In [113]:
get_predictions(index = 0, partition = 'train')
get_predictions(index = 1000, partition = 'train')
get_predictions(index = 999, partition = 'validation')

--------------------------------------------------------------------------------
True transcription:

her father is a most remarkable person to say the least
--------------------------------------------------------------------------------
Predicted transcription:

her fother s a mos ere markcabl person to sa the last
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
True transcription:

he gave thanks for our food and comfort and prayed for the poor and destitute in great cities where the struggle for life was harder than it was here with us
--------------------------------------------------------------------------------
Predicted transcription:

he gave thaingk s for ar foodant comfeirt and prade fon the por indis tetu in gra sites whe the strole for lie was harder then it was here withtust
--------------------------------------------------------------------------------
----

One standard way to improve the results of the decoder is to incorporate a language model.  We won't pursue this in the notebook, but you are welcome to do so as an _optional extension_. 

If you are interested in creating models that provide improved transcriptions, you are encouraged to download [more data](http://www.openslr.org/12/) and train bigger, deeper models.  But beware - the model will likely take a long while to train.  For instance, training this [state-of-the-art](https://arxiv.org/pdf/1512.02595v1.pdf) model would take 3-6 weeks on a single GPU!

# Enhancement 1: Spell Correction
## (1.1) Spell Correction using the Training Corpus

In [111]:
import re
from collections import Counter
import numpy as np

from data_generator import AudioGenerator

corpus = AudioGenerator()
corpus.load_train_data("train_corpus.json")
corpus_texts = corpus.train_texts
corpus_tokens = np.concatenate(np.array([sentence.lower().split() for sentence in corpus_texts]))
token_counter = Counter(corpus_tokens)
print("most common words: ", token_counter.most_common(5))

most common words:  [('the', 1851), ('and', 932), ('of', 828), ('to', 803), ('a', 687)]


In [42]:
# Spell checker by Peter Norvig (http://norvig.com/spell-correct.html)
# + my addition of vowel_replaces & conso_replaces for edits_1 function
def word_probability(word):
    return token_counter[word]/sum(token_counter.values())


def edits_1(word):
    ''' Performs ONE of deletion, transposition, replacement, or insertion to the given word '''
    letters = "abcdefghijklmnopqrstuvwxyz"
    vowels = "aeiouy"
    consonants = "bcdfghjklmnpqrstvwxz"
    splits = [(word[1:i], word[i:]) for i in range(len(word) + 1)]
    
    deletes    = [word[0] + L + R[1:]               for L, R in splits if R]
    transposes = [word[0] + L + R[1] + R[0] + R[2:] for L, R in splits if len(R) > 1]
    vowel_replaces = [word[0] + L + v + R[1:]       for L, R in splits if (R and R[0] in vowels) for v in vowels]        
    conso_replaces = [word[0] + L + c + R[1:]       for L, R in splits if (R and R[0] in consonants) for c in consonants]                                                              
    inserts    = [word[0] + L + ch + R              for L, R in splits for ch in letters]

    return set(deletes + transposes + vowel_replaces + conso_replaces + inserts)


def edits_2(word):
    ''' Performs TWO of deletion, transposition, replacement, or insertion to the given word '''
    edits_1_words = edits_1(word)
    edits_2 = set()
    for e1 in edits_1_words:
        for e2 in edits_1(e1):
            edits_2.add(e2)
    
    return edits_2


def existing_words(editted_words):
    ''' Returns a subset of editted words that exist in train corpus '''
    return set(w for w in editted_words if w in token_counter)


def candidates(word):
    ''' Returns possible spelling corrections for the given word: 
        A or B or C: 
            = A, if A is not an empty set 
            = B, if A is an empty set
            = C, if A and B are empty sets
        Thus, returns:
            original word, if it exists in corpus (thus has correct spelling), otherwise
            words editted once, if they exists in corpus, otherwise
            word editted twice, if they exists in corpus, otherwise
            original word, although it does not exist in corpus
    '''
    
    return  existing_words([word]) or \
            existing_words(edits_2(word)) or \
            existing_words(edits_1(word)) or \
            [word]                               


def correction(word):
    ''' Returns the most probabale spelling correction of the given word'''

    return max(candidates(word), key = word_probability)

## **Adjust get_predictions function to include the spell correction:**



In [107]:
import numpy as np
from data_generator import AudioGenerator
from keras import backend as K
from utils import int_sequence_to_text
from IPython.display import Audio

def get_predictions(index, partition, ASR_model = model_end, model_path = 'results/model_final_43_epochs.h5'):
    """ Print a model's decoded predictions
    Params:
        index (int): sample index of training or validation set
        partition (str): One of 'train' or 'validation'
        ASR_model (Model): The acoustic model
        model_path (str): Path to saved acoustic model's weights
    """
    # 1. Load the train and validation data
    data_gen = AudioGenerator()
    data_gen.load_train_data()
    data_gen.load_validation_data()
    
    # 2. Obtain the true transcription and the audio features 
    if partition == 'train':
        true_label = data_gen.train_texts[index]
        audio_path = data_gen.train_audio_paths[index]
        data_point = data_gen.normalize(data_gen.featurize(audio_path))
        
    elif partition == 'validation':
        true_label = data_gen.valid_texts[index]
        audio_path = data_gen.valid_audio_paths[index]
        data_point = data_gen.normalize(data_gen.featurize(audio_path))       
    
    else:
        raise Exception('Invalid partition!  Must be "train" or "validation"')
        
    # 3. Obtain and decode the acoustic model's predictions
    ASR_model.load_weights(model_path)
    prediction = ASR_model.predict(np.expand_dims(data_point, axis=0))  # give a batch size of 1
    output_length = [ASR_model.output_length(data_point.shape[0])] 
    pred_ints = (K.eval(K.ctc_decode(prediction, output_length)[0][0]) + 1).flatten().tolist()
    pred_words = ''.join(int_sequence_to_text(pred_ints)).split()
    
    # 4. Perform Spelling Correction
    corrected_str = [correction(word) for word in pred_words]
    
    # 5. play the audio file, and display the true and predicted transcriptions
    print('-'*80)
    Audio(audio_path)
    print('True transcription:\n' + '\n' + true_label)
    print('-'*80)
    print('Predicted transcription: (Original -- Spell Correction)\n')
    print(' '.join(pred_words))
    print(' '.join(corrected_str))
    print('-'*80)

## Predict with spell correction:

In [108]:
get_predictions(index = 0, partition = 'train')
get_predictions(index = 1000, partition = 'train')
get_predictions(index = 999, partition = 'validation')

--------------------------------------------------------------------------------
True transcription:

her father is a most remarkable person to say the least
--------------------------------------------------------------------------------
Predicted transcription: (Original -- Spell Correction)

her fother s a mos ere markcabl person to sa the last
her father s a my ere markcabl person to she the last
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
True transcription:

he gave thanks for our food and comfort and prayed for the poor and destitute in great cities where the struggle for life was harder than it was here with us
--------------------------------------------------------------------------------
Predicted transcription: (Original -- Spell Correction)

he gave thaingk s for ar foodant comfeirt and prade fon the por indis tetu in gra sites whe the strole for lie was ha

## (1.2) Spell Correction using Brown Corpus

Brown Corpus could be used instead if we have a smaller training data, since new speech could contain word that do not exist in training set vocabulary. However, if the training data was from a niche corpus where the vocabulary must be limited (e.g. medical document), using a general language corpus such as Brown Corpus for spelling correction might lead to incorrect results. 

When I experimented with Spelling Correction using either Training Corpus and Brown Corpus along with the 2 other enhancements below, it seemed that **using Training Corpus led to more accurate results.**

In [27]:
import nltk
from nltk.corpus import brown

nltk.download('brown')

[nltk_data] Downloading package brown to /root/nltk_data...
[nltk_data]   Unzipping corpora/brown.zip.


True

In [109]:
from collections import Counter

brown_corpus = brown.words(categories = 'adventure') + brown.words(categories='romance') + brown.words(categories='fiction')
token_counter = Counter(brown_corpus)

In [110]:
get_predictions(index = 0, partition = 'train')
get_predictions(index = 1000, partition = 'train')
get_predictions(index = 999, partition = 'validation')

--------------------------------------------------------------------------------
True transcription:

her father is a most remarkable person to say the least
--------------------------------------------------------------------------------
Predicted transcription: (Original -- Spell Correction)

her fother s a mos ere markcabl person to sa the last
her father she a me even markcabl person to she the last
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
True transcription:

he gave thanks for our food and comfort and prayed for the poor and destitute in great cities where the struggle for life was harder than it was here with us
--------------------------------------------------------------------------------
Predicted transcription: (Original -- Spell Correction)

he gave thaingk s for ar foodant comfeirt and prade fon the por indis tetu in gra sites whe the strole for lie was

## Comments:
The result still looks very bad. The corrected sentences do not make sense at all both globally and locally. This could be improved by looking at neighbouring words' part of speech, which gives a phrase a logical "sense" for human ears.

# Enhancement 2: POS tagging 
### Suggest a better word choice for spell correction, looking at the **PREVIOUS WORD's Part of Speech**

In [31]:
import nltk
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


True

## Bigram_tags model : 
### Bigram for POS tags (i.e. gives a probability of the current word's tag, given the previous word's tag)

In [32]:
from nltk import bigrams
from collections import Counter, defaultdict

# 1. Create a bigram model for POS tags, base on Training Corpus
bigram_tags = defaultdict(lambda: defaultdict(lambda: 0))

# (1.1) Record bigram counts in a dict
for sentence in corpus_texts:
    tags = [nltk.pos_tag((w.split()))[0][1] for w in sentence.split()]
    for t1, t2 in bigrams(tags):
        bigram_tags[t1][t2] += 1
        
# (1.2) Transform the counts into probabilities
for t1 in bigram_tags:
    total_count = float(sum(bigram_tags[t1].values()))
    for t2 in bigram_tags[t1]:
        bigram_tags[t1][t2] /= total_count

## Adjust candidates & correction function to include POS tagging:

In [105]:
def candidates_POS(word, prev_tag=None):
    if existing_words([word]):  # if predicted word appears in the corpus
        if prev_tag is None:
            return [word]
        else:
            tag = nltk.pos_tag((word.split()))[0][1]
            if bigram_tags[prev_tag][tag] > 0.01:  # if the word has a reasonable tag, considering previous 2 words' tags
                return [word]
            
    one_edits = existing_words(edits_1(word))
    if one_edits:
        if prev_tag is None:
            return one_edits
        else:
            tags = [(w, nltk.pos_tag((w.split()))[0][1]) for w in one_edits]
            bigrams = [(w, bigram_tags[prev_tag][tag]) for (w, tag) in tags]
            logical_candidates = [pair[0] for pair in bigrams if pair[1] > 0.01]
            if logical_candidates:
                return logical_candidates

    two_edits = existing_words(edits_2(word))
    if two_edits:
        if prev_tag is None:
            return two_edits
        else:
            tags = [(w, nltk.pos_tag((w.split()))[0][1]) for w in two_edits]
            bigrams = [(w, bigram_tags[prev_tag][tag]) for (w, tag) in tags]
            logical_candidates = [pair[0] for pair in bigrams if pair[1] > 0.01]
            if logical_candidates:
                return logical_candidates
    
    return [word]   

        
def correction_POS(words):
    ''' Returns the most probabale spelling correction of the given sentence (words)'''
    corrected_sentence = []    
    for word_i, word in enumerate(words): 
        # Current word's POS tag:
        tag = nltk.pos_tag((word.split()))[0][1]
        # Don't apply POS tagging selection to the first word
        if word_i == 0:
            prev_tag = None    

        word_candidates = candidates_POS(word, prev_tag=prev_tag)
        next_word = max(word_candidates, key = word_probability)
        corrected_sentence.append(next_word) 
        prev_tag = nltk.pos_tag((next_word.split()))[0][1]
            
    return corrected_sentence

## Adjust get_predictions function to include POS tagging:

In [84]:
import numpy as np
from data_generator import AudioGenerator
from keras import backend as K
from utils import int_sequence_to_text
from IPython.display import Audio

def get_predictions(index, partition, ASR_model = model_end, model_path = 'results/model_final_43_epochs.h5'):
    """ Print a model's decoded predictions
    Params:
        index (int): sample index of training or validation set
        partition (str): One of 'train' or 'validation'
        ASR_model (Model): The acoustic model
        model_path (str): Path to saved acoustic model's weights
    """
    # 1. Load the train and validation data
    data_gen = AudioGenerator()
    data_gen.load_train_data()
    data_gen.load_validation_data()
    
    # 2. Obtain the true transcription and the audio features 
    if partition == 'train':
        true_label = data_gen.train_texts[index]
        audio_path = data_gen.train_audio_paths[index]              
        
    elif partition == 'validation':
        true_label = data_gen.valid_texts[index]
        audio_path = data_gen.valid_audio_paths[index]      
    
    else:
        raise Exception('Invalid partition!  Must be "train" or "validation"')
    
    data_point = data_gen.normalize(data_gen.featurize(audio_path)) 
    
    # 3. Obtain and decode the acoustic model's predictions
    ASR_model.load_weights(model_path)
    prediction = ASR_model.predict(np.expand_dims(data_point, axis=0))  # give a batch size of 1
    output_length = [ASR_model.output_length(data_point.shape[0])] 
    pred_ints = (K.eval(K.ctc_decode(prediction, output_length)[0][0]) + 1).flatten().tolist()
    pred_words = ''.join(int_sequence_to_text(pred_ints)).split()
    
    # 4. Perform Spelling Correction (just for reference)
    corrected_str = [correction(word) for word in pred_words]
    
    # 5. Perform Spelling Correction & POS Tagging
    corrected_str_POS = correction_POS(pred_words)
    
    # 6. play the audio file, and display the true and predicted transcriptions
    print('-'*80)
    Audio(audio_path)
    print('True transcription:\n' + '\n' + true_label)
    print('-'*80)
    print('Predicted transcription: (Original -- Spell Correction -- Spell Correction, POS tagging)\n')
    print(' '.join(pred_words))
    print(' '.join(corrected_str))
    print(' '.join(corrected_str_POS))
    print('-'*80)

## Get Predictions:

In [106]:
get_predictions(index = 0, partition = 'train')
get_predictions(index = 1000, partition = 'train')
get_predictions(index = 999, partition = 'validation')

--------------------------------------------------------------------------------
True transcription:

her father is a most remarkable person to say the least
--------------------------------------------------------------------------------
Predicted transcription: (Original -- Spell Correction -- Spell Correction, POS tagging)

her fother s a mos ere markcabl person to sa the last
her father s a my ere markcabl person to she the last
her father s a most ere markcabl person to say the last
--------------------------------------------------------------------------------
--------------------------------------------------------------------------------
True transcription:

he gave thanks for our food and comfort and prayed for the poor and destitute in great cities where the struggle for life was harder than it was here with us
--------------------------------------------------------------------------------
Predicted transcription: (Original -- Spell Correction -- Spell Correction, POS taggi

## Comments:
Wow! Localities of the sentences seem to "make sense" a little bit (e.g. got "a wild roses" instead of "a was roses" & "to say the last" instead of "to she the last"). 

## Future Recommendations
One of the major remaining problems is that the predictions seem to include some non-sensical words that combine several words in the true sentence (e.g. "withtust" for "with us") OR break up a single word in the true sentence (e.g. "ere markcabl" for "remarkable"). Thus, it would be a good idea to work on inventing an algorithm to improve this in the future. 