In [1]:
# TODO: test on training data to see if model is overfitting

# Colab-specific setup: mount Google Drive at /content/gdrive so that later cells
# can change into the project folder and read/write files with relative paths.
from google.colab import drive
drive.mount('/content/gdrive', force_remount=True)

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/gdrive


In [2]:
cd gdrive/My Drive/University/MAIS202/Final Project

/content/gdrive/My Drive/University/MAIS202/Final Project


In [3]:
import numpy as np
import random

import pickle

import itertools as it

from keras.models import Model
from keras.layers import Input, LSTM, Dense, Bidirectional
from keras.layers.merge import Concatenate

from keras.callbacks import ModelCheckpoint, EarlyStopping

from sklearn.model_selection import train_test_split

from nltk.translate.bleu_score import sentence_bleu, SmoothingFunction

import datetime

Using TensorFlow backend.


In [4]:
# Load the preprocessed arrays; the pickle holds a dict with 'chars' and 'phonemes'.
with open('data_preprocessed.pickle', 'rb') as file_data:
  data_preprocessed = pickle.load(file_data)

phoneme_input = data_preprocessed['phonemes']
char_input = data_preprocessed['chars']

print(char_input.shape)
print(phoneme_input.shape)

# axis 1 is used as the (maximum) sequence length, axis 2 as the number of tokens
max_phoneme_seq_len = phoneme_input.shape[1]

num_char_tokens = char_input.shape[2]
num_phoneme_tokens = phoneme_input.shape[2]

(124558, 20, 26)
(124558, 21, 71)


In [0]:
# Load the token <-> id lookup tables for characters and phonemes.
with open('mappings.pickle', 'rb') as file_mappings:
  mappings = pickle.load(file_mappings)

map_id_char = mappings['id_char']
map_phoneme_id = mappings['phoneme_id']
map_id_phoneme = mappings['id_phoneme']
map_char_id = mappings['char_id']


In [0]:
# Reference pronunciations: data_dict is indexed by word and each entry is iterated
# as a collection of pronunciation strings by is_correct() and bleu_score().
with open('data_dict.pickle', 'rb') as file_dict:
  data_dict = pickle.load(file_dict)

In [0]:
# Decoder targets: the decoder inputs shifted one step to the left along axis 1,
# with an all-zero step appended at the end so the shape stays the same.
phoneme_output = np.concatenate(
    [phoneme_input[:, 1:, :], np.zeros_like(phoneme_input[:, :1, :])], axis=1)

In [8]:
# Hold out 20% of the samples for testing; the fixed seed keeps the split reproducible
# and the three arrays are split with the same indices.
split_arrays = train_test_split(
    char_input, phoneme_input, phoneme_output, test_size=0.2, random_state=3791)

char_input_train, char_input_test = split_arrays[0:2]
phoneme_input_train, phoneme_input_test = split_arrays[2:4]
phoneme_output_train, phoneme_output_test = split_arrays[4:6]

print(char_input_train.shape, char_input_test.shape)
print(phoneme_input_train.shape, phoneme_input_test.shape)
print(phoneme_output_train.shape, phoneme_output_test.shape)

(99646, 20, 26) (24912, 20, 26)
(99646, 21, 71) (24912, 21, 71)
(99646, 21, 71) (24912, 21, 71)


In [0]:
def create_models(encoder_type, decoder_type, hidden_nodes_encoder):
  '''Builds the training model and the encoder/decoder models used for inference.

  All three models are built from the same layer objects, so weights trained
  through (or loaded into) the training model are also the weights used by the
  two testing models.

  Args:
    encoder_type: 'lstm' for a unidirectional encoder or 'blstm' for a
      bidirectional one.
    decoder_type: only 'lstm' is supported.
    hidden_nodes_encoder: number of units of the encoder LSTM (per direction for
      'blstm'). The decoder uses the same number of units for 'lstm' and twice
      as many for 'blstm'.

  Returns:
    Tuple (training_model, testing_encoder_model, testing_decoder_model).

  Raises:
    ValueError: if encoder_type or decoder_type is not supported.
  '''

  # fail fast: an unknown type would otherwise leave `encoder`/`decoder` unbound
  # and surface later as an obscure UnboundLocalError
  if encoder_type not in ('lstm', 'blstm'):
    raise ValueError(
        "unsupported encoder_type {!r}; expected 'lstm' or 'blstm'".format(encoder_type))
  if decoder_type != 'lstm':
    raise ValueError(
        "unsupported decoder_type {!r}; expected 'lstm'".format(decoder_type))

  ### encoder
  char_inputs = Input(shape=(None, num_char_tokens))

  if encoder_type == 'lstm':
    encoder = LSTM(hidden_nodes_encoder, return_state=True)
    hidden_nodes_decoder = hidden_nodes_encoder
  else:  # 'blstm'
    encoder = Bidirectional(LSTM(hidden_nodes_encoder, return_state=True))
    # forward and backward states get concatenated, so the decoder is twice as wide
    hidden_nodes_decoder = 2*hidden_nodes_encoder

  ### decoder
  phoneme_inputs = Input(shape=(None, num_phoneme_tokens))
  decoder = LSTM(hidden_nodes_decoder, return_sequences=True, return_state=True)
  decoder_dense = Dense(num_phoneme_tokens, activation='softmax')

  ### training model
  if encoder_type == 'lstm':
    _, state_h, state_c = encoder(char_inputs) # ignore encoder outputs
  else:  # 'blstm'
    _, forward_h, forward_c, backward_h, backward_c = encoder(char_inputs) # ignore encoder outputs
    # concatenate states of forward and backward LSTMs
    state_h = Concatenate()([forward_h, backward_h])
    state_c = Concatenate()([forward_c, backward_c])
  encoder_states = [state_h, state_c]

  # initialize decoder with encoder states (hidden and internal)
  decoder_outputs, _, _ = decoder(phoneme_inputs, initial_state=encoder_states)
  phoneme_prediction = decoder_dense(decoder_outputs)

  training_model = Model([char_inputs, phoneme_inputs], phoneme_prediction)

  ### testing models
  # encoder: maps a character sequence to the states that initialize the decoder
  testing_encoder_model = Model(char_inputs, encoder_states)

  # decoder: takes the states as explicit inputs and also returns the updated
  # states, so it can be called repeatedly during step-by-step decoding
  decoder_state_input_h = Input(shape=(hidden_nodes_decoder,)) # hidden state
  decoder_state_input_c = Input(shape=(hidden_nodes_decoder,)) # internal state
  decoder_state_inputs = [decoder_state_input_h, decoder_state_input_c]
  decoder_outputs, decoder_state_h, decoder_state_c = decoder(phoneme_inputs, initial_state=decoder_state_inputs)
  decoder_states = [decoder_state_h, decoder_state_c]
  phoneme_prediction = decoder_dense(decoder_outputs)

  testing_decoder_model = Model([phoneme_inputs] + decoder_state_inputs, [phoneme_prediction] + decoder_states)

  return training_model, testing_encoder_model, testing_decoder_model

In [0]:
def train(model, path_weights, encoder_input, decoder_input, decoder_output, patience, optimizer, batch_size,
          epochs=100, validation_split=0.125):
    '''Compiles and fits the training model with checkpointing and early stopping.

    Args:
        model: the training model; it receives [encoder_input, decoder_input].
        path_weights: file the checkpoint is written to.
        encoder_input: encoder input array.
        decoder_input: decoder input array.
        decoder_output: target array the decoder is trained to produce.
        patience: passed to EarlyStopping (epochs without val_loss improvement
            that are tolerated before training stops).
        optimizer: optimizer passed to model.compile.
        batch_size: batch size passed to model.fit.
        epochs: maximum number of epochs (default 100).
        validation_split: fraction of the training data held out by model.fit
            for validation (default 0.125).

    Returns:
        The History object returned by model.fit.
    '''

    # checkpoint to path_weights whenever the validation loss improves
    checkpointer = ModelCheckpoint(filepath=path_weights, monitor='val_loss',
                                   verbose=1, save_best_only=True)

    # stop training if validation loss does not decrease after some consecutive epochs
    stopper = EarlyStopping(monitor='val_loss', patience=patience)

    model.compile(optimizer=optimizer, loss='categorical_crossentropy')
    train_history = model.fit([encoder_input, decoder_input], decoder_output,
                               batch_size=batch_size,
                               epochs=epochs,
                               validation_split=validation_split,
                               callbacks=[checkpointer, stopper])

    return train_history

In [0]:
def predict(input_char_seq, encoder, decoder):
  '''Greedily decodes the pronunciation of a single word.

  Args:
    input_char_seq: encoded character sequence; a 2D array gets a leading
      batch axis added, anything else is passed to the encoder as is.
    encoder: model whose predict() returns the list of initial decoder states.
    decoder: model whose predict() takes [phoneme, state_h, state_c] and returns
      (phoneme probabilities, state_h, state_c).

  Returns:
    The predicted phonemes joined by single spaces, with surrounding
    whitespace stripped.
  '''

  start_token = '\t'
  end_token = '\n'

  def one_hot_phoneme(phoneme_id):
    # single-step, single-sample decoder input with one active phoneme
    encoded = np.zeros((1, 1, num_phoneme_tokens))
    encoded[0, 0, phoneme_id] = 1.
    return encoded

  # the encoder expects a batch axis
  if len(input_char_seq.shape) == 2:
    input_char_seq = np.array([input_char_seq])

  # the encoder states seed the decoder; decoding starts from the start token
  decoder_states = encoder.predict(input_char_seq)
  decoder_input = one_hot_phoneme(map_phoneme_id[start_token])

  decoded = ''
  while True:

    probabilities, state_h, state_c = decoder.predict([decoder_input] + decoder_states)

    # greedy choice: most probable phoneme of the last time step
    best_id = np.argmax(probabilities[0, -1, :])
    best_phoneme = map_id_phoneme[best_id]
    decoded += best_phoneme + ' '

    # stop at the end token, or once more tokens than the longest known
    # sequence have been produced
    if best_phoneme == end_token or len(decoded.split()) > max_phoneme_seq_len:
      break

    # feed the prediction and the updated states back into the decoder
    decoder_input = one_hot_phoneme(best_id)
    decoder_states = [state_h, state_c]

  return decoded.strip()

In [0]:
def matrix_to_word(char_matrix):
  '''Decodes a matrix of character vectors back into the word it encodes.

  Rows are read in order up to (not including) the first row that contains
  only zeros; each remaining row contributes the character mapped to the
  index of its largest entry.
  '''

  # rows before the first all-zero row
  encoded_chars = it.takewhile(np.any, char_matrix)

  return ''.join(map_id_char[char_vector.argmax()] for char_vector in encoded_chars)

def is_correct(word, prediction):
  '''Returns True if prediction equals one of the pronunciations in data_dict[word].

  Raises KeyError if word has no entry in data_dict.
  '''

  return any(prediction == reference for reference in data_dict[word])

def bleu_score(word, test_pronunciation):
    '''Sentence-level BLEU of a predicted pronunciation against all references.

    The references are the pronunciations in data_dict[word]; references and
    prediction are split on whitespace into tokens, and the score is computed
    by sentence_bleu with SmoothingFunction().method1 as smoothing function.
    '''
    reference_tokens = []
    for reference in data_dict[word]:
        reference_tokens.append(reference.split())

    return sentence_bleu(reference_tokens,
                         test_pronunciation.split(),
                         smoothing_function=SmoothingFunction().method1)

In [0]:
def test(samples, encoder, decoder):
  
  n_samples = len(samples)
  
  print('Testing {} samples'.format(n_samples))
  
  correct = []
  bleu = []
  for i, sample in enumerate(samples):
    
    if i % 1000 == 0:
      if i%10000 == 0:
        print()
      print(i, end='...')
      
    word = matrix_to_word(sample)
    prediction = predict(sample, encoder, decoder)
      
    correct.append(is_correct(word, prediction))
    bleu.append(bleu_score(word, prediction))
      
  print()
      
  acc = np.mean(correct)
  avg_bleu = np.mean(bleu)
  
  print('Accuracy:', acc)
  print('Average BLEU score:', avg_bleu)
  
  return acc, avg_bleu

In [0]:
# Grid of hyperparameters to search; every combination is trained and evaluated once.
encoder_type_all = ['blstm']
decoder_type_all = ['lstm']
n_hidden_nodes_all = [512, 256]
patience_all = [1, 2, 3]
optimizer_all = ['adam']
batch_size_all = [32]

# encoder_type_all = ['blstm']
# decoder_type_all = ['lstm']
# n_hidden_nodes_all = [512]
# patience_all = [3]

hyperparameters_all = it.product(encoder_type_all, decoder_type_all, n_hidden_nodes_all, patience_all, optimizer_all, batch_size_all)

# Combinations that are skipped (presumably already trained in earlier runs).
# Built once, outside the loop, as a set for direct membership tests.
hyperparameters_done = {
    ('lstm', 'lstm', 256, 1, 'adam', 64),
    ('lstm', 'lstm', 256, 1, 'adam', 128),
    ('lstm', 'lstm', 256, 1, 'adam', 256),
    ('lstm', 'lstm', 256, 1, 'rmsprop', 64),
    ('lstm', 'lstm', 256, 1, 'rmsprop', 128),
    ('lstm', 'lstm', 256, 1, 'rmsprop', 256),
    ('lstm', 'lstm', 256, 2, 'adam', 64),
    ('lstm', 'lstm', 256, 2, 'adam', 128),
    ('blstm', 'lstm', 256, 1, 'adam', 64),
    ('blstm', 'lstm', 256, 1, 'adam', 128),
    ('blstm', 'lstm', 256, 1, 'adam', 256),
    ('blstm', 'lstm', 256, 1, 'rmsprop', 64),
    ('blstm', 'lstm', 256, 1, 'rmsprop', 128),
    ('blstm', 'lstm', 256, 1, 'rmsprop', 256),
    ('blstm', 'lstm', 256, 2, 'adam', 64),
    ('blstm', 'lstm', 512, 1, 'adam', 64),
    ('blstm', 'lstm', 512, 1, 'adam', 128),
    ('blstm', 'lstm', 512, 1, 'adam', 256),
    ('blstm', 'lstm', 512, 3, 'adam', 64),
    ('blstm', 'lstm', 512, 3, 'adam', 128),
    ('blstm', 'lstm', 512, 3, 'adam', 256),
    ('blstm', 'lstm', 512, 2, 'adam', 64),
    ('blstm', 'lstm', 512, 2, 'adam', 128),
    ('blstm', 'lstm', 512, 2, 'adam', 256),
    ('blstm', 'lstm', 256, 3, 'adam', 64),
    ('blstm', 'lstm', 256, 3, 'adam', 128),
    ('blstm', 'lstm', 256, 3, 'adam', 256),
    ('blstm', 'lstm', 256, 2, 'adam', 128),
    ('blstm', 'lstm', 256, 2, 'adam', 256),
    ('blstm', 'lstm', 512, 1, 'adam', 32),
    ('blstm', 'lstm', 512, 2, 'adam', 32),
    ('blstm', 'lstm', 512, 3, 'adam', 32),
    ('blstm', 'lstm', 256, 1, 'adam', 32),
    ('blstm', 'lstm', 256, 2, 'adam', 32),
    ('blstm', 'lstm', 256, 3, 'adam', 32),
}


for hyperparameters in hyperparameters_all:
  
  start = datetime.datetime.utcnow()
  
  print('Hyperparameters:', hyperparameters)
  
  if hyperparameters in hyperparameters_done:
    print('Skipping this set of hyperparamters')
    continue
  
  encoder_type, decoder_type, n_hidden_nodes, patience, optimizer, batch_size = hyperparameters
  hyperparameters_string = '{}_{}_{}_{}_{}_{}'.format(encoder_type, decoder_type, n_hidden_nodes, patience, optimizer, batch_size)
  
  path_weights = '{}_weights.hdf5'.format(hyperparameters_string)
  print('File name:', path_weights)
  
  training_model, testing_encoder_model, testing_decoder_model = create_models(encoder_type, decoder_type, n_hidden_nodes)
  
  train_history = train(training_model, path_weights, char_input_train, phoneme_input_train, phoneme_output_train, patience, optimizer, batch_size)
  
#   print(train_history.history['loss'])
#   print(train_history.history['val_loss'])

  # restore the best checkpoint before testing; the testing models share their
  # layers with the training model, so they pick up the loaded weights
  training_model.load_weights(path_weights)
  acc, bleu = test(char_input_test, testing_encoder_model, testing_decoder_model)
  
  path_metrics = '{}_metrics.pickle'.format(hyperparameters_string)
  
  with open(path_metrics, 'wb') as file_out:
    
    # store the plain per-epoch metrics dict rather than the History callback
    # object, which keeps a reference to the whole model
    pickle.dump({'history':train_history.history, 'acc':acc, 'bleu':bleu}, file_out)
    
    print('History, accuracy and average BLEU score saved to {}\n'.format(path_metrics))
    
  with open('training_log.txt', 'a+') as file_out:
    
    elapsed = datetime.datetime.utcnow() - start
    
    file_out.write('\nHyperparameters: {}\n'.format(hyperparameters))
    file_out.write('\tAccuracy: {}\n'.format(acc))
    file_out.write('\tAverage BLEU score: {}\n'.format(bleu))
    file_out.write('\tTotal training and testing time: {}\n'.format(elapsed))

Hyperparameters: ('blstm', 'lstm', 512, 1, 'adam', 32)
Skipping this set of hyperparamters
Hyperparameters: ('blstm', 'lstm', 512, 2, 'adam', 32)
Skipping this set of hyperparamters
Hyperparameters: ('blstm', 'lstm', 512, 3, 'adam', 32)
Skipping this set of hyperparamters
Hyperparameters: ('blstm', 'lstm', 256, 1, 'adam', 32)
Skipping this set of hyperparamters
Hyperparameters: ('blstm', 'lstm', 256, 2, 'adam', 32)
File name: blstm_lstm_256_2_adam_32_weights.hdf5





Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where



Train on 87190 samples, validate on 12456 samples
Epoch 1/100






Epoch 00001: val_loss improved from inf to 0.18033, saving model to blstm_lstm_256_2_adam_32_weights.hdf5
Epoch 2/100

Epoch 00002: val_loss improved from 0.18033 to 0.13517, saving model to blstm_lstm_256_2_adam_32_weights.hdf5
Epoch 3/100

Epoch 00003: val_loss improved from 0.13517 to 0.11751, saving model to blstm_lstm_256_2_adam_32_weights.hdf5
Epoch 4/10

In [17]:
# Evaluate the best configuration on both splits; a large gap between the
# training-set and test-set scores would indicate overfitting.
best_params = ('blstm', 'lstm', 512, 2, 'adam', 32)

encoder_type, decoder_type, hidden_nodes, patience, optimizer, batch_size = best_params

path_weights = '{}_{}_{}_{}_{}_{}_weights.hdf5'.format(encoder_type, decoder_type, hidden_nodes, patience, optimizer, batch_size)

# Rebuild the models and restore the saved weights inline: load_model() is only
# defined in a later cell, so calling it here fails on a fresh kernel.
training_model, testing_encoder_model, testing_decoder_model = create_models(
  encoder_type, decoder_type, hidden_nodes)
training_model.compile(optimizer=optimizer, loss='categorical_crossentropy')
training_model.load_weights(path_weights)

print('Test set:')
acc_test, bleu_test = test(char_input_test, testing_encoder_model, testing_decoder_model)
print()

print('Training set:')
acc_train, bleu_train = test(char_input_train, testing_encoder_model, testing_decoder_model)


Testing 24912 samples
0...1000...2000...3000...4000...5000...6000...7000...8000...9000...10000...11000...12000...13000...14000...15000...16000...17000...18000...19000...20000...21000...22000...23000...24000...
Accuracy: 0.6195407835581246
Average BLEU score: 0.7358755330145365
Test set:
	Accuracy: 0.6195407835581246
	BLEU score: 0.7358755330145365

Testing 99646 samples
0...1000...2000...3000...4000...5000...6000...7000...8000...9000...10000...11000...12000...13000...14000...15000...16000...17000...18000...19000...20000...21000...22000...23000...24000...25000...26000...27000...28000...29000...30000...31000...32000...33000...34000...35000...36000...37000...38000...39000...40000...41000...42000...43000...44000...45000...46000...47000...48000...49000...50000...51000...52000...53000...54000...55000...56000...57000...58000...59000...60000...61000...62000...63000...64000...65000...66000...67000...68000...69000...70000...71000...72000...73000...74000...75000...76000...77000...78000...79000...

In [0]:
def load_model(encoder_type, decoder_type, hidden_nodes, patience, optimizer, batch_size, path_weights):
  '''Rebuilds the three models and restores saved weights into them.

  Args:
    encoder_type, decoder_type, hidden_nodes: architecture, passed to create_models().
    patience, batch_size: not used here.
    optimizer: optimizer the training model is compiled with.
    path_weights: weights file loaded into the training model.

  Returns:
    Tuple (training_model, testing_encoder_model, testing_decoder_model).
  '''

  model_train, model_encoder, model_decoder = create_models(encoder_type, decoder_type, hidden_nodes)

  # compile first, then restore the saved weights into the training model
  model_train.compile(optimizer=optimizer, loss='categorical_crossentropy')
  model_train.load_weights(path_weights)

  return model_train, model_encoder, model_decoder

In [0]:
# acc, bleu = test(char_input_test[:100], testing_encoder_model, testing_decoder_model)
# with open(path_metrics, 'wb') as file_out:
    
#     pickle.dump({'history':train_history, 'acc':acc, 'bleu':bleu}, file_out)
    
#     print('History, accuracy and average BLEU score saved to {}'.format(path_metrics))

In [0]:
# count = 0
# errors = 0

# for i, char_sequence in enumerate(char_input_test):
  
#   if i % 1000 == 0:
#     print(i)
  
#   count += 1

#   word = matrix_to_word(char_sequence)
#   prediction = predict(char_sequence, testing_encoder_model, testing_decoder_model)
  
#   if not is_correct(word, prediction):
#     errors += 1
# #     print(word)
# #     print('\t', prediction)
# #     print('\t', data_dict[word])
  
# print('Accuracy:', 1 - errors/count)