In [111]:
import numpy as np
import tensorflow as tf
import keras
from matplotlib import pyplot as plt
from keras.models import Model
from keras.layers import LSTM, Dense, Input, Dropout

In [59]:
# Parse the pronunciation dictionary. Each line of ph.txt is expected to be
# "<word>\t<PH1> <PH2> ...", and every entry becomes [word, [phoneme, ...]].
with open("ph.txt") as phfile:
    # Strip only the line terminator. Blindly dropping the last character
    # would corrupt the final phoneme of a file that lacks a trailing newline.
    raw_lines = [raw.rstrip('\n') for raw in phfile]

lines = []
for raw in raw_lines:
    if not raw:
        continue  # blank lines carry no entry
    fields = raw.split('\t')
    lines.append([fields[0], fields[1].split(' ')])

lines[:5]

[['aback', ['AH', 'B', 'AE', 'K']],
 ['abandon', ['AH', 'B', 'AE', 'N', 'D', 'AH', 'N']],
 ['abandoned', ['AH', 'B', 'AE', 'N', 'D', 'AH', 'N', 'D']],
 ['abandoning', ['AH', 'B', 'AE', 'N', 'D', 'AH', 'N', 'IH', 'NG']],
 ['abandons', ['AH', 'B', 'AE', 'N', 'D', 'AH', 'N', 'Z']]]

In [60]:
# Split the [word, phonemes] entries into parallel input / target lists.
xraw = [word for word, _ in lines]
yraw = [phonemes for _, phonemes in lines]

In [61]:
# Wrap every phoneme sequence in the start (\s) and end (\e) markers.
# Raw strings spell the backslash explicitly instead of relying on Python
# passing unknown escape sequences through (newer interpreters warn on that).
# Sequences that already start with the marker are kept as-is, so re-running
# this cell does not wrap them a second time.
yraw = [
    k if k[:1] == [r'\s'] else [r'\s'] + k + [r'\e']
    for k in yraw
]

In [62]:
# Output vocabulary: start marker, the phoneme symbols, end marker.
# A symbol's position in this list is its one-hot index.
# The markers are raw strings (literal backslash + letter) so they do not
# depend on Python tolerating the invalid escapes '\s' and '\e'.
phoneme_array = [
    r'\s',   'AA',    'AE',    'AH',    'AO',
    'AW',    'AY',    'B',     'CH',    'D',
    'DH',    'EH',    'ER',    'EY',    'F',
    'G',     'HH',    'IH',    'IY',    'JH',
    'K',     'L',     'M',     'N',     'NG',
    'OW',    'OY',    'P',     'R',     'S',
    'SH',    'T',     'TH',    'UH',    'UW',
    'V',     'W',     'Y',     'Z',     'ZH',    r'\e'
]

In [63]:
# Vocabulary sizes: output symbols (phonemes plus the two markers) and input letters a-z.
phonemes_count, chars_count = len(phoneme_array), 26
(phonemes_count, chars_count)

(41, 26)

In [64]:
# Two-way lookup between a phoneme symbol and its position in phoneme_array.
phoneme_2_index = {symbol: position for position, symbol in enumerate(phoneme_array)}
index_2_phoneme = {position: symbol for position, symbol in enumerate(phoneme_array)}

In [65]:
# Two-way lookup between a lowercase letter 'a'..'z' and its index 0..25.
char_2_index = {chr(ord('a') + offset): offset for offset in range(26)}
index_2_char = {offset: chr(ord('a') + offset) for offset in range(26)}

In [66]:
# Length of the longest word in characters; used as the timestep dimension of xdata.
max_word_len = max(map(len, xraw))
max_word_len

20

In [67]:
# Length of the longest entry in yraw; used as the timestep dimension of ydata.
max_phonemes_len = max(map(len, yraw))
max_phonemes_len

18

In [68]:
# xdata axes: (entry, character position, one-hot letter id)
#   entry:              0 .. len(xraw) - 1
#   character position: 0 .. max_word_len - 1
#   letter id:          0 .. chars_count - 1
# The last dimension uses chars_count (not a literal 26) so it always matches
# the encoder Input shape, which is declared with chars_count as well.

xdata = np.zeros(shape=(len(xraw), max_word_len, chars_count), dtype='float32')

In [69]:
# One-hot encode every word: set the slot of each letter at its position.
# Positions beyond a word's length stay all-zero (padding).
for row, word in enumerate(xraw):
    for pos, letter in enumerate(word):
        xdata[row, pos, char_2_index[letter]] = 1

In [70]:
# ydata axes: (entry, phoneme position, one-hot phoneme id)
#   entry:            0 .. len(yraw) - 1
#   phoneme position: 0 .. max_phonemes_len - 1
#   phoneme id:       0 .. phonemes_count - 1
# ---> The sequences in yraw are wrapped in start and stop symbols.

ydata = np.zeros((len(yraw), max_phonemes_len, phonemes_count), dtype='float32')

In [71]:
# One-hot encode every phoneme sequence; positions past its end stay all-zero.
for row, phword in enumerate(yraw):
    for pos, symbol in enumerate(phword):
        ydata[row, pos, phoneme_2_index[symbol]] = 1

---------------------------

In [133]:
# Hidden sizes of the two LSTMs. The encoder's final state is used as the
# decoder's initial state, so both are kept at the same width.
encoder_lstm_output_size = decoder_lstm_output_size = 128

In [134]:
# Encoder input: one word as a (max_word_len, chars_count) one-hot matrix.
encoder_lstm_input = Input(
    dtype='float32',
    shape=(max_word_len, chars_count),
)

In [135]:
# Encoder LSTM. return_state=True makes the layer also return its final
# hidden state (h) and cell state (c) next to the regular output.
encoder_lstm = LSTM(units=encoder_lstm_output_size, return_state=True)

In [136]:
# Run the encoder over the word; the three results are (output, h, c).
(encoder_lstm_output,
 encoder_lstm_h,
 encoder_lstm_c) = encoder_lstm(encoder_lstm_input)

In [137]:
# Final encoder state [h, c]; handed to the decoder LSTM as its initial state.
encoder_lstm_state = [
    encoder_lstm_h,
    encoder_lstm_c,
]

------------

In [138]:
# The decoder is fed the expected phonemes, in order.
# - Starting from the encoder's internal state and the start symbol, the first phoneme is generated.
# - That generated phoneme, together with the decoder's new state, is the input of the next iteration.

# During training the phonemes are already known, so the known sequence is the input and the target is
# the same sequence shifted one timestep to the left,
# i.e. input -> \s, <first_phoneme>, ... | output -> <first_phoneme>, <second_phoneme>, ...

decoder_lstm_input = Input(
    dtype='float32',
    shape=(max_phonemes_len, phonemes_count),
)

In [139]:
# Decoder LSTM: return_sequences=True yields an output for every timestep,
# return_state=True additionally returns the final h and c states.
decoder_lstm = LSTM(
    units=decoder_lstm_output_size,
    return_sequences=True,
    return_state=True,
)

In [140]:
# Run the decoder over the phoneme input, seeded with the encoder's final state.
(decoder_lstm_outputs,
 decoder_lstm_h,
 decoder_lstm_c) = decoder_lstm(decoder_lstm_input, initial_state=encoder_lstm_state)

In [141]:
# Per-timestep classification head on top of the decoder outputs:
#   hidden Dense -> Dropout -> Dense with softmax over the phoneme vocabulary.
# The hidden layer uses ReLU. A softmax there would squash the features into a
# probability distribution before dropout and the final softmax, and only the
# last layer needs to produce probabilities.
decoder_dense_layer1 = Dense(phonemes_count, activation='relu')
decoder_dense_output1 = decoder_dense_layer1(decoder_lstm_outputs)

decoder_dropouts_layer = Dropout(rate=0.1)
decoder_dropouts_output = decoder_dropouts_layer(decoder_dense_output1)

# Output layer: one probability per phoneme symbol at every timestep.
decoder_dense_layer2 = Dense(phonemes_count, activation='softmax')
decoder_dense_output = decoder_dense_layer2(decoder_dropouts_output)

------------

In [142]:
# Training model.
#   inputs : [<word representation>, <expected phonemes representation>]
#   outputs: <phonemes shifted one timestep to the left>
enc_dec_model = Model(
    outputs=decoder_dense_output,
    inputs=[encoder_lstm_input, decoder_lstm_input],
)

In [143]:
# Categorical cross-entropy over the per-timestep softmax, optimised with RMSprop;
# mean absolute error and accuracy are tracked as metrics.
enc_dec_model.compile(
    optimizer='rmsprop',
    loss='categorical_crossentropy',
    metrics=['mae', 'acc'],
)

In [144]:
# Print the layer-by-layer summary (output shapes, parameter counts, connections).
enc_dec_model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_11 (InputLayer)           (None, 20, 26)       0                                            
__________________________________________________________________________________________________
input_12 (InputLayer)           (None, 18, 41)       0                                            
__________________________________________________________________________________________________
lstm_10 (LSTM)                  [(None, 128), (None, 79360       input_11[0][0]                   
__________________________________________________________________________________________________
lstm_11 (LSTM)                  [(None, 18, 128), (N 87040       input_12[0][0]                   
                                                                 lstm_10[0][1]                    
          

In [145]:
# Teacher-forcing targets: the decoder sees phoneme t as input and must emit
# phoneme t+1, so the target is ydata shifted one step left along the
# TIMESTEP axis (axis 1). Shifting the last axis would instead move the bits
# inside each one-hot vector and relabel every phoneme.
_shift_ydata = np.zeros(shape=ydata.shape, dtype='float32')
_shift_ydata[:, :-1, :] = ydata[:, 1:, :]

# Alternate rows between train (even) and test (odd). Stepping a range by 2
# covers every row, including the last one whether the row count is even or odd.
even_indices = list(range(0, xdata.shape[0], 2))
odd_indices = list(range(1, xdata.shape[0], 2))

_train_xdata = xdata[even_indices]
_train_ydata = ydata[even_indices]
_train_shift_ydata = _shift_ydata[even_indices]

_test_xdata = xdata[odd_indices]
_test_ydata = ydata[odd_indices]
_test_shift_ydata = _shift_ydata[odd_indices]

In [146]:
# Train with teacher forcing: inputs are the word and its phoneme sequence,
# the target is the shifted phoneme sequence.
# NOTE(review): the recorded run stopped with "InternalError: GPU sync failed";
# that looks environmental (GPU state/memory) rather than caused by these
# arguments - confirm on a fresh kernel.
enc_dec_train_history = enc_dec_model.fit(
    x=[_train_xdata, _train_ydata],
    y=_train_shift_ydata,
    epochs=100,
    batch_size=64,
    verbose=1,
)

Epoch 1/100


InternalError: GPU sync failed