# Introduction

**Dataset**

* 

<div class="alert alert-block alert-danger">

<b>TODO: switch from Udacity dataset to publicly available one</b>

</div>

Links:

* [A ten-minute introduction to sequence-to-sequence learning in Keras](https://blog.keras.io/a-ten-minute-introduction-to-sequence-to-sequence-learning-in-keras.html)
* [Keras Official Seq2Seq Example](https://github.com/keras-team/keras/blob/master/examples/lstm_seq2seq.py)

**Resources**

* [Sequence to Sequence Learning with Neural Networks](https://arxiv.org/abs/1409.3215) (2014) by Ilya Sutskever, Oriol Vinyals, Quoc V. Le
* [Learning Phrase Representations using RNN Encoder-Decoder for Statistical Machine Translation](https://arxiv.org/abs/1406.1078) (2014) by Kyunghyun Cho, Bart van Merrienboer, Caglar Gulcehre, Dzmitry Bahdanau, Fethi Bougares, Holger Schwenk, Yoshua Bengio

# Imports

In [1]:
import os
import numpy as np
import matplotlib.pyplot as plt

In [2]:
import tensorflow as tf
gpu_options = tf.GPUOptions(allow_growth=True)  # init TF ...
config=tf.ConfigProto(gpu_options=gpu_options)  # w/o taking ...
with tf.Session(config=config): pass            # all GPU memory

# English to French Dataset

In [3]:
dataset_location = '/home/marcin/Udacity/aind2-nlp-capstone/data/'

In [4]:
with open(os.path.join(dataset_location, 'small_vocab_en')) as f:
    data_en_raw = list(map(lambda x: 'ST '+x.strip().lower()+' EN', f.readlines()))
data_en_raw[0:3]

['ST new jersey is sometimes quiet during autumn , and it is snowy in april . EN',
 'ST the united states is usually chilly during july , and it is usually freezing in november . EN',
 'ST california is usually quiet during march , and it is usually hot in june . EN']

In [5]:
with open(os.path.join(dataset_location, 'small_vocab_fr')) as f:
    data_fr_raw = list(map(lambda x: 'ST '+x.strip().lower()+' EN', f.readlines()))
data_fr_raw[0:3]

["ST new jersey est parfois calme pendant l' automne , et il est neigeux en avril . EN",
 'ST les états-unis est généralement froid en juillet , et il gèle habituellement en novembre . EN',
 'ST california est généralement calme en mars , et il est généralement chaud en juin . EN']

In [6]:
tok_en = tf.keras.preprocessing.text.Tokenizer(lower=False)
tok_en.fit_on_texts(data_en_raw)
data_en_tok = tok_en.texts_to_sequences(data_en_raw)
data_en_tok[0:3]

[[2, 19, 25, 1, 10, 69, 6, 41, 9, 5, 1, 57, 4, 46, 3],
 [2, 7, 22, 23, 1, 11, 64, 6, 45, 9, 5, 1, 11, 53, 4, 47, 3],
 [2, 24, 1, 11, 69, 6, 40, 9, 5, 1, 11, 70, 4, 36, 3]]

In [7]:
tok_fr = tf.keras.preprocessing.text.Tokenizer(lower=False)
tok_fr.fit_on_texts(data_fr_raw)
data_fr_tok = tok_fr.texts_to_sequences(data_fr_raw)
data_fr_tok[0:3]

[[2, 37, 36, 1, 10, 69, 39, 13, 26, 8, 5, 1, 114, 4, 52, 3],
 [2, 6, 34, 33, 1, 14, 21, 4, 51, 8, 5, 97, 71, 4, 53, 3],
 [2, 103, 1, 14, 69, 4, 47, 8, 5, 1, 14, 23, 4, 43, 3]]

In [8]:
max_len_en = len(max(data_en_tok, key=len))
max_len_fr = len(max(data_fr_tok, key=len))
max_len_both = max(max_len_en, max_len_fr)
max_len_both

23

In [16]:
data_en = tf.keras.preprocessing.sequence.pad_sequences(data_en_tok, maxlen=max_len_en, padding='post')
data_fr = tf.keras.preprocessing.sequence.pad_sequences(data_fr_tok, maxlen=max_len_fr, padding='post')

In [18]:
n_en_seq = data_en.shape[1]
n_fr_seq = data_fr.shape[1]
n_en_vocab = len(tok_en.word_index)
n_fr_vocab = len(tok_fr.word_index)
max_seq_len = max(n_en_seq, n_fr_seq)
print('n_en_seq', n_en_seq)
print('n_fr_seq', n_fr_seq)
print('n_en_vocab', n_en_vocab)
print('n_fr_vocab', n_fr_vocab)
print('max_seq_len', max_seq_len)

n_en_seq 17
n_fr_seq 23
n_en_vocab 201
n_fr_vocab 346
max_seq_len 23


In [11]:
print(data_en.shape)
print(data_en[0:3])

(137860, 23)
[[ 2 19 25  1 10 69  6 41  9  5  1 57  4 46  3  0  0  0  0  0  0  0  0]
 [ 2  7 22 23  1 11 64  6 45  9  5  1 11 53  4 47  3  0  0  0  0  0  0]
 [ 2 24  1 11 69  6 40  9  5  1 11 70  4 36  3  0  0  0  0  0  0  0  0]]


In [12]:
print(data_fr.shape)
print(data_fr[0:3])

(137860, 23)
[[  2  37  36   1  10  69  39  13  26   8   5   1 114   4  52   3   0   0
    0   0   0   0   0]
 [  2   6  34  33   1  14  21   4  51   8   5  97  71   4  53   3   0   0
    0   0   0   0   0]
 [  2 103   1  14  69   4  47   8   5   1  14  23   4  43   3   0   0   0
    0   0   0   0   0]]


# Simple Model

<img src="assets/rnn_bidirectional.png"/>
<center>Figure from Bidirectional Recurrent Neural Networks (1997) by Mike Schuster and kuldip K. Paliwal</center>

In [101]:
from tensorflow.keras.layers import Input, Embedding, Bidirectional, GRU, TimeDistributed, Dense, Activation

X_input = Input(shape=(n_en_seq,))
X = Embedding(input_dim=n_en_vocab, output_dim=50)(X_input)
X = Bidirectional( GRU(units=64, return_sequences=True) )(X)
X = TimeDistributed(Dense(units=n_fr_vocab))(X)
X = Activation('softmax')(X)

model = tf.keras.Model(inputs=X_input, outputs=X)
model.compile(loss=tf.keras.losses.sparse_categorical_crossentropy,
              optimizer=tf.keras.optimizers.Adam(lr=0.001),
              metrics=[tf.keras.metrics.sparse_categorical_accuracy])    
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_7 (InputLayer)         (None, 23)                0         
_________________________________________________________________
embedding_3 (Embedding)      (None, 23, 50)            10050     
_________________________________________________________________
bidirectional_1 (Bidirection (None, 23, 128)           44160     
_________________________________________________________________
time_distributed_1 (TimeDist (None, 23, 346)           44634     
_________________________________________________________________
activation_1 (Activation)    (None, 23, 346)           0         
Total params: 98,844
Trainable params: 98,844
Non-trainable params: 0
_________________________________________________________________


In [108]:
hist = model.fit(x=data_en, y=np.expand_dims(data_fr, axis=-1),
                 batch_size=1024, epochs=10, validation_split=0.2)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 110288 samples, validate on 27572 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


**Test Model**

In [13]:
def sequence_to_english(seq):
    words = [tok_en.index_word[x] for x in seq if x in tok_en.index_word]
    return ' '.join(words)
def sequence_to_french(seq):
    words = [tok_fr.index_word[x] for x in seq if x in tok_fr.index_word]
    return ' '.join(words)

In [112]:
index = 234
english_sentence = data_en_raw[index]
french_sentence = data_fr_raw[index]

prediction_prob = model.predict(data_en[index:index+1])
prediction_prob = prediction_prob.squeeze()
prediction_tok = prediction_prob.argmax(axis=-1)
predicted_sentence = sequence_to_french(prediction_tok)

print('english:            ', english_sentence)
print('french (original):  ', french_sentence)
print('french (predicted): ', predicted_sentence)

english:             ST we dislike oranges , grapefruit , and bananas . EN
french (original):   ST nous détestons les oranges , le pamplemousse et les bananes . EN
french (predicted):  ST nous détestons les le le pamplemousse et les les EN


# Sequence to Sequence

In [19]:
data_fr_noST = np.roll(data_fr, shift=-1, axis=-1)
data_fr_noST[:,-1] = 0
print(data_fr_noST.shape)
print(data_fr_noST[0:3])

(137860, 23)
[[ 37  36   1  10  69  39  13  26   8   5   1 114   4  52   3   0   0   0
    0   0   0   0   0]
 [  6  34  33   1  14  21   4  51   8   5  97  71   4  53   3   0   0   0
    0   0   0   0   0]
 [103   1  14  69   4  47   8   5   1  14  23   4  43   3   0   0   0   0
    0   0   0   0   0]]


In [20]:
from tensorflow.keras.layers import Input, Embedding, LSTM, TimeDistributed, Dense, Activation

E_input = Input(shape=(n_en_seq,), name='Enc_Input')
E = Embedding(input_dim=n_en_vocab, output_dim=50, name='Enc_Embbeding')(E_input)
_, Eh, Ec = LSTM(units=512, return_state=True, name='Enc_LSTM')(E)

decoder_embedding = Embedding(input_dim=n_fr_vocab, output_dim=50, name='Dec_Embbedingg')
decoder_lstm = LSTM(512, return_sequences=True, return_state=True, name='Dec_LSTM')
decoder_dense = Dense(n_fr_vocab, activation='softmax', name='Dec_Output')

D_input = Input(shape=(n_fr_seq,), name='Dec_Target')
D = decoder_embedding(D_input)
D, _, _ = decoder_lstm(D, initial_state=[Eh, Ec])
D_output = decoder_dense(D)

model = tf.keras.Model(inputs=[E_input, D_input], outputs=D_output)
model.compile(optimizer=tf.keras.optimizers.Adam(lr=0.001),
              loss=tf.keras.losses.sparse_categorical_crossentropy,
              metrics=[tf.keras.metrics.sparse_categorical_accuracy])    
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Enc_Input (InputLayer)          (None, 17)           0                                            
__________________________________________________________________________________________________
Dec_Target (InputLayer)         (None, 23)           0                                            
__________________________________________________________________________________________________
Enc_Embbeding (Embedding)       (None, 17, 50)       10050       Enc_Input[0][0]                  
__________________________________________________________________________________________________
Dec_Embbedingg (Embedding)      (None, 23, 50)       17300       Dec_Target[0][0]                 
__________________________________________________________________________________________________
Enc_LSTM (

In [15]:
model.fit(x=[data_en, data_fr], y=np.expand_dims(data_fr_noST, axis=-1),
          batch_size=1024, epochs=10, validation_split=0.2)

  "Converting sparse IndexedSlices to a dense Tensor of unknown shape. "


Train on 110288 samples, validate on 27572 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fe65a593f28>

**Encoder and Sampler**

In [16]:
encoder = tf.keras.Model(inputs=E_input, outputs=[Eh, Ec])

In [17]:
encoder.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
Enc_Input (InputLayer)       (None, 23)                0         
_________________________________________________________________
Enc_Embbeding (Embedding)    (None, 23, 50)            10050     
_________________________________________________________________
Enc_LSTM (LSTM)              [(None, 512), (None, 512) 1153024   
Total params: 1,163,074
Trainable params: 1,163,074
Non-trainable params: 0
_________________________________________________________________


In [18]:
Sh_init = Input(shape=(512,))
Sc_init = Input(shape=(512,))

In [19]:
S_input = Input(shape=(1,), name='Sam_Input')
S_input

<tf.Tensor 'Sam_Input:0' shape=(?, 1) dtype=float32>

In [20]:
S = decoder_embedding(S_input)

<tf.Tensor 'Dec_Embbedingg_1/embedding_lookup/Identity_2:0' shape=(?, 1, 50) dtype=float32>

In [21]:
S, Sh, Sc = decoder_lstm(S, initial_state=[Sh_init, Sc_init])

In [22]:
S_output = decoder_dense(S)

In [23]:
sampler = tf.keras.Model(inputs=[S_input, Sh_init, Sc_init], outputs=[S_output, Sh, Sc])

In [24]:
sampler.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
Sam_Input (InputLayer)          (None, 1)            0                                            
__________________________________________________________________________________________________
Dec_Embbedingg (Embedding)      multiple             17300       Sam_Input[0][0]                  
__________________________________________________________________________________________________
input_1 (InputLayer)            (None, 512)          0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            (None, 512)          0                                            
__________________________________________________________________________________________________
Dec_LSTM (

In [50]:
index = 666
english_sentence = data_en_raw[index]
french_sentence = data_fr_raw[index]
print('english:            ', english_sentence)
print('french (original):  ', french_sentence)

english:             ST his least favorite fruit is the pear , but our least favorite is the banana . EN
french (original):   ST son fruit préféré est moins la poire , mais notre moins préféré est la banane . EN


**Actually Sample**

In [51]:
st_h, st_c = encoder.predict(data_en[index:index+1])
assert st_h.shape == (1, 512) and st_c.shape == (1, 512)

In [52]:
st_input = tok_fr.word_index['ST']
st_input = np.array([[st_input]])  # batch size = 1, seq len = 1
assert st_input.shape == (1, 1)

In [53]:
prediction_tok = []
for i in range(n_fr_seq):
    probs, st_h, st_c = sampler.predict([st_input, st_h, st_c])
    assert st_h.shape == (1, 512) and st_c.shape == (1, 512)
    
    st_input = probs.argmax(axis=-1)
    assert st_input.shape == (1, 1)
    
    token = probs.argmax()
    prediction_tok.append(token)
    
    if token == tok_fr.word_index['EN']:
        break    

In [54]:
prediction_tok

[22, 18, 19, 1, 15, 9, 90, 7, 22, 15, 19, 1, 9, 91, 3]

In [55]:
print('english:            ', english_sentence)
print('french (original):  ', french_sentence)
predicted_sentence = sequence_to_french(prediction_tok)
print('french (predicted): ', predicted_sentence)

english:             ST his least favorite fruit is the pear , but our least favorite is the banana . EN
french (original):   ST son fruit préféré est moins la poire , mais notre moins préféré est la banane . EN
french (predicted):  son fruit préféré est moins la poire mais son moins préféré est la fraise EN
