In [2]:
import sklearn

import numpy as np 
import pandas as pd
import tensorflow as tf 
import tensorflow_datasets as tfds

from sklearn.model_selection import train_test_split
tf.__version__

  from .autonotebook import tqdm as notebook_tqdm


'2.11.0'

In [3]:
df = pd.read_csv("https://go.aws/38ECHUB", delimiter="\t", header=None, nrows=5000)
df.rename(columns={0:'en', 1:'fr'}, inplace=True)

In [4]:
df

Unnamed: 0,en,fr
0,Go.,Va !
1,Hi.,Salut !
2,Run!,Cours !
3,Run!,Courez !
4,Wow!,Ça alors !
...,...,...
4995,I am so sorry.,Je suis tellement désolé !
4996,I am so sorry.,Je suis tellement désolée !
4997,I am very sad.,Je suis très triste.
4998,I ate a donut.,J'ai mangé un beignet.


In [5]:
df["padded_en"] = df["en"].apply(lambda x : f"<start> {x}")

In [6]:
tokenizer_fr = tf.keras.preprocessing.text.Tokenizer()
tokenizer_fr.fit_on_texts(df["fr"])
df["fr_indices"] = tokenizer_fr.texts_to_sequences(df["fr"])

In [7]:
tokenizer_en = tf.keras.preprocessing.text.Tokenizer(filters='<>!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n')
tokenizer_en.fit_on_texts(df["en"])
df["en_indices"] = tokenizer_en.texts_to_sequences(df["en"])

In [8]:
df["padded_en_indices"] = tokenizer_en.texts_to_sequences(df["padded_en"])

In [9]:
df["padded_en_indices_clean"] = df["padded_en_indices"].apply(lambda x : x[:-1])

In [10]:
df

Unnamed: 0,en,fr,padded_en,fr_indices,en_indices,padded_en_indices,padded_en_indices_clean
0,Go.,Va !,<start> Go.,[36],[10],"[218, 10]",[218]
1,Hi.,Salut !,<start> Hi.,[404],[615],"[218, 615]",[218]
2,Run!,Cours !,<start> Run!,[1212],[110],"[218, 110]",[218]
3,Run!,Courez !,<start> Run!,[1213],[110],"[218, 110]",[218]
4,Wow!,Ça alors !,<start> Wow!,"[22, 1214]",[871],"[218, 871]",[218]
...,...,...,...,...,...,...,...
4995,I am so sorry.,Je suis tellement désolé !,<start> I am so sorry.,"[1, 2, 181, 232]","[1, 23, 77, 127]","[218, 1, 23, 77, 127]","[218, 1, 23, 77]"
4996,I am so sorry.,Je suis tellement désolée !,<start> I am so sorry.,"[1, 2, 181, 361]","[1, 23, 77, 127]","[218, 1, 23, 77, 127]","[218, 1, 23, 77]"
4997,I am very sad.,Je suis très triste.,<start> I am very sad.,"[1, 2, 208, 169]","[1, 23, 286, 133]","[218, 1, 23, 286, 133]","[218, 1, 23, 286]"
4998,I ate a donut.,J'ai mangé un beignet.,<start> I ate a donut.,"[12, 401, 23, 684]","[1, 395, 6, 589]","[218, 1, 395, 6, 589]","[218, 1, 395, 6]"


In [11]:
padding_fr = tf.keras.preprocessing.sequence.pad_sequences( df["fr_indices"], padding="post")
padding_en = tf.keras.preprocessing.sequence.pad_sequences( df["en_indices"], padding="post")
padded_en_indices_clean = tf.keras.preprocessing.sequence.pad_sequences( df["padded_en_indices_clean"], padding="post")

In [12]:
padding_fr.shape

(5000, 10)

In [13]:
padding_en.shape

(5000, 4)

In [14]:
padded_en_indices_clean.shape

(5000, 4)

In [15]:
en_train, en_val, fr_train, fr_val, teacher_train, teacher_val =  train_test_split(padding_en,
                                                                                   padding_fr,
                                                                                   padded_en_indices_clean,
                                                                                   test_size=0.3)

In [16]:
en_train.shape, en_val.shape, fr_train.shape, fr_val.shape, teacher_train.shape, teacher_val.shape

((3500, 4), (1500, 4), (3500, 10), (1500, 10), (3500, 4), (1500, 4))

In [17]:
n_embed = 128
n_lstm = 64
fr_len = padding_fr.shape[1]
en_len = padding_en.shape[1]
vocab_size_fr = len(tokenizer_fr.word_index)
vocab_size_en = len(tokenizer_en.word_index)


In [18]:
#Setup Encoder
encoder_input = tf.keras.Input(shape=fr_len)
encoder_embed = tf.keras.layers.Embedding(input_dim=vocab_size_fr+1, output_dim=n_embed)
encoder_lstm = tf.keras.layers.LSTM(n_lstm, return_state=True)

encoder_embed_ouput = encoder_embed(encoder_input)
encoder_output = encoder_lstm(encoder_embed_ouput)

encoder = tf.keras.Model(inputs = encoder_input, outputs = encoder_output)

In [19]:
encoder(fr_train)

[<tf.Tensor: shape=(3500, 64), dtype=float32, numpy=
 array([[-0.00669308, -0.01318382, -0.00317102, ...,  0.01990228,
         -0.00015661,  0.00371027],
        [ 0.00074786, -0.01345642,  0.00087293, ...,  0.01017446,
         -0.0059988 ,  0.00316827],
        [-0.00689298, -0.01382952,  0.0003599 , ...,  0.01907188,
         -0.00168965,  0.00392611],
        ...,
        [-0.00769159, -0.01423794, -0.00187745, ...,  0.01765457,
         -0.00244856,  0.00479478],
        [-0.00484343, -0.01421603, -0.00415691, ...,  0.01548809,
         -0.00010933,  0.0048531 ],
        [-0.00624511, -0.01397249, -0.0009027 , ...,  0.01741099,
         -0.00320802,  0.00396907]], dtype=float32)>,
 <tf.Tensor: shape=(3500, 64), dtype=float32, numpy=
 array([[-0.00669308, -0.01318382, -0.00317102, ...,  0.01990228,
         -0.00015661,  0.00371027],
        [ 0.00074786, -0.01345642,  0.00087293, ...,  0.01017446,
         -0.0059988 ,  0.00316827],
        [-0.00689298, -0.01382952,  0.0003599 ,

In [20]:
#Setup Decoder for train 
decoder_input = tf.keras.Input(shape=(en_len))
decoder_embed = tf.keras.layers.Embedding(input_dim=vocab_size_en+1, 
                                          output_dim=n_embed)

decoder_lstm = tf.keras.layers.LSTM(n_lstm, return_sequences=True, return_state=True)
decoder_pred = tf.keras.layers.Dense(vocab_size_en+1, activation="softmax")

decoder_embed_output = decoder_embed(decoder_input)
decoder_lstm_output, _, _ = decoder_lstm(decoder_embed_output, initial_state=encoder_output[1:])
decoder_output = decoder_pred(decoder_lstm_output)

decoder = tf.keras.Model(inputs = [encoder_input,decoder_input], outputs = decoder_output)

In [21]:
#Setup Decoder for inference
decoder_state_input_h = tf.keras.Input(shape=(n_lstm,))
decoder_state_input_c = tf.keras.Input(shape=(n_lstm,))
decoder_states_inputs = [decoder_state_input_h, decoder_state_input_c]

decoder_input_inf = tf.keras.Input(shape=(1))
decoder_embed_output = decoder_embed(decoder_input_inf)

decoder_outputs, state_h, state_c = decoder_lstm(decoder_embed_output, initial_state=decoder_states_inputs)
decoder_states = [state_h, state_c]
decoder_outputs = decoder_pred(decoder_outputs)

decoder_inf = tf.keras.Model(inputs = [decoder_input_inf, decoder_states_inputs], 
                     outputs = [decoder_outputs, decoder_states])

In [22]:
#Compile decoder for training
decoder.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss=tf.keras.losses.SparseCategoricalCrossentropy(),
    metrics=[tf.keras.metrics.SparseCategoricalAccuracy()])

In [23]:
#Training
decoder.fit(x=[fr_train, teacher_train], y=en_train,epochs=50, validation_data=([fr_val, teacher_val], en_val))

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x2507e9215d0>

In [24]:
#Setup for predictions on the validation data
enc_input = fr_val
dec_input = tf.ones(shape=(len(fr_val), 1))

enc_out, state_h_inf, state_c_inf = encoder(enc_input)
dec_state = [state_h_inf, state_c_inf]
pred = [] 

for i in range(en_len):
  dec_out, dec_state = decoder_inf([dec_input, dec_state])
  decoded_out = tf.argmax(dec_out, axis=-1)
  pred.append(decoded_out) 
  dec_input = decoded_out 

pred = tf.concat(pred, axis=-1).numpy()

for i in range(10):
  print("pred:", pred[i,:])
  print("true:", en_val[i,:])
  print("\n")

pred: [ 44 220 336  18]
true: [80 11  0  0]


pred: [37 37  4 92]
true: [  24    4 1213    0]


pred: [ 16 268 268  67]
true: [ 37   1 103   4]


pred: [ 12   4 191 125]
true: [ 13 284   0   0]


pred: [311 110  94   3]
true: [744  35   0   0]


pred: [  5 290  76  38]
true: [   7    5   91 1237]


pred: [ 12   4 191  53]
true: [ 13 130   0   0]


pred: [ 23 127 118 118]
true: [  2 509   0   0]


pred: [30 11 10 38]
true: [ 27  33 507   0]


pred: [ 12   4 106   0]
true: [564  54   0   0]




In [25]:
#Visualize predictions 
y_sample = tokenizer_en.sequences_to_texts(en_val)[:10]
pred_sample = tokenizer_en.sequences_to_texts(pred)[:10]

for i, j in zip(y_sample,pred_sample):
  print("true:", i)
  print("pred", j)
  print("\n")

true: help me
pred to sleep tight up


true: can you skate
pred did did you ok


true: did i ask you
pred don't care care out


true: you're silly
pred are you young big


true: release him
pred knows run for it


true: he is my type
pred is kind busy in


true: you're funny
pred are you young good


true: i'm thorough
pred am sorry fat fat


true: he's no saint
pred let me go in


true: fish please
pred are you new


