In [None]:
# Import necessaries librairies
import pandas as pd
import numpy as np 
import sklearn
import tensorflow_datasets as tfds
import tensorflow as tf 
import os
from sklearn.model_selection import train_test_split

#Load data with 5000 lines 
doc = pd.read_csv("https://go.aws/38ECHUB", delimiter="\t", header=None, nrows=25000)
doc.rename(columns={0:'en', 1:'fr'}, inplace=True)

#Add the word <start> to the target to change en and fr
doc["en"] = doc['en'].apply(lambda x: "<start> "+ x)

#Create 2 tokenizers fr and eng, add filters that doesn't contain <> 
tokenizer_fr = tf.keras.preprocessing.text.Tokenizer() 
tokenizer_en = tf.keras.preprocessing.text.Tokenizer(filters='!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n')

#Fit tokenizer on fr and en
tokenizer_fr.fit_on_texts(doc['fr'])
tokenizer_en.fit_on_texts(doc["en"])

#Apply fitted tokenizers to the dataset by creating 2 columns, one for fr and 1 for en
doc["fr_indices"] = tokenizer_fr.texts_to_sequences(doc['fr'])
doc["en_indices"] = tokenizer_en.texts_to_sequences(doc['en'])

#Store in 2 variables all indices variables once they are padded with pad_sequence
padded_fr_indices = tf.keras.preprocessing.sequence.pad_sequences(doc["fr_indices"], padding="post")
padded_en_indices = tf.keras.preprocessing.sequence.pad_sequences(doc["en_indices"], padding="post")

print(padded_fr_indices.shape, padded_en_indices.shape)

#Do train_test_split to devide into train and validation sets 
X_train, X_val, y_train, y_val = train_test_split(padded_fr_indices, padded_en_indices, test_size=.2)

#Select Batch size and do tensor slices 
BATCH_SIZE=128
train = tf.data.Dataset.from_tensor_slices((X_train, y_train)).shuffle(len(X_train)).batch(BATCH_SIZE)
val = tf.data.Dataset.from_tensor_slices((X_val, y_val)).batch(BATCH_SIZE)

print(X_train.shape, X_val.shape, y_train.shape, y_val.shape)

In [3]:
n_embed = 1024
n_gru = 256
vocab_inp_size = len(tokenizer_fr.word_index)
vocab_tar_size = len(tokenizer_en.word_index)


![bahdanau](https://full-stack-bigdata-datasets.s3.eu-west-3.amazonaws.com/Deep+Learning/attention/Attention-encoder-decoder.drawio.png)

In [4]:
class Encoder_maker(tf.keras.Model):
  def __init__(self, in_vocab_size, embed_dim, n_units):
    super().__init__()
    # Declaring layers I need 
    self.embed = tf.keras.layers.Embedding(input_dim=in_vocab_size, output_dim=embed_dim)
    self.gru = tf.keras.layers.GRU(units=n_units, return_sequences=True, return_state=True)
    
  def __call__(self, input_batch):
    #Returning both values of GRU
    return self.gru(self.embed(input_batch))
    
class Bahdanau_attention_maker(tf.keras.layers.Layer):
  def __init__(self, attention_units):
    super().__init__()
    # Declaring layers I need 
    self.W1 = tf.keras.layers.Dense(units=attention_units)
    self.W2 = tf.keras.layers.Dense(units=attention_units)
    self.V = tf.keras.layers.Dense(units=1)

  def __call__(self, enc_out, state):
    #Using first 2 neurons
    self.W1_out = self.W1(enc_out) 
    self.W2_out = self.W2(tf.expand_dims(state, axis = 1)) 
    self.sum = self.W1_out + self.W2_out  
    
    #Using third neuron
    self.score = self.V(tf.nn.tanh(self.sum)) 

    #Creating Attention vector
    self.attention_weights = tf.nn.softmax(self.score, axis=1) 
    self.weighted_enc_out = enc_out * self.attention_weights 
    self.context_vector = tf.reduce_sum(self.weighted_enc_out, axis=1) 

    return self.context_vector, self.attention_weights
     
class Decoder_maker(tf.keras.Model):
  def __init__(self, tar_vocab_size, embed_dim, n_units):
    super().__init__()
    # Declaring layers I need 
    self.embed = tf.keras.layers.Embedding(input_dim=tar_vocab_size, output_dim=embed_dim)
    self.gru = tf.keras.layers.GRU(units=n_units, return_sequences=True, return_state=True)
    self.pred = tf.keras.layers.Dense(units=tar_vocab_size,activation="softmax")
    self.attention = Bahdanau_attention_maker(attention_units=n_units)

  def __call__(self, dec_in, enc_out, state):
    # Applying attention layer 
    self.context_vector, self.attention_weights = self.attention(enc_out,state)
    self.embed_out = self.embed(dec_in) 
    self.concat = tf.keras.layers.concatenate([self.embed_out, tf.expand_dims(self.context_vector, axis=1)])

    # Using GRU
    self.gru_out, self.gru_state = self.gru(self.concat) 
    self.pred_out = self.pred(tf.reshape(self.gru_out, shape=(-1, self.gru_out.shape[2]))) 

    return self.pred_out, self.gru_state, self.attention_weights

In [5]:
encoder = Encoder_maker(vocab_inp_size+1, n_embed, n_gru)
decoder = Decoder_maker(tar_vocab_size=vocab_tar_size+1, embed_dim=n_embed, n_units=n_gru)

In [6]:
optimizer = tf.keras.optimizers.Adam()
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(reduction='none')

def loss_function(real, pred):
  loss_ = loss_object(real, pred)
  mask = tf.cast(tf.math.logical_not(tf.math.equal(real, 0)), dtype=loss_.dtype)
  loss_ *= mask
  return tf.reduce_mean(loss_)

In [7]:
checkpoint_dir = './training_checkpoints2'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(optimizer=optimizer, encoder=encoder, decoder=decoder)

In [8]:
def train_step(inp, targ):
  loss = 0
  with tf.GradientTape() as tape: 
    enc_output, enc_state = encoder(inp)
    dec_state = enc_state 
    dec_input = tf.expand_dims(targ[:,0], axis=1)
    for t in range(1, targ.shape[1]):
      pred, dec_state, _ = decoder(dec_input, enc_output, dec_state)
      loss += loss_function(targ[:, t], pred)
      dec_input = tf.expand_dims(targ[:, t], 1)

  variables = encoder.trainable_variables + decoder.trainable_variables 
  gradients = tape.gradient(loss, variables)
  optimizer.apply_gradients(zip(gradients, variables)) 

  return (loss / int(targ.shape[1]))

In [9]:
import time
EPOCHS = 100

for epoch in range(EPOCHS):
  start = time.time()
  total_loss = 0

  for (batch, (inp, targ)) in enumerate(train):
    batch_loss = train_step(inp, targ)
    total_loss += batch_loss

    if batch % 10 == 0:
      print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1, batch, batch_loss.numpy()))
  
  checkpoint.save(file_prefix = checkpoint_prefix)
  print('Epoch {} Loss {:.4f}'.format(epoch + 1, total_loss))
  print('Time taken for 1 epoch {} sec'.format(time.time() - start))

  enc_input = X_val
  dec_input = tf.zeros(shape=(len(X_val),1))
  enc_out, enc_state = encoder(enc_input)
  dec_state = enc_state

  pred = []

  for i in range(y_val.shape[1]-1):
    dec_out, dec_state, attention_w = decoder(dec_input, enc_out, dec_state)
    decoded_out = tf.expand_dims(tf.argmax(dec_out, axis=-1), axis=1)
    pred.append(tf.expand_dims(dec_out,axis=1)) 
    dec_input = decoded_out 

  pred = tf.concat(pred, axis=1).numpy()
  print("\n val loss :", loss_function(y_val[:,1:],pred),"\n")

Epoch 1 Batch 0 Loss 3.1797
Epoch 1 Loss 21.9276
Time taken for 1 epoch 2.1873700618743896 sec

 val loss : tf.Tensor(4.0299044, shape=(), dtype=float32) 

Epoch 2 Batch 0 Loss 2.8797
Epoch 2 Loss 19.5678
Time taken for 1 epoch 0.6481108665466309 sec

 val loss : tf.Tensor(3.8183992, shape=(), dtype=float32) 

Epoch 3 Batch 0 Loss 2.6701
Epoch 3 Loss 18.4579
Time taken for 1 epoch 0.6290912628173828 sec

 val loss : tf.Tensor(3.8240035, shape=(), dtype=float32) 

Epoch 4 Batch 0 Loss 2.4998
Epoch 4 Loss 17.8649
Time taken for 1 epoch 0.6332724094390869 sec

 val loss : tf.Tensor(3.8400233, shape=(), dtype=float32) 

Epoch 5 Batch 0 Loss 2.4902
Epoch 5 Loss 17.0234
Time taken for 1 epoch 0.641333818435669 sec

 val loss : tf.Tensor(3.9425151, shape=(), dtype=float32) 

Epoch 6 Batch 0 Loss 2.3161
Epoch 6 Loss 16.5858
Time taken for 1 epoch 0.6108508110046387 sec

 val loss : tf.Tensor(3.966123, shape=(), dtype=float32) 

Epoch 7 Batch 0 Loss 2.3471
Epoch 7 Loss 15.7355
Time taken for 1 

In [10]:
checkpoint.restore(tf.train.latest_checkpoint(checkpoint_dir))
encoder_latest=checkpoint.encoder
decoder_latest=checkpoint.decoder

In [11]:
enc_input = X_val
dec_input = tf.zeros(shape=(len(X_val),1))
enc_out, enc_state = encoder_latest(enc_input)
dec_state = enc_state

pred = []  

for i in range(y_val.shape[1]-1):
  dec_out, dec_state, attention_w = decoder_latest(dec_input, enc_out, dec_state)
  decoded_out = tf.expand_dims(tf.argmax(dec_out, axis=-1), axis=1)
  pred.append(decoded_out) 
  dec_input = decoded_out 

pred = tf.concat(pred, axis=-1).numpy()

pred_text = tokenizer_en.sequences_to_texts(pred)
y_val_text = tokenizer_en.sequences_to_texts(y_val[:,1:])
for i in range(10):
  print("pred:", pred_text[i])
  print("true:", y_val_text[i])
  print("\n")

pred: am cold i'm
true: i'm right


pred: take it hot
true: take it


pred: you up you
true: move over


pred: go ahead go
true: go ahead


pred: on it you
true: here's 5


pred: i give up
true: i forgot


pred: me go away
true: go now


pred: who knows she
true: who spoke


pred: i hope so
true: i'm ok


pred: a man it
true: keep it


