<a href="https://colab.research.google.com/github/martinpius/RNN-ALIENS/blob/main/Sequence_to_sequence_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [76]:
from google.colab import drive
drive.mount("/content/drive", force_remount = True)
try:
  COLAB = True
  import tensorflow as tf
  print(f"You are using Colab with tensorflow version {tf.__version__}")
except Exception as e:
  COLAB = False
  print(f"{type(e)}: {e}\n....Please Load Your Drive....")

def time_fmt(x):
  h = int(x / (60 * 60))
  m = int(x % (60 * 60) / 60)
  s = int(x % 60)
  return f"{h}: {m:>03}: {s:>05.2f}"

time_fmt(240.892)

Mounted at /content/drive
You are using Colab with tensorflow version 2.4.0


'0: 004: 00.00'

In [75]:
import time, io, os, re, unicodedata
import matplotlib as mlp
import matplotlib.pyplot as plt
from matplotlib.pyplot import imshow
import matplotlib.ticker as ticker
import tensorflow as tf
import numpy as np

In [3]:
#Lets build an encoder-decoder network for machine translation

In [77]:
#Importing and preprocessing the data
#We will train a machine to translate spanish language to english
#It is a simple MT with attention mechanism to learn variable/words contribution

In [78]:
folder_path = tf.keras.utils.get_file(fname = "spa-eng.zip", origin = "http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip",
                                      extract = True)
file_path = os.path.dirname(folder_path) + "/spa-eng/spa.txt"

In [79]:
#Change the unicode data format into ascii format:
def ascii_fmt(t):
  return "".join(k for k in unicodedata.normalize('NFD',t) if unicodedata.category(k) != 'Mn')

In [80]:
def preprocess_text(t):
  '''We do some cleaning and marking the start and the end of each sentence'''
  t = ascii_fmt(t.lower().strip()) #Convert to lower cases and strip the white spaces
  t = re.sub(r"([?,!.多])", r" \1 ", t)
  t = re.sub(r'[" "]+', " ", t)
  t = re.sub(r'[^a-zA-Z?多,.!]+'," ", t) # For each sentence we replace the everthing else except the one listed with a white space
  t = t.strip() #Strip the white spaces
  t = '<start>' + t + '<end>' #Marking the start and the end of each sentence with start, end
  return t

In [81]:
#Testing the above function if it works as intended:
en_verse = u"May I borrow this book?"
sp_verse = u"多Puedo tomar prestado este libro?"

In [82]:
en_out = preprocess_text(en_verse)

In [83]:
sp_out = preprocess_text(sp_verse)

In [84]:
print(en_out)
print()
print(sp_out)

<start>may i borrow this book ?<end>

<start>多 puedo tomar prestado este libro ?<end>


In [85]:
#Create sentences pairs for english-spanish side by side

In [86]:
def data_creator(path, sample_size):
  lines = io.open(path, encoding = 'UTF-8').read().strip().split("\n")
  words_pair = [[preprocess_text(t) for t in line.split("\t")] for line in lines[:sample_size]]
  return zip(*words_pair)

In [87]:
#Applying the function
sample_size = 50000

In [88]:
#Load the sample of size 50000
eng_text, spa_text = data_creator(file_path,sample_size )

In [89]:
print(f"English text is : {eng_text[-1]}")
print()
print(f"Spanish text is: {spa_text[-1]}")

English text is : <start>the people are so friendly .<end>

Spanish text is: <start>la gente es muy amable .<end>


In [90]:
#Tokenize and padding the data to the right

In [91]:
def ln_tokenization(text):
  tokenizer = tf.keras.preprocessing.text.Tokenizer(filters = '')
  tokenizer.fit_on_texts(text)
  tensor = tokenizer.texts_to_sequences(text)
  tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor, padding = 'post')
  return tensor, tokenizer

In [92]:
#Load the cleaned and prepared dataset for training our MT-model

In [93]:
def load_clean_data(path, sample_size = None):
  target_lang, input_lang = data_creator(file_path, sample_size)
  input_tensor, input_tokenizer = ln_tokenization(input_lang)
  target_tensor, target_tokenizer = ln_tokenization(target_lang)
  return input_tensor, target_tensor, input_tokenizer, target_tokenizer

In [94]:
sample_size = 50000
input_tensor, target_tensor, input_lang, target_lang = load_clean_data(file_path, sample_size)

In [95]:
print(f"Max_len_input: {input_tensor.shape[1]}, Max_len_output: {target_tensor.shape[1]}")

Max_len_input: 14, Max_len_output: 10


In [96]:
#Split the data for training and testing 
from sklearn.model_selection import train_test_split

In [97]:
x_train, x_test, y_train,y_test = train_test_split(input_tensor, target_tensor, test_size = 0.2)

In [98]:
print(f"x_train_shape: {x_train.shape}, y_train_shape: {y_train.shape}\nx_test_shape: {x_test.shape}, y_test_shape: {y_test.shape}")

x_train_shape: (40000, 14), y_train_shape: (40000, 10)
x_test_shape: (10000, 14), y_test_shape: (10000, 10)


In [99]:
#Map every word in a text to an index (number)
def create_index(lang, tensor):
  for t in tensor:
    if t != 0:
      print("%d---->%s" %(t, lang.index_word[t]))

In [100]:
#Testing the map function
print('Input language, index-word mapping')
create_index(input_lang, x_train[10])

Input language, index-word mapping
122----><start>solo
92---->era
2278---->curiosidad
1---->.<end>


In [101]:
print("output language, index-word")
create_index(target_lang, y_train[10])

output language, index-word
2----><start>i
19---->was
100---->just
1310---->curious
1---->.<end>


In [102]:
#Creating a tensorflow data type for easy training

In [103]:
batch_size = 64
BUFFER = len(x_train)
step_per_epoch = BUFFER // batch_size
step_per_epoch_eval = len(x_test)//batch_size
units = 1024
embedding_dim = 512
input_voc_size = len(input_lang.word_index) + 1
output_voc_size = len(target_lang.word_index) + 1



In [104]:
train_data = tf.data.Dataset.from_tensor_slices((x_train, y_train)).shuffle(BUFFER)
train_data = train_data.batch(batch_size, drop_remainder = True)
test_data = tf.data.Dataset.from_tensor_slices((x_test, y_test)).shuffle(BUFFER)
test_data = test_data.batch(batch_size, drop_remainder = True)

In [105]:
#The above datasets are ready for training:
#We employ model subclassing to build both encoder and decoder network
#We the use layer subclassing to construct an attention mechanism
#In this project we will employ additive attention (Bhanadau's attention) with 3 parameters to be learnt

In [106]:
class Encoder(tf.keras.Model):
  def __init__(self, enc_units, voc_size, batch_size, embedding_dim, name = 'encoder', **kwargs):
    super(Encoder, self).__init__(name = name, **kwargs)
    self.enc_units = enc_units
    self.batch_size = batch_size
    self.enc_embedding = tf.keras.layers.Embedding(input_dim = voc_size, output_dim = embedding_dim, name = 'encoder_embd')
    self.enc_gru = tf.keras.layers.GRU(units = self.enc_units, kernel_initializer = 'glorot_uniform',
                                       return_state = True, return_sequences = True,
                                       recurrent_dropout = 0.5, dropout = 0.5,
                                       name = 'encoder_gru')
  
  def call(self, inputs, hidden):
    inputs = self.enc_embedding(inputs)
    enc_out, enc_hidden = self.enc_gru(inputs, initial_state = hidden)
    return enc_out, enc_hidden
  
  def hidden_initializer(self):
    return tf.zeros(shape = (self.batch_size, self.enc_units))


In [107]:
#Instantiate and testing the encoder

In [108]:
encoder = Encoder(units, input_voc_size, batch_size,embedding_dim, name = 'encoder')



In [109]:
sample_x_batch_train, sample_y_batch_train = next(iter(train_data))

In [110]:
hidden_state = encoder.hidden_initializer()

In [111]:
sample_enc_out, sample_enc_hidden = encoder(sample_x_batch_train, hidden_state)

In [112]:
print(f"sample_enc_out_shape: {sample_enc_out.shape}\nsample_enc_hidden_shape: {sample_enc_hidden.shape}")

sample_enc_out_shape: (64, 14, 1024)
sample_enc_hidden_shape: (64, 1024)


In [113]:
#The decoder network without an attention

In [114]:
class Decoder(tf.keras.Model):
  def __init__(self, dec_units, embedding_dim, voc_size, batch_size, name = 'decoder', **kwargs):
    super(Decoder, self).__init__(name = name, **kwargs)
    self.dec_units = dec_units
    self.batch_size = batch_size
    self.dec_embedding = tf.keras.layers.Embedding(input_dim = voc_size, output_dim = embedding_dim, name = 'decoder_embd')
    self.dec_gru = tf.keras.layers.GRU(units = self.dec_units, kernel_initializer = 'glorot_uniform',
                                       return_state = True, return_sequences = True,
                                       recurrent_dropout = 0.5, dropout = 0.5,
                                       name = 'decoder_gru')
    self.fc = tf.keras.layers.Dense(units = voc_size, kernel_initializer = 'glorot_uniform',activation = 'softmax')
  
  def call(self, inputs, hidden):
    inputs = self.dec_embedding(inputs)
    dec_out, dec_hidden = self.dec_gru(inputs, initial_state = hidden)
    dec_out = tf.reshape(dec_out, shape = (-1, dec_out.shape[2]))
    inputs = self.fc(dec_out)
    return inputs, dec_hidden

In [115]:
#Instantiate the decoder and testing using the sample batch encoder output 

In [116]:
decoder = Decoder(units, embedding_dim,output_voc_size,batch_size,name = 'decoder')



In [117]:
sample_dec_out, sample_dec_hidden = decoder(tf.random.uniform(shape = (batch_size,1)),sample_enc_hidden)

In [118]:
print(f"sample_dec_out_shape: {sample_dec_out.shape}\nsample_dec_hidden_shape: {sample_dec_hidden.shape}")

sample_dec_out_shape: (64, 7525)
sample_dec_hidden_shape: (64, 1024)


In [119]:
#Dotproduct attention:
class DotproductAttention(tf.keras.layers.Layer):
  def call(self, query, values):
    query_expanded = tf.expand_dims(query,1)# adding the time dimension
    score = query_expanded * values
    score = tf.reduce_sum(score, axis = 2)
    score = tf.expand_dims(score, axis = 2)
    attention_wt = tf.nn.softmax(score, axis = 1)
    context = attention_wt * values
    context_vector = tf.reduce_sum(context, axis = 1)
    return context_vector, attention_wt


In [120]:
#Testing the attention 
attention_layer = DotproductAttention()
sample_context_vector, sample_attention_wt = attention_layer(sample_enc_hidden, sample_enc_out)

In [121]:
print(f"sample_context_vector_shape: {sample_context_vector.shape}:\nsample_attention_wt_shape: {sample_attention_wt.shape}")

sample_context_vector_shape: (64, 1024):
sample_attention_wt_shape: (64, 14, 1)


In [122]:
#Bhanadau attention:
class BhanadauAttention(tf.keras.layers.Layer):
  def __init__(self, units, name = 'bhanadau',**kwargs):
    super(BhanadauAttention, self).__init__(name = name, **kwargs)
    self.W1 = tf.keras.layers.Dense(units = units)
    self.W2 = tf.keras.layers.Dense(units = units)
    self.V = tf.keras.layers.Dense(units = 1)
  
  def call(self, query, values):
    query_expanded = tf.expand_dims(query, 1) #Add the time dimension for the hidden state
    score = self.V(tf.nn.tanh(self.W1(query_expanded) + self.W2(values)))
    attention_wt = tf.nn.softmax(score, axis = 1)
    context = attention_wt * values
    context_vector = tf.reduce_sum(context, axis = 1)
    return context_vector, attention_wt

In [123]:
#Instantiate and testing the bhanadau attention

In [124]:
bhnadau = BhanadauAttention(10, name = 'bhanadau')

In [125]:
sample_bhanadau_context_vec, sample_bhanadau_attention_wt = bhnadau(sample_enc_hidden, sample_enc_out)

In [126]:
print(f"sample_bhanadau_context_vec_shape: {sample_bhanadau_context_vec.shape}\nsample_nhanadau_attention_wt_shape: {sample_bhanadau_attention_wt.shape}")

sample_bhanadau_context_vec_shape: (64, 1024)
sample_nhanadau_attention_wt_shape: (64, 14, 1)


In [127]:
#Decoder with attention:

In [128]:
class DecoderWithAttention(tf.keras.Model):
  def __init__(self,dec_units, voc_size, batch_size,embedding_dim,attention_layer = None, name = 'dec_with_attention',**kwargs):
    super(DecoderWithAttention, self).__init__(name = name, **kwargs)
    self.batch_size = batch_size
    self.dec_units = dec_units
    self.dec_embedding = tf.keras.layers.Embedding(input_dim = voc_size, output_dim = embedding_dim, name = 'dec_embedding')
    self.dec_gru = tf.keras.layers.GRU(units = self.dec_units, kernel_initializer = 'glorot_uniform',
                                       return_state = True, return_sequences = True,
                                       recurrent_dropout = True, dropout = True,
                                       name = 'dec_gru')
    self.fc = tf.keras.layers.Dense(units = voc_size)
    self.attention = attention_layer

  def call(self,inputs,enc_hidden, enc_out):
    inputs = self.dec_embedding(inputs)
    attention_wt = None
    if self.attention:
      context_vector, attention_wt = self.attention(enc_hidden, enc_out)
      inputs = tf.concat([tf.expand_dims(context_vector, 1), inputs], axis = -1)
      dec_out, dec_hidden_state = self.dec_gru(inputs, initial_state = enc_hidden)
      dec_out = tf.reshape(dec_out, shape = (-1, dec_out.shape[2])) 
      inputs = self.fc(dec_out)
      return inputs, dec_hidden_state, attention_wt

In [129]:
#Instantiate and testing the decoder with dotproduct attention

In [130]:
decoder_with_attention = DecoderWithAttention(units, output_voc_size,batch_size, embedding_dim,attention_layer= attention_layer, name = 'dec_attn')



In [131]:
sample_dec_out_with_attn, sample_dec_hidden_with_attn,sample_attent_wt = decoder_with_attention(tf.random.uniform(shape = (batch_size,1)), sample_enc_hidden, sample_enc_out)

In [132]:
print(f"sample_dec_hidden_with_attn_shape: {sample_dec_hidden_with_attn.shape}\nsample_dec_out_with_attn_shape: {sample_dec_out_with_attn.shape}\nsample_attn_wt-shape: {sample_attent_wt.shape}")

sample_dec_hidden_with_attn_shape: (64, 1024)
sample_dec_out_with_attn_shape: (64, 7525)
sample_attn_wt-shape: (64, 14, 1)


In [133]:
#Training loop from scratch

In [134]:
#Get the loss function and the optimizer

In [135]:
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits = True, reduction = 'none')

In [136]:
def custom_loss(y_real, y_pred):
  mask = tf.math.logical_not(tf.math.equal(y_real, 0))
  loss_ = loss_object(y_real, y_pred)
  mask = tf.cast(mask, dtype = loss_.dtype)
  loss_*=mask
  return tf.reduce_mean(loss_)


In [137]:
#Testing the custom loss function:
loss_object_test = loss_object([1.0,0.8],[[1.0,0.6,0.7,0.2],[0.3,1,0.7,0.5]])

In [138]:
custom_loss_test = custom_loss([1.0,0.8],[[1.0,0.6,0.7,0.2],[0.3,1,0.7,0.5]])

In [139]:
print(f"Loss_object_test gives:{loss_object_test}\nCustomized loss fn gives: {custom_loss_test:.3f} ")

Loss_object_test gives:[1.4509848 1.7451882]
Customized loss fn gives: 1.598 


In [140]:
optimizer = tf.keras.optimizers.RMSprop(learning_rate = 1e-3)

In [141]:
#The training step fuction using @tf.fuction as decorator to speed excecution

In [142]:
def get_train_fun():
  @tf.function
  def train_step(inputs, outputs, enc_hidden, encoder, decoder):
    loss = 0
    with tf.GradientTape() as tape:
      enc_out, enc_hidden = encoder(inputs, enc_hidden)
      dec_hidden = enc_hidden
      dec_inputs = tf.expand_dims([target_lang.word_index['start']] * batch_size, 1)
      for t in range(1, outputs.shape[1]):
        preds, dec_hidden,attn_wt = decoder(dec_inputs, dec_hidden,enc_out)
        loss+=custom_loss(outputs[:,t], preds)
        dec_inputs = tf.expand_dims(outputs[:,t],1)
    batch_loss = (loss/outputs.shape[1])
    trainable_vars = encoder.trainable_variables + decoder.trainable_variables
    grads = tape.gradient(loss, trainable_vars)
    optimizer.apply_gradients(zip(grads, trainable_vars))
    return batch_loss
  return train_step

In [143]:
#Define the validation loss:
def custom_validation_loss(inputs, outputs, enc_hidden, encoder, decoder):
  loss = 0
  enc_out, enc_hidden = encoder(inputs, enc_hidden)
  dec_hidden = enc_hidden
  dec_inputs = tf.expand_dims([target_lang.word_index['start']] * batch_size, 1)
  for t in range(1, outputs.shape[1]):
    preds,dec_hidden,attn_wt = decoder(dec_inputs,dec_hidden,enc_out)
    loss+=custom_loss(outputs[:, t], preds)
    dec_inputs = tf.expand_dims(outputs[:, t],1)
  loss = (loss / outputs.shape[1])
  return loss

In [144]:
#Training the model from scatch:
def train_MTN(epochs, attention):
  encoder = Encoder(units, input_voc_size,batch_size,embedding_dim,name = 'encoder')
  decoder = DecoderWithAttention(units,output_voc_size,batch_size,embedding_dim,attention_layer = attention)
  train_step_comp = get_train_fun()
  train_loss = []
  validation_loss = []
  for epoch in range(epochs):
    tic = time.time()
    enc_hidden = encoder.hidden_initializer()
    total_loss = 0
    for (step, (input,output)) in enumerate(train_data.take(step_per_epoch)):
      batch_loss = train_step_comp(input,output,enc_hidden,encoder, decoder)
      total_loss+=batch_loss
      if step % 100 == 0:
        print(f"Epoch: {epoch + 1}: Batch: {step}: Loss: {batch_loss: .4f}")
    
    enc_hidden = encoder.hidden_initializer()
    total_val_loss = 0
    for (step, (input, output)) in enumerate(test_data.take(step_per_epoch_eval)):
      val_loss = custom_validation_loss(input,output,enc_hidden,encoder, decoder)
      total_val_loss+=val_loss
    train_loss.append(total_loss/step_per_epoch)
    validation_loss.append(total_val_loss/step_per_epoch_eval)
    print(f"Epoch: {epoch + 1}: Train Loss: {train_loss[-1]:.4f}: Validation Loss: {validation_loss[-1]:.4f}")
    print(f"Time elapsed is : {time_fmt(time.time()-tic)}")
  return encoder, decoder, train_loss, validation_loss



In [72]:
#Training the MTN

In [73]:
epochs = 15
attention = attention_layer
encoder, decoder,train_loss, validation_loss = train_MTN(epochs, attention)

Epoch: 1: Batch: 0: Loss:  4.1279
Epoch: 1: Batch: 100: Loss:  2.0896
Epoch: 1: Batch: 200: Loss:  2.1274
Epoch: 1: Batch: 300: Loss:  1.7351
Epoch: 1: Batch: 400: Loss:  1.8572
Epoch: 1: Batch: 500: Loss:  1.5767
Epoch: 1: Batch: 600: Loss:  1.6061
Epoch: 1: Train Loss: 1.9197: Validation Loss: 1.5464
Time elapsed is : 0: 001: 55.00
Epoch: 2: Batch: 0: Loss:  1.4572
Epoch: 2: Batch: 100: Loss:  1.4176
Epoch: 2: Batch: 200: Loss:  1.5282
Epoch: 2: Batch: 300: Loss:  1.4668
Epoch: 2: Batch: 400: Loss:  1.3292
Epoch: 2: Batch: 500: Loss:  1.1903
Epoch: 2: Batch: 600: Loss:  1.1544
Epoch: 2: Train Loss: 1.3283: Validation Loss: 1.2519
Time elapsed is : 0: 001: 42.00
Epoch: 3: Batch: 0: Loss:  1.0903
Epoch: 3: Batch: 100: Loss:  1.1467
Epoch: 3: Batch: 200: Loss:  1.1109
Epoch: 3: Batch: 300: Loss:  1.1157
Epoch: 3: Batch: 400: Loss:  0.9820
Epoch: 3: Batch: 500: Loss:  1.1217
Epoch: 3: Batch: 600: Loss:  0.8441
Epoch: 3: Train Loss: 1.0349: Validation Loss: 1.1013
Time elapsed is : 0: 001

In [145]:
epochs = 15
attention = bhnadau
encoder, decoder,train_loss, validation_loss = train_MTN(epochs, attention)

Epoch: 1: Batch: 0: Loss:  3.9051
Epoch: 1: Batch: 100: Loss:  2.3026
Epoch: 1: Batch: 200: Loss:  1.9640
Epoch: 1: Batch: 300: Loss:  1.6806
Epoch: 1: Batch: 400: Loss:  1.5599
Epoch: 1: Batch: 500: Loss:  1.5685
Epoch: 1: Batch: 600: Loss:  1.5704
Epoch: 1: Train Loss: 1.8707: Validation Loss: 1.4882
Time elapsed is : 0: 001: 56.00
Epoch: 2: Batch: 0: Loss:  1.2919
Epoch: 2: Batch: 100: Loss:  1.3495
Epoch: 2: Batch: 200: Loss:  1.0954
Epoch: 2: Batch: 300: Loss:  1.3205
Epoch: 2: Batch: 400: Loss:  1.2194
Epoch: 2: Batch: 500: Loss:  1.1985
Epoch: 2: Batch: 600: Loss:  0.9686
Epoch: 2: Train Loss: 1.2674: Validation Loss: 1.1746
Time elapsed is : 0: 001: 43.00
Epoch: 3: Batch: 0: Loss:  0.9395
Epoch: 3: Batch: 100: Loss:  1.0114
Epoch: 3: Batch: 200: Loss:  1.0499
Epoch: 3: Batch: 300: Loss:  1.0728
Epoch: 3: Batch: 400: Loss:  0.8695
Epoch: 3: Batch: 500: Loss:  0.8148
Epoch: 3: Batch: 600: Loss:  0.8728
Epoch: 3: Train Loss: 0.9615: Validation Loss: 1.0284
Time elapsed is : 0: 001