<a href="https://colab.research.google.com/github/martinpius/RNN-ALIENS/blob/main/Sequence_to_sequence_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [139]:
from google.colab import drive
drive.mount("/content/drive", force_remount = True)
try:
  COLAB = True
  import tensorflow as tf
  print(f"You are using Colab with tensorflow version {tf.__version__}")
except Exception as e:
  COLAB = False
  print(f"{type(e)}: {e}\n....Please Load Your Drive....")

def time_fmt(x):
  h = int(x / (60 * 60))
  m = int(x % (60 * 60) / 60)
  s = int(x % 60)
  return f"{h}: {m:>03}: {s:>05.2f}"

time_fmt(240.892)

Mounted at /content/drive
You are using Colab with tensorflow version 2.4.0


'0: 004: 00.00'

In [140]:
import time, io, os, re, unicodedata
import matplotlib as mlp
import matplotlib.pyplot as plt
from matplotlib.pyplot import imshow
import matplotlib.ticker as ticker
import tensorflow as tf
import numpy as np

In [141]:
#Lets build an encoder-decoder network for machine translation

In [142]:
#Importing and preprocessing the data
#We will train a machine to translate spanish language to english
#It is a simple MT with attention mechanism to learn variable/words contribution

In [143]:
folder_path = tf.keras.utils.get_file(fname = "spa-eng.zip", origin = "http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip",
                                      extract = True)
file_path = os.path.dirname(folder_path) + "/spa-eng/spa.txt"

In [144]:
#Change the unicode data format into ascii format:
def ascii_fmt(t):
  return "".join(k for k in unicodedata.normalize('NFD',t) if unicodedata.category(k) != 'Mn')

In [145]:
def preprocess_text(t):
  '''We do some cleaning and marking the start and the end of each sentence'''
  t = ascii_fmt(t.lower().strip()) #Convert to lower cases and strip the white spaces
  t = re.sub(r"([?,!.¿])", r" \1 ", t)
  t = re.sub(r'[" "]+', " ", t)
  t = re.sub(r'[^a-zA-Z?¿,.!]+'," ", t) # For each sentence we replace the everthing else except the one listed with a white space
  t = t.strip() #Strip the white spaces
  t = '<start>' + t + '<end>' #Marking the start and the end of each sentence with start, end
  return t

In [146]:
#Testing the above function if it works as intended:
en_verse = u"May I borrow this book?"
sp_verse = u"¿Puedo tomar prestado este libro?"

In [147]:
en_out = preprocess_text(en_verse)

In [148]:
sp_out = preprocess_text(sp_verse)

In [149]:
print(en_out)
print()
print(sp_out)

<start>may i borrow this book ?<end>

<start>¿ puedo tomar prestado este libro ?<end>


In [150]:
#Create sentences pairs for english-spanish side by side

In [151]:
def data_creator(path, sample_size):
  lines = io.open(path, encoding = 'UTF-8').read().strip().split("\n")
  words_pair = [[preprocess_text(t) for t in line.split("\t")] for line in lines[:sample_size]]
  return zip(*words_pair)

In [152]:
#Applying the function
sample_size = 50000

In [153]:
#Load the sample of size 50000
eng_text, spa_text = data_creator(file_path,sample_size )

In [154]:
print(f"English text is : {eng_text[-1]}")
print()
print(f"Spanish text is: {spa_text[-1]}")

English text is : <start>the people are so friendly .<end>

Spanish text is: <start>la gente es muy amable .<end>


In [155]:
#Tokenize and padding the data to the right

In [156]:
def ln_tokenization(text):
  tokenizer = tf.keras.preprocessing.text.Tokenizer(filters = '')
  tokenizer.fit_on_texts(text)
  tensor = tokenizer.texts_to_sequences(text)
  tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor, padding = 'post')
  return tensor, tokenizer

In [157]:
#Load the cleaned and prepared dataset for training our MT-model

In [158]:
def load_clean_data(path, sample_size = None):
  target_lang, input_lang = data_creator(file_path, sample_size)
  input_tensor, input_tokenizer = ln_tokenization(input_lang)
  target_tensor, target_tokenizer = ln_tokenization(target_lang)
  return input_tensor, target_tensor, input_tokenizer, target_tokenizer

In [159]:
sample_size = 50000
input_tensor, target_tensor, input_lang, target_lang = load_clean_data(file_path, sample_size)

In [160]:
print(f"Max_len_input: {input_tensor.shape[1]}, Max_len_output: {target_tensor.shape[1]}")

Max_len_input: 14, Max_len_output: 10


In [161]:
#Split the data for training and testing 
from sklearn.model_selection import train_test_split

In [162]:
x_train, x_test, y_train,y_test = train_test_split(input_tensor, target_tensor, test_size = 0.2)

In [163]:
print(f"x_train_shape: {x_train.shape}, y_train_shape: {y_train.shape}\nx_test_shape: {x_test.shape}, y_test_shape: {y_test.shape}")

x_train_shape: (40000, 14), y_train_shape: (40000, 10)
x_test_shape: (10000, 14), y_test_shape: (10000, 10)


In [164]:
#Map every word in a text to an index (number)
def create_index(lang, tensor):
  for t in tensor:
    if t != 0:
      print("%d---->%s" %(t, lang.index_word[t]))

In [165]:
#Testing the map function
print('Input language, index-word mapping')
create_index(input_lang, x_train[10])

Input language, index-word mapping
154----><start>las
270---->cosas
59---->son
5369---->diferentes
96---->ahora
1---->.<end>


In [166]:
print("output language, index-word")
create_index(target_lang, y_train[10])

output language, index-word
1087----><start>things
27---->are
1095---->different
77---->now
1---->.<end>


In [167]:
#Creating a tensorflow data type for easy training

In [168]:
batch_size = 64
BUFFER = len(x_train)
step_per_epoch = BUFFER // batch_size
units = 1024
embedding_dim = 512
input_voc_size = len(input_lang.word_index) + 1
output_voc_size = len(target_lang.word_index) + 1



In [169]:
train_data = tf.data.Dataset.from_tensor_slices((x_train, y_train)).shuffle(BUFFER)
train_data = train_data.batch(batch_size, drop_remainder = True)
test_data = tf.data.Dataset.from_tensor_slices((x_test, y_test))
test_data = test_data.batch(batch_size, drop_remainder = True)

In [170]:
#The above datasets are ready for training:
#We employ model subclassing to build both encoder and decoder network
#We the use layer subclassing to construct an attention mechanism
#In this project we will employ additive attention (Bhanadau's attention) with 3 parameters to be learnt

In [171]:
#The encoder network

In [172]:
class Encoder(tf.keras.Model):
  def __init__(self, voc_size, batch_size, enc_units,embedding_dim, name = 'encoder', **kwargs):
    super(Encoder, self).__init__(name = name, **kwargs)
    self.batch_size = batch_size
    self.enc_units = enc_units
    self.embedding_dim = embedding_dim
    self.embedding_layer = tf.keras.layers.Embedding(input_dim = voc_size, 
                                                     output_dim = embedding_dim, 
                                                     name = 'embd_layer')
    self.gru_layer = tf.keras.layers.GRU(units = self.enc_units, 
                                         return_sequences = True, 
                                         return_state = True, 
                                         kernel_initializer = 'glorot_uniform',
                                         name = 'gru_layer',
                                         dropout = 0.5, recurrent_dropout = 0.25)
  def call(self, inputs, hidden):
    inputs = self.embedding_layer(inputs)
    dec_out, hidden_state = self.gru_layer(inputs, initial_state = hidden)
    return dec_out, hidden_state
    
  def hidden_initializer(self):
    return tf.zeros(shape = (self.batch_size, self.enc_units))
           


In [173]:
#Instantiate the class:

In [174]:
encoder = Encoder(input_voc_size, batch_size,units, embedding_dim, name = 'encoder')



In [175]:
hidden = encoder.hidden_initializer()

In [176]:
#Testing on the input data
train_input_sample_batch, train_output_sample_batch = next(iter(train_data))
train_input_sample_batch.shape, train_output_sample_batch.shape

(TensorShape([64, 14]), TensorShape([64, 10]))

In [177]:
enc_sample_out, enc_hidden = encoder(train_input_sample_batch, hidden)

In [178]:
enc_sample_out.shape, enc_hidden.shape

(TensorShape([64, 14, 1024]), TensorShape([64, 1024]))

In [179]:
print("I am tired: To be continued tomorrow!!!Thanks Jesus my Lord and my God")

I am tired: To be continued tomorrow!!!Thanks Jesus my Lord and my God


In [180]:
#The additive attention mechanism (Bhanadau's attention)
#We use layer subclassing to construct attention network

In [181]:
class BhanadauAttention(tf.keras.layers.Layer):
  def __init__(self, units, name = 'attention', **kwargs):
    super(BhanadauAttention, self).__init__(name = name)
    self.W1 = tf.keras.layers.Dense(units = units,
                                    kernel_initializer = 'random_normal', 
                                    activation = 'relu',
                                    name = 'w1')
    self.W2 = tf.keras.layers.Dense(units = units,
                                    kernel_initializer = 'random_normal',
                                    activation = 'relu',
                                    name = 'w1')
    self.V = tf.keras.layers.Dense(units = 1, 
                                   kernel_initializer = 'random_normal',
                                   activation = 'relu',
                                   name = 'v')
  
  def call(self, query, value):
    query_expanded = tf.expand_dims(query, 1)# Add the time axis
    score = self.V(tf.nn.tanh(self.W1(query_expanded)+ self.W2(value)))
    attention_wt = tf.nn.softmax(score, axis = 1)
    context = attention_wt * value
    context_vector = tf.reduce_sum(context, axis = 1)
    return context_vector, attention_wt

In [182]:
#Instantiste the class
attention = BhanadauAttention(units = 10, name = 'attention')

In [183]:
#Testing on the sample batch
sample_context_vector, sample_attention_wt = attention(enc_hidden, enc_sample_out)

In [184]:
sample_context_vector.shape, sample_attention_wt.shape

(TensorShape([64, 1024]), TensorShape([64, 14, 1]))

In [185]:
#The decoder's network.
#We again use the model subclassing to build this network

In [186]:
class Decoder(tf.keras.Model):
  def __init__(self, voc_size, embedding_dim, dec_units,batch_size, name = 'decoder', **kwargs):
    super(Decoder,self).__init__(name = name, **kwargs)
    self.batch_size = batch_size
    self.embedding_dim = embedding_dim
    self.dec_units = dec_units
    self.embedding_layer = tf.keras.layers.Embedding(input_dim = voc_size, 
                                                     output_dim = embedding_dim,
                                                     name = 'dec_embedding')
    self.gru = tf.keras.layers.GRU(units = self.dec_units, 
                                   return_state = True,
                                   return_sequences = True,
                                   kernel_initializer = 'glorot_uniform',
                                   recurrent_dropout = 0.25,
                                   dropout = 0.5)
    self.fc = tf.keras.layers.Dense(units = voc_size, activation = 'softmax', name = 'decoder_out')
    self.attention = BhanadauAttention(self.dec_units)
  
  def call(self, inputs, hidden, enc_out):
    enc_context_vec, attention_wt = self.attention(hidden,enc_out)
    inputs = self.embedding_layer(inputs)
    inputs = tf.concat([tf.expand_dims(enc_context_vec,1), inputs], axis = -1)
    dec_out, hidden_state = self.gru(inputs)
    dec_out = tf.reshape(dec_out, (-1, dec_out.shape[2]))
    inputs = self.fc(dec_out)
    return inputs, hidden_state, attention_wt


In [187]:
#Instantiate the class
decoder = Decoder(output_voc_size, embedding_dim,units,batch_size, name = 'decoder')



In [188]:
#Testing on a sample batch
dec_out, dec_hidden, attention_wt = decoder(tf.random.uniform(shape = (batch_size, 1)), enc_hidden, enc_sample_out)

In [189]:
print(f"{dec_out.shape, dec_hidden.shape, attention_wt.shape}")

(TensorShape([64, 7525]), TensorShape([64, 1024]), TensorShape([64, 14, 1]))


In [190]:
#The training step

In [191]:
#We will train this model from scratch.


In [192]:
loss_obj = tf.keras.losses.SparseCategoricalCrossentropy(from_logits = True, reduction = 'none')
optimizer = tf.keras.optimizers.RMSprop(learning_rate = 1e-3)


In [193]:
#The loss function

In [194]:
def my_loss(y_real, y_pred):
  mask = tf.math.logical_not(tf.math.equal(y_real, 0)) #Return boolean for the mask
  loss_ =loss_obj(y_real, y_pred)
  mask = tf.cast(mask, dtype = loss_.dtype)
  loss_*= mask
  return tf.reduce_mean(loss_)

In [195]:
#Create a checkpoint to save the model periodically

In [196]:
chekpoint_dir = "./training_checkpoints"
checkpoint_prefix = os.path.join(chekpoint_dir, 'chk')
checkpoint = tf.train.Checkpoint(optimizer = optimizer, encoder = encoder, decoder = decoder)

In [197]:
@tf.function
def train_step(inputs, target, enc_hidden):
  loss = 0
  with tf.GradientTape() as tape:
    enc_out, enc_hidden = encoder(inputs, enc_hidden)
    dec_hidden = enc_hidden
    dec_input = tf.expand_dims([target_lang.word_index['start']] * batch_size,1)
    #Feeding the target as an input to predict the next output (Teacher forcing!!!)
    for t in range(1, target.shape[1]):
      predictions, hidden, _ = decoder(dec_input, dec_hidden,enc_out)
      loss+=my_loss(target[:, t], predictions)
      dec_input = tf.expand_dims(target[:, t], 1)
  batch_loss = (loss/int(target.shape[1]))
  vars = encoder.trainable_variables + decoder.trainable_variables
  grads = tape.gradient(loss, vars)
  optimizer.apply_gradients(zip(grads, vars))
  return batch_loss



In [198]:
#Iterate over the epochs

In [199]:
epochs = 20
for epoch in range(epochs):
  tic = time.time()
  enc_hidden = encoder.hidden_initializer()
  total_loss = 0
  for (step,(inp, targ)) in enumerate(train_data.take(step_per_epoch)):
    batch_loss = train_step(inp,targ, enc_hidden)
    total_loss+=batch_loss
    if step % 100 == 0:
      print(f"Epoch: {epoch + 1} Batch: {step} Loss: {batch_loss:.4f}")
  if (epoch + 1) % 2 == 0:
    checkpoint.save(file_prefix = checkpoint_prefix)
  print(f"Epoch: {epoch + 1} Loss: {(total_loss/step_per_epoch):.4f}")
  toc = time.time()
  print(f"time elapse is : {time_fmt(toc - tic)}")

Epoch: 1 Batch: 0 Loss: 4.2539
Epoch: 1 Batch: 100 Loss: 2.2254
Epoch: 1 Batch: 200 Loss: 2.0658
Epoch: 1 Batch: 300 Loss: 1.8565
Epoch: 1 Batch: 400 Loss: 1.8110
Epoch: 1 Batch: 500 Loss: 1.8295
Epoch: 1 Batch: 600 Loss: 1.6503
Epoch: 1 Loss: 1.9835
time elapse is : 0: 001: 12.00
Epoch: 2 Batch: 0 Loss: 1.6369
Epoch: 2 Batch: 100 Loss: 1.4871
Epoch: 2 Batch: 200 Loss: 1.3881
Epoch: 2 Batch: 300 Loss: 1.5761
Epoch: 2 Batch: 400 Loss: 1.4179
Epoch: 2 Batch: 500 Loss: 1.2291
Epoch: 2 Batch: 600 Loss: 1.3349
Epoch: 2 Loss: 1.4379
time elapse is : 0: 000: 59.00
Epoch: 3 Batch: 0 Loss: 1.1636
Epoch: 3 Batch: 100 Loss: 1.3489
Epoch: 3 Batch: 200 Loss: 1.1220
Epoch: 3 Batch: 300 Loss: 1.1023
Epoch: 3 Batch: 400 Loss: 1.1760
Epoch: 3 Batch: 500 Loss: 1.0659
Epoch: 3 Batch: 600 Loss: 1.1727
Epoch: 3 Loss: 1.1495
time elapse is : 0: 000: 59.00
Epoch: 4 Batch: 0 Loss: 0.9119
Epoch: 4 Batch: 100 Loss: 1.1335
Epoch: 4 Batch: 200 Loss: 0.9967
Epoch: 4 Batch: 300 Loss: 0.8118
Epoch: 4 Batch: 400 Loss