<a href="https://colab.research.google.com/github/martinpius/MachineTranslation/blob/main/Recarp_to_Language_Models_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
from google.colab import drive
drive.mount("/content/drive", force_remount = True)
try:
  COLAB = True
  import tensorflow as tf
  print(f"You are on Google Colab with tensoflow version: {tf.__version__}")
except Exception as e:
  COLAB = False
  print(f"{type(e)}: {e}\n...Please Load Your Drive...")
def time_fmt(t):
  h = int(t / (60 * 60))
  m = int(t % (60 * 60) / 60)
  s = int(t % 60)
  return f"{h}: {m:>03}: {s:>05.2f}"
print(f"time elapse is: {time_fmt(123.46987)}")

Mounted at /content/drive
You are on Google Colab with tensoflow version: 2.4.0
time elapse is: 0: 002: 03.00


In [2]:
import re, unicodedata, time,io,os
import numpy as np
import tensorflow as tf
import matplotlib as mpl
import matplotlib.ticker as ticker
import matplotlib.pyplot as plt




In [3]:
#Import and preprocess the data from the url using tensorflow utils

In [4]:
folder_path = tf.keras.utils.get_file(fname = 'spa-eng.zip', origin = "http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip", extract = True)

Downloading data from http://storage.googleapis.com/download.tensorflow.org/data/spa-eng.zip


In [5]:
file_path = os.path.dirname(folder_path)+ '/spa-eng/spa.txt'

In [6]:
def unicode_ascii(text):
  return "".join(k for k in unicodedata.normalize("NFD", text) if unicodedata.category(k) != 'Mn')

In [7]:
def text_preprocess(text):
  text = unicode_ascii(text.lower().strip())
  text = re.sub(r"([!,.¿?])", r" \1 ", text)
  text = re.sub(r'[" "]+', r" ", text)
  text = re.sub("[^a-zA-Z.,¿?!]+", r" ", text)
  text = text.strip()
  text = "<start> " + text + " <end>"
  return text



In [8]:
#Testing the processing function
en_verse = u"May I borrow this @ book?"
sp_verse = u"¿Puedo tomar prestado este libro?"

In [9]:
print(f"english sentence: {text_preprocess(en_verse)}\nspanish sentence: {text_preprocess(sp_verse)}")

english sentence: <start> may i borrow this book ? <end>
spanish sentence: <start> ¿ puedo tomar prestado este libro ? <end>


In [10]:
#Return a pair of sentences matching each language:
def matched_texts(path, sample_size):
  S = io.open(path,encoding = 'UTF-8').read().strip().split('\n')
  paired_text = [[text_preprocess(k) for k in l.split("\t")] for l in S[:sample_size]]
  return zip(*paired_text)

In [11]:
#Testing the function:
english, spanish = matched_texts(file_path, None)

In [12]:
print(f"last english verse: {english[-1]}\nlast spanish verse: {spanish[-1]}")

last english verse: <start> if you want to sound like a native speaker , you must be willing to practice saying the same sentence over and over in the same way that banjo players practice the same phrase over and over until they can play it correctly and at the desired tempo . <end>
last spanish verse: <start> si quieres sonar como un hablante nativo , debes estar dispuesto a practicar diciendo la misma frase una y otra vez de la misma manera en que un musico de banjo practica el mismo fraseo una y otra vez hasta que lo puedan tocar correctamente y en el tiempo esperado . <end>


In [13]:
#Tokenization of the text

In [14]:
def tokenization(text):
  tokenizer = tf.keras.preprocessing.text.Tokenizer(filters = "")
  tokenizer.fit_on_texts(text)
  tensor = tokenizer.texts_to_sequences(text)
  tensor = tf.keras.preprocessing.sequence.pad_sequences(tensor, padding = 'post')
  return tensor, tokenizer

In [15]:
#Loading the clean data of size 100000 which is ready for training:

In [16]:
output_lang, input_lang = matched_texts(file_path, sample_size = 100000)


In [17]:
input_tensor, input_tokenizer = tokenization(input_lang)
output_tensor, output_tokenizer = tokenization(output_lang)

In [18]:
print(f"input_tensor_shape: {input_tensor.shape}\noutput_tensor_shape: {output_tensor.shape}")

input_tensor_shape: (100000, 20)
output_tensor_shape: (100000, 17)


In [19]:
#Split the data into train and test using cv
from sklearn.model_selection import train_test_split
x_train, x_test, y_train,y_test = train_test_split(input_tensor, output_tensor, test_size = 0.1)
print(f"train_shape: {x_train.shape}, {y_train.shape}\ntest_shape: {x_test.shape}, {y_test.shape}")

train_shape: (90000, 20), (90000, 17)
test_shape: (10000, 20), (10000, 17)


In [20]:
#Create index-word matching for the languages:

In [21]:
def create_index(lang, tensor):
  for t in tensor:
    if t != 0:
      print("%d---->%s" %(t, lang.index_word[t]))

In [22]:
print("Input language, from word to index mapping:")
print(create_index(input_tokenizer, x_train[10]))

Input language, from word to index mapping:
1----><start>
13---->es
21---->una
197---->buena
467---->camara
3---->.
2----><end>
None


In [23]:
print("Output language, from word to index mapping:")
print(create_index(output_tokenizer, y_train[10]))

Output language, from word to index mapping:
1----><start>
14---->it
15---->s
10---->a
76---->good
496---->camera
3---->.
2----><end>
None


In [24]:
#Convert to tensorflow data type and splits into batches
batch_size = 64
embedding_dim = 512
units = 1024
input_voc_size = len(input_tokenizer.word_index)+1
output_voc_size = len(output_tokenizer.word_index)+ 1
steps_per_epoch_train = len(x_train)//batch_size
steps_per_epoch_val = len(x_test)//batch_size
BUFFER = len(x_train)


In [25]:
train_data = tf.data.Dataset.from_tensor_slices((x_train, y_train))
train_data = train_data.shuffle(BUFFER).batch(batch_size, drop_remainder = True)
validation_data = tf.data.Dataset.from_tensor_slices((x_test, y_test))
validation_data = validation_data.shuffle(BUFFER).batch(batch_size, drop_remainder = True)

In [26]:
#Get the sample batch for testing the classes later on during model building

In [27]:
train_sample_x, train_sample_y = next(iter(train_data))
print(f"train_sample_x_shape: {train_sample_x.shape}\ntrain_sample_y_shape: {train_sample_y.shape}")

train_sample_x_shape: (64, 20)
train_sample_y_shape: (64, 17)


In [28]:
#Model Building:

In [29]:
class Encoder(tf.keras.Model):
  def __init__(self, enc_units, embedding_dim, voc_size, batch_size,name = 'encoder', **kwargs):
    super(Encoder, self).__init__(self, name = name, **kwargs)
    self.enc_units = enc_units
    self.batch_size = batch_size
    self.embedding_layer = tf.keras.layers.Embedding(input_dim = voc_size, output_dim = embedding_dim, name = 'dec_embedding')
    self.gru_layer = tf.keras.layers.GRU(units = self.enc_units, kernel_initializer = 'glorot_uniform',
                                         return_state = True, return_sequences = True,
                                         recurrent_dropout = 0.5, dropout = 0.5,
                                         name = 'decoder_gru')
  
  def call(self, inputs, enc_hidden):
    inputs = self.embedding_layer(inputs)
    enc_out, enc_hidden = self.gru_layer(inputs, initial_state = enc_hidden)
    return enc_out, enc_hidden
  
  def hidden_initializer(self):
    return tf.zeros(shape = (self.batch_size, self.enc_units))

In [30]:
#Instantiating and testing the encoder using the sample batch:
encoder = Encoder(units, embedding_dim,input_voc_size,batch_size,name = 'encoder')



In [31]:
enc_hidden = encoder.hidden_initializer()
sample_enc_out, sample_enc_hidden = encoder(train_sample_x, enc_hidden)
print(f"sample_enc_out_shape: {sample_enc_out.shape}\nsample_enc_hidden_shape: {sample_enc_hidden.shape}")

sample_enc_out_shape: (64, 20, 1024)
sample_enc_hidden_shape: (64, 1024)


In [32]:
#Decoder without an attention:
class Decoder(tf.keras.Model):
  def __init__(self, dec_units, embedding_dim, voc_size, batch_size, name = 'decoder',**kwargs):
    super(Decoder, self).__init__(name = name, **kwargs)
    self.dec_units = dec_units
    self.batch_size = batch_size
    self.embedding_layer = tf.keras.layers.Embedding(input_dim = voc_size, output_dim = embedding_dim, name = 'decoder_embedding')
    self.gru_layer = tf.keras.layers.GRU(units = self.dec_units, kernel_initializer = 'glorot_uniform',
                                         return_state = True, return_sequences = True,
                                         recurrent_dropout = 0.5, dropout = 0.5,
                                         name = 'decoder_gru')
    self.fc = tf.keras.layers.Dense(units = voc_size, kernel_initializer = 'random_normal', activation = 'softmax',name = 'decoder_output')

  def call(self, inputs, enc_hidden, enc_out):
    inputs = self.embedding_layer(inputs)
    dec_out, dec_hidden = self.gru_layer(inputs, initial_state = enc_hidden)
    dec_out = tf.reshape(dec_out, shape = (-1, dec_out.shape[2]))
    inputs = self.fc(dec_out)
    return inputs, dec_hidden

In [33]:
#Instantiating and testing the decoder class using the sample batch:
decoder = Decoder(units, embedding_dim, output_voc_size,batch_size,name = 'decoder')



In [34]:
sample_dec_out, sample_dec_hidden = decoder(tf.random.uniform(shape = (batch_size, 1)), sample_enc_hidden, sample_enc_out)

In [35]:
print(f"sample_dec_out_shape: {sample_dec_out.shape}\nsample_dec_hidden_shape: {sample_dec_hidden.shape}")

sample_dec_out_shape: (64, 10785)
sample_dec_hidden_shape: (64, 1024)


In [36]:
#The dot-product attention:
class DotprtAttention(tf.keras.layers.Layer):
  def call(self, query, values):
    query_expanded = tf.expand_dims(query, 1) #Adding the time dimension to the encoder's hidden
    score = query_expanded * values
    score = tf.reduce_sum(score,axis = 2)
    score = tf.expand_dims(score, axis = 2)
    attention_wts = tf.nn.softmax(score, axis = 1)
    context = attention_wts * values
    context_vector = tf.reduce_sum(context, axis = 1)
    return context_vector, attention_wts

In [37]:
#Instantiating and testing the attention class using the sample batch:
dotproduct = DotprtAttention()
sample_context_vector, sample_attention_wts = dotproduct(sample_enc_hidden, sample_enc_out)
print(f"sample_context_vector_shape: {sample_context_vector.shape}\nsample_attention_wt_shape: {sample_attention_wts.shape}")

sample_context_vector_shape: (64, 1024)
sample_attention_wt_shape: (64, 20, 1)


In [38]:
#Bhanadau's-Additive attention: 
class Additive_Attention(tf.keras.layers.Layer):
  def __init__(self, units, name = 'additive_attention', **kwargs):
    super(Additive_Attention, self).__init__(name = name, **kwargs)
    self.W1 = tf.keras.layers.Dense(units = units, kernel_initializer = 'random_normal')
    self.W2 = tf.keras.layers.Dense(units = units, kernel_initializer = 'random_normal')
    self.V = tf.keras.layers.Dense(units = 1, kernel_initializer = 'random_normal')
  
  def call(self, query, values):
    expanded_query = tf.expand_dims(query, axis = 1)
    score = self.V(tf.nn.tanh(self.W1(values) + self.W2(expanded_query)))
    attention_wts = tf.nn.softmax(score, axis = 1)
    context = attention_wts * values
    contenxt_vector = tf.reduce_sum(context, axis = 1)
    return contenxt_vector, attention_wts

In [39]:
#Instantiate and testing the Bhanadau's attention using the sample batch:
additive = Additive_Attention(units = 10)
sample_context_vector_bn, sample_attention_wts_bn = additive(sample_enc_hidden, sample_enc_out)
print(f"sample_context_vector_bn_shape: {sample_context_vector_bn.shape}\nsample_attention_wts_bn: {sample_attention_wts_bn.shape}")

sample_context_vector_bn_shape: (64, 1024)
sample_attention_wts_bn: (64, 20, 1)


In [40]:
#Decoder_with Attention:
class DecoderWithAttention(tf.keras.Model):
  def __init__(self, dec_units, voc_size, batch_size,embedding_dim,attention_layer = None, name = 'decoder_with_attention', **kwargs):
    super(DecoderWithAttention, self). __init__(self, name = name, **kwargs)
    self.dec_units = dec_units
    self.batch_size = batch_size
    self.embedding_layer = tf.keras.layers.Embedding(input_dim = voc_size, output_dim = embedding_dim, name = 'embedding_dec')
    self.gru_layer = tf.keras.layers.GRU(units = self.dec_units, kernel_initializer = 'glorot_uniform',
                                         return_state = True, return_sequences = True, 
                                         recurrent_dropout = 0.5, dropout = 0.5,
                                         name = 'gru_dec')
    self.fc = tf.keras.layers.Dense(units = voc_size, activation = 'softmax', name = 'dec_outputs')
    self.attention = attention_layer
  
  def call(self, inputs, enc_hidden, enc_out):
    inputs = self.embedding_layer(inputs)
    attention_wts = None
    if self.attention:
      context_vector, attention_wts = self.attention(enc_hidden, enc_out)
      inputs = tf.concat([tf.expand_dims(context_vector, axis = 1), inputs], axis = -1)
    dec_out, dec_hidden = self.gru_layer(inputs, initial_state = enc_hidden)
    dec_out = tf.reshape(dec_out, shape = (-1, dec_out.shape[2]))
    inputs = self.fc(dec_out)
    return inputs, dec_hidden, attention_wts


In [41]:
#Instantiating and testing the decoder with attention class using the sample batch

In [42]:
decoder_with_attention = DecoderWithAttention(units, output_voc_size,batch_size, embedding_dim,attention_layer= additive)



In [43]:
sample_dec_out_with_attn, sample_dec_hidden_with_attn, attention_wts = decoder_with_attention(tf.random.uniform(shape = (batch_size, 1)), sample_enc_hidden, sample_enc_out)

In [44]:
print(f"sample_dec_out_shape: {sample_dec_out_with_attn.shape}\nsample_dec_hidden_shape: {sample_dec_hidden_with_attn.shape}\nsample_attn_wt_shape: {sample_attention_wts.shape}")

sample_dec_out_shape: (64, 10785)
sample_dec_hidden_shape: (64, 1024)
sample_attn_wt_shape: (64, 20, 1)


In [45]:
#The training Loop from the scratch

In [46]:
#Get the loss object, customize the loss function and prepapare the optimizer:
optimizer = tf.keras.optimizers.Adam(learning_rate = 1e-3)
loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits = True, reduction = 'none')

def custom_loss(y_real, y_pred):
  mask = tf.math.logical_not(tf.math.equal(y_real,0))
  loss_ = loss_object(y_real, y_pred)
  mask = tf.cast(mask, dtype = loss_.dtype)
  loss_*=mask
  return tf.reduce_mean(loss_)

print(f"Uncostomized loss: {loss_object([1.2,0.9],[[0.3,0.7,0.9,1.0],[0.5,0.8,1.0,0.3]])}")
print(f"Customized_loss: {custom_loss([1.2,0.9],[[0.3,0.7,0.9,1.0],[0.5,0.8,1.0,0.3]])}")

Uncostomized loss: [1.4449363 1.5722159]
Customized_loss: 1.5085761547088623


In [47]:
def get_training_step():
  @tf.function
  def train_step(inputs, outputs, enc_hidden,encoder, decoder):
    loss = 0
    with tf.GradientTape() as tape:
      enc_out, enc_hidden = encoder(inputs, enc_hidden)
      dec_hidden = enc_hidden
      dec_inputs = tf.expand_dims([output_tokenizer.word_index['<start>']] * batch_size, axis = 1)
      for t in range(1, outputs.shape[1]):
        preds, dec_hidden, attention_wts = decoder(dec_inputs, dec_hidden, enc_out)
        loss+=custom_loss(outputs[:,t], preds)
        dec_inputs = tf.expand_dims(outputs[:, t], 1)
    batch_loss = (loss / int(outputs.shape[1]))
    trainable_vars = encoder.trainable_variables + decoder.trainable_variables
    grads = tape.gradient(loss, trainable_vars)
    optimizer.apply_gradients(zip(grads, trainable_vars))
    return batch_loss
  return train_step 


In [48]:
#Define the function to compute validation loss:

In [49]:
def get_validation_loss(inputs, outputs,enc_hidden,encoder,decoder):
  val_loss = 0
  enc_out, enc_hidden = encoder(inputs,enc_hidden)
  dec_hidden = enc_hidden
  dec_inputs = tf.expand_dims([output_tokenizer.word_index['<start>']] * batch_size, 1)
  for t in range(1, outputs.shape[1]):
    preds, dec_hidden, attention_wts = decoder(dec_inputs, dec_hidden, enc_out)
    val_loss+=custom_loss(outputs[:,t], preds)
    dec_inputs = tf.expand_dims(outputs[:,t],1)
  val_loss = (val_loss / int(outputs.shape[1]))
  return val_loss 

In [53]:
#Training the model:
def train_mtn_model(epochs, attention):
  encoder = Encoder(units, embedding_dim,input_voc_size,batch_size, name = 'encoder')
  decoder = DecoderWithAttention(units, output_voc_size, batch_size,embedding_dim, attention_layer = attention)
  train_fn = get_training_step()
  train_loss = []
  validation_loss = []
  for epoch in range(epochs):
    tic = time.time()
    enc_hidden = encoder.hidden_initializer()
    total_loss = 0
    for (steps, (input, output)) in enumerate(train_data.take(steps_per_epoch_train)):
      batch_loss = train_fn(input, output,enc_hidden,encoder, decoder)
      total_loss+=batch_loss
      if steps % 100 == 0:
        print(f"Epoch: {epoch + 1}: Batch: {steps}: Loss: {batch_loss:.4f}")
    enc_hidden = encoder.hidden_initializer()
    total_val_loss = 0
    for (steps, (input, output)) in enumerate(validation_data.take(steps_per_epoch_val)):
      val_loss = get_validation_loss(input, output,enc_hidden,encoder, decoder)
      total_val_loss+=val_loss
    train_loss.append(total_loss/steps_per_epoch_train)
    validation_loss.append(total_val_loss/steps_per_epoch_val)
    print(f"At epoch: {epoch+ 1}: training loss is: {train_loss[-1]:.4f}: validation loss is: {validation_loss[-1]:.4f}")
    print(time_fmt(time.time() - tic))
  return encoder, decoder, train_loss, validation_loss


In [54]:
#Train without attention:

In [55]:
epochs = 15
attention = None
train_mtn_model(epochs, attention)

Epoch: 1: Batch: 0: Loss: 4.4037
Epoch: 1: Batch: 100: Loss: 2.3943
Epoch: 1: Batch: 200: Loss: 1.8535
Epoch: 1: Batch: 300: Loss: 1.6135
Epoch: 1: Batch: 400: Loss: 1.6292
Epoch: 1: Batch: 500: Loss: 1.3531
Epoch: 1: Batch: 600: Loss: 1.3853
Epoch: 1: Batch: 700: Loss: 1.2677
Epoch: 1: Batch: 800: Loss: 1.0937
Epoch: 1: Batch: 900: Loss: 0.9646
Epoch: 1: Batch: 1000: Loss: 1.0725
Epoch: 1: Batch: 1100: Loss: 1.1077
Epoch: 1: Batch: 1200: Loss: 1.0820
Epoch: 1: Batch: 1300: Loss: 0.9843
Epoch: 1: Batch: 1400: Loss: 0.9715
At epoch: 1: training loss is: 1.4310: validation loss is: 0.9277
0: 005: 29.00
Epoch: 2: Batch: 0: Loss: 0.7463
Epoch: 2: Batch: 100: Loss: 0.8139
Epoch: 2: Batch: 200: Loss: 0.6730
Epoch: 2: Batch: 300: Loss: 0.8752
Epoch: 2: Batch: 400: Loss: 0.7149
Epoch: 2: Batch: 500: Loss: 0.7079
Epoch: 2: Batch: 600: Loss: 0.7404
Epoch: 2: Batch: 700: Loss: 0.6102
Epoch: 2: Batch: 800: Loss: 0.6862
Epoch: 2: Batch: 900: Loss: 0.5521
Epoch: 2: Batch: 1000: Loss: 0.6308
Epoch: 2

(<__main__.Encoder at 0x7f532e348f28>,
 <__main__.DecoderWithAttention at 0x7f532e2b9cc0>,
 [<tf.Tensor: shape=(), dtype=float32, numpy=1.431033>,
  <tf.Tensor: shape=(), dtype=float32, numpy=0.6767448>,
  <tf.Tensor: shape=(), dtype=float32, numpy=0.3821498>,
  <tf.Tensor: shape=(), dtype=float32, numpy=0.23012912>,
  <tf.Tensor: shape=(), dtype=float32, numpy=0.15318148>,
  <tf.Tensor: shape=(), dtype=float32, numpy=0.1124177>,
  <tf.Tensor: shape=(), dtype=float32, numpy=0.09006567>,
  <tf.Tensor: shape=(), dtype=float32, numpy=0.07641119>,
  <tf.Tensor: shape=(), dtype=float32, numpy=0.06780918>,
  <tf.Tensor: shape=(), dtype=float32, numpy=0.06294307>,
  <tf.Tensor: shape=(), dtype=float32, numpy=0.059121843>,
  <tf.Tensor: shape=(), dtype=float32, numpy=0.055616636>,
  <tf.Tensor: shape=(), dtype=float32, numpy=0.053861495>,
  <tf.Tensor: shape=(), dtype=float32, numpy=0.05259737>,
  <tf.Tensor: shape=(), dtype=float32, numpy=0.051350623>],
 [<tf.Tensor: shape=(), dtype=float32, 

In [56]:
epochs = 15
attention = dotproduct
train_mtn_model(epochs, attention)

Epoch: 1: Batch: 0: Loss: 4.2670
Epoch: 1: Batch: 100: Loss: 2.2518
Epoch: 1: Batch: 200: Loss: 1.7869
Epoch: 1: Batch: 300: Loss: 1.4391
Epoch: 1: Batch: 400: Loss: 1.3762
Epoch: 1: Batch: 500: Loss: 1.3870
Epoch: 1: Batch: 600: Loss: 1.3819
Epoch: 1: Batch: 700: Loss: 1.3010
Epoch: 1: Batch: 800: Loss: 1.3333
Epoch: 1: Batch: 900: Loss: 1.0212
Epoch: 1: Batch: 1000: Loss: 1.1368
Epoch: 1: Batch: 1100: Loss: 0.9467
Epoch: 1: Batch: 1200: Loss: 0.9509
Epoch: 1: Batch: 1300: Loss: 0.8669
Epoch: 1: Batch: 1400: Loss: 0.9145
At epoch: 1: training loss is: 1.3911: validation loss is: 0.9129
0: 006: 12.00
Epoch: 2: Batch: 0: Loss: 0.7150
Epoch: 2: Batch: 100: Loss: 0.6572
Epoch: 2: Batch: 200: Loss: 0.7748
Epoch: 2: Batch: 300: Loss: 0.7619
Epoch: 2: Batch: 400: Loss: 0.7566
Epoch: 2: Batch: 500: Loss: 0.6824
Epoch: 2: Batch: 600: Loss: 0.6610
Epoch: 2: Batch: 700: Loss: 0.6902
Epoch: 2: Batch: 800: Loss: 0.6286
Epoch: 2: Batch: 900: Loss: 0.7525
Epoch: 2: Batch: 1000: Loss: 0.5774
Epoch: 2

(<__main__.Encoder at 0x7f532d37aeb8>,
 <__main__.DecoderWithAttention at 0x7f532e1f2940>,
 [<tf.Tensor: shape=(), dtype=float32, numpy=1.3910816>,
  <tf.Tensor: shape=(), dtype=float32, numpy=0.68004495>,
  <tf.Tensor: shape=(), dtype=float32, numpy=0.413652>,
  <tf.Tensor: shape=(), dtype=float32, numpy=0.27207646>,
  <tf.Tensor: shape=(), dtype=float32, numpy=0.19327904>,
  <tf.Tensor: shape=(), dtype=float32, numpy=0.14523251>,
  <tf.Tensor: shape=(), dtype=float32, numpy=0.113918364>,
  <tf.Tensor: shape=(), dtype=float32, numpy=0.09256906>,
  <tf.Tensor: shape=(), dtype=float32, numpy=0.07807478>,
  <tf.Tensor: shape=(), dtype=float32, numpy=0.069708124>,
  <tf.Tensor: shape=(), dtype=float32, numpy=0.06319567>,
  <tf.Tensor: shape=(), dtype=float32, numpy=0.058385316>,
  <tf.Tensor: shape=(), dtype=float32, numpy=0.05483585>,
  <tf.Tensor: shape=(), dtype=float32, numpy=0.05231696>,
  <tf.Tensor: shape=(), dtype=float32, numpy=0.050822828>],
 [<tf.Tensor: shape=(), dtype=float32

In [57]:
epochs = 15
attention = additive
train_mtn_model(epochs, attention)

Epoch: 1: Batch: 0: Loss: 4.3783
Epoch: 1: Batch: 100: Loss: 2.3845
Epoch: 1: Batch: 200: Loss: 1.8689
Epoch: 1: Batch: 300: Loss: 1.7603
Epoch: 1: Batch: 400: Loss: 1.5609
Epoch: 1: Batch: 500: Loss: 1.6098
Epoch: 1: Batch: 600: Loss: 1.5337
Epoch: 1: Batch: 700: Loss: 1.4159
Epoch: 1: Batch: 800: Loss: 1.5343
Epoch: 1: Batch: 900: Loss: 1.2867
Epoch: 1: Batch: 1000: Loss: 1.4148
Epoch: 1: Batch: 1100: Loss: 1.3500
Epoch: 1: Batch: 1200: Loss: 1.3086
Epoch: 1: Batch: 1300: Loss: 1.3006
Epoch: 1: Batch: 1400: Loss: 1.2071
At epoch: 1: training loss is: 1.5824: validation loss is: 1.2398
0: 006: 03.00
Epoch: 2: Batch: 0: Loss: 1.1595
Epoch: 2: Batch: 100: Loss: 1.1248
Epoch: 2: Batch: 200: Loss: 0.9861
Epoch: 2: Batch: 300: Loss: 1.0528
Epoch: 2: Batch: 400: Loss: 0.9710
Epoch: 2: Batch: 500: Loss: 1.0445
Epoch: 2: Batch: 600: Loss: 1.0102
Epoch: 2: Batch: 700: Loss: 0.9774
Epoch: 2: Batch: 800: Loss: 0.9488
Epoch: 2: Batch: 900: Loss: 0.8640
Epoch: 2: Batch: 1000: Loss: 0.9851
Epoch: 2

(<__main__.Encoder at 0x7f532caba400>,
 <__main__.DecoderWithAttention at 0x7f532c7b96a0>,
 [<tf.Tensor: shape=(), dtype=float32, numpy=1.5823554>,
  <tf.Tensor: shape=(), dtype=float32, numpy=1.0183258>,
  <tf.Tensor: shape=(), dtype=float32, numpy=0.70003134>,
  <tf.Tensor: shape=(), dtype=float32, numpy=0.48164475>,
  <tf.Tensor: shape=(), dtype=float32, numpy=0.33727655>,
  <tf.Tensor: shape=(), dtype=float32, numpy=0.24375208>,
  <tf.Tensor: shape=(), dtype=float32, numpy=0.18258658>,
  <tf.Tensor: shape=(), dtype=float32, numpy=0.14117578>,
  <tf.Tensor: shape=(), dtype=float32, numpy=0.113993354>,
  <tf.Tensor: shape=(), dtype=float32, numpy=0.09677085>,
  <tf.Tensor: shape=(), dtype=float32, numpy=0.08355326>,
  <tf.Tensor: shape=(), dtype=float32, numpy=0.07577282>,
  <tf.Tensor: shape=(), dtype=float32, numpy=0.06923313>,
  <tf.Tensor: shape=(), dtype=float32, numpy=0.065496504>,
  <tf.Tensor: shape=(), dtype=float32, numpy=0.06303487>],
 [<tf.Tensor: shape=(), dtype=float32,