In [None]:
%pip install -q tensorflow_datasets
% pip install -q tensorflow_text

[K     |████████████████████████████████| 3.4MB 5.9MB/s 
[?25h

In [None]:
import collections
import logging
import os
import pathlib
import re
import string
import sys
import time

import numpy as np
import matplotlib.pyplot as plt

import tensorflow_datasets as tfds
import tensorflow_text as text
import tensorflow as tf

In [None]:
# Raise the 'tensorflow' logger threshold so only ERROR (and above) records are emitted.
logging.getLogger('tensorflow').setLevel(logging.ERROR) # suppress warnings

In [None]:
# Download the Portuguese -> English TED talks dataset as supervised pairs.
examples, metadata = tfds.load(
    'ted_hrlr_translate/pt_to_en',
    with_info=True,
    as_supervised=True,
)
train_examples = examples['train']
val_examples = examples['validation']

# Each split is a tf.data.Dataset object that yields pairs of text examples.

[1mDownloading and preparing dataset ted_hrlr_translate/pt_to_en/1.0.0 (download: 124.94 MiB, generated: Unknown size, total: 124.94 MiB) to /root/tensorflow_datasets/ted_hrlr_translate/pt_to_en/1.0.0...[0m


HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Dl Completed...', max=1.0, style=Progre…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Dl Size...', max=1.0, style=ProgressSty…

HBox(children=(FloatProgress(value=1.0, bar_style='info', description='Extraction completed...', max=1.0, styl…









HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Shuffling and writing examples to /root/tensorflow_datasets/ted_hrlr_translate/pt_to_en/1.0.0.incompletePPVAEA/ted_hrlr_translate-train.tfrecord


HBox(children=(FloatProgress(value=0.0, max=51785.0), HTML(value='')))



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Shuffling and writing examples to /root/tensorflow_datasets/ted_hrlr_translate/pt_to_en/1.0.0.incompletePPVAEA/ted_hrlr_translate-validation.tfrecord


HBox(children=(FloatProgress(value=0.0, max=1193.0), HTML(value='')))



HBox(children=(FloatProgress(value=1.0, bar_style='info', max=1.0), HTML(value='')))

Shuffling and writing examples to /root/tensorflow_datasets/ted_hrlr_translate/pt_to_en/1.0.0.incompletePPVAEA/ted_hrlr_translate-test.tfrecord


HBox(children=(FloatProgress(value=0.0, max=1803.0), HTML(value='')))

[1mDataset ted_hrlr_translate downloaded and prepared to /root/tensorflow_datasets/ted_hrlr_translate/pt_to_en/1.0.0. Subsequent calls will reuse this data.[0m


In [None]:
# Download and extract the subword tokenizer archive optimized for this dataset.
# The archive is cached in the current directory (cache_dir='.', no sub-directory).
model_name = "ted_hrlr_translate_pt_en_converter"
tf.keras.utils.get_file(
    f"{model_name}.zip",
    f"https://storage.googleapis.com/download.tensorflow.org/models/{model_name}.zip",
    cache_dir='.', cache_subdir='', extract=True
)

Downloading data from https://storage.googleapis.com/download.tensorflow.org/models/ted_hrlr_translate_pt_en_converter.zip


'./ted_hrlr_translate_pt_en_converter.zip'

In [None]:
tokenizers = tf.saved_model.load(model_name) # Loads the tokenizer that was specifically optimized for this Portuguese-English dataset.

In [None]:
# List the public (non-underscore) attributes exposed by the English tokenizer.
[item for item in dir(tokenizers.en) if not item.startswith('_')]

['detokenize',
 'get_reserved_tokens',
 'get_vocab_path',
 'get_vocab_size',
 'lookup',
 'tokenize',
 'tokenizer',
 'vocab']

In [None]:
# Encode batches of raw text into dense token-id tensors.

def tokenize_pairs(pt, en):
  """Tokenize a batch of Portuguese / English sentence pairs.

  Each side goes through its own language tokenizer; the ragged result is then
  converted to a dense tensor, padding with zeros.

  Args:
    pt: batch of Portuguese text.
    en: batch of English text.

  Returns:
    A `(pt_tokens, en_tokens)` tuple of dense token tensors.
  """
  pt_tokens = tokenizers.pt.tokenize(pt).to_tensor()
  en_tokens = tokenizers.en.tokenize(en).to_tensor()
  return pt_tokens, en_tokens


# Input pipeline: cache, shuffle, batch, tokenize and prefetch the data.
BUFFER_SIZE = 20000
BATCH_SIZE = 64

def make_batches(ds):
  """Turn a dataset of raw text pairs into prefetched batches of token tensors.

  The dataset is cached, shuffled with a buffer of BUFFER_SIZE, grouped into
  batches of BATCH_SIZE, and each batch is tokenized with `tokenize_pairs`.
  """
  batched = ds.cache().shuffle(BUFFER_SIZE).batch(BATCH_SIZE)
  tokenized = batched.map(tokenize_pairs, num_parallel_calls=tf.data.AUTOTUNE)
  return tokenized.prefetch(tf.data.AUTOTUNE)

train_batches = make_batches(train_examples)
val_batches = make_batches(val_examples)

In [None]:
def get_angles(pos, i, d_model):
  """Return the positional-encoding angles pos / 10000^(2*(i//2)/d_model).

  Args:
    pos: position index (or array of indices).
    i: embedding-dimension index (or array of indices); indices 2k and 2k+1
      share the same rate because of the integer division by 2.
    d_model: embedding width used to normalise the exponent.

  Returns:
    `pos` multiplied by the per-dimension angle rate (broadcast by NumPy).
  """
  exponent = (2 * (i // 2)) / np.float32(d_model)
  wavelength = np.power(10000, exponent)
  return pos * (1 / wavelength)


def positional_encoding(position, d_model):
  """Build the sinusoidal positional-encoding table.

  Args:
    position: number of positions (rows) to generate.
    d_model: embedding width (columns).

  Returns:
    A float32 tensor of shape (1, position, d_model) with sine applied to the
    even columns (2i) and cosine applied to the odd columns (2i + 1).
  """
  positions = np.arange(position)[:, np.newaxis]
  dims = np.arange(d_model)[np.newaxis, :]
  angles = get_angles(positions, dims, d_model)

  # Even column index -> sin, odd column index -> cos.
  table = np.where(dims % 2 == 0, np.sin(angles), np.cos(angles))

  # Add a leading batch axis so the table broadcasts over a batch of embeddings.
  return tf.cast(table[np.newaxis, ...], dtype=tf.float32)

# Sanity check: build the encoding for 5 positions and a 3-wide model, then inspect it.
#get_angles(np.arange(5)[:, np.newaxis], np.arange(3)[np.newaxis,:],3)
pos = positional_encoding(5,3)
print(pos.shape)
print(pos)

(1, 5, 3)
tf.Tensor(
[[[ 0.          1.          0.        ]
  [ 0.84147096  0.5403023   0.00215443]
  [ 0.9092974  -0.41614684  0.00430886]
  [ 0.14112    -0.9899925   0.00646326]
  [-0.7568025  -0.6536436   0.00861763]]], shape=(1, 5, 3), dtype=float32)


In [None]:
# Masking: outputs a 1 at padding locations and a 0 everywhere else.


def create_padding_mask(seq):
  """Mark the positions of `seq` that equal 0 (the padding value).

  Returns:
    A float32 tensor of shape (batch_size, 1, 1, seq_len) holding 1.0 where
    `seq` is 0 and 0.0 elsewhere.
  """
  is_padding = tf.math.equal(seq, 0)
  mask = tf.cast(is_padding, tf.float32)

  # Two singleton axes are inserted so the mask can be added to the attention logits.
  return mask[:, tf.newaxis, tf.newaxis, :]  # (batch_size, 1, 1, seq_len)


def create_look_ahead_mask(size):
  """Return a (size, size) mask with 1 strictly above the diagonal and 0 elsewhere.

  Row r therefore marks every column greater than r, i.e. the later positions.
  """
  # band_part(..., -1, 0) keeps the lower triangle including the diagonal.
  lower_triangle = tf.linalg.band_part(tf.ones((size, size)), -1, 0)
  return 1 - lower_triangle  # (seq_len, seq_len)

# Display the look-ahead mask for a sequence of length 8.
create_look_ahead_mask(8)

<tf.Tensor: shape=(8, 8), dtype=float32, numpy=
array([[0., 1., 1., 1., 1., 1., 1., 1.],
       [0., 0., 1., 1., 1., 1., 1., 1.],
       [0., 0., 0., 1., 1., 1., 1., 1.],
       [0., 0., 0., 0., 1., 1., 1., 1.],
       [0., 0., 0., 0., 0., 1., 1., 1.],
       [0., 0., 0., 0., 0., 0., 1., 1.],
       [0., 0., 0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 0., 0., 0., 0.]], dtype=float32)>

In [None]:
# Broadcasting check: adding a (3, 4, 4) tensor to a (3, 1, 1, 4) tensor yields a rank-4 result.
x = tf.random.uniform((3,4,4), minval=1 , maxval =3, dtype = tf.float32)
z = tf.random.uniform((3,1,1,4), minval=0, maxval=3, dtype=tf.float32)

(x+ z).shape

TensorShape([3, 3, 4, 4])

In [None]:
# Scaled dot product attention

def scaled_dot_product_attention(q, k, v, mask=None):
  """Calculate the attention output and the attention weights.

  q, k, v must have matching leading dimensions.
  k, v must have matching penultimate dimension, i.e.: seq_len_k = seq_len_v.
  The mask has different shapes depending on its type (padding or look ahead)
  but it must be broadcastable for addition.

  Args:
    q: query shape == (..., seq_len_q, depth)
    k: key shape == (..., seq_len_k, depth)
    v: value shape == (..., seq_len_v, depth_v)
    mask: Float tensor with shape broadcastable
          to (..., seq_len_q, seq_len_k). Defaults to None, in which case no
          masking is applied.

  Returns:
    output, attention_weights
  """
  matmul_qk = tf.matmul(q, k, transpose_b=True)  # (..., seq_len_q, seq_len_k)

  # Scale the logits by sqrt(depth of the keys).
  dk = tf.cast(tf.shape(k)[-1], tf.float32)
  scaled_attention_logits = matmul_qk / tf.math.sqrt(dk)

  # Masked positions (mask == 1) receive a large negative logit, so softmax
  # assigns them a weight of (effectively) zero.
  if mask is not None:
    scaled_attention_logits += (mask * -1e9)

  # softmax is normalized on the last axis (seq_len_k) so that the scores add up to 1.
  attention_weights = tf.nn.softmax(scaled_attention_logits, axis=-1)  # (..., seq_len_q, seq_len_k)

  output = tf.matmul(attention_weights, v)  # (..., seq_len_q, depth_v)

  return output, attention_weights

In [None]:
# Multi-head Attention
class MultiHeadAttention(tf.keras.layers.Layer):
  """Attention computed over `num_heads` parallel slices of width d_model // num_heads.

  q, k and v are linearly projected to d_model, split into heads, passed to
  `scaled_dot_product_attention`, merged back and projected once more.
  """

  def __init__(self, d_model, num_heads):
    super().__init__()
    self.num_heads = num_heads
    self.d_model = d_model

    # The model width must split evenly across the heads.
    assert d_model % self.num_heads == 0

    self.depth = d_model // self.num_heads

    # Input projections for query, key and value.
    self.wq = tf.keras.layers.Dense(d_model)
    self.wk = tf.keras.layers.Dense(d_model)
    self.wv = tf.keras.layers.Dense(d_model)

    # Output projection applied after the heads are merged.
    self.dense = tf.keras.layers.Dense(d_model)

  def split_heads(self, x, batch_size):
    """Reshape (batch_size, seq_len, d_model) into (batch_size, num_heads, seq_len, depth)."""
    per_head = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
    return tf.transpose(per_head, perm=[0, 2, 1, 3])

  def call(self, v, k, q, mask):
    """Return `(output, attention_weights)` for the given value, key and query.

    output has shape (batch_size, seq_len_q, d_model); attention_weights has
    shape (batch_size, num_heads, seq_len_q, seq_len_k).
    """
    batch_size = tf.shape(q)[0]

    # Project, then split into heads: (batch_size, num_heads, seq_len, depth).
    query_heads = self.split_heads(self.wq(q), batch_size)
    key_heads = self.split_heads(self.wk(k), batch_size)
    value_heads = self.split_heads(self.wv(v), batch_size)

    # attended.shape == (batch_size, num_heads, seq_len_q, depth)
    attended, attention_weights = scaled_dot_product_attention(
        query_heads, key_heads, value_heads, mask)

    # Move the head axis next to depth and merge the two back into d_model.
    attended = tf.transpose(attended, perm=[0, 2, 1, 3])  # (batch_size, seq_len_q, num_heads, depth)
    merged = tf.reshape(attended, (batch_size, -1, self.d_model))  # (batch_size, seq_len_q, d_model)

    return self.dense(merged), attention_weights

In [None]:
# Smoke test: self-attention (the same tensor as value, key and query) on one random sequence.
temp_mha = MultiHeadAttention(d_model=512, num_heads=8)
y = tf.random.uniform((1, 60, 512))  # (batch_size, encoder_sequence, d_model)
out, attn = temp_mha(y, k=y, q=y, mask=None)
out.shape, attn.shape

(TensorShape([1, 60, 512]), TensorShape([1, 8, 60, 60]))

In [None]:
# Point wise feed forward network
def point_wise_feed_forward_network(d_model, dff):
  """Return a two-layer network: Dense(dff, relu) followed by Dense(d_model)."""
  hidden = tf.keras.layers.Dense(dff, activation='relu')  # (batch_size, seq_len, dff)
  projection = tf.keras.layers.Dense(d_model)  # (batch_size, seq_len, d_model)
  return tf.keras.Sequential([hidden, projection])

Encoder and Decoder

In [None]:
# Encoder Layer
class EncoderLayer(tf.keras.layers.Layer):
  """One encoder block: self-attention, then a feed-forward network.

  Each sub-layer's output goes through dropout, is added to the sub-layer's
  input (residual connection) and is layer-normalised.
  """

  def __init__(self, d_model, num_heads, dff, rate=0.1):
    super().__init__()

    self.mha = MultiHeadAttention(d_model, num_heads)
    self.ffn = point_wise_feed_forward_network(d_model, dff)

    self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

    self.dropout1 = tf.keras.layers.Dropout(rate)
    self.dropout2 = tf.keras.layers.Dropout(rate)

  def call(self, x, training, mask):
    """Return the block output, shape (batch_size, input_seq_len, d_model)."""
    # Self-attention sub-layer: x serves as value, key and query.
    attended, _ = self.mha(x, x, x, mask)
    attended = self.dropout1(attended, training=training)
    attention_out = self.layernorm1(x + attended)

    # Feed-forward sub-layer.
    fed_forward = self.dropout2(self.ffn(attention_out), training=training)
    return self.layernorm2(attention_out + fed_forward)

In [None]:
# Smoke test: run one encoder layer on random input in inference mode, without a mask.
sample_encoder_layer = EncoderLayer(512, 8, 2048)

sample_encoder_layer_output = sample_encoder_layer(
    tf.random.uniform((64, 43, 512)), False, None)

sample_encoder_layer_output.shape  # (batch_size, input_seq_len, d_model)

TensorShape([64, 43, 512])

In [None]:
# Decoder layer

# 1. Masked multi-head attention sublayer (with look ahead mask and padding mask).
# 2. Multi-head attention (with padding mask). V and K receive the encoder output as inputs;
#    Q receives the output from the masked multi-head attention sublayer.
# 3. Point wise feed forward network.

class DecoderLayer(tf.keras.layers.Layer):
  """One decoder block: masked self-attention, encoder-decoder attention, feed-forward.

  Every sub-layer output goes through dropout, a residual addition and layer
  normalisation.
  """

  def __init__(self, d_model, num_heads, dff, rate=0.1):
    super().__init__()

    self.mha1 = MultiHeadAttention(d_model, num_heads)
    self.mha2 = MultiHeadAttention(d_model, num_heads)

    self.ffn = point_wise_feed_forward_network(d_model, dff)

    self.layernorm1 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    self.layernorm2 = tf.keras.layers.LayerNormalization(epsilon=1e-6)
    self.layernorm3 = tf.keras.layers.LayerNormalization(epsilon=1e-6)

    self.dropout1 = tf.keras.layers.Dropout(rate)
    self.dropout2 = tf.keras.layers.Dropout(rate)
    self.dropout3 = tf.keras.layers.Dropout(rate)

  def call(self, x, enc_output, training, look_ahead_mask, padding_mask):
    """Return `(output, self_attention_weights, encoder_attention_weights)`.

    enc_output.shape == (batch_size, input_seq_len, d_model);
    output.shape == (batch_size, target_seq_len, d_model).
    """
    # Block 1: self-attention over the target, using the look-ahead mask.
    self_attended, attn_weights_block1 = self.mha1(x, x, x, look_ahead_mask)
    self_attended = self.dropout1(self_attended, training=training)
    target_context = self.layernorm1(self_attended + x)

    # Block 2: the encoder output supplies value and key, block 1 supplies the query.
    cross_attended, attn_weights_block2 = self.mha2(
        enc_output, enc_output, target_context, padding_mask)
    cross_attended = self.dropout2(cross_attended, training=training)
    fused = self.layernorm2(cross_attended + target_context)

    # Block 3: point wise feed forward network.
    fed_forward = self.dropout3(self.ffn(fused), training=training)
    block_output = self.layernorm3(fed_forward + fused)

    return block_output, attn_weights_block1, attn_weights_block2


In [None]:
# Smoke test: run one decoder layer on random target input and the sample encoder output,
# in inference mode with no masks.
sample_decoder_layer = DecoderLayer(512, 8, 2048)

sample_decoder_layer_output, _, _ = sample_decoder_layer(
    tf.random.uniform((64, 50, 512)), sample_encoder_layer_output, 
    False, None, None)

sample_decoder_layer_output.shape  # (batch_size, target_seq_len, d_model)

TensorShape([64, 50, 512])

In [None]:
# Encoder consists of:
# - Input embedding
# - Positional encoding
# - N encoder layers

class Encoder(tf.keras.layers.Layer):
  """Stack of `num_layers` EncoderLayer blocks on top of an embedding.

  Args:
    num_layers: number of EncoderLayer blocks.
    d_model: embedding / model width.
    num_heads: attention heads per block.
    dff: hidden width of each block's feed-forward network.
    input_vocab_size: size of the embedding table.
    maximum_position_encoding: number of positions in the precomputed
      positional-encoding table.
    rate: dropout rate.
  """

  def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size, maximum_position_encoding, rate=0.1):
    super(Encoder, self).__init__()

    self.d_model = d_model
    self.num_layers = num_layers

    self.embedding = tf.keras.layers.Embedding(input_vocab_size, d_model)
    self.pos_encoding = positional_encoding(maximum_position_encoding, self.d_model)

    self.enc_layers = [EncoderLayer(d_model, num_heads, dff, rate) for _ in range(num_layers)]

    self.dropout = tf.keras.layers.Dropout(rate)

  def call(self, x, training, mask):
    """Embed the token ids in `x` and run them through every encoder layer.

    Returns:
      Tensor of shape (batch_size, input_seq_len, d_model).
    """
    seq_len = tf.shape(x)[1]

    # Scale the embeddings by sqrt(d_model), then add the positional encoding.
    x = self.embedding(x)  # (batch_size, input_seq_len, d_model)
    x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
    # Keep only the first seq_len positions of the precomputed table.
    x += self.pos_encoding[:, :seq_len, :]

    x = self.dropout(x, training=training)

    for enc_layer in self.enc_layers:
      x = enc_layer(x, training, mask)

    return x  # (batch_size, input_seq_len, d_model)



In [None]:
# Decoder consists of:
# - Output embedding
# - Positional encoding
# - N decoder layers

# The output of this decoder is the input to the final FC layer.

class Decoder(tf.keras.layers.Layer):
  """Stack of `num_layers` DecoderLayer blocks on top of an embedding."""

  def __init__(self, num_layers, d_model, num_heads, dff, target_vocab_size, maximum_position_encoding, rate=0.1):
    super().__init__()

    self.d_model = d_model
    self.num_layers = num_layers

    self.embedding = tf.keras.layers.Embedding(target_vocab_size, d_model)
    self.pos_encoding = positional_encoding(maximum_position_encoding, d_model)

    self.dec_layers = [DecoderLayer(d_model, num_heads, dff, rate) for _ in range(num_layers)]

    self.dropout = tf.keras.layers.Dropout(rate)

  def call(self, x, enc_output, training, look_ahead_mask, padding_mask):
    """Return `(output, attention_weights)`.

    output has shape (batch_size, target_seq_len, d_model); attention_weights
    maps 'decoder_layer<N>_block1' / 'decoder_layer<N>_block2' (N starting at 1)
    to the attention weights of the corresponding block.
    """
    seq_len = tf.shape(x)[1]

    # Scaled embedding plus the first seq_len rows of the positional table.
    x = self.embedding(x)  # (batch_size, target_seq_len, d_model)
    x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
    x += self.pos_encoding[:, :seq_len, :]

    x = self.dropout(x, training=training)

    attention_weights = {}
    for layer_number, dec_layer in enumerate(self.dec_layers, start=1):
      x, block1, block2 = dec_layer(x, enc_output, training, look_ahead_mask, padding_mask)

      attention_weights['decoder_layer{}_block1'.format(layer_number)] = block1
      attention_weights['decoder_layer{}_block2'.format(layer_number)] = block2

    return x, attention_weights

In [None]:
# Create Transformer

class Transformer(tf.keras.Model):
  """Encoder-decoder model followed by a dense projection to the target vocabulary."""

  def __init__(self, num_layers, d_model, num_heads, dff, input_vocab_size,
               target_vocab_size, pe_input, pe_target, rate=0.1):
    super().__init__()

    # NOTE: despite its name, `tokenizer` holds the Encoder stack (it is unrelated to the
    # text tokenizers). The attribute name is kept as-is; renaming it would presumably
    # change the keys under which checkpoints store these variables.
    self.tokenizer = Encoder(
        num_layers, d_model, num_heads, dff, input_vocab_size, pe_input, rate)

    self.decoder = Decoder(
        num_layers, d_model, num_heads, dff, target_vocab_size, pe_target, rate)

    self.final_layer = tf.keras.layers.Dense(target_vocab_size)

  def call(self, inp, tar, training, enc_padding_mask,
           look_ahead_mask, dec_padding_mask):
    """Return `(logits, attention_weights)`.

    logits has shape (batch_size, tar_seq_len, target_vocab_size);
    attention_weights is the dictionary produced by the decoder.
    """
    encoded = self.tokenizer(inp, training, enc_padding_mask)  # (batch_size, inp_seq_len, d_model)

    # decoded.shape == (batch_size, tar_seq_len, d_model)
    decoded, attention_weights = self.decoder(
        tar, encoded, training, look_ahead_mask, dec_padding_mask)

    logits = self.final_layer(decoded)

    return logits, attention_weights
    

In [None]:
# Smoke test: build a small Transformer and push random token ids through it
# in inference mode with no masks, then check the output shape.
sample_transformer = Transformer(
    num_layers=2, d_model=512, num_heads=8, dff=2048, 
    input_vocab_size=8500, target_vocab_size=8000, 
    pe_input=10000, pe_target=6000)

temp_input = tf.random.uniform((64, 38), dtype=tf.int64, minval=0, maxval=200)
temp_target = tf.random.uniform((64, 36), dtype=tf.int64, minval=0, maxval=200)

fn_out, _ = sample_transformer(temp_input, temp_target, training=False, 
                               enc_padding_mask=None, 
                               look_ahead_mask=None,
                               dec_padding_mask=None)

fn_out.shape  # (batch_size, tar_seq_len, target_vocab_size)

TensorShape([64, 36, 8000])

In [None]:
# Set hyperparameters.

# The values of num_layers, d_model and dff are reduced for this project.
# The base Transformer model uses num_layers = 6, d_model = 512 and dff = 2048.

num_layers = 4
d_model = 128
dff = 512
num_heads = 8
dropout_rate = 0.1

In [None]:
# Optimizer learning rate schedule.
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
  """Learning rate that grows during warm-up and then decays.

  lr(step) = d_model**-0.5 * min(step**-0.5, step * warmup_steps**-1.5)

  Args:
    d_model: model width; the rate is scaled by 1/sqrt(d_model).
    warmup_steps: step at which the two terms of the minimum cross over.
  """

  def __init__(self, d_model, warmup_steps=4000):
    super(CustomSchedule, self).__init__()

    self.d_model = tf.cast(d_model, tf.float32)
    self.warmup_steps = warmup_steps

  def __call__(self, step):
    # The caller may pass an integer step counter; rsqrt needs a floating-point input.
    step = tf.cast(step, tf.float32)

    arg1 = tf.math.rsqrt(step)
    arg2 = step * (self.warmup_steps ** -1.5)

    return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

In [None]:
# Adam optimizer driven by the custom schedule, which is scaled by the model width d_model.
learning_rate = CustomSchedule(d_model)

optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9)

In [None]:
# Loss and metrics
# A padding mask is applied when calculating the loss, so the loss object uses
# reduction='none' and the reduction is done manually in loss_function.

loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

In [None]:
def loss_function(real, pred):
  """Mean cross-entropy over the non-padding positions of `real`.

  Positions where `real` equals 0 are treated as padding: their loss is zeroed
  and they are excluded from the denominator.
  """
  per_token_loss = loss_object(real, pred)

  is_token = tf.math.logical_not(tf.math.equal(real, 0))
  weights = tf.cast(is_token, dtype=per_token_loss.dtype)

  masked_total = tf.reduce_sum(per_token_loss * weights)
  return masked_total / tf.reduce_sum(weights)


def accuracy_function(real, pred):
  """Fraction of non-padding positions whose argmax prediction equals `real`.

  The prediction is the argmax of `pred` along axis 2; positions where `real`
  equals 0 are ignored.
  """
  predicted_ids = tf.argmax(pred, axis=2)
  is_token = tf.math.logical_not(tf.math.equal(real, 0))
  is_hit = tf.math.logical_and(is_token, tf.equal(real, predicted_ids))

  hits = tf.reduce_sum(tf.cast(is_hit, dtype=tf.float32))
  tokens = tf.reduce_sum(tf.cast(is_token, dtype=tf.float32))
  return hits / tokens

In [None]:
# Running means of the per-batch loss and accuracy values fed to them.
train_loss = tf.keras.metrics.Mean(name='train_loss')
train_accuracy = tf.keras.metrics.Mean(name='train_accuracy')

# Build the model to be trained. Vocabulary sizes come from the tokenizers;
# pe_input / pe_target set how many positions the positional-encoding tables cover.
transformer = Transformer(
    num_layers=num_layers,
    d_model=d_model,
    num_heads=num_heads,
    dff=dff,
    input_vocab_size=tokenizers.pt.get_vocab_size(),
    target_vocab_size=tokenizers.en.get_vocab_size(), 
    pe_input=1000, 
    pe_target=1000,
    rate=dropout_rate)


In [None]:
# Broadcasting check: element-wise maximum of a (2, 1, 1, 2) tensor and a (2, 2) tensor,
# the same kind of operation create_masks uses to combine its two masks.
x = tf.random.uniform((2,1,1,2), minval=1, maxval=3, dtype = tf.int32)
y = tf.random.uniform((2,2), minval=1, maxval=4, dtype=tf.int32)

print(x)
print(y)
tf.math.maximum(x,y)

tf.Tensor(
[[[[1 1]]]


 [[[2 2]]]], shape=(2, 1, 1, 2), dtype=int32)
tf.Tensor(
[[3 2]
 [1 3]], shape=(2, 2), dtype=int32)


<tf.Tensor: shape=(2, 1, 2, 2), dtype=int32, numpy=
array([[[[3, 2],
         [1, 3]]],


       [[[3, 2],
         [2, 3]]]], dtype=int32)>

In [None]:
def create_masks(inp, tar):
  """Build the three masks the Transformer call needs.

  Returns:
    enc_padding_mask: padding mask of `inp`, for the encoder.
    combined_mask: element-wise maximum of the look-ahead mask and the padding
      mask of `tar`, for the decoder's first attention block.
    dec_padding_mask: padding mask of `inp`, for the decoder's second attention
      block, where it masks the encoder outputs.
  """
  # Both encoder-side masks are derived from the input sequence.
  enc_padding_mask = create_padding_mask(inp)
  dec_padding_mask = create_padding_mask(inp)

  # A target position is masked if it is a future token OR padding.
  future_mask = create_look_ahead_mask(tf.shape(tar)[1])
  target_padding_mask = create_padding_mask(tar)
  combined_mask = tf.maximum(target_padding_mask, future_mask)

  return enc_padding_mask, combined_mask, dec_padding_mask

In [None]:
# Track the model and optimizer in a checkpoint; the manager keeps at most 5 checkpoints.
checkpoint_path = './Checkpoints/translate/train'

ckpt = tf.train.Checkpoint(transformer = transformer,
                           optimizer = optimizer)

ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=5)

# Resume from the most recent checkpoint when one exists.
if ckpt_manager.latest_checkpoint:
  ckpt.restore(ckpt_manager.latest_checkpoint)
  print('Checkpoint Restored')

In [None]:
# Number of passes over the training batches.
EPOCHS = 20
transformer

<__main__.Transformer at 0x7f327fa628d0>

In [None]:
# Display the checkpoint manager object.
ckpt_manager

<tensorflow.python.training.checkpoint_management.CheckpointManager at 0x7f327f915400>

In [None]:
# To avoid re-tracing due to the variable sequence lengths or variable batch sizes (the last batch is smaller),
# use input_signature to specify more generic shapes.

train_step_signature = [
    tf.TensorSpec(shape=(None, None), dtype=tf.int64),
    tf.TensorSpec(shape=(None, None), dtype=tf.int64),
]

@tf.function(input_signature=train_step_signature)
def train_step(inp, tar):
  """Run one optimisation step on a batch and update the running metrics.

  Args:
    inp: int64 tensor of input token ids, shape (batch, inp_seq_len).
    tar: int64 tensor of target token ids, shape (batch, tar_seq_len).

  The target is shifted by one position: `tar[:, :-1]` is fed to the decoder
  and `tar[:, 1:]` is what the model is asked to predict.
  """
  tar_inp = tar[:, :-1]
  tar_real = tar[:, 1:]

  enc_padding_mask, combined_mask, dec_padding_mask = create_masks(inp, tar_inp)

  # Only the forward pass and the loss need to be recorded on the tape.
  with tf.GradientTape() as tape:
    predictions, _ = transformer(inp, tar_inp,
                                 True,
                                 enc_padding_mask,
                                 combined_mask,
                                 dec_padding_mask)
    loss = loss_function(tar_real, predictions)

  # Gradient computation, the optimizer update and metric updates run outside the tape context.
  gradients = tape.gradient(loss, transformer.trainable_variables)
  optimizer.apply_gradients(zip(gradients, transformer.trainable_variables))

  train_loss(loss)
  train_accuracy(accuracy_function(tar_real, predictions))

In [None]:
# Training loop: one pass over train_batches per epoch, with progress printed every 50 batches.
for epoch in range(EPOCHS):
  start = time.time()
  
  # Start every epoch with fresh running metrics.
  train_loss.reset_states()
  train_accuracy.reset_states()

  # inp -> Portuguese, tar -> English
  for (batch, (inp, tar)) in enumerate(train_batches):
    train_step(inp, tar)

    if batch % 50 == 0:
      print(f'Epoch {epoch + 1} Batch {batch} Loss {train_loss.result():.4f} Accuracy {train_accuracy.result():.4f}')

  # Save a checkpoint every 4th epoch.
  if (epoch + 1) % 4 == 0:
    ckpt_save_path = ckpt_manager.save()
    print(f'Saving checkpoint for epoch {epoch +1} at {ckpt_save_path}')

  print(f'Epoch {epoch + 1} Loss {train_loss.result():.4f} Accuracy {train_accuracy.result():.4f}')

  print(f'Time taken for 1 epoch: {time.time() - start:.2f} secs\n')
  # Training was interrupted because of how long it takes.

Epoch 1 Batch 0 Loss 8.9047 Accuracy 0.0000
Epoch 1 Batch 50 Loss 8.8461 Accuracy 0.0003
Epoch 1 Batch 100 Loss 8.7310 Accuracy 0.0191
Epoch 1 Batch 150 Loss 8.6120 Accuracy 0.0288
Epoch 1 Batch 200 Loss 8.4690 Accuracy 0.0352
Epoch 1 Batch 250 Loss 8.2992 Accuracy 0.0412
Epoch 1 Batch 300 Loss 8.1073 Accuracy 0.0479
Epoch 1 Batch 350 Loss 7.9084 Accuracy 0.0537
Epoch 1 Batch 400 Loss 7.7201 Accuracy 0.0594
Epoch 1 Batch 450 Loss 7.5508 Accuracy 0.0664
Epoch 1 Batch 500 Loss 7.4036 Accuracy 0.0732
Epoch 1 Batch 550 Loss 7.2698 Accuracy 0.0804
Epoch 1 Batch 600 Loss 7.1458 Accuracy 0.0879
Epoch 1 Batch 650 Loss 7.0273 Accuracy 0.0955
Epoch 1 Batch 700 Loss 6.9201 Accuracy 0.1024
Epoch 1 Batch 750 Loss 6.8200 Accuracy 0.1086
Epoch 1 Batch 800 Loss 6.7245 Accuracy 0.1148
Epoch 1 Loss 6.7081 Accuracy 0.1158
Time taken for 1 epoch: 3317.37 secs

Epoch 2 Batch 0 Loss 5.3143 Accuracy 0.1956
Epoch 2 Batch 50 Loss 5.2337 Accuracy 0.2112
Epoch 2 Batch 100 Loss 5.2044 Accuracy 0.2156
Epoch 2 Batc

In [None]:

# Export the trained model to disk.
# NOTE(review): Transformer is a subclassed model whose call() takes several positional
# inputs; confirm that Keras can export it this way. No output was recorded for this cell.
# The checkpoint manager (or save_weights) may be the safer way to persist it.
transformer.save('./portugese_translator')