# Imports

In [1]:
import numpy as np
import re
import time
import tensorflow as tf
from tensorflow.keras import layers
import tensorflow_datasets as tfds
from tqdm import tqdm

tf.get_logger().setLevel('ERROR')

2022-04-12 12:06:20.207162: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.1


# Loading database

In [2]:
def load_db(file_path):
    with open(file_path) as f:
        return f.read()

euro_en = load_db('pt-en/europarl-v7.pt-en.en')
euro_pt = load_db('pt-en/europarl-v7.pt-en.pt')

In [3]:
print('en sample example: ', euro_en.split('\n')[0])
print('pt sample example: ', euro_pt.split('\n')[0])

en sample example:  Resumption of the session
pt sample example:  Reinício da sessão


# Data cleaning

In [4]:
def data_cleaning(data):
    data = re.sub(r'\.(?=[0-9]|[a-z]|[A-Z])', '$$', data)
    data = re.sub(r'\$\$', '', data)
    data = re.sub(r' +', ' ', data)
    return data.split('\n')

data_en = data_cleaning(data=euro_en)
data_pt = data_cleaning(data=euro_pt)

print('Data en: ', data_en[10])
print('Data pt: ', data_pt[10])

Data en:  Would it be appropriate for you, Madam President, to write a letter to the Sri Lankan President expressing Parliament's regret at his and the other violent deaths in Sri Lanka and urging her to do everything she possibly can to seek a peaceful reconciliation to a very difficult situation?
Data pt:  Será que a senhora Presidente poderia enviar uma carta à Presidente do Sri Lanka manifestando o pesar do Parlamento por esta e outras mortes violentas perpetradas no seu país, e instando­a a envidar todos os esforços ao seu alcance para procurar obter uma reconciliação pacífica na situação extremamente difícil que ali se vive?


In [5]:
print('en data size: {} | pt data size: {}'.format(len(data_en), len(data_pt)))

en data size: 1960408 | pt data size: 1960408


# Tokenization

In [6]:
def tokenizer_data(data, vocab_size):
    return tfds.features.text.SubwordTextEncoder.build_from_corpus(data, target_vocab_size=vocab_size)

tokenizer_en = tokenizer_data(data=data_en, vocab_size=2**13) # en data
tokenizer_pt = tokenizer_data(data=data_pt, vocab_size=2**13) # pt data

print('En vocab size: ', tokenizer_en.vocab_size)
print('Pt vocab size: ', tokenizer_pt.vocab_size)

vocab_size_en = tokenizer_en.vocab_size + 2
vocab_size_pt = tokenizer_pt.vocab_size + 2

En vocab size:  8191
Pt vocab size:  8116


In [7]:
tokenizer_en.vocab_size

8191

In [8]:
def token_start_end(data, tokenizer):
    vocab_size = tokenizer.vocab_size + 2
    # adding start and end token in each setense
    return [[vocab_size - 2] + tokenizer.encode(sentense) + [vocab_size - 1] for sentense in data]

inputs = token_start_end(data=data_en, tokenizer=tokenizer_en)
outputs = token_start_end(data=data_pt, tokenizer=tokenizer_pt)

print('Input example: ', inputs[0])
print('Output example: ', outputs[0])

Input example:  [8191, 2458, 972, 2108, 3, 1, 2571, 8192]
Output example:  [8116, 834, 705, 7, 3561, 8117]


removing setenses longer than 15 

In [9]:
def remove_longer_sentense(data, max_length=15):
    idx_to_remove = [idx for idx, sentense in enumerate(data) if len(sentense) > max_length]

    for idx in tqdm(reversed(idx_to_remove)):
        # remove the same setense in english and portuguese dataset
        del inputs[idx]
        del outputs[idx]

remove_longer_sentense(data=inputs)
remove_longer_sentense(data=outputs)

print('len inputs: {} | len outputs {}'.format(len(inputs), len(outputs)))

1685300it [07:09, 3923.94it/s] 
66118it [00:10, 6589.59it/s]  

len inputs: 208990 | len outputs 208990





padding sentenses 

In [10]:
def padding_sequences(data, max_length):
    return tf.keras.preprocessing.sequence.pad_sequences(sequences=data, value=0, padding='post', maxlen=max_length)

inputs = padding_sequences(data=inputs, max_length=15)
outputs = padding_sequences(data=outputs, max_length=15)

print('Input padded sequences example: ', inputs[0])
print('Output padded sequences example: ', outputs[0])

Input padded sequences example:  [8191 2458  972 2108    3    1 2571 8192    0    0    0    0    0    0
    0]
Output padded sequences example:  [8116  834  705    7 3561 8117    0    0    0    0    0    0    0    0
    0]


final dataset cration with tf optimization

In [11]:
batch_size = 64
buffer_size = 20000

dataset = tf.data.Dataset.from_tensor_slices(tensors=(inputs, outputs))
dataset = dataset.cache()
dataset = dataset.shuffle(buffer_size=buffer_size).batch(batch_size=batch_size)
dataset = dataset.prefetch(tf.data.experimental.AUTOTUNE)

2022-04-12 12:22:27.675068: I tensorflow/compiler/jit/xla_cpu_device.cc:41] Not creating XLA devices, tf_xla_enable_xla_devices not set
2022-04-12 12:22:27.683407: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcuda.so.1
2022-04-12 12:22:27.715283: I tensorflow/stream_executor/cuda/cuda_gpu_executor.cc:941] successful NUMA node read from SysFS had negative value (-1), but there must be at least one NUMA node, so returning NUMA node zero
2022-04-12 12:22:27.716238: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1720] Found device 0 with properties: 
pciBusID: 0000:05:00.0 name: NVIDIA GeForce GTX 1060 6GB computeCapability: 6.1
coreClock: 1.8095GHz coreCount: 10 deviceMemorySize: 5.93GiB deviceMemoryBandwidth: 178.99GiB/s
2022-04-12 12:22:27.716279: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcudart.so.10.1
2022-04-12 12:22:27.779533: I tensorflow/stream_executor/platfo

# Model building

## Positional Encoding implementation

![pe](imgs/pe.png)

$ PE(pos, 2i) = sin(pos * angles)$

$ PE(pos, 2i + 1) = cos(pos * angles)$


$ angles = \frac{1}{10000^{2*i / dmodel}}$

In [12]:
class PositionalEncoding(layers.Layer):
    def __init__(self):
        super(PositionalEncoding, self).__init__()

    def get_angles(self, i, d_model):
        # definition of angles
        return 1 / np.power(10000., (2*(i//2)) / np.float32(d_model))

    def call(self, inputs):
        seq_length = inputs.shape.as_list()[-2]
        d_model = inputs.shape.as_list()[-1]

        # get the pos and i matrix to gerate positional encoding from a input tensor
        pos = np.arange(seq_length)[:, np.newaxis]
        i = np.arange(d_model)[np.newaxis, :]

        # calculates the angle results
        angles = self.get_angles(i=i, d_model=d_model)

        # calculates the positional encoding
        pe = pos * angles #(seq_length, d_model)
        pe[:, 0::2] = np.sin(pe[:, 0::2]) #even position
        pe[:, 1::2] = np.cos(pe[:, 1::2]) #odd position

        # transform the pos encoding dimension to the same as the input
        pos_encoding = pe[np.newaxis, ...]
        pos_encoding = tf.cast(x=pos_encoding, dtype=tf.float32)

        # print('Inputs shape: {} | PE shape {} | Pos encoding shape: {}'.format(inputs.shape, pe.shape, pos_encoding.shape))
        
        # print('Pos encoding', pos_encoding)

        # print('Pos encoding shape {} | inputs shape {}'.format(pos_encoding.shape, inputs.shape))

        return inputs + pos_encoding

Testing positional encoding class

In [13]:
pos = PositionalEncoding()

matrix_test = tf.ones((1, 4, 4))

print('Inputs + pos encoding', pos(matrix_test))

Inputs + pos encoding tf.Tensor(
[[[1.        2.        1.        2.       ]
  [1.841471  1.5403023 1.0099999 1.9999499]
  [1.9092975 0.5838531 1.0199987 1.9998   ]
  [1.14112   0.0100075 1.0299954 1.9995501]]], shape=(1, 4, 4), dtype=float32)


## Attention mecanism

### Scaled dot product Attention

![scaled dot product attention](imgs/scaled-dot-produt-attention.png)

$Attention(Q, K, V) = softmax(\frac{QK^{T}}{\sqrt{d_{k}}})*V$

Q = queries

K = keys

V = values

$K^{T}$ = K matrix transpose

$d_{k}$ = K dimension

In [14]:
def scaled_dot_product_attention(queries, keys, values, mask):
    product = tf.matmul(queries, keys, transpose_b=True)
    keys_dim = tf.cast(tf.shape(keys)[-1], tf.float32)

    scaled_produt = product / tf.math.sqrt(keys_dim)

    if mask is not None:
        scaled_produt += (mask * -1e9)

    softmax = tf.nn.softmax(scaled_produt, axis=-1)

    return tf.matmul(softmax, values)

### Multi-Head Attention

![mult-head attention](imgs/multi-head-attention.png)

In [15]:
class MultiHeadAttention(layers.Layer):
    def __init__(self, num_proj):
        super(MultiHeadAttention, self).__init__()

        # projections
        self.num_proj = num_proj

    def build(self, input_shape):
        # input_shape = inputs comming from call method
        # get the model shape
        self.d_model = input_shape[-1]

        # print(self.d_model, self.num_proj)
        
        # splitting according to projections
        assert self.d_model % self.num_proj == 0
        self.d_proj = d_model // self.num_proj

        # linear dense layers
        self.query_linear_dense = layers.Dense(units = self.d_model)
        self.keys_linear_dense = layers.Dense(units = self.d_model)
        self.values_linear_dense = layers.Dense(units = self.d_model)

        self.final_linear_dense = layers.Dense(units = self.d_model)

    def split_proj(self, inputs, batch_size):
        shape = (batch_size, -1, self.num_proj, self.d_proj)
        splitted_inputs = tf.reshape(inputs, shape=shape) # (batch_size, seq_length, num_proj, d_proj)

        return tf.transpose(splitted_inputs, perm=[0, 2, 1, 3]) # (batch_size, num_proj, seq_length, d_proj)

    def scaled_dot_product_attention(self, queries, keys, values, mask=None):
        product = tf.matmul(queries, keys, transpose_b=True)
        keys_dim = tf.cast(tf.shape(keys)[-1], tf.float32)

        scaled_produt = product / tf.math.sqrt(keys_dim)

        if mask is not None:
            scaled_produt += (mask * -1e9)

        softmax = tf.nn.softmax(scaled_produt, axis=-1)

        return tf.matmul(softmax, values)

    def call(self, queries, keys, values, mask):
        batch_size = tf.shape(queries)[0]

        queries = self.query_linear_dense(queries)
        keys = self.keys_linear_dense(keys)
        values = self.values_linear_dense(values)

        queries = self.split_proj(inputs=queries, batch_size=batch_size)
        keys = self.split_proj(inputs=keys, batch_size=batch_size)
        values = self.split_proj(inputs=values, batch_size=batch_size)

        attention = self.scaled_dot_product_attention(queries = queries,
                                                      keys = keys,
                                                      values = values,
                                                      mask=mask)

        # return the same shape as the input
        attention = tf.transpose(attention, perm=[0, 2, 1, 3])
        concat = tf.reshape(attention, shape=(batch_size, -1, self.d_model))

        return self.final_linear_dense(concat)

# Encoder

## Encoder Layer

![encoder](imgs/encoder-layer.png)

In [16]:
class EncoderLayer(layers.Layer):
    def __init__(self, ff_units, num_proj, dropout_rate):
        super(EncoderLayer, self).__init__()

        self.ff_units = ff_units
        self.num_proj = num_proj
        self.dropout_rate = dropout_rate

    def build(self, input_shape):
        self.d_model = input_shape[-1]

        self.multi_head_attention = MultiHeadAttention(num_proj=self.num_proj)
        self.dropout_1 = layers.Dropout(rate=self.dropout_rate)
        self.norm_1 = layers.LayerNormalization(epsilon=1e-6) #1e-6 = 0.0000001

        self.dense_1 = layers.Dense(units=self.ff_units, activation='relu')
        self.dense_2 = layers.Dense(units=self.d_model, activation='relu')
        self.dropout_2 = layers.Dropout(rate=self.dropout_rate)
        self.norm_2 = layers.LayerNormalization(epsilon=1e-6)

    def call(self, inputs, mask, training:bool):
        attention = self.multi_head_attention(queries = inputs,
                                              keys = inputs,
                                              values = inputs,
                                              mask = mask)
        attention = self.dropout_1(attention, training = training)
        attention = self.norm_1(attention + inputs)

        outputs = self.dense_1(attention)
        outputs = self.dense_2(outputs)
        outputs = self.dropout_2(outputs, training = training)

        outputs = self.norm_2(outputs + attention)

        return outputs

## Encoder mechanism

![encoder](imgs/encoder.png)

In [17]:
class Encoder(layers.Layer):
    def __init__(self, 
                 num_layers,
                 ff_units, 
                 num_proj, 
                 dropout_rate, 
                 vocab_size, 
                 d_model, 
                 name="encoder"):
        super(Encoder, self).__init__(name=name)

        self.num_layers = num_layers
        self.d_model = d_model

        self.embedding = layers.Embedding(input_dim=vocab_size, output_dim=d_model)
        self.positional_encoding = PositionalEncoding()
        self.dropout = layers.Dropout(rate=dropout_rate)
        self.encoder_layers = [EncoderLayer(ff_units=ff_units, num_proj=num_proj, dropout_rate=dropout_rate) for _ in range(self.num_layers)]

    def call(self, inputs, mask, training):
        # embeddings        
        embedding_inputs = self.embedding(inputs)
        embedding_inputs *= tf.math.sqrt(x=tf.cast(x=self.d_model, dtype=tf.float32))

        #positional encodings
        pe = self.positional_encoding(inputs=embedding_inputs)
        pe = self.dropout(pe, training=training)

        # encoder layers
        # enc_outputs = [enc_layer(inputs=pe, mask=mask, training=training) for enc_layer in self.encoder_layers]

        for i in range(self.num_layers):
            enc_outputs = self.encoder_layers[i](pe, mask, training)

        return enc_outputs

# Decoder

## Decoder Layer

![decoder-layer](imgs/decoder-layer.png)

In [18]:
class DecoderLayer(layers.Layer):
    def __init__(self, 
                 ff_units, 
                 num_proj, 
                 dropout_rate):
        super(DecoderLayer, self).__init__()

        self.ff_units = ff_units
        self.num_proj = num_proj
        self.dropout_rate = dropout_rate

    def build(self, input_shape):
        self.d_model = input_shape[-1]

        # attention layer
        self.mult_head_attention_1 = MultiHeadAttention(num_proj=self.num_proj)
        self.dropout_1 = layers.Dropout(rate=self.dropout_rate)
        self.norm_1 = layers.LayerNormalization(epsilon=1e-6)

        # attention layer
        self.mult_head_attention_2 = MultiHeadAttention(num_proj=self.num_proj)
        self.dropout_2 = layers.Dropout(rate=self.dropout_rate)
        self.norm_2 = layers.LayerNormalization(epsilon=1e-6)

        # dense layer
        self.dense_1 = layers.Dense(units=self.ff_units, activation='relu')
        self.dense_2 = layers.Dense(units=self.d_model, activation='relu')
        self.dropout_3 = layers.Dropout(rate=self.dropout_rate)
        self.norm_3 = layers.LayerNormalization(epsilon=1e-6)

    def call(self, inputs, enc_outputs, mask_1, mask_2, training):
        # attention mechanism block
        attention_1 = self.mult_head_attention_1(queries = inputs, 
                                               keys = inputs, 
                                               values = inputs, 
                                               mask = mask_1)
        attention_1 = self.dropout_1(attention_1, training = training)
        attention_1 = self.norm_1(attention_1 + inputs)
        
        # attention mechanism block
        attention_2 = self.mult_head_attention_2(queries = attention_1, 
                                               keys = enc_outputs, 
                                               values = enc_outputs, 
                                               mask = mask_2)
        attention_2 = self.dropout_2(attention_2, training = training)
        attention_2 = self.norm_2(attention_2 + attention_1)

        # dense block
        outputs = self.dense_1(attention_2)
        outputs = self.dense_2(outputs)
        outputs = self.dropout_3(outputs, training = training)

        outputs = self.norm_3(outputs + attention_2)

        return outputs

## Decoder mechanism

![decoder](imgs/decoder.png)

In [19]:
class Decoder(layers.Layer):
    def __init__(self, 
                 num_layers, 
                 ff_units, 
                 num_proj, 
                 dropout_rate, 
                 vocab_size, 
                 d_model, 
                 name="decoder"):
        super(Decoder, self).__init__(name=name)

        self.num_layers = num_layers
        self.d_model = d_model
        
        self.embedding = layers.Embedding(input_dim=vocab_size, output_dim=d_model)
        self.positional_encoding = PositionalEncoding()
        self.dropout = layers.Dropout(rate=dropout_rate)

        self.decoder_layers = [DecoderLayer(ff_units=ff_units, num_proj=num_proj, dropout_rate=dropout_rate) for _ in range(num_layers)]
    
    def call(self, inputs, enc_outputs, mask_1, mask_2, training):
        # Embedding
        embedding = self.embedding(inputs)
        embedding *= tf.math.sqrt(x=tf.cast(x=self.d_model, dtype=tf.float32))

        # Positional Encoding
        pe = self.positional_encoding(inputs = embedding)
        pe = self.dropout(pe, training)

        # Decoder layers block
        # dec_outputs = [dec_layer(inputs = pe, enc_outputs = enc_outputs, mask_1 = mask_1, mask_2 = mask_2, training = training) for dec_layer in self.decoder_layers]
        for i in range(self.num_layers):
            dec_outputs = self.decoder_layers[i](pe, enc_outputs, mask_1, mask_2, training)

        return dec_outputs

# Transformer

In [20]:
def creating_padding_mask(seq): #(batch_size, seq_length) -> (batch_size, num_proj, seq_length, d_proj)
        mask = tf.cast(x=tf.equal(seq, 0), dtype=tf.float32)
        return mask[:, tf.newaxis, tf.newaxis, :]

def create_look_ahead_mask(seq):
    seq_len = tf.shape(seq)[-1]
    
    return 1 - tf.linalg.band_part(input=tf.ones(shape=(seq_len, seq_len)), num_lower=-1, num_upper=0)

In [21]:
seq_test = tf.cast(x=[[865, 526, 378, 128, 0, 0, 0]], dtype=tf.int32)

In [22]:
creating_padding_mask(seq=seq_test)

<tf.Tensor: shape=(1, 1, 1, 7), dtype=float32, numpy=array([[[[0., 0., 0., 0., 1., 1., 1.]]]], dtype=float32)>

In [23]:
create_look_ahead_mask(seq=seq_test)

<tf.Tensor: shape=(7, 7), dtype=float32, numpy=
array([[0., 1., 1., 1., 1., 1., 1.],
       [0., 0., 1., 1., 1., 1., 1.],
       [0., 0., 0., 1., 1., 1., 1.],
       [0., 0., 0., 0., 1., 1., 1.],
       [0., 0., 0., 0., 0., 1., 1.],
       [0., 0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 0., 0., 0.]], dtype=float32)>

In [24]:
tf.maximum(x=creating_padding_mask(seq=seq_test), y=create_look_ahead_mask(seq=seq_test))

<tf.Tensor: shape=(1, 1, 7, 7), dtype=float32, numpy=
array([[[[0., 1., 1., 1., 1., 1., 1.],
         [0., 0., 1., 1., 1., 1., 1.],
         [0., 0., 0., 1., 1., 1., 1.],
         [0., 0., 0., 0., 1., 1., 1.],
         [0., 0., 0., 0., 1., 1., 1.],
         [0., 0., 0., 0., 1., 1., 1.],
         [0., 0., 0., 0., 1., 1., 1.]]]], dtype=float32)>

![transformer](imgs/transformer.png)

In [25]:
class Transformer(tf.keras.Model):
    def __init__(self,
                 vocab_en_size,
                 vocab_pt_size,
                 d_model,
                 num_layers,
                 ff_units,
                 num_proj,
                 dropout_rate,
                 name="Transformer"):
        super(Transformer, self).__init__(name=name)

        self.encoder = Encoder(num_layers=num_layers,
                               ff_units=ff_units,
                               num_proj=num_proj,
                               dropout_rate=dropout_rate,
                               vocab_size=vocab_en_size,
                               d_model=d_model)

        self.decoder = Decoder(num_layers=num_layers,
                               ff_units=ff_units,
                               num_proj=num_proj,
                               dropout_rate=dropout_rate,
                               vocab_size=vocab_pt_size,
                               d_model=d_model)

        self.output_dense = layers.Dense(units=vocab_pt_size, name="output_dense")

    def create_padding_mask(self, seq): #(batch_size, seq_length) -> (batch_size, num_proj, seq_length, d_proj)
        mask = tf.cast(x=tf.math.equal(seq, 0), dtype=tf.float32)
        return mask[:, tf.newaxis, tf.newaxis, :]

    def create_look_ahead_mask(self, seq):
        seq_len = tf.shape(seq)[-1]        
        return 1 - tf.linalg.band_part(input=tf.ones(shape=(seq_len, seq_len)), num_lower=-1, num_upper=0)

    def call(self, enc_inputs, dec_inputs, training:bool):
        # masks
        enc_mask = self.create_padding_mask(seq=enc_inputs)
        dec_mask_1 = tf.maximum(x=self.create_padding_mask(dec_inputs), y=self.create_look_ahead_mask(dec_inputs))
        dec_mask_2 = self.create_padding_mask(seq=enc_inputs)

        enc_outputs = self.encoder(inputs = enc_inputs, mask = enc_mask, training = training)
        dec_outputs = self.decoder(inputs = dec_inputs, enc_outputs = enc_outputs, mask_1 = dec_mask_1, mask_2 = dec_mask_2, training = training)

        outputs = self.output_dense(dec_outputs)

        return outputs


# Training

hyperparans

In [26]:
tf.keras.backend.clear_session()

d_model = 128 # recomends 512
num_layers = 4 # recomends 6
ff_units = 512 # recomends 2048
num_proj = 8 # recomends 8
dropout_rate = 0.1 # recomends 0.1

In [27]:
transformer = Transformer(vocab_en_size=vocab_size_en,
                         vocab_pt_size=vocab_size_pt,
                         d_model=d_model,
                         num_layers=num_layers,
                         ff_units=ff_units,
                         num_proj=num_proj,
                         dropout_rate=dropout_rate)

## Loss function

In [28]:
loss_ob = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction='none')

In [29]:
def loss_function(target, pred):
    mask = tf.math.logical_not(tf.math.equal(x=target, y=0))
    loss_ = loss_ob(y_true=target, y_pred=pred)

    mask = tf.cast(x=mask, dtype=loss_.dtype)
    loss_ *= mask

    return tf.reduce_mean(input_tensor=loss_)

In [30]:
train_loss = tf.keras.metrics.Mean(name="train_loss")
train_acc = tf.keras.metrics.SparseCategoricalAccuracy(name="train_acc")

## Optimizer

Its recommended use Adam Optimize with $\beta_{1} = 0.9$, $\beta_{2} = 0.98$, $\epsilon = 10^{-9}$ and custom learning rate schedule

$lrate = d\_model^{-0.5} * min(step\_num^{-0.5}, step\_num * warmup\_step^{-1.5})$

$x^{-0.5}$ = inverse square root 

In [31]:
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    def __init__(self, d_model, warmup_steps = 4000):
        super(CustomSchedule, self).__init__()

        self.d_model = tf.cast(x=d_model, dtype=tf.float32)
        self.warmup_steps = warmup_steps

    def __call__(self, step):
        arg1 = tf.math.rsqrt(x=step)
        arg2 = step * (self.warmup_steps ** -1.5)

        return tf.math.rsqrt(x=self.d_model) * tf.math.minimum(x=arg1, y=arg2)

In [32]:
lrate = CustomSchedule(d_model=d_model)

optimizer = tf.keras.optimizers.Adam(learning_rate=lrate, beta_1=0.9, beta_2=0.98, epsilon=1e-9)

In [33]:
checkpoint_path = "checkpoint"
ckpt = tf.train.Checkpoint(transformer = transformer, optimizer = optimizer)
ckpt_manager = tf.train.CheckpointManager(checkpoint=ckpt, directory=checkpoint_path, max_to_keep=5)

if ckpt_manager.latest_checkpoint:
    ckpt.restore(ckpt_manager.latest_checkpoint)
    print('Checkpoint restored!')

else:
    print('No checkpoint found!')

No checkpoint found!


In [34]:
epochs = 20

for epoch in range(epochs):
    print('Epoch {} of {}'.format(epoch + 1, epochs))
    start_time = time.time()

    # reset states
    train_loss.reset_states()
    train_acc.reset_states()

    for (batch, (enc_inputs, targets)) in enumerate(dataset):
        # shifted right
        dec_inputs = targets[:, :-1]
        dec_outputs_real = targets[:, 1:]

        with tf.GradientTape() as tape:
            pred = transformer(enc_inputs = enc_inputs, dec_inputs = dec_inputs, training = True)

            loss = loss_function(target = dec_outputs_real, pred = pred)

        gradients = tape.gradient(loss, transformer.trainable_variables)
        optimizer.apply_gradients(zip(gradients, transformer.trainable_variables))

        train_loss(loss)
        train_acc(dec_outputs_real, pred)

        if batch % 50 == 0:
            print('Epoch {} | Batch {} | Accuracy {:.4f} | Loss {:.4f}'.format(epoch+1, batch, train_acc.result(), train_loss.result()))
    
    ckpt_manager.save()
    print('Saving checkpoint...')
    print('Time taken in epoch {}: {} secs \n'.format(epoch, time.time() - start_time))

Epoch 1 of 20


2022-04-12 12:22:29.259865: I tensorflow/stream_executor/platform/default/dso_loader.cc:49] Successfully opened dynamic library libcublas.so.10


Epoch 1 | Batch 0 | Accuracy 0.0000 | Loss 6.3823
Epoch 1 | Batch 50 | Accuracy 0.0000 | Loss 6.4801
Epoch 1 | Batch 100 | Accuracy 0.0158 | Loss 6.3840
Epoch 1 | Batch 150 | Accuracy 0.0347 | Loss 6.2942
Epoch 1 | Batch 200 | Accuracy 0.0446 | Loss 6.1965
Epoch 1 | Batch 250 | Accuracy 0.0505 | Loss 6.0882
Epoch 1 | Batch 300 | Accuracy 0.0545 | Loss 5.9648
Epoch 1 | Batch 350 | Accuracy 0.0586 | Loss 5.8316
Epoch 1 | Batch 400 | Accuracy 0.0654 | Loss 5.6932
Epoch 1 | Batch 450 | Accuracy 0.0721 | Loss 5.5632
Epoch 1 | Batch 500 | Accuracy 0.0781 | Loss 5.4458
Epoch 1 | Batch 550 | Accuracy 0.0835 | Loss 5.3370
Epoch 1 | Batch 600 | Accuracy 0.0888 | Loss 5.2266
Epoch 1 | Batch 650 | Accuracy 0.0946 | Loss 5.1287
Epoch 1 | Batch 700 | Accuracy 0.1007 | Loss 5.0322
Epoch 1 | Batch 750 | Accuracy 0.1070 | Loss 4.9412
Epoch 1 | Batch 800 | Accuracy 0.1129 | Loss 4.8555
Epoch 1 | Batch 850 | Accuracy 0.1187 | Loss 4.7731
Epoch 1 | Batch 900 | Accuracy 0.1242 | Loss 4.6969
Epoch 1 | Batch