<a href="https://colab.research.google.com/gist/njk8/024864eca950edea8d0839b48c009baa/proj_transformer_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
%cd /content/drive/My Drive/studies/NLP/project

/content/drive/My Drive/studies/NLP/project


In [None]:
import numpy as np
import math
import re
import time

In [None]:
import tensorflow as tf
from tensorflow.keras import layers
import tensorflow_datasets as tfds

**Load the dataset**

In [None]:
def _read_utf8(path):
  """Return the full contents of the text file at `path`, decoded as UTF-8."""
  with open(path, mode='r', encoding='utf-8') as handle:
    return handle.read()

# The two files hold the English and Swedish sides of the corpus; each is read
# whole into a single string.
europarl_en = _read_utf8("europarl-v7.sv-en.en")
europarl_sv = _read_utf8("europarl-v7.sv-en.sv")

In [None]:
europarl_en[:50]

'Resumption of the session\nI declare resumed the se'

In [None]:
#a.m = a.$$$m = am

In [None]:
def _clean_corpus(text):
    """Normalise one side of the corpus and split it into sentences.

    The whole text is processed at once, in this order:
      1. every '.' directly followed by an ASCII letter or digit is deleted
         (e.g. "a.m" -> "am"), so only dots followed by something else remain;
      2. runs of two or more spaces are collapsed to a single space;
      3. round brackets are deleted (the characters are removed, not replaced
         by a space);
      4. the text is split on the newline character, one sentence per line.

    Args:
        text: the raw corpus as a single string.

    Returns:
        A list of cleaned sentences (strings).
    """
    # One substitution replaces the old "insert '.$$$' then delete it" round
    # trip. It has the same effect on dots but cannot accidentally delete text
    # that already contained '$$$'.
    # NOTE: only ASCII letters are recognised here, as in the original pattern.
    text = re.sub(r"\.(?=[0-9a-zA-Z])", "", text)
    text = re.sub(r"  +", " ", text)
    text = re.sub(r"[()]", "", text)
    return text.split('\n')


# Both languages go through exactly the same cleaning so the line-by-line
# alignment between the two corpora is preserved.
corpus_en = _clean_corpus(europarl_en)
corpus_sv = _clean_corpus(europarl_sv)

In [None]:
#[[start, I, H]] 3 = 1x3

In [None]:
corpus_en[:50]

['Resumption of the session',
 'I declare resumed the session of the European Parliament adjourned on Friday 17 December 1999, and I would like once again to wish you a happy new year in the hope that you enjoyed a pleasant festive period.',
 "Although, as you will have seen, the dreaded 'millennium bug' failed to materialise, still the people in a number of countries suffered a series of natural disasters that truly were dreadful.",
 'You have requested a debate on this subject in the course of the next few days, during this part-session.',
 "In the meantime, I should like to observe a minute' s silence, as a number of Members have requested, on behalf of all the victims concerned, particularly those of the terrible storms, in the various countries of the European Union.",
 "Please rise, then, for this minute' s silence.",
 "The House rose and observed a minute' s silence",
 'Madam President, on a point of order.',
 'You will be aware from the press and television that there have been

In [None]:
corpus_en[0]

'Resumption of the session'

**The subword tokenizer splits every sentence into subword units (including punctuation pieces such as `', '` and `'. '`) and maps each unit to a unique integer id from its vocabulary. Case is preserved: for example `Jag_` and `jag_` are separate vocabulary entries.**

In [None]:
#Encoding is fully invertible because all out-of-vocab wordpieces are byte-encoded. 
#Which means unknown word pieces will be encoded one character at a time.
#8219 + 26(all english alphabets)
#target_vocab_size represents approx vocab size required!

In [None]:
# Learn the English subword vocabulary from the cleaned sentences.
# target_vocab_size (2**13 = 8192) is an approximate size request, not an exact guarantee.
tokenizer_en = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus(corpus_en, target_vocab_size=2**13)

In [None]:
# Learn the Swedish subword vocabulary from the cleaned sentences.
# target_vocab_size (2**13 = 8192) is an approximate size request, not an exact guarantee.
tokenizer_sv = tfds.deprecated.text.SubwordTextEncoder.build_from_corpus(corpus_sv, target_vocab_size=2**13)

In [None]:
# Two ids are added on top of the tokenizer's own vocabulary: VOCAB_SIZE_EN-2 is used as the
# start-of-sentence id and VOCAB_SIZE_EN-1 as the end-of-sentence id when sentences are encoded.
VOCAB_SIZE_EN = tokenizer_en.vocab_size + 2 
VOCAB_SIZE_EN

8217

In [None]:
tokenizer_sv.subwords

['att_',
 ', ',
 'och_',
 'i_',
 'som_',
 'för_',
 'en_',
 'av_',
 'det_',
 'är_',
 'de_',
 'till_',
 'om_',
 'har_',
 'på_',
 'den_',
 'med_',
 'inte_',
 'vi_',
 's_',
 'ett_',
 'a_',
 't_',
 'Jag_',
 'jag_',
 'Det_',
 'kommer_',
 'kan_',
 'måste_',
 'detta_',
 'r_',
 'er_',
 'från_',
 'Vi_',
 'n_',
 'vill_',
 'skulle_',
 'också_',
 'så_',
 'na_',
 'denna_',
 'en',
 'man_',
 'EU',
 'mycket_',
 ' - ',
 '. ',
 'alla_',
 '! ',
 'sig_',
 'men_',
 'när_',
 'Europeiska_',
 'vara_',
 'eller_',
 'talman',
 'gäller_',
 'Herr_',
 'dessa_',
 'andra_',
 'kommissionen_',
 'även_',
 'mot_',
 'I_',
 'inom_',
 'oss_',
 'ska_',
 'skall_',
 'e_',
 'finns_',
 'utan_',
 'under_',
 'bara_',
 'här_',
 'et_',
 'bör_',
 'göra_',
 'ta_',
 'er',
 'EU_',
 'genom_',
 'mer_',
 'mellan_',
 ': ',
 'var_',
 'eftersom_',
 'anser_',
 'Detta_',
 'na',
 'nu_',
 'kunna_',
 'än_',
 'vid_',
 'vilket_',
 'europeiska_',
 'därför_',
 'ar_',
 'få_',
 'nde_',
 'där_',
 'ni_',
 'över_',
 'ha_',
 'fram_',
 'Den_',
 'allt_',
 'vår

In [None]:
# Two ids are added on top of the tokenizer's own vocabulary: VOCAB_SIZE_SV-2 is used as the
# start-of-sentence id and VOCAB_SIZE_SV-1 as the end-of-sentence id when sentences are encoded.
VOCAB_SIZE_SV = tokenizer_sv.vocab_size + 2
VOCAB_SIZE_SV

8184

**Add the 'start' and 'end' tokens to every sentence in the corpus**

In [None]:
# Wrap every encoded English sentence as [start id, subword ids..., end id].
start_id_en, end_id_en = VOCAB_SIZE_EN-2, VOCAB_SIZE_EN-1
inputs = [[start_id_en, *tokenizer_en.encode(sentence), end_id_en] for sentence in corpus_en]

In [None]:
# Wrap every encoded Swedish sentence as [start id, subword ids..., end id].
start_id_sv, end_id_sv = VOCAB_SIZE_SV-2, VOCAB_SIZE_SV-1
outputs = [[start_id_sv, *tokenizer_sv.encode(sentence), end_id_sv] for sentence in corpus_sv]

In [None]:
inputs[0]

[8215, 2562, 1015, 2030, 3, 1, 2578, 8216]

In [None]:
corpus_en[0]

'Resumption of the session'

In [None]:
outputs[0]

[8182, 3362, 79, 7381, 5918, 8, 5877, 42, 8183]

In [None]:
MAX_LENGTH = 20

# inputs[k] and outputs[k] are the two sides of the same sentence pair, so a pair is
# kept only when BOTH sides are at most MAX_LENGTH ids long (start/end ids included).
# The filter relies on the two lists being aligned, so check that first.
assert len(inputs) == len(outputs), "source and target corpora must have the same number of sentences"

# Select the surviving positions in one pass. Deleting elements one at a time
# from a large list shifts the tail on every deletion and is quadratic overall.
keep_idx = [
    idx for idx, (src, tgt) in enumerate(zip(inputs, outputs))
    if len(src) <= MAX_LENGTH and len(tgt) <= MAX_LENGTH
]
inputs = [inputs[idx] for idx in keep_idx]
outputs = [outputs[idx] for idx in keep_idx]

**Pad every sentence shorter than `MAX_LENGTH` with the value 0**

In [None]:
# Append zeros (padding='post') to every source sequence up to MAX_LENGTH, so the
# ragged list becomes one rectangular integer array.
inputs = tf.keras.preprocessing.sequence.pad_sequences(inputs, value=0, padding='post', maxlen=MAX_LENGTH)

In [None]:
inputs.shape

(446931, 20)

In [None]:
# Append zeros (padding='post') to every target sequence up to MAX_LENGTH, so the
# ragged list becomes one rectangular integer array.
outputs = tf.keras.preprocessing.sequence.pad_sequences(outputs, value=0, padding='post', maxlen=MAX_LENGTH)

In [None]:
outputs.shape

(446931, 20)

In [None]:
# Hold out the last 1000 padded sentence pairs as the validation set:
# valid_src holds the source ids, valid_ref the reference translation ids.
valid_src = inputs[-1000:]
valid_ref = outputs[-1000:]
valid_src.shape, valid_ref.shape

((1000, 20), (1000, 20))

In [None]:
# Remove those same last 1000 pairs from the training arrays so training and
# validation data do not overlap.
# NOTE: this cell reassigns inputs/outputs from themselves, so running it again
# drops another 1000 rows each time.
inputs = inputs[:-1000]
outputs = outputs[:-1000]
inputs.shape, outputs.shape

((445931, 20), (445931, 20))

In [None]:
BATCH_SIZE = 64
BUFFER_SIZE = 20000

# Input pipeline, in order:
#   from_tensor_slices -> one (source, target) example per row of the padded arrays
#   cache              -> keep the examples around after the first pass to speed up training
#   shuffle            -> randomise order using a BUFFER_SIZE-element buffer
#   batch              -> group examples into batches of BATCH_SIZE
#   prefetch           -> prepare upcoming batches in the background while the current one
#                         is being consumed (affects speed only, not accuracy)
dataset = (
    tf.data.Dataset.from_tensor_slices((inputs, outputs))
    .cache()
    .shuffle(BUFFER_SIZE)
    .batch(BATCH_SIZE)
    .prefetch(tf.data.experimental.AUTOTUNE)
)

In [None]:
next(iter(dataset))

(<tf.Tensor: shape=(64, 20), dtype=int32, numpy=
 array([[8215,   37,   17, ...,    0,    0,    0],
        [8215,  579,   60, ...,    0,    0,    0],
        [8215, 1057,  314, ...,    0,    0,    0],
        ...,
        [8215, 1314,  183, ...,    0,    0,    0],
        [8215,   67,    9, ...,    0,    0,    0],
        [8215,  262, 7717, ...,    0,    0,    0]], dtype=int32)>,
 <tf.Tensor: shape=(64, 20), dtype=int32, numpy=
 array([[8182,  411,   28, ...,    0,    0,    0],
        [8182,   26,   70, ...,    0,    0,    0],
        [8182,   26, 3088, ...,    0,    0,    0],
        ...,
        [8182,  748,  815, ...,    0,    0,    0],
        [8182,  926,   10, ...,    0,    0,    0],
        [8182, 7450, 7958, ..., 1952, 7972, 8183]], dtype=int32)>)

In [None]:
class PositionalEncoding(layers.Layer):
    """Adds a fixed sinusoidal position signal to its input.

    The signal is rebuilt with NumPy on every call from the input's static
    shape: its last two axes are read as (sequence length, embedding size).
    """

    def __init__(self):
        super(PositionalEncoding, self).__init__()

    def get_angles(self, pos, i, d_model):
        """Return pos / 10000**(2*(i//2)/d_model), broadcast over positions and dimensions.

        Args:
            pos: column of positions, shape [seq_len, 1].
            i: row of embedding-dimension indices, shape [1, d_model].
            d_model: embedding size.
        """
        exponent = (2*(i//2)) / np.float32(d_model)
        inverse_freq = 1 / np.power(10000., exponent)
        return pos * inverse_freq

    def call(self, inputs):
        """Return `inputs` plus the positional table, broadcast over the leading axes."""
        # The last two static dimensions are the sequence length and the embedding size.
        seq_length, d_model = inputs.shape.as_list()[-2:]

        positions = np.arange(seq_length)[:, np.newaxis]   # [seq_len, 1]
        dimensions = np.arange(d_model)[np.newaxis, :]     # [1, d_model]
        table = self.get_angles(positions, dimensions, d_model)

        # Even embedding indices carry the sine, odd indices the cosine.
        table[:, 0::2] = np.sin(table[:, 0::2])
        table[:, 1::2] = np.cos(table[:, 1::2])

        # A leading axis of size 1 lets the table broadcast across the batch.
        return inputs + tf.cast(table[np.newaxis, ...], tf.float32)

In [None]:
def scaled_dot_product_attention(queries, keys, values, mask):
    """Compute softmax(Q K^T / sqrt(d_k) + mask * -1e9) V.

    Args:
        queries, keys, values: tensors whose last two axes are [seq, dim];
            leading axes (e.g. batch and head) are carried through unchanged.
        mask: None, or a tensor broadcastable to the score matrix in which
            1 marks a position that must NOT be attended to and 0 a visible one.

    Returns:
        (attention, weights): the weighted sum of `values`, and the softmax
        attention weights over the key axis.
    """
    # Similarity of every query with every key: [..., seq_q, seq_k].
    scores = tf.matmul(queries, keys, transpose_b=True)

    # Scaling by sqrt(d_k) keeps the variance of the scores independent of the
    # key dimension.
    keys_dim = tf.cast(tf.shape(keys)[-1], tf.float32)
    scaled_scores = scores / tf.math.sqrt(keys_dim)

    if mask is not None:
        # Masked positions receive a very large negative score, so they get
        # (numerically) zero weight after the softmax.
        scaled_scores += (mask * -1e9)

    # The softmax is computed once and reused for both return values.
    weights = tf.nn.softmax(scaled_scores, axis=-1)
    attention = tf.matmul(weights, values)

    return attention, weights

In [None]:
x = tf.random.uniform((64, 8, 20, 64))
product, _ = scaled_dot_product_attention(x, x, x, mask=None)
product.shape

TensorShape([64, 8, 20, 64])

In [None]:
[2, 4, 6, 0, 0 ,0 ] , [0, 0, 0 , 1, 1, 1]

([2, 4, 6, 0, 0, 0], [0, 0, 0, 1, 1, 1])

In [None]:
class MultiHeadAttention(layers.Layer):
    """Multi-head attention built on `scaled_dot_product_attention`.

    Queries, keys and values are each passed through a Dense layer, split into
    `nb_proj` heads, attended per head, merged back and passed through a final
    Dense layer.
    """

    def __init__(self, nb_proj):
        super(MultiHeadAttention, self).__init__()
        # Number of attention heads ("projections").
        self.nb_proj = nb_proj

    def build(self, input_shape):
        """Create the Dense sub-layers; the model size is the last entry of `input_shape`."""
        self.d_model = input_shape[-1]

        # Every head gets an equal slice of the model dimension.
        assert self.d_model % self.nb_proj == 0
        self.d_proj = self.d_model // self.nb_proj

        # Linear maps for Q, K and V, each d_model units wide.
        self.query_lin = layers.Dense(units=self.d_model)
        self.key_lin = layers.Dense(units=self.d_model)
        self.value_lin = layers.Dense(units=self.d_model)

        # Linear map applied to the merged heads.
        self.final_lin = layers.Dense(units=self.d_model)

    def split_proj(self, inputs, batch_size):
        """Reshape [batch, seq, d_model] into [batch, nb_proj, seq, d_proj]."""
        per_head = tf.reshape(inputs, shape=(batch_size, -1, self.nb_proj, self.d_proj))
        return tf.transpose(per_head, perm=[0, 2, 1, 3])

    def call(self, queries, keys, values, mask):
        """Return (outputs [batch, seq, d_model], attention weights per head)."""
        batch_size = tf.shape(queries)[0]

        # Project each input, then split the result into heads.
        query_heads = self.split_proj(self.query_lin(queries), batch_size)
        key_heads = self.split_proj(self.key_lin(keys), batch_size)
        value_heads = self.split_proj(self.value_lin(values), batch_size)

        attention, weights = scaled_dot_product_attention(query_heads, key_heads, value_heads, mask)

        # Back to [batch, seq, nb_proj, d_proj], then merge the heads into d_model.
        attention = tf.transpose(attention, perm=[0, 2, 1, 3])
        merged = tf.reshape(attention, shape=(batch_size, -1, self.d_model))

        return self.final_lin(merged), weights

In [None]:
temp_mha = MultiHeadAttention(8)
y = tf.random.uniform((64, 20, 512))  # (batch_size, seq_len, d_model)

In [None]:
out, w = temp_mha(y, y, y, mask=None)
out.shape

TensorShape([64, 20, 512])

In [None]:
class EncoderLayer(layers.Layer):
    """One encoder block: self-attention and a two-layer feed-forward network,
    each followed by dropout, a residual connection and layer normalisation."""

    def __init__(self, FFN_units, nb_proj, dropout_rate):
        super(EncoderLayer, self).__init__()
        self.FFN_units = FFN_units        # hidden width of the feed-forward network
        self.nb_proj = nb_proj            # number of attention heads
        self.dropout_rate = dropout_rate

    def build(self, input_shape):
        """Create the sub-layers; the model size is the last entry of `input_shape`."""
        self.d_model = input_shape[-1]

        # Self-attention sub-block (queries, keys and values are all the layer input).
        self.multi_head_attention = MultiHeadAttention(self.nb_proj)
        self.dropout_1 = layers.Dropout(rate=self.dropout_rate)
        self.norm_1 = layers.LayerNormalization(epsilon=1e-6)

        # Feed-forward sub-block: expand to FFN_units with ReLU, project back to d_model.
        self.dense_1 = layers.Dense(units=self.FFN_units, activation="relu")
        self.dense_2 = layers.Dense(units=self.d_model)
        self.dropout_2 = layers.Dropout(rate=self.dropout_rate)
        self.norm_2 = layers.LayerNormalization(epsilon=1e-6)

    def call(self, inputs, mask, training):
        """Apply the block to `inputs`; `mask` is forwarded to the attention layer."""
        attn_out, _ = self.multi_head_attention(inputs, inputs, inputs, mask)
        attn_out = self.dropout_1(attn_out, training=training)
        attn_out = self.norm_1(attn_out + inputs)          # residual + norm

        ffn_out = self.dense_2(self.dense_1(attn_out))
        ffn_out = self.dropout_2(ffn_out, training=training)
        return self.norm_2(ffn_out + attn_out)             # residual + norm

In [None]:
EL = EncoderLayer(1024, 8, 0.1)
x = tf.random.uniform((64, 20, 512))
EL(x, None, False).shape

TensorShape([64, 20, 512])

In [None]:
class Encoder(layers.Layer):
    """Token embedding, positional encoding and dropout, followed by a stack of
    `nb_layers` EncoderLayer blocks."""

    def __init__(self, nb_layers, FFN_units, nb_proj, dropout_rate, vocab_size, d_model, name="encoder"):
        super(Encoder, self).__init__(name=name)
        self.nb_layers = nb_layers
        self.d_model = d_model

        self.embedding = layers.Embedding(vocab_size, d_model)
        self.pos_encoding = PositionalEncoding()
        self.dropout = layers.Dropout(rate=dropout_rate)

        # One independent EncoderLayer per level of the stack.
        stack = []
        for _ in range(nb_layers):
            stack.append(EncoderLayer(FFN_units, nb_proj, dropout_rate))
        self.enc_layers = stack

    def call(self, inputs, mask, training):
        """Encode a batch of token ids; `mask` is forwarded to every encoder block."""
        # Embeddings are scaled by sqrt(d_model) before the position signal is added.
        scale = tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        hidden = self.embedding(inputs) * scale
        hidden = self.pos_encoding(hidden)
        hidden = self.dropout(hidden, training)

        for enc_layer in self.enc_layers:
            hidden = enc_layer(hidden, mask, training)

        return hidden

In [None]:
enc = Encoder(6, 1024, 8, 0.1, 8192, 512)
x = tf.random.uniform((64, 20)) #[Batch, Seq]
enc(x, None, False).shape

TensorShape([64, 20, 512])

In [None]:
class DecoderLayer(layers.Layer):
    """One decoder block: self-attention, attention over the encoder output, and a
    feed-forward network, each followed by dropout, a residual connection and
    layer normalisation."""

    def __init__(self, FFN_units, nb_proj, dropout_rate):
        super(DecoderLayer, self).__init__()
        self.FFN_units = FFN_units        # hidden width of the feed-forward network
        self.nb_proj = nb_proj            # number of attention heads
        self.dropout_rate = dropout_rate

    def build(self, input_shape):
        """Create the sub-layers; the model size is the last entry of `input_shape`."""
        self.d_model = input_shape[-1]

        # Sub-block 1: self-attention over the decoder's own sequence.
        self.multi_head_attention_1 = MultiHeadAttention(self.nb_proj)
        self.dropout_1 = layers.Dropout(rate=self.dropout_rate)
        self.norm_1 = layers.LayerNormalization(epsilon=1e-6)

        # Sub-block 2: attention whose keys/values come from the encoder output.
        self.multi_head_attention_2 = MultiHeadAttention(self.nb_proj)
        self.dropout_2 = layers.Dropout(rate=self.dropout_rate)
        self.norm_2 = layers.LayerNormalization(epsilon=1e-6)

        # Sub-block 3: feed-forward network.
        self.dense_1 = layers.Dense(units=self.FFN_units, activation="relu")
        self.dense_2 = layers.Dense(units=self.d_model)
        self.dropout_3 = layers.Dropout(rate=self.dropout_rate)
        self.norm_3 = layers.LayerNormalization(epsilon=1e-6)

    def call(self, inputs, enc_outputs, mask_1, mask_2, training):
        """Return (outputs, self-attention weights, encoder-decoder attention weights).

        mask_1 is applied to the self-attention; the Transformer model passes the
        combined look-ahead + target-padding mask here.
        mask_2 is applied to the attention over `enc_outputs`; the Transformer
        model passes the source-padding mask here.
        """
        # Self-attention: queries, keys and values are all the decoder input.
        self_attn, wb1 = self.multi_head_attention_1(inputs, inputs, inputs, mask_1)
        self_attn = self.dropout_1(self_attn, training=training)
        self_attn = self.norm_1(self_attn + inputs)

        # Encoder-decoder attention: queries from the decoder, keys/values from the encoder.
        cross_attn, wb2 = self.multi_head_attention_2(self_attn, enc_outputs, enc_outputs, mask_2)
        cross_attn = self.dropout_2(cross_attn, training=training)
        cross_attn = self.norm_2(cross_attn + self_attn)

        # Position-wise feed-forward network.
        ffn_out = self.dense_2(self.dense_1(cross_attn))
        ffn_out = self.dropout_3(ffn_out, training=training)
        ffn_out = self.norm_3(ffn_out + cross_attn)

        return ffn_out, wb1, wb2

In [None]:
class Decoder(layers.Layer):
    """Token embedding, positional encoding and dropout, followed by a stack of
    `nb_layers` DecoderLayer blocks."""

    def __init__(self, nb_layers, FFN_units, nb_proj, dropout_rate, vocab_size, d_model, name="decoder"):
        super(Decoder, self).__init__(name=name)
        self.d_model = d_model
        self.nb_layers = nb_layers

        self.embedding = layers.Embedding(vocab_size, d_model)
        self.pos_encoding = PositionalEncoding()
        self.dropout = layers.Dropout(rate=dropout_rate)

        # One independent DecoderLayer per level of the stack.
        self.dec_layers = [DecoderLayer(FFN_units, nb_proj, dropout_rate) for _ in range(nb_layers)]

    def call(self, inputs, enc_outputs, mask_1, mask_2, training):
        """Decode a batch of target token ids against the encoder output.

        Args:
            inputs: target token ids.
            enc_outputs: output of the encoder.
            mask_1: mask for the decoder self-attention.
            mask_2: mask for the attention over `enc_outputs`.
            training: forwarded to the dropout layers and decoder blocks.

        Returns:
            (outputs, weights): the decoder output [batch, seq, d_model] and the
            encoder-decoder attention weights of the LAST decoder layer
            (None if the stack is empty).
        """
        # Embeddings are scaled by sqrt(d_model) before the position signal is added.
        outputs = self.embedding(inputs)
        outputs *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        outputs = self.pos_encoding(outputs)
        outputs = self.dropout(outputs, training)

        # Initialised up front so the return below is well defined even when
        # nb_layers == 0; otherwise the name would be unbound.
        cross_attention_weights = None
        for i in range(self.nb_layers):
            outputs, _, cross_attention_weights = self.dec_layers[i](
                outputs, enc_outputs, mask_1, mask_2, training)

        return outputs, cross_attention_weights #[Batch, Seq, d_model]

## Now let's combine everything together to build our final Transformer architecture

In [None]:
class Transformer(tf.keras.Model):
    """Full encoder-decoder model: Encoder, Decoder and a final Dense layer that
    produces one logit per target-vocabulary entry."""

    def __init__(self, vocab_size_enc, vocab_size_dec, d_model, nb_layers, FFN_units, nb_proj, dropout_rate, name="transformer"):
        super(Transformer, self).__init__(name=name)

        self.encoder = Encoder(nb_layers, FFN_units, nb_proj, dropout_rate, vocab_size_enc, d_model)
        self.decoder = Decoder(nb_layers, FFN_units, nb_proj, dropout_rate, vocab_size_dec, d_model)
        self.last_linear = layers.Dense(units=vocab_size_dec, name="final_output")

    def create_padding_mask(self, seq):
        """Return a [batch, 1, 1, seq] float mask that is 1 where `seq` is 0 (padding)."""
        is_pad = tf.math.equal(seq, 0)
        # The two inserted axes let the mask broadcast over heads and query positions.
        return tf.cast(is_pad, tf.float32)[:, tf.newaxis, tf.newaxis, :]

    def create_look_ahead_mask(self, seq):
        """Return a [seq, seq] mask that is 1 strictly above the diagonal (future positions)."""
        seq_len = tf.shape(seq)[1]
        # band_part(..., -1, 0) keeps the lower triangle including the diagonal;
        # subtracting it from 1 leaves ones only on the positions to hide.
        visible = tf.linalg.band_part(tf.ones((seq_len, seq_len)), -1, 0)
        return 1 - visible

    def call(self, enc_inputs, dec_inputs, training):
        """Return (logits [batch, seq, vocab_size_dec], attention weights from the decoder)."""
        # Source padding mask. It serves both the encoder self-attention and the
        # decoder's attention over the encoder output, because in both cases the
        # keys are source positions.
        src_padding_mask = self.create_padding_mask(enc_inputs)

        # Decoder self-attention hides a position if it is padding OR lies in the future.
        dec_self_mask = tf.maximum(
            self.create_padding_mask(dec_inputs),
            self.create_look_ahead_mask(dec_inputs),
        )

        enc_outputs = self.encoder(enc_inputs, src_padding_mask, training)
        dec_outputs, weights = self.decoder(
            dec_inputs, enc_outputs, dec_self_mask, src_padding_mask, training)

        # One logit per entry of the decoder (Swedish) vocabulary for every position.
        logits = self.last_linear(dec_outputs)

        return logits, weights  #[Batch, Seq, Vocab_size_dec]

In [None]:
tf.linalg.band_part(tf.ones((10, 10)), 0, -1)

<tf.Tensor: shape=(10, 10), dtype=float32, numpy=
array([[1., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
       [0., 1., 1., 1., 1., 1., 1., 1., 1., 1.],
       [0., 0., 1., 1., 1., 1., 1., 1., 1., 1.],
       [0., 0., 0., 1., 1., 1., 1., 1., 1., 1.],
       [0., 0., 0., 0., 1., 1., 1., 1., 1., 1.],
       [0., 0., 0., 0., 0., 1., 1., 1., 1., 1.],
       [0., 0., 0., 0., 0., 0., 1., 1., 1., 1.],
       [0., 0., 0., 0., 0., 0., 0., 1., 1., 1.],
       [0., 0., 0., 0., 0., 0., 0., 0., 1., 1.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 1.]], dtype=float32)>

In [None]:
def create_padding_mask(seq):
  """Return a [batch, 1, 1, seq] float mask that is 1 wherever `seq` equals 0."""
  is_pad = tf.math.equal(seq, 0)
  return tf.cast(is_pad, tf.float32)[:, tf.newaxis, tf.newaxis, :]

In [None]:
def create_look_ahead_mask(seq):
  """Return a [seq, seq] mask that is 1 strictly above the diagonal and 0 elsewhere."""
  seq_len = tf.shape(seq)[1]
  # Lower triangle (diagonal included) of an all-ones matrix marks the visible positions.
  visible = tf.linalg.band_part(tf.ones((seq_len, seq_len)), -1, 0)
  return 1 - visible

In [None]:
seq = tf.cast([[583, 288, 0, 412, 103, 0, 0, 0]], tf.int32)
create_padding_mask(seq)

<tf.Tensor: shape=(1, 1, 1, 8), dtype=float32, numpy=array([[[[0., 0., 1., 0., 0., 1., 1., 1.]]]], dtype=float32)>

In [None]:
create_look_ahead_mask(seq)

<tf.Tensor: shape=(8, 8), dtype=float32, numpy=
array([[0., 1., 1., 1., 1., 1., 1., 1.],
       [0., 0., 1., 1., 1., 1., 1., 1.],
       [0., 0., 0., 1., 1., 1., 1., 1.],
       [0., 0., 0., 0., 1., 1., 1., 1.],
       [0., 0., 0., 0., 0., 1., 1., 1.],
       [0., 0., 0., 0., 0., 0., 1., 1.],
       [0., 0., 0., 0., 0., 0., 0., 1.],
       [0., 0., 0., 0., 0., 0., 0., 0.]], dtype=float32)>

In [None]:
#automatically reshapes into [..., seq, seq] and compares with look ahead max
#this operation helps us apply both the mask!
tf.maximum(create_padding_mask(seq), create_look_ahead_mask(seq))

<tf.Tensor: shape=(1, 1, 8, 8), dtype=float32, numpy=
array([[[[0., 1., 1., 1., 1., 1., 1., 1.],
         [0., 0., 1., 1., 1., 1., 1., 1.],
         [0., 0., 1., 1., 1., 1., 1., 1.],
         [0., 0., 1., 0., 1., 1., 1., 1.],
         [0., 0., 1., 0., 0., 1., 1., 1.],
         [0., 0., 1., 0., 0., 1., 1., 1.],
         [0., 0., 1., 0., 0., 1., 1., 1.],
         [0., 0., 1., 0., 0., 1., 1., 1.]]]], dtype=float32)>

In [None]:
tm = Transformer(10000, 10000, 512, 6, 1024, 8, 0.1)
t_input = tf.random.uniform((1, 20), dtype=tf.int64, minval=0, maxval=200)
t_target = tf.random.uniform((1, 20), dtype=tf.int64, minval=0, maxval=200)

In [None]:
a, w = tm(t_input, t_target, False)
a.shape, w.shape

(TensorShape([1, 20, 10000]), TensorShape([1, 8, 20, 20]))

In [None]:
head = 0
# shape: (batch=1, num_heads, seq_len_q, seq_len_k)
attention_heads = tf.squeeze(w, 0)
attention = attention_heads[head]
attention.shape

TensorShape([20, 20])

**Hyperparameters**

In [None]:
tf.keras.backend.clear_session()

# Hyper-parameters used to build the model that is actually trained.
# Deliberately smaller than the values stated in the paper so training is faster;
# the trailing comment on each line is the corresponding value from the paper.
D_MODEL = 128       # 512
NB_LAYERS = 4       # 6
FFN_UNITS = 512     # 2048
NB_PROJ = 8         # 8
DROPOUT_RATE = 0.1  # 0.1

# Instantiate the Transformer: English vocabulary on the encoder side,
# Swedish vocabulary on the decoder side.
transformer = Transformer(VOCAB_SIZE_EN, VOCAB_SIZE_SV, D_MODEL, NB_LAYERS, FFN_UNITS, NB_PROJ, DROPOUT_RATE)

**Before we start training there are two important steps:**<br><br>1) First we define our loss object as sparse categorical cross-entropy. Each target token is a single integer class label chosen from the whole target vocabulary, i.e. a multi-class prediction.<br><br>2) Next we define a loss function that builds a mask to hide the padded positions, so that they are not included in the computation of the loss.

In [None]:
#since our outputs from model are real numbers ready to be transformed into probabilities we set from_logits = True.
#And reduction none indicates dont sum over all probabilities and calc mean loss as of yet. Since we need to remove the 
#padding part before summing the loss!

loss_object = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True, reduction="none")

def loss_function(target, pred):
    """Mean cross-entropy over the real (non-padding) target tokens.

    Args:
        target: integer token ids, with 0 marking padding.
        pred: logits produced by the model for every target position.

    Returns:
        Scalar tensor: the summed per-token loss divided by the number of
        non-padding tokens (0 if the batch holds no real token).
    """
    # reduction="none" keeps one loss value per position so padding can be removed first.
    per_token_loss = loss_object(target, pred)

    # 1.0 on real tokens, 0.0 on padding.
    mask = tf.cast(tf.math.not_equal(target, 0), dtype=per_token_loss.dtype)
    per_token_loss *= mask

    # Divide by the number of real tokens. A plain mean would also count the
    # zeroed padding positions in the denominator, so the reported loss would
    # shrink the more padding a batch contains.
    return tf.math.divide_no_nan(tf.reduce_sum(per_token_loss), tf.reduce_sum(mask))

train_loss = tf.keras.metrics.Mean(name="training_loss") #to keep track of training loss!

**Instead of using a fixed learning rate throughout training, the paper uses a custom schedule: the learning rate increases linearly during the first 4000 (warm-up) steps and then decreases in proportion to the inverse square root of the step number.**

In [None]:
class CustomLearningRate(tf.keras.optimizers.schedules.LearningRateSchedule):
    """Warm-up / inverse-square-root learning-rate schedule.

    lr(step) = d_model**-0.5 * min(step**-0.5, step * start_steps**-1.5)

    The rate grows linearly for the first `start_steps` steps and then decays
    as 1/sqrt(step).
    """

    def __init__(self, d_model, start_steps=4000):
        super(CustomLearningRate, self).__init__()

        # Kept as given so get_config() can return a plain Python value.
        self._d_model_config = d_model
        self.d_model = tf.cast(d_model, tf.float32)
        self.start_steps = start_steps

    def __call__(self, step):
        """Return the learning rate for optimizer step `step`."""
        # The optimizer may pass its integer iteration counter; rsqrt and the
        # float multiplication below need a floating-point tensor.
        step = tf.cast(step, tf.float32)
        arg1 = tf.math.rsqrt(step)
        arg2 = step * (self.start_steps**-1.5)

        return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)

    def get_config(self):
        """Return the constructor arguments so the schedule can be serialised."""
        return {"d_model": self._d_model_config, "start_steps": self.start_steps}

new_learning_rate = CustomLearningRate(D_MODEL)

#we set the paramertes as given in the paper!
optimizer = tf.keras.optimizers.Adam(new_learning_rate, beta_1=0.9, beta_2=0.98, epsilon=1e-9)   

**create checkpoints path to save checkpoints after every epoch**

In [None]:
import sys

# Directory, relative to the current working directory, where checkpoints are written.
checkpoint_path = "./store/ckpt/10hepo"

# A checkpoint bundles the model weights together with the optimizer state.
ckpt = tf.train.Checkpoint(transformer=transformer, optimizer=optimizer)

# The manager keeps at most the 5 most recent checkpoints in checkpoint_path.
ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=5)

# If an earlier run left a checkpoint behind, restore it before training starts.
if ckpt_manager.latest_checkpoint:
  ckpt.restore(ckpt_manager.latest_checkpoint)
  print("Latest checkpoint restored!!")

In [None]:
#dec_inputs.shape

**It's time to start training!**

In [None]:

EPOCHS = 10  #5-6
# Custom training loop: one pass over `dataset` per epoch, one optimizer step per
# batch, and one checkpoint saved at the end of every epoch.
for epoch in range(EPOCHS):
    print("Start of epoch {}".format(epoch+1))
    start = time.time()
    # The running mean is reset so the printed loss covers the current epoch only.
    train_loss.reset_states()
    
    for (batch, (enc_inputs, targets)) in enumerate(dataset):
        # Teacher forcing: the decoder is fed the target without its last position ...
        dec_inputs = targets[:, :-1]
        # ... and must predict the same target shifted left by one position.
        dec_outputs_real = targets[:, 1:]

        

        # Record the forward pass on a tape so gradients can be computed from it.
        with tf.GradientTape() as tape:
            # training=True enables dropout.
            predictions, _ = transformer(enc_inputs, dec_inputs, True)
            loss = loss_function(dec_outputs_real, predictions)
            
        
        # Gradients of the loss with respect to every trainable weight (dL/dw).
        gradients = tape.gradient(loss, transformer.trainable_variables)
        # Apply one optimizer update to the weights.
        optimizer.apply_gradients(zip(gradients, transformer.trainable_variables))
        
        # Fold this batch's loss into the running mean for the epoch.
        train_loss(loss)

        # Report the running mean every 50 batches.
        if batch % 50 == 0:
            print("Epoch {} Batch {} Loss {:.4f}".format(epoch+1, batch, train_loss.result()))
            
    ckpt_save_path = ckpt_manager.save()
    print("Saving checkpoint for epoch {} at {}".format(epoch+1, ckpt_save_path))

    print("Time taken for 1 epoch: {} secs\n".format(time.time() - start))

Start of epoch 1
Epoch 1 Batch 0 Loss 5.2952
Epoch 1 Batch 50 Loss 5.6741
Epoch 1 Batch 100 Loss 5.6392
Epoch 1 Batch 150 Loss 5.5800
Epoch 1 Batch 200 Loss 5.5215
Epoch 1 Batch 250 Loss 5.4559
Epoch 1 Batch 300 Loss 5.3572
Epoch 1 Batch 350 Loss 5.2503
Epoch 1 Batch 400 Loss 5.1466
Epoch 1 Batch 450 Loss 5.0503
Epoch 1 Batch 500 Loss 4.9566
Epoch 1 Batch 550 Loss 4.8648
Epoch 1 Batch 600 Loss 4.7858
Epoch 1 Batch 650 Loss 4.7068
Epoch 1 Batch 700 Loss 4.6354
Epoch 1 Batch 750 Loss 4.5656
Epoch 1 Batch 800 Loss 4.5015
Epoch 1 Batch 850 Loss 4.4415
Epoch 1 Batch 900 Loss 4.3879
Epoch 1 Batch 950 Loss 4.3354
Epoch 1 Batch 1000 Loss 4.2870
Epoch 1 Batch 1050 Loss 4.2405
Epoch 1 Batch 1100 Loss 4.1969
Epoch 1 Batch 1150 Loss 4.1551
Epoch 1 Batch 1200 Loss 4.1152
Epoch 1 Batch 1250 Loss 4.0756
Epoch 1 Batch 1300 Loss 4.0402
Epoch 1 Batch 1350 Loss 4.0088
Epoch 1 Batch 1400 Loss 3.9765
Epoch 1 Batch 1450 Loss 3.9442
Epoch 1 Batch 1500 Loss 3.9156
Epoch 1 Batch 1550 Loss 3.8872
Epoch 1 Batch 

**Evaluate Model Performance**

In [None]:
def evaluate(inp_sentence):
    """Greedily decode a Swedish translation of the English text `inp_sentence`.

    Returns:
        (output_ids, weights): a 1-D tensor of generated ids that begins with the
        Swedish start id (the end id is never appended), and the attention
        weights returned by the most recent call to the model.
    """
    end_id_sv = VOCAB_SIZE_SV-1

    # The source is wrapped in the English start/end ids, exactly as during training.
    source_ids = [VOCAB_SIZE_EN-2] + tokenizer_en.encode(inp_sentence) + [VOCAB_SIZE_EN-1]
    enc_input = tf.expand_dims(source_ids, axis=0)

    # The decoder input starts as just the Swedish start id and grows by one id per step.
    output = tf.expand_dims([VOCAB_SIZE_SV-2], axis=0)

    for _ in range(MAX_LENGTH):
        logits, weights = transformer(enc_input, output, False)

        # Only the prediction for the last position is needed; pick its most likely id.
        next_id = tf.cast(tf.argmax(logits[:, -1:, :], axis=-1), tf.int32)

        # Stop as soon as the model emits the end id.
        if next_id == end_id_sv:
            break

        output = tf.concat([output, next_id], axis=-1)

    return tf.squeeze(output, axis=0), weights

In [None]:
def translate(sentence):
    """Translate `sentence` and return (decoded Swedish text, attention weights)."""
    token_ids, weights = evaluate(sentence)

    # Ids at or above VOCAB_SIZE_SV-2 are the start/end markers and have no text form.
    first_special_id = VOCAB_SIZE_SV-2
    text_ids = [tok for tok in token_ids.numpy() if tok < first_special_id]

    return tokenizer_sv.decode(text_ids), weights

In [None]:
result, a = translate("I congratulate him on his excellent report.")
result

'Jag gratulerar honom till hans utmärkta betänkande.'

In [None]:
a = tf.squeeze(a, 0)
a = a[0]
a.shape[0]

9

In [None]:
!pip install sacrebleu

Collecting sacrebleu
[?25l  Downloading https://files.pythonhosted.org/packages/7e/57/0c7ca4e31a126189dab99c19951910bd081dea5bbd25f24b77107750eae7/sacrebleu-1.5.1-py3-none-any.whl (54kB)
[K     |██████                          | 10kB 20.2MB/s eta 0:00:01[K     |████████████                    | 20kB 28.3MB/s eta 0:00:01[K     |██████████████████              | 30kB 20.3MB/s eta 0:00:01[K     |████████████████████████        | 40kB 18.0MB/s eta 0:00:01[K     |██████████████████████████████  | 51kB 16.8MB/s eta 0:00:01[K     |████████████████████████████████| 61kB 8.2MB/s 
[?25hCollecting portalocker==2.0.0
  Downloading https://files.pythonhosted.org/packages/89/a6/3814b7107e0788040870e8825eebf214d72166adf656ba7d4bf14759a06a/portalocker-2.0.0-py2.py3-none-any.whl
Installing collected packages: portalocker, sacrebleu
Successfully installed portalocker-2.0.0 sacrebleu-1.5.1


In [None]:
import sacrebleu

# avg accumulates the per-sentence BLEU scores; total counts the sentence pairs
# that were actually scored, so it is the right denominator for the average.
avg = 0
total = 0

for src_ids, ref_ids in zip(valid_src, valid_ref):
  # Drop the start/end ids and turn the remaining ids back into text.
  src = tokenizer_en.decode([tok for tok in src_ids if tok < VOCAB_SIZE_EN-2])
  ref = tokenizer_sv.decode([tok for tok in ref_ids if tok < VOCAB_SIZE_SV-2])
  # Skip pairs where either side is empty after decoding.
  if not src.split() or not ref.split():
    continue
  total += 1
  translated, _ = translate(src)
  # sacrebleu expects a sequence of hypotheses and a list of reference streams.
  # Passing them in that form, rather than as bare strings, does not depend on
  # the library coercing strings for us.
  avg += sacrebleu.raw_corpus_bleu([translated], [[ref]], 0.01).score

**We obtain an impressive BLEU score of 22.11 on the validation dataset!**

In [None]:
# Average over the sentence pairs that were actually scored (`total`). Pairs that
# were skipped added nothing to `avg`, so dividing by len(valid_src) would
# understate the score.
print('BLEU SCORE: ', round(avg/total, 2) if total else float('nan'))

In [None]:
###Test case for attention plot###

import matplotlib.pyplot as plt

def plot_attention(attention):
  """Translate a sentence and draw the first head's attention matrix with matshow.

  NOTE(review): despite its name, the `attention` argument is the English
  source SENTENCE (a string); the name is kept only so existing calls keep
  working. A later cell defines another `plot_attention`, and once that cell
  has run it replaces this function.

  Args:
      attention: the English sentence to translate and plot.
  """
  # Use the argument explicitly instead of reading a global named `sentence`.
  sentence = attention

  translation, attention = translate(sentence)

  # Drop the batch axis, then keep head 0 only.
  attention = tf.squeeze(attention, 0)
  attention = attention[0]

  ax = plt.gca()
  ax.matshow(attention)
  ax.set_xticks(range(len(sentence.split())))
  ax.set_yticks(range(len(translation.split())))

  ax.set_xticklabels(sentence.split(), rotation=90)
  ax.set_yticklabels(translation.split())

In [None]:
import matplotlib.pyplot as plt
import numpy as np

%config InlineBackend.figure_format = 'svg'

plt.style.use('seaborn')

def plot_attention(sentence):
    """Translate `sentence` and plot the first head's attention weights as a heat map.

    Columns are source positions and rows are decoder positions. Both are
    SUBWORD positions (start/end markers included), so the axes are labelled
    with the subword pieces. Whitespace-separated words would not line up with
    the cells.

    Args:
        sentence: English source sentence (string).
    """

    def _token_labels(tokenizer, ids):
        # Ids from tokenizer.vocab_size upwards are the start/end markers
        # added by this notebook; everything below is a regular subword id.
        labels = []
        for tok in ids:
            tok = int(tok)
            if tok == tokenizer.vocab_size:
                labels.append('<start>')
            elif tok == tokenizer.vocab_size + 1:
                labels.append('<end>')
            else:
                labels.append(tokenizer.decode([tok]))
        return labels

    output_ids, attention = evaluate(sentence)

    # Drop the batch axis, then keep head 0 only: [decoder positions, source positions].
    attention = tf.squeeze(attention, 0)
    attention = attention[0]
    n_rows, n_cols = attention.shape[0], attention.shape[1]

    # The same wrapping evaluate() applies to the source sentence.
    source_ids = [VOCAB_SIZE_EN-2] + tokenizer_en.encode(sentence) + [VOCAB_SIZE_EN-1]
    # Slicing guards the case where decoding stopped at MAX_LENGTH: the returned
    # ids are then one longer than the decoder input the weights were computed for.
    x_labels = _token_labels(tokenizer_en, source_ids)[:n_cols]
    y_labels = _token_labels(tokenizer_sv, output_ids.numpy())[:n_rows]

    fig, ax = plt.subplots()
    heatmap = ax.pcolor(attention, cmap='GnBu_r')

    ax.xaxis.tick_top()
    # Tick positions first (cell centres), labels second, so each label is
    # attached to the tick it describes.
    ax.set_xticks(np.arange(n_cols) + 0.5, minor=False)
    ax.set_yticks(np.arange(n_rows) + 0.5, minor=False)
    ax.set_xticklabels(x_labels, minor=False, rotation='vertical')
    ax.set_yticklabels(y_labels, minor=False)
    ax.invert_yaxis()

    plt.colorbar(heatmap)

In [None]:
sentence = 'Are there any comments?'
plot_attention(sentence)

In [None]:
sentence = 'Safety advisers for the transport of dangerous goods'
plot_attention(sentence)