In [None]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Input,Embedding,Add,Dense,Attention,LayerNormalization,Multiply

In [None]:
# length is the seq_len , depth is the embeding dim
def positional_encoding(length, depth):
    positions=np.arange(length)[:,np.newaxis]  # seq_len , 1
    
    depths=np.arange(depth)[np.newaxis , : ]  # 1, depth
    
    angle_rates=1/np.power(10000,(2 * depths // 2)) / depth
    angles=positions*angle_rates    # pos , depth 
    angles[:, 0::2] = np.sin(angles[:, 0::2])
    angles[:, 1::2] = np.cos(angles[:, 1::2])
    
    return tf.cast(angles,tf.float32)

In [None]:
class PositionalEmbedding(tf.keras.layers.layer):
    def __init__(self,vocab_size,d_model):
        super().__init__()
        self.d_model=d_model
        self.embedding=Embedding(vocab_size,d_model,mask_zero=True)
        # here pos_encoding are generated with length (seq_length) = 2048 
        self.pos_encoding=positional_encoding(length=2048,depth=d_model)
        
    def compute_mask(self, *args, **kwargs):
       return self.embedding.compute_mask(*args, **kwargs)
   
    def call(self,x):
        # x is input its shape will be  batch_size,seq_len
        length=tf.shape(x)[1]
        x=self.embedding(x)
        # now x will be of shape batch_size,seq_len,embed_dim
        
        # This factor sets the relative scale of the embedding and positonal_encoding.
        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        
        x=x+self.positional_encoding[tf.newaxis,:length,:]

In [None]:
class BaseAttention(tf.keras.layers.Layer):
  def __init__(self, **kwargs):
    super().__init__()
    self.mha = tf.keras.layers.MultiHeadAttention(**kwargs)
    self.layernorm = tf.keras.layers.LayerNormalization()
    self.add = tf.keras.layers.Add()

In [None]:
class GlobalSelfAttention(BaseAttention):
  def call(self, x):
    attn_output = self.mha(
        query=x,
        value=x,
        key=x)
    x = self.add([x, attn_output])
    x = self.layernorm(x)
    return x

In [None]:
class CrossAttention(BaseAttention):
  def call(self, x, context):
    attn_output, attn_scores = self.mha(
        query=x,
        key=context,
        value=context,
        return_attention_scores=True)
   
    # Cache the attention scores for plotting later.
    self.last_attn_scores = attn_scores

    x = self.add([x, attn_output])
    x = self.layernorm(x)

    return x

In [None]:
class CausalSelfAttention(BaseAttention):
  def call(self, x):
    attn_output = self.mha(
        query=x,
        value=x,
        key=x,
        use_causal_mask = True)
    x = self.add([x, attn_output])
    x = self.layernorm(x)
    return x

In [None]:
class FeedForward(tf.keras.layers.Layer):
  def __init__(self, d_model, dff, dropout_rate=0.1):
    super().__init__()
    self.seq = tf.keras.Sequential([
      tf.keras.layers.Dense(dff, activation='relu'),
      tf.keras.layers.Dense(d_model),
      tf.keras.layers.Dropout(dropout_rate)
    ])
    self.add = tf.keras.layers.Add()
    self.layer_norm = tf.keras.layers.LayerNormalization()

  def call(self, x):
    x = self.add([x, self.seq(x)])
    x = self.layer_norm(x) 
    return x


# Encoder

In [None]:
class EncoderLayer(tf.keras.layers.Layer):
  def __init__(self,*, d_model, num_heads, dff, dropout_rate=0.1):
    super().__init__()

    self.self_attention = GlobalSelfAttention(
        num_heads=num_heads,
        key_dim=d_model,
        dropout=dropout_rate)

    self.ffn = FeedForward(d_model, dff)

  def call(self, x):
    x = self.self_attention(x)
    x = self.ffn(x)
    return x

In [None]:
class Encoder(tf.keras.layers.Layer):
  def __init__(self, *, num_layers, d_model, num_heads,
               dff, vocab_size, dropout_rate=0.1):
    super().__init__()

    self.d_model = d_model
    self.num_layers = num_layers

    self.pos_embedding = PositionalEmbedding(
        vocab_size=vocab_size, d_model=d_model)

    self.enc_layers = [
        EncoderLayer(d_model=d_model,
                     num_heads=num_heads,
                     dff=dff,
                     dropout_rate=dropout_rate)
        for _ in range(num_layers)]
    self.dropout = tf.keras.layers.Dropout(dropout_rate)

  def call(self, x):
    # `x` is token-IDs shape: (batch, seq_len)
    x = self.pos_embedding(x)  # Shape `(batch_size, seq_len, d_model)`.
    
    # Add dropout.
    x = self.dropout(x)

    for i in range(self.num_layers):
      x = self.enc_layers[i](x)

    return x  # Shape `(batch_size, seq_len, d_model)`.

# Decoder

In [None]:
class DecoderLayer(tf.keras.layers.Layer):
    def __init__(self,num_heads,d_model,dff,dropout_rate=0.1):
        super(DecoderLayer,self).__init__()
        
        self.cross_attention=CrossAttention(num_heads=num_heads,
            key_dim=d_model,
            dropout=dropout_rate)
        
        self.masked_attention=CausalSelfAttention(num_heads=num_heads,
            key_dim=d_model,
            dropout=dropout_rate)
        
        self.ff=FeedForward(d_model, dff, dropout_rate)
        
    def call(self,x,encoder_x):
        x=self.masked_attention(x)
        x=self.cross_attention(x,encoder_x)
        x=self.ff(x)
        return x

In [None]:
class Decoder(tf.keras.layers.Layer):
    def __init__(self,num_layers,vocab_size,num_heads,d_model,dff,dropout_rate=0.1):
        super(Decoder,self).__init__()
        
        self.positional_embedding=PositionalEmbedding(vocab_size,d_model)
        
        self.decoder_layers=[DecoderLayer(num_heads=num_heads,d_model=d_model,dff=dff,dropout_rate=dropout_rate) for i in range(num_layers)]
        
        
        self.dropout = tf.keras.layers.Dropout(dropout_rate)
        
    def call(self,x,encoder_x):
        pe=self.positional_embedding(x)
        
        pe=self.dropout(pe)
        
        for decoder_layer in self.decoder_layers:
            pe=decoder_layer(x,encoder_x)
            
        self.output_layer(pe)

# Transformer 

In [None]:
class Transformer(tf.keras.Model):
    def __init__(self,input_vocab_size,target_vocab_size,num_layers,num_heads,d_model,dff,dropout_rate=0.1):
        super.__init__()
        
        self.encoder=Encoder(num_layers=num_layers,num_heads=num_heads,d_model=d_model,vocab_size=input_vocab_size,dff=dff,dropout_rate=dropout_rate)
        
        self.decoder=Decoder(num_layers=num_layers,vocab_size=target_vocab_size,num_heads=num_heads,d_model=d_model,dff=dff,dropout_rate=dropout_rate)
        
        self.output_layer=Dense(target_vocab_size,activation="softmax")
        
    def call(self,inputs):
        context,x=inputs
        
        context=self.encoder(context)
        x=self.decoder(x,context)
        
        out=self.output_layer(x)
        try:
      # Drop the keras mask, so it doesn't scale the losses/metrics.
      # b/250038731
            del out._keras_mask
        except AttributeError:
            pass
        
        return out

In [None]:
num_layers = 4
d_model = 128
dff = 512
num_heads = 8
dropout_rate = 0.1

In [None]:
transformer = Transformer(
    num_layers=num_layers,
    d_model=d_model,
    num_heads=num_heads,
    dff=dff,
    input_vocab_size=-1,
    target_vocab_size=-1,
    dropout_rate=dropout_rate)

# Learning Rate
Use the Adam optimizer with a custom learning rate scheduler according to the formula in the original Transformer [paper](https://arxiv.org/abs/1706.03762).

$$\Large{lrate = d_{model}^{-0.5} * \min(step{\_}num^{-0.5}, step{\_}num \cdot warmup{\_}steps^{-1.5})}$$

In [None]:
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
    def __init__(self,d_model,warmup_steps):
        super().__init__()
        
        self.d_model=tf.cast(d_model,tf.float32)
        self.warmup_steps=warmup_steps
        
    def __call__(self,step):
        step = tf.cast(step, dtype=tf.float32)
        lrate=tf.math.rsqrt(self.d_model)*tf.math.minimum(tf.math.sqrt(step),step*(self.warmup_steps ** -1.5))
        return lrate

In [None]:
learning_rate = CustomSchedule(d_model)

optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98,
                                     epsilon=1e-9)

# LOSS and ACCURACY

In [None]:
def masked_loss(label,pred):
    mask=label!=pred
    loss_object=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True,reduction="None")
    loss=loss_object(label,pred)
    
    mask=tf.cast(mask,dtype=loss.dtype)
    loss*=mask
    
    return tf.reduce_sum(loss)/tf.reduce_sum(mask)

def masked_accuracy(label,pred):
    mask=label!=pred
    pred=tf.argmax(pred,axis=2)
    label=tf.cast(label,pred.dtype)
    
    match=label==mask
    match=match & mask
    
    match = tf.cast(match, dtype=tf.float32)    
    mask = tf.cast(mask, dtype=tf.float32)
    
    return tf.reduce_sum(match)/tf.reduce_sum(mask)

# Training

In [None]:
transformer.compile(optimizer=optimizer,loss=masked_loss,metrics=[masked_accuracy])
transformer.summary()

In [None]:
transformer.fit(train_batches,epochs=EPOCHS,validation_data=val_batch)

# Inference

In [None]:
class Translator(tf.Module):
  def __init__(self, tokenizers, transformer):
    self.tokenizers = tokenizers
    self.transformer = transformer

  def __call__(self, sentence, max_length=MAX_TOKENS):
    # The input sentence is Portuguese, hence adding the `[START]` and `[END]` tokens.
    assert isinstance(sentence, tf.Tensor)
    if len(sentence.shape) == 0:
      sentence = sentence[tf.newaxis]

    sentence = self.tokenizers.pt.tokenize(sentence).to_tensor()

    encoder_input = sentence

    # As the output language is English, initialize the output with the
    # English `[START]` token.
    start_end = self.tokenizers.en.tokenize([''])[0]
    start = start_end[0][tf.newaxis]
    end = start_end[1][tf.newaxis]

    # `tf.TensorArray` is required here (instead of a Python list), so that the
    # dynamic-loop can be traced by `tf.function`.
    output_array = tf.TensorArray(dtype=tf.int64, size=0, dynamic_size=True)
    output_array = output_array.write(0, start)

    for i in tf.range(max_length):
      output = tf.transpose(output_array.stack())
      predictions = self.transformer([encoder_input, output], training=False)

      # Select the last token from the `seq_len` dimension.
      predictions = predictions[:, -1:, :]  # Shape `(batch_size, 1, vocab_size)`.

      predicted_id = tf.argmax(predictions, axis=-1)

      # Concatenate the `predicted_id` to the output which is given to the
      # decoder as its input.
      output_array = output_array.write(i+1, predicted_id[0])

      if predicted_id == end:
        break

    output = tf.transpose(output_array.stack())
    # The output shape is `(1, tokens)`.
    text = tokenizers.en.detokenize(output)[0]  # Shape: `()`.

    tokens = tokenizers.en.lookup(output)[0]

    # `tf.function` prevents us from using the attention_weights that were
    # calculated on the last iteration of the loop.
    # So, recalculate them outside the loop.
    self.transformer([encoder_input, output[:,:-1]], training=False)
    attention_weights = self.transformer.decoder.last_attn_scores

    return text, tokens, attention_weights

In [None]:
translator = Translator(tokenizers, transformer)

In [None]:
class ExportTranslator(tf.Module):
  def __init__(self, translator):
    self.translator = translator

  @tf.function(input_signature=[tf.TensorSpec(shape=[], dtype=tf.string)])
  def __call__(self, sentence):
    (result,
     tokens,
     attention_weights) = self.translator(sentence, max_length=MAX_TOKENS)

    return result

In [None]:
translator = ExportTranslator(translator)

In [None]:
tf.saved_model.save(translator, export_dir='translator')