In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Input,Embedding,Add,Dense,Attention,LayerNormalization,Multiply

In [None]:
import numpy as np

def generate_angles(pos,embed_dim):
    """Return the sinusoidal positional-encoding vector for a single position.

    Even indices hold sines and odd indices hold cosines. Indices ``2k`` and
    ``2k + 1`` share the same frequency ``1 / 10000**(2k / embed_dim)``, so
    every sine has a matching cosine at the same angle.

    Args:
        pos: Position index within the sequence.
        embed_dim: Number of values to generate (the embedding dimension).

    Returns:
        A list of ``embed_dim`` values; an empty list when ``embed_dim`` is 0.
    """
    encoding = []
    for i in range(embed_dim):
        # i // 2 maps the (sin, cos) pair at indices 2k, 2k + 1 onto the same k,
        # so both entries of a pair are evaluated at the same angle.
        angle = pos / 10000 ** (2 * (i // 2) / embed_dim)
        encoding.append(np.cos(angle) if i % 2 == 1 else np.sin(angle))
    return encoding

def positional_encoding(x):
    """Add sinusoidal positional encodings to an embedded batch.

    Note: a later cell in this notebook defines ``positional_encoding(length, depth)``
    under the same name; once that cell runs, it replaces this function.

    Args:
        x: Tensor of shape (batch_size, timestamp, embed_dim). The last two
            dimensions must be statically known.

    Returns:
        A tensor shaped like ``x`` with the encoding for each position added to
        every sequence in the batch.
    """
    seq_len = int(x.shape[1])
    embed_dim = int(x.shape[2])
    # The encoding depends only on the position, not on the batch entry, so the
    # table is built once and broadcast over the batch axis.
    table = np.array(
        [generate_angles(pos, embed_dim) for pos in range(seq_len)],
        dtype=np.float32,
    ).reshape(seq_len, embed_dim)
    # Plain addition replaces `Add()[x, ans]`, which subscripted the layer
    # object instead of calling it and therefore raised TypeError.
    return x + tf.cast(table[np.newaxis, :, :], x.dtype)

In [None]:
# `length` is the sequence length (seq_len); `depth` is the embedding dimension.
def positional_encoding(length, depth):
    """Build a sinusoidal positional-encoding table.

    Even columns hold ``sin(pos / 10000**(2i / depth))`` and odd columns hold
    ``cos(pos / 10000**(2i / depth))``, where ``i = column // 2``.

    Args:
        length: Number of positions (rows) to generate.
        depth: Encoding width (columns), i.e. the embedding dimension.

    Returns:
        A float32 tensor of shape (length, depth).
    """
    positions = np.arange(length)[:, np.newaxis]  # (length, 1)
    depths = np.arange(depth)[np.newaxis, :]      # (1, depth)

    # Columns 2i and 2i + 1 share one frequency. The float base avoids the
    # integer overflow that np.power(10000, <int array>) hits for larger
    # exponents, and the division by depth sits inside the exponent.
    angle_rates = 1.0 / np.power(10000.0, (2 * (depths // 2)) / depth)
    angles = positions * angle_rates              # (length, depth)

    angles[:, 0::2] = np.sin(angles[:, 0::2])
    angles[:, 1::2] = np.cos(angles[:, 1::2])

    return tf.cast(angles, tf.float32)

In [None]:
class PositionalEmbedding(tf.keras.layers.Layer):
    """Token embedding scaled by sqrt(d_model) with positional encodings added.

    Args:
        vocab_size: Size of the token vocabulary passed to the Embedding layer.
        d_model: Embedding width.
        max_length: Number of positions for which encodings are precomputed.
            Inputs longer than this cannot be encoded. Defaults to 2048.
    """

    def __init__(self,vocab_size,d_model,max_length=2048):
        super().__init__()
        self.d_model=d_model
        # mask_zero=True makes token id 0 a padding id that is masked out.
        self.embedding=Embedding(vocab_size,d_model,mask_zero=True)
        # Precomputed table of shape (max_length, d_model); sliced per call.
        self.pos_encoding=positional_encoding(length=max_length,depth=d_model)

    def compute_mask(self, *args, **kwargs):
        """Delegate mask computation to the wrapped Embedding layer."""
        return self.embedding.compute_mask(*args, **kwargs)

    def call(self,x):
        """Embed token ids and add positional encodings.

        Args:
            x: Token ids of shape (batch_size, seq_len).

        Returns:
            Tensor of shape (batch_size, seq_len, d_model).
        """
        length=tf.shape(x)[1]
        x=self.embedding(x)
        # x is now (batch_size, seq_len, d_model).

        # This factor sets the relative scale of the embedding and the
        # positional encoding.
        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))

        # Use the attribute set in __init__ (`pos_encoding`), keep only the
        # first `length` positions, and broadcast over the batch axis.
        x=x+self.pos_encoding[tf.newaxis,:length,:]
        return x

In [None]:
# Define a base attention layer and inherit from it; this avoids duplicated code when cross-attention is added in the decoder.
class BaseAttention(tf.keras.layers.Layer):
    """Shared building blocks for attention sub-layers.

    Holds a MultiHeadAttention layer, a LayerNormalization layer and an Add
    layer; subclasses combine them in their own ``call``.

    Args:
        d_model: Accepted for backward compatibility; it is not used here.
            Defaults to None so that callers passing only MultiHeadAttention
            keyword arguments can construct the layer.
        **kwargs: Forwarded unchanged to ``tf.keras.layers.MultiHeadAttention``
            (e.g. ``num_heads``, ``key_dim``, ``dropout``).
    """

    def __init__(self, d_model=None, **kwargs):
        super().__init__()
        self.mha = tf.keras.layers.MultiHeadAttention(**kwargs)
        self.layernorm = tf.keras.layers.LayerNormalization()
        self.add = tf.keras.layers.Add()

In [None]:
class GlobalSelfAttention(BaseAttention):
    """Self-attention sub-layer: attention over ``x``, residual add, layer norm."""

    def call(self,x):
        """Apply self-attention to ``x`` and return the normalized residual sum."""
        # The same tensor is supplied as query, key and value; the Keras
        # MultiHeadAttention layer applies its own learned projections to each.
        attended = self.mha(query=x, value=x, key=x)

        # Residual connection followed by layer normalization.
        summed = self.add([x, attended])
        return self.layernorm(summed)

In [None]:
class FeedForward(tf.keras.layers.Layer):
    """Two-layer feed-forward sub-block with dropout, residual add and layer norm.

    Args:
        d_model: Output width of the second Dense layer.
        dff: Width of the first (ReLU) Dense layer.
        dropout_rate: Rate for the Dropout layer that ends the stack.
    """

    def __init__ (self,d_model,dff,dropout_rate=0.1):
        super().__init__()
        keras_layers = tf.keras.layers
        hidden = keras_layers.Dense(dff, activation='relu')
        projection = keras_layers.Dense(d_model)
        drop = keras_layers.Dropout(dropout_rate)
        self.seq = tf.keras.Sequential([hidden, projection, drop])
        self.add = keras_layers.Add()
        self.layer_norm = keras_layers.LayerNormalization()

    def call(self,x):
        """Run the dense stack, add the input back, and normalize the sum."""
        transformed = self.seq(x)
        residual = self.add([x, transformed])
        return self.layer_norm(residual)

In [None]:
class EncoderLayer(tf.keras.layers.Layer):
    """One encoder block: self-attention followed by a feed-forward sub-block.

    Args:
        d_model: Model width; used as the attention ``key_dim`` and as the
            feed-forward output width.
        num_heads: Number of attention heads.
        dff: Hidden width of the feed-forward sub-block.
        dropout_rate: Dropout rate used by both the attention and the
            feed-forward sub-block.
    """

    def __init__(self,*, d_model, num_heads, dff, dropout_rate=0.1):
        super().__init__()

        # BaseAttention.__init__ declares `d_model` as its first parameter, so
        # it is passed explicitly; the remaining keywords go to MultiHeadAttention.
        self.self_attention = GlobalSelfAttention(
            d_model=d_model,
            num_heads=num_heads,
            key_dim=d_model,
            dropout=dropout_rate)

        # Forward dropout_rate so a non-default value also reaches the
        # feed-forward dropout instead of being silently replaced by 0.1.
        self.ffn = FeedForward(d_model, dff, dropout_rate)

    def call(self, x):
        """Apply self-attention, then the feed-forward sub-block, to ``x``."""
        x = self.self_attention(x)
        x = self.ffn(x)
        return x

In [None]:


# Module-level model width. The classes visible in this section take `d_model`
# as a constructor argument rather than reading this global.
d_model = 512
# NOTE(review): -1 looks like a placeholder to be replaced with the real
# vocabulary size before building a model — confirm against the tokenizer/data.
input_vocab_size=-1

# Encoder
class Encoder(tf.keras.layers.Layer):
    """Stack of encoder layers on top of a positional token embedding.

    Args:
        num_layers: Number of EncoderLayer blocks to stack.
        d_model: Model width shared by the embedding and every block.
        num_heads: Number of attention heads per block.
        dff: Hidden width of each block's feed-forward part.
        vocab_size: Vocabulary size for the token embedding.
        dropout_rate: Dropout rate for the embedding output and for each block.
    """

    def __init__(self, *, num_layers, d_model, num_heads,
               dff, vocab_size, dropout_rate=0.1):
        super().__init__()
        self.d_model = d_model
        self.num_layers = num_layers

        self.pos_embedding = PositionalEmbedding(
            vocab_size=vocab_size, d_model=d_model)

        stack = []
        for _ in range(num_layers):
            stack.append(
                EncoderLayer(d_model=d_model,
                             num_heads=num_heads,
                             dff=dff,
                             dropout_rate=dropout_rate))
        self.enc_layers = stack

        self.dropout = tf.keras.layers.Dropout(dropout_rate)

    def call(self, x):
        """Encode token ids of shape (batch, seq_len) into (batch, seq_len, d_model)."""
        # Embed the token ids and add positional information.
        hidden = self.pos_embedding(x)

        # Dropout on the embedded input.
        hidden = self.dropout(hidden)

        # Run the blocks in order.
        for depth in range(self.num_layers):
            block = self.enc_layers[depth]
            hidden = block(hidden)

        return hidden  # Shape `(batch_size, seq_len, d_model)`.
    
    
# Keras also provides a ready-made encoder layer -> https://keras.io/keras_hub/api/modeling_layers/transformer_encoder/