In [1]:
import tensorflow as tf
from tensorflow.keras.layers import Layer, Dense, Dropout, LayerNormalization

ModuleNotFoundError: No module named 'tensorflow'

In [None]:
class ScaledDotProductAttention(Layer):
  """Attention core: softmax(Q K^T / scale) V.

  The scale is fixed at construction time to sqrt(d_model / num_heads),
  computed in float32.
  """

  def __init__(self, d_model, num_heads):
    """Precompute the score divisor.

    Args:
      d_model: Model width; cast to float32 before dividing.
      num_heads: Number of heads the model width is divided by.
    """
    super().__init__()
    per_head_width = tf.cast(d_model, tf.float32) / num_heads
    self.scaling_factor = tf.sqrt(per_head_width)

  def call(self, Q, K, V, mask = None):
    """Apply attention of Q over K to gather rows of V.

    Args:
      Q: Query tensor.
      K: Key tensor; multiplied with Q after transposing its last two axes.
      V: Value tensor; multiplied by the attention weights.
      mask: Optional tensor. It is multiplied by -1e9 and added to the
        scaled scores, so nonzero entries push the matching softmax
        weights toward zero. Must broadcast against the score tensor
        (assumed float-typed -- TODO confirm with callers).

    Returns:
      The attention weights (softmax over the last axis) multiplied by V.
    """
    scores = tf.matmul(Q, K, transpose_b=True) / self.scaling_factor
    if mask is not None:
      scores = scores + mask * -1e9

    weights = tf.nn.softmax(scores, axis=-1)
    return tf.matmul(weights, V)

In [None]:
class MultiHeadAttention(Layer):
  """Multi-head attention built on ScaledDotProductAttention.

  Q, K and V are each passed through their own Dense(d_model) projection,
  split into `num_heads` slices of width d_model // num_heads, attended
  independently per head, merged back to width d_model, and finally passed
  through the output projection W_O.
  """

  def __init__(self, d_model, num_heads):
    """Create the projections and the attention core.

    Args:
      d_model: Width of the projected Q/K/V and of the layer output.
      num_heads: Number of attention heads; must divide d_model.

    Raises:
      ValueError: If d_model is not divisible by num_heads.
    """
    super(MultiHeadAttention, self).__init__()

    if d_model % num_heads != 0:
      raise ValueError(
          "d_model (%s) must be divisible by num_heads (%s)" % (d_model, num_heads))

    self.num_heads = num_heads
    self.d_model = d_model
    # Width of each head; matches the sqrt(d_model / num_heads) scale used
    # inside ScaledDotProductAttention.
    self.depth = d_model // num_heads
    self.attention = ScaledDotProductAttention(d_model, num_heads)

    self.W_Q = Dense(d_model)
    self.W_K = Dense(d_model)
    self.W_V = Dense(d_model)
    # Output projection: previously referenced in call() but never created.
    self.W_O = Dense(d_model)

  def _split_heads(self, x):
    """Reshape (batch, seq, d_model) -> (batch, num_heads, seq, depth)."""
    batch_size = tf.shape(x)[0]
    x = tf.reshape(x, (batch_size, -1, self.num_heads, self.depth))
    return tf.transpose(x, perm=[0, 2, 1, 3])

  def _merge_heads(self, x):
    """Reshape (batch, num_heads, seq, depth) -> (batch, seq, d_model)."""
    batch_size = tf.shape(x)[0]
    x = tf.transpose(x, perm=[0, 2, 1, 3])
    return tf.reshape(x, (batch_size, -1, self.d_model))

  def call(self, Q,K,V, mask = None):
    """Run multi-head attention.

    Args:
      Q: Query input, expected as (batch, seq_q, features).
      K: Key input, expected as (batch, seq_k, features).
      V: Value input, expected as (batch, seq_k, features).
      mask: Optional mask forwarded to the attention core. It must broadcast
        against scores of shape (batch, num_heads, seq_q, seq_k). A rank-3
        mask is treated as (batch, seq_q, seq_k) and gets a head axis
        inserted so it applies to every head.

    Returns:
      Tensor of shape (batch, seq_q, d_model).
    """
    Q = self._split_heads(self.W_Q(Q))
    K = self._split_heads(self.W_K(K))
    V = self._split_heads(self.W_V(V))

    if mask is not None and len(mask.shape) == 3:
      # Without this, a (batch, seq_q, seq_k) mask would be aligned against
      # the (num_heads, seq_q, seq_k) trailing axes of the scores.
      mask = mask[:, tf.newaxis, :, :]

    attention_output = self.attention(Q,K,V,mask)
    attention_output = self._merge_heads(attention_output)
    output = self.W_O(attention_output)
    return output


In [None]:
class FeedForward(Layer):
  """Two stacked Dense layers: a ReLU layer of width d_ff, then a linear
  layer of width d_model."""

  def __init__(self, d_model, d_ff):
    """Build the two Dense sub-layers.

    Args:
      d_model: Number of units of the second (output) Dense layer.
      d_ff: Number of units of the first (ReLU) Dense layer.
    """
    super().__init__()
    self.dense1 = Dense(d_ff, activation='relu')
    self.dense2 = Dense(d_model)

  def call(self,x):
    """Return dense2(dense1(x))."""
    hidden = self.dense1(x)
    return self.dense2(hidden)

In [None]:
class EncoderLayer(Layer):
  """Transformer encoder layer: self-attention then feed-forward, each
  wrapped in a residual add followed by layer normalization."""

  def __init__(self, d_model, num_heads, d_ff):
    """Create the sub-layers.

    Args:
      d_model: Model width passed to the attention and feed-forward blocks.
      num_heads: Number of heads passed to MultiHeadAttention.
      d_ff: Hidden width passed to FeedForward.
    """
    super(EncoderLayer, self).__init__()
    self.multihead = MultiHeadAttention(d_model, num_heads)
    self.feedforward = FeedForward(d_model, d_ff)

    self.layernorm1 = LayerNormalization()
    self.layernorm2 = LayerNormalization()

  def call(self, x, mask = None):
    """Encode `x`.

    Args:
      x: Input tensor; used as query, key and value of the self-attention.
        Its last dimension must match d_model for the residual adds.
      mask: Optional mask forwarded to the self-attention block.

    Returns:
      Tensor with the same shape as `x`.
    """
    attn_output = self.multihead(x,x,x,mask)
    # Residual connection: the attention output was previously computed and
    # then dropped, leaving only a LayerNorm of the raw input.
    x = self.layernorm1(x + attn_output)

    ff_output = self.feedforward(x)
    # Same for the feed-forward sub-layer.
    x = self.layernorm2(x + ff_output)
    return x

In [None]:
class DecoderLayer(Layer):
  """Transformer decoder layer: self-attention, attention over the encoder
  output, then feed-forward, each wrapped in a residual add followed by
  layer normalization."""

  def __init__(self, d_model, num_heads, d_ff):
    """Create the sub-layers.

    Args:
      d_model: Model width passed to the attention and feed-forward blocks.
      num_heads: Number of heads passed to each MultiHeadAttention.
      d_ff: Hidden width passed to FeedForward.
    """
    super(DecoderLayer, self).__init__()

    self.mha1 = MultiHeadAttention(d_model, num_heads)
    self.mha2 = MultiHeadAttention(d_model, num_heads)
    self.ff = FeedForward(d_model, d_ff)

    self.layernorm1 = LayerNormalization()
    self.layernorm2 = LayerNormalization()
    # Was a second assignment to layernorm2, leaving layernorm3 undefined.
    self.layernorm3 = LayerNormalization()

  def call(self, x, encoder_output, ahead_mask = None, padding_mask = None):
    """Decode `x` against `encoder_output`.

    Args:
      x: Decoder input; query, key and value of the first attention block.
        Its last dimension must match d_model for the residual adds.
      encoder_output: Tensor used as key and value of the second attention
        block.
      ahead_mask: Optional mask forwarded to the first (self) attention.
      padding_mask: Optional mask forwarded to the second attention.

    Returns:
      Tensor with the same shape as `x`.
    """
    # `self_mha1` / `self_mha2` were typos for the attributes below.
    attention_output1 = self.mha1(x, x, x, ahead_mask)
    x = self.layernorm1(x + attention_output1)

    attention_output2 = self.mha2(x, encoder_output, encoder_output, padding_mask)
    x = self.layernorm2(x + attention_output2)

    ff_output = self.ff(x)
    x = self.layernorm3(x + ff_output)
    return x
