In [1]:
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np

from pylab import rcParams
from collections import Counter

import tensorflow as tf
from tensorflow import math
from tensorflow.keras import optimizers, backend
from tensorflow.keras.models import Model, load_model, Sequential
from tensorflow.keras.layers import Input, Dense, Dropout, AlphaDropout, Layer, Embedding, Attention, LayerNormalization
from tensorflow.keras.callbacks import ModelCheckpoint, TensorBoard

In [0]:
def attention(query, key, value, mask=None, dropout=None):
  d_k = query.get_shape()[-1]
  scores = tf.matmul(query, key, transpose_b=True) / 8
  print(scores.shape)
  weights = tf.nn.softmax(scores)
  print(weights.shape)
  s_atten = tf.matmul(weights, value)
  print(s_atten.shape)

  if dropout is not None:
      s_attn = dropout(s_attn)
  
  return s_atten, weights

In [0]:
class FeedForward(Layer):
    def __init__(self, d_model, d_ff, dropout=0.1):
        super(FeedForward, self).__init__()
        self.l_1 = Dense(d_ff, activation='relu')
        self.l_2 = Dense(d_model)
        self.dropout = Dropout(dropout)

    def call(self, x):
        return self.dropout(self.l_2(self.l_1(x)))

In [0]:
class MultiHeadAttention(Layer):

  def __init__(self, h, d_model, dropout=0.1):
    super(MultiHeadAttention, self).__init__()

    assert d_model % h == 0
    self.d_k = d_model // h
    self.h = h
    #self.linears = Dense(d_model, d_model)
    self.attn = None
    self.dropout = Dropout(dropout)
    
    self.wq = Dense(d_model)
    self.wk = Dense(d_model)
    self.wv = Dense(d_model)

    self.dense = Dense(d_model)

  def attention_heads(self, x, batch_size):
    x = tf.reshape(x, (batch_size, self.h, -1, self.d_k))
    return x
    
  def call(self, v, k, q, mask):
    batch_size = tf.shape(q)[0]
    
    q = self.wq(q)
    k = self.wk(k)
    v = self.wv(v)
    print(q.shape, k.shape, v.shape)
    
    q = self.attention_heads(q, batch_size)
    k = self.attention_heads(k, batch_size)  
    v = self.attention_heads(v, batch_size)
    print(q.shape, k.shape, v.shape)

    attn, weights = attention(q, k, v, mask)
    attn = tf.transpose(attn, perm=[0, 2, 1, 3])
    concat_attention = tf.reshape(attn, (batch_size, -1, self.h * self.d_k))
    output = self.dense(concat_attention)
    print('output', output.shape)
    return output, weights

In [0]:
 class PositionalEncoding(Layer):
  def __init__(self, d_model, rate=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        
        self.pe = np.zeros((max_len, d_model))

        position = np.arange(0, max_len, dtype='float32')
        position = np.expand_dims(position, axis=1)

        div_term = np.exp(np.arange(0, d_model, 2, dtype='float32') * (- np.log(10000.0) / d_model))

        self.pe[:, 0::2] = np.sin(position * div_term)
        self.pe[:, 1::2] = np.cos(position * div_term)
        
        self.pe=np.expand_dims(self.pe,axis=0)
        self.pe=tf.cast(self.pe, dtype=tf.float32)

        self.dropout = Dropout(rate)

  def call(self, x):
        x = x + tf.Variable(self.pe[:, :x.get_shape()[1]], 
                            trainable=False)
        return self.dropout(x)

In [0]:
class Generator(Layer): 
    def __init__(self,d_model):
        super(Generator, self).__init__()
        self.proj = Dense(d_model)
        
    def call(self, x):
        return tf.nn.log_softmax(self.proj(x), dim=-1)

In [0]:
class EncoderLayer(Layer) : 

  def __init__(self, d_model, h, output_dim, rate=0.1,**kwargs):
    self.attention = MultiHeadAttention(h, d_model)
    self.Norm1 = LayerNormalization(axis=-1,epsilon=1e-6)
    self.feedforward = FeedForward(d_model,output_dim)
    self.Norm2 = LayerNormalization(axis=-1,epsilon=1e-6)
    self.dk = np.sqrt(64)
    self.output_dim = output_dim
    self.dropout1 = Dropout(rate)
    self.dropout2 = Dropout(rate)
    super(EncoderLayer, self).__init__(**kwargs)

  def call(self, x, training, mask):
    attn_output, _ = self.attention(x, x, x, mask)
    attn_output = self.dropout1(attn_output)
    print(attn_output.shape, x.shape)
    output1 = self.Norm1(x + attn_output)
    
    ff_output = self.feedforward(output1)
    ff_output = self.dropout2(ff_output)
    print(ff_output.shape, output1.shape)
    output2 = self.Norm2(output1 + ff_output)

    return output2

In [0]:
class DecoderLayer(Layer):
  
    def __init__(self, d_model, h, dff, rate=0.1,**kwargs):
      self.attention = MultiHeadAttention(h, d_model)
      self.Norm1 = LayerNormalization(axis=-1, epsilon=1e-6)
      self.attention_mask = MultiHeadAttention(h, d_model)
      self.Norm2 = LayerNormalization(axis=-1, epsilon=1e-6)
      self.feedforward = FeedForward(d_model,dff)
      self.Norm3 = LayerNormalization(axis=-19, epsilon=1e-6)
      self.dropout1 = Dropout(rate)
      self.dropout2 = Dropout(rate)
      self.dropout3 = Dropout(rate)
      super(DecoderLayer, self).__init__(**kwargs)

        
    def call(self, x, enc_x, training, mask, padding_mask):
      attn_output, weights1 = self.attention(x, x, x, mask) 
      attn_output = self.dropout1(attn_output)
      output1 = self.Norm1(x + attn_output)
      attn_output2, weights2 = self.attention(enc_x, enc_x, output1 , padding_mask) 
      attn_output2 = self.dropout2(attn_output2)
      output2 = self.Norm2(output1 + attn_output2)
      output3 = self.dropout2(self.feedforward(output2))
      output3 = self.Norm2(output2 + output3)
      return output3, weights1, weights2

In [0]:
class Transformer(Model):
  def __init__ (self, num_layers, d_model, num_heads, dff, input_vocab_size, 
               target_vocab_size, pe_input, pe_target, rate=0.1):
    
    super(Transformer, self).__init__()

    self.num_layers = num_layers

    self.enc_Embedding = Embedding(input_vocab_size, d_model)
    self.pos_enc = PositionalEncoding(d_model)

    self.enc_layers = [EncoderLayer(d_model, num_heads, dff, rate)
                       for _ in range(num_layers)]

    self.dec_Embedding = Embedding(target_vocab_size, d_model)    
    self.pos_dec = PositionalEncoding(d_model)

    self.dec_layers = [DecoderLayer(d_model, num_heads, dff, rate) 
                       for _ in range(num_layers)]

    self.generator = Generator(d_model)
    #self.dropout = Dropout(rate)

  def call(self, enc_x, dec_x, training, enc_padding_mask,look_ahead_mask, dec_padding_mask):

    print(enc_x.shape)
    enc_x = self.enc_Embedding(enc_x)
    print(enc_x.shape)
    enc_x = self.pos_enc(enc_x)
    print(enc_x.shape)

    for i in range(self.num_layers):
      print(i)
      enc_x = self.enc_layers[i](enc_x, training, enc_padding_mask)
      print(enc_x.shape)
    
    enc_output = enc_x

    print(dec_x.shape)
    dec_x = self.dec_Embedding(dec_x)
    print(dec_x.shape)
    dec_x = self.pos_dec(dec_x)
    print(dec_x.shape)

    for i in range(self.num_layers):
      dec_output, block1, block2 = self.dec_layers[i](dec_x, enc_output, training,
                                                      look_ahead_mask, dec_padding_mask)
      print(dec_output.shape)
      #attention_weights['decoder_layer{}_block1'.format(i+1)] = block1
      #attention_weights['decoder_layer{}_block2'.format(i+1)] = block2

    output = self.generator(dec_output)

    #return output, attention_weights
    return output

In [23]:
model = Transformer(
    num_layers=2, d_model=512, num_heads=8, dff=2048, 
    input_vocab_size=8500, target_vocab_size=8500, 
    pe_input=10000, pe_target=6000,rate=0.1)

x_input = tf.random.uniform((32, 20), dtype=tf.int64, minval=0, maxval=8500)
x_target = tf.random.uniform((32, 20), dtype=tf.int64, minval=0, maxval=8500)

output = model(x_input,
               x_target,
               training=False,
               enc_padding_mask=None,
               look_ahead_mask=None,
               dec_padding_mask=None)

(32, 20)
(32, 20, 512)
(32, 20, 512)
0
(32, 20, 512) (32, 20, 512) (32, 20, 512)
(32, 8, 20, 64) (32, 8, 20, 64) (32, 8, 20, 64)
(32, 8, 20, 20)
(32, 8, 20, 20)
(32, 8, 20, 64)
output (32, 20, 512)
(32, 20, 512) (32, 20, 512)
(32, 20, 512) (32, 20, 512)
(32, 20, 512)
1
(32, 20, 512) (32, 20, 512) (32, 20, 512)
(32, 8, 20, 64) (32, 8, 20, 64) (32, 8, 20, 64)
(32, 8, 20, 20)
(32, 8, 20, 20)
(32, 8, 20, 64)
output (32, 20, 512)
(32, 20, 512) (32, 20, 512)
(32, 20, 512) (32, 20, 512)
(32, 20, 512)
(32, 20)
(32, 20, 512)
(32, 20, 512)
(32, 20, 512) (32, 20, 512) (32, 20, 512)
(32, 8, 20, 64) (32, 8, 20, 64) (32, 8, 20, 64)
(32, 8, 20, 20)
(32, 8, 20, 20)
(32, 8, 20, 64)
output (32, 20, 512)
(32, 20, 512) (32, 20, 512) (32, 20, 512)
(32, 8, 20, 64) (32, 8, 20, 64) (32, 8, 20, 64)
(32, 8, 20, 20)
(32, 8, 20, 20)
(32, 8, 20, 64)
output (32, 20, 512)
(32, 20, 512)
(32, 20, 512) (32, 20, 512) (32, 20, 512)
(32, 8, 20, 64) (32, 8, 20, 64) (32, 8, 20, 64)
(32, 8, 20, 20)
(32, 8, 20, 20)
(32, 8, 20

In [24]:
model.summary()

Model: "transformer_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_4 (Embedding)      multiple                  4352000   
_________________________________________________________________
positional_encoding_4 (Posit multiple                  0         
_________________________________________________________________
encoder_layer_4 (EncoderLaye multiple                  3152384   
_________________________________________________________________
encoder_layer_5 (EncoderLaye multiple                  3152384   
_________________________________________________________________
embedding_5 (Embedding)      multiple                  4352000   
_________________________________________________________________
positional_encoding_5 (Posit multiple                  0         
_________________________________________________________________
decoder_layer_4 (DecoderLaye multiple                