In [None]:
# Human-readable checklist of the hyperparameters used by the layers defined
# further down in this notebook. The values after "=" are left blank on purpose;
# in the visible cells the string is only printed, never parsed.
parameters = """
#### parameters
-----------------
vocab_size (vocabulary size of training data) =
embedding_dim (d_model) =
context_length =
num_heads =
dff (feedforward network units) =
num_layers (num of encoders/decoders) =

# optional
dropout_rate
"""
print(parameters)


#### parameters
-----------------
vocab_size (vocabulary size of training data) =
embedding_dim (d_model) =
context_length =
num_heads =
dff (feedforward network units) =
num_layers (num of encoders/decoders) =

# optional
dropout_rate



In [None]:
import tensorflow as tf
import numpy as np
import logging

In [None]:
# Route DEBUG-and-above records to transformer.log.
# basicConfig() is a silent no-op when the root logger already has handlers
# (which a notebook host may have installed, and which a re-run of this cell
# certainly has); force=True removes them first so this configuration applies.
logging.basicConfig(
    filename='transformer.log',
    encoding='utf-8',
    level=logging.DEBUG,
    force=True,
)

In [None]:
def positional_encoding(context_length,embeding_dim):
    """Build a sinusoidal positional-encoding table.

    Each row corresponds to one position. The sine terms fill the first half
    of a row and the cosine terms the second half (the halves are
    concatenated, not interleaved).

    Args:
        context_length: Number of positions (rows) to encode.
        embeding_dim: Width of each encoding vector (d_model). Odd widths are
            supported; the result then holds one more sine column than
            cosine columns.

    Returns:
        A float32 tensor of shape (context_length, embeding_dim).
    """
    width = int(embeding_dim)
    half_dim = embeding_dim / 2

    positions = np.arange(context_length)[:, np.newaxis]       # (seq, 1)

    # np.arange with a fractional stop rounds up, so an odd width yields
    # ceil(width / 2) frequencies here.
    depths = np.arange(half_dim)[np.newaxis, :] / half_dim     # (1, depth)

    angle_rates = 1 / (10000**depths)                          # (1, depth)
    angle_rads = positions * angle_rates                       # (seq, depth)

    encoding = np.concatenate(
        [np.sin(angle_rads), np.cos(angle_rads)], axis=1
    )

    # For an odd width the concatenation is one column too wide, which would
    # not broadcast against (…, width) embeddings. Trim the surplus cosine
    # column; for even widths this slice is a no-op.
    encoding = encoding[:, :width]

    return tf.cast(encoding, dtype=tf.float32)

In [None]:
class PositionalEncoding(tf.keras.layers.Layer):
    """Token embedding combined with a fixed sinusoidal positional encoding.

    Args:
        vocab_size: Number of rows in the embedding table.
        context_length: Number of positions precomputed in the encoding table.
        d_model: Width of each embedding / encoding vector.

    NOTE(review): the embedding is built with mask_zero=True but this layer
    defines no compute_mask, so whether the padding mask reaches downstream
    layers depends on the Keras version in use — confirm.
    """

    def __init__(self, vocab_size=10000, context_length=2064, d_model=512):
        # Initialise the Keras base class before assigning any attributes so
        # that attribute tracking is set up first.
        super(PositionalEncoding, self).__init__()

        self.d_model = d_model
        self.context_length = context_length
        self.vocab_size = vocab_size

        self.embedding = tf.keras.layers.Embedding(vocab_size, d_model, mask_zero=True)
        self.positional_encoding = positional_encoding(context_length, d_model)

    def call(self, x):
        """Embed token ids and add the positional encoding.

        Args:
            x: Rank-2 tensor of token ids; axis 1 is the sequence axis.

        Returns:
            Tensor with a trailing axis of size d_model appended to x's shape.
        """
        print(f"Dimensions of the each word = {self.d_model}\n")
        print(f"Input words shape {x.shape}\n")

        assert len(x.shape) == 2

        # Sequence length of this batch; it may differ from call to call.
        length = tf.shape(x)[1]

        # Look up the token embeddings.
        embeds = self.embedding(x)
        print(f"embedding shape = {embeds.shape}\n")

        # Scale by sqrt(d_model). This used to be a hard-coded sqrt(512),
        # which was wrong for every other model width.
        embeds *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))

        # Keep only the first `length` positions and add a batch axis.
        pos_en = self.positional_encoding[tf.newaxis, :length, :]
        print(f"positional encoding shape = {pos_en.shape}\n")

        # Combine embeddings and positional encoding (broadcast over batch).
        op = embeds + pos_en
        print(f"Output shape = {op.shape}")
        return op

In [None]:
# Stand-alone instance used to smoke-test the layer in the next cell.
pe = PositionalEncoding(
    vocab_size=10000,
    context_length=2048,
    d_model=1024,
)

In [None]:
# Two sequences of one token each -> input shape (2, 1).
sample_token_ids = tf.Variable([[2], [3]])
pe(sample_token_ids)

Dimensions of the each word = 1024

Input words shape (2, 1)

embedding shape = (2, 1, 1024)

positional encoding shape = (1, 1, 1024)

Output shape = (2, 1, 1024)


<tf.Tensor: shape=(2, 1, 1024), dtype=float32, numpy=
array([[[ 0.69965667, -0.10972746, -0.5721705 , ...,  1.9067972 ,
         -0.00867653,  1.207585  ]],

       [[-0.389815  ,  0.2653671 ,  1.1063471 , ..., -0.01849329,
         -0.09468436,  1.6995642 ]]], dtype=float32)>

In [None]:
# Shared building blocks for every attention variant defined below.
class BaseAttention(tf.keras.layers.Layer):
    """Holds the sub-layers common to all attention blocks.

    All keyword arguments are passed to tf.keras.layers.MultiHeadAttention;
    none are forwarded to the Keras Layer base class. Subclasses supply call().
    """

    def __init__(self, **kwargs):
        super(BaseAttention, self).__init__()
        mha_options = dict(kwargs)
        self.mha = tf.keras.layers.MultiHeadAttention(**mha_options)
        # Residual connection and normalisation applied after attention.
        self.add = tf.keras.layers.Add()
        self.layernorm = tf.keras.layers.LayerNormalization()

In [None]:
class GlobalAttention(BaseAttention):
    """Self-attention followed by a residual add and layer normalisation."""

    def call(self, input):
        # Query, key and value are all the same tensor.
        attended = self.mha(query=input, key=input, value=input)
        residual = self.add([input, attended])
        normalized = self.layernorm(residual)
        return normalized

In [None]:
class CrossAttention(BaseAttention):
    """
    Attention in which the query comes from `input` and the key/value come
    from `context`, followed by a residual add and layer normalisation.

    The attention scores of the most recent call are kept on
    `self.attention_score`.
    """
    def call(self, input, context):
        mha_result = self.mha(
            query=input,
            key=context,
            value=context,
            return_attention_scores=True,
        )
        attended, scores = mha_result

        # Keep the scores around so they can be inspected after the call.
        self.attention_score = scores

        residual = self.add([input, attended])
        normalized = self.layernorm(residual)
        return normalized

In [None]:
class CasualAttention(BaseAttention):
    """
    Self-attention with a causal mask, followed by a residual add and layer
    normalisation.

    The class name keeps its original spelling ("Casual") so existing
    references continue to work.
    """
    def call(self, input):
        attention_output = self.mha(
            query=input,
            value=input,
            key=input,
            # The Keras argument is `use_causal_mask`; the previous spelling
            # `use_casual_mask` is not accepted by MultiHeadAttention.
            use_causal_mask=True,
        )
        addition = self.add([input, attention_output])
        return self.layernorm(addition)

In [None]:


class FeedForward(tf.keras.layers.Layer):
    """Two Dense layers plus dropout, wrapped in a residual add and layer norm.

    Args:
        d_model: Output width of the second Dense layer.
        dff: Width of the first (ReLU) Dense layer.
        dropout_rate: Rate of the Dropout layer that ends the stack.
    """

    def __init__(self, d_model, dff, dropout_rate=0.1):
        super(FeedForward, self).__init__()
        expand = tf.keras.layers.Dense(dff, activation = 'relu')
        project = tf.keras.layers.Dense(d_model)
        drop = tf.keras.layers.Dropout(dropout_rate)
        self.seq = tf.keras.Sequential([expand, project, drop])
        self.add = tf.keras.layers.Add()
        self.layernorm = tf.keras.layers.LayerNormalization()

    def call(self, input):
        transformed = self.seq(input)
        residual = self.add([input, transformed])
        normalized = self.layernorm(residual)
        return normalized

In [None]:
class EncoderLayer(tf.keras.layers.Layer):
    """One encoder block: self-attention followed by a feed-forward block.

    Args:
        num_heads: Number of attention heads.
        key_dim: Passed to the attention layer as `key_dim` and to the
            feed-forward block as its output width.
        dff: Hidden width of the feed-forward block.
        dropout_rate: Dropout rate used by both the attention layer and the
            feed-forward block.
    """

    def __init__(self, num_heads, key_dim, dff, dropout_rate=0.1):
        super().__init__()
        self.self_attention = GlobalAttention(
            num_heads=num_heads,
            key_dim=key_dim,
            dropout= dropout_rate,
        )
        # Forward dropout_rate; previously the feed-forward block always fell
        # back to its own default regardless of what the caller asked for.
        self.ff = FeedForward(key_dim, dff, dropout_rate)

    def call(self, input):
        x = self.self_attention(input)
        return self.ff(x)

In [None]:
class Encoder(tf.keras.layers.Layer):
    """Embedding + positional encoding followed by a stack of encoder layers.

    Args:
        num_layers: Number of EncoderLayer blocks.
        d_model: Embedding width; also passed to each EncoderLayer as key_dim.
        num_heads: Number of attention heads per layer.
        dff: Hidden width of each feed-forward block.
        vocab_size: Size of the token vocabulary.
        context_length: Number of positions in the positional-encoding table.
        dropout_rate: Dropout rate for the encoder layers and for the dropout
            applied to the embedded input.
    """

    def __init__(self,num_layers, d_model, num_heads, dff, vocab_size, context_length, dropout_rate=0.1):
        super().__init__()
        self.num_layers = num_layers

        self.pos_embedding = PositionalEncoding(vocab_size,context_length, d_model)
        self.en_layers = [
            EncoderLayer(
                num_heads , d_model, dff, dropout_rate
            ) for _ in range(num_layers)
        ]
        self.dropout = tf.keras.layers.Dropout(dropout_rate)

    def call(self, input):
        print(f"Encoder Input Shape = {input.shape}")
        x = self.pos_embedding(input)
        print(f"Endoder Embedding Shape = {x.shape}")

        # The dropout layer was constructed but never applied; use it on the
        # embedded input before the encoder stack.
        x = self.dropout(x)

        for encoder_layer in self.en_layers:
            x = encoder_layer(x)

        print(f"All Endoder Output Shape = {x.shape}")
        return x

In [None]:
# Data preparation: load the IMDB review dataset bundled with Keras.

num_words = 20_000  # passed to load_data as the vocabulary cap
max_length = 200    # every review is padded/truncated to this many tokens
ds = tf.keras.datasets.imdb.load_data(
    num_words=num_words,
)

In [None]:
# ds is ((train inputs, train labels), (validation inputs, validation labels)).
(train_x, train_y), (val_x, val_y) = ds

In [None]:
# Number of reviews in each split.
print(f"{len(train_x)} Training sequences")
print(f"{len(val_x)} Validation sequences")

25000 Training sequences
25000 Validation sequences


In [None]:
# Bring every review in both splits to exactly max_length tokens.
train_x, val_x = (
    tf.keras.utils.pad_sequences(split, maxlen=max_length)
    for split in (train_x, val_x)
)

In [None]:
# After padding both splits are rectangular arrays.
print(f"{train_x.shape} Training sequences")
print(f"{val_x.shape} Validation sequences")

(25000, 200) Training sequences
(25000, 200) Validation sequences


In [None]:
# A small single-layer encoder sized for the IMDB classification demo.
transformer_encoder = Encoder(
    num_layers=1,
    d_model=32,
    num_heads=2,
    dff=32,
    vocab_size=num_words,
    context_length=max_length,
)

In [None]:
# Functional model: encoder -> pooled classification head -> 2-way softmax.
inputs = tf.keras.layers.Input(shape=(max_length,))
x = transformer_encoder(inputs)

# Head layers are applied in order to the encoder output.
head_layers = [
    tf.keras.layers.GlobalAveragePooling1D(),
    tf.keras.layers.Dropout(0.1),
    tf.keras.layers.Dense(20, activation="relu"),
    tf.keras.layers.Dropout(0.1),
]
for head_layer in head_layers:
    x = head_layer(x)

outputs = tf.keras.layers.Dense(2, activation="softmax")(x)
model = tf.keras.Model(inputs=inputs, outputs=outputs)

Encoder Input Shape = (None, 200)
Dimensions of the each word = 32

Input words shape (None, 200)

embedding shape = (None, 200, 32)

positional encoding shape = (1, 200, 32)

Output shape = (None, 200, 32)
Endoder Embedding Shape = (None, 200, 32)
All Endoder Output Shape = (None, 200, 32)


In [None]:
# The model ends in a 2-unit softmax, so it is trained with the sparse
# categorical cross-entropy loss.
model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
    metrics=["accuracy"],
)
history = model.fit(
    x=train_x,
    y=train_y,
    batch_size=64,
    epochs=3,
    validation_data=(val_x, val_y),
)

Epoch 1/3
Encoder Input Shape = (None, 200)
Dimensions of the each word = 32

Input words shape (None, 200)

embedding shape = (None, 200, 32)

positional encoding shape = (1, 200, 32)

Output shape = (None, 200, 32)
Endoder Embedding Shape = (None, 200, 32)
All Endoder Output Shape = (None, 200, 32)
Encoder Input Shape = (None, 200)
Dimensions of the each word = 32

Input words shape (None, 200)

embedding shape = (None, 200, 32)

positional encoding shape = (1, 200, 32)

Output shape = (None, 200, 32)
Endoder Embedding Shape = (None, 200, 32)
All Endoder Output Shape = (None, 200, 32)
Dimensions of the each word = 32

Input words shape (None, 200)

embedding shape = (None, 200, 32)

positional encoding shape = (1, 200, 32)

Output shape = (None, 200, 32)
Endoder Embedding Shape = (None, 200, 32)
All Endoder Output Shape = (None, 200, 32)
Epoch 2/3
Epoch 3/3


In [None]:
# Predict on the first training review; a leading batch axis is added so the
# model receives shape (1, max_length).
first_review_batch = train_x[0][np.newaxis, :]
print(first_review_batch.shape)
prediction = model.predict(first_review_batch)

(1, 200)
Encoder Input Shape = (None, 200)
Dimensions of the each word = 32

Input words shape (None, 200)

embedding shape = (None, 200, 32)

positional encoding shape = (1, 200, 32)

Output shape = (None, 200, 32)
Endoder Embedding Shape = (None, 200, 32)
All Endoder Output Shape = (None, 200, 32)


In [None]:
# Softmax output for the first training review: one row, one column per class.
prediction

array([[0.00169578, 0.99830425]], dtype=float32)

In [None]:
# Ground-truth label of the same review, for comparison with the prediction.
train_y[0]

1

In [None]:
# Destination of the trained model; the path is an absolute location under
# /content/drive, so that directory must exist and be writable.
model_save_path = "/content/drive/MyDrive/Models/transformer_imdb_classification"
model.save(model_save_path)

Encoder Input Shape = (None, 200)
Dimensions of the each word = 32

Input words shape (None, 200)

embedding shape = (None, 200, 32)

positional encoding shape = (1, 200, 32)

Output shape = (None, 200, 32)
Endoder Embedding Shape = (None, 200, 32)
All Endoder Output Shape = (None, 200, 32)




Encoder Input Shape = (None, 200)
Dimensions of the each word = 32

Input words shape (None, 200)

embedding shape = (None, 200, 32)

positional encoding shape = (1, 200, 32)

Output shape = (None, 200, 32)
Endoder Embedding Shape = (None, 200, 32)
All Endoder Output Shape = (None, 200, 32)
Encoder Input Shape = (None, 200)
Dimensions of the each word = 32

Input words shape (None, 200)

embedding shape = (None, 200, 32)

positional encoding shape = (1, 200, 32)

Output shape = (None, 200, 32)
Endoder Embedding Shape = (None, 200, 32)
All Endoder Output Shape = (None, 200, 32)
Encoder Input Shape = (None, 200)
Dimensions of the each word = 32

Input words shape (None, 200)

embedding shape = (None, 200, 32)

positional encoding shape = (1, 200, 32)

Output shape = (None, 200, 32)
Endoder Embedding Shape = (None, 200, 32)
All Endoder Output Shape = (None, 200, 32)
Encoder Input Shape = (None, 200)
Dimensions of the each word = 32

Input words shape (None, 200)

embedding shape = (None, 