# Transformer Pre-Processing Part

In [None]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Layer, Embedding, Dense, LayerNormalization, Dropout
import numpy as np


# Load and preprocess text
def load_data(file_path):
    """Read the file at ``file_path`` and return its full contents as one string.

    The file is opened in text mode and decoded as UTF-8. Any error raised by
    ``open`` (for example ``FileNotFoundError``) propagates to the caller.
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        return handle.read()

file_path = "hp_1.txt"  # Make sure this file has been uploaded to the Colab session
text = load_data(file_path).lower()


# Build the vocabulary BEFORE it is used below. The tokenizer and pad_sequences
# must exist in this cell, otherwise running the notebook top to bottom fails
# with a NameError on `tokenizer`.
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import pad_sequences

# oov_token: words that are not in the fitted vocabulary are mapped to "<OOV>"
# instead of being silently dropped.
tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts([text])  # builds the word -> integer index from the corpus


# Convert text to sequences
# texts_to_sequences turns the corpus into one list of integer word ids.
tokens = tokenizer.texts_to_sequences([text])[0]
seq_length = 50  # Each input sequence contains 50 words

# Each training example is a sliding window of (seq_length + 1) tokens:
#   - the first seq_length tokens are the model input
#   - the last token is the target the model tries to predict
input_sequences = [
    tokens[end - seq_length:end + 1]
    for end in range(seq_length, len(tokens))
]


# Pad sequences and split inputs/targets.
# Every window already has seq_length + 1 tokens, so padding only guarantees a
# rectangular integer array. After the split, x holds the inputs and y the label
# for each input.
input_sequences = np.array(pad_sequences(input_sequences, maxlen=seq_length + 1, padding='pre'))
x, y = input_sequences[:, :-1], input_sequences[:, -1]

# One-hot encode the labels. Note: there are other ways to encode the targets,
# e.g. pre-trained word2vec embeddings.
# num_classes is the vocabulary size + 1 because word indices start at 1.
y = tf.keras.utils.to_categorical(y, num_classes=len(tokenizer.word_index) + 1)

# Everything above this point is the pre-processing part

FileNotFoundError: [Errno 2] No such file or directory: 'hp_1.txt'

# CORE OF THE TRANSFORMER MODEL

In [None]:
import tensorflow as tf
from tensorflow.keras.layers import Layer, Embedding, Dense, LayerNormalization, Dropout

class MultiHeadAttention(Layer):
    """Multi-head scaled dot-product attention.

    The inputs are projected to queries, keys and values of width
    ``embed_dim``, split into ``num_heads`` heads of width
    ``embed_dim // num_heads``, attended per head, then concatenated and
    projected back to ``embed_dim``.

    Args:
        embed_dim: Width of the Q/K/V projections and of the layer output
            (e.g. 512). Same as the width of the input word embeddings.
        num_heads: Number of attention heads (e.g. 8). Must divide
            ``embed_dim`` evenly.

    Raises:
        ValueError: If ``embed_dim`` is not divisible by ``num_heads``.
    """

    def __init__(self, embed_dim, num_heads):
        super(MultiHeadAttention, self).__init__()
        if embed_dim % num_heads != 0:
            # Without this check the reshape in split_heads would fail later
            # (or infer a wrong sequence length) with a far less clear error.
            raise ValueError(
                f"embed_dim ({embed_dim}) must be divisible by "
                f"num_heads ({num_heads})"
            )
        self.num_heads = num_heads
        self.embed_dim = embed_dim

        # Size of each attention head's subspace (e.g. 512 // 8 = 64).
        self.projection_dim = embed_dim // num_heads

        # Dense layers that project the input into Q, K and V. Each maps to the
        # full embed_dim; the result is split across heads afterwards. One large
        # matrix multiplication is more efficient than many small per-head ones.
        self.query_dense = Dense(embed_dim)  # Q determines "what to focus on"
        self.key_dense = Dense(embed_dim)    # K acts as "labels to be matched with queries"
        self.value_dense = Dense(embed_dim)  # V holds the actual information

        # After attention, the concatenated head outputs are mixed by this
        # final projection.
        self.combine_heads = Dense(embed_dim)

    def attention(self, query, key, value):
        """Compute scaled dot-product attention.

        Args:
            query, key, value: Tensors as produced by ``split_heads``, i.e.
                (batch_size, num_heads, seq_len, projection_dim).

        Returns:
            A tuple ``(output, attention_probs)`` where ``attention_probs`` has
            shape (batch_size, num_heads, query_len, key_len) and ``output`` is
            the probability-weighted sum of ``value``.
        """
        scores = tf.matmul(query, key, transpose_b=True)
        # Scale by sqrt(d_k); the integer dimension is cast to a float32 tensor.
        scores /= tf.math.sqrt(tf.cast(self.projection_dim, tf.float32))

        # Softmax over the LAST axis (the keys): each row corresponds to one
        # query token attending to all key tokens, so each row sums to 1.
        # (axis=1 would normalise across the heads axis instead.)
        attention_probs = tf.nn.softmax(scores, axis=-1)

        return tf.matmul(attention_probs, value), attention_probs

    def split_heads(self, x, batch_size):
        """Split the last axis of ``x`` into ``num_heads`` heads.

        Args:
            x: Query, key or value tensor of shape
                (batch_size, seq_len, embed_dim).
            batch_size: Number of sequences processed in parallel.

        Returns:
            Tensor of shape (batch_size, num_heads, seq_len, projection_dim).
        """
        # The -1 lets TensorFlow infer seq_len from the total element count.
        # After reshape: (batch_size, seq_len, num_heads, projection_dim)
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.projection_dim))
        # After transpose: (batch_size, num_heads, seq_len, projection_dim)
        return tf.transpose(x, perm=[0, 2, 1, 3])

    def call(self, inputs):
        """Forward pass of the layer.

        Args:
            inputs: A sequence ``(query, key, value)`` of tensors, each of
                shape (batch_size, seq_len, embed_dim).

        Returns:
            Tensor of shape (batch_size, seq_len, embed_dim).
        """
        query, key, value = inputs
        batch_size = tf.shape(query)[0]

        query = self.split_heads(self.query_dense(query), batch_size)
        key = self.split_heads(self.key_dense(key), batch_size)
        value = self.split_heads(self.value_dense(value), batch_size)

        attention, _ = self.attention(query, key, value)
        # Before transpose: (batch_size, num_heads, seq_len, projection_dim)
        # After transpose:  (batch_size, seq_len, num_heads, projection_dim)
        attention = tf.transpose(attention, perm=[0, 2, 1, 3])

        # Merge all heads back into a single vector per token:
        # (batch_size, seq_len, num_heads, projection_dim) -> (batch_size, seq_len, embed_dim)
        concat_attention = tf.reshape(attention, (batch_size, -1, self.embed_dim))
        return self.combine_heads(concat_attention)

class TransformerBlock(Layer):
    """One Transformer encoder block: self-attention followed by a feed-forward net.

    Each of the two sub-layers is followed by dropout, a residual connection and
    layer normalization.

    Args:
        embed_dim: Width of the token embeddings (input and output width).
        num_heads: Number of attention heads.
        ff_dim: Hidden width of the feed-forward network.
        rate: Dropout rate applied after each sub-layer.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1):
        super(TransformerBlock, self).__init__()
        self.att = MultiHeadAttention(embed_dim, num_heads)
        self.ffn = tf.keras.Sequential([
            Dense(ff_dim, activation="relu"),
            Dense(embed_dim),
        ])

        # LayerNormalization: y = (x - mean) / sqrt(variance + epsilon)
        # epsilon keeps the denominator away from zero; it is small enough not
        # to affect the result but large enough to prevent instability.
        self.layernorm1 = LayerNormalization(epsilon=1e-6)
        self.layernorm2 = LayerNormalization(epsilon=1e-6)
        self.dropout1 = Dropout(rate)
        self.dropout2 = Dropout(rate)

    def call(self, inputs, training=None):
        """Run the block on ``inputs``.

        Args:
            inputs: Tensor used as query, key and value for self-attention.
            training: Whether dropout should be active. Defaults to ``None`` so
                the layer can be called without the argument and Keras can
                supply the current mode; it is forwarded to both dropout layers.

        Returns:
            Tensor with the same shape as ``inputs``.
        """
        attn_output = self.att([inputs, inputs, inputs])

        # Dropout randomly deactivates units to reduce overfitting; the
        # `training` flag ensures it is applied only during training, not
        # during inference.
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output)  # Residual connection

        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output)  # Residual connection

# Token and position embedding part done over here
class TokenAndPositionEmbedding(Layer):
    """Sum of a learned token embedding and a learned position embedding.

    Args:
        max_len: Number of distinct positions the position table can hold.
        vocab_size: Number of distinct token ids the token table can hold.
        embed_dim: Width of every embedding vector.
    """

    def __init__(self, max_len, vocab_size, embed_dim):
        super().__init__()
        # An Embedding layer takes an integer tensor and replaces each integer
        # with an embed_dim-sized vector.
        self.token_emb = Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = Embedding(input_dim=max_len, output_dim=embed_dim)

    def call(self, x):
        """Embed token ids ``x`` and add the embedding of each position.

        The position ids are ``[0, 1, ..., seq_len - 1]`` where ``seq_len`` is
        the size of the last axis of ``x``. Their embeddings have shape
        (seq_len, embed_dim) and are broadcast over the leading (batch) axis of
        the token embeddings, as if they were (1, seq_len, embed_dim), which
        allows the element-wise addition.
        """
        seq_len = tf.shape(x)[-1]
        position_ids = tf.range(start=0, limit=seq_len, delta=1)
        position_vectors = self.pos_emb(position_ids)  # (seq_len, embed_dim)
        token_vectors = self.token_emb(x)  # one embed_dim vector per token id
        return token_vectors + position_vectors

# Model the whole architecture, compile and run the training

In [None]:
# Model parameters
embed_dim = 128  # Embedding size for each token
num_heads = 4  # Number of attention heads
ff_dim = 512  # Hidden layer size of the feed-forward network inside the transformer
maxlen = seq_length  # Maximum sequence length (50, defined in the preprocessing cell)

# Tokenization
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import pad_sequences

# oov_token: a word not seen while fitting is replaced with "<OOV>" later on,
# which handles unknown words instead of ignoring them.
tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts([text])  # analyzes the text and builds the word -> integer index


# Vocabulary size: +1 because word indices start at 1 (0 is reserved for padding)
total_words = len(tokenizer.word_index) + 1

# Build the model.
# NOTE: the symbolic graph tensors get their own names (hidden, last_token,
# outputs). Reusing `x` here would overwrite the training inputs created during
# preprocessing, and model.fit(x, y) would then be handed a symbolic tensor
# instead of data.
inputs = tf.keras.Input(shape=(maxlen,))
embedding_layer = TokenAndPositionEmbedding(maxlen, total_words, embed_dim)

hidden = embedding_layer(inputs)
print(hidden.shape)

transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)
# Do not hard-code training=True into the graph: that can keep dropout active at
# inference time. Passing None lets Keras supply the mode (on during fit, off
# during predict/evaluate). It is passed explicitly so the call is valid whether
# or not the layer's call() gives `training` a default.
hidden = transformer_block(hidden, training=None)

print(hidden.shape)
last_token = hidden[:, -1, :]  # keep only the representation of the last position
print(last_token.shape)

outputs = Dense(total_words, activation="softmax")(last_token)
print(outputs.shape)
model = tf.keras.Model(inputs=inputs, outputs=outputs)

# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

model.summary()

NameError: name 'text' is not defined

In [None]:
# Train the model. `x` is expected to be the array of input token windows and
# `y` the one-hot next-word targets from the preprocessing cell; `x` must not be
# a symbolic Keras tensor at this point.
history = model.fit(
    x=x,
    y=y,
    batch_size=32,
    epochs=10,
)

TypeError: int() argument must be a string, a bytes-like object or a real number, not 'NoneType'