In [23]:
import tensorflow as tf
from tensorflow.keras.utils import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer
import numpy as np

# load data
def load_data(filePath):
    """Read a UTF-8 text file and return its whole contents as one string.

    Args:
        filePath: Path of the text file to read.

    Returns:
        The complete file contents as a ``str``.
    """
    with open(filePath, mode='r', encoding='utf-8') as source:
        return source.read()

text=load_data('HarryPotterPart1.txt')


# Tokenize the corpus into integer word ids; words not seen by the tokenizer
# map to the '<OOV>' token.
tokenizer= Tokenizer(oov_token='<OOV>')
tokenizer.fit_on_texts([text])
# +1 so that the largest word id is a valid class index (ids start at 1).
totalWords= len(tokenizer.word_index)+1

tokens=tokenizer.texts_to_sequences([text])[0]

# Number of context tokens fed to the model. This single name is used
# everywhere below (and by the model-building cell); the previous mix of
# `seq_length` and an undefined `seq_len` raised NameError on a fresh kernel.
seq_length =50

# Sliding windows over the token stream: seq_length context tokens followed by
# the token to predict, so every window has seq_length + 1 entries.
input_sequences=[
    tokens[i-seq_length:i+1]
    for i in range(seq_length, len(tokens))
]

# Every window already has the same length, so padding only converts the list
# of windows into a 2-D integer array.
input_sequences=np.array(pad_sequences(input_sequences,maxlen=seq_length+1, padding='pre'))

# Inputs are all but the last column; the last column is the target word id.
X, Y= input_sequences[:,:-1], input_sequences[:,-1]
# One-hot targets with totalWords classes, matching the
# "categorical_crossentropy" loss used when the model is compiled.
Y=tf.keras.utils.to_categorical(Y, num_classes=totalWords)


## Core of the Transformer model ##

In [25]:
import tensorflow as tf
from tensorflow.keras.layers import Layer, Embedding, Dense, LayerNormalization, Dropout

class MultiHeadAttention(Layer):
    """Multi-head scaled dot-product attention built from Dense projections.

    The layer is called with a list ``[query, key, value]``. Each tensor is
    projected to ``embed_dim`` features, split into ``num_heads`` heads of
    ``embed_dim // num_heads`` features, attended per head, then re-joined and
    passed through a final Dense projection.

    Args:
        embed_dim: Feature size of the projections and of the output.
        num_heads: Number of attention heads; must divide ``embed_dim``.
        **kwargs: Standard Keras ``Layer`` arguments (``name``, ``dtype``, ...).

    Raises:
        ValueError: If ``num_heads`` is not positive or does not divide
            ``embed_dim``.
    """

    def __init__(self, embed_dim, num_heads, **kwargs):
        # Without this check the `-1` reshapes below can silently regroup
        # features across tokens whenever the element count happens to fit.
        if num_heads <= 0 or embed_dim % num_heads != 0:
            raise ValueError(
                f"embed_dim ({embed_dim}) must be a positive multiple of "
                f"num_heads ({num_heads})."
            )
        super(MultiHeadAttention, self).__init__(**kwargs)
        self.num_heads = num_heads # example - 8

        self.embed_dim = embed_dim # example - 512

        self.projection_dim = embed_dim // num_heads # Size of Each Attention Head's Subspace


        self.query_dense = Dense(embed_dim) # Q Determines "what to focus on"
        self.key_dense = Dense(embed_dim) # K Acts as "labels" to be matched with queries
        self.value_dense = Dense(embed_dim) # V Holds the actual information

        self.combine_heads = Dense(embed_dim)


    def attention(self, query, key, value):
        """Scaled dot-product attention.

        Returns:
            A tuple ``(output, attention_probs)``: the probability-weighted
            sum of ``value`` and the softmax weights used to compute it.
        """
        scores = tf.matmul(query, key, transpose_b=True)
        # Scale by sqrt(head size); cast to the scores' own dtype so the
        # division also works when the layer does not compute in float32.
        scores /= tf.math.sqrt(tf.cast(self.projection_dim, scores.dtype))

        attention_probs = tf.nn.softmax(scores, axis=-1) # how much attention each token should give to other tokens

        return tf.matmul(attention_probs, value), attention_probs

    def split_heads(self, x, batch_size):
        """Reshape ``x`` to (batch, seq, heads, head_dim) and move heads to axis 1."""
        x = tf.reshape(x, (batch_size, -1, self.num_heads, self.projection_dim))
        return tf.transpose(x, perm=[0, 2, 1, 3])


    def call(self, inputs):
        """Apply attention to ``inputs = [query, key, value]``."""
        query, key, value = inputs
        batch_size = tf.shape(query)[0] # (batch_size, seq_len, embed_dim)

        query = self.split_heads(self.query_dense(query), batch_size)
        key = self.split_heads(self.key_dense(key), batch_size)
        value = self.split_heads(self.value_dense(value), batch_size)

        attention, _ = self.attention(query, key, value)
        # Undo the head/sequence swap done in split_heads before merging heads.
        attention = tf.transpose(attention, perm=[0, 2, 1, 3])

        concat_attention = tf.reshape(attention, (batch_size, -1, self.embed_dim))
        return self.combine_heads(concat_attention)

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created."""
        config = super(MultiHeadAttention, self).get_config()
        config.update({"embed_dim": self.embed_dim, "num_heads": self.num_heads})
        return config

class TransformerBlock(Layer):
    """Self-attention followed by a feed-forward network, each wrapped in
    dropout, a residual connection and layer normalization.

    Args:
        embed_dim: Feature size of the block's input and output.
        num_heads: Number of attention heads passed to ``MultiHeadAttention``.
        ff_dim: Width of the hidden (ReLU) layer of the feed-forward network.
        rate: Dropout rate applied after attention and after the feed-forward
            network.
        **kwargs: Standard Keras ``Layer`` arguments (``name``, ``dtype``, ...).
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.1, **kwargs):
        super(TransformerBlock, self).__init__(**kwargs)
        # Kept only so get_config() can report the constructor arguments.
        self.embed_dim = embed_dim
        self.num_heads = num_heads
        self.ff_dim = ff_dim
        self.rate = rate

        self.att = MultiHeadAttention(embed_dim, num_heads)
        self.ffn = tf.keras.Sequential([
            Dense(ff_dim, activation="relu"),
            Dense(embed_dim),
        ])

        self.layernorm1 = LayerNormalization(epsilon=1e-6)
        self.layernorm2 = LayerNormalization(epsilon=1e-6)
        self.dropout1 = Dropout(rate)
        self.dropout2 = Dropout(rate)

    def call(self, inputs, training=None):
        """Run the block on ``inputs``.

        Args:
            inputs: Tensor used as query, key and value of the attention.
            training: Forwarded to both Dropout layers. The default ``None``
                leaves the decision to Keras, so the block can be called
                without spelling the flag out.
        """
        attn_output = self.att([inputs, inputs, inputs])
        attn_output = self.dropout1(attn_output, training=training)
        out1 = self.layernorm1(inputs + attn_output) # Residual Connection
        ffn_output = self.ffn(out1)
        ffn_output = self.dropout2(ffn_output, training=training)
        return self.layernorm2(out1 + ffn_output) # Residual Connection

    def get_config(self):
        """Return the constructor arguments so the layer can be re-created."""
        config = super(TransformerBlock, self).get_config()
        config.update({
            "embed_dim": self.embed_dim,
            "num_heads": self.num_heads,
            "ff_dim": self.ff_dim,
            "rate": self.rate,
        })
        return config

class TokenAndPositionEmbedding(Layer):
    """Sum of a trainable token embedding and a trainable position embedding.

    Args:
        maxlen: Number of distinct positions the position table can embed.
        vocab_size: Number of distinct token ids the token table can embed.
        embed_dim: Size of each embedding vector.
    """

    def __init__(self, maxlen, vocab_size, embed_dim):
        super().__init__()
        self.token_emb = Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        """Embed the token ids in ``x`` and add one position vector per column."""
        # Length of the last axis of the input, i.e. tokens per sequence.
        seq_len = tf.shape(x)[-1]
        # Position ids 0 .. seq_len-1, looked up in the position table.
        position_vectors = self.pos_emb(tf.range(seq_len))
        token_vectors = self.token_emb(x)
        # The position vectors broadcast over the leading (batch) axis.
        return token_vectors + position_vectors


In [31]:
# Model Parameters
embed_dim = 128  # Embedding size
num_heads = 4    # Number of attention heads
ff_dim = 512     # Feed-forward layer size
maxlen = seq_length # here it is 50 defined above

# Build the model
inputs = tf.keras.Input(shape=(maxlen,))
embedding_layer = TokenAndPositionEmbedding(maxlen, totalWords, embed_dim)
x = embedding_layer(inputs)
print(x.shape)
transformer_block = TransformerBlock(embed_dim, num_heads, ff_dim)
# Do not pin the training flag here: a hard-coded training=True is baked into
# the graph and can keep dropout active during predict(). Passing None lets
# Keras choose the phase itself (dropout on in fit(), off in predict()).
x = transformer_block(x, training=None)
print(x.shape)
# Keep only the representation at the last position; it is the one used to
# predict the next word.
x = x[:, -1, :]
print(x.shape)
x = Dense(totalWords, activation="softmax")(x)
print(x.shape)
model = tf.keras.Model(inputs=inputs, outputs=x)

# Compile the model
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

model.summary()

(None, 50, 128)
(None, 50, 128)
(None, 128)
(None, 6663)


In [35]:
# Train the compiled model on the prepared inputs and one-hot targets; the
# returned object is kept in `history`.
history = model.fit(
    x=X,
    y=Y,
    epochs=10,
    batch_size=32,
)

Epoch 1/10
[1m2531/2531[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m66s[0m 24ms/step - accuracy: 0.0827 - loss: 6.5130
Epoch 2/10
[1m2531/2531[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m60s[0m 24ms/step - accuracy: 0.1570 - loss: 5.1030
Epoch 3/10
[1m2531/2531[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m61s[0m 24ms/step - accuracy: 0.2078 - loss: 4.2920
Epoch 4/10
[1m2531/2531[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m61s[0m 24ms/step - accuracy: 0.2535 - loss: 3.6546
Epoch 5/10
[1m2531/2531[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m60s[0m 24ms/step - accuracy: 0.3142 - loss: 3.1157
Epoch 6/10
[1m2531/2531[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m60s[0m 24ms/step - accuracy: 0.3908 - loss: 2.6360
Epoch 7/10
[1m2531/2531[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m59s[0m 23ms/step - accuracy: 0.4748 - loss: 2.1891
Epoch 8/10
[1m2531/2531[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m59s[0m 23ms/step - accuracy: 0.5457 - loss: 1.8325
Epoch 9/

In [48]:
def generate_text(seed_text, next_words, max_sequence_len):
    """Greedily extend ``seed_text`` one predicted word at a time.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        seed_text: Text to start from.
        next_words: Number of words to append.
        max_sequence_len: The model input is padded/truncated to
            ``max_sequence_len - 1`` tokens.

    Returns:
        ``seed_text`` followed by the ``next_words`` predicted words, separated
        by single spaces.
    """
    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len - 1, padding='pre')
        predicted = model.predict(token_list, verbose=0)
        # Class 0 is the padding id and has no entry in tokenizer.index_word,
        # so picking it would raise KeyError. Exclude it before taking the
        # most probable word (work on a copy of the single batch row).
        word_scores = np.array(predicted[0], dtype=float)
        word_scores[0] = -np.inf
        predicted_word = tokenizer.index_word[int(np.argmax(word_scores))]
        seed_text += " " + predicted_word
    return seed_text

# Generate 100 words from a short prompt. The window passed is seq_length + 1
# because generate_text pads the model input to max_sequence_len - 1 tokens.
seed_text = "Dumbledoor was shocked"
generated_text = generate_text(
    seed_text,
    next_words=100,
    max_sequence_len=seq_length + 1,
)
print(generated_text)

Dumbledoor was shocked and angry with the teachers behind him and harry could have sworn a low hissing voice came back to the floor in low he’d lost he looked quickly out of the window to talk to him he was looking at the wall hagrid was standing on the edge of the forest harry’s heart he was looking at his head he was looking at his head knocking on his bacon he had hardly fluttered from the way of climbing down the house championship was no good have found out who a lot of time he was looking for teams during gym
