In [2]:
 # https://www.kaggle.com/datasets/shubhammaindola/harry-potter-books


import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
import numpy as np


# Load and preprocess text
def load_data(file_path):
    """Return the full contents of the UTF-8 text file at ``file_path``.

    Args:
        file_path: Path of the text file to read.

    Returns:
        The file contents as a single string.
    """
    with open(file_path, mode='r', encoding='utf-8') as handle:
        return handle.read()

# Load the Harry Potter book text and lower-case it
file_path = "hp_1.txt"
text = load_data(file_path)
text = text.lower()


# Tokenization
# oov_token: a word that was not seen when the tokenizer was fitted is replaced
# by "<OOV>" later on, so unknown words are handled instead of being ignored.
tokenizer = Tokenizer(oov_token="<OOV>")
# Analyse the text and build word_index (a mapping of words to unique integers)
tokenizer.fit_on_texts([text])
# +1 because index 0 is not assigned to any word; it is usually reserved for padding
total_words = 1 + len(tokenizer.word_index)

# Convert text to sequences
# The whole book becomes one list of integers, looked up in the tokenizer's word index
tokens = tokenizer.texts_to_sequences([text])[0]
seq_length = 50  # Each input sequence contains 50 words

# Slide a window of seq_length + 1 tokens over the book, one token at a time.
# The last token of every window is the word that follows the seq_length before it.
input_sequences = [
    tokens[start:start + seq_length + 1]
    for start in range(len(tokens) - seq_length)
]


# Pad sequences and split inputs/targets
# After this, x holds the seq_length context word ids of every window and
# y holds the id of the word that follows them (the label).
# pad_sequences already returns a NumPy array, so no extra np.array copy is needed.
input_sequences = pad_sequences(input_sequences, maxlen=seq_length + 1, padding='pre')
x, y = input_sequences[:, :-1], input_sequences[:, -1]

# The labels stay as integer word ids instead of being one-hot encoded.
# A one-hot matrix needs one float per (window, vocabulary word) pair, which
# grows very large for a full book; sparse_categorical_crossentropy computes
# the same loss directly from the ids.
# Note - there are other ways of encoding, like pre-trained word2vec embeddings.

# Build the simple LSTM model
model = Sequential([
    # Word embeddings. `input_length` is not passed: it is deprecated in Keras 3
    # and the sequence length is inferred from the data on the first call.
    Embedding(input_dim=total_words, output_dim=100),
    LSTM(256, return_sequences=True),          # first LSTM layer, emits every time step
    LSTM(256),                                 # second LSTM layer, emits the final state only
    Dense(total_words, activation='softmax'),  # probability of each word id being next
])

# Compile the model (sparse loss because y contains integer class ids)
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(x, y, epochs=5, batch_size=128)

# Function to generate text using the trained LSTM
def generate_text(seed_text, next_words=50, temperature=0.7):
    """Extend ``seed_text`` with words sampled from the trained model.

    Relies on the module-level ``tokenizer``, ``model`` and ``seq_length``.

    Args:
        seed_text: Prompt the generated words are appended to.
        next_words: Number of words to generate.
        temperature: Sampling temperature. Values below 1 sharpen the
            predicted distribution, values above 1 flatten it. A value
            <= 0 picks the most probable word at every step (greedy).

    Returns:
        ``seed_text`` followed by the generated words, separated by spaces.
    """
    # Tokenize the prompt once and keep appending the sampled ids, instead of
    # re-tokenizing the whole growing string on every step.
    context_ids = tokenizer.texts_to_sequences([seed_text])[0]

    for _ in range(next_words):
        # pad_sequences left-pads short contexts and keeps only the last
        # seq_length ids of long ones.
        model_input = pad_sequences([context_ids], maxlen=seq_length, padding='pre')

        # Work on a float64 copy so the renormalised probabilities sum to 1
        # within the tolerance np.random.choice requires.
        probs = np.array(model.predict(model_input, verbose=0)[0], dtype=np.float64)
        probs[0] = 0.0  # id 0 is the padding id and has no word; never sample it

        if temperature <= 0:
            predicted_index = int(np.argmax(probs))
        else:
            # log(0) -> -inf is intended here: such ids keep probability 0.
            with np.errstate(divide='ignore'):
                scaled = np.log(probs) / temperature  # Adjust randomness
            scaled -= np.max(scaled)  # shift for a numerically stable softmax
            weights = np.exp(scaled)
            predicted_index = int(np.random.choice(len(weights), p=weights / weights.sum()))

        output_word = tokenizer.index_word.get(predicted_index, "")
        if not output_word:
            # Unknown id: skip it rather than appending an empty word.
            continue
        context_ids.append(predicted_index)
        seed_text += " " + output_word

    return seed_text


# Generate text with the trained model, starting from a short prompt
print(generate_text(seed_text="Harry looked at", next_words=50, temperature=0.7))

Epoch 1/5
[1m697/697[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m760s[0m 1s/step - accuracy: 0.0443 - loss: 7.2053
Epoch 2/5
[1m697/697[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m804s[0m 1s/step - accuracy: 0.0610 - loss: 6.4746
Epoch 3/5
[1m697/697[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m766s[0m 1s/step - accuracy: 0.0933 - loss: 6.1040
Epoch 4/5
[1m697/697[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m787s[0m 1s/step - accuracy: 0.1057 - loss: 5.8285
Epoch 5/5
[1m697/697[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m805s[0m 1s/step - accuracy: 0.1147 - loss: 5.6075
Harry looked at the witch and the chamber of snape’s student and ron hastily “but mr the way of the empty time ” said fred harry didn’t worry to see the chamber of a chamber of ron in his arm and george explained as a car he looked ginny ” he said neville
