# RNN Model CODE

In [None]:
 # Dataset: https://www.kaggle.com/datasets/shubhammaindola/harry-potter-books


import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import SimpleRNN, Embedding, Dense
import numpy as np


# Load and preprocess text
def load_data(file_path):
    """Read the UTF-8 encoded file at ``file_path`` and return its whole content as one string."""
    with open(file_path, mode='r', encoding='utf-8') as handle:
        return handle.read()

# Corpus file; make sure it has been uploaded to the Colab working directory
file_path = "hp_1.txt"

# Read the corpus, then lower-case it as a separate step
text = load_data(file_path)
text = text.lower()


# Tokenization
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import pad_sequences

# oov_token: out-of-vocabulary token. A word that was not seen while fitting
# is replaced with "<OOV>" later on instead of being ignored.
tokenizer = Tokenizer(oov_token="<OOV>")

# Analyze the corpus and build the word index (mapping of words to unique integers)
corpus = [text]
tokenizer.fit_on_texts(corpus)

# One extra slot on top of the word index, because 0 is usually reserved for padding
vocabulary = tokenizer.word_index
total_words = 1 + len(vocabulary)

# Convert the text into fixed-size training windows
seq_length = 50  # Each input sequence contains 50 words

# Map every word of the corpus to its integer id from the word index
tokens = tokenizer.texts_to_sequences([text])[0]

# One window needs seq_length input tokens plus one target token. Fail early with
# a clear message instead of handing empty arrays to the model later on.
if len(tokens) <= seq_length:
    raise ValueError(
        f"Corpus has only {len(tokens)} tokens; at least {seq_length + 1} are "
        "needed to build one training sequence"
    )

# Sliding windows of (seq_length + 1) tokens each:
#   first seq_length tokens (input): used for training the model
#   last token (target): the label the model tries to predict
input_sequences = [
    tokens[end - seq_length:end + 1] for end in range(seq_length, len(tokens))
]

# Every window already holds exactly seq_length + 1 tokens, so no padding is
# required; int32 is the dtype pad_sequences produced here before.
input_sequences = np.array(input_sequences, dtype="int32")

# Split inputs/targets: x holds the inputs and y the label for each input row
x, y = input_sequences[:, :-1], input_sequences[:, -1]

# Labels are kept as integer word ids. One-hot encoding them with to_categorical
# would allocate a dense (num_samples x total_words) matrix; the sparse loss
# passed to compile() below computes the same cross-entropy straight from the ids.
# Note - there are other ways of encoding words, like pre-trained word2vec
# embeddings and so on.

# Build the simple RNN model

model = Sequential([
    # Explicit input spec: one sample is a row of seq_length word ids.
    # (Replaces the deprecated `input_length` argument of Embedding.)
    tf.keras.Input(shape=(seq_length,), dtype="int32"),
    Embedding(input_dim=total_words, output_dim=100),  # word embeddings
    SimpleRNN(256, return_sequences=False),  # RNN layer
    Dense(256, activation='relu'),  # fully connected layer
    Dense(total_words, activation='softmax'),  # output layer: one probability per vocabulary entry
])

# 256 in SimpleRNN - the number of hidden units (size of the hidden state vector)
# return_sequences=False - the RNN only returns the final hidden state after
# processing the entire sequence

# Compile the model; the sparse loss variant accepts integer class ids as labels
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(x, y, epochs=10, batch_size=128)

# Function that generates text using the trained RNN
def generate_text(seed_text, next_words=50):
    """Extend ``seed_text`` with ``next_words`` words predicted by the trained model.

    Each step tokenizes the text produced so far, pads (or, per the
    ``pad_sequences`` defaults, truncates) it to ``seq_length`` ids, and appends
    the single most probable next word.

    Relies on the module-level ``tokenizer``, ``model`` and ``seq_length``.

    Args:
        seed_text: Text that starts the generation.
        next_words: Number of words to append.

    Returns:
        ``seed_text`` followed by the generated words, each separated by one space.
    """
    for _ in range(next_words):
        # texts_to_sequences returns one list of ids per input text, so the
        # result is already the list of lists that pad_sequences expects.
        token_batch = tokenizer.texts_to_sequences([seed_text])
        model_input = pad_sequences(token_batch, maxlen=seq_length, padding='pre')

        # The batch holds a single sample; row 0 is its distribution over the vocabulary.
        predicted_probs = model.predict(model_input, verbose=0)[0]
        predicted_index = int(np.argmax(predicted_probs))

        # .get() covers ids without an entry in index_word (e.g. the padding id 0)
        predicted_word = tokenizer.index_word.get(predicted_index, "<OOV>")

        seed_text += " " + predicted_word

    return seed_text


# Generate text using the trained model: starting from the seed phrase, the model
# should guess the next 50 words based on the hp_1.txt file
generated_text = generate_text("Harry looked at")
print(generated_text)



Epoch 1/10
[1m697/697[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m139s[0m 194ms/step - accuracy: 0.0466 - loss: 7.0145
Epoch 2/10
[1m697/697[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m141s[0m 194ms/step - accuracy: 0.0868 - loss: 6.2068
Epoch 3/10
[1m697/697[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 194ms/step - accuracy: 0.1165 - loss: 5.7271
Epoch 4/10
[1m697/697[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 194ms/step - accuracy: 0.1336 - loss: 5.4209
Epoch 5/10
[1m697/697[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m142s[0m 194ms/step - accuracy: 0.1469 - loss: 5.1562
Epoch 6/10
[1m697/697[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m134s[0m 192ms/step - accuracy: 0.1563 - loss: 4.9189
Epoch 7/10
[1m697/697[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m135s[0m 194ms/step - accuracy: 0.1704 - loss: 4.6845
Epoch 8/10
[1m697/697[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m136s[0m 195ms/step - accuracy: 0.1812 - loss: 4.4500
Epoch 9/

The model learns local patterns, not long-term dependencies

    . RNNs struggle with long-range dependencies because they do not retain information well over long sequences.

    . This is why the text seems grammatically OK but lacks deeper context.


The model generates phrases based on probabilities

    . It predicts the most likely next word given the past words.

    . It does not understand meaning but follows statistical patterns.

    . It captures writing style but lacks coherence.


Words appear logically related but do not form a strong narrative or meaning. The model does not truly "understand" the book; it just mimics word usage.