In [52]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding,SimpleRNN, Dense
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import pad_sequences
import numpy as np


# Load text
def load_data(file_path):
    """Read the UTF-8 text file at ``file_path`` and return its contents as one string."""
    with open(file_path, mode='r', encoding='utf-8') as source:
        return source.read()

filePath = 'HarryPotterPart1.txt'
text = load_data(filePath).lower()

# Word-level tokenizer. oov_token='<OOV>' makes words that were not seen
# during fit_on_texts map to the '<OOV>' id later instead of being dropped.
tokenizer = Tokenizer(oov_token='<OOV>')
tokenizer.fit_on_texts([text])  # builds tokenizer.word_index (word -> unique integer id)
total_words = len(tokenizer.word_index) + 1  # +1: id 0 is reserved (used for padding)

# Encode the whole corpus as one flat list of word ids.
# texts_to_sequences returns one list per input text; only one text was
# passed in, hence the [0].
tokens = tokenizer.texts_to_sequences([text])[0]

seq_length = 50  # number of context words in each training sample

if len(tokens) <= seq_length:
    raise ValueError(
        f"Corpus has only {len(tokens)} tokens; need more than "
        f"seq_length={seq_length} to build at least one training window."
    )

# Slide a window of (seq_length + 1) ids over the corpus:
#   first seq_length ids -> model input, last id -> the word to predict.
# Every window already has exactly that length, so no padding step is needed.
input_sequences = np.array(
    [tokens[start:start + seq_length + 1] for start in range(len(tokens) - seq_length)],
    dtype='int32',
)

# X holds the context windows, y the integer id of the word that follows each one.
# The targets are deliberately left as integer ids: sparse_categorical_crossentropy
# consumes them directly and yields the same loss as categorical_crossentropy on
# one-hot rows, without allocating a dense (num_samples, total_words) label matrix.
X, y = input_sequences[:, :-1], input_sequences[:, -1]

# Build the Simple RNN model
model = Sequential([
    # Word embeddings. The sequence length is taken from the data at fit time;
    # the `input_length` argument is deprecated in Keras 3, so it is not passed.
    Embedding(input_dim=total_words, output_dim=64),
    # 256 = number of hidden units (size of the hidden state vector).
    # return_sequences=False -> only the final hidden state is passed on.
    SimpleRNN(256, return_sequences=False),
    Dense(256, activation='relu'),  # Fully connected layer
    Dense(total_words, activation='softmax'),  # One probability per vocabulary id
])

# Compile the model (sparse loss because y contains integer class ids)
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
model.fit(X, y, epochs=30, batch_size=128)


# Function to generate text using RNN
def generate_text(seed_text, next_words=50):
    """Greedily extend ``seed_text`` by ``next_words`` words using ``model``.

    On every step the text produced so far is re-tokenized with ``tokenizer``,
    left-padded / truncated to ``seq_length`` ids, and the id with the highest
    predicted probability is appended as a word. Ids that have no entry in
    ``tokenizer.index_word`` are rendered as "<OOV>".

    Returns the seed text followed by the generated words, space separated.
    """
    generated = seed_text
    for _step in range(next_words):
        # Encode everything generated so far and fit it to the model's window size.
        ids = tokenizer.texts_to_sequences([generated])[0]
        window = pad_sequences([ids], maxlen=seq_length, padding='pre')

        # Greedy decoding: pick the single most probable next id.
        best_id = np.argmax(model.predict(window, verbose=0))
        next_word = tokenizer.index_word.get(best_id, "<OOV>")

        generated = generated + " " + next_word
    return generated







Epoch 1/30
[1m633/633[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 50ms/step - accuracy: 0.0416 - loss: 6.9401
Epoch 2/30
[1m633/633[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 52ms/step - accuracy: 0.0677 - loss: 6.2561
Epoch 3/30
[1m633/633[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 53ms/step - accuracy: 0.1014 - loss: 5.8055
Epoch 4/30
[1m633/633[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 51ms/step - accuracy: 0.1182 - loss: 5.4872
Epoch 5/30
[1m633/633[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 50ms/step - accuracy: 0.1314 - loss: 5.2150
Epoch 6/30
[1m633/633[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 50ms/step - accuracy: 0.1435 - loss: 4.9769
Epoch 7/30
[1m633/633[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 50ms/step - accuracy: 0.1557 - loss: 4.7537
Epoch 8/30
[1m633/633[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 49ms/step - accuracy: 0.1629 - loss: 4.5561
Epoch 9/30
[1m633/633[

In [56]:

# Continue a seed phrase with the trained model and print the result
print(generate_text(seed_text="Harry is so cute and"))

Harry is so cute and buy the ‘gar’ “dumbledore ” he said finally “no we’ll ignore the letter from ter the first question he said and stopped out of the hut of his bed “merry christmas ” he said “no ” said hermione “i don’t care about the train ” he ignored them two identical


In [113]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.layers import Embedding, Dense, LSTM
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import pad_sequences

# load data
def load_data(file_path):
    """Return the entire contents of ``file_path``, decoded as UTF-8 text."""
    with open(file_path, mode='r', encoding='utf-8') as corpus_file:
        return corpus_file.read()

text = load_data('HarryPotterPart1.txt').lower()

# Tokenize data: words unseen during fitting map to the '<OOV>' id.
tokenizer = Tokenizer(oov_token='<OOV>')
tokenizer.fit_on_texts([text])
totalWords = len(tokenizer.word_index) + 1  # +1: id 0 is reserved (used for padding)

# Convert text into one flat list of word ids ([0] = the single text passed in).
tokens = tokenizer.texts_to_sequences([text])[0]

seq_len = 50  # number of context words in each training sample

if len(tokens) <= seq_len:
    raise ValueError(
        f"Corpus has only {len(tokens)} tokens; need more than "
        f"seq_len={seq_len} to build at least one training window."
    )

# Sliding windows of (seq_len + 1) ids: seq_len inputs followed by the target id.
input_sequences = np.array(
    [tokens[start:start + seq_len + 1] for start in range(len(tokens) - seq_len)]
)

# X = context windows, Y = integer id of the next word.
# Y is kept as integer ids and paired with sparse_categorical_crossentropy,
# which gives the same loss as one-hot + categorical_crossentropy without
# building a dense (num_samples, totalWords) label matrix.
X, Y = input_sequences[:, :-1], input_sequences[:, -1]

model = Sequential([
    Embedding(input_dim=totalWords, output_dim=100),
    LSTM(256, return_sequences=True),  # passes the full sequence to the next LSTM
    LSTM(256),                         # returns only the final hidden state
    # Size the output layer from this cell's own vocabulary size (totalWords),
    # not from `total_words`, which only exists if the SimpleRNN cell ran first.
    Dense(totalWords, activation='softmax'),
])

model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X, Y, epochs=10, batch_size=128)


Epoch 1/10
[1m633/633[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m160s[0m 248ms/step - accuracy: 0.0409 - loss: 7.0609
Epoch 2/10
[1m633/633[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m178s[0m 281ms/step - accuracy: 0.0528 - loss: 6.3783
Epoch 3/10
[1m633/633[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m217s[0m 343ms/step - accuracy: 0.0758 - loss: 6.0866
Epoch 4/10
[1m633/633[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m241s[0m 381ms/step - accuracy: 0.0983 - loss: 5.7929
Epoch 5/10
[1m633/633[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m245s[0m 388ms/step - accuracy: 0.1112 - loss: 5.6045
Epoch 6/10
[1m633/633[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m642s[0m 1s/step - accuracy: 0.1229 - loss: 5.3776
Epoch 7/10
[1m633/633[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m177s[0m 280ms/step - accuracy: 0.1281 - loss: 5.2047
Epoch 8/10
[1m633/633[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m178s[0m 281ms/step - accuracy: 0.1380 - loss: 5.0227
Epoch 9/10


<keras.src.callbacks.history.History at 0x13b8f3c9700>

In [141]:
def generate_LSTM(seed_text, next_words=50, temperature=0.7):
    """Extend ``seed_text`` by sampling ``next_words`` words from ``model``.

    On every step the text produced so far is re-tokenized with ``tokenizer``,
    left-padded / truncated to ``seq_len`` ids, and the next word id is drawn
    at random from the model's predicted distribution after temperature scaling.

    Args:
        seed_text: Text to start from.
        next_words: Number of words to append.
        temperature: Must be > 0. Values below 1 sharpen the distribution
            (less random); values above 1 flatten it (more random).

    Returns:
        ``seed_text`` followed by the sampled words, space separated. An id
        with no entry in ``tokenizer.index_word`` contributes an empty word.

    Raises:
        ValueError: If ``temperature`` is not strictly positive.
    """
    if temperature <= 0:
        raise ValueError(f"temperature must be > 0, got {temperature!r}")

    # Smallest positive float64; keeps log() finite when the model outputs exact zeros.
    floor = np.finfo(np.float64).tiny

    for _ in range(next_words):
        token_list = tokenizer.texts_to_sequences([seed_text])[0]
        # Pad to seq_len, the window length defined alongside the LSTM training
        # data, rather than the SimpleRNN cell's `seq_length`.
        token_list = pad_sequences([token_list], maxlen=seq_len, padding='pre')

        # Work in float64: np.random.choice checks that p sums to 1 within a
        # tight tolerance, which float32 round-off can miss.
        probs = np.asarray(model.predict(token_list, verbose=0)[0], dtype=np.float64)

        # Temperature-scaled softmax, shifted by the max for numerical stability.
        scaled = np.log(np.maximum(probs, floor)) / temperature
        scaled -= scaled.max()
        weights = np.exp(scaled)
        probs = weights / weights.sum()

        predicted_index = int(np.random.choice(len(probs), p=probs))

        output_word = tokenizer.index_word.get(predicted_index, "")
        seed_text += " " + output_word

    return seed_text

# Sample a 50-word continuation at temperature 0.7 and print it
print(generate_LSTM(seed_text="harry loves hermione", temperature=0.7, next_words=50))


harry loves hermione ” said ron impatiently — you get me ” said ron “but “knuts ” said ron but harry shared his own chat said ron had been not on a very muggle badge and join you all by your way in the library seemed in the hospital umbrella on the window
