In [None]:
# Data collection: fetch the Gutenberg corpus and dump Hamlet to a local file.

import nltk
nltk.download('gutenberg')  # make the Gutenberg corpus available to nltk.corpus

from nltk.corpus import gutenberg


# Raw, untokenized text of Hamlet as one string.
data = gutenberg.raw('shakespeare-hamlet.txt')
# Persist it so the preprocessing cells can read it back from disk.
with open('hamlet.txt', mode='w') as f:
    f.write(data)

In [None]:
# Data preprocessing
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers
from sklearn.model_selection import train_test_split


In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer

# Read the corpus back from the file written by the data-collection cell.
with open('hamlet.txt', mode='r') as f:
    text = f.read()

# Tokenization: build the word -> integer-id vocabulary from the whole corpus.
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])
# Word ids start at 1, so one extra slot is reserved for index 0 (used for padding).
total_words = 1 + len(tokenizer.word_index)
# The text is converted to integer sequences in the next cell.


In [None]:
# Build the training examples: every prefix (of at least two tokens) of every line.
input_sequences = []

# The last token of each prefix is the "next word" to predict from the tokens
# before it, e.g. ids [a, b, c] contribute [a, b] and [a, b, c].
for line in text.split('\n'):
    token_list = tokenizer.texts_to_sequences([line])[0]  # integer ids for this line
    # Prefix end positions run from 2 up to the full length of the line.
    input_sequences.extend(
        token_list[:end] for end in range(2, len(token_list) + 1)
    )

In [None]:
# Bring every n-gram to the same length so they can be stacked into one array.
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Length of the longest n-gram; all shorter ones are padded up to it.
max_sequence_length = max(map(len, input_sequences))
# 'pre' padding puts the filler at the front, keeping the target word in the last column.
input_sequences = np.array(
    pad_sequences(
        input_sequences,
        maxlen=max_sequence_length,
        padding='pre',
    )
)

In [None]:

# Separate each padded sequence into model input and target.

# Every column except the final one is the input context ...
X = input_sequences[:, :-1]
# ... and the final column is the word to predict.
y = input_sequences[:, -1]



# One-hot encode the targets
from tensorflow.keras.utils import to_categorical

# Each label becomes a vector of length total_words, matching the
# categorical cross-entropy loss used when the model is compiled.
y = to_categorical(y, num_classes=total_words)


In [None]:
# Hold out 20% of the (X, y) examples for evaluation; the fixed seed makes the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [None]:
#  Build and compile the model (training happens in a later cell)
from tensorflow.keras.models import Sequential # Import Sequential model to build the model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Dropout
# Input declares the shape of one training example,
# Embedding is for word embeddings,
# LSTM is for the recurrent layer,
# Dense is for the output layer,
# Dropout randomly sets a fraction of its inputs to 0 during training to reduce overfitting.


model = Sequential()  # Create a Sequential model

# Declare the input shape explicitly: each example holds max_sequence_length - 1
# token ids (the padded n-gram minus its final, target word). This replaces the
# deprecated `input_length` argument of Embedding and ensures the model is built
# before `model.summary()` is called, so shapes and parameter counts are shown.
model.add(Input(shape=(max_sequence_length - 1,)))

# Map each token id to a 100-dimensional dense vector
model.add(Embedding(total_words, 100))
# LSTM layer with 150 units; return_sequences=True because another LSTM layer is stacked on top
model.add(LSTM(150, return_sequences=True))

# Dropout layer for regularization between the two recurrent layers
model.add(Dropout(0.2))

# Second LSTM layer; returns only its final output, which feeds the classifier
model.add(LSTM(120))

# Dense layer with softmax activation: one probability per vocabulary entry
model.add(Dense(total_words, activation='softmax'))

# compile the model
# categorical_crossentropy matches the one-hot encoded targets

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])


model.summary()  # Print the model summary to see the architecture




In [None]:
# Early-stopping callback, used when fitting the model to limit overfitting.
from tensorflow.keras.callbacks import EarlyStopping

# Watch the validation loss; give up after 3 epochs without improvement and
# restore the weights from the best epoch seen.
early_stopping = EarlyStopping(
    monitor='val_loss',
    patience=3,
    restore_best_weights=True,
)

In [None]:

# Fit for at most 50 epochs; the early-stopping callback may end training sooner.
# NOTE: the held-out test split is also passed as the validation data here.
history = model.fit(
    x_train,
    y_train,
    epochs=50,
    validation_data=(x_test, y_test),
    callbacks=[early_stopping],
)

In [None]:
#  now predict the next word
def predict_next_word(model, tokenizer, text, max_sequence_length):
    """Predict the single most likely word to follow ``text``.

    Args:
        model: Trained model whose ``predict`` method takes a batch of padded
            token-id sequences and returns one score per vocabulary index
            for each row.
        tokenizer: Fitted tokenizer providing ``texts_to_sequences`` and the
            ``index_word`` mapping from integer id to word.
        text: Seed text whose continuation is wanted.
        max_sequence_length: Length of the padded training sequences
            *including* the target word; the model input is one shorter.

    Returns:
        The predicted word as a string, or ``None`` when ``text`` yields no
        tokens (for example an empty string, or only words the tokenizer has
        never seen) or when the winning index has no entry in
        ``tokenizer.index_word`` (such as the padding index 0).
    """
    # Tokenize the input text
    token_list = tokenizer.texts_to_sequences([text])[0]
    # With no tokens the model would only see padding, which it was never
    # trained on, so any prediction would be arbitrary.
    if not token_list:
        return None
    # Left-pad (or truncate) to the input length the model expects
    padded = pad_sequences([token_list], maxlen=max_sequence_length - 1, padding='pre')
    # Predict the scores for the next word
    predicted = model.predict(padded, verbose=0)
    # Index of the highest-scoring word for the single row in the batch
    predicted_index = int(np.argmax(predicted, axis=-1)[0])
    # .get returns None when the index is not a known word
    return tokenizer.index_word.get(predicted_index)

In [None]:
#  Now predict the next word
# The prompt gets its own name so that `text`, which still holds the full
# corpus read earlier, is not overwritten; re-running the sequence-building
# cell after this one would otherwise use the prompt instead of the corpus.
seed_text = "To be or not to be"
max_sequence_length = model.input_shape[1]+1  # Model input length plus one for the target word
predicted_word = predict_next_word(model, tokenizer, seed_text, max_sequence_length)
print(f"The next word after '{seed_text}' is '{predicted_word}'")  # Print

In [None]:
# Persist the trained model to disk.
model.save('lstm_text_prediction_model.h5')


# Persist the fitted tokenizer alongside it, so the same word <-> id
# mapping can be used when the model is loaded again.
import pickle
with open('tokenizer.pkl', mode='wb') as f:
    # HIGHEST_PROTOCOL selects the newest pickle protocol this Python supports.
    pickle.dump(
        tokenizer,
        f,
        protocol=pickle.HIGHEST_PROTOCOL,
    )
print("Model and tokenizer saved successfully.")  # Confirmation message