# Data Collection

In [None]:
# Step 1: Import necessary libraries
import nltk
import pandas as pd

# Step 2: Download the Gutenberg corpus from NLTK
nltk.download('gutenberg')

# Step 3: Import the Gutenberg corpus
from nltk.corpus import gutenberg

# Step 4: Load the 'Hamlet' text
data = gutenberg.raw('shakespeare-hamlet.txt')  # Raw text of Hamlet as a single string

# Step 5: Save the loaded text into a local file
# The encoding is stated explicitly: without it open() falls back to the
# locale's default encoding, which differs between platforms and may not be
# able to represent every character in the text.
with open('hamlet.txt', 'w', encoding='utf-8') as file:
    file.write(data)  # Write the full text into 'hamlet.txt'


: 

Once you have trained and saved your model and tokenizer (see the cells near the end of this notebook), you can download them to your local machine for deployment. Those final cells create a zip file containing the necessary files.

You can now download the `next_word_prediction_model.zip` file from the Colab file explorer (the folder icon on the left sidebar).

# Data Preprocessing

In [None]:
# Step 1: Import necessary libraries
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

# Step 2: Read the saved corpus and normalise it to lowercase
with open('hamlet.txt', 'r') as file:
    raw_text = file.read()
text = raw_text.lower()

# Steps 3-4: Create the tokenizer and let it learn the vocabulary.
# The whole corpus is passed as a single document.
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])

# Step 5: Vocabulary size. One is added to the number of entries in
# word_index because word indices start at 1, leaving index 0 free.
total_words = 1 + len(tokenizer.word_index)
print(f'Total unique words: {total_words}')


In [None]:
# Vocabulary size. It is printed explicitly because a bare expression is only
# displayed when it is the last statement of a cell.
print(f'Total unique words: {total_words}')

# Preview how the tokenizer maps words to integer indices. Only the first 20
# entries are shown so the output stays readable; the full mapping has one
# entry per vocabulary word.
dict(list(tokenizer.word_index.items())[:20])

In [None]:
# Tokenise every line of the corpus in one call; each entry of
# `line_token_lists` is the list of word indices for one line of text.
line_token_lists = tokenizer.texts_to_sequences(text.split('\n'))

# For each line, collect every prefix that holds at least two tokens.
# Lines with fewer than two known words therefore contribute nothing.
input_sequences = [
    tokens[:end]
    for tokens in line_token_lists
    for end in range(2, len(tokens) + 1)
]

# Report how many n-gram sequences were created
print(f'Total input sequences: {len(input_sequences)}')



Input Sequences: Teaches the model the relationship between a context and the next word.

Sentence: "to be or not to be"
Tokens: [5, 12, 3, 7, 5, 12]
Generated training pairs (context -> target):
[5] -> Target: 12
[5, 12] -> Target: 3
[5, 12, 3] -> Target: 7
[5, 12, 3, 7] -> Target: 5
[5, 12, 3, 7, 5] -> Target: 12

In [None]:
# Length of the longest n-gram sequence. The sequences have different lengths,
# and this value is the common length they are padded to in the next step.
max_sequence_len = max(map(len, input_sequences))
print(f"Maximum sequence length: {max_sequence_len}")

### **Why is padding needed?**

**LSTM Requires Fixed-Length Input:**
- LSTM or RNN models require input sequences to be of the same length.
- However, the n-gram sequences we created have different lengths.

**Padding Solves the Problem:**
- We add zeros at the beginning of shorter sequences to make their length equal to `max_sequence_len`.

**Example:**
- Original sequence: `[5, 12, 3]`
- Max length = 6
- After padding: `[0, 0, 0, 5, 12, 3]`

**The Model Understands:**
- The model learns to ignore the zero padding and focuses on learning to predict the next word from the proper context.

In [None]:
# Step 1: Import pad_sequences (also imported in an earlier cell)
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Step 2: Left-pad every sequence with zeros up to max_sequence_len
padded_sequences = pad_sequences(
    input_sequences,
    maxlen=max_sequence_len,  # Common length for all sequences
    padding='pre',            # Zeros go in front of the real tokens
)
input_sequences = np.array(padded_sequences)

# Step 3: Show the first 5 rows to see the padding
print(input_sequences[:5])


In [None]:
# Step 1: Import TensorFlow; the `tf` alias is used by later cells
import tensorflow as tf

# Step 2: Separate each padded sequence into predictors and label
#   x -> every column except the last (the context words)
#   y -> the last column (the word to predict)
x, y = input_sequences[:, :-1], input_sequences[:, -1]

# Step 3: Check shapes
print(f'X shape: {x.shape}')
print(f'y shape: {y.shape}')


### **Why is this step needed?**

**Predictors (X):**
- The model learns to predict the next word from these sequences.
- Example: `[0, 0, 0, 5, 12]` → The model will predict the next word based on this context.

**Label (y):**
- This is the target word that the model needs to predict.
- Example: From the sequence `[0, 0, 0, 5, 12, 3]`, the number `3` will be the target.

**Training-ready format:**
- Next, we one-hot encode `y`, because the model's softmax output is categorical, with one class per word in the vocabulary.
- This creates the final format needed for training the neural network.

In [None]:
# Step 1: One-hot encode the target labels
# The integer labels are read from the last column of `input_sequences`
# rather than re-encoding `y` in place, so re-running this cell always gives
# the same result instead of one-hot encoding an already encoded array.
y = tf.keras.utils.to_categorical(input_sequences[:, -1], num_classes=total_words)
# Converts each integer label into a vector of length = total_words
# Example: if total_words = 100, label 5 → [0,0,0,0,0,1,0,...,0]

# Step 2: Check the shape of y
print(f'One-hot encoded y shape: {y.shape}')


# Split the data into training and testing sets

In [None]:
# Split the data into training and testing sets (80% / 20%).
# A fixed random_state makes the shuffle deterministic, so the same rows end
# up in each set on every run and results can be compared between runs.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Model Training

In [None]:
# Step 1: Import EarlyStopping callback from Keras
from tensorflow.keras.callbacks import EarlyStopping

# Step 2: Configure the callback
#   monitor='val_loss'         -> watch the validation loss during training
#   patience=3                 -> stop once val_loss has not improved for 3 consecutive epochs
#   restore_best_weights=True  -> after stopping, go back to the weights of the best epoch
early_stopping = EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Step 3: Why use it?
# - Limits overfitting by ending training early
# - Saves time by skipping epochs that no longer help
# - Keeps the best weights seen during training


In [None]:
# Step 1: Import necessary layers and model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense, Dropout

# Step 2: Define the Sequential model
model = Sequential()

# Step 3: Declare the input shape and add the Embedding layer
# The sequence length (excluding the target word) is declared with an Input
# layer. The Embedding argument `input_length` is deprecated in Keras 3, and
# with an explicit Input layer the model is built as soon as layers are added,
# so no separate model.build(...) call is needed before summary().
model.add(Input(shape=(max_sequence_len - 1,)))
model.add(Embedding(
    input_dim=total_words,     # Vocabulary size
    output_dim=100             # Embedding vector size
))
# Embedding layer converts word indices into dense vectors

# Step 4: Add first LSTM layer
model.add(LSTM(150, return_sequences=True))
# return_sequences=True because another LSTM is stacked on top

# Step 5: Add Dropout layer to prevent overfitting
model.add(Dropout(0.2))

# Step 6: Add second LSTM layer
model.add(LSTM(100))
# return_sequences=False by default, outputs the last hidden state

# Step 7: Add Dense output layer with softmax activation
model.add(Dense(total_words, activation="softmax"))
# Predicts probability for each word in the vocabulary

# Step 8: Compile the model
model.compile(
    loss="categorical_crossentropy",  # Matches the one-hot encoded targets
    optimizer='adam',                 # Adam optimizer
    metrics=['accuracy']              # Track accuracy during training
)

# Step 9: Show model summary
model.summary()

In [None]:
# ## GRU RNN
# NOTE: to run this variant, GRU must also be imported: `from tensorflow.keras.layers import GRU`
# ## Define the model
# model=Sequential()
# model.add(Embedding(total_words,100,input_length=max_sequence_len-1))
# model.add(GRU(150,return_sequences=True))
# model.add(Dropout(0.2))
# model.add(GRU(100))
# model.add(Dense(total_words,activation="softmax"))

# # #Compile the model
# model.compile(loss="categorical_crossentropy",optimizer='adam',metrics=['accuracy'])
# model.build(input_shape=(None, max_sequence_len-1))
# model.summary()

In [None]:
# Train for up to 50 epochs, evaluating on the held-out split after each one.
# The early_stopping callback configured in an earlier cell is passed in here;
# unless it is handed to fit() it has no effect and all 50 epochs always run.
history = model.fit(
    x_train,
    y_train,
    epochs=50,
    validation_data=(x_test, y_test),
    callbacks=[early_stopping],
    verbose=1,
)

In [None]:
# Function to predict the next word
def predict_next_word(model, tokenizer, text, max_sequence_len):
    """Predict the word most likely to follow `text`.

    Args:
        model: Trained model exposing `predict(batch, verbose=0)`. Its output
            is treated as one row of scores per input sequence, indexed by
            word id.
        tokenizer: Fitted tokenizer providing `texts_to_sequences` and the
            `index_word` mapping (word id -> word).
        text: Seed text to continue.
        max_sequence_len: Sequence length including the target word; the model
            input therefore holds `max_sequence_len - 1` tokens.

    Returns:
        The predicted word, or None when `text` contains no token known to the
        tokenizer or when the predicted id has no vocabulary entry.
    """
    token_list = tokenizer.texts_to_sequences([text])[0]
    if not token_list:
        # Nothing but padding would reach the model, so there is no context
        # to predict from.
        return None

    context_len = max_sequence_len - 1
    if len(token_list) > context_len:
        # Keep only the most recent tokens that fit into the model input
        token_list = token_list[-context_len:]
    padded = pad_sequences([token_list], maxlen=context_len, padding='pre')

    predicted = model.predict(padded, verbose=0)
    # argmax yields a one-element array for the single input row; convert it
    # to a plain int so it can be used as a dictionary key.
    predicted_word_index = int(np.argmax(predicted, axis=1)[0])

    # Direct id -> word lookup instead of scanning the whole vocabulary
    return tokenizer.index_word.get(predicted_word_index)

In [None]:
# The model input holds one token fewer than a full sequence (the target word
# is excluded), so 1 is added to recover the length predict_next_word expects
max_sequence_len = 1 + model.input_shape[1]

# Seed text for a quick check of the trained model
input_text = "I love AI "
print(f"Input text:{input_text}")

next_word = predict_next_word(
    model=model,
    tokenizer=tokenizer,
    text=input_text,
    max_sequence_len=max_sequence_len,
)
print(f"Next Word PRediction:{next_word}")

In [None]:
import pickle

## Save the model
model.save("next_word_lstm.keras")

## Save the tokenizer
# Pickled with the highest protocol available in this Python version
with open('tokenizer.pickle', mode='wb') as pickle_file:
    pickle.dump(tokenizer, pickle_file, protocol=pickle.HIGHEST_PROTOCOL)

In [None]:
# Recover the full sequence length from the model: its input excludes the
# target word, hence the +1
max_sequence_len = 1 + model.input_shape[1]

# Seed text for the prediction
input_text = "  Barn. Last night of all,When yond same"
print(f"Input text:{input_text}")

next_word = predict_next_word(
    model=model,
    tokenizer=tokenizer,
    text=input_text,
    max_sequence_len=max_sequence_len,
)
print(f"Next Word PRediction:{next_word}")

In [None]:
# Create a zip archive of the saved model files and download it

In [None]:
import os

# Show what is in the current working directory
working_dir_entries = os.listdir(os.curdir)
print(working_dir_entries)

In [None]:
import zipfile

# Files to include in the archive and the archive's name
files_to_zip = ['next_word_lstm.keras', 'tokenizer.pickle']
zip_filename = 'next_word_prediction_model.zip'

# Create the zip archive. ZIP_DEFLATED is requested explicitly because the
# default (ZIP_STORED) only stores the files without compressing them.
with zipfile.ZipFile(zip_filename, 'w', compression=zipfile.ZIP_DEFLATED) as zipf:
    for file in files_to_zip:
        zipf.write(file)  # Add file to zip

print(f"Created {zip_filename} containing the model and tokenizer.")


In [None]:
# Trigger a browser download of the archive when running in Google Colab.
# The google.colab package only exists inside Colab; the import is guarded so
# that running the whole notebook elsewhere does not stop at this cell.
try:
    from google.colab import files
except ImportError:
    print(f"google.colab is not available; {zip_filename} was left in the working directory.")
else:
    files.download(zip_filename)


In [None]:
from importlib import metadata

# Distribution names of the third-party packages used by this project.
# scikit-learn is listed under its distribution name rather than its import
# name `sklearn`, and the standard-library module `pickle` is left out because
# it is not an installable distribution; both would otherwise always be
# reported as "Not installed".
packages = ['nltk', 'pandas', 'numpy', 'tensorflow', 'scikit-learn', 'streamlit']

# Print the version of each package if it's installed.
# importlib.metadata is the standard-library replacement for the deprecated
# pkg_resources API.
for package in packages:
    try:
        print(f"{package}: {metadata.version(package)}")
    except metadata.PackageNotFoundError:
        print(f"{package}: Not installed")

In [None]:
# NOTE(review): this cell repeats the version report of the previous cell;
# consider keeping only one of them.
from importlib import metadata

# Distribution names of the third-party packages used by this project.
# `sklearn` is replaced by its distribution name scikit-learn, and the
# standard-library module `pickle` is omitted; neither is a distribution
# name, so both would always be reported as "Not installed".
packages = ['nltk', 'pandas', 'numpy', 'tensorflow', 'scikit-learn', 'streamlit']

# Print the version of each package if it's installed, using the
# standard-library importlib.metadata instead of the deprecated pkg_resources.
for package in packages:
    try:
        print(f"{package}: {metadata.version(package)}")
    except metadata.PackageNotFoundError:
        print(f"{package}: Not installed")

In [None]:
# Report the TensorFlow version available in this environment
import tensorflow as tf

tensorflow_version = tf.__version__
print(tensorflow_version)


In [None]:
# Write the list of packages installed in this environment, with their
# versions, to requirements.txt (shell command run through the notebook)
!pip freeze > requirements.txt


In [None]:
# Save the tokenizer in JSON format
# The JSON string produced by the tokenizer is written as UTF-8 text.
with open("tokenizer.json", mode="w", encoding="utf-8") as json_file:
    json_file.write(tokenizer.to_json())
