In [2]:
import numpy as np

from tensorflow.keras.layers import Dense, Embedding, Input, LSTM
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer


In [None]:
# # Step 1: Sample Corpus
# corpus = [
#     "I love machine learning",
#     "I love deep learning",
#     "I love natural language processing",
#     "Deep learning is fun",
#     "Natural language processing is a part of AI",
#     "Machine learning is a subset of AI",
#     "AI is transforming the world"
# ]


In [6]:
# Read the whole corpus file and keep one list entry per line of text.
with open("corpus.txt", "r", encoding="utf-8") as corpus_file:
    corpus = corpus_file.read().splitlines()

# Sanity check: show the first five lines exactly as they were read.
print(corpus[:5])


['    "I love machine learning",', '    "I love deep learning",', '    "I love natural language processing",', '    "Deep learning is fun",', '    "Natural language processing is a part of AI",']


In [7]:
# Step 2: Tokenize the text
tokenizer = Tokenizer()
tokenizer.fit_on_texts(corpus)
# Word ids in word_index start at 1; the extra slot accounts for index 0,
# which is the value pad_sequences fills with.
total_words = 1 + len(tokenizer.word_index)

In [8]:
# Step 3: Create the input sequences
# For every line, emit each prefix of its token ids that has at least two
# tokens (the last token of a prefix becomes the prediction target later on).
input_sequences = [
    token_ids[:prefix_end]
    for token_ids in tokenizer.texts_to_sequences(corpus)
    for prefix_end in range(2, len(token_ids) + 1)
]

# input_sequences

In [9]:
# Step 4: Pad sequences and split into X and y
if len(input_sequences) == 0:
    # Fail with a descriptive message instead of max()'s generic one.
    raise ValueError("input_sequences is empty: the corpus produced no n-gram sequences")

max_len = max(len(seq) for seq in input_sequences)
# pad_sequences already returns a NumPy array, so no extra np.array() copy is needed.
input_sequences = pad_sequences(input_sequences, maxlen=max_len, padding='pre')

# Every token except the last one is the model input; the last token is the target.
X = input_sequences[:, :-1]
target_ids = input_sequences[:, -1]

# One-hot encode the targets directly. This yields the same float64 matrix as
# np.eye(total_words)[target_ids] without first allocating a
# total_words x total_words identity matrix.
y = np.zeros((target_ids.shape[0], total_words))
y[np.arange(target_ids.shape[0]), target_ids] = 1.0

In [10]:
# Step 5: Building the model
# The input shape is declared with an explicit Input layer rather than the
# deprecated Embedding(input_length=...) argument, so the model is built
# before summary() is called.
model = Sequential([
    Input(shape=(max_len - 1,)),                # one padded prefix of token ids
    Embedding(total_words, 10),                 # 10-dimensional vector per token id
    LSTM(100),
    Dense(total_words, activation='softmax'),   # probability for every token id
])

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()



In [11]:
# Step 6: Training the model
# Keep the returned History object so the per-epoch loss/accuracy can be
# inspected afterwards; assigning it also stops its repr from being echoed
# as the cell output.
history = model.fit(X, y, epochs=300, verbose=1)

Epoch 1/300
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 7ms/step - accuracy: 0.0563 - loss: 6.4213
Epoch 2/300
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.0729 - loss: 5.7718
Epoch 3/300
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.0714 - loss: 5.6632
Epoch 4/300
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 7ms/step - accuracy: 0.0743 - loss: 5.6007
Epoch 5/300
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.0815 - loss: 5.4032
Epoch 6/300
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.0946 - loss: 5.2095
Epoch 7/300
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step - accuracy: 0.0878 - loss: 5.1294
Epoch 8/300
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 6ms/step - accuracy: 0.0937 - loss: 4.9728
Epoch 9/300
[1m79/79[0m [32m━━━━━━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x782e0c426650>

In [12]:
# Step 7: Predict next word
def predict_next_word(seed_text):
    """Return the word the model considers most likely to follow ``seed_text``.

    Uses the module-level ``tokenizer``, ``model`` and ``max_len`` that are
    current at call time.

    Args:
        seed_text: Text whose continuation should be predicted. Words the
            tokenizer does not know are dropped by ``texts_to_sequences``;
            the remaining ids are left-padded to ``max_len - 1``.

    Returns:
        The vocabulary word with the highest predicted probability.
    """
    seed_ids = tokenizer.texts_to_sequences([seed_text])[0]
    model_input = pad_sequences([seed_ids], maxlen=max_len - 1, padding='pre')
    probabilities = model.predict(model_input, verbose=0)[0]
    # Index 0 is the padding slot and has no entry in tokenizer.index_word, so
    # an argmax landing on it would raise KeyError. Search real word ids only.
    best_index = int(np.argmax(probabilities[1:])) + 1
    return tokenizer.index_word[best_index]

In [14]:
# Test
# Build each label from the seed that is actually passed to the model, so the
# printed "Input:" text can never disagree with the prediction next to it.
for seed_text in ("i love", "why"):
    print(f"Input: {seed_text} -->", predict_next_word(seed_text))

Input: I love --> deep
Input: Deep learning --> visited


In [53]:
# Ad-hoc check with a longer seed; the returned word is shown as the cell output.
predict_next_word("and the name was")

'overfitting'

In [54]:
# Save the trained model to disk so it can be reloaded without retraining.
model.save(filepath="next_word_model.h5")




In [55]:
# Load the model
from tensorflow.keras.models import load_model

# Rebind `model` to the copy restored from the file written by the save cell.
model = load_model(filepath="next_word_model.h5")




In [56]:
# Save the tokenizer: the fitted vocabulary must be stored with the model,
# because predictions map words to ids (and back) through it.
import pickle

with open('tokenizer.pkl', 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)


In [57]:
# Load the tokenizer back from the file written by the save cell.
# NOTE: unpickling can execute arbitrary code; only load files you created yourself.
with open('tokenizer.pkl', 'rb') as tokenizer_file:
    tokenizer = pickle.load(tokenizer_file)


In [58]:
# Predicting after loading
# predict_next_word looks up the module-level `model` and `tokenizer` when it
# is called, so it now runs against the objects restored from disk. Reusing it
# avoids keeping a second copy of the tokenize/pad/predict steps in sync.
seed = "I love"
print("Predicted word:", predict_next_word(seed))


Predicted word: deep
