In [43]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.utils import to_categorical

In [44]:
text = """Artificial intelligence is transforming industries.
Machine learning models can make predictions.
Neural networks mimic the human brain.
Data preprocessing improves model accuracy.
Deep learning requires large datasets.
Feature engineering is crucial for machine learning.
Reinforcement learning trains agents through rewards.
Supervised learning uses labeled data.
Unsupervised learning finds hidden patterns.
LSTM networks handle sequential data.
Natural language processing understands text.
Computer vision detects objects in images.
Gradient descent optimizes model parameters.
Overfitting occurs when a model learns noise.
Cross-validation checks model performance."""

In [45]:
sentences = text.lower().split(".")

In [46]:
tokenizer = Tokenizer(oov_token="<OOV>")
tokenizer.fit_on_texts(sentences)

In [47]:
vocab_size = len(tokenizer.word_index) + 1
print("Vocabulary size:", vocab_size)

Vocabulary size: 72


In [48]:
labeled_sen = tokenizer.texts_to_sequences(sentences)

In [49]:
n_grams = []
for seq in labeled_sen:
    for j in range(1, len(seq)):
        n_grams.append(seq[:j+1])

In [50]:
max_len = max(len(x) for x in n_grams)
print("Max sequence length:", max_len)

Max sequence length: 7


In [51]:
padded_sen = pad_sequences(n_grams, maxlen=max_len, padding="pre")

In [52]:
X = padded_sen[:, :-1]
y = padded_sen[:, -1]

In [53]:
y = to_categorical(y, num_classes=vocab_size)

In [54]:
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=50, input_length=max_len-1))
model.add(LSTM(100))
model.add(Dense(100, activation="relu"))
model.add(Dense(vocab_size, activation="softmax"))

In [55]:
model.compile(loss="categorical_crossentropy", optimizer="adam", metrics=["accuracy"])
model.fit(X, y, epochs=200, verbose=1)

Epoch 1/200
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 59ms/step - accuracy: 0.0000e+00 - loss: 4.2773
Epoch 2/200
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step - accuracy: 0.1029 - loss: 4.2701 
Epoch 3/200
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step - accuracy: 0.0882 - loss: 4.2644 
Epoch 4/200
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step - accuracy: 0.0882 - loss: 4.2578
Epoch 5/200
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step - accuracy: 0.1029 - loss: 4.2502
Epoch 6/200
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 37ms/step - accuracy: 0.0882 - loss: 4.2417
Epoch 7/200
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 32ms/step - accuracy: 0.1029 - loss: 4.2302
Epoch 8/200
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step - accuracy: 0.0882 - loss: 4.2117
Epoch 9/200
[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x254b2ee0df0>

In [65]:
def predict_next_word(seed_text, tokenizer, model, max_len):
    seed_text = seed_text.lower().strip()
    seq = tokenizer.texts_to_sequences([seed_text])[0]

    if len(seq) == 0:
        return "❌ Input is empty!"

   
    if 1 in seq:
        return f"❌ '{seed_text}' contains words not in training data!"

  
    padded = pad_sequences([seq], maxlen=max_len-1, padding="pre")
    pred_index = model.predict(padded, verbose=0).argmax()
    return tokenizer.index_word.get(pred_index, "<OOV>")


In [67]:
print("Next word after 'sandeep':", predict_next_word("manan", tokenizer, model, max_len))
print("Next word after 'machine learning':", predict_next_word("sandeep", tokenizer, model, max_len))


Next word after 'sandeep': ❌ 'manan' contains words not in training data!
Next word after 'machine learning': ❌ 'sandeep' contains words not in training data!
