<a href="https://colab.research.google.com/github/mohitmalviya0707/next-Word-predict/blob/main/next_word_pridect.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [8]:
df = pd.read_csv("qoute_dataset.csv")

In [9]:
df.head()

Unnamed: 0,quote,Author
0,“The world as we have created it is a process ...,Albert Einstein
1,"“It is our choices, Harry, that show what we t...",J.K. Rowling
2,“There are only two ways to live your life. On...,Albert Einstein
3,"“The person, be it gentleman or lady, who has ...",Jane Austen
4,"“Imperfection is beauty, madness is genius and...",Marilyn Monroe


In [10]:
quotes = df['quote']
quotes.head()

Unnamed: 0,quote
0,“The world as we have created it is a process ...
1,"“It is our choices, Harry, that show what we t..."
2,“There are only two ways to live your life. On...
3,"“The person, be it gentleman or lady, who has ..."
4,"“Imperfection is beauty, madness is genius and..."


In [11]:
quotes = quotes.str.lower()

In [12]:
import string
translator = str.maketrans('', '', string.punctuation)
quotes = quotes.apply(lambda x: x.translate(translator))

In [13]:
from tensorflow.keras.preprocessing.text import Tokenizer

In [52]:
from IPython.core.displayhook import tokenize
# Initialize tokenizer without num_words to capture all unique words
tokinizer = Tokenizer()
tokinizer.fit_on_texts(quotes)
# Calculate the actual vocab_size including 0 for padding
vocab_size = len(tokinizer.word_index) + 1
print(f"Calculated vocab_size: {vocab_size}")

Calculated vocab_size: 8979


In [53]:
word_index = tokinizer.word_index
print(f"Length of word_index: {len(word_index)}")
list(word_index.items())[:10]

Length of word_index: 8978


[('the', 1),
 ('you', 2),
 ('to', 3),
 ('and', 4),
 ('a', 5),
 ('i', 6),
 ('is', 7),
 ('of', 8),
 ('that', 9),
 ('it', 10)]

In [54]:
sequence = tokinizer.texts_to_sequences(quotes)

In [55]:
for i in range(3):
  print(quotes[i])

“the world as we have created it is a process of our thinking it cannot be changed without changing our thinking”
“it is our choices harry that show what we truly are far more than our abilities”
“there are only two ways to live your life one is as though nothing is a miracle the other is as though everything is a miracle”


In [56]:
for i in range(3):
  print(sequence[i])

[713, 62, 29, 19, 16, 946, 10, 7, 5, 1156, 8, 70, 293, 10, 145, 12, 809, 104, 752, 70, 2461]
[947, 7, 70, 871, 373, 9, 433, 21, 19, 465, 14, 294, 52, 54, 70, 3676]
[1337, 14, 53, 201, 714, 3, 81, 15, 36, 37, 7, 29, 329, 93, 7, 5, 1157, 1, 101, 7, 29, 329, 126, 7, 5, 3677]


In [57]:
X = []
y = []

for seq in sequence:
  for i in range(1,len(seq)):
    input_seq = seq[:i]
    output_seq = seq[i]
    X.append(input_seq)
    y.append(output_seq)

In [58]:
max_len = max(len(x) for x in X)
print(max_len)

745


In [59]:

from tensorflow.keras.preprocessing.sequence import pad_sequences
X_padded = pad_sequences(X, maxlen=max_len, padding='pre')

In [60]:

y = np.array(y)

In [61]:
X_padded.shape

(85271, 745)

In [62]:
from tensorflow.keras.utils import to_categorical
y_one_hot = to_categorical(y, num_classes=vocab_size)

In [63]:

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding,SimpleRNN,LSTM, Dense

In [64]:
embedding_dim = 50
rnn_units = 128


In [65]:

rnn_model = Sequential()

rnn_model.add(
    Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_len)
)
rnn_model.add(SimpleRNN(units=rnn_units))
rnn_model.add(Dense(units=vocab_size, activation='softmax'))



In [66]:
rnn_model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy']
)


In [67]:

rnn_model.summary()

In [68]:
print(f"Vocab_size used for LSTM model: {vocab_size}")
lstm_model = Sequential()
lstm_model.add(
    Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=max_len)
)
lstm_model.add(LSTM(units=rnn_units))
lstm_model.add(Dense(units=vocab_size, activation='softmax'))

Vocab_size used for LSTM model: 8979


In [69]:
lstm_model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy']
)


y.shape


(85271,)

In [70]:
lstm_model.summary()

In [71]:
# from tensorflow.keras.models import load_model

# lstm_model = load_model("lstm_model.h5")

In [72]:
# lstm_model.save("lstm_model.h5")

In [73]:

index_to_word = {}
for word, index in word_index.items():
  index_to_word[index] = word

In [74]:

from tensorflow.keras.preprocessing.sequence import pad_sequences

In [75]:
def predictor(model,tokenizer,text,max_len):
  text = text.lower()

  seq = tokenizer.texts_to_sequences([text])[0]
  seq = pad_sequences([seq], maxlen=max_len, padding='pre')

  pred = model.predict(seq,verbose = 0)
  print(f"Prediction array shape: {pred.shape}") # Debug print
  print(f"Prediction array min value: {pred.min()}, max value: {pred.max()}") # Debug print
  pred_index = np.argmax(pred)
  print(f"Predicted index: {pred_index}") # Debug print
  return index_to_word[pred_index]

In [76]:
seed_text = "what are you"
next_word = predictor(lstm_model,tokinizer,seed_text,max_len)
print(next_word)

Prediction array shape: (1, 8979)
Prediction array min value: 0.00011080834519816563, max value: 0.00011190954683115706
Predicted index: 4026
meantlove


In [77]:
def generate_text(model,tokenizer,seed_text,max_len,n_words):
  for _ in range(n_words):
    next_word = predictor(model,tokenizer,seed_text,max_len)
    if next_word == "":
      break
    seed_text += " " + next_word
  return seed_text


In [78]:
seed = "are you a "
generate_text = generate_text(lstm_model,tokinizer,seed,max_len,10)
print(generate_text)

Prediction array shape: (1, 8979)
Prediction array min value: 0.000110865272290539, max value: 0.00011190498480573297
Predicted index: 3755
Prediction array shape: (1, 8979)
Prediction array min value: 0.00011086854647146538, max value: 0.00011184863978996873
Predicted index: 3919
Prediction array shape: (1, 8979)
Prediction array min value: 0.00011098292452516034, max value: 0.00011180499132024124
Predicted index: 3755
Prediction array shape: (1, 8979)
Prediction array min value: 0.00011093176726717502, max value: 0.00011179229477420449
Predicted index: 8318
Prediction array shape: (1, 8979)
Prediction array min value: 0.00011097799870185554, max value: 0.00011175252438988537
Predicted index: 4012
Prediction array shape: (1, 8979)
Prediction array min value: 0.00011099549010396004, max value: 0.00011173642997164279
Predicted index: 2336
Prediction array shape: (1, 8979)
Prediction array min value: 0.00011100544361397624, max value: 0.00011173688835697249
Predicted index: 5868
Predicti