In [2]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


## Importing Libraries

In [1]:
import tensorflow.keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Dense, Embedding,  LSTM, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.models import load_model
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import pickle
import os

### Reading File

In [2]:
file = open("/content/drive/MyDrive/Next Word Predictor Dataset.txt", "r")
content = file.read()
content

'I hope this email finds you well\nThank you for your prompt response\nI appreciate your valuable input on this matter\nPlease find attached the document you requested\nI am writing to inform you about the upcoming event\nYour attention to detail is truly commendable\nI wanted to follow up on our recent conversation\nIt was a pleasure meeting you at the conference\nI am reaching out to discuss potential collaboration\nI wanted to express my gratitude for your support\nLet\'s schedule a meeting to go over the project details\nYour insights have been instrumental in our decision-making process\nI look forward to hearing your thoughts on this proposal\nPlease let me know if you require any further clarification\nCongratulations on your recent achievements\nI am thrilled to be a part of this exciting initiative\nYour professionalism is truly inspiring\nThe attached report contains a comprehensive analysis\nI wanted to share some exciting news with you\nI apologize for any inconvenience cau

In [3]:
tk = Tokenizer()
tk.fit_on_texts([content])

In [4]:
len(tk.word_index)

2963

In [5]:
input_sequence =[]
for text in content.split('\n'):
  for word in text:
     tokenized_sentence = tk.texts_to_sequences([text])[0]
     for i in range(1, len(tokenized_sentence)):
       input_sequence.append(tokenized_sentence[:i+1])



In [6]:
input_sequence

[[4, 1576],
 [4, 1576, 70],
 [4, 1576, 70, 1978],
 [4, 1576, 70, 1978, 1979],
 [4, 1576, 70, 1978, 1979, 5],
 [4, 1576, 70, 1978, 1979, 5, 119],
 [4, 1576],
 [4, 1576, 70],
 [4, 1576, 70, 1978],
 [4, 1576, 70, 1978, 1979],
 [4, 1576, 70, 1978, 1979, 5],
 [4, 1576, 70, 1978, 1979, 5, 119],
 [4, 1576],
 [4, 1576, 70],
 [4, 1576, 70, 1978],
 [4, 1576, 70, 1978, 1979],
 [4, 1576, 70, 1978, 1979, 5],
 [4, 1576, 70, 1978, 1979, 5, 119],
 [4, 1576],
 [4, 1576, 70],
 [4, 1576, 70, 1978],
 [4, 1576, 70, 1978, 1979],
 [4, 1576, 70, 1978, 1979, 5],
 [4, 1576, 70, 1978, 1979, 5, 119],
 [4, 1576],
 [4, 1576, 70],
 [4, 1576, 70, 1978],
 [4, 1576, 70, 1978, 1979],
 [4, 1576, 70, 1978, 1979, 5],
 [4, 1576, 70, 1978, 1979, 5, 119],
 [4, 1576],
 [4, 1576, 70],
 [4, 1576, 70, 1978],
 [4, 1576, 70, 1978, 1979],
 [4, 1576, 70, 1978, 1979, 5],
 [4, 1576, 70, 1978, 1979, 5, 119],
 [4, 1576],
 [4, 1576, 70],
 [4, 1576, 70, 1978],
 [4, 1576, 70, 1978, 1979],
 [4, 1576, 70, 1978, 1979, 5],
 [4, 1576, 70, 1978, 

In [7]:
max_length = max([len(x) for x in input_sequence])

In [8]:
max_length

21

## Generating Padding

In [9]:
padded_input_sequence = pad_sequences(input_sequence, maxlen=max_length, padding='pre')


#### Creating Input and Output

In [10]:
X = padded_input_sequence[:, :-1]
y = padded_input_sequence[:, -1]

#### Now making OHE of y to make this a classification problem as if we try to make it a regression task than outputs can be 2.1 ,3.5,etc. which can not be matched by our dictionary of tokenized words.

In [11]:
X.shape, y.shape

((3329062, 20), (3329062,))

In [None]:
y = to_categorical(y,num_classes=len(tk.word_index)+1)

In [None]:
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size=0.2)

In [None]:
model = Sequential()
model.add(Embedding(input_dim=len(tk.word_index)+1, output_dim=100, input_length=max_length-1))
model.add(LSTM(units=128,dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(units=64, activation='relu'))
model.add(Dense(len(tk.word_index)+1,activation='softmax'))

In [None]:
from keras.utils import plot_model
plot_model(model, to_file='model_plot.png', show_shapes=True, show_layer_names=True)

In [None]:
model.compile(loss='categorical_crossentropy', optimizer=Adam(learning_rate=0.01), metrics=['accuracy'])

In [None]:
model.summary()

In [None]:
history = model.fit(X_train, y_train, batch_size=1000, epochs=25, validation_data=(X_test, y_test), verbose=1)

In [None]:
text =" you must"
for i in range(4):
  token_text = tk.texts_to_sequences([text])[0]
  padded_token_text = pad_sequences([token_text], maxlen=max_length-1, padding='pre')
  pos = np.argmax(model.predict(padded_token_text))
  for word, index in tk.word_index.items():
    if index == pos:
      text += ' ' + word
text

In [None]:
|