### Importing Required Libraries

In [2]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
import pickle
import numpy as np
import os

In [3]:
from google.colab import files
uploaded = files.upload()

Saving Pride and Prejudice.txt to Pride and Prejudice.txt


### Load and Preprocess data

In [4]:
file = open('Pride and Prejudice.txt', 'r', encoding='utf8')

lines = []
for i in file:
  lines.append(i)

data = ""
for i in lines:
  data = ' '.join(lines)

data = data.replace('\n','').replace('\r','').replace('\ufeff','').replace('"','').replace('*','').replace('#','').replace('[','').replace(']','')

In [5]:
data = data.split()
data = ' '.join(data)
data[:1000]

'The Project Gutenberg eBook of Pride and Prejudice, by Jane Austen This eBook is for the use of anyone anywhere in the United States and most other parts of the world at no cost and with almost no restrictions whatsoever. You may copy it, give it away or re-use it under the terms of the Project Gutenberg License included with this eBook or online at www.gutenberg.org. If you are not located in the United States, you will have to check the laws of the country where you are located before using this eBook. Title: Pride and Prejudice Author: Jane Austen Release Date: June, 1998 eBook 1342 Most recently updated: August 23, 2021 Language: English Character set encoding: UTF-8 Produced by: Anonymous Volunteers and David Widger START OF THE PROJECT GUTENBERG EBOOK PRIDE AND PREJUDICE THERE IS AN ILLUSTRATED EDITION OF THIS TITLE WHICH MAY VIEWED AT EBOOK 42671 cover Pride and Prejudice By Jane Austen CONTENTS Chapter 1 Chapter 2 Chapter 3 Chapter 4 Chapter 5 Chapter 6 Chapter 7 Chapter 8 Cha

In [6]:
len(data) # corpus

701895

## Apply Tokenization

In [7]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts([data])

pickle.dump(tokenizer, open('token.pkl', 'wb'))

In [8]:
sequence_data = tokenizer.texts_to_sequences([data])[0]    # every word has unique token
sequence_data[:10]

[1, 176, 160, 947, 3, 335, 4, 1209, 31, 74]

In [9]:
len(sequence_data)     

127051

In [10]:
vocab_size = len(tokenizer.word_index) + 1
print(vocab_size)                # unique words 

7243


In [11]:
sequences = []

for i in range(3, len(sequence_data)):
  words = sequence_data[i-3:i+1]          # next word will be predict based on previous three words (if i=3, se_da[0:4])
  sequences.append(words)

print("The length of the sequences are: ", len(sequences))
sequences = np.array(sequences)
sequences[:10]

The length of the sequences are:  127048


array([[   1,  176,  160,  947],
       [ 176,  160,  947,    3],
       [ 160,  947,    3,  335],
       [ 947,    3,  335,    4],
       [   3,  335,    4, 1209],
       [ 335,    4, 1209,   31],
       [   4, 1209,   31,   74],
       [1209,   31,   74, 2600],
       [  31,   74, 2600,   43],
       [  74, 2600,   43,  947]])

In [12]:
X = []
y = []

for i in sequences:
  X.append(i[0:3])
  y.append(i[3])

In [13]:
X = np.array(X)
y = np.array(y)

print("X : ", X[:10])
print("y : ", y[:10])

X :  [[   1  176  160]
 [ 176  160  947]
 [ 160  947    3]
 [ 947    3  335]
 [   3  335    4]
 [ 335    4 1209]
 [   4 1209   31]
 [1209   31   74]
 [  31   74 2600]
 [  74 2600   43]]
y :  [ 947    3  335    4 1209   31   74 2600   43  947]


In [14]:
y = to_categorical(y, num_classes=vocab_size)
y[:5]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [15]:
y.shape

(127048, 7243)

# Creating the Model

In [16]:
model = Sequential()
model.add(Embedding(vocab_size, 10, input_length=3))
model.add(LSTM(1000, return_sequences=True))
model.add(LSTM(1000))
model.add(Dense(1000, activation='relu'))
model.add(Dense(vocab_size, activation='softmax'))

In [17]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 3, 10)             72430     
                                                                 
 lstm (LSTM)                 (None, 3, 1000)           4044000   
                                                                 
 lstm_1 (LSTM)               (None, 1000)              8004000   
                                                                 
 dense (Dense)               (None, 1000)              1001000   
                                                                 
 dense_1 (Dense)             (None, 7243)              7250243   
                                                                 
Total params: 20,371,673
Trainable params: 20,371,673
Non-trainable params: 0
_________________________________________________________________


## Build our model

In [None]:
from tensorflow.keras.callbacks import ModelCheckpoint

checkpoint = ModelCheckpoint('next_words.h5', monitor='loss', verbose=1, save_best_only=True)
model.compile(loss='categorical_crossentropy', optimizer=Adam(learning_rate=0.001))
model.fit(X, y, epochs=70, batch_size=64, callbacks=[checkpoint])

## Prediction

In [19]:
from tensorflow.keras.models import load_model

model = load_model('next_words.h5')
tokenizer = pickle.load(open('token.pkl', 'rb'))

In [23]:
def predict_next_word(model, tokenizer, text):

  sequence = tokenizer.texts_to_sequences([text])
  sequence = np.array(sequence)

  preds = np.argmax(model.predict(sequence))
  predicted_word = ""

  for key, val in tokenizer.word_index.items():
    if val == preds:
      predicted_word = key
      break

  print(predicted_word)
  return predicted_word

In [24]:
while True:
  text = input("Enter Your Text :")

  if text == '0':
    print("Execution Completed")
    break
  
  else:
    try:
      text = text.split(' ')
      text = text[-3:]
      print(text)

      predict_next_word(model, tokenizer, text)

    except Exception as e:
      print('Error Occured: ', e)
      continue

Enter Your Text :The Project Gutenberg eBook
['Project', 'Gutenberg', 'eBook']
pride
Enter Your Text :Pride and Prejudice,
['Pride', 'and', 'Prejudice,']
a
Enter Your Text :the Project Gutenberg License
['Project', 'Gutenberg', 'License']
included
Enter Your Text : Chapter 19
['', 'Chapter', '19']
the
Enter Your Text :her he was
['her', 'he', 'was']
writing
Enter Your Text :it is a subject
['is', 'a', 'subject']
which
Enter Your Text :He paused in
['He', 'paused', 'in']
hopes
Enter Your Text :accosted by Miss Bingley
['by', 'Miss', 'Bingley']
“i
Enter Your Text :I can guess
['I', 'can', 'guess']
how
Enter Your Text :I should imagine 
['should', 'imagine', '']
”
Enter Your Text :I have been
['I', 'have', 'been']
the
Enter Your Text :My name is
['My', 'name', 'is']
all
Enter Your Text :I am all
['I', 'am', 'all']
astonishment
Enter Your Text :He was interrupted
['He', 'was', 'interrupted']
by
Enter Your Text :During dinner, Mr. Bennet scarcely
['Mr.', 'Bennet', 'scarcely']
she
Enter Your