# Next Word Prediction using RNN's

In [6]:
import tensorflow as tf
print(tf.__version__)

2.8.0


Importing Libraries and Data

In [7]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam
import pickle
import numpy as np
import os

In [8]:
file = open("C:\\Users\\Parth Salke\\Downloads\\Datasets\\metamorphosis-clean.txt", "r", encoding = "utf8")
lines = []

In [9]:
for i in file:
    lines.append(i)
    
print("The First Line: ", lines[0])
print("The Last Line: ", lines[-1])

The First Line:  ï»¿One morning, when Gregor Samsa woke from troubled dreams, he found

The Last Line:  first to get up and stretch out her young body.


Cleaning the Data

In [10]:
data = ""

for i in lines:
    data = ' '. join(lines)
    
data = data.replace('\n', '').replace('\r', '').replace('\ufeff', '')
data[:360]

'One morning, when Gregor Samsa woke from troubled dreams, he found himself transformed in his bed into a horrible vermin.  He lay on his armour-like back, and if he lifted his head a little he could see his brown belly, slightly domed and divided by arches into stiff sections.  The bedding was hardly able to cover it and seemed ready to slide off any moment.'

In [11]:
import string

translator = str.maketrans(string.punctuation, ' '*len(string.punctuation)) #map punctuation to space
new_data = data.translate(translator)

new_data[:500]

'One morning  when Gregor Samsa woke from troubled dreams  he found himself transformed in his bed into a horrible vermin   He lay on his armour like back  and if he lifted his head a little he could see his brown belly  slightly domed and divided by arches into stiff sections   The bedding was hardly able to cover it and seemed ready to slide off any moment   His many legs  pitifully thin compared with the size of the rest of him  waved about helplessly as he looked    What s happened to me   he'

In [12]:
z = []

for i in data.split():
    if i not in z:
        z.append(i)
        
data = ' '.join(z)
data[:500]

'One morning, when Gregor Samsa woke from troubled dreams, he found himself transformed in his bed into a horrible vermin. He lay on armour-like back, and if lifted head little could see brown belly, slightly domed divided by arches stiff sections. The bedding was hardly able to cover it seemed ready slide off any moment. His many legs, pitifully thin compared with the size of rest him, waved about helplessly as looked. "What\'s happened me?" thought. It wasn\'t dream. room, proper human room altho'

Labeling

In [13]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts([data])

# saving the tokenizer for predict function.
pickle.dump(tokenizer, open('tokenizer1.pkl', 'wb'))

sequence_data = tokenizer.texts_to_sequences([data])[0]
sequence_data[:10]

[17, 53, 293, 2, 18, 729, 135, 730, 294, 8]

In [14]:
vocab_size = len(tokenizer.word_index) + 1
print(vocab_size)

2617


In [15]:
sequences = []

for i in range(1, len(sequence_data)):
    words = sequence_data[i-1:i+1]
    sequences.append(words)
    
print("The Length of sequences are: ", len(sequences))
sequences = np.array(sequences)
sequences[:10]

The Length of sequences are:  3889


array([[ 17,  53],
       [ 53, 293],
       [293,   2],
       [  2,  18],
       [ 18, 729],
       [729, 135],
       [135, 730],
       [730, 294],
       [294,   8],
       [  8, 731]])

In [16]:
X = []
y = []

for i in sequences:
    X.append(i[0])
    y.append(i[1])
    
X = np.array(X)
y = np.array(y)

In [17]:
print("The Data is: ", X[:5])
print("The responses are: ", y[:5])

The Data is:  [ 17  53 293   2  18]
The responses are:  [ 53 293   2  18 729]


In [18]:
y = to_categorical(y, num_classes=vocab_size)
y[:5]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

Creating RNN/LSTM Model

In [19]:
model = Sequential()
model.add(Embedding(vocab_size, 10, input_length=1))
model.add(LSTM(1000, return_sequences=True))
model.add(LSTM(1000))
model.add(Dense(1000, activation="relu"))
model.add(Dense(vocab_size, activation="softmax"))

In [20]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 1, 10)             26170     
                                                                 
 lstm (LSTM)                 (None, 1, 1000)           4044000   
                                                                 
 lstm_1 (LSTM)               (None, 1000)              8004000   
                                                                 
 dense (Dense)               (None, 1000)              1001000   
                                                                 
 dense_1 (Dense)             (None, 2617)              2619617   
                                                                 
Total params: 15,694,787
Trainable params: 15,694,787
Non-trainable params: 0
_________________________________________________________________


In [23]:
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.callbacks import ReduceLROnPlateau
from tensorflow.keras.callbacks import TensorBoard

checkpoint = ModelCheckpoint("nextword1.h5", monitor='loss', verbose=1,
    save_best_only=True, mode='auto')

reduce = ReduceLROnPlateau(monitor='loss', factor=0.2, patience=3, min_lr=0.0001, verbose = 1)

logdir='logsnextword1'
tensorboard_Visualization = TensorBoard(log_dir=logdir)

In [25]:
model.compile(loss="categorical_crossentropy", optimizer=Adam(learning_rate=0.001))

In [26]:
model.fit(X, y, epochs=150, batch_size=64, callbacks=[checkpoint, reduce, tensorboard_Visualization])

Epoch 1/150
Epoch 1: loss improved from inf to 7.87530, saving model to nextword1.h5
Epoch 2/150
Epoch 2: loss improved from 7.87530 to 7.86287, saving model to nextword1.h5
Epoch 3/150
Epoch 3: loss improved from 7.86287 to 7.82678, saving model to nextword1.h5
Epoch 4/150
Epoch 4: loss improved from 7.82678 to 7.68016, saving model to nextword1.h5
Epoch 5/150
Epoch 5: loss improved from 7.68016 to 7.48601, saving model to nextword1.h5
Epoch 6/150
Epoch 6: loss improved from 7.48601 to 7.30528, saving model to nextword1.h5
Epoch 7/150
Epoch 7: loss improved from 7.30528 to 7.14841, saving model to nextword1.h5
Epoch 8/150
Epoch 8: loss improved from 7.14841 to 6.95563, saving model to nextword1.h5
Epoch 9/150
Epoch 9: loss improved from 6.95563 to 6.71176, saving model to nextword1.h5
Epoch 10/150
Epoch 10: loss improved from 6.71176 to 6.46713, saving model to nextword1.h5
Epoch 11/150
Epoch 11: loss improved from 6.46713 to 6.24786, saving model to nextword1.h5
Epoch 12/150
Epoch 12

Epoch 36/150
Epoch 36: loss improved from 3.33161 to 3.22039, saving model to nextword1.h5
Epoch 37/150
Epoch 37: loss improved from 3.22039 to 3.11089, saving model to nextword1.h5
Epoch 38/150
Epoch 38: loss improved from 3.11089 to 3.06914, saving model to nextword1.h5
Epoch 39/150
Epoch 39: loss improved from 3.06914 to 2.99223, saving model to nextword1.h5
Epoch 40/150
Epoch 40: loss improved from 2.99223 to 2.87915, saving model to nextword1.h5
Epoch 41/150
Epoch 41: loss improved from 2.87915 to 2.81954, saving model to nextword1.h5
Epoch 42/150
Epoch 42: loss improved from 2.81954 to 2.72969, saving model to nextword1.h5
Epoch 43/150
Epoch 43: loss improved from 2.72969 to 2.67107, saving model to nextword1.h5
Epoch 44/150
Epoch 44: loss improved from 2.67107 to 2.63837, saving model to nextword1.h5
Epoch 45/150
Epoch 45: loss improved from 2.63837 to 2.54965, saving model to nextword1.h5
Epoch 46/150
Epoch 46: loss improved from 2.54965 to 2.47312, saving model to nextword1.h5

Epoch 70: loss improved from 1.73643 to 1.69514, saving model to nextword1.h5
Epoch 71/150
Epoch 71: loss improved from 1.69514 to 1.67729, saving model to nextword1.h5
Epoch 72/150
Epoch 72: loss did not improve from 1.67729
Epoch 73/150
Epoch 73: loss improved from 1.67729 to 1.67285, saving model to nextword1.h5
Epoch 74/150
Epoch 74: loss improved from 1.67285 to 1.63470, saving model to nextword1.h5
Epoch 75/150
Epoch 75: loss improved from 1.63470 to 1.61047, saving model to nextword1.h5
Epoch 76/150
Epoch 76: loss improved from 1.61047 to 1.59378, saving model to nextword1.h5
Epoch 77/150
Epoch 77: loss improved from 1.59378 to 1.56837, saving model to nextword1.h5
Epoch 78/150
Epoch 78: loss improved from 1.56837 to 1.55563, saving model to nextword1.h5
Epoch 79/150
Epoch 79: loss did not improve from 1.55563
Epoch 80/150
Epoch 80: loss improved from 1.55563 to 1.53295, saving model to nextword1.h5
Epoch 81/150
Epoch 81: loss improved from 1.53295 to 1.51282, saving model to ne

Epoch 106/150
Epoch 106: loss improved from 1.16136 to 1.14440, saving model to nextword1.h5
Epoch 107/150
Epoch 107: loss improved from 1.14440 to 1.12664, saving model to nextword1.h5
Epoch 108/150
Epoch 108: loss improved from 1.12664 to 1.12423, saving model to nextword1.h5
Epoch 109/150
Epoch 109: loss improved from 1.12423 to 1.11535, saving model to nextword1.h5
Epoch 110/150
Epoch 110: loss did not improve from 1.11535
Epoch 111/150
Epoch 111: loss improved from 1.11535 to 1.11453, saving model to nextword1.h5
Epoch 112/150
Epoch 112: loss improved from 1.11453 to 1.09772, saving model to nextword1.h5
Epoch 113/150
Epoch 113: loss did not improve from 1.09772
Epoch 114/150
Epoch 114: loss improved from 1.09772 to 1.09393, saving model to nextword1.h5
Epoch 115/150
Epoch 115: loss improved from 1.09393 to 1.07381, saving model to nextword1.h5
Epoch 116/150
Epoch 116: loss did not improve from 1.07381
Epoch 117/150
Epoch 117: loss did not improve from 1.07381
Epoch 118/150
Epoch 

Epoch 141/150
Epoch 141: loss improved from 0.63860 to 0.63835, saving model to nextword1.h5
Epoch 142/150
Epoch 142: loss improved from 0.63835 to 0.63832, saving model to nextword1.h5
Epoch 143/150
Epoch 143: loss did not improve from 0.63832
Epoch 144/150
Epoch 144: loss improved from 0.63832 to 0.63788, saving model to nextword1.h5
Epoch 145/150
Epoch 145: loss improved from 0.63788 to 0.63672, saving model to nextword1.h5
Epoch 146/150
Epoch 146: loss did not improve from 0.63672
Epoch 147/150
Epoch 147: loss improved from 0.63672 to 0.63671, saving model to nextword1.h5
Epoch 148/150
Epoch 148: loss improved from 0.63671 to 0.63635, saving model to nextword1.h5
Epoch 149/150
Epoch 149: loss improved from 0.63635 to 0.63578, saving model to nextword1.h5
Epoch 150/150
Epoch 150: loss did not improve from 0.63578


<keras.callbacks.History at 0x27326d95a30>

# Predictions code:

In [33]:
# Importing the Libraries

from tensorflow.keras.models import load_model
import numpy as np
import pickle

# Load the model and tokenizer

model = load_model('nextword1.h5')
tokenizer = pickle.load(open('tokenizer1.pkl', 'rb'))

def Predict_Next_Words(model, tokenizer, text):
    """
        In this function we are using the tokenizer and models trained
        and we are creating the sequence of the text entered and then
        using our model to predict and return the the predicted word.
    
    """
    for i in range(3):
        sequence = tokenizer.texts_to_sequences([text])[0]
        sequence = np.array(sequence)
        
        preds = model.predict_classes(sequence)
        #print(preds)
        predicted_word = ""
        
        for key, value in tokenizer.word_index.items():
            if value == preds:
                predicted_word = key
                break
        
        print(predicted_word)
        return predicted_word

In [34]:
"""
    We are testing our model and we will run the model
    until the user decides to stop the script.
    While the script is running we try and check if 
    the prediction can be made on the text. If no
    prediction can be made we just continue.

"""

# text1 = "at the dull"
# text2 = "collection of textile"
# text3 = "what a strenuous"
# text4 = "stop the script"

while(True):

    text = input("Enter your line: ")
    
    if text == "stop the script":
        print("Ending The Program.....")
        break
    
    else:
        try:
            text = text.split(" ")
            text = text[-1]

            text = ''.join(text)
            Predict_Next_Words(model, tokenizer, text)
            
        except:
            continue

Enter your line: at your service
Enter your line: at the dull
Enter your line: of textile
Enter your line: stop the script
Ending The Program.....
