In [1]:
#Next-word prediction: given a sequence of words, predict the most likely word to follow it, using a Recurrent Neural Network (RNN).


In [2]:
#Get the data

#Download the plain-text edition of "The Adventures of Sherlock Holmes" from Project Gutenberg.
#The -O option makes wget save the download as book.txt in the current working directory.
!wget https://www.gutenberg.org/files/1661/1661-0.txt -O book.txt

--2021-02-17 11:26:09--  https://www.gutenberg.org/files/1661/1661-0.txt
Resolving www.gutenberg.org (www.gutenberg.org)... 152.19.134.47, 2610:28:3090:3000:0:bad:cafe:47
Connecting to www.gutenberg.org (www.gutenberg.org)|152.19.134.47|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 607792 (594K) [text/plain]
Saving to: ‘book.txt’


2021-02-17 11:26:10 (610 KB/s) - ‘book.txt’ saved [607792/607792]



In [3]:
#Read the whole book into memory and convert it to lower case.
#The context manager closes the file handle as soon as the read has finished.
with open('book.txt','r',encoding='utf-8') as book_file:
  text=book_file.read().lower()

In [4]:
#At this point the corpus is one long string; break it on newline characters.
#Each element is one physical line of the book (blank lines become empty strings).
sentences = text.split(sep='\n')

In [5]:
# We want to fit a tokenizer available in keras. Will build a dictionary of words where each word will have a unique integer identifier

import tensorflow as tf
import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [6]:
#Create the Keras tokenizer. Words that are not in the fitted vocabulary
#are mapped to the '<UNK>' (unknown) token.
tokenizer = Tokenizer(oov_token='<UNK>')

In [7]:
#Build the tokenizer's word -> integer vocabulary from the lines of the book
tokenizer.fit_on_texts(texts=sentences)

In [8]:
#Size of the vocabulary. tokenizer.word_index maps every known word to its integer index.
#The +1 accounts for index 0, which Keras never assigns to a word and which is used for padding.
#(The '<UNK>' out-of-vocabulary token already has its own entry in word_index.)
vocab_size = len(tokenizer.word_index) + 1
vocab_size

8933

In [9]:
#Replace every word with its integer index, producing one list of indices per line
sequences = tokenizer.texts_to_sequences(sentences)

In [10]:
#Expand every tokenized line into its n-gram prefixes of length 2 and up.
#Example: [23,15,5,6,4052] -> [23,15], [23,15,5], [23,15,5,6], [23,15,5,6,4052]
#Lines with fewer than two tokens contribute nothing.
input_sequences=[
  token_ids[:end]
  for token_ids in sequences
  for end in range(2,len(token_ids)+1)
]

In [11]:
#Inspect the generated n-gram sequences (note: this displays the entire list, which is long)
input_sequences

[[146, 4790],
 [146, 4790, 2],
 [146, 4790, 2, 1021],
 [146, 4790, 2, 1021, 5],
 [146, 4790, 2, 1021, 5, 129],
 [146, 4790, 2, 1021, 5, 129, 35],
 [146, 4790, 2, 1021, 5, 129, 35, 46],
 [146, 4790, 2, 1021, 5, 129, 35, 46, 612],
 [146, 4790, 2, 1021, 5, 129, 35, 46, 612, 2236],
 [146, 4790, 2, 1021, 5, 129, 35, 46, 612, 2236, 2237],
 [31, 1022],
 [31, 1022, 16],
 [31, 1022, 16, 24],
 [31, 1022, 16, 24, 2],
 [31, 1022, 16, 24, 2, 276],
 [31, 1022, 16, 24, 2, 276, 5],
 [31, 1022, 16, 24, 2, 276, 5, 395],
 [31, 1022, 16, 24, 2, 276, 5, 395, 2238],
 [31, 1022, 16, 24, 2, 276, 5, 395, 2238, 22],
 [31, 1022, 16, 24, 2, 276, 5, 395, 2238, 22, 52],
 [31, 1022, 16, 24, 2, 276, 5, 395, 2238, 22, 52, 1677],
 [31, 1022, 16, 24, 2, 276, 5, 395, 2238, 22, 52, 1677, 3],
 [31, 1022, 16, 24, 2, 276, 5, 395, 2238, 22, 52, 1677, 3, 19],
 [573, 52],
 [573, 52, 3399],
 [573, 52, 3399, 3400],
 [573, 52, 3399, 3400, 14],
 [573, 52, 3399, 3400, 14, 76],
 [573, 52, 3399, 3400, 14, 76, 818],
 [573, 52, 3399, 34

In [12]:
#The n-gram sequences do not share a common length, e.g. the first two:
print(input_sequences[0], input_sequences[1])

[146, 4790] [146, 4790, 2]


In [13]:
#An RNN has to be trained on input sequences of one fixed length, so the shorter
#sequences will be padded up to the length of the longest one. Find that length here.
max_seq_len = max(map(len, input_sequences))
max_seq_len

20

In [14]:
#Pad every n-gram sequence with zeros up to max_seq_len and keep the result
padded_sequences = pad_sequences(input_sequences, maxlen=max_seq_len)

In [15]:
#After padding, every sequence has the maximum sequence length, i.e. 20
print(padded_sequences[0], padded_sequences[1])

[   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0    0  146 4790] [   0    0    0    0    0    0    0    0    0    0    0    0    0    0
    0    0    0  146 4790    2]


In [16]:
#Next step: turn the padded sequences into (input, label) pairs.
#The last index of each sequence is the output label; everything before it is the input.

#Make sure the padded sequences are a NumPy array so they can be sliced by column
import numpy as np
padded_sequences = np.array(padded_sequences)

In [17]:
#Inputs: every row (training example), all columns except the last one
x = padded_sequences[:, :-1]
#Labels: every row, only the last column (the word to be predicted)
labels = padded_sequences[:, -1]

In [18]:
#One-hot encode the labels: one column per vocabulary index
y = tf.keras.utils.to_categorical(labels, num_classes=vocab_size)
y.shape

(101619, 8933)

In [19]:
#Each input row holds max_seq_len-1 = 19 token indices
x.shape

(101619, 19)

In [20]:
#Build the model

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense
from tensorflow.keras.optimizers import Adam

model = Sequential([
  #Represent every vocabulary index as a 100-dimensional vector
  Embedding(vocab_size, 100, input_length=max_seq_len-1),
  #Process the sequence in both directions (forward and backward), 256 LSTM units each
  Bidirectional(LSTM(256)),
  #Categorical output: one softmax unit per vocabulary index
  Dense(vocab_size, activation='softmax'),
])
adam = Adam(learning_rate=0.01)
model.compile(optimizer=adam, loss='categorical_crossentropy', metrics=['acc'])
model.summary()


Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 19, 100)           893300    
_________________________________________________________________
bidirectional (Bidirectional (None, 512)               731136    
_________________________________________________________________
dense (Dense)                (None, 8933)              4582629   
Total params: 6,207,065
Trainable params: 6,207,065
Non-trainable params: 0
_________________________________________________________________


In [21]:
#Early-stopping callback: monitor the training accuracy ('acc') and stop training
#when it no longer improves by at least 0.01 (one percentage point)
from tensorflow.keras.callbacks import EarlyStopping
es = EarlyStopping(monitor='acc', min_delta=0.01)

In [22]:
#Train for up to 50 epochs; the early-stopping callback may end training sooner
model.fit(
  x,
  y,
  batch_size=512,
  epochs=50,
  verbose=1,
  callbacks=[es],
)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50


<tensorflow.python.keras.callbacks.History at 0x7f10500c1160>

In [23]:
#Time to become storyteller
seed_text="I could not help laughing at the ease with which he explained his process of deduction"
next_words=100 #Predict the next 100 words after the sequence given in seed_text

for _ in range(next_words):
  #Encode the text generated so far and pad/truncate it to the model's input length
  sequence=tokenizer.texts_to_sequences([seed_text])
  padded=pad_sequences(sequence,maxlen=max_seq_len-1)
  #model.predict_classes was removed from Keras; take the arg-max of the softmax output instead
  predicted=int(model.predict(padded,verbose=0).argmax(axis=-1)[0])
  #index_word is the tokenizer's index -> word lookup; an index without a word
  #(e.g. the padding index 0) falls back to an empty string
  output_word=tokenizer.index_word.get(predicted,'')
  seed_text+=' ' + output_word
print(seed_text)



I could not help laughing at the ease with which he explained his process of deduction “when i got into it that he had known so and i could not see a little difficulties to my wife and i have not a man of the whole house ’ said he said that i had been quite a day just at the day and i take it not a very good friend of a cold beef and a very sweet manner he must be a woman ” striding the smile hardened of a man in a pew at the man with a smile i am not a man of strong character with a cry of a part
