In [1]:
# Read the entire book into memory as one string; UTF-8 is given explicitly
# so the result does not depend on the platform's default encoding.
with open('book.txt', 'r', encoding = 'utf-8') as book :
  text = book.read()

In [2]:
# Normalise case so that e.g. "The" and "the" count as the same word.
text = text.lower()

In [3]:
# Treat every physical line of the file as one training "sentence".
# Blank / whitespace-only lines stay in the list (see the preview output).
sentences = text.split('\n')
sentences[:3]  # preview the first three lines

['the project gutenberg ebook of the adventures of sherlock holmes',
 '    ',
 'this ebook is for the use of anyone anywhere in the united states and']

In [4]:
import tensorflow as tf

In [5]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer

In [6]:
# With `oov_token` set, words that were not seen during fitting are mapped to
# the '<UNK>' id when text is converted to sequences, instead of being
# silently dropped.
tokenizer = Tokenizer(oov_token = '<UNK>')
# Build the word -> id vocabulary from every line of the book.
tokenizer.fit_on_texts(sentences)


In [7]:
# Inspect the container that holds the fitted vocabulary.
type(tokenizer.word_index)

dict

In [8]:
# Look up the integer id assigned to the out-of-vocabulary token.
tokenizer.word_index['<UNK>']

1

In [9]:
# Id 0 is not in word_index (it is the value used for padding), so the model
# needs one more slot than the number of entries in the vocabulary.
vocab_size = 1 + len(tokenizer.word_index)
vocab_size

8921

In [10]:
# Convert each line into its list of integer word ids (one list per line).
sequences = tokenizer.texts_to_sequences(sentences)

In [11]:
# Next, build the training sequences: for every line, generate each prefix of its word ids (word order is preserved).

In [12]:
# For every tokenised line emit all of its prefixes of length >= 2, e.g.
# [a, b, c] -> [a, b], [a, b, c].  Lines with fewer than two ids yield nothing.
input_sequences = [
    token_ids[:end]
    for token_ids in sequences
    for end in range(2, len(token_ids) + 1)
]

In [13]:
# Preview the first three generated prefixes.
input_sequences[:3]

[[2, 158], [2, 158, 331], [2, 158, 331, 886]]

In [14]:
# The sequences have different lengths, so pad them with leading zeros to one common length; the model needs inputs of a fixed size.

In [15]:
# Length of the longest prefix; every sequence is padded up to this length.
max_sequence_len = max(map(len, input_sequences))
max_sequence_len

20

In [16]:
from keras.preprocessing.sequence import pad_sequences
# Left-pad ('pre') so that the real words, and in particular the final
# target word, always sit at the end of each row.
padded_sequences = pad_sequences(
    input_sequences,
    maxlen=max_sequence_len,
    padding='pre',
)

In [17]:
# Preview the first three padded rows (zeros on the left are padding).
padded_sequences[:3]

array([[  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   0,   2, 158],
       [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   0,   2, 158, 331],
       [  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
          0,   0,   0,   2, 158, 331, 886]])

In [18]:
# now in each sequence we will keep the last word as output and remaining words as input
# for example if we have sequences as
# [1, 2]
# [1,2, 3]
# [1, 2, 3, 4]
# then input and output for each of these sequences will be
# [1] [2]
# [1,2] [3]
# [1,2,3] [4]

In [19]:
import numpy as np
# pad_sequences already returns a NumPy array, so np.asarray is a no-op here;
# np.array would allocate a full second copy of the data for no benefit.
padded_sequences = np.asarray(padded_sequences)

In [20]:
# Features: every column except the last.  Target: the last column, i.e. the
# word that follows the prefix held in the feature columns.
x, y = padded_sequences[:, :-1], padded_sequences[:, -1]

In [21]:
# Sanity-check the feature / label shapes before one-hot encoding.
x.shape, y.shape

((101519, 19), (101519,))

In [22]:
# Inspect the integer target ids.
y

array([ 158,  331,  886, ...,   84,  360, 1673])

In [23]:
# We want to predict the next word from our vocabulary.  The output layer is a
# softmax with one unit per vocabulary id, so the targets are one-hot encoded
# and the prediction is the id with the highest probability.
# NOTE(review): this builds a dense (n_samples, vocab_size) matrix, which is
# memory-hungry; keeping integer targets with `sparse_categorical_crossentropy`
# would avoid it, but needs the loss in the compile cell changed as well.
# NOTE(review): the cell overwrites `y`, so re-running it without re-running
# the x/y split cell first would encode already-encoded targets.
from keras.utils import to_categorical
y = to_categorical(y, num_classes = vocab_size)

In [24]:
# Confirm the targets now have one column per vocabulary id.
x.shape, y.shape

((101519, 19), (101519, 8921))

In [25]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Input, Dense, Dropout, LSTM, Bidirectional

In [26]:
model = Sequential([
    # Each sample holds max_sequence_len - 1 word ids: the final id of every
    # padded sequence was split off and kept as the prediction target.
    Input(shape=(max_sequence_len - 1,)),
    # Learn a 100-dimensional vector for each of the vocab_size ids.
    Embedding(vocab_size, 100),
    LSTM(1024),
    # One softmax probability per vocabulary id.
    Dense(vocab_size, activation='softmax'),
])
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['accuracy'],
)
model.summary()

In [27]:
from tensorflow.keras.callbacks import EarlyStopping
# Stop training once training accuracy has failed to improve by at least 0.01
# (one percentage point) for 15 consecutive epochs.
# Accuracy is a higher-is-better metric, so the callback must run in "max"
# mode.  In "min" mode every rise in accuracy counts as "no improvement", and
# training is cut off after `patience` epochs however well it is going.
es = EarlyStopping(monitor='accuracy', min_delta=0.01, mode='max', patience=15)

In [28]:
# Train on the full data set; the early-stopping callback may end the run
# before all 50 epochs have been used.
model.fit(
    x,
    y,
    batch_size=256,
    epochs=50,
    callbacks=[es],
    verbose=1,
)


Epoch 1/50
[1m397/397[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m361s[0m 900ms/step - accuracy: 0.0512 - loss: 6.7553
Epoch 2/50
[1m397/397[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m348s[0m 877ms/step - accuracy: 0.0943 - loss: 5.8451
Epoch 3/50
[1m397/397[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m343s[0m 864ms/step - accuracy: 0.1212 - loss: 5.4213
Epoch 4/50
[1m397/397[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m350s[0m 880ms/step - accuracy: 0.1406 - loss: 5.0804
Epoch 5/50
[1m397/397[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m342s[0m 862ms/step - accuracy: 0.1580 - loss: 4.7192
Epoch 6/50
[1m397/397[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m341s[0m 858ms/step - accuracy: 0.1811 - loss: 4.3367
Epoch 7/50
[1m397/397[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m339s[0m 854ms/step - accuracy: 0.2185 - loss: 3.9072
Epoch 8/50
[1m397/397[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m346s[0m 871ms/step - accuracy: 0.2827 - loss: 3.4375
Epoch 9/

<keras.src.callbacks.history.History at 0x26e841929d0>

In [34]:
# Persist the trained model to disk.
# NOTE(review): the execution counts show this cell (34) was run out of order
# relative to the tokenizer-pickling cell (33); check that the notebook still
# works with Restart Kernel -> Run All.
# NOTE(review): '.h5' is the legacy HDF5 format; recent Keras versions
# recommend the native '.keras' format - confirm what the loading code expects
# before changing the file name.
model.save('model_.h5')



In [33]:
import pickle
# Persist the fitted tokenizer so the same word -> id mapping can be reused
# when the saved model is loaded for inference.
# Unpickling executes arbitrary code, so only ever load this file from a
# trusted source.
with open('tokenizer.pkl', 'wb') as file :
    pickle.dump(tokenizer, file)
