In [63]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
import pickle
import numpy as np
import os

In [64]:
# Read the whole corpus into a single string; `text` is later split on '\n'
# to build the n-gram training sequences.
# NOTE(review): the same file is opened again in the next cell to build `lines`.
with open('./sherlock-holm.es_stories_plain-text_advs.txt', 'r', encoding='utf-8') as file:
    text = file.read()

In [65]:
# Load the corpus as a list of lines (each line keeps its trailing newline).
# A context manager is used so the file handle is closed deterministically
# instead of being left open for the lifetime of the kernel.
with open("sherlock-holm.es_stories_plain-text_advs.txt", "r", encoding = "utf8") as file:
    lines = file.readlines()

# Sanity check: show the first and last line that were read.
print("The First Line: ", lines[0])
print("The Last Line: ", lines[-1])

The First Line:  To Sherlock Holmes she is always the woman. I have seldom heard him

The Last Line:       This text comes from the collection's version 3.1.


In [66]:
#Cleaning Data
# Join every line into one space-separated string. This is done once; the
# previous version rebuilt the identical string on every loop iteration.
data = ' '.join(lines)

# Strip newlines, carriage returns and any UTF-8 byte-order mark left in the text.
data = data.replace('\n', '').replace('\r', '').replace('\ufeff', '')
data[:360]

'To Sherlock Holmes she is always the woman. I have seldom heard him      mention her under any other name. In his eyes she eclipses and      predominates the whole of her sex. It was not that he felt any      emotion akin to love for Irene Adler. All emotions, and that one      particularly, were abhorrent to his cold, precise but admirably      balanced min'

In [67]:
import string

# Build a translation table that turns every ASCII punctuation character
# into a single space, then apply it to the cleaned corpus.
punctuation_to_space = dict.fromkeys(string.punctuation, ' ')
translator = str.maketrans(punctuation_to_space)
new_data = data.translate(translator)

# Preview the punctuation-free text.
new_data[:500]

'To Sherlock Holmes she is always the woman  I have seldom heard him      mention her under any other name  In his eyes she eclipses and      predominates the whole of her sex  It was not that he felt any      emotion akin to love for Irene Adler  All emotions  and that one      particularly  were abhorrent to his cold  precise but admirably      balanced mind  He was  I take it  the most perfect reasoning and      observing machine that the world has seen  but as a lover he would      have place'

In [68]:
# Keep only the first occurrence of each whitespace-separated token, in the
# order it first appears. dict.fromkeys preserves insertion order and does the
# membership test in O(1), replacing the quadratic `if i not in z` list scan.
# Tokens are compared exactly as written (case and attached punctuation matter).
z = list(dict.fromkeys(data.split()))

# `data` now holds the de-duplicated token stream the tokenizer is fitted on.
data = ' '.join(z)
data[:500]

'To Sherlock Holmes she is always the woman. I have seldom heard him mention her under any other name. In his eyes eclipses and predominates whole of sex. It was not that he felt emotion akin to love for Irene Adler. All emotions, one particularly, were abhorrent cold, precise but admirably balanced mind. He was, take it, most perfect reasoning observing machine world has seen, as a lover would placed himself in false position. never spoke softer passions, save with gibe sneer. They admirable thi'

In [69]:

# Fit the word-level tokenizer on the de-duplicated corpus to build the vocabulary.
tokenizer = Tokenizer()
tokenizer.fit_on_texts([data])

# saving the tokenizer for predict function.
# The file is opened in a `with` block so the handle is flushed and closed
# as soon as the pickle has been written.
with open('tokenizer1.pkl', 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)

# Encode the corpus as integer token ids and preview the first few.
sequence_data = tokenizer.texts_to_sequences([data])[0]
sequence_data[:10]

[2, 858, 25, 113, 26, 524, 20, 114, 31, 115]

In [70]:
# Vocabulary size taken from the tokenizer fitted above.
# The +1 accounts for index 0, which Keras reserves (it is the padding value)
# and never assigns to a word.
total_words = len(tokenizer.word_index) + 1

In [71]:
# Same quantity as `total_words` (word count + 1 for the reserved index 0),
# printed here for inspection.
vocab_size = len(tokenizer.word_index) + 1
print(vocab_size)

8198


In [72]:
# Build the training examples: for every line of the raw text, emit each
# prefix of its token-id list that has at least two tokens
# (first 2 ids, first 3 ids, ..., the whole line).
input_sequences = []
for line in text.split('\n'):
    token_list = tokenizer.texts_to_sequences([line])[0]
    input_sequences.extend(
        token_list[:prefix_len] for prefix_len in range(2, len(token_list) + 1)
    )

In [73]:
# Length of the longest n-gram; every sequence is left-padded with zeros to it.
max_sequence_len = max(len(seq) for seq in input_sequences)
padded_sequences = pad_sequences(
    input_sequences,
    maxlen=max_sequence_len,
    padding='pre',
)
input_sequences = np.array(padded_sequences)

In [74]:
# Split each padded n-gram row: all columns but the last are the model input,
# and the last column is the token id the model should predict.
X = input_sequences[:, :-1]
y = input_sequences[:, -1]

In [75]:
# One-hot encode the target ids into vectors of width total_words, matching the
# softmax output layer and the categorical_crossentropy loss used for training.
y = np.array(tf.keras.utils.to_categorical(y, num_classes=total_words))

In [76]:
# Next-word model: token embedding -> LSTM -> softmax over the vocabulary.
model = Sequential()
model.add(Embedding(total_words, 100))   # 100-dimensional vector per token id
model.add(LSTM(150))                     # 150 recurrent units
model.add(Dense(total_words, activation='softmax'))  # one probability per vocabulary index
# summary() prints the table itself and returns None, so it must not be wrapped
# in print() (that only adds a stray "None" to the output).
model.summary()

None


In [77]:
# Train as a multi-class classifier over the vocabulary: the one-hot targets
# pair with categorical cross-entropy; Adam optimizer, 10 passes over the data.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X, y, epochs=10, verbose=1)

Epoch 1/10
[1m3008/3008[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 11ms/step - accuracy: 0.0622 - loss: 6.5537
Epoch 2/10
[1m3008/3008[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 11ms/step - accuracy: 0.1188 - loss: 5.5599
Epoch 3/10
[1m3008/3008[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 11ms/step - accuracy: 0.1481 - loss: 5.1307
Epoch 4/10
[1m3008/3008[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 11ms/step - accuracy: 0.1646 - loss: 4.7834
Epoch 5/10
[1m3008/3008[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 11ms/step - accuracy: 0.1834 - loss: 4.4631
Epoch 6/10
[1m3008/3008[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 11ms/step - accuracy: 0.2051 - loss: 4.1590
Epoch 7/10
[1m3008/3008[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 12ms/step - accuracy: 0.2305 - loss: 3.8822
Epoch 8/10
[1m3008/3008[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m701s[0m 233ms/step - accuracy: 0.2623 - loss: 3.6186
Epoch 

<keras.src.callbacks.history.History at 0x17727624c80>

In [78]:
# Greedy text generation: repeatedly predict the most likely next word and
# append it to the seed text.
seed_text = "I will leave if they"
next_words = 5

for _ in range(next_words):
    # Encode the current text and left-pad it to the input length used in
    # training (max_sequence_len - 1, because the last column was the target).
    token_list = tokenizer.texts_to_sequences([seed_text])[0]
    token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
    # predict() returns one row of probabilities for the single input; take the
    # arg-max as a plain int rather than comparing against a 1-element array.
    predicted = int(np.argmax(model.predict(token_list), axis=-1)[0])
    # index_word is the tokenizer's reverse lookup (id -> word); this replaces
    # a linear scan of word_index per generated word. An id with no word
    # (e.g. the reserved index 0) yields an empty string, as before.
    output_word = tokenizer.index_word.get(predicted, "")
    seed_text += " " + output_word

print(seed_text)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 110ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
I will leave if they were all and i had


In [79]:
# Print the layer-by-layer summary of the model after the first training run.
model.summary()

In [80]:
# !pip install pyyaml h5py  

In [81]:
import os

import tensorflow as tf
from tensorflow import keras

# Record the TensorFlow version this notebook was executed with.
print(tf.version.VERSION)

2.16.0-rc0


In [82]:
# Print the model summary again before the checkpointed training run.
# NOTE(review): duplicates the summary shown a few cells earlier.
model.summary()

In [83]:
checkpoint_path = "./training_1/cp.keras"
# NOTE(review): checkpoint_dir is computed but not used in the visible cells.
checkpoint_dir = os.path.dirname(checkpoint_path)

# Create a callback that writes a checkpoint to checkpoint_path after each
# epoch. save_weights_only is not set, so with the Keras default this saves
# the full model (not just its weights), overwriting the same file each time.
cp_callback = tf.keras.callbacks.ModelCheckpoint(filepath=checkpoint_path,
                                                 verbose=1)

# Continue training the already-fitted model for 20 more epochs.
# NOTE(review): calling compile() again creates a fresh 'adam' optimizer, so the
# learned weights are kept but the previous optimizer state is presumably
# discarded — confirm this is intended.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.fit(X, y, epochs=20, verbose=1, callbacks=[cp_callback])

Epoch 1/20
[1m3006/3008[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 12ms/step - accuracy: 0.3290 - loss: 3.2192
Epoch 1: saving model to ./training_1/cp.keras
[1m3008/3008[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 12ms/step - accuracy: 0.3289 - loss: 3.2194
Epoch 2/20
[1m3007/3008[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 11ms/step - accuracy: 0.3708 - loss: 2.9546
Epoch 2: saving model to ./training_1/cp.keras
[1m3008/3008[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m34s[0m 11ms/step - accuracy: 0.3708 - loss: 2.9546
Epoch 3/20
[1m3007/3008[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 12ms/step - accuracy: 0.4198 - loss: 2.6955
Epoch 3: saving model to ./training_1/cp.keras
[1m3008/3008[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 12ms/step - accuracy: 0.4198 - loss: 2.6956
Epoch 4/20
[1m3007/3008[0m [32m━━━━━━━━━━━━━━━━━━━[0m[37m━[0m [1m0s[0m 12ms/step - accuracy: 0.4568 - loss: 2.4986
Epoch 4: saving model to ./t

<keras.src.callbacks.history.History at 0x177277dda90>

In [84]:
# Persist the final trained model to a .keras file for later use.
model.save('./predictor.keras')

In [85]:
import pickle
# saving the tokenizer for predict function.
# Use a context manager so the pickle file is flushed and closed immediately
# rather than relying on garbage collection of an anonymous file object.
with open('tokenizer1.pkl', 'wb') as tokenizer_file:
    pickle.dump(tokenizer, tokenizer_file)

In [86]:
# Display the padded n-gram length. The generation code pads its input to
# max_sequence_len - 1, so anything that reloads the saved model needs this value.
max_sequence_len

18