In [59]:
# Path to the file in Kaggle
file_path = "/kaggle/input/word-prediction/1661-0.txt"

# Open and read the file
with open(file_path, 'r', encoding='utf-8') as f:
    text = f.read()

# Display first 500 characters
print(text[:3000])


﻿
Project Gutenberg's The Adventures of Sherlock Holmes, by Arthur Conan Doyle

This eBook is for the use of anyone anywhere at no cost and with
almost no restrictions whatsoever.  You may copy it, give it away or
re-use it under the terms of the Project Gutenberg License included
with this eBook or online at www.gutenberg.net


Title: The Adventures of Sherlock Holmes

Author: Arthur Conan Doyle

Release Date: November 29, 2002 [EBook #1661]
Last Updated: May 20, 2019

Language: English

Character set encoding: UTF-8

*** START OF THIS PROJECT GUTENBERG EBOOK THE ADVENTURES OF SHERLOCK HOLMES ***



Produced by an anonymous Project Gutenberg volunteer and Jose Menendez



cover



The Adventures of Sherlock Holmes



by Arthur Conan Doyle



Contents


   I.     A Scandal in Bohemia
   II.    The Red-Headed League
   III.   A Case of Identity
   IV.    The Boscombe Valley Mystery
   V.     The Five Orange Pips
   VI.    The Man with the Twisted Lip
   VII.   The Adventure of the Blue 

In [60]:
# Cell 1: Import required libraries

import numpy as np
import re
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences


In [61]:
# Remove punctuation (anything not letter, number, or space)
text = re.sub(r'[^\w\s]', '', text)

# Convert to lowercase for consistency
text = text.lower()

print(text[:2000])


project gutenbergs the adventures of sherlock holmes by arthur conan doyle

this ebook is for the use of anyone anywhere at no cost and with
almost no restrictions whatsoever  you may copy it give it away or
reuse it under the terms of the project gutenberg license included
with this ebook or online at wwwgutenbergnet


title the adventures of sherlock holmes

author arthur conan doyle

release date november 29 2002 ebook 1661
last updated may 20 2019

language english

character set encoding utf8

 start of this project gutenberg ebook the adventures of sherlock holmes 



produced by an anonymous project gutenberg volunteer and jose menendez



cover



the adventures of sherlock holmes



by arthur conan doyle



contents


   i     a scandal in bohemia
   ii    the redheaded league
   iii   a case of identity
   iv    the boscombe valley mystery
   v     the five orange pips
   vi    the man with the twisted lip
   vii   the adventure of the blue carbuncle
   viii  the adventure o

In [62]:
# Cell 3: Split text into sentences based on line breaks
sentences = text.strip().split("\n")

print(sentences[:100])


['project gutenbergs the adventures of sherlock holmes by arthur conan doyle', '', 'this ebook is for the use of anyone anywhere at no cost and with', 'almost no restrictions whatsoever  you may copy it give it away or', 'reuse it under the terms of the project gutenberg license included', 'with this ebook or online at wwwgutenbergnet', '', '', 'title the adventures of sherlock holmes', '', 'author arthur conan doyle', '', 'release date november 29 2002 ebook 1661', 'last updated may 20 2019', '', 'language english', '', 'character set encoding utf8', '', ' start of this project gutenberg ebook the adventures of sherlock holmes ', '', '', '', 'produced by an anonymous project gutenberg volunteer and jose menendez', '', '', '', 'cover', '', '', '', 'the adventures of sherlock holmes', '', '', '', 'by arthur conan doyle', '', '', '', 'contents', '', '', '   i     a scandal in bohemia', '   ii    the redheaded league', '   iii   a case of identity', '   iv    the boscombe valley mystery',

In [63]:
tokenizer = Tokenizer()
tokenizer.fit_on_texts(sentences)

total_words = len(tokenizer.word_index) + 1  # +1 because indexing starts at 1

print("Vocabulary Size:", total_words)
print(len(tokenizer.word_index))
print("Word Index:", tokenizer.word_index)

Vocabulary Size: 8718
8717


In [64]:
input_sequences=[]
for sentence in text.split("\n"):
    tokenized_sentence=tokenizer.texts_to_sequences([sentence])[0]
    for i in range(1,len(tokenized_sentence)):
        input_sequences.append(tokenized_sentence[:i+1])


In [65]:
input_sequences

[[136, 4605],
 [136, 4605, 1],
 [136, 4605, 1, 953],
 [136, 4605, 1, 953, 4],
 [136, 4605, 1, 953, 4, 123],
 [136, 4605, 1, 953, 4, 123, 33],
 [136, 4605, 1, 953, 4, 123, 33, 45],
 [136, 4605, 1, 953, 4, 123, 33, 45, 523],
 [136, 4605, 1, 953, 4, 123, 33, 45, 523, 2107],
 [136, 4605, 1, 953, 4, 123, 33, 45, 523, 2107, 2108],
 [27, 954],
 [27, 954, 14],
 [27, 954, 14, 22],
 [27, 954, 14, 22, 1],
 [27, 954, 14, 22, 1, 268],
 [27, 954, 14, 22, 1, 268, 4],
 [27, 954, 14, 22, 1, 268, 4, 374],
 [27, 954, 14, 22, 1, 268, 4, 374, 2109],
 [27, 954, 14, 22, 1, 268, 4, 374, 2109, 20],
 [27, 954, 14, 22, 1, 268, 4, 374, 2109, 20, 41],
 [27, 954, 14, 22, 1, 268, 4, 374, 2109, 20, 41, 1572],
 [27, 954, 14, 22, 1, 268, 4, 374, 2109, 20, 41, 1572, 2],
 [27, 954, 14, 22, 1, 268, 4, 374, 2109, 20, 41, 1572, 2, 17],
 [543, 41],
 [543, 41, 3239],
 [543, 41, 3239, 3240],
 [543, 41, 3239, 3240, 10],
 [543, 41, 3239, 3240, 10, 70],
 [543, 41, 3239, 3240, 10, 70, 769],
 [543, 41, 3239, 3240, 10, 70, 769, 9],


In [66]:
max_len=max([len(x) for x in input_sequences])


In [67]:
print(max_len)

18


In [68]:
padded=pad_sequences(input_sequences,maxlen=max_len,padding='pre')

In [69]:
x=padded[:,: -1]
y=padded[:,-1]

In [70]:
x.shape

(97929, 17)

In [71]:
y.shape

(97929,)

In [72]:

from tensorflow.keras.utils import to_categorical
y=to_categorical(y,num_classes =8718)

In [73]:
y.shape

(97929, 8718)

In [74]:
max_len = max([len(x) for x in input_sequences])
print(max_len)


18


In [75]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding,Dense,LSTM
from tensorflow.keras.layers import GRU
from tensorflow.keras.utils import to_categorical


In [76]:
# Define and build the Encoder–Decoder model
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, LSTM, Dense

embedding_dim = 50
latent_dim = 150

# Encoder
encoder_inputs = Input(shape=(17,))
encoder_embedding = Embedding(8718, embedding_dim)(encoder_inputs)
encoder_lstm = LSTM(latent_dim, return_state=True)
_, state_h, state_c = encoder_lstm(encoder_embedding)
encoder_states = [state_h, state_c]

# Decoder
decoder_inputs = Input(shape=(17,))
decoder_embedding = Embedding(8718, embedding_dim)(decoder_inputs)
decoder_lstm = LSTM(latent_dim, return_sequences=False)
decoder_outputs = decoder_lstm(decoder_embedding, initial_state=encoder_states)
decoder_dense = Dense(8718, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Build Model
model3 = Model([encoder_inputs, decoder_inputs], decoder_outputs)

# Compile Model
model3.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Print model summary
model3.summary()

# Train Model (using same x for encoder and decoder inputs)
history3 = model3.fit([x, x], y, epochs=100, batch_size=32, verbose=1)

Epoch 1/100
[1m3061/3061[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 10ms/step - accuracy: 0.0599 - loss: 6.6138
Epoch 2/100
[1m3061/3061[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 10ms/step - accuracy: 0.1074 - loss: 5.7292
Epoch 3/100
[1m3061/3061[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 10ms/step - accuracy: 0.1373 - loss: 5.2711
Epoch 4/100
[1m3061/3061[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 10ms/step - accuracy: 0.1543 - loss: 4.9329
Epoch 5/100
[1m3061/3061[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 10ms/step - accuracy: 0.1691 - loss: 4.6415
Epoch 6/100
[1m3061/3061[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 10ms/step - accuracy: 0.1834 - loss: 4.3761
Epoch 7/100
[1m3061/3061[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 10ms/step - accuracy: 0.2065 - loss: 4.1041
Epoch 8/100
[1m3061/3061[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 10ms/step - accuracy: 0.2297 - loss: 3.8598


In [77]:
import numpy as np

In [78]:
test_text = "easy"  # starting text

# Convert to tokens and pad
token_test_text = tokenizer.texts_to_sequences([test_text])[0]
padd_test_text = pad_sequences([token_test_text], maxlen=18, padding="pre")

# Get model predictions
predictions = model.predict(padd_test_text, verbose=0)[0]

# Get top 5 predicted word indices
top_indices = predictions.argsort()[-5:][::-1]

# Print top 5 predictions as sentence options
for idx in top_indices:
    for word, index in tokenizer.word_index.items():
        if index == idx:
            print(f"{test_text} {word}")
            break


            

easy make
easy quietly
easy letter
easy tell
easy if


and now here is the end of the code part of this project with 76 percent of accuracy 