In [None]:
#Import all necessary libraries
import tensorflow
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.models import Sequential
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.optimizers import Adam

import pickle
import numpy as np
import os

In [None]:
from google.colab import drive

# Mount Google Drive so the corpus stored there is reachable from this runtime
drive.mount('/content/drive')

# Location of the corpus inside the mounted drive
file_path = '/content/drive/My Drive/Sherlock Holmes.txt'

# Read the whole corpus into memory.
# The encoding is stated explicitly so the result does not depend on the
# runtime's locale default (a later cell strips a '\ufeff' byte order mark,
# which is what a UTF-8 BOM decodes to).
try:
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()
except FileNotFoundError:
    print(f"File not found at path: {file_path}")
    # Every later cell needs `content`; re-raise so the notebook stops here
    # instead of failing further down with a confusing NameError.
    raise

print("File content:\n", content[:500])  # Display first 500 characters

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).
File content:
 



                        THE ADVENTURES OF SHERLOCK HOLMES

                               Arthur Conan Doyle



                                Table of contents

               A Scandal in Bohemia
               The Red-Headed League
               A Case of Identity
               The Boscombe Valley Mystery
               The Five Orange Pips
               The Man with the Twisted Lip
               The Adventure of the Blue Carbuncle
               The Adventure of the Speckled Band
  


# Text Preprocessing


In [None]:
#Break the raw text into a list of lines, cutting at every newline character
lines = content.split(sep='\n')

In [None]:
#Combines all lines of text from the lines list into the data string, separated by a space
#A single join does this in one pass; repeating it once per line only redid the same work
data = ' '.join(lines)

#Cleans the data by removing unnecessary characters: new lines, carriage returns,
#byte order marks, and double quotes (both the straight " and the curly opening/closing forms)
data = (
  data.replace('\n', '')
  .replace('\r', '')
  .replace('\ufeff', '')
  .replace('"', '')
  .replace('\u201c', '')
  .replace('\u201d', '')
)

In [None]:
#Collapse every run of whitespace to a single space: break the text into words, then glue them back together
data = ' '.join(data.split())

#Show the first 1000 characters of the cleaned text
data[:1000]

"THE ADVENTURES OF SHERLOCK HOLMES Arthur Conan Doyle Table of contents A Scandal in Bohemia The Red-Headed League A Case of Identity The Boscombe Valley Mystery The Five Orange Pips The Man with the Twisted Lip The Adventure of the Blue Carbuncle The Adventure of the Speckled Band The Adventure of the Engineer's Thumb The Adventure of the Noble Bachelor The Adventure of the Beryl Coronet The Adventure of the Copper Beeches A SCANDAL IN BOHEMIA Table of contents Chapter 1 Chapter 2 Chapter 3 CHAPTER I To Sherlock Holmes she is always the woman. I have seldom heard him mention her under any other name. In his eyes she eclipses and predominates the whole of her sex. It was not that he felt any emotion akin to love for Irene Adler. All emotions, and that one particularly, were abhorrent to his cold, precise but admirably balanced mind. He was, I take it, the most perfect reasoning and observing machine that the world has seen, but as a lover he would have placed himself in a false positio

# Tokenization

In [None]:
#Tokenizer that will turn text into the integer sequences the model consumes
tokenizer = Tokenizer()

#Scan the cleaned text and build the vocabulary of unique words/tokens it contains
tokenizer.fit_on_texts(texts=[data])

In [None]:
#Saves the fitted tokenizer so we can load it later without having to fit it again
#The with-block closes the file (flushing it to disk) even if pickling raises
with open('token.pk1', 'wb') as token_file:
  pickle.dump(tokenizer, token_file)

# Converting to Sequence of Numbers

In [None]:
#Encode the cleaned text as a list of integers, one per word
#Each integer is the index of that word in the tokenizer's vocabulary
#texts_to_sequences returns one list per input text; exactly one text is passed, so unpack that single result
(sequence_data,) = tokenizer.texts_to_sequences([data])

In [None]:
#Show the first 10 encoded tokens, then the total number of tokens
print(sequence_data[:10], len(sequence_data), sep='\n')

[1, 1561, 5, 129, 34, 647, 4498, 4499, 226, 5]
105879


In [None]:
#Vocabulary size: the number of distinct words the tokenizer found,
#plus one extra slot to accommodate a reserved index
vocabSize = 1 + len(tokenizer.word_index)
vocabSize

8200

# Converting the Data

In [None]:
#Slide a 4-word window over the encoded text, moving one word at a time
#Every window (overlapping its neighbours by 3 words) is collected in the list
#These 4-word sequences are what gets fed into the model
sequence = [
  sequence_data[start:start + 4]
  for start in range(len(sequence_data) - 3)
]

In [None]:
#Turn the list of windows into a NumPy array (one row per window)
sequence = np.array(sequence)

#Report the number of windows, then show the array itself
print(len(sequence))
print(sequence)

105876
[[   1 1561    5  129]
 [1561    5  129   34]
 [   5  129   34  647]
 ...
 [  28    1 8198 8199]
 [   1 8198 8199 3187]
 [8198 8199 3187 3186]]


In [None]:
# Separate every 4-word window into the model input (x) and the target (y)

#x: the first 3 values of each window, the context the model sees
x = [window[0:3] for window in sequence]

#y: the last value of each window, the word the model has to predict
y = [window[3] for window in sequence]

In [None]:
#turn both lists into NumPy arrays so they can be handed to the ML libraries
x, y = np.array(x), np.array(y)

In [None]:
#one-hot encode the targets for classification: each word index becomes a vector with vocabSize entries
y = to_categorical(y, vocabSize)

# Building the LSTM model

In [None]:
#initialize the model
model = Sequential()

#embedding layer: maps each word index (input_dim = vocabSize possible values)
#to a dense vector of 10 numbers (output_dim)
#input_length is intentionally not passed: the previous value, len(sequence), was the
#number of training windows rather than the 3-token length of one input, and newer
#Keras versions deprecate the argument; the input shape is set when the model is built
model.add(Embedding(input_dim = vocabSize, output_dim = 10))

#1st LSTM layer with 1000 units
#return_sequences=True makes it output a value for every timestep,
#so the next LSTM layer still receives a sequence
model.add(LSTM(1000, return_sequences=True))

#2nd LSTM layer with 1000 units
#returns only its final output, summarizing the sequence info
model.add(LSTM(1000))

#fully connected layer with 1000 units
#the relu activation adds non-linearity so the network can learn complex patterns
model.add(Dense(1000, activation="relu"))

#final layer: one output per vocabulary class
#softmax turns the outputs into probabilities because the task is multi-class
model.add(Dense(vocabSize, activation="softmax"))



In [None]:
#build the model explicitly for batches of 3-token inputs
model.build((None, 3))
#print the model's architecture and parameter counts
model.summary()

# Training the model

In [None]:
#ModelCheckpoint is the callback class that saves the model during training
from tensorflow.keras.callbacks import ModelCheckpoint

#multi-class classification -> categorical crossentropy as the loss function
#Adam optimizer with a learning rate of .001
model.compile(
  optimizer = Adam(learning_rate = .001),
  loss="categorical_crossentropy",
)

#checkpoint callback writing to the given file
#it watches the loss and, with save_best_only, saves only when the loss has improved
checkpoint = ModelCheckpoint(
  "Sherlock Holmes.keras",
  monitor="loss",
  save_best_only=True,
  verbose=1,
)

#fit the model on x and y for 2 epochs, in batches of 64
model.fit(x, y, batch_size=64, epochs = 2, callbacks=[checkpoint])

Epoch 1/2
[1m1655/1655[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 718ms/step - loss: 6.6745
Epoch 1: loss improved from inf to 6.36580, saving model to Sherlock Holmes.keras
[1m1655/1655[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1193s[0m 719ms/step - loss: 6.6743
Epoch 2/2
[1m1655/1655[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 703ms/step - loss: 5.8114
Epoch 2: loss improved from 6.36580 to 5.78783, saving model to Sherlock Holmes.keras
[1m1655/1655[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1201s[0m 707ms/step - loss: 5.8114


<keras.src.callbacks.history.History at 0x7b038103e6e0>

# Resources





*   https://sourestdeeds.github.io/pdf/Deep%20Learning%20with%20Python.pdf
*   https://www.tensorflow.org/guide/keras
*   https://docs.python.org/3/library/pickle.html





In [None]:
from tensorflow.keras.models import load_model

# Restore the trained network saved by the checkpoint callback
model = load_model('Sherlock Holmes.keras')

# Restore the fitted tokenizer; the with-block closes the file handle afterwards.
# NOTE: pickle can execute arbitrary code while loading, so only load a
# token.pk1 that this notebook wrote itself.
with open('token.pk1', 'rb') as token_file:
  tokenizer = pickle.load(token_file)

def predict_word(model, tokenizer, text, seq_len=3):
  """Predict the word most likely to follow `text`.

  Parameters:
  - model: object whose `predict(batch)` returns one row of class scores
    per input row (here, the trained Keras model).
  - tokenizer: object providing `texts_to_sequences` and `word_index`
    (here, the fitted Keras Tokenizer).
  - text: the context string.
  - seq_len: number of context tokens the model is fed; defaults to 3,
    the context length of the training windows built in this notebook.

  Returns:
  - str: the predicted word, or "" when `text` contains no known word or
    the predicted index has no entry in `tokenizer.word_index`.

  The predicted word is also printed.
  """
  # texts_to_sequences returns one list per input text; one text is passed.
  # The list can be shorter than the number of words typed (or empty) when
  # words are not in the tokenizer's vocabulary.
  tokens = tokenizer.texts_to_sequences([text])[0]

  predicted_word = ""
  if tokens:
    # Keep only the most recent seq_len tokens so the input has the same
    # length as the windows the model was trained on.
    window = tokens[-seq_len:]
    # Left-pad short contexts with the reserved index 0. The model saw no
    # padding during training, so such predictions are less reliable.
    window = [0] * (seq_len - len(window)) + window

    scores = model.predict(np.array([window]))[0]
    word_id = int(np.argmax(scores))

    # Map the class index back to its word ("" if no word has that index)
    predicted_word = next(
      (word for word, index in tokenizer.word_index.items() if index == word_id),
      "",
    )

  print(predicted_word)
  return predicted_word

# Ask the model for the word it expects after "Once in a"
predict_word(model, tokenizer, text="Once in a")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 300ms/step
little


'little'

Explanation of predict_word function

Logical Flow: The code is designed to load a trained model and tokenizer, then predict the next word in a text sequence based on the input.
Functionality: It uses a neural network model trained on text data (e.g., "Sherlock Holmes") to generate predictions.

Usability: The function predict_word is modular and can be reused with different inputs.

Does the Code Make Sense?

The core logic is sound: It loads a trained model and tokenizer, processes the input text, predicts the next word's class index, and maps it back to a word.
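Note: older tutorials obtain the class index with model.predict_classes, which is deprecated and will throw an error in TensorFlow 2.6+ versions. The function above does not call it; it already uses model.predict together with np.argmax. Passing axis=-1 makes it explicit that the argmax is taken over the vocabulary axis: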

preds = np.argmax(model.predict(sequence), axis=-1)

Once updated for compatibility, the code is valid and functional for its intended purpose.

from tensorflow.keras.models import load_model
import pickle
import numpy as np

# Load the previously saved trained model
# 'Sherlock Holmes.keras' contains the trained neural network for text prediction.
model = load_model('Sherlock Holmes.keras')

# Load the tokenizer object that was saved earlier using pickle
# The tokenizer is used to encode and decode text into numerical sequences.
# The with-block closes the file handle once loading is done.
# NOTE: pickle can execute arbitrary code while loading, so only load a
# token.pk1 that this notebook wrote itself.
with open('token.pk1', 'rb') as token_file:
    tokenizer = pickle.load(token_file)

def predict_word(model, tokenizer, text, seq_len=3):
    """
    Predict the next word in a sequence based on the trained model and tokenizer.

    Parameters:
    - model: Trained Keras sequential model used for predictions; its
      `predict(batch)` must return one row of class scores per input row.
    - tokenizer: Tokenizer object for encoding/decoding text (provides
      `texts_to_sequences` and `word_index`).
    - text: Input text string to predict the next word.
    - seq_len: Number of context tokens fed to the model. Defaults to 3,
      the context length of the training windows built in this notebook.

    Returns:
    - str: Predicted next word in the sequence, or "" when `text` contains
      no known word or the predicted index has no entry in `word_index`.
    """
    # Convert the input text into a numerical sequence using the tokenizer.
    # One text is passed, so take the single resulting list. It can be shorter
    # than the number of words typed (or empty) when words are not in the
    # tokenizer's vocabulary.
    tokens = tokenizer.texts_to_sequences([text])[0]

    # Initialize an empty string to store the predicted word
    predicted_word = ""

    if tokens:
        # Keep only the most recent seq_len tokens so the input has the same
        # length as the windows the model was trained on.
        window = tokens[-seq_len:]
        # Left-pad short contexts with the reserved index 0. The model saw no
        # padding during training, so such predictions are less reliable.
        window = [0] * (seq_len - len(window)) + window

        # Predict the class scores for the single input row and reduce them to
        # one plain integer index (not an array), so the comparison below is an
        # ordinary int comparison.
        scores = model.predict(np.array([window]))[0]
        word_id = int(np.argmax(scores))

        # Search the tokenizer's word-to-index mapping for the predicted index
        predicted_word = next(
            (word for word, index in tokenizer.word_index.items() if index == word_id),
            "",
        )

    # Print the predicted word
    print(predicted_word)

    # Return the predicted word
    return predicted_word

# Ask the model which word it expects after the phrase "A Case of"
predict_word(model, tokenizer, text="A Case of")








