# LSTM Implementation

# **Text Preprocessing**

In [8]:
import os
import pandas as pd
import numpy as np
import tensorflow as tf
import nltk
import re
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam


In [9]:
# Read the whole corpus into memory as a single string. The path is relative,
# so the text file must sit in the notebook's working directory.
with open('alice_in_wonderland.txt', 'r', encoding='utf-8') as infile:
    data = infile.read()

# Preprocessing

In [10]:
def preprocess(text):
    """Strip non-alphanumeric characters and squeeze repeated spaces.

    Every character that is not an ASCII letter, a digit or whitespace is
    deleted outright (not replaced by a space), so ``rabbit-hole`` becomes
    ``rabbithole``. Runs of two or more space characters are then collapsed
    into one; tabs and newlines are left as they are, and the result is not
    trimmed.
    """
    letters_digits_whitespace = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    return re.sub(' +', ' ', letters_digits_whitespace)

In [11]:
# Preprocessing pipeline
def preprocess_pipeline(data) -> 'list':
    """Turn raw text into a list of cleaned, lowercased lines.

    The text is split on newline characters; each line is passed through
    ``preprocess``, trimmed of leading/trailing whitespace, discarded if it
    ends up empty, and finally lowercased. Lines are returned as whole
    strings: splitting them into words is left to the caller.
    """
    trimmed_lines = (preprocess(raw_line).strip() for raw_line in data.split('\n'))
    return [line.lower() for line in trimmed_lines if line]

# Clean the corpus: one lowercased, punctuation-free string per non-empty line.
# Despite the variable name, the lines are not yet split into words.
tokenized_sentences = preprocess_pipeline(data)

In [12]:
tokenized_sentences[0:10]

['title alices adventures in wonderland',
 'author lewis carroll',
 'chapter i',
 'down the rabbithole',
 'alice was beginning to get very tired of sitting by her sister',
 'on the bank and of having nothing to do once or twice she had',
 'peeped into the book her sister was reading but it had no',
 'pictures or conversations in it and what is the use of a book',
 'thought alice without pictures or conversation',
 'so she was considering in her own mind as well as she could']

In [13]:
# Build the word-level vocabulary from the cleaned lines
tokenizer = Tokenizer(oov_token='<oov>')
tokenizer.fit_on_texts(tokenized_sentences)
# +1 leaves room for index 0, which pad_sequences uses as the padding value
total_words = len(tokenizer.word_index) + 1

# Expand every line into all of its prefixes that hold at least two tokens
# (a prefix needs one word of context plus one word to predict).
# For the line "i am very proud" the prefixes, once left-padded, are:
#     0     0      i        am
#     0     i      am       very
#     i     am     very     proud
input_sequences = [
    token_ids[:end]
    for token_ids in tokenizer.texts_to_sequences(tokenized_sentences)
    for end in range(2, len(token_ids) + 1)
]

# Left-pad every prefix with zeros up to the length of the longest one
max_sequence_len = max(len(prefix) for prefix in input_sequences)
input_sequences = np.array(
    pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')
)

In [16]:
max_sequence_len

16

In [7]:
input_sequences[:10]

array([[   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0, 1474,  300],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0, 1474,  300,  528],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0, 1474,  300,  528,   12],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
        1474,  300,  528,   12,  829],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0, 1475, 1476],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0, 1475, 1476, 1477],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,  301,   10],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,   37,    2],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,   

In [8]:
input_sequences.shape

(23710, 16)

In [9]:
# Split each padded prefix into the model input and the next-word target.
# With the padded prefixes of "i am very proud":
#     0     0      i        am
#     0     i      am       very
#     i     am     very     proud
#
# X (every column except the last)      labels (last column)
#     0     0      i                    am
#     0     i      am                   very
#     i     am     very                 proud
X = input_sequences[:, :-1]
labels = input_sequences[:, -1]
# One-hot encode the targets over the full vocabulary
ys = tf.keras.utils.to_categorical(labels, num_classes=total_words)

In [10]:
# Split data into training, validation, and test sets (80% / 10% / 10%)
from sklearn.model_selection import train_test_split
# First hold out 20% of the (prefix, next-word) samples ...
X_train_temp, X_val_test, y_train_temp, y_val_test = train_test_split(X, ys, test_size=0.2, random_state=42)
# ... then cut that hold-out in half: one half for validation, one for test.
# NOTE(review): rows are shuffled individually, so prefixes of the same line
# can end up in different splits; X_test / y_test are not used in the cells
# visible here.
X_val, X_test, y_val, y_test = train_test_split(X_val_test, y_val_test, test_size=0.5, random_state=42)

# Training

In [11]:
# Seed Python, NumPy and TensorFlow in one call so that weight initialisation
# and batch shuffling are repeatable after Restart & Run All (same seed value
# as the train/validation/test split).
tf.keras.utils.set_random_seed(42)

# Next-word classifier: embedding -> bidirectional LSTM -> softmax over vocab
model = Sequential()
model.add(Embedding(total_words, 100))               # 100-dimensional word vectors
model.add(Bidirectional(LSTM(150)))                  # 150 units per direction
model.add(Dense(total_words, activation='softmax'))  # one probability per vocabulary index

# Targets are one-hot encoded, hence categorical cross-entropy
adam = Adam(learning_rate=0.01)
model.compile(loss='categorical_crossentropy', optimizer=adam, metrics=['accuracy'])

# Train the model; loss/accuracy on the validation split are reported each epoch
history = model.fit(X_train_temp, y_train_temp, epochs=50, validation_data=(X_val, y_val), verbose=1)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


# Save the Model

In [22]:
# Write the trained model to disk ...
model.save('Sentence_autocompletion.keras')
# ... and rebind `model` to the copy loaded back from that file, so the
# prediction cells below exercise the saved artifact rather than the
# in-memory training object.
model = tf.keras.models.load_model('Sentence_autocompletion.keras')

# Make Predictions

In [23]:
def predict_top_five_words(model, tokenizer, seed_text, top_k=5, max_len=None):
    """Return the most likely next words for ``seed_text``, best first.

    Args:
        model: Trained Keras model whose output is a probability per
            vocabulary index.
        tokenizer: The fitted Keras ``Tokenizer`` that produced the training
            sequences.
        seed_text: Raw text to continue. It is cleaned with ``preprocess``
            first so that it matches the training corpus.
        top_k: How many candidates to return (default 5).
        max_len: Length the encoded seed is left-padded / left-truncated to.
            Defaults to the notebook-level ``max_sequence_len - 1``, i.e. the
            width of the training inputs.

    Returns:
        list[str]: Up to ``top_k`` words ordered by decreasing probability.
        Indices that have no word (index 0 is padding) are skipped, so the
        list can be shorter than ``top_k``.
    """
    if max_len is None:
        max_len = max_sequence_len - 1

    # Apply the training-time cleaning: without it "alice's" would be looked
    # up as-is (out of vocabulary) instead of as the "alices" seen in training.
    cleaned_seed = preprocess(seed_text)
    token_list = tokenizer.texts_to_sequences([cleaned_seed])[0]
    token_list = pad_sequences([token_list], maxlen=max_len, padding='pre')

    predicted = model.predict(token_list, verbose=0)
    # argsort is ascending: reverse it, then keep the first top_k indices
    top_indexes = np.argsort(predicted[0])[::-1][:top_k]

    # index_word is the inverse of word_index, built by fit_on_texts; a dict
    # lookup replaces a scan over the whole vocabulary for every candidate.
    index_to_word = tokenizer.index_word
    return [
        index_to_word[int(index)]
        for index in top_indexes
        if int(index) in index_to_word
    ]


In [24]:
seed_text = "Alice is my"
output= predict_top_five_words(model ,tokenizer ,seed_text )
output

['dear', 'time', 'own', 'youth', 'said']

In [25]:
seed_text = "Alice I will never"
output= predict_top_five_words(model ,tokenizer ,seed_text )
output

['do', 'said', 'spoke', 'you', 'thought']

In [26]:
tf.__version__

'2.15.0'

In [27]:
# NOTE(review): model_from_json is imported but not used in the cells visible
# here; presumably it is meant for reloading the file written below.
from tensorflow.keras.models import model_from_json

# Serialize the model with to_json() and write the resulting JSON string to
# disk, alongside the full model saved to the .keras file earlier.
model_json = model.to_json()
with open("next_word_Prediction.json", "w") as json_file:
  json_file.write(model_json)



In [17]:
!python --version

Python 3.10.12


In [15]:
import pickle

# Save the fitted tokenizer so the same word -> index mapping can be reused
# wherever the saved model is loaded for inference.
# NOTE: only unpickle this file from a trusted source; pickle.load can execute
# arbitrary code.
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)