# Next word predictor using LSTM

# Data collecting and preprocessing phase

I downloaded 17 subtitle files of the Friends TV show and converted them into text files.

Below, I concatenate all 17 text files into a single corpus.

In [1]:
# Read the subtitle transcripts f1.txt ... f17.txt in order and collect their text
file_texts = []
for file_number in range(1, 18):
    file_name = f"C:\\Users\\PC\\Desktop\\sub\\f{file_number}.txt"
    try:
        with open(file_name, "r", encoding="utf-8") as f:
            # A trailing space keeps the last word of one file apart from the first word of the next
            file_texts.append(f.read() + " ")
    except FileNotFoundError:
        # A missing file is reported and skipped; the remaining files are still read
        print(f"File {file_name} not found. Skipping.")

# Glue everything together in one pass
corpus = "".join(file_texts)

# Sanity check on the amount of text collected
print("Total Characters in Corpus:", len(corpus))

# Keep a copy of the combined text in the working directory (optional)
with open("combined_corpus.txt", "w", encoding="utf-8") as f:
    f.write(corpus)


Total Characters in Corpus: 275359


Save the combined corpus to a .txt file (the actual cleaning happens in the preprocessing step below).

In [2]:
# Write the in-memory corpus to cleaned_corpus.txt on the Desktop.
# NOTE: at this point `corpus` is still the raw concatenation of the subtitle
# files; no cleaning has been applied to it yet.
cleaned_corpus_path = "C:\\Users\\PC\\Desktop\\cleaned_corpus.txt"
with open(cleaned_corpus_path, mode="w", encoding="utf-8") as out_file:
    out_file.write(corpus)

print("Cleaned text saved as 'cleaned_corpus.txt'")


Cleaned text saved as 'cleaned_corpus.txt'


Preprocessing and cleaning the data below

In [1]:
import re
from collections import Counter

# Function to remove emojis (and every other non-ASCII character) using regex
def remove_emojis(text):
    """Return ``text`` with every run of non-ASCII characters removed.

    Despite the name this is not emoji-specific: any character outside the
    7-bit ASCII range (emojis, music notes, accented letters, curly quotes)
    is dropped.
    """
    return re.sub(r'[^\x00-\x7F]+', '', text)


# Function to process the text file
def process_text_file(file_path, min_words=2, max_repeats=4):
    """Read a text file and return its cleaned, de-noised lines.

    Each line has its non-ASCII characters removed and is then stripped of
    surrounding whitespace. Lines are kept only if they still contain at
    least ``min_words`` words and do not occur more than ``max_repeats``
    times among the kept lines.

    Args:
        file_path: Path of the UTF-8 text file to read.
        min_words: Minimum number of whitespace-separated words a cleaned
            line must have to be kept (default 2).
        max_repeats: A cleaned line occurring more often than this is
            dropped entirely (default 4).

    Returns:
        list[str]: The surviving lines, in file order.

    Raises:
        FileNotFoundError: If ``file_path`` does not exist.
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        # Clean first and strip afterwards: removing a leading or trailing
        # symbol can expose whitespace that was previously in the middle of the line.
        cleaned_lines = [remove_emojis(line).strip() for line in file]

    # Count words on the cleaned text, so a line such as "<note> hello" is
    # judged by what is left after the symbol has been removed.
    kept_lines = [line for line in cleaned_lines if len(line.split()) >= min_words]

    # Count occurrences of each line
    line_counts = Counter(kept_lines)

    # Filter out lines that appear more than `max_repeats` times
    return [line for line in kept_lines if line_counts[line] <= max_repeats]

# Example usage
# NOTE(review): this reads from /content/, while the earlier save cell wrote
# cleaned_corpus.txt to a Windows Desktop path - make sure the file exists here.
file_path = "/content/cleaned_corpus.txt"  # Replace with your file path
result = process_text_file(file_path)  # list of lines returned by process_text_file

Convert the result into a list of newline-terminated strings.

In [2]:
# Re-attach a trailing newline to every processed line
data = [processed_line + "\n" for processed_line in result]

In [None]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense


# Step 1: Normalise the text
# Trim surrounding whitespace and lowercase every line
cleaned_data = [raw_line.strip().lower() for raw_line in data]

# One long string holding the whole corpus, used to build the vocabulary
full_text = " ".join(cleaned_data)

# Step 2: Build the vocabulary
tokenizer = Tokenizer()
tokenizer.fit_on_texts([full_text])
word_index = tokenizer.word_index
total_words = 1 + len(word_index)  # one extra slot for the padding index

# Expand every line into its growing prefixes: first 2 tokens, first 3 tokens, ...
# Lines with fewer than 2 tokens contribute nothing.
input_sequences = [
    token_ids[:prefix_len]
    for token_ids in tokenizer.texts_to_sequences(cleaned_data)
    for prefix_len in range(2, len(token_ids) + 1)
]

# Step 3: Left-pad every prefix to the length of the longest one
max_sequence_len = max(len(sequence) for sequence in input_sequences)
input_sequences = pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')

# The last column is the word to predict; everything before it is the context
X, y = input_sequences[:, :-1], input_sequences[:, -1]
y = to_categorical(y, num_classes=total_words)

# Defining model

In [3]:
# Step 4: Assemble the network as a single layer stack
model = Sequential([
    # Maps each word index to a 100-dimensional vector; inputs are max_sequence_len - 1 tokens long
    Embedding(total_words, 100, input_length=max_sequence_len - 1),
    # The first LSTM returns the full sequence so the second LSTM can consume it
    LSTM(150, return_sequences=True),
    LSTM(150),
    # One probability per vocabulary entry
    Dense(total_words, activation='softmax'),
])

model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

model.summary()



In [4]:
max_sequence_len

14

# TRAINING PHASE

In [5]:
# Step 5: Train the model
# Only X and y are passed (no validation_data / validation_split), so the
# accuracy and loss printed for each epoch are measured on the training data itself.
model.fit(X, y, epochs=50, verbose=2)

Epoch 1/50
1241/1241 - 14s - 11ms/step - accuracy: 0.0408 - loss: 6.3161
Epoch 2/50
1241/1241 - 18s - 15ms/step - accuracy: 0.0634 - loss: 5.7810
Epoch 3/50
1241/1241 - 10s - 8ms/step - accuracy: 0.0924 - loss: 5.4311
Epoch 4/50
1241/1241 - 10s - 8ms/step - accuracy: 0.1100 - loss: 5.1737
Epoch 5/50
1241/1241 - 10s - 8ms/step - accuracy: 0.1229 - loss: 4.9689
Epoch 6/50
1241/1241 - 8s - 7ms/step - accuracy: 0.1344 - loss: 4.7904
Epoch 7/50
1241/1241 - 10s - 8ms/step - accuracy: 0.1455 - loss: 4.6278
Epoch 8/50
1241/1241 - 8s - 7ms/step - accuracy: 0.1526 - loss: 4.4820
Epoch 9/50
1241/1241 - 10s - 8ms/step - accuracy: 0.1597 - loss: 4.3459
Epoch 10/50
1241/1241 - 10s - 8ms/step - accuracy: 0.1685 - loss: 4.2177
Epoch 11/50
1241/1241 - 8s - 7ms/step - accuracy: 0.1752 - loss: 4.0966
Epoch 12/50
1241/1241 - 8s - 7ms/step - accuracy: 0.1833 - loss: 3.9829
Epoch 13/50
1241/1241 - 10s - 8ms/step - accuracy: 0.1930 - loss: 3.8748
Epoch 14/50
1241/1241 - 10s - 8ms/step - accuracy: 0.2053 - lo

<keras.src.callbacks.history.History at 0x7ca903343010>

In [6]:
# Calling fit again on the same `model` object continues training from its
# current weights for 40 more epochs (still without any validation data).
model.fit(X, y, epochs=40, verbose=2)

Epoch 1/40
1241/1241 - 8s - 6ms/step - accuracy: 0.5419 - loss: 1.9343
Epoch 2/40
1241/1241 - 10s - 8ms/step - accuracy: 0.5473 - loss: 1.9062
Epoch 3/40
1241/1241 - 10s - 8ms/step - accuracy: 0.5561 - loss: 1.8794
Epoch 4/40
1241/1241 - 8s - 7ms/step - accuracy: 0.5613 - loss: 1.8546
Epoch 5/40
1241/1241 - 10s - 8ms/step - accuracy: 0.5640 - loss: 1.8309
Epoch 6/40
1241/1241 - 10s - 8ms/step - accuracy: 0.5683 - loss: 1.8074
Epoch 7/40
1241/1241 - 8s - 7ms/step - accuracy: 0.5757 - loss: 1.7861
Epoch 8/40
1241/1241 - 10s - 8ms/step - accuracy: 0.5809 - loss: 1.7617
Epoch 9/40
1241/1241 - 10s - 8ms/step - accuracy: 0.5845 - loss: 1.7419
Epoch 10/40
1241/1241 - 10s - 8ms/step - accuracy: 0.5908 - loss: 1.7197
Epoch 11/40
1241/1241 - 10s - 8ms/step - accuracy: 0.5945 - loss: 1.7002
Epoch 12/40
1241/1241 - 10s - 8ms/step - accuracy: 0.5990 - loss: 1.6794
Epoch 13/40
1241/1241 - 10s - 8ms/step - accuracy: 0.6019 - loss: 1.6630
Epoch 14/40
1241/1241 - 10s - 8ms/step - accuracy: 0.6060 - los

<keras.src.callbacks.history.History at 0x7ca86dad0310>

In [7]:
# A third round on the same `model` object: 25 more epochs starting from the
# weights left by the previous fit call (still without any validation data).
model.fit(X, y, epochs=25, verbose=2)

Epoch 1/25
1241/1241 - 8s - 7ms/step - accuracy: 0.6637 - loss: 1.3334
Epoch 2/25
1241/1241 - 8s - 6ms/step - accuracy: 0.6650 - loss: 1.3234
Epoch 3/25
1241/1241 - 11s - 9ms/step - accuracy: 0.6660 - loss: 1.3209
Epoch 4/25
1241/1241 - 10s - 8ms/step - accuracy: 0.6651 - loss: 1.3143
Epoch 5/25
1241/1241 - 10s - 8ms/step - accuracy: 0.6671 - loss: 1.3089
Epoch 6/25
1241/1241 - 8s - 7ms/step - accuracy: 0.6671 - loss: 1.3031
Epoch 7/25
1241/1241 - 8s - 6ms/step - accuracy: 0.6689 - loss: 1.2971
Epoch 8/25
1241/1241 - 8s - 7ms/step - accuracy: 0.6691 - loss: 1.2895
Epoch 9/25
1241/1241 - 10s - 8ms/step - accuracy: 0.6694 - loss: 1.2877
Epoch 10/25
1241/1241 - 10s - 8ms/step - accuracy: 0.6701 - loss: 1.2807
Epoch 11/25
1241/1241 - 8s - 6ms/step - accuracy: 0.6703 - loss: 1.2744
Epoch 12/25
1241/1241 - 10s - 8ms/step - accuracy: 0.6725 - loss: 1.2702
Epoch 13/25
1241/1241 - 11s - 9ms/step - accuracy: 0.6721 - loss: 1.2681
Epoch 14/25
1241/1241 - 10s - 8ms/step - accuracy: 0.6709 - loss: 

<keras.src.callbacks.history.History at 0x7ca86daee210>

Save the model and tokenizer for future use

In [8]:
# Write the trained model to nlp_nxt.h5 (a relative path, so it is created
# in the current working directory).
model.save('nlp_nxt.h5')



In [None]:
# Save the tokenizer
# pickle is not imported by any earlier cell, so import it here; without this
# the cell fails with NameError on a fresh kernel.
import pickle

# NOTE: the testing cells load a file named tokenizer_nlp_nxt.pkl, so this
# file has to be renamed or copied before they can find it.
with open('tokenizer.pkl', 'wb') as file:
    pickle.dump(tokenizer, file)

# Model Testing Phase

In [3]:
# Locations of the trained artifacts (absolute, machine-specific paths)
model_path = r"C:\Users\PC\Downloads\nlp_nxt.h5"
tokenizer_path = r"C:\Users\PC\Downloads\tokenizer_nlp_nxt.pkl"
# Must equal the training-time max_sequence_len, which the notebook reports as 14:
# the model was built with input_length = max_sequence_len - 1 = 13, and the
# predictor pads prompts to max_sequence_len - 1 tokens. The previous value
# (14 + 1) padded prompts to 14 tokens, one more than the model was trained on.
max_sequence_len = 14

In [4]:
from keras.models import load_model
import pickle
from keras.preprocessing.sequence import pad_sequences
import numpy as np

def load_and_predict_multiple(model_path, tokenizer_path, input_text, max_sequence_len, num_words=1):
    """Extend ``input_text`` with the model's most likely next word(s).

    Args:
        model_path: Path of the saved Keras model, passed to ``load_model``.
        tokenizer_path: Path of the pickled tokenizer.
        input_text: Prompt to extend.
        max_sequence_len: Sequence length used during training; the prompt
            is padded on the left to ``max_sequence_len - 1`` tokens.
        num_words: Number of words to append, one prediction at a time
            (default 1).

    Returns:
        str: ``input_text`` followed by the predicted words, separated by
        single spaces. If the model predicts an index that has no word in
        the tokenizer (for example the padding index 0), generation stops
        and the text built so far is returned.

    Note:
        The model and tokenizer are loaded from disk on every call.
    """
    # Load the trained model
    model = load_model(model_path)

    # Load the tokenizer
    # NOTE(security): pickle.load can execute arbitrary code - only load
    # tokenizer files that come from a trusted source.
    with open(tokenizer_path, 'rb') as file:
        tokenizer = pickle.load(file)

    # Build the index -> word lookup once, instead of scanning word_index
    # (and recomputing argmax for every entry) on each prediction.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}

    # Predict multiple words
    for _ in range(num_words):
        # Tokenize and pad the input text
        token_list = tokenizer.texts_to_sequences([input_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len - 1, padding='pre')

        # Predict the next word
        predicted = model.predict(token_list, verbose=0)
        predicted_word = index_to_word.get(int(np.argmax(predicted)))
        if predicted_word is None:
            # No word for this index: stop rather than append a dangling
            # space and feed the same prompt back in.
            break

        # Append the predicted word to the input text
        input_text += " " + predicted_word

    return input_text


In [5]:
# Extend the prompt by one word (num_words is left at its default of 1)
input_sentence = "good"
next_word = load_and_predict_multiple(
    model_path=model_path,
    tokenizer_path=tokenizer_path,
    input_text=input_sentence,
    max_sequence_len=max_sequence_len,
)
print(f"Input: {input_sentence} | Predicted next word: {next_word}")




Input: good | Predicted next word: good night


In [6]:
# Extend the prompt by one word (num_words is left at its default of 1)
input_sentence = "where have you"
next_word = load_and_predict_multiple(
    model_path=model_path,
    tokenizer_path=tokenizer_path,
    input_text=input_sentence,
    max_sequence_len=max_sequence_len,
)
print(f"Input: {input_sentence} | Predicted next word: {next_word}")




Input: where have you | Predicted next word: where have you been


In [7]:
# Extend the prompt by one word (num_words is left at its default of 1)
input_sentence = "who are"
next_word = load_and_predict_multiple(
    model_path=model_path,
    tokenizer_path=tokenizer_path,
    input_text=input_sentence,
    max_sequence_len=max_sequence_len,
)
print(f"Input: {input_sentence} | Predicted next word: {next_word}")




Input: who are | Predicted next word: who are you


In [8]:
# Extend the prompt by one word (num_words is left at its default of 1)
input_sentence = "marry"
next_word = load_and_predict_multiple(
    model_path=model_path,
    tokenizer_path=tokenizer_path,
    input_text=input_sentence,
    max_sequence_len=max_sequence_len,
)
print(f"Input: {input_sentence} | Predicted next word: {next_word}")




Input: marry | Predicted next word: marry me


In [12]:
# Extend the prompt by one word (num_words is left at its default of 1)
input_sentence = "will you marry"
next_word = load_and_predict_multiple(
    model_path=model_path,
    tokenizer_path=tokenizer_path,
    input_text=input_sentence,
    max_sequence_len=max_sequence_len,
)
print(f"Input: {input_sentence} | Predicted next word: {next_word}")




Input: will you marry | Predicted next word: will you marry me


In [13]:
# Extend the prompt by one word (num_words is left at its default of 1)
input_sentence = "i love"
next_word = load_and_predict_multiple(
    model_path=model_path,
    tokenizer_path=tokenizer_path,
    input_text=input_sentence,
    max_sequence_len=max_sequence_len,
)
print(f"Input: {input_sentence} | Predicted next word: {next_word}")




Input: i love | Predicted next word: i love you


In [11]:
# Extend the prompt by one word (num_words is left at its default of 1)
input_sentence = "i love you"
next_word = load_and_predict_multiple(
    model_path=model_path,
    tokenizer_path=tokenizer_path,
    input_text=input_sentence,
    max_sequence_len=max_sequence_len,
)
print(f"Input: {input_sentence} | Predicted next word: {next_word}")




Input: i love you | Predicted next word: i love you too


In [20]:
# Extend the prompt by one word (num_words is left at its default of 1)
input_sentence = "thank"
next_word = load_and_predict_multiple(
    model_path=model_path,
    tokenizer_path=tokenizer_path,
    input_text=input_sentence,
    max_sequence_len=max_sequence_len,
)
print(f"Input: {input_sentence} | Predicted next word: {next_word}")




Input: thank | Predicted next word: thank you


In [1]:
# Print the version of the installed TensorFlow package
import tensorflow as tf
print(tf.__version__)

2.15.0


In [3]:
from keras.models import load_model
import pickle
from keras.preprocessing.sequence import pad_sequences
import numpy as np

# Absolute, machine-specific locations of the saved model and the pickled tokenizer.
# NOTE(review): these point into the project folder and differ from the
# Downloads paths assigned to the same names in the earlier testing cells.
model_path = r"C:\Users\PC\Desktop\Projects\git_uplod_NWP_frnds\nlp_nxt.h5"
tokenizer_path = r"C:\Users\PC\Desktop\Projects\git_uplod_NWP_frnds\tokenizer_nlp_nxt.pkl"

def load_and_predict_multiple(model_path, tokenizer_path, input_text, max_sequence_len, num_words=1):
    """Extend ``input_text`` with the model's most likely next word(s).

    Args:
        model_path: Path of the saved Keras model, passed to ``load_model``.
        tokenizer_path: Path of the pickled tokenizer.
        input_text: Prompt to extend.
        max_sequence_len: Sequence length used during training; the prompt
            is padded on the left to ``max_sequence_len - 1`` tokens.
        num_words: Number of words to append, one prediction at a time
            (default 1).

    Returns:
        str: ``input_text`` followed by the predicted words, separated by
        single spaces. If the model predicts an index that has no word in
        the tokenizer (for example the padding index 0), generation stops
        and the text built so far is returned instead of raising IndexError.

    Note:
        The model and tokenizer are loaded from disk on every call.
    """
    model = load_model(model_path)
    # NOTE(security): pickle.load can execute arbitrary code - only load
    # tokenizer files that come from a trusted source.
    with open(tokenizer_path, 'rb') as file:
        tokenizer = pickle.load(file)

    # Build the index -> word lookup once, instead of scanning word_index
    # (and recomputing argmax for every entry) on each prediction.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}

    for _ in range(num_words):
        token_list = tokenizer.texts_to_sequences([input_text])[0]
        token_list = pad_sequences([token_list], maxlen=max_sequence_len - 1, padding='pre')
        predicted = model.predict(token_list, verbose=0)
        predicted_word = index_to_word.get(int(np.argmax(predicted)))
        if predicted_word is None:
            # No word for this index: stop cleanly
            break
        input_text += " " + predicted_word

    return input_text

# Test it out
input_sentence = "i love"
# Pass the training-time max_sequence_len, which the notebook reports as 14:
# the function pads the prompt to max_sequence_len - 1 = 13 tokens, matching
# the model's input length. The previous value of 15 padded to 14 tokens.
next_word = load_and_predict_multiple(model_path, tokenizer_path, input_sentence, 14)
print(f"Input: {input_sentence} | Predicted: {next_word}")
print(np.__version__)



Input: i love | Predicted: i love you
1.24.3


In [7]:
# importlib.metadata is the standard-library replacement for the deprecated
# pkg_resources API and needs no setuptools install.
from importlib import metadata

required_packages = ["numpy", "keras"]

# Resolve every version before opening the file, so a failed lookup cannot
# leave a half-written requirements.txt behind.
pinned_requirements = []
for package in required_packages:
    try:
        version = metadata.version(package)
    except metadata.PackageNotFoundError:
        # Report and skip a package that is not installed in this environment
        print(f"Package {package} is not installed. Skipping.")
        continue
    pinned_requirements.append(f"{package}=={version}\n")

# Write one pinned "name==version" line per resolved package
with open("requirements.txt", "w") as f:
    f.writelines(pinned_requirements)

print("✅ requirements.txt created with necessary dependencies.")


✅ requirements.txt created with necessary dependencies.


In [8]:
from keras.models import load_model

# Reload the saved model without compiling it (compile=False) and write it
# back under a new file name; the original nlp_nxt.h5 is not overwritten.
model = load_model("C:\\Users\\PC\\Desktop\\Projects\\git_uplod_NWP_frnds\\nlp_nxt.h5", compile=False)
model.save("C:\\Users\\PC\\Desktop\\Projects\\git_uplod_NWP_frnds\\nlp_nxt_fixed.h5")


