# NEXT WORD PREDICTION TASK

In [None]:
# By: Madhunisha M V

# 1) Predicting the next word in a sequence based on the input text

In [13]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
import numpy as np

# Training corpus: three short sentences, split on '.' further down
text = """Machine learning enables computers to learn from data and improve over time.
          It includes techniques like supervised, unsupervised, and reinforcement learning.
          Common applications are in image recognition, speech processing, and predictive analytics."""

# Fit a word-level tokenizer on the whole corpus.
# The vocabulary size is one larger than the number of distinct words.
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])
total_words = len(tokenizer.word_index) + 1

# For each sentence, collect every leading n-gram made of two or more tokens
input_sequences = []
for line in text.split('.'):
    token_list = tokenizer.texts_to_sequences([line])[0]
    input_sequences.extend(token_list[:end] for end in range(2, len(token_list) + 1))

# Left-pad every n-gram to the length of the longest one
max_sequence_len = max(map(len, input_sequences))
input_sequences = pad_sequences(input_sequences, maxlen=max_sequence_len, padding='pre')

# Predictors are all tokens but the last; the label is the last token, one-hot encoded
input_sequences = np.array(input_sequences)
X = input_sequences[:, :-1]
y = to_categorical(input_sequences[:, -1], num_classes=total_words)

# One-Hot Encode Input Sequences
def one_hot_encode_sequences(sequences, total_words):
    """One-hot encode an array of integer token indices.

    Args:
        sequences: Array-like of integer token indices, normally 2-D with one
            row per sequence and one column per timestep. Index 0 is treated
            as padding and is encoded as an all-zero vector.
        total_words: Size of the one-hot axis (the vocabulary size).

    Returns:
        An ``np.int32`` array of shape ``sequences.shape + (total_words,)``
        holding a single 1 at ``[..., index]`` for every non-zero index.

    Raises:
        IndexError: If an index is not a valid position on an axis of length
            ``total_words``.
    """
    sequences = np.asarray(sequences)
    one_hot_encoded = np.zeros(sequences.shape + (total_words,), dtype=np.int32)
    # Positions of all non-padding tokens, found in one vectorized pass
    # instead of a Python loop over every element.
    positions = np.nonzero(sequences)
    # Extend each position with its token index to address the one-hot axis.
    one_hot_encoded[positions + (sequences[positions],)] = 1
    return one_hot_encoded

# One-hot encode the predictors: (samples, timesteps) -> (samples, timesteps, total_words)
X = one_hot_encode_sequences(X, total_words)

# Build the Model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense

model = Sequential()
# Declare the input shape with an explicit Input object rather than passing
# `input_shape=` to the first layer, which Keras 3 warns against.
model.add(tf.keras.Input(shape=(max_sequence_len-1, total_words)))
model.add(LSTM(150))
# One softmax output per vocabulary entry, matching the one-hot labels in `y`
model.add(Dense(total_words, activation='softmax'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()

# Train the Model
history = model.fit(X, y, epochs=100, verbose=1)



  super().__init__(**kwargs)


Epoch 1/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step - accuracy: 0.0000e+00 - loss: 3.3956
Epoch 2/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 58ms/step - accuracy: 0.0345 - loss: 3.3844
Epoch 3/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 39ms/step - accuracy: 0.1034 - loss: 3.3731
Epoch 4/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step - accuracy: 0.1724 - loss: 3.3617
Epoch 5/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step - accuracy: 0.1724 - loss: 3.3501
Epoch 6/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 38ms/step - accuracy: 0.1724 - loss: 3.3381
Epoch 7/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step - accuracy: 0.1379 - loss: 3.3257
Epoch 8/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step - accuracy: 0.1034 - loss: 3.3126
Epoch 9/100
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m

In [14]:
# Prediction Function
def predict_next_word(model, tokenizer, text, max_sequence_len, total_words):
    """Predict the single most likely word to follow ``text``.

    Args:
        model: Trained model whose ``predict`` accepts the one-hot encoded,
            padded seed and returns one score per vocabulary index.
        tokenizer: Fitted tokenizer providing ``texts_to_sequences`` and the
            ``index_word`` reverse mapping.
        text: Seed text to continue.
        max_sequence_len: Length of the padded training sequences; the seed is
            left-padded to ``max_sequence_len - 1`` tokens.
        total_words: Size of the one-hot axis (the vocabulary size).

    Returns:
        The word with the highest predicted score, or ``""`` when the
        predicted index has no entry in ``tokenizer.index_word``.
    """
    token_list = tokenizer.texts_to_sequences([text])[0]
    token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
    token_list = one_hot_encode_sequences(token_list, total_words)
    predicted = model.predict(token_list, verbose=0)
    # A single sample is predicted, so take the first (only) argmax result.
    predicted_word_index = int(np.argmax(predicted, axis=-1)[0])
    # Direct reverse lookup instead of scanning every item of word_index.
    return tokenizer.index_word.get(predicted_word_index, "")

# Predict the next word for all sequences in the text
def predict_all_next_words(model, tokenizer, input_sequences, max_sequence_len, total_words):
    """Run ``predict_next_word`` on the text of every sequence.

    Each sequence of token indices is turned back into a space-separated seed
    string (index 0 is skipped as padding) and fed to ``predict_next_word``.

    Returns:
        A list of ``(seed_text, next_word)`` tuples in input order.
    """
    def _as_seed_text(sequence):
        # Map the non-padding indices back to words and join them.
        words = (tokenizer.index_word[index] for index in sequence if index != 0)
        return ' '.join(words)

    return [
        (seed_text, predict_next_word(model, tokenizer, seed_text, max_sequence_len, total_words))
        for seed_text in map(_as_seed_text, input_sequences)
    ]

# Word lists for every training sequence, with padding (index 0) removed.
# Not referenced again by the code visible in this notebook.
input_sequences_text = [
    [tokenizer.index_word[index] for index in seq if index != 0]
    for seq in input_sequences
]

# Ask the model for the next word after every training sequence
predictions = predict_all_next_words(
    model,
    tokenizer,
    input_sequences,
    max_sequence_len,
    total_words,
)

# Show each seed next to the word predicted for it
for seed_text, next_word in predictions:
    print(f"{seed_text} -> {next_word}")


machine learning -> enables
machine learning enables -> computers
machine learning enables computers -> to
machine learning enables computers to -> learn
machine learning enables computers to learn -> from
machine learning enables computers to learn from -> data
machine learning enables computers to learn from data -> and
machine learning enables computers to learn from data and -> improve
machine learning enables computers to learn from data and improve -> over
machine learning enables computers to learn from data and improve over -> time
machine learning enables computers to learn from data and improve over time -> time
it includes -> techniques
it includes techniques -> like
it includes techniques like -> supervised
it includes techniques like supervised -> unsupervised
it includes techniques like supervised unsupervised -> and
it includes techniques like supervised unsupervised and -> reinforcement
it includes techniques like supervised unsupervised and reinforcement -> learning
it

# ***************************************************************************************

# 2) Predicting the Next Word in a Large Dataset

# Import libraries and load data

In [None]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Bidirectional, Dense
import numpy as np

# Function to load text data from a file
def read_text_file(file_path):
    """Return the full contents of the UTF-8 text file at ``file_path`` as one string."""
    with open(file_path, 'r', encoding='utf-8') as handle:
        return handle.read()

# Load the raw corpus text from disk
# NOTE(review): this is an absolute, machine-specific Windows path, so the cell
# raises FileNotFoundError wherever that file does not exist. Consider a path
# relative to the notebook or a configurable data directory.
file_path = "C:\\Users\\kmpvi\\Downloads\\text.data.txt"  # Replace with your dataset file path
# The whole file is read into a single string
corpus = read_text_file(file_path)

# Data Preprocessing

In [None]:
# Fit a word-level tokenizer on the full corpus.
# The vocabulary size is one larger than the number of distinct words.
text_tokenizer = Tokenizer()
text_tokenizer.fit_on_texts([corpus])
vocab_size = len(text_tokenizer.word_index) + 1

# For each line of the corpus, collect every leading n-gram of two or more tokens
sequence_list = []
for line in corpus.split('\n'):
    tokens = text_tokenizer.texts_to_sequences([line])[0]
    sequence_list.extend(tokens[:end] for end in range(2, len(tokens) + 1))

# Left-pad every n-gram to the length of the longest one
max_seq_length = max(map(len, sequence_list))
padded_sequences = pad_sequences(sequence_list, maxlen=max_seq_length, padding='pre')

# Predictors are all tokens but the last; the label is the last token, one-hot encoded
padded_sequences = np.array(padded_sequences)
X_train = padded_sequences[:, :-1]
y_train = to_categorical(padded_sequences[:, -1], num_classes=vocab_size)


# Model Building and Training

In [1]:
# Define the Bi-directional LSTM model
text_generation_model = Sequential()
# Declare the sequence length with an explicit Input object; the Embedding
# `input_length` argument is deprecated in Keras 3.
text_generation_model.add(tf.keras.Input(shape=(max_seq_length-1,)))
# Map each token index to a 64-dimensional vector
text_generation_model.add(Embedding(vocab_size, 64))
text_generation_model.add(Bidirectional(LSTM(150)))
# One softmax output per vocabulary entry, matching the one-hot labels in `y_train`
text_generation_model.add(Dense(vocab_size, activation='softmax'))

# Compile the model
text_generation_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train the model
training_history = text_generation_model.fit(X_train, y_train, epochs=20, verbose=1)




Epoch 1/20
[1m3176/3176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m82s[0m 25ms/step - accuracy: 0.0574 - loss: 6.6158
Epoch 2/20
[1m3176/3176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m76s[0m 24ms/step - accuracy: 0.1051 - loss: 5.7160
Epoch 3/20
[1m3176/3176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m79s[0m 25ms/step - accuracy: 0.1358 - loss: 5.3075
Epoch 4/20
[1m3176/3176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m80s[0m 25ms/step - accuracy: 0.1513 - loss: 5.0074
Epoch 5/20
[1m3176/3176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m75s[0m 24ms/step - accuracy: 0.1654 - loss: 4.7635
Epoch 6/20
[1m3176/3176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m75s[0m 24ms/step - accuracy: 0.1770 - loss: 4.5429
Epoch 7/20
[1m3176/3176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m89s[0m 28ms/step - accuracy: 0.1905 - loss: 4.3487
Epoch 8/20
[1m3176/3176[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m81s[0m 26ms/step - accuracy: 0.2068 - loss: 4.1486
Epoch 9/

In [11]:
# Function to generate text
def create_text(model, tokenizer, start_text, max_seq_length, num_words):
    """Extend ``start_text`` by repeatedly appending the most likely next word.

    Args:
        model: Trained model whose ``predict`` accepts the padded token
            sequence and returns one score per vocabulary index.
        tokenizer: Fitted tokenizer providing ``texts_to_sequences`` and the
            ``index_word`` reverse mapping.
        start_text: Seed text that the generated words are appended to.
        max_seq_length: Length of the padded training sequences; the running
            text is left-padded to ``max_seq_length - 1`` tokens.
        num_words: Maximum number of words to append.

    Returns:
        ``start_text`` followed by the generated words, separated by single
        spaces. Generation stops early when the predicted index has no entry
        in ``tokenizer.index_word``.
    """
    generated_text = start_text
    for _ in range(num_words):
        tokens = tokenizer.texts_to_sequences([generated_text])[0]
        tokens = pad_sequences([tokens], maxlen=max_seq_length-1, padding='pre')
        predicted = model.predict(tokens, verbose=0)
        # A single sample is predicted, so take the first (only) argmax result.
        predicted_word_index = int(np.argmax(predicted, axis=-1)[0])
        # Direct reverse lookup instead of scanning every item of word_index.
        next_word = tokenizer.index_word.get(predicted_word_index, "")
        if not next_word:
            # No word maps to this index. Appending an empty word would only add
            # a dangling space per remaining step, so stop here.
            break
        generated_text += " " + next_word
    return generated_text


# Seed phrase that the model continues by up to ten words
initial_text = "“Is Briony Lodge, "
generated_text = create_text(
    model=text_generation_model,
    tokenizer=text_tokenizer,
    start_text=initial_text,
    max_seq_length=max_seq_length,
    num_words=10,
)
print(f"Generated text: {generated_text}")


Generated text: “Is Briony Lodge, Serpentine Avenue, St. John’s Wood.”
