In [73]:
import numpy as np
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.utils import to_categorical

## Load and Preprocess the Text

In [56]:
# Sample text (you can replace this with a larger dataset)
text = """
Long Short-Term Memory (LSTM) networks, a variant of Recurrent Neural Networks (RNNs), are particularly effective for next word prediction tasks due to their ability to retain long-term dependencies in sequential data. These networks solve the vanishing gradient problem that limits traditional RNNs by introducing a memory cell and gated mechanisms to control the flow of information.
For next word prediction, an LSTM processes input sequences, such as a sentence fragment, to predict the most probable word that follows. The model starts by tokenizing and encoding the text into numerical sequences, ensuring that each word is represented as a unique token. During training, sliding windows of word sequences are created to generate input-output pairs, where the input is a sequence of preceding words and the output is the next word.
After passing through an embedding layer for dense word vector representations, the LSTM layer captures temporal dependencies in the input sequence. A final dense layer with a softmax activation predicts the probability of each word in the vocabulary as the next word. The model learns contextual relationships in text, enabling it to generate coherent and meaningful predictions. For example, given the input "The weather is very," the model might predict "sunny" as the next word.
This approach can be extended with techniques like bidirectional LSTMs, beam search, or temperature sampling to improve context understanding and diversity in predictions. Such models find applications in auto-completion, chatbots, and language modeling, making them essential in natural language processing tasks.
"""

# Tokenize the text
tokenizer = Tokenizer()
tokenizer.fit_on_texts([text])
word_index = tokenizer.word_index  # Dictionary of word -> index
vocab_size = len(word_index) + 1   # Vocabulary size (+1 for padding)

# Convert text to sequences of integers
sequences = []
for sentence in text.split("."):
    tokens = tokenizer.texts_to_sequences([sentence])[0]
    for i in range(1, len(tokens)):
        # Create input-output pairs
        sequences.append(tokens[:i+1])

# Pad sequences to ensure uniform length
max_len = max([len(x) for x in sequences])


padded_input_sequences = pad_sequences(sequences, maxlen = max_len, padding='pre')

# Split into inputs (X) and outputs (y)
X = padded_input_sequences[:,:-1]
y = padded_input_sequences[:, -1]

# Convert outputs to one-hot encoding
y = to_categorical(y, num_classes=vocab_size)

In [57]:
# Reshape X to have the required dimensions (samples, timesteps, features)
X = X.reshape((X.shape[0], X.shape[1], 1))

In [58]:
X.shape

(238, 32, 1)

In [59]:
y.shape

(238, 147)

In [60]:
vocab_size

147

In [65]:
print("Shape of X:", X.shape)
print("Shape of y:", y.shape)

Shape of X: (238, 32, 1)
Shape of y: (238, 147)


In [66]:
# Remove the extra dimension from X
X = X.squeeze()  # Shape becomes (238, 32)

In [67]:
X.shape

(238, 32)

## Build the LSTM Model

In [78]:
# Define the model
model = Sequential([
    Embedding(input_dim=vocab_size, output_dim=100, input_length=X.shape[1]),  # Embedding layer
    LSTM(150, return_sequences=True),  # First LSTM, returns sequences
    LSTM(150),                         # Second LSTM, returns the final state
    Dense(vocab_size, activation='softmax')  # Output layer
])

In [79]:
model.compile(loss='categorical_crossentropy', optimizer='adam',metrics=['accuracy'])

In [80]:
model.summary()

## train model

In [81]:
# Train the model
model.fit(X, y, epochs=100, batch_size=32)

Epoch 1/100
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 108ms/step - accuracy: 0.0102 - loss: 4.9898
Epoch 2/100
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 116ms/step - accuracy: 0.0572 - loss: 4.9288
Epoch 3/100
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 111ms/step - accuracy: 0.0647 - loss: 4.7364
Epoch 4/100
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 107ms/step - accuracy: 0.0626 - loss: 4.6721
Epoch 5/100
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 110ms/step - accuracy: 0.0719 - loss: 4.5703
Epoch 6/100
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 106ms/step - accuracy: 0.0500 - loss: 4.5899
Epoch 7/100
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 110ms/step - accuracy: 0.0819 - loss: 4.4652
Epoch 8/100
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 112ms/step - accuracy: 0.0595 - loss: 4.4457
Epoch 9/100
[1m8/8[0m [32m━━━━━━━━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x7c1f2b864b80>

## Predict the Next Word

In [72]:
import time
text = "After passing through an embedding"

for i in range(10):
  # tokenize
  token_text = tokenizer.texts_to_sequences([text])[0]
  # padding
  padded_token_text = pad_sequences([token_text], maxlen=56, padding='pre')
  # predict
  pos = np.argmax(model.predict(padded_token_text))

  for word,index in tokenizer.word_index.items():
    if index == pos:
      text = text + " " + word
      print(text)
      time.sleep(2)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 286ms/step
After passing through an embedding layer
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 124ms/step
After passing through an embedding layer for
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
After passing through an embedding layer for dense
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 58ms/step
After passing through an embedding layer for dense dense
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 36ms/step
After passing through an embedding layer for dense dense word
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step
After passing through an embedding layer for dense dense word vector
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
After passing through an embedding layer for dense dense word vector vector
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
After passing through