# **Using Recurrent Neural Networks (RNN) to predict the next word**

### **Author: Partha Seetala**

**Video Tutorial: https://www.youtube.com/watch?v=VuzcUsg0GVs**

In [None]:
import numpy as np
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.utils import to_categorical

MAX_SENTENCE_LENGTH = 10
EMBEDDING_DIM = 300

def generate_sequences_from_text_data(tokenizer, sentences):
    """Turn raw sentences into (padded prefix, one-hot next token) training pairs.

    The tokenizer is fitted on ``sentences`` as a side effect. Every sentence
    is then expanded into all of its proper prefixes, each paired with the
    token that immediately follows it.

    Args:
        tokenizer: Tokenizer exposing ``fit_on_texts``, ``texts_to_sequences``
            and ``word_index``.
        sentences: Iterable of text strings to learn from.

    Returns:
        Tuple ``(x, ytrue)``: ``x`` holds the prefixes left-padded to
        ``MAX_SENTENCE_LENGTH``; ``ytrue`` holds the matching next tokens
        one-hot encoded over ``len(tokenizer.word_index) + 1`` classes.
    """
    # Build the vocabulary, then map every sentence to its list of token ids.
    tokenizer.fit_on_texts(sentences)
    encoded_sentences = tokenizer.texts_to_sequences(sentences)

    # Each split point yields one sample: tokens before it -> token at it.
    samples = [
        (token_ids[:split], token_ids[split])
        for token_ids in encoded_sentences
        for split in range(1, len(token_ids))
    ]
    prefixes = [prefix for prefix, _ in samples]
    next_tokens = [target for _, target in samples]

    # Left-pad so that all prefixes share the same fixed length.
    x = keras.preprocessing.sequence.pad_sequences(prefixes, maxlen=MAX_SENTENCE_LENGTH, padding='pre')

    # One extra class on top of the vocabulary entries (ids start after 0).
    vocab_size = len(tokenizer.word_index) + 1
    ytrue = to_categorical(next_tokens, num_classes=vocab_size)

    return x, ytrue

# Convert the training data into sequences

In [None]:
# Toy training corpus: short lowercase sentences grouped into five templates
# ("i ... with ... and ...", "the ... in the ...", "<name> goes to the ...",
# "we ... during the ...", "it is ... to ...").
sentences = [
    "i drink coffee with milk and sugar",
    "i brew tea with honey and lemon",
    "i make juice with apples and carrots",
    "i prepare cocoa with marshmallows and cinnamon",
    "i stir soup with salt and pepper",
    "i mix smoothies with yogurt and berries",
    "i shake cocktails with ice and lime",
    "i pour soda with syrup and mint",
    "i blend shakes with protein and almond milk",
    "i sip water with lemon and ice",
    "the sun rises in the east every morning",
    "the moon glows in the sky each night",
    "the cat sleeps in the basket all afternoon",
    "the dog plays in the park every weekend",
    "the bird sings in the tree at dawn",
    "the clock ticks in the hall continuously",
    "the river flows in the valley endlessly",
    "the wind blows in the fields daily",
    "the leaves fall in the autumn quietly",
    "the stars shine in the darkness brightly",
    "john goes to the gym every monday to lift weights",
    "mary goes to the market every tuesday to buy groceries",
    "tom goes to the library every wednesday to study",
    "anna goes to the park every thursday to jog",
    "paul goes to the cafe every friday to read",
    "lisa goes to the pool every saturday to swim",
    "mark goes to the mall every sunday to shop",
    "emma goes to the studio every day to dance",
    "alex goes to the office every morning to work",
    "sara goes to the garden every evening to relax",
    "we ski slopes during the winter often",
    "we swim lakes during the summer frequently",
    "we hike trails during the spring regularly",
    "we bike paths during the autumn sometimes",
    "we play sports during the weekends occasionally",
    "we pick flowers during the springtime joyfully",
    "we rake leaves during the fall daily",
    "we build fires during the winter nights",
    "we plant seeds during the early spring",
    "we harvest crops during the late autumn",
    "it is important to study hard every day",
    "it is fun to play games together often",
    "it is healthy to exercise regularly daily",
    "it is wise to save money early on",
    "it is nice to help others frequently",
    "it is challenging to solve puzzles quickly",
    "it is relaxing to read books quietly",
    "it is exciting to travel abroad yearly",
    "it is necessary to sleep well nightly",
    "it is rewarding to volunteer weekly"
]

# Tokenizer with a dedicated out-of-vocabulary token for unseen words.
tokenizer = Tokenizer(num_words=50000, oov_token='<OOV>')

x, ytrue = generate_sequences_from_text_data(tokenizer, sentences)

# Show every training sample as "[padded prefix] -> [one-hot next token]".
for padded_prefix, one_hot_target in zip(x, ytrue):
    prefix_text = ", ".join(f"{num:2d}" for num in padded_prefix)
    target_text = ", ".join(f"{y:1.0f}" for y in one_hot_target)
    print("[", prefix_text, "] -> ", "[", target_text, "]")

[  0,  0,  0,  0,  0,  0,  0,  0,  0,  5 ] ->  [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 ]
[  0,  0,  0,  0,  0,  0,  0,  0,  5, 35 ] ->  [ 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0

# Prepare the Embedding Matrix

In [None]:
#!pip install --upgrade numpy
#!pip install --upgrade gensim

from google.colab import drive
import gensim.downloader as api
from gensim.models import KeyedVectors
import os

def load_word_embeddings(model_name='word2vec-google-news-300', save_dir='/content/drive/MyDrive/cidl/embeddings'):
    """Return a pre-trained embedding model, using Google Drive as a cache.

    Google Drive is (re)mounted at ``/content/drive`` and ``save_dir`` is
    created if needed. If ``<save_dir>/<model_name>.model`` already exists it
    is loaded with ``KeyedVectors.load``; otherwise the model is fetched via
    the gensim downloader and saved to that path for later runs.

    Args:
        model_name: Name passed to ``gensim.downloader.load``.
        save_dir: Directory in which the model file is cached.

    Returns:
        The loaded embedding model.
    """
    drive.mount('/content/drive', force_remount=True)
    os.makedirs(save_dir, exist_ok=True)

    # Location of the cached copy for this particular model.
    save_path = os.path.join(save_dir, f'{model_name}.model')

    # Cache miss: download once, persist, and hand the fresh model back.
    if not os.path.exists(save_path):
        print(f"Downloading {model_name}...")
        model = api.load(model_name)
        print(f"Saving {model_name} to Google Drive...")
        model.save(save_path)
        print("Embedding saved successfully!")
        return model

    # Cache hit: read the previously saved model.
    print(f"Loading {model_name} from Google Drive...")
    model = KeyedVectors.load(save_path)
    print("Embedding loaded successfully!")
    return model

# Reuse embeddings that are already loaded in this kernel session. Resetting
# these names to None unconditionally made the `is None` guard below always
# true, so every re-run of the cell remounted Drive and reloaded the vectors.
# On a fresh kernel both names start as None, exactly as before.
# To force a reload, set the variable to None (or restart the kernel) first.
w2vec = globals().get("w2vec")
glove = globals().get("glove")

#if w2vec is None:
#    w2vec = load_word_embeddings("word2vec-google-news-300")

if glove is None:
    glove = load_word_embeddings(model_name='glove-wiki-gigaword-300')

ModuleNotFoundError: No module named 'gensim'

In [None]:
# Pre-trained embedding model used to fill the matrix below.
GLOBAL_EMBEDDINGS = glove

# One row per token id, plus one extra row on top of the word_index entries.
vocab_size = len(tokenizer.word_index) + 1
embedding_matrix = np.zeros((vocab_size, EMBEDDING_DIM))

for token, row in tokenizer.word_index.items():
    # Words missing from the pre-trained vocabulary keep their all-zero row.
    if token not in GLOBAL_EMBEDDINGS:
        continue
    # Copy the pre-trained vector into the row for this token id.
    embedding_matrix[row] = GLOBAL_EMBEDDINGS[token]

# Build RNN model

In [None]:
use_pretrained_embeddings = False

# Derive the vocabulary size here instead of relying on the optional
# embedding-matrix cell: that cell needs the pre-trained vectors, and when it
# is skipped or fails, this cell used to die with a NameError on `vocab_size`.
vocab_size = len(tokenizer.word_index) + 1

if use_pretrained_embeddings:
    # Requires `embedding_matrix` from the "Prepare the Embedding Matrix" cell.
    embedding_layer = layers.Embedding(
        input_dim=vocab_size,
        output_dim=EMBEDDING_DIM,
        weights=[embedding_matrix], # Load pre-trained Embedding weights
        trainable=False             # Don't update embeddings weights
    )
else:
    # Embeddings are learned from scratch together with the rest of the model.
    embedding_layer = layers.Embedding(input_dim=vocab_size, output_dim=EMBEDDING_DIM)

rnn = keras.Sequential([
    # Explicit input spec replaces the deprecated `input_length=` argument of
    # Embedding while still fixing the sequence length up front.
    keras.Input(shape=(MAX_SENTENCE_LENGTH,), dtype="int32"),
    embedding_layer,
    layers.SimpleRNN(100, return_sequences=False),
    # One softmax output per vocabulary id, matching the one-hot targets.
    layers.Dense(vocab_size, activation='softmax')
])

rnn.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy']
)

NameError: name 'vocab_size' is not defined

# Train RNN model

In [None]:
# Fit on the (padded prefix -> one-hot next token) pairs built earlier.
rnn.fit(
    x,
    ytrue,
    epochs=100,
    batch_size=32,
    verbose=1,
)

Epoch 1/100
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 19ms/step - accuracy: 0.0466 - loss: 5.2222
Epoch 2/100
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.1311 - loss: 4.7885
Epoch 3/100
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.1829 - loss: 4.4949
Epoch 4/100
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.2128 - loss: 4.2578
Epoch 5/100
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.2715 - loss: 3.8699
Epoch 6/100
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.3210 - loss: 3.6257
Epoch 7/100
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 10ms/step - accuracy: 0.3430 - loss: 3.4062
Epoch 8/100
[1m11/11[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 12ms/step - accuracy: 0.4342 - loss: 2.9764
Epoch 9/100
[1m11/11[0m [32m━━━━━━━━━

<keras.src.callbacks.history.History at 0x7b62f1149dd0>

# Predict Next Token(s)

In [None]:
def predict_next_tokens(rnn, tokenizer, seed_text, num_tokens=3):
    """Greedily extend ``seed_text`` with ``num_tokens`` predicted words.

    At each step the current token sequence is left-padded to
    ``MAX_SENTENCE_LENGTH``, fed to ``rnn``, and the highest-probability token
    id is appended to the sequence for the next step.

    Args:
        rnn: Model whose ``predict`` returns one probability vector per input row.
        tokenizer: Fitted tokenizer exposing ``texts_to_sequences`` and ``word_index``.
        seed_text: Text to continue.
        num_tokens: Number of prediction steps to run.

    Returns:
        ``seed_text`` followed by the predicted words, each preceded by a
        space and wrapped in ANSI bold escape codes. A predicted id with no
        entry in ``word_index`` (e.g. the padding id 0) adds no text.
    """
    # Reverse lookup (token id -> word), built once instead of scanning
    # word_index linearly for every predicted token.
    index_to_word = {index: word for word, index in tokenizer.word_index.items()}

    # STEP 1: Same as during training (convert the seed_text into sequences)
    sequence = tokenizer.texts_to_sequences([seed_text])[0]

    completed_text = seed_text

    for _ in range(num_tokens):
        # STEP 2: Pad the sequence
        padded_seq = keras.preprocessing.sequence.pad_sequences([sequence], maxlen=MAX_SENTENCE_LENGTH, padding='pre')

        # STEP 3: Predict next token probabilities
        ypred = rnn.predict(padded_seq, verbose=0)

        # STEP 4: Get the most likely token (as a plain int, so the sequence
        # stays a list of Python ints rather than mixing in NumPy scalars)
        tokenid = int(np.argmax(ypred[0]))

        # STEP 5: Convert token to word
        word = index_to_word.get(tokenid)
        if word is not None:
            completed_text += " " + '\033[1m' + word + '\033[0m'

        # STEP 6: Update the sequence for the next prediction
        sequence.append(tokenid)
        sequence = sequence[-MAX_SENTENCE_LENGTH:]  # Keep only last maxlen tokens
    return completed_text

In [None]:
# Seed phrases: the first five open training sentences; the last two mix
# words from different training sentences.
inference_sentences = [
    "i drink coffee",
    "the sun rises",
    "john goes to",
    "we ski slopes",
    "it is important",
    "we pour coffee",
    "alex the cat is"
]

# Extend every seed by four predicted words and show the result.
for sent in inference_sentences:
    completed = predict_next_tokens(rnn, tokenizer, sent, num_tokens=4)
    print(completed)


i drink coffee [1mwith[0m [1mmilk[0m [1mand[0m [1msugar[0m
the sun rises [1min[0m [1mthe[0m [1meast[0m [1mevery[0m
john goes to [1mthe[0m [1mgym[0m [1mevery[0m [1mmonday[0m
we ski slopes [1mduring[0m [1mthe[0m [1mwinter[0m [1moften[0m
it is important [1mto[0m [1mstudy[0m [1mhard[0m [1mevery[0m
we pour coffee [1mduring[0m [1mthe[0m [1mand[0m [1msometimes[0m
alex the cat is [1min[0m [1mthe[0m [1mbasket[0m [1mall[0m
