##### Import the libraries

In [3]:
import numpy as np
import keras
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense

##### Define the source data

In [5]:
# Tiny training corpus: ten short conversational phrases.
text_data = [
    # greetings
    "hello how are you", "hello how have you been",
    "hi there", "good morning", "good night",
    # small talk
    "have a nice day", "how is it going", "how have you been",
    # courtesies
    "nice to meet you", "thank you",
]

##### Create the tokenizer

In [53]:
# Fit a word-level tokenizer on the sample phrases.
tokenizer = Tokenizer()
tokenizer.fit_on_texts(text_data)

# Learned word indices start at 1, so reserve one extra slot for index 0
# (the value later used for padding).
total_words = 1 + len(tokenizer.word_index)

In [54]:
# Vocabulary size plus one, i.e. len(tokenizer.word_index) + 1.
total_words

21

In [55]:
# Word -> integer id mapping learned by the tokenizer from text_data.
tokenizer.word_index

{'you': 1,
 'how': 2,
 'have': 3,
 'hello': 4,
 'been': 5,
 'good': 6,
 'nice': 7,
 'are': 8,
 'hi': 9,
 'there': 10,
 'morning': 11,
 'night': 12,
 'a': 13,
 'day': 14,
 'is': 15,
 'it': 16,
 'going': 17,
 'to': 18,
 'meet': 19,
 'thank': 20}

##### Prepare the input sequences

In [56]:
# Expand every phrase into its token-id prefixes of length 2..len(phrase):
# the last id of each prefix is the word to predict from the ids before it.
# Phrases with fewer than two known tokens contribute nothing.
input_sequences = [
    token_ids[:prefix_len]
    for token_ids in tokenizer.texts_to_sequences(text_data)
    for prefix_len in range(2, len(token_ids) + 1)
]

In [57]:
# Inspect the variable-length token-id prefixes built from every phrase.
input_sequences

[[4, 2],
 [4, 2, 8],
 [4, 2, 8, 1],
 [4, 2],
 [4, 2, 3],
 [4, 2, 3, 1],
 [4, 2, 3, 1, 5],
 [9, 10],
 [6, 11],
 [6, 12],
 [3, 13],
 [3, 13, 7],
 [3, 13, 7, 14],
 [2, 15],
 [2, 15, 16],
 [2, 15, 16, 17],
 [2, 3],
 [2, 3, 1],
 [2, 3, 1, 5],
 [7, 18],
 [7, 18, 19],
 [7, 18, 19, 1],
 [20, 1]]

##### Padding of sequences

In [58]:
# The longest prefix fixes the common width of the padded matrix.
max_sequence_len = max(map(len, input_sequences))

# Left-pad shorter prefixes with zeros so the target word always sits
# in the last column.
input_sequences = pad_sequences(
    input_sequences,
    padding='pre',
    maxlen=max_sequence_len,
)

In [59]:
# Inspect the sequences after left-padding with zeros to max_sequence_len.
input_sequences

array([[ 0,  0,  0,  4,  2],
       [ 0,  0,  4,  2,  8],
       [ 0,  4,  2,  8,  1],
       [ 0,  0,  0,  4,  2],
       [ 0,  0,  4,  2,  3],
       [ 0,  4,  2,  3,  1],
       [ 4,  2,  3,  1,  5],
       [ 0,  0,  0,  9, 10],
       [ 0,  0,  0,  6, 11],
       [ 0,  0,  0,  6, 12],
       [ 0,  0,  0,  3, 13],
       [ 0,  0,  3, 13,  7],
       [ 0,  3, 13,  7, 14],
       [ 0,  0,  0,  2, 15],
       [ 0,  0,  2, 15, 16],
       [ 0,  2, 15, 16, 17],
       [ 0,  0,  0,  2,  3],
       [ 0,  0,  2,  3,  1],
       [ 0,  2,  3,  1,  5],
       [ 0,  0,  0,  7, 18],
       [ 0,  0,  7, 18, 19],
       [ 0,  7, 18, 19,  1],
       [ 0,  0,  0, 20,  1]], dtype=int32)

In [60]:
# (number of prefix sequences, max_sequence_len)
input_sequences.shape

(23, 5)

##### Split into features and labels

In [63]:
# Every column except the last is model input; the last column holds the
# id of the word to predict, which is one-hot encoded over the vocabulary.
input_sequences = np.array(input_sequences)
X = input_sequences[:, :-1]
y = keras.utils.to_categorical(input_sequences[:, -1], num_classes=total_words)

In [64]:
# Last column of every padded row: the integer id of the target word.
input_sequences[:,-1]

array([ 2,  8,  1,  2,  3,  1,  5, 10, 11, 12, 13,  7, 14, 15, 16, 17,  3,
        1,  5, 18, 19,  1,  1], dtype=int32)

In [65]:
# One-hot encoded targets: one row per sequence, total_words columns.
y

array([[0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0.],
       [0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0., 0.,
        0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
        0., 0., 0.,

##### Build the model

In [67]:
# Build the RNN model
# Fix every RNG (Python, NumPy, backend) before any layer is created so that
# weight initialisation and batch shuffling are repeatable across
# "Restart & Run All"; without this each run trains a different model.
SEED = 42
keras.utils.set_random_seed(SEED)

model = Sequential()
# Map each of the total_words ids to a 10-dimensional vector; the input is
# one token shorter than the padded sequences because the last token is the label.
model.add(Embedding(total_words, 10, input_length=max_sequence_len-1))
# First LSTM returns its output at every timestep so the second LSTM
# receives a full sequence.
model.add(LSTM(150, return_sequences=True))
model.add(LSTM(100))
# One softmax probability per vocabulary id.
model.add(Dense(total_words, activation='softmax'))

In [68]:
# Print each layer's output shape and parameter count.
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 4, 10)             210       
                                                                 
 lstm (LSTM)                 (None, 4, 150)            96600     
                                                                 
 lstm_1 (LSTM)               (None, 100)               100400    
                                                                 
 dense (Dense)               (None, 21)                2121      
                                                                 
Total params: 199331 (778.64 KB)
Trainable params: 199331 (778.64 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


##### Compile the model

In [69]:
# Targets are one-hot vectors, hence categorical cross-entropy.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

##### Train the model

In [74]:
# Train for 200 passes over the data, one sample per gradient update.
model.fit(x=X, y=y, batch_size=1, epochs=200)

Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78

<keras.src.callbacks.History at 0x765bcc9a10f0>

##### Prediction on new data

In [81]:
def predict_next_word(model, tokenizer, text, max_sequence_len):
    """Predict the word most likely to follow ``text``.

    Args:
        model: Trained model whose ``predict`` returns, for each input row,
            one score per vocabulary index.
        tokenizer: Fitted Keras ``Tokenizer`` that produced the training ids.
        text: Seed phrase to continue.
        max_sequence_len: Width the training sequences were padded to
            (including the target word); the model input is one shorter.

    Returns:
        The predicted word, or ``""`` when ``text`` contains no word known
        to the tokenizer or the predicted index maps to no word (for
        example the padding index 0).
    """
    token_list = tokenizer.texts_to_sequences([text])[0]
    if not token_list:
        # No known words: an all-padding row was never seen in training,
        # so a prediction from it would be an arbitrary guess.
        return ""

    # Left-pad to the model's input width; with the default truncation
    # setting, over-long input keeps only its last tokens.
    padded = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
    predicted = model.predict(padded, verbose=0)

    # A single row was submitted, so take the arg-max of row 0.
    predicted_word_index = int(np.argmax(predicted, axis=-1)[0])

    # index_word is the tokenizer's id -> word mapping; a direct lookup
    # replaces scanning every entry of word_index.
    return tokenizer.index_word.get(predicted_word_index, "")

In [87]:
# Seed phrase whose continuation we want the model to suggest.
new = 'nice'
next_word = predict_next_word(
    model=model,
    tokenizer=tokenizer,
    text=new,
    max_sequence_len=max_sequence_len,
)
print(f"Next word: {next_word}")

Next word: to
