In [27]:
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

# Read the whole corpus text file into memory as one string.
corpus_path = "/content/drive/MyDrive/Next word prediction using RNN/bangalore.txt"
with open(corpus_path, mode='r', encoding='utf-8') as myfile:
    mytext = myfile.read()

In [28]:
# Mount Google Drive on a fresh Colab runtime. The corpus file (bangalore.txt) is read
# from a path under /content/drive/MyDrive, so uncomment and run these two lines before
# the file is opened if Drive is not mounted yet.
# from google.colab import drive
# drive.mount('/content/drive')

In [29]:
# Show the raw corpus string exactly as it was read from the file.
mytext

'The bustling city of Bengaluru, often referred to as the "Silicon Valley of India," is a vibrant hub of technological innovation and cultural diversity. Nestled amidst lush greenery, the city boasts a cosmopolitan population with a strong entrepreneurial spirit. From its towering IT companies and bustling startup scene to its historic temples and vibrant street art, Bengaluru offers a unique blend of the modern and the traditional.\n\nThe city\'s infrastructure reflects its rapid growth, with a well-developed network of metro lines and flyovers efficiently navigating the urban landscape. Bustling markets and contemporary shopping malls cater to every need, while serene parks and sprawling gardens provide a welcome respite from the city\'s energy.\n\nBengaluru\'s culinary scene is a melting pot of flavors, offering a delectable mix of South Indian staples like dosa and idli alongside international cuisines from around the globe. The city\'s vibrant nightlife caters to diverse tastes, w

In [30]:
# Learn the word -> integer id vocabulary from the full corpus (passed as one document).
mytokenizer = Tokenizer()
mytokenizer.fit_on_texts([mytext])

# Word ids start at 1; id 0 is used only as the padding value, so the model needs
# one more output slot than there are distinct words.
vocabulary = mytokenizer.word_index
total_words = 1 + len(vocabulary)

In [31]:
# Inspect the word -> integer id mapping built by the tokenizer.
mytokenizer.word_index

{'the': 1,
 'and': 2,
 'a': 3,
 'of': 4,
 'to': 5,
 'its': 6,
 "city's": 7,
 'bengaluru': 8,
 'with': 9,
 'from': 10,
 'for': 11,
 'bustling': 12,
 'city': 13,
 'as': 14,
 'vibrant': 15,
 'spirit': 16,
 'it': 17,
 'scene': 18,
 'rapid': 19,
 'growth': 20,
 'like': 21,
 'is': 22,
 'boasts': 23,
 'strong': 24,
 'entrepreneurial': 25,
 'art': 26,
 'traditional': 27,
 'contemporary': 28,
 'serene': 29,
 'gardens': 30,
 'energy': 31,
 "bengaluru's": 32,
 'offering': 33,
 'live': 34,
 'residents': 35,
 'specific': 36,
 'your': 37,
 'often': 38,
 'referred': 39,
 'silicon': 40,
 'valley': 41,
 'india': 42,
 'hub': 43,
 'technological': 44,
 'innovation': 45,
 'cultural': 46,
 'diversity': 47,
 'nestled': 48,
 'amidst': 49,
 'lush': 50,
 'greenery': 51,
 'cosmopolitan': 52,
 'population': 53,
 'towering': 54,
 'companies': 55,
 'startup': 56,
 'historic': 57,
 'temples': 58,
 'street': 59,
 'offers': 60,
 'unique': 61,
 'blend': 62,
 'modern': 63,
 'infrastructure': 64,
 'reflects': 65,
 'well

In [32]:
# Build the training sequences: for every newline-separated line of the corpus,
# encode it to word ids and emit every prefix of length >= 2
# (ids[:2], ids[:3], ..., ids[:len]). The last id of each prefix later becomes
# the prediction target and the ids before it the context.
my_input_sequences = [
    encoded[:end]
    for paragraph in mytext.split('\n')
    for encoded in mytokenizer.texts_to_sequences([paragraph])
    for end in range(2, len(encoded) + 1)
]

In [33]:
# Length of the longest n-gram prefix; every sequence is padded up to this length.
max_sequence_len = max(len(seq) for seq in my_input_sequences)

# Left-pad with zeros so the real tokens (and the target word in the last column)
# are right-aligned. pad_sequences already returns a NumPy array, so no extra
# np.array(...) copy is needed.
input_sequences = pad_sequences(my_input_sequences, maxlen=max_sequence_len, padding='pre')

In [34]:
# Peek at one padded row: the leading zeros are the 'pre' padding, the trailing ids are the tokens.
input_sequences[1]

array([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1, 12, 13],
      dtype=int32)

In [35]:
# Split every padded row into model input and label:
#   X -> all columns except the last one (the context tokens, left-padded)
#   y -> the last column (the id of the word to predict)
X, y = input_sequences[:, :-1], input_sequences[:, -1]

In [36]:
# Row 1 of the model inputs: the padded sequence with its final token sliced off.
X[1]

array([ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1, 12],
      dtype=int32)

In [37]:
# Integer id of the target (last) token of every training sequence.
y

array([ 12,  13,   4,   8,  38,  39,   5,  14,   1,  40,  41,   4,  42,
        22,   3,  15,  43,   4,  44,  45,   2,  46,  47,  48,  49,  50,
        51,   1,  13,  23,   3,  52,  53,   9,   3,  24,  25,  16,  10,
         6,  54,  17,  55,   2,  12,  56,  18,   5,   6,  57,  58,   2,
        15,  59,  26,   8,  60,   3,  61,  62,   4,   1,  63,   2,   1,
        27,   7,  64,  65,   6,  19,  20,   9,   3,  66,  67,  68,   4,
        69,  70,   2,  71,  72,  73,   1,  74,  75,  12,  76,   2,  28,
        77,  78,  79,   5,  80,  81,  82,  29,  83,   2,  84,  30,  85,
         3,  86,  87,  10,   1,   7,  31,  88,  18,  22,   3,  89,  90,
         4,  91,  33,   3,  92,  93,   4,  94,  95,  96,  21,  97,   2,
        98,  99, 100, 101,  10, 102,   1, 103,   1,   7,  15, 104, 105,
         5, 106, 107,   9, 108, 109,   2, 110, 111,  33,  34, 112,   2,
       113,  11, 114,   1, 116, 117,   8,  23,   3, 118, 119, 120, 121,
         6, 122, 123,  21,   1, 124, 125, 126,   2,   1,  29, 12

In [38]:
# One-hot encode the target word ids (one column per vocabulary slot) for the
# categorical cross-entropy loss.
# The labels are taken from the last column of `input_sequences` instead of from `y`
# itself so that this cell is idempotent: re-running it no longer one-hot encodes an
# already one-hot `y` (which would silently add an extra axis).
# to_categorical already returns a NumPy array, so no np.array(...) wrapper is needed.
y = tf.keras.utils.to_categorical(input_sequences[:, -1], num_classes=total_words)

In [39]:
# One-hot target for row 1: a single 1.0 at the position of the target word id.
y[1]

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 1., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0.

In [40]:
# Next-word model: embedding -> single LSTM -> softmax over the whole vocabulary.
EMBEDDING_DIM = 100  # size of each learned word vector
LSTM_UNITS = 150     # hidden state size of the recurrent layer

model = Sequential([
    # Input rows have max_sequence_len - 1 tokens because the last token of each
    # padded sequence was split off as the label.
    Embedding(total_words, EMBEDDING_DIM, input_length=max_sequence_len - 1),
    LSTM(LSTM_UNITS),
    # One probability per vocabulary slot (including the unused padding id 0).
    Dense(total_words, activation='softmax'),
])

# summary() prints the table itself and returns None; wrapping it in print()
# only appended a stray "None" line to the output.
model.summary()

Model: "sequential_1"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 66, 100)           22800     
                                                                 
 lstm_1 (LSTM)               (None, 150)               150600    
                                                                 
 dense_1 (Dense)             (None, 228)               34428     
                                                                 
Total params: 207828 (811.83 KB)
Trainable params: 207828 (811.83 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________
None


In [41]:
# Targets are one-hot vectors, hence categorical cross-entropy.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Keep the History object returned by fit() so the per-epoch loss/accuracy remain
# available afterwards (history.history) instead of being discarded; assigning it also
# avoids the bare "<...History at 0x...>" repr as the cell output.
history = model.fit(X, y, epochs=500, verbose=1)

Epoch 1/500
Epoch 2/500
Epoch 3/500
Epoch 4/500
Epoch 5/500
Epoch 6/500
Epoch 7/500
Epoch 8/500
Epoch 9/500
Epoch 10/500
Epoch 11/500
Epoch 12/500
Epoch 13/500
Epoch 14/500
Epoch 15/500
Epoch 16/500
Epoch 17/500
Epoch 18/500
Epoch 19/500
Epoch 20/500
Epoch 21/500
Epoch 22/500
Epoch 23/500
Epoch 24/500
Epoch 25/500
Epoch 26/500
Epoch 27/500
Epoch 28/500
Epoch 29/500
Epoch 30/500
Epoch 31/500
Epoch 32/500
Epoch 33/500
Epoch 34/500
Epoch 35/500
Epoch 36/500
Epoch 37/500
Epoch 38/500
Epoch 39/500
Epoch 40/500
Epoch 41/500
Epoch 42/500
Epoch 43/500
Epoch 44/500
Epoch 45/500
Epoch 46/500
Epoch 47/500
Epoch 48/500
Epoch 49/500
Epoch 50/500
Epoch 51/500
Epoch 52/500
Epoch 53/500
Epoch 54/500
Epoch 55/500
Epoch 56/500
Epoch 57/500
Epoch 58/500
Epoch 59/500
Epoch 60/500
Epoch 61/500
Epoch 62/500
Epoch 63/500
Epoch 64/500
Epoch 65/500
Epoch 66/500
Epoch 67/500
Epoch 68/500
Epoch 69/500
Epoch 70/500
Epoch 71/500
Epoch 72/500
Epoch 73/500
Epoch 74/500
Epoch 75/500
Epoch 76/500
Epoch 77/500
Epoch 78

<keras.src.callbacks.History at 0x7da383f5f760>

In [42]:
# Seed text and the number of words to append to it.
input_text = "bengaluru faces challenges like"
predict_next_words = 6

# Greedy generation: repeatedly encode the text so far, ask the model for the most
# probable next word id, and append the corresponding word to the text.
for _ in range(predict_next_words):
    token_list = mytokenizer.texts_to_sequences([input_text])[0]
    print(token_list)
    # Pad to the same input length the model was built and trained with.
    token_list = pad_sequences([token_list], maxlen=max_sequence_len-1, padding='pre')
    # predict() returns one probability row per input row; we pass a single row, so take
    # element 0 and convert it to a plain int instead of comparing against a 1-element array.
    predicted = int(np.argmax(model.predict(token_list, verbose=0), axis=-1)[0])
    # Direct id -> word lookup instead of scanning word_index for every generated word.
    # Id 0 (padding) has no word; fall back to "" exactly as the original loop did.
    output_word = mytokenizer.index_word.get(predicted, "")
    input_text += " " + output_word

print(input_text)

[8, 155, 156, 21]
[8, 155, 156, 21, 157]
[8, 155, 156, 21, 157, 158]
[8, 155, 156, 21, 157, 158, 2]
[8, 155, 156, 21, 157, 158, 2, 140]
[8, 155, 156, 21, 157, 158, 2, 140, 141]
bengaluru faces challenges like traffic congestion and welcoming nature readily
