In [5]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

In [6]:
# Toy training corpus: seven sentences separated by newlines, so that
# text.split('\n') yields exactly one sentence per element.
text = (
    "Data plays a vital role in our everyday life.\n"
    "Directly or indirectly, for daily life decisions, we depend on some data, be it choosing a novel to read from a list of books, buying a thing after considering the budget, and so on.\n"
    "Have you ever imagined searching for something on Google or Yahoo generates a lot of data?\n"
    "This data is essential to analyze user experiences.\n"
    "Getting recommendations on various e-commerce websites after buying a product and tracking parcels during delivery are part of Data Analytics which involves analyzing the raw data to make informed decisions.\n"
    "But this raw data does not help make decisions if it has some redundancy, inconsistency, or inaccuracy.\n"
    "Therefore, this data needs to be cleaned before considering for analysis."
)

In [7]:
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer

In [8]:
# Create the Keras tokenizer with its default settings (no vocabulary limit
# and no out-of-vocabulary token are configured here).

tokenizer = Tokenizer()

In [9]:
# Learn the word -> id vocabulary from the corpus; the whole text is passed
# as a single document.
tokenizer.fit_on_texts([text])

In [10]:
# Number of distinct words the tokenizer learned from the corpus.
len(tokenizer.word_index)

87

In [11]:
# Show the integer encoding of every sentence (one sentence per line of `text`).
for encoded_sentence in tokenizer.texts_to_sequences(text.split('\n')):
    print(encoded_sentence)

[1, 21, 2, 22, 23, 24, 25, 26, 10]
[27, 5, 28, 6, 29, 10, 7, 30, 31, 3, 11, 1, 12, 13, 32, 2, 33, 4, 34, 35, 2, 36, 8, 37, 14, 2, 38, 15, 16, 17, 39, 18, 40, 3]
[41, 42, 43, 44, 45, 6, 46, 3, 47, 5, 48, 49, 2, 50, 8, 1]
[9, 1, 51, 52, 4, 53, 54, 55]
[56, 57, 3, 58, 59, 60, 61, 15, 14, 2, 62, 18, 63, 64, 65, 66, 67, 68, 8, 1, 69, 70, 71, 72, 17, 19, 1, 4, 20, 73, 7]
[74, 9, 19, 1, 75, 76, 77, 20, 7, 78, 13, 79, 11, 80, 81, 5, 82]
[83, 9, 1, 84, 4, 12, 85, 86, 16, 6, 87]


In [12]:
# Build the training examples: for each sentence, every prefix that contains
# at least two tokens. The last token of a prefix later becomes the target
# and the tokens before it become the context.
encoded_sentences = tokenizer.texts_to_sequences(text.split('\n'))

input_sequence = [
    tokens[:end]
    for tokens in encoded_sentences
    for end in range(2, len(tokens) + 1)
]


In [13]:
# Peek at the first ten prefix sequences.
input_sequence[:10]

[[1, 21],
 [1, 21, 2],
 [1, 21, 2, 22],
 [1, 21, 2, 22, 23],
 [1, 21, 2, 22, 23, 24],
 [1, 21, 2, 22, 23, 24, 25],
 [1, 21, 2, 22, 23, 24, 25, 26],
 [1, 21, 2, 22, 23, 24, 25, 26, 10],
 [27, 5],
 [27, 5, 28]]

In [14]:
# Length of the longest prefix; every sequence is padded to this width.
max_len = max(map(len, input_sequence))

In [15]:
# Display the longest prefix length.
max_len

34

In [16]:
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Left-pad ('pre') every prefix with zeros up to max_len, so all rows have the
# same width and the most recent token always sits in the last column.
padded_input_sequence = pad_sequences(input_sequence, maxlen=max_len, padding='pre')

In [17]:
# First four padded rows: the zeros are added on the left.
padded_input_sequence[:4]

array([[ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         1, 21],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,
        21,  2],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1, 21,
         2, 22],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1, 21,  2,
        22, 23]], dtype=int32)

In [18]:
# Split each padded row into the model input (every column but the last)
# and the target (the last column, i.e. the id of the word to predict).
x, y = padded_input_sequence[:, :-1], padded_input_sequence[:, -1]

In [19]:
# Target word ids of the first five samples (last column of each padded row).
y[:5]

array([21,  2, 22, 23, 24], dtype=int32)

In [20]:
# Inputs of the first three samples (each padded row without its last column).
x[:3]

array([[ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         1],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1,
        21],
       [ 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
         0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  1, 21,
         2]], dtype=int32)

In [21]:
# Word -> integer id mapping learned by the tokenizer.
tokenizer.word_index

{'data': 1,
 'a': 2,
 'on': 3,
 'to': 4,
 'or': 5,
 'for': 6,
 'decisions': 7,
 'of': 8,
 'this': 9,
 'life': 10,
 'some': 11,
 'be': 12,
 'it': 13,
 'buying': 14,
 'after': 15,
 'considering': 16,
 'the': 17,
 'and': 18,
 'raw': 19,
 'make': 20,
 'plays': 21,
 'vital': 22,
 'role': 23,
 'in': 24,
 'our': 25,
 'everyday': 26,
 'directly': 27,
 'indirectly': 28,
 'daily': 29,
 'we': 30,
 'depend': 31,
 'choosing': 32,
 'novel': 33,
 'read': 34,
 'from': 35,
 'list': 36,
 'books': 37,
 'thing': 38,
 'budget': 39,
 'so': 40,
 'have': 41,
 'you': 42,
 'ever': 43,
 'imagined': 44,
 'searching': 45,
 'something': 46,
 'google': 47,
 'yahoo': 48,
 'generates': 49,
 'lot': 50,
 'is': 51,
 'essential': 52,
 'analyze': 53,
 'user': 54,
 'experiences': 55,
 'getting': 56,
 'recommendations': 57,
 'various': 58,
 'e': 59,
 'commerce': 60,
 'websites': 61,
 'product': 62,
 'tracking': 63,
 'parcels': 64,
 'during': 65,
 'delivery': 66,
 'are': 67,
 'part': 68,
 'analytics': 69,
 'which': 70,
 'invo

In [22]:
from tensorflow.keras.utils import to_categorical

In [23]:
# One-hot encode the next-word targets.
# The class count is taken from the tokenizer instead of being hard-coded:
# word ids start at 1 and id 0 is the padding value, hence the "+ 1".
# The targets are read from the padded matrix rather than from `y` itself, so
# re-running this cell does not one-hot encode already-encoded data.
vocab_size = len(tokenizer.word_index) + 1
y = to_categorical(padded_input_sequence[:, -1], num_classes=vocab_size)

In [24]:
# (number of samples, number of classes) after one-hot encoding.
y.shape

(119, 88)

In [25]:
# (number of samples, context length); the context is one shorter than max_len
# because the last column was split off as the target.
x.shape

(119, 33)

### Model Building

In [26]:
from keras.models import Sequential
from keras.layers import Dense, GRU, Embedding

In [34]:
from keras.layers import Input
from keras.utils import set_random_seed

# Derive the sizes from the data instead of hard-coding them.
vocab_size = len(tokenizer.word_index) + 1  # word ids start at 1; id 0 is padding
context_len = x.shape[1]                    # number of context tokens per sample

# Seed Python, NumPy and the backend before any weights are created so that
# initialisation and training are repeatable from run to run.
set_random_seed(42)

# Next-word predictor: token ids -> 100-d embeddings -> GRU(150) -> softmax
# over the vocabulary. The input width is declared with an explicit Input
# layer rather than the deprecated `input_shape=` layer argument.
model = Sequential()
model.add(Input(shape=(context_len,)))
model.add(Embedding(vocab_size, 100))
model.add(GRU(150))
model.add(Dense(vocab_size, activation='softmax'))

In [35]:
# Categorical cross-entropy matches the one-hot encoded targets in `y`;
# accuracy is tracked as an additional metric.
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [36]:
# Print the layer-by-layer overview of the model.
model.summary()

In [37]:
# Train for 60 epochs on all samples. No validation data is passed, so the
# reported accuracy is measured on the training data itself.
model.fit(x, y, epochs=60)

Epoch 1/60
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 55ms/step - accuracy: 0.0268 - loss: 4.4774
Epoch 2/60
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 56ms/step - accuracy: 0.1548 - loss: 4.4565
Epoch 3/60
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step - accuracy: 0.1327 - loss: 4.4352
Epoch 4/60
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step - accuracy: 0.1197 - loss: 4.4072
Epoch 5/60
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 82ms/step - accuracy: 0.0758 - loss: 4.3760
Epoch 6/60
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 110ms/step - accuracy: 0.0485 - loss: 4.2875
Epoch 7/60
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 108ms/step - accuracy: 0.0610 - loss: 4.2964
Epoch 8/60
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 108ms/step - accuracy: 0.0735 - loss: 4.1986
Epoch 9/60
[1m4/4[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m

<keras.src.callbacks.history.History at 0x7fb4db6dda60>

### Test Model

In [38]:
txt = "Have you ever"

# Encode the prompt and left-pad it to the context width used for training
# (max_len - 1, because the last column of each padded row was the target).
token_text = tokenizer.texts_to_sequences([txt])[0]
padded_text = pad_sequences([token_text], maxlen=max_len - 1, padding='pre')

# predict() returns one row of class probabilities; take the most likely id.
pos = int(np.argmax(model.predict(padded_text)[0]))

# index_word is the tokenizer's id -> word lookup. Id 0 (padding) has no
# entry, in which case nothing is printed.
word = tokenizer.index_word.get(pos)
if word is not None:
    print(word)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 232ms/step
imagined


In [51]:
txt2 = "Getting"

# Encode the prompt and left-pad it to the context width used for training
# (max_len - 1, because the last column of each padded row was the target).
token_text2 = tokenizer.texts_to_sequences([txt2])[0]
padded_text2 = pad_sequences([token_text2], maxlen=max_len - 1, padding='pre')

# predict() returns one row of class probabilities; take the most likely id.
pos2 = int(np.argmax(model.predict(padded_text2)[0]))

# index_word is the tokenizer's id -> word lookup. Id 0 (padding) has no
# entry, in which case nothing is printed.
word2 = tokenizer.index_word.get(pos2)
if word2 is not None:
    print(word2)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 163ms/step
recommendations


In [62]:
# Unseen data
# "Facebook" never occurs in the training text and the tokenizer was created
# without an oov_token, so texts_to_sequences silently drops it: the model
# only receives the ids for "is a".

txt2 = "Facebook is a"

# Encode the prompt and left-pad it to the context width used for training
# (max_len - 1, because the last column of each padded row was the target).
token_text2 = tokenizer.texts_to_sequences([txt2])[0]
padded_text2 = pad_sequences([token_text2], maxlen=max_len - 1, padding='pre')

# predict() returns one row of class probabilities; take the most likely id.
pos2 = int(np.argmax(model.predict(padded_text2)[0]))

# index_word is the tokenizer's id -> word lookup. Id 0 (padding) has no
# entry, in which case nothing is printed.
word2 = tokenizer.index_word.get(pos2)
if word2 is not None:
    print(word2)

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
vital
