In [15]:
# Training corpus: a single short passage about human beings. The tokenizer is
# fitted on this text and the next-word training sequences are cut from it.
faqs="""Human being, a culture-bearing primate classified in the genus Homo, especially the species H sapiens.
Human beings are anatomically similar and related to the great apes but are distinguished by a more highly developed brain and a resultant
capacity for articulate speech and abstract reasoning. In addition, human beings display a marked erectness of body carriage that frees
the hands for use as manipulative members. Some of these characteristics, however, are not entirely unique to humans. The gap in cognition,
as in anatomy, between humans and the great apes (orangutans, gorillas, chimpanzees, and bonobos) is much less than was once thought,
as they have been shown to possess a variety of advanced cognitive abilities formerly believed to be restricted to humans."""

In [3]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer

In [5]:
# Keras text tokenizer created with its default settings; it has no vocabulary
# until it is fitted on the corpus.
tokenizer = Tokenizer()

In [6]:
# Build the word -> id vocabulary; the whole passage is passed as one document.
tokenizer.fit_on_texts([faqs])

In [7]:
tokenizer.word_index

{'the': 1,
 'a': 2,
 'and': 3,
 'to': 4,
 'in': 5,
 'human': 6,
 'are': 7,
 'of': 8,
 'as': 9,
 'humans': 10,
 'beings': 11,
 'great': 12,
 'apes': 13,
 'for': 14,
 'being': 15,
 'culture': 16,
 'bearing': 17,
 'primate': 18,
 'classified': 19,
 'genus': 20,
 'homo': 21,
 'especially': 22,
 'species': 23,
 'h': 24,
 'sapiens': 25,
 'anatomically': 26,
 'similar': 27,
 'related': 28,
 'but': 29,
 'distinguished': 30,
 'by': 31,
 'more': 32,
 'highly': 33,
 'developed': 34,
 'brain': 35,
 'resultant': 36,
 'capacity': 37,
 'articulate': 38,
 'speech': 39,
 'abstract': 40,
 'reasoning': 41,
 'addition': 42,
 'display': 43,
 'marked': 44,
 'erectness': 45,
 'body': 46,
 'carriage': 47,
 'that': 48,
 'frees': 49,
 'hands': 50,
 'use': 51,
 'manipulative': 52,
 'members': 53,
 'some': 54,
 'these': 55,
 'characteristics': 56,
 'however': 57,
 'not': 58,
 'entirely': 59,
 'unique': 60,
 'gap': 61,
 'cognition': 62,
 'anatomy': 63,
 'between': 64,
 'orangutans': 65,
 'gorillas': 66,
 'chimpanz

In [16]:
# Preview the text units used for training: the corpus is cut on '.' and each
# piece is cut again on ','. Every resulting clause is printed on its own line.
clauses = (clause for sentence in faqs.split('.') for clause in sentence.split(','))
for clause in clauses:
  print(clause)

Human being
 a culture-bearing primate classified in the genus Homo
 especially the species H sapiens

Human beings are anatomically similar and related to the great apes but are distinguished by a more highly developed brain and a resultant
capacity for articulate speech and abstract reasoning
 In addition
 human beings display a marked erectness of body carriage that frees
the hands for use as manipulative members
 Some of these characteristics
 however
 are not entirely unique to humans
 The gap in cognition

as in anatomy
 between humans and the great apes (orangutans
 gorillas
 chimpanzees
 and bonobos) is much less than was once thought

as they have been shown to possess a variety of advanced cognitive abilities formerly believed to be restricted to humans



In [19]:
# Build the training samples: for every clause of the corpus (split on '.' and
# then on ','), emit each prefix of its token ids that is at least two ids long.
# The last id of a prefix is the word to predict from the ids before it.
input_seq = []
clauses = (clause for sentence in faqs.split('.') for clause in sentence.split(','))
for clause in clauses:
  token_ids = tokenizer.texts_to_sequences([clause])[0]
  input_seq.extend(token_ids[:end] for end in range(2, len(token_ids) + 1))

In [20]:
input_seq

[[6, 15],
 [2, 16],
 [2, 16, 17],
 [2, 16, 17, 18],
 [2, 16, 17, 18, 19],
 [2, 16, 17, 18, 19, 5],
 [2, 16, 17, 18, 19, 5, 1],
 [2, 16, 17, 18, 19, 5, 1, 20],
 [2, 16, 17, 18, 19, 5, 1, 20, 21],
 [22, 1],
 [22, 1, 23],
 [22, 1, 23, 24],
 [22, 1, 23, 24, 25],
 [6, 11],
 [6, 11, 7],
 [6, 11, 7, 26],
 [6, 11, 7, 26, 27],
 [6, 11, 7, 26, 27, 3],
 [6, 11, 7, 26, 27, 3, 28],
 [6, 11, 7, 26, 27, 3, 28, 4],
 [6, 11, 7, 26, 27, 3, 28, 4, 1],
 [6, 11, 7, 26, 27, 3, 28, 4, 1, 12],
 [6, 11, 7, 26, 27, 3, 28, 4, 1, 12, 13],
 [6, 11, 7, 26, 27, 3, 28, 4, 1, 12, 13, 29],
 [6, 11, 7, 26, 27, 3, 28, 4, 1, 12, 13, 29, 7],
 [6, 11, 7, 26, 27, 3, 28, 4, 1, 12, 13, 29, 7, 30],
 [6, 11, 7, 26, 27, 3, 28, 4, 1, 12, 13, 29, 7, 30, 31],
 [6, 11, 7, 26, 27, 3, 28, 4, 1, 12, 13, 29, 7, 30, 31, 2],
 [6, 11, 7, 26, 27, 3, 28, 4, 1, 12, 13, 29, 7, 30, 31, 2, 32],
 [6, 11, 7, 26, 27, 3, 28, 4, 1, 12, 13, 29, 7, 30, 31, 2, 32, 33],
 [6, 11, 7, 26, 27, 3, 28, 4, 1, 12, 13, 29, 7, 30, 31, 2, 32, 33, 34],
 [6, 11, 7, 26

In [23]:
# All samples must share one length before they can be fed to the model, so
# find the longest sequence; shorter ones are zero-padded up to this length.
max_len = len(max(input_seq, key=len))

In [65]:
max_len

30

In [30]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
# Left-pad ('pre') every sequence with zeros up to max_len so the real tokens,
# and in particular the final (target) word, stay right-aligned.
padded_input_seq = pad_sequences(input_seq,maxlen=max_len,padding='pre')

In [32]:
padded_input_seq.shape

(106, 30)

In [27]:
# Model input: every column except the last, i.e. the (padded) words that
# precede the word to be predicted.
x = padded_input_seq[:,:-1]

In [35]:
# Target: the last column, i.e. the id of the word that follows the prefix in x.
y = padded_input_seq[:,-1]

In [None]:
# This could be framed either as a regression or as a multi-class classification problem.
# If we treat it as regression, the model might output a value such as 2.1, but no word is associated with that number.
# Hence we treat it as multi-class classification and one-hot encode the targets.

In [36]:
x.shape

(106, 29)

In [37]:
y.shape

(106,)

In [41]:
x[0]

array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0, 6], dtype=int32)

In [39]:
y[0]

15

In [42]:
from tensorflow.keras.utils import to_categorical
# One-hot encode the next-word ids; there is one class per vocabulary id plus
# one for id 0, which the tokenizer never assigns to a word.
# The ids are read from padded_input_seq instead of from y so that this cell can
# be re-run safely: encoding the already one-hot y a second time would add an
# extra axis and break training.
y = to_categorical(padded_input_seq[:,-1],num_classes=len(tokenizer.word_index)+1)

In [44]:
from tensorflow import keras

In [43]:
y.shape

(106, 89)

In [69]:
# Next-word classifier: embed each word id, summarise the sequence with an LSTM,
# then output a probability for every vocabulary id.
# Word ids start at 1 and 0 is used for padding, hence the +1.
vocab_size = len(tokenizer.word_index)+1
model = keras.Sequential([
    # Declare the input shape with keras.Input: Embedding's `input_length`
    # argument is deprecated in current Keras releases. The input is a row of
    # max_len-1 integer word ids, matching x.
    keras.Input(shape=(max_len-1,), dtype='int32'),
    keras.layers.Embedding(vocab_size,100),
    keras.layers.LSTM(100),
    keras.layers.Dense(vocab_size, activation='softmax')
])

In [70]:
# categorical_crossentropy pairs with the one-hot targets and the softmax output
# layer; accuracy is tracked as the fraction of correctly predicted next words.
model.compile(
    loss='categorical_crossentropy',
    optimizer = 'adam',
    metrics = ['accuracy']
)

In [47]:
model.summary()

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 30, 100)           8900      
                                                                 
 lstm (LSTM)                 (None, 100)               80400     
                                                                 
 dense (Dense)               (None, 89)                8989      
                                                                 
Total params: 98289 (383.94 KB)
Trainable params: 98289 (383.94 KB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [72]:
# Train on all prefix/next-word pairs. No validation data is passed, so the
# accuracy reported during training is measured on the training samples only.
model.fit(x,y,epochs=100)

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.src.callbacks.History at 0x7f60e4b3ebf0>

In [130]:
# Greedy next-word generation: repeatedly feed the running text back into the
# model and append the most probable next word.
text = 'humans are great not apes'

for i in range(10):
  # Encode the running text the same way as the training inputs: word ids,
  # left-padded with zeros to the model's input length (max_len-1).
  token_text = tokenizer.texts_to_sequences([text])[0]
  padded_token_text = pad_sequences([token_text],maxlen=max_len-1,padding='pre')
  # verbose=0 keeps Keras' per-call progress bar out of the cell output.
  pos = int(np.argmax(model.predict(padded_token_text, verbose=0)))

  # index_word is the tokenizer's own id -> word mapping (the inverse of
  # word_index), so no scan over the vocabulary is needed.
  word = tokenizer.index_word.get(pos)
  if word is None:
    # Id 0 is the padding slot and has no word. The text would not change any
    # more, so further iterations would only repeat this same prediction.
    break
  print(word)
  text = text + " " + word
print(text)

to
the
great
apes
orangutans
orangutans
are
distinguished
by
a
humans are great not apes to the great apes orangutans orangutans are distinguished by a


In [110]:
# Sanity check of the prediction shape: one row with one score per class.
# NOTE(review): padded_token_text is whatever the last iteration of the
# generation loop left behind, so this cell only works after that loop has run.
model.predict(padded_token_text).shape



(1, 89)

In [111]:
import numpy as np



humans
