In [23]:
import tensorflow as tf

from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.layers import Embedding, LSTM, Dense, Bidirectional
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.models import Sequential
from tensorflow.keras.optimizers import Adam
import numpy as np

In [202]:
# Load the intent definitions (tags, integer codes and example patterns) from disk.
import json

# Pass the encoding explicitly so the read does not depend on the platform's
# default locale encoding (JSON text is normally UTF-8).
with open("Data.json", encoding="utf-8") as file:
    data = json.load(file)


In [203]:
# Display the parsed JSON to inspect the intents (tag, code, patterns).
data

{'intents': [{'tag': 'google',
   'code': 0,
   'patterns': ['can you search this in google',
    'open google',
    'google search',
    'search something in google',
    'find this in google ',
    'search this in google',
    'look in google',
    'find this in google',
    'google',
    'search google',
    'find in google']},
  {'tag': 'youtube',
   'code': 1,
   'patterns': ['can you search this in youtube',
    'open youtube',
    'youtube search',
    'search something in youtube',
    'find this in youtube',
    'search this in youtube',
    'look in youtube',
    'find this in youtube',
    'youtube',
    'search youtube',
    'find in youtube']},
  {'tag': 'email',
   'code': 2,
   'patterns': ['send email',
    'send email to mike',
    'email',
    'email send',
    'email to him',
    'email him',
    'send a email',
    'email send',
    'can u please send a email',
    'send email to jenny',
    'can u send the email',
    'send the email']},
  {'tag': 'random',
   'cod

In [204]:
# Alias the loaded dictionary as `df` (it is a plain dict, not a DataFrame).
df = data 

In [205]:
# Confirm the container type of the loaded data.
type(df)

dict

In [206]:
# Collect every example utterance of every intent, lower-cased, in file order.
store = df["intents"]
pattern = [
    utterance.lower()
    for intent in store
    for utterance in intent["patterns"]
]


In [207]:
# Display the flattened, lower-cased list of training utterances.
pattern 

['can you search this in google',
 'open google',
 'google search',
 'search something in google',
 'find this in google ',
 'search this in google',
 'look in google',
 'find this in google',
 'google',
 'search google',
 'find in google',
 'can you search this in youtube',
 'open youtube',
 'youtube search',
 'search something in youtube',
 'find this in youtube',
 'search this in youtube',
 'look in youtube',
 'find this in youtube',
 'youtube',
 'search youtube',
 'find in youtube',
 'send email',
 'send email to mike',
 'email',
 'email send',
 'email to him',
 'email him',
 'send a email',
 'email send',
 'can u please send a email',
 'send email to jenny',
 'can u send the email',
 'send the email',
 'what hours are you open?',
 'what are your hours?',
 'when are you open?',
 'hi',
 'how are you',
 'is anyone there?',
 'hello',
 'good day',
 'bye',
 'see you later',
 'goodbye',
 'good day',
 'lalalalalalalalal',
 'hola',
 'send a text message',
 'send a text message',
 'message'



There are two really important things here. Firstly, our misspelt and rare words are just gone. That's really bad: we're trying to judge whether a sentence is sincere, and part of Quora's criteria is that the sentence is grammatically correct - we've just broken that. There is also information in the fact that a word was uncommon enough not to be in the tokenizer.

Another issue is that the tokenizer has stripped `,` and `?`. We might not care so much about commas, but part of the criteria for a sincere question is that it is in fact a question; a `?` undoubtedly helps us here.

Second attempt - use an OOV token
Keras lets us define an Out Of Vocab token - this will replace any unknown words with a token of our choosing. This is better than just throwing away unknown words since it tells our model there was information here.

Let's do that

Third attempt - use question marks 

Finally, let's fix the `?` issue. The `?` is being filtered out by the tokenizer; we can solve this by specifying the filters ourselves:

tokenizer_3 = Tokenizer(num_words=max_features, oov_token='OOV', filters='!"#$%&()*+,-./:;<=>@[\]^_`{|}~ ')
tokenizer_3.fit_on_texts(list(df['question_text'].values))


ref - https://www.kaggle.com/hamishdickson/using-keras-oov-tokens
ref - https://www.tensorflow.org/api_docs/python/tf/keras/preprocessing/text/Tokenizer

In [208]:
# Word-level tokenizer for the intent patterns.
# * oov_token: an 'OOV' entry is reserved in the vocabulary for unknown words.
# * filters: custom punctuation list that leaves '?' out, so question marks
#   stay attached to their word (e.g. 'open?' in the printed word_index).
# The backslash is written as '\\' because '\]' is not a valid escape sequence
# and triggers a warning on recent Python versions; the resulting filter
# characters are unchanged.
tokenizer = Tokenizer(oov_token='OOV', filters='!"#$%&()*+,-./:;<=>@[\\]^_`{|}~ ')
corpus = pattern
tokenizer.fit_on_texts(corpus)
# +1 because the printed word_index starts at 1; index 0 is left for padding.
total_words = len(tokenizer.word_index) + 1
print(tokenizer.word_index)
print(total_words)

{'OOV': 1, 'send': 2, 'in': 3, 'email': 4, 'message': 5, 'google': 6, 'youtube': 7, 'search': 8, 'this': 9, 'a': 10, 'can': 11, 'you': 12, 'find': 13, 'text': 14, 'to': 15, 'u': 16, 'are': 17, 'him': 18, 'the': 19, 'open': 20, 'something': 21, 'look': 22, 'please': 23, 'what': 24, 'open?': 25, 'good': 26, 'day': 27, 'mike': 28, 'jenny': 29, 'hours': 30, 'your': 31, 'hours?': 32, 'when': 33, 'hi': 34, 'how': 35, 'is': 36, 'anyone': 37, 'there?': 38, 'hello': 39, 'bye': 40, 'see': 41, 'later': 42, 'goodbye': 43, 'lalalalalalalalal': 44, 'hola': 45, 'someone': 46}
47


In [209]:
# Encode a single example (the third pattern, 'google search') to inspect its token ids.
token_list = tokenizer.texts_to_sequences([corpus[2]])[0]

In [210]:
# Show the token ids of the sample pattern.
token_list

[6, 8]

In [211]:
# Convert every pattern into its list of token ids (one list per pattern),
# using a single batch call on the whole corpus.
encode = tokenizer.texts_to_sequences(corpus)

In [212]:
# Display the variable-length token-id sequences before padding.
encode

[[11, 12, 8, 9, 3, 6],
 [20, 6],
 [6, 8],
 [8, 21, 3, 6],
 [13, 9, 3, 6],
 [8, 9, 3, 6],
 [22, 3, 6],
 [13, 9, 3, 6],
 [6],
 [8, 6],
 [13, 3, 6],
 [11, 12, 8, 9, 3, 7],
 [20, 7],
 [7, 8],
 [8, 21, 3, 7],
 [13, 9, 3, 7],
 [8, 9, 3, 7],
 [22, 3, 7],
 [13, 9, 3, 7],
 [7],
 [8, 7],
 [13, 3, 7],
 [2, 4],
 [2, 4, 15, 28],
 [4],
 [4, 2],
 [4, 15, 18],
 [4, 18],
 [2, 10, 4],
 [4, 2],
 [11, 16, 23, 2, 10, 4],
 [2, 4, 15, 29],
 [11, 16, 2, 19, 4],
 [2, 19, 4],
 [24, 30, 17, 12, 25],
 [24, 17, 31, 32],
 [33, 17, 12, 25],
 [34],
 [35, 17, 12],
 [36, 37, 38],
 [39],
 [26, 27],
 [40],
 [41, 12, 42],
 [43],
 [26, 27],
 [44],
 [45],
 [2, 10, 14, 5],
 [2, 10, 14, 5],
 [5],
 [2, 10, 14, 5, 15, 46],
 [5, 18],
 [2, 10, 5],
 [2, 10, 14, 5],
 [5, 2],
 [11, 16, 23, 2, 10, 14, 5],
 [5],
 [2, 5],
 [11, 16, 2, 19, 5]]

In [213]:
# Length of the longest encoded pattern; every sequence is padded up to it.
max_sequence_len = max(map(len, encode))

# Left-pad ('pre') with zeros so all rows have max_sequence_len entries.
padded_sequences = pad_sequences(encode, maxlen=max_sequence_len, padding='pre')
input_sequences_X = np.array(padded_sequences)

In [214]:
# Display the zero-padded input matrix (one row per pattern).
input_sequences_X

array([[ 0, 11, 12,  8,  9,  3,  6],
       [ 0,  0,  0,  0,  0, 20,  6],
       [ 0,  0,  0,  0,  0,  6,  8],
       [ 0,  0,  0,  8, 21,  3,  6],
       [ 0,  0,  0, 13,  9,  3,  6],
       [ 0,  0,  0,  8,  9,  3,  6],
       [ 0,  0,  0,  0, 22,  3,  6],
       [ 0,  0,  0, 13,  9,  3,  6],
       [ 0,  0,  0,  0,  0,  0,  6],
       [ 0,  0,  0,  0,  0,  8,  6],
       [ 0,  0,  0,  0, 13,  3,  6],
       [ 0, 11, 12,  8,  9,  3,  7],
       [ 0,  0,  0,  0,  0, 20,  7],
       [ 0,  0,  0,  0,  0,  7,  8],
       [ 0,  0,  0,  8, 21,  3,  7],
       [ 0,  0,  0, 13,  9,  3,  7],
       [ 0,  0,  0,  8,  9,  3,  7],
       [ 0,  0,  0,  0, 22,  3,  7],
       [ 0,  0,  0, 13,  9,  3,  7],
       [ 0,  0,  0,  0,  0,  0,  7],
       [ 0,  0,  0,  0,  0,  8,  7],
       [ 0,  0,  0,  0, 13,  3,  7],
       [ 0,  0,  0,  0,  0,  2,  4],
       [ 0,  0,  0,  2,  4, 15, 28],
       [ 0,  0,  0,  0,  0,  0,  4],
       [ 0,  0,  0,  0,  0,  4,  2],
       [ 0,  0,  0,  0,  4, 15, 18],
 

In [215]:
# Re-display the intents dictionary before building the label list from it.
df

{'intents': [{'tag': 'google',
   'code': 0,
   'patterns': ['can you search this in google',
    'open google',
    'google search',
    'search something in google',
    'find this in google ',
    'search this in google',
    'look in google',
    'find this in google',
    'google',
    'search google',
    'find in google']},
  {'tag': 'youtube',
   'code': 1,
   'patterns': ['can you search this in youtube',
    'open youtube',
    'youtube search',
    'search something in youtube',
    'find this in youtube',
    'search this in youtube',
    'look in youtube',
    'find this in youtube',
    'youtube',
    'search youtube',
    'find in youtube']},
  {'tag': 'email',
   'code': 2,
   'patterns': ['send email',
    'send email to mike',
    'email',
    'email send',
    'email to him',
    'email him',
    'send a email',
    'email send',
    'can u please send a email',
    'send email to jenny',
    'can u send the email',
    'send the email']},
  {'tag': 'random',
   'cod

In [216]:
# Build the label list: each intent's integer code is repeated once per
# pattern, in the same order in which the patterns were flattened.
pred_y = [
    intent["code"]
    for intent in df["intents"]
    for _ in intent["patterns"]
]

# With the dataset shown in the recorded output this gives
# 11 x 0, 11 x 1, 12 x 2, 14 x 3 and 12 x 4 (60 labels in total).

In [217]:
# Categorical (integer-coded) class labels, one per pattern.
print(pred_y)

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4]


In [218]:
 # One-hot encode the integer labels into 5 float32 columns (one per intent class).
 pred_y_cat = tf.keras.utils.to_categorical(pred_y, num_classes=5, dtype='float32')

In [219]:
from sklearn.model_selection import train_test_split

# The same test fraction and seed are used for both splits: one with the
# integer labels, one with the one-hot (categorical) labels.
split_options = dict(test_size=0.1, random_state=42)

X_train, X_test, y_train, y_test = train_test_split(
    input_sequences_X, pred_y, **split_options)

X_train_cat, X_test_cat, y_train_cat, y_test_cat = train_test_split(
    input_sequences_X, pred_y_cat, **split_options)

In [220]:
# Print the training inputs and labels of both splits (integer-coded first, one-hot second).
print(X_train , y_train)
print(X_train_cat , y_train_cat)

[[ 0  0  0  0  2 19  4]
 [ 0  0  0  2 10 14  5]
 [ 0  0  0  0  0 20  7]
 [ 0  0  0  0  0  0  5]
 [ 0  0  0  0  0  0 44]
 [ 0  0  0  0  0  0  5]
 [ 0  0  0  2  4 15 29]
 [ 0  0  0  8 21  3  6]
 [ 0  0  0  0  0  5 18]
 [ 0  0  0  0 22  3  7]
 [ 0  0  0  0  0  0  6]
 [ 0  0  0  0 22  3  6]
 [ 0  0  0  0  0  0 39]
 [ 0  0  0 13  9  3  6]
 [ 0  0  0  0 41 12 42]
 [ 0  0  0  0  0  0  7]
 [ 0  0 24 30 17 12 25]
 [ 0  0  0  0  0  2  5]
 [ 0  0  0  0  0  4  2]
 [11 16 23  2 10 14  5]
 [ 0  0  0 13  9  3  7]
 [ 0  0  0  0  0  4 18]
 [ 0  0  0  0  0  8  6]
 [ 0 11 16 23  2 10  4]
 [ 0  0  0  0  4 15 18]
 [ 0  0  0  8  9  3  7]
 [ 0  0  0  0  0  0  4]
 [ 0  0  0  0  0  5  2]
 [ 0 11 12  8  9  3  7]
 [ 0  0 11 16  2 19  4]
 [ 0  0  0  0  2 10  5]
 [ 0  0  0  0  0 26 27]
 [ 0  0  0  0  0  0 34]
 [ 0  0  0  0  0  4  2]
 [ 0  0  0  0  0  0 43]
 [ 0  0  0  0  0 20  6]
 [ 0  0  0  0 13  3  7]
 [ 0  0  0  0  0  6  8]
 [ 0  0  0  0  0  0 45]
 [ 0  0  0  0 36 37 38]
 [ 0  0  0 24 17 31 32]
 [ 0  0  0  2  4

In [221]:
# model1: Embedding -> LSTM -> softmax classifier with 5 output units.
# Inputs are integer token-id sequences of length max_sequence_len.
model1 = Sequential([
    Embedding(total_words, 32, input_length=max_sequence_len),
    LSTM(32),
    Dense(5, activation="softmax"),
])

In [222]:
# Compile for the one-hot encoded (categorical) targets.
model1.compile(
    optimizer="rmsprop",
    loss="categorical_crossentropy",
    metrics=['acc'],
)

In [223]:
# Print the layer output shapes and parameter counts.
model1.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 7, 32)             1504      
_________________________________________________________________
lstm_2 (LSTM)                (None, 32)                8320      
_________________________________________________________________
dense_2 (Dense)              (None, 5)                 165       
Total params: 9,989
Trainable params: 9,989
Non-trainable params: 0
_________________________________________________________________


In [224]:
# Train on the one-hot labelled split, validating on its held-out part.
model1.fit(
    X_train_cat,
    y_train_cat,
    epochs=10,
    validation_data=(X_test_cat, y_test_cat),
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7fff1fe9a0>

In [231]:
# Re-compile the same model for the integer (not one-hot encoded) targets.
# NOTE(review): the model is not rebuilt here, so this presumably continues
# from the weights learned in the previous fit - confirm this is intended.
model1.compile(
    optimizer="rmsprop",
    loss="sparse_categorical_crossentropy",
    metrics=['acc'],
)

In [227]:
# y_train is a plain list here, which is why it is wrapped in np.array before fitting.
type(y_train)

list

In [228]:
# Train on the integer-labelled split; the label lists are converted to arrays first.
model1.fit(
    X_train,
    np.array(y_train),
    epochs=10,
    validation_data=(X_test, np.array(y_test)),
)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<tensorflow.python.keras.callbacks.History at 0x7f834fd6d0>

The non-encoded (integer) labels give the correct values relative to the categorical (one-hot encoded) ones.

Training on the whole dataset

In [233]:
# Final training pass over every pattern (no validation split is passed here).
model1.fit(
    input_sequences_X,
    np.array(pred_y),
    epochs=20,
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7f89e94550>

In [234]:
# Save the trained model to the file friday_1.0.h5.
model1.save("friday_1.0.h5")

# To load it back later:
# new_model = tf.keras.models.load_model('friday_1.0.h5')

In [235]:
# Testing helpers
def process(x):
  """Turn one raw text string into a model-ready batch of size one.

  The text is tokenized with the fitted module-level `tokenizer`, left-padded
  with zeros to `max_sequence_len`, and returned as a NumPy array holding a
  single row.
  """
  sequences = tokenizer.texts_to_sequences([x])
  padded = pad_sequences(sequences, maxlen=max_sequence_len, padding='pre')
  # Re-wrap the only row so the result is a fresh array with one row.
  return np.array([padded[0]])

In [236]:
# Hand-written example utterance and its padded, encoded form.
x = "send email "
x1 = process(x)

In [237]:
# Check that the hand-built input has the same container type as the training data.
type(x1) , type(X_train)

(numpy.ndarray, numpy.ndarray)

In [238]:
# Sanity check: class probabilities for the first training row.
pred = model1.predict(X_train[0:1])
print(pred)

[[2.7253993e-05 1.1728472e-05 9.9988902e-01 3.5651642e-06 6.8472895e-05]]


In [239]:
# Index of the highest-probability class.
np.argmax(pred)

2

In [240]:
# Class probabilities for the hand-written example x1.
pred = model1.predict(x1)
print(pred)

[[7.0785245e-05 2.6128177e-05 9.9979657e-01 9.8109513e-06 9.6809468e-05]]


In [241]:
# Index of the highest-probability class for x1.
np.argmax(pred)

2

In [242]:
# Reload the model that was saved to friday_1.0.h5.
new_model = tf.keras.models.load_model('friday_1.0.h5')

In [243]:
# Predict with the reloaded model on the same input x1 that model1 was given.
new_model.predict(x1)

array([[7.0785245e-05, 2.6128177e-05, 9.9979657e-01, 9.8109513e-06,
        9.6809468e-05]], dtype=float32)

In [244]:
# Map each class index produced by the model to its intent tag.
predict = dict(enumerate(("google", "youtube", "email", "random", "text")))

In [246]:
# Interactive test loop: read an utterance, classify it with the reloaded
# model, then print the class probabilities followed by the predicted tag.
# Typing "c" ends the loop.
bot = True
while bot:
    text = input("hello...")
    if text == "c":
        break
    # The raw string, the encoded input and the probabilities get separate
    # names instead of rebinding `text`/`pred` to values of a different type.
    features = process(text)
    probabilities = new_model.predict(features)
    print(probabilities)
    pred = np.argmax(probabilities)
    print(predict[pred])

hello...yoo
[[0.05053078 0.03792108 0.022188   0.8690008  0.0203593 ]]
random
hello...open google
[[9.9968040e-01 1.2614518e-04 4.5880617e-05 9.2123853e-05 5.5433444e-05]]
google
hello...search something in google
[[9.9971956e-01 1.4216334e-04 1.9400621e-05 8.5732885e-05 3.3102373e-05]]
google
hello...open youtube
[[7.9549129e-05 9.9982053e-01 2.6630301e-05 6.5576700e-05 7.9119945e-06]]
youtube
hello...i want to want video in youtube
[[1.0775707e-03 9.7385186e-01 2.4818566e-02 8.7187516e-05 1.6490574e-04]]
youtube
hello...kdkdkkadkamcadcmdac
[[0.05053078 0.03792108 0.022188   0.8690008  0.0203593 ]]
random
hello...send a message to elon
[[1.9152676e-04 6.5778775e-05 3.1249521e-03 6.3315569e-04 9.9598455e-01]]
text
hello...send a email to mark
[[6.0047580e-05 2.6037091e-05 9.9978906e-01 5.0126741e-06 1.1995454e-04]]
email
hello...send a email to no sorry send a text message to john
[[1.2039385e-04 4.2079468e-05 5.5329071e-04 7.7940407e-04 9.9850488e-01]]
text
hello...wowowoow
[[0.050530