In [3]:
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem.lancaster import LancasterStemmer
import nltk
import re
from sklearn.preprocessing import OneHotEncoder
import matplotlib.pyplot as plt
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.models import Sequential, load_model
from keras.layers import Dense, LSTM, Bidirectional, Embedding, Dropout
from keras.callbacks import ModelCheckpoint

In [8]:
def load_dataset(filename):
    df = pd.read_csv(filename, encoding = "latin1", names = ["Sentence", "Intent"])
    print(df.head())
    intents = df["Intent"]
    unique_intents = list(set(intents))
    sentences = list(df["Sentence"])

    return (intents, unique_intents, sentences)

In [11]:
intents, unique_intents, sentences = load_dataset("dataset.csv")

                Sentence          Intent
0       Need help pleese  commonQ.assist
1              Need help  commonQ.assist
2       I need some info  commonQ.assist
3      Will you help me?  commonQ.assist
4  What else can you do?  commonQ.assist


In [12]:
print(sentences[:5])

['Need help pleese', 'Need help', 'I need some info', 'Will you help me?', 'What else can you do?']


In [14]:
nltk.download("stopwords") # downloading corpus of stopwords
nltk.download("punkt") # reference: https://www.nltk.org/_modules/nltk/tokenize/punkt.html

[nltk_data] Downloading package stopwords to /home/manas/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/manas/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [16]:
stemmer = LancasterStemmer() # reference: https://www.datacamp.com/community/tutorials/stemming-lemmatization-python

In [19]:
def cleaning(sentences):
      words = []
      for s in sentences:
        clean = re.sub(r'[^ a-z A-Z 0-9]', " ", s)
        w = word_tokenize(clean)
        # TODO: apply stemming before appending into words
        words.append([i.lower() for i in w])

      return words

In [20]:
cleaned_words = cleaning(sentences)
print(len(cleaned_words))
print(cleaned_words[:2])

1113
[['need', 'help', 'pleese'], ['need', 'help']]


In [21]:
# references:
# https://machinelearningmastery.com/prepare-text-data-deep-learning-keras/
# https://stackoverflow.com/questions/51956000/what-does-keras-tokenizer-method-exactly-do

def create_tokenizer(words, filters = '!"#$%&()*+,-./:;<=>?@[\]^_`{|}~'):
    token = Tokenizer(filters = filters)
    token.fit_on_texts(words)
    return token

In [34]:
def max_length(words):
    return(len(max(words, key = len)))

In [35]:
word_tokenizer = create_tokenizer(cleaned_words)
vocab_size = len(word_tokenizer.word_index) + 1
max_length = max_length(cleaned_words)

print("Vocab Size = %d and Maximum length = %d" % (vocab_size, max_length))
# print(word_tokenizer.word_index)

Vocab Size = 492 and Maximum length = 28


In [36]:
def encoding_doc(token, words):
    return(token.texts_to_sequences(words))

In [40]:
encoded_doc = encoding_doc(word_tokenizer, cleaned_words)
# print(encoded_doc)

In [41]:
def padding_doc(encoded_doc, max_length):
    return(pad_sequences(encoded_doc, maxlen = max_length, padding = "post"))

In [43]:
padded_doc = padding_doc(encoded_doc, max_length)
print(padded_doc)
print("Shape of padded docs = ",padded_doc.shape)

[[ 25  77 332 ...   0   0   0]
 [ 25  77   0 ...   0   0   0]
 [  1  25 198 ...   0   0   0]
 ...
 [ 59  28 133 ...   0   0   0]
 [ 59  42   4 ...   0   0   0]
 [ 84  42 133 ...   0   0   0]]
Shape of padded docs =  (1113, 28)


In [45]:
#tokenizer with filter changed
output_tokenizer = create_tokenizer(unique_intents, filters = '!"#$%&()*+,-/:;<=>?@[\]^`{|}~')

In [46]:
print(output_tokenizer.word_index)

{'commonq.not_giving': 1, 'commonq.name': 2, 'faq.address_proof': 3, 'faq.apply_register': 4, 'faq.application_process': 5, 'faq.biz_new': 6, 'faq.biz_category_missing': 7, 'commonq.how': 8, 'commonq.wait': 9, 'contact.contact': 10, 'commonq.query': 11, 'commonq.bot': 12, 'commonq.assist': 13, 'faq.biz_simpler': 14, 'faq.banking_option_missing': 15, 'faq.borrow_limit': 16, 'faq.aadhaar_missing': 17, 'faq.borrow_use': 18, 'faq.bad_service': 19, 'faq.approval_time': 20, 'commonq.just_details': 21}


In [49]:
encoded_output = encoding_doc(output_tokenizer, intents)
print(encoded_output[:5])

[[13], [13], [13], [13], [13]]


In [50]:
encoded_output = np.array(encoded_output).reshape(len(encoded_output), 1)
print(encoded_output.shape)

(1113, 1)


In [51]:
def one_hot(encode):
      o = OneHotEncoder(sparse = False)
      return(o.fit_transform(encode))

In [55]:
output_one_hot = one_hot(encoded_output)
print(output_one_hot.shape)

(1113, 21)


In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [56]:
from sklearn.model_selection import train_test_split

In [59]:
train_X, val_X, train_Y, val_Y = train_test_split(padded_doc, output_one_hot, shuffle = True, test_size = 0.2)
print("Shape of train_X = %s and train_Y = %s" % (train_X.shape, train_Y.shape))
print("Shape of val_X = %s and val_Y = %s" % (val_X.shape, val_Y.shape))

Shape of train_X = (890, 28) and train_Y = (890, 21)
Shape of val_X = (223, 28) and val_Y = (223, 21)


In [60]:
def create_model(vocab_size, max_length):
      model = Sequential()
      model.add(Embedding(vocab_size, 128, input_length = max_length, trainable = False))
      model.add(Bidirectional(LSTM(128)))
      #  model.add(LSTM(128))
      model.add(Dense(32, activation = "relu"))
      model.add(Dropout(0.5))
      model.add(Dense(21, activation = "softmax"))

      return model

In [63]:
model = create_model(vocab_size, max_length)
model.compile(loss = "categorical_crossentropy", optimizer = "adam", metrics = ["accuracy"])
model.summary()

Model: "sequential_3"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_3 (Embedding)      (None, 28, 128)           62976     
_________________________________________________________________
bidirectional_3 (Bidirection (None, 256)               263168    
_________________________________________________________________
dense_5 (Dense)              (None, 32)                8224      
_________________________________________________________________
dropout_3 (Dropout)          (None, 32)                0         
_________________________________________________________________
dense_6 (Dense)              (None, 21)                693       
Total params: 335,061
Trainable params: 272,085
Non-trainable params: 62,976
_________________________________________________________________


In [64]:
filename = 'model.h5'
checkpoint = ModelCheckpoint(filename, monitor='val_loss', verbose=1, save_best_only=True, mode='min')
hist = model.fit(train_X, train_Y, epochs = 100, batch_size = 32, validation_data = (val_X, val_Y), callbacks = [checkpoint])

Instructions for updating:
Use tf.where in 2.0, which has the same broadcast rule as np.where
Train on 890 samples, validate on 223 samples
Epoch 1/100

Epoch 00001: val_loss improved from inf to 2.77283, saving model to model.h5
Epoch 2/100

Epoch 00002: val_loss improved from 2.77283 to 2.71847, saving model to model.h5
Epoch 3/100

Epoch 00003: val_loss improved from 2.71847 to 2.63819, saving model to model.h5
Epoch 4/100

Epoch 00004: val_loss improved from 2.63819 to 2.53801, saving model to model.h5
Epoch 5/100

Epoch 00005: val_loss improved from 2.53801 to 2.47412, saving model to model.h5
Epoch 6/100

Epoch 00006: val_loss improved from 2.47412 to 2.36785, saving model to model.h5
Epoch 7/100

Epoch 00007: val_loss improved from 2.36785 to 2.25881, saving model to model.h5
Epoch 8/100

Epoch 00008: val_loss improved from 2.25881 to 2.17467, saving model to model.h5
Epoch 9/100

Epoch 00009: val_loss did not improve from 2.17467
Epoch 10/100

Epoch 00010: val_loss improved fro


Epoch 00038: val_loss did not improve from 0.93354
Epoch 39/100

Epoch 00039: val_loss did not improve from 0.93354
Epoch 40/100

Epoch 00040: val_loss did not improve from 0.93354
Epoch 41/100

Epoch 00041: val_loss did not improve from 0.93354
Epoch 42/100

Epoch 00042: val_loss improved from 0.93354 to 0.92254, saving model to model.h5
Epoch 43/100

Epoch 00043: val_loss improved from 0.92254 to 0.89139, saving model to model.h5
Epoch 44/100

Epoch 00044: val_loss did not improve from 0.89139
Epoch 45/100

Epoch 00045: val_loss did not improve from 0.89139
Epoch 46/100

Epoch 00046: val_loss did not improve from 0.89139
Epoch 47/100

Epoch 00047: val_loss did not improve from 0.89139
Epoch 48/100

Epoch 00048: val_loss did not improve from 0.89139
Epoch 49/100

Epoch 00049: val_loss did not improve from 0.89139
Epoch 50/100

Epoch 00050: val_loss did not improve from 0.89139
Epoch 51/100

Epoch 00051: val_loss did not improve from 0.89139
Epoch 52/100

Epoch 00052: val_loss did not


Epoch 00081: val_loss did not improve from 0.81833
Epoch 82/100

Epoch 00082: val_loss did not improve from 0.81833
Epoch 83/100

Epoch 00083: val_loss did not improve from 0.81833
Epoch 84/100

Epoch 00084: val_loss did not improve from 0.81833
Epoch 85/100

Epoch 00085: val_loss did not improve from 0.81833
Epoch 86/100

Epoch 00086: val_loss did not improve from 0.81833
Epoch 87/100

Epoch 00087: val_loss did not improve from 0.81833
Epoch 88/100

Epoch 00088: val_loss did not improve from 0.81833
Epoch 89/100

Epoch 00089: val_loss did not improve from 0.81833
Epoch 90/100

Epoch 00090: val_loss did not improve from 0.81833
Epoch 91/100

Epoch 00091: val_loss did not improve from 0.81833
Epoch 92/100

Epoch 00092: val_loss did not improve from 0.81833
Epoch 93/100

Epoch 00093: val_loss did not improve from 0.81833
Epoch 94/100

Epoch 00094: val_loss did not improve from 0.81833
Epoch 95/100

Epoch 00095: val_loss did not improve from 0.81833
Epoch 96/100

Epoch 00096: val_loss di

In [65]:
model = load_model("model.h5")

In [67]:
def predictions(text):
    clean = re.sub(r'[^ a-z A-Z 0-9]', " ", text)
    test_word = word_tokenize(clean)
    test_word = [w.lower() for w in test_word]
    test_ls = word_tokenizer.texts_to_sequences(test_word)
    print(test_word)
    #Check for unknown words
    if [] in test_ls:
        test_ls = list(filter(None, test_ls))
    test_ls = np.array(test_ls).reshape(1, len(test_ls))
    x = padding_doc(test_ls, max_length)
    pred = model.predict_proba(x)

    return pred

In [68]:
def get_final_output(pred, classes):
    predictions = pred[0]
    classes = np.array(classes)
    ids = np.argsort(-predictions)
    classes = classes[ids]
    predictions = -np.sort(-predictions)

    for i in range(pred.shape[1]):
        print("%s has confidence = %s" % (classes[i], (predictions[i])))

In [80]:
text = "i want to get in touch"
pred = predictions(text)
print(pred)
get_final_output(pred, unique_intents)

['i', 'want', 'to', 'get', 'in', 'touch']
[[1.60746351e-02 1.78172905e-02 7.64280921e-05 2.40478739e-02
  4.42507714e-02 3.03787410e-01 3.71068040e-06 2.02602092e-02
  1.07709114e-02 3.75265479e-01 8.54797382e-03 1.51545510e-01
  1.11071905e-02 5.02497656e-04 2.63005350e-04 6.48355999e-05
  1.54846246e-04 8.20696951e-05 1.00626657e-02 7.10455875e-04
  4.60428558e-03]]
contact.contact has confidence = 0.37526548
faq.biz_new has confidence = 0.3037874
commonQ.bot has confidence = 0.15154551
faq.application_process has confidence = 0.04425077
faq.apply_register has confidence = 0.024047874
commonQ.how has confidence = 0.02026021
commonQ.name has confidence = 0.01781729
commonQ.not_giving has confidence = 0.016074635
commonQ.assist has confidence = 0.0111071905
commonQ.wait has confidence = 0.010770911
faq.bad_service has confidence = 0.010062666
commonQ.query has confidence = 0.008547974
commonQ.just_details has confidence = 0.0046042856
faq.approval_time has confidence = 0.0007104559
faq