In [1]:
import nltk
nltk.download('omw-1.4')

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\mgopa\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [2]:
#libraries

import random
from keras.optimizers import SGD
from keras.layers import Dense, Dropout
from keras.models import load_model
from keras.models import Sequential
import numpy as np
import pickle
import json
import nltk
from nltk.stem import WordNetLemmatizer
lemmatizer=WordNetLemmatizer()
nltk.download("punkt")
nltk.download("wordnet")

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\mgopa\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\mgopa\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [4]:

#init file
words=[]
classes=[]
documents = []
ignore_words=["?","!"]
data_file=open("intents.json").read()
intents = json.loads(data_file)

In [5]:

#words
for intent in intents["intents"]: 
  for pattern in intent["patterns"]:
    #take each word and tokenize it 
    w=nltk.word_tokenize(pattern)
    words.extend(w)
    #adding documents
    documents.append((w, intent["tag"]))
    # adding classes to our class list
    if intent["tag"] not in classes: 
      classes.append(intent["tag"])

In [6]:

#lemmatizer

words=[lemmatizer.lemmatize(w.lower()) for w in words if w not in ignore_words]
words=sorted(list(set(words)))
classes=sorted(list(set(classes)))
print(len(documents), "documents")
print(len(classes), "classes", classes)
print(len(words), "unique lemmatized words", words)
pickle.dump(words, open("words.pkl", "wb"))
pickle.dump(classes, open("classes.pkl", "wb"))


261 documents
28 classes ['admission', 'canteen', 'college intake', 'committee', 'course', 'event', 'facilities', 'fees', 'goodbye', 'greeting', 'hostel', 'hours', 'infrastructure', 'ithod', 'library', 'location', 'menu', 'number', 'placement', 'principal', 'ragging', 'salutaion', 'sem', 'size', 'syllabus', 'task', 'transport', 'vacation']
195 unique lemmatized words ["'s", 'a', 'about', 'active', 'address', 'admission', 'allotment', 'am', 'and', 'antiragging', 'any', 'anyone', 'are', 'at', 'attend', 'available', 'average', 'be', 'between', 'big', 'book', 'branch', 'bus', 'bye', 'cafetaria', 'call', 'campus', 'can', 'canteen', 'capacity', 'case', 'college', 'come', 'committe', 'committee', 'company', 'conducted', 'contact', 'course', 'cya', 'date', 'day', 'detail', 'different', 'distance', 'do', 'doe', 'done', 'each', 'end', 'event', 'exam', 'facility', 'far', 'fee', 'first', 'food', 'for', 'from', 'function', 'get', 'give', 'go', 'good', 'goodbye', 'got', 'gtg', 'guy', 'gvp', 'gvpce',

In [7]:
#training initializer
#initializing training data
training=[]
output_empty=[0]*len(classes)
for doc in documents:
  #initializing bag of words
  bag=[]
  #list of tokenized words for the pattern
  pattern_words=doc[0]
  #lemmatize each word - create base word, in attempt to represent related words
  pattern_words = [lemmatizer.lemmatize(word.lower()) for word in pattern_words] 
  #create our bag of words array with 1, if word match found in current pattern
  for w in words:
    bag.append(1) if w in pattern_words else bag.append(0)
  # output is a '0' for each tag and '1' for current tag (for each pattern)
  output_row=list(output_empty) 
  output_row[classes.index(doc[1])] = 1
  training.append( [bag,output_row])

In [8]:

#shuffle our features and turn into np.array 
random.shuffle(training)
training = np.array(training)
#create train and test lists. X-patterns, Y - intents
train_x=list(training[:, 0])
train_y=list(training[:, 1])
print("Training data created")

Training data created


  training = np.array(training)


In [9]:

# actual training

#Create model - 3 layers. First layer 128 neurons, second layer 64 neurons and 3rd output layer contains number of neurons
# equal to number of intents to predict output intent with softmax
model = Sequential()
model.add(Dense(128, input_shape=(len(train_x[0]),), activation="relu"))
model.add(Dropout(0.5))
model.add(Dense(64, activation="relu"))
model.add(Dropout(0.5))
model.add(Dense(len(train_y[0]), activation="softmax"))
model.summary()
#Compile model. Stochastic gradient descent with Nesterov accelerated gradient gives good results for this model

sgd=SGD(lr=0.01, decay=1e-6,momentum=0.9,nesterov=True)
model.compile(loss="categorical_crossentropy", optimizer=sgd, metrics=["accuracy"])
# for choosing an optimal number of training epochs to avoid underfitting or overfitting use an early stopping callback to keras
# based on either accuracy or loos monitoring. If the loss is being monitored, training comes to halt when there is an
#increment observed in loss values. Or, If accuracy is being monitored, training comes to halt when there is decrement observed in accuracy values.
# fitting and saving the model
hist = model.fit(np.array(train_x), np.array(train_y), epochs=200, batch_size=5, verbose=1)
model.save("chatbot_mode.h5", hist)
print("model created")

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 dense (Dense)               (None, 128)               25088     
                                                                 
 dropout (Dropout)           (None, 128)               0         
                                                                 
 dense_1 (Dense)             (None, 64)                8256      
                                                                 
 dropout_1 (Dropout)         (None, 64)                0         
                                                                 
 dense_2 (Dense)             (None, 28)                1820      
                                                                 
Total params: 35,164
Trainable params: 35,164
Non-trainable params: 0
_________________________________________________________________


  super().__init__(name, **kwargs)


Epoch 1/200
Epoch 2/200
Epoch 3/200
Epoch 4/200
Epoch 5/200
Epoch 6/200
Epoch 7/200
Epoch 8/200
Epoch 9/200
Epoch 10/200
Epoch 11/200
Epoch 12/200
Epoch 13/200
Epoch 14/200
Epoch 15/200
Epoch 16/200
Epoch 17/200
Epoch 18/200
Epoch 19/200
Epoch 20/200
Epoch 21/200
Epoch 22/200
Epoch 23/200
Epoch 24/200
Epoch 25/200
Epoch 26/200
Epoch 27/200
Epoch 28/200
Epoch 29/200
Epoch 30/200
Epoch 31/200
Epoch 32/200
Epoch 33/200
Epoch 34/200
Epoch 35/200
Epoch 36/200
Epoch 37/200
Epoch 38/200
Epoch 39/200
Epoch 40/200
Epoch 41/200
Epoch 42/200
Epoch 43/200
Epoch 44/200
Epoch 45/200
Epoch 46/200
Epoch 47/200
Epoch 48/200
Epoch 49/200
Epoch 50/200
Epoch 51/200
Epoch 52/200
Epoch 53/200
Epoch 54/200
Epoch 55/200
Epoch 56/200
Epoch 57/200
Epoch 58/200
Epoch 59/200
Epoch 60/200
Epoch 61/200
Epoch 62/200
Epoch 63/200
Epoch 64/200
Epoch 65/200
Epoch 66/200
Epoch 67/200
Epoch 68/200
Epoch 69/200
Epoch 70/200
Epoch 71/200
Epoch 72/200
Epoch 73/200
Epoch 74/200
Epoch 75/200
Epoch 76/200
Epoch 77/200
Epoch 78