## Import Needed Modules

In [1]:
import json
import random
import numpy as np

import nltk
nltk.download('punkt')
nltk.download('wordnet')
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

from keras.models import Sequential
from keras.layers import Dense, Activation, Dropout
from keras.optimizers import SGD

[nltk_data] Downloading package punkt to /root/nltk_data...

[nltk_data]   Package punkt is already up-to-date!

[nltk_data] Downloading package wordnet to /root/nltk_data...

[nltk_data]   Package wordnet is already up-to-date!


## Data Analysis and Preprocessing

#### Read data

In [2]:
# Load the intent definitions. The context manager guarantees the file handle
# is closed even if reading fails; JSON text is read explicitly as UTF-8.
with open('/kaggle/input/books-dataset/intents.json', encoding='utf-8') as intents_fh:
    data_file = intents_fh.read()
intents = json.loads(data_file)

In [3]:
# Accumulators filled while walking the intent patterns:
#   words     - every token seen across all patterns
#   classes   - the distinct intent tags
#   documents - (token list, tag) pairs, one per pattern
words, classes, documents = [], [], []

# Punctuation tokens that are excluded from the vocabulary.
ignore_words = ['?', '!']

### Preprocessing Text

In [4]:
# Start from empty accumulators so that re-running this cell does not append
# a second copy of every pattern to `documents`.
words = []
classes = []
documents = []

for intent in intents['intents']:
    tag = intent['tag']
    for pattern in intent['patterns']:

        # split the pattern into tokens and add them to the raw vocabulary
        tokens = nltk.word_tokenize(pattern)
        words.extend(tokens)

        # one document per pattern: (tokens, tag)
        documents.append((tokens, tag))

        # record the tag the first time it is seen (only tags that have at
        # least one pattern end up in `classes`)
        if tag not in classes:
            classes.append(tag)

# Vocabulary: lowercase + lemmatize every token that is not ignored
# punctuation, then de-duplicate and sort for a stable feature order.
words = sorted({lemmatizer.lemmatize(w.lower()) for w in words if w not in ignore_words})

# Sorted, de-duplicated tag list; its order defines the one-hot label layout.
classes = sorted(set(classes))

print (len(documents), "documents")
print (len(words), "unique lemmatized words")
print (len(classes), "classes", classes)

161 documents

107 unique lemmatized words

51 classes ['Adventure stories', 'American fiction', 'Architecture', 'Art', 'Biography & Autobiography', 'Body, Mind & Spirit', 'Business & Economics', "Children's stories", 'Comics & Graphic Novels', 'Computers', 'Cooking', 'Detective and mystery stories', 'Drama', 'Education', 'English fiction', 'Family & Relationships', 'Fantasy fiction', 'Fiction', 'Foreign Language Study', 'Games', 'Health & Fitness', 'History', 'Humor', 'Juvenile Fiction', 'Juvenile Nonfiction', 'Language Arts & Disciplines', 'Law', 'Literary Collections', 'Literary Criticism', 'Medical', 'Music', 'Nature', 'Performing Arts', 'Philosophy', 'Photography', 'Poetry', 'Political Science', 'Psychology', 'Religion', 'Science', 'Science fiction', 'Self-Help', 'Social Science', 'Sports & Recreation', 'Travel', 'True Crime', 'Young Adult Fiction', 'book_search', 'goodbye', 'greeting', 'thanks']


### Initializing Training Data

In [5]:
# Build one (bag-of-words, one-hot label) pair per document.
training = []
output_empty = [0] * len(classes)
for doc in documents:

    # lemmatize the pattern's tokens so they line up with the vocabulary;
    # a set makes the membership test below constant time
    pattern_words = {lemmatizer.lemmatize(word.lower()) for word in doc[0]}

    # bag of words: 1 where the vocabulary word occurs in the pattern, else 0
    bag = [1 if w in pattern_words else 0 for w in words]

    # one-hot label: '0' for each tag and '1' for the current tag
    output_row = list(output_empty)
    output_row[classes.index(doc[1])] = 1

    training.append([bag, output_row])

# shuffle the samples so features and labels stay paired
random.shuffle(training)

# create train lists. X - patterns, Y - intents.
# They are taken straight from the Python list: each sample pairs a
# len(words)-long bag with a len(classes)-long label, so the rows are ragged
# and must not go through a regular (non-object) NumPy array.
train_x = [sample[0] for sample in training]
train_y = [sample[1] for sample in training]

# Keep `training` available as an array, as before. dtype=object is required
# because of the ragged rows: without it NumPy emits a deprecation warning on
# older versions and raises ValueError on newer ones.
training = np.array(training, dtype=object)


  training = np.array(training)


## Model Training

In [6]:
# Create model - 3 layers. First layer 128 neurons, second layer 64 neurons and the 3rd (output) layer has one neuron
# per intent, with softmax so the outputs form a probability distribution over intents.
# Dropout after each hidden layer randomly zeroes half of the activations during training.
model = Sequential()
model.add(Dense(128, input_shape=(len(train_x[0]),), activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(len(train_y[0]), activation='softmax'))

# Compile model. Stochastic gradient descent with Nesterov accelerated gradient gives good results for this model.
# `learning_rate` is the supported argument name; the old `lr` alias is deprecated and rejected by newer Keras.
sgd = SGD(learning_rate=0.01, momentum=0.9, nesterov=True)
model.compile(loss='categorical_crossentropy', optimizer=sgd, metrics=['accuracy'])



In [7]:
# Fit the model on the whole pattern set and keep the training history.
hist = model.fit(np.array(train_x), np.array(train_y), epochs=500, batch_size=5, verbose=1)

# Persist the trained network. Only the path is passed: the second positional
# parameter of Model.save is `overwrite`, so passing the History object there
# was a misuse rather than a way to store the history.
model.save('chatbot_model.h5')

print("model created")

Epoch 1/500


Epoch 2/500


Epoch 3/500


Epoch 4/500


Epoch 5/500


Epoch 6/500


Epoch 7/500


Epoch 8/500


Epoch 9/500


Epoch 10/500


Epoch 11/500


Epoch 12/500


Epoch 13/500


Epoch 14/500


Epoch 15/500


Epoch 16/500


Epoch 17/500


Epoch 18/500


Epoch 19/500


Epoch 20/500


Epoch 21/500


Epoch 22/500


Epoch 23/500


Epoch 24/500


Epoch 25/500


Epoch 26/500


Epoch 27/500


Epoch 28/500


Epoch 29/500


Epoch 30/500


Epoch 31/500


Epoch 32/500


Epoch 33/500


Epoch 34/500


Epoch 35/500


Epoch 36/500


Epoch 37/500


Epoch 38/500


Epoch 39/500


Epoch 40/500


Epoch 41/500


Epoch 42/500


Epoch 43/500


Epoch 44/500


Epoch 45/500


Epoch 46/500


Epoch 47/500


Epoch 48/500


Epoch 49/500


Epoch 50/500


Epoch 51/500


Epoch 52/500


Epoch 53/500


Epoch 54/500


Epoch 55/500


Epoch 56/500


Epoch 57/500


Epoch 58/500


Epoch 59/500


Epoch 60/500


Epoch 61/500


Epoch 62/500


Epoch 63/500


Epoch 64/500


Epoch 65/500


Epoch 66/500


Epoch 67/500


Epoc


  saving_api.save_model(


## Chatbot Prediction

### Function to clean user input

In [8]:
def clean_up_sentence(sentence):
    """Tokenize ``sentence`` and return its tokens lowercased and lemmatized.

    Args:
        sentence: raw text to process.

    Returns:
        A list with one lemmatized, lowercased string per token, in order.
    """
    tokens = nltk.word_tokenize(sentence)
    return [lemmatizer.lemmatize(token.lower()) for token in tokens]

### Function for Bag of Words

In [9]:
def bow(sentence, words, show_details=True):
    """Encode ``sentence`` as a binary bag-of-words vector over ``words``.

    Args:
        sentence: raw text; it is tokenized and lemmatized by
            ``clean_up_sentence``.
        words: the vocabulary; position ``i`` of the result corresponds to
            ``words[i]``.
        show_details: when true, print a line for every vocabulary match.

    Returns:
        A NumPy array of length ``len(words)`` holding 1 where the vocabulary
        word occurs in the sentence and 0 elsewhere.
    """
    tokens = clean_up_sentence(sentence)

    # start with an all-zero vector and switch on the matching positions
    vector = [0] * len(words)
    for token in tokens:
        for position, vocab_word in enumerate(words):
            if vocab_word != token:
                continue
            vector[position] = 1
            if show_details:
                print ("found in bag: %s" % vocab_word)

    return np.array(vector)

### Function for Class Prediction

In [10]:
def predict_class(sentence, model):
    """Return the intents predicted for ``sentence``, most probable first.

    The sentence is encoded with ``bow`` against the module-level ``words``
    vocabulary and scored by ``model.predict``. Only classes whose score is
    above the threshold are kept.

    Args:
        sentence: raw user text.
        model: object exposing ``predict`` on a batch of bag-of-words vectors.

    Returns:
        A list of ``{"intent": <tag>, "probability": <score as str>}`` dicts
        sorted by descending score; empty when nothing clears the threshold.
    """
    ERROR_THRESHOLD = 0.25

    features = bow(sentence, words, show_details=False)
    scores = model.predict(np.array([features]))[0]

    # keep (class index, score) pairs above the threshold, strongest first
    kept = [[idx, score] for idx, score in enumerate(scores) if score > ERROR_THRESHOLD]
    kept.sort(key=lambda pair: pair[1], reverse=True)

    return [{"intent": classes[idx], "probability": str(score)} for idx, score in kept]

### Function to get chatbot response

In [11]:
def getResponse(ints, intents_json, fallback="Sorry, I didn't understand that. Could you rephrase?"):
    """Pick a random response for the top predicted intent.

    Args:
        ints: predicted intents ordered by confidence; each item is a dict
            with an ``'intent'`` key. May be empty when no prediction cleared
            the confidence threshold.
        intents_json: dict with an ``'intents'`` list whose items carry a
            ``'tag'`` and a list of ``'responses'``.
        fallback: value returned when there is no prediction, the predicted
            tag is not present in ``intents_json``, or the matching intent
            has no responses.

    Returns:
        One element of the matching intent's ``'responses'`` chosen at
        random, or ``fallback``.
    """
    # No intent cleared the threshold: nothing to look up.
    if not ints:
        return fallback

    tag = ints[0]['intent']
    for intent in intents_json['intents']:
        if intent['tag'] == tag:
            responses = intent['responses']
            # random.choice raises on an empty sequence, so guard against it
            return random.choice(responses) if responses else fallback

    # Predicted tag is unknown to the intents file.
    return fallback

### Chatbot Function

In [12]:
def chatbot_response(msg):
    """Return the chatbot's reply to ``msg``.

    The message is classified with ``predict_class`` using the module-level
    ``model``, and the reply is picked by ``getResponse`` from the
    module-level ``intents``.
    """
    predicted = predict_class(msg, model)
    return getResponse(predicted, intents)

## Chatbot

In [14]:
# Try the full pipeline on a sample request; the returned response is shown as the cell output.
chatbot_response('Recommend a book in History')



{'Book': 'The Chaneysville Incident',
 'Feedback': "The legends say something happened in Chaneysville. The Chaneysville Incident is the powerful story of one man's obsession with discovering what that something was--a quest that takes the brilliant and bitter young black historian John Washington back through the secrets and buried evil of his heritage. Returning home to care for and then bury his father's closest friend and his own guardian, Old Jack Crawley, he comes upon the scant records of his family's proud and tragic history, which he drives himself to reconstruct and accept. This is the story of John's relationship with his family, the town, and the woman he loves; and also between the past and the present, between oppression and guilt, hate and violence, love and acceptance.",
 'Rate': 3.96}