<a href="https://colab.research.google.com/github/msalmanz93/Zana-Coding-Challenge/blob/master/Task2_Zana.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
# Install tflearn, the library used later in this notebook to define and train the intent classifier
!pip install tflearn

In [0]:
#Older version of tensorflow is required since some modules of tflearn isn't supported by  
!pip uninstall tensorflow
pip install tensorflow==1.14

In [0]:
import nltk
# Fetch the 'punkt' tokenizer resource that nltk.word_tokenize (called later in this notebook) depends on
nltk.download('punkt')

In [0]:
# things we need for NLP
import nltk
from nltk.stem.lancaster import LancasterStemmer
# Single shared stemmer instance: used to build the vocabulary, to encode the training
# patterns, and to normalise sentences at prediction time
stemmer = LancasterStemmer()

# things we need for Tensorflow
import numpy as np
import tflearn
import tensorflow as tf
import random

In [0]:
# Colab-only helper: opens a file picker in the browser so the intents file can be uploaded
# to the runtime. The next cell opens 'intent.json' by relative path, so upload it under that name.
from google.colab import files
uploaded = files.upload()

In [0]:
# import the json intent file
import json
# JSON text is UTF-8 by specification; state the encoding explicitly so that decoding
# does not depend on the platform's default locale.
with open('intent.json', encoding='utf-8') as json_data:
    intents = json.load(json_data)

# **Preparing Data**
*   Tokenise patterns into array of words
*   Lower-case and stem all words so that related word forms map to the same token. Example: "Pharmacy" → "pharm"
*   Create list of classes — intents
*   Create list of documents — combination between list of patterns and list of intents

In [0]:
# Accumulators filled while walking the intents file
ignore_words = ['?']
documents = []
classes = []
words = []

# Visit every example pattern of every intent
for intent in intents['intents']:
    for pattern in intent['patterns']:
        tag = intent['tag']
        # split the sentence into tokens
        tokens = nltk.word_tokenize(pattern)
        # raw tokens feed the vocabulary pool ...
        words.extend(tokens)
        # ... and the (tokens, tag) pair becomes one document of the corpus
        documents.append((tokens, tag))
        # record each tag the first time it is seen
        if tag not in classes:
            classes.append(tag)

# Vocabulary: drop ignored tokens, lower-case and stem the rest, de-duplicate, sort
words = sorted({stemmer.stem(token.lower()) for token in words if token not in ignore_words})

# Intent tags, de-duplicated and sorted
classes = sorted(set(classes))

# documents = (pattern tokens, intent tag) pairs
print(len(documents), "documents")
# classes = intent tags
print(len(classes), "classes", classes)
# words = the stemmed vocabulary
print(len(words), "unique stemmed words", words)

45 documents
9 classes ['adverse_drug', 'blood_pressure', 'blood_pressure_search', 'goodbye', 'greeting', 'hospital_search', 'options', 'pharmacy_search', 'thanks']
82 unique stemmed words ["'s", ',', 'a', 'advers', 'al', 'anyon', 'ar', 'awesom', 'be', 'behavy', 'blood', 'by', 'bye', 'can', 'caus', 'chat', 'check', 'could', 'dat', 'day', 'detail', 'do', 'dont', 'drug', 'entry', 'find', 'for', 'giv', 'good', 'goodby', 'hav', 'hello', 'help', 'hi', 'hist', 'hospit', 'how', 'i', 'id', 'is', 'lat', 'list', 'load', 'loc', 'log', 'look', 'lookup', 'man', 'me', 'mod', 'nearby', 'next', 'nic', 'of', 'off', 'op', 'paty', 'pharm', 'press', 'provid', 'react', 'rel', 'result', 'search', 'see', 'show', 'suit', 'support', 'task', 'thank', 'that', 'ther', 'til', 'tim', 'to', 'transf', 'up', 'want', 'what', 'which', 'with', 'you']


# **Training** Neural Network 
*   Define X input shape — equal to the word vocabulary size
*   Define two hidden layers with 8 neurons each — optimal for this text classification task (based on experiments)
*   Define Y output shape — equal to the number of intents

In [0]:
# Build the training set: one [bag-of-words, one-hot tag] pair per document.
training = []
# template for the one-hot output row (one slot per class)
output_empty = [0] * len(classes)

for doc in documents:
    # Stem the pattern's tokens so they are comparable with the stemmed vocabulary.
    # A set gives constant-time membership tests for the comprehension below.
    pattern_words = {stemmer.stem(word.lower()) for word in doc[0]}
    # bag of words: 1 where the vocabulary word occurs in this pattern, otherwise 0
    bag = [1 if w in pattern_words else 0 for w in words]

    # output is a '0' for each tag and '1' for the current tag
    output_row = list(output_empty)
    output_row[classes.index(doc[1])] = 1

    training.append([bag, output_row])

# shuffle the samples so training batches mix the intents
random.shuffle(training)

# Split into features and labels straight from the Python lists. X - patterns, Y - intents
train_x = [bag for bag, _ in training]
train_y = [output_row for _, output_row in training]

# Each row pairs a vocabulary-length bag with a class-length one-hot row, i.e. the rows
# are ragged. dtype=object is required: without it NumPy emits a deprecation warning on
# older releases and raises ValueError on NumPy >= 1.24.
training = np.array(training, dtype=object)

*   Apply a regression layer, which sets up the optimizer and loss used to fit the network's parameters
*   Run model.fit to construct the classification model. Provide X/Y inputs, number of epochs and batch size
*   In each epoch, the training data is processed in batches to adjust the model parameters so that future input, converted to an array of 0/1, can be classified





In [0]:
# Clear the default TensorFlow graph so the network below is built from scratch
tf.reset_default_graph()
# Input layer: one feature per vocabulary word (the length of a single bag-of-words vector);
# None leaves the batch dimension unspecified
net = tflearn.input_data(shape=[None, len(train_x[0])])
# Two fully connected hidden layers with 8 units each - the size the author found optimal for this task
net = tflearn.fully_connected(net, 8)
net = tflearn.fully_connected(net, 8)
# Output layer: one softmax probability per intent (the length of a one-hot row in train_y)
net = tflearn.fully_connected(net, len(train_y[0]), activation='softmax')
# Attach the training configuration using tflearn's defaults
# (the training log of this notebook reports Adam as the optimizer)
net = tflearn.regression(net)

# Wrap the network in a DNN model; tensorboard_dir names the folder used for TensorBoard logs
model = tflearn.DNN(net, tensorboard_dir='tflearn_chatbot_redsamurai_medical_logs')
# Start training
# n_epoch - number of passes over the training data
# batch_size - number of samples propagated through the network per training step
# show_metric=True - report accuracy next to the loss in the training log
model.fit(train_x, train_y, n_epoch=1000, batch_size=5, show_metric=True)
# Save the trained model under this checkpoint name
model.save('chatbot_redsamurai_medical_model.tflearn')

Training Step: 8999  | total loss: [1m[32m0.21150[0m[0m | time: 0.025s
| Adam | epoch: 1000 | loss: 0.21150 - acc: 0.9882 -- iter: 40/45
Training Step: 9000  | total loss: [1m[32m0.19273[0m[0m | time: 0.028s
| Adam | epoch: 1000 | loss: 0.19273 - acc: 0.9882 -- iter: 45/45
--
INFO:tensorflow:/content/chatbot_redsamurai_medical_model.tflearn is not in all_model_checkpoint_paths. Manually adding it.


# **Initial Model Testing**

*   Tokenise input sentence — split it into array of words
*   Create bag of words (array with 0/1) for the input sentence — array equal to the size of vocabulary, with 1 for each word found in input sentence
*   Run model.predict with given bag of words array, this will return probability for each intent





In [0]:
def clean_up_sentence(sentence):
    """Tokenize ``sentence`` and return its tokens lower-cased and stemmed.

    Splitting is done with ``nltk.word_tokenize`` and stemming with the
    module-level ``stemmer``, i.e. the same normalisation that is applied to
    the training patterns.

    Returns:
        list of stemmed token strings, in sentence order.
    """
    return [stemmer.stem(token.lower()) for token in nltk.word_tokenize(sentence)]

# return bag of words array: 0 or 1 for each word in the bag that exists in the sentence
def bow(sentence, words, show_details=True):
    """Encode ``sentence`` as a bag-of-words vector over the vocabulary ``words``.

    Args:
        sentence: raw input text; it is normalised with ``clean_up_sentence``.
        words: vocabulary list; position i of the result corresponds to words[i].
        show_details: when true, print a line for every vocabulary match.

    Returns:
        numpy array of len(words) entries, 1 where the vocabulary word occurs
        in the sentence and 0 elsewhere.
    """
    tokens = clean_up_sentence(sentence)
    # start with an all-zero vector, one slot per vocabulary word
    bag = [0] * len(words)
    for token in tokens:
        for position, vocab_word in enumerate(words):
            if vocab_word != token:
                continue
            # mark the vocabulary position that matches the current token
            bag[position] = 1
            if show_details:
                print("found in bag: %s" % vocab_word)

    return np.array(bag)

In [0]:
# Sanity check: encode a sample sentence and print the resulting bag next to the class list.
# NOTE(review): "pessure" looks like a typo for "pressure" - confirm whether it is intentional.
p = bow("Load blood pessure for patient", words)
print (p)
print (classes)

found in bag: load
found in bag: blood
found in bag: for
found in bag: paty
[0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
 0 0 0 0 0 0 0 0]
['adverse_drug', 'blood_pressure', 'blood_pressure_search', 'goodbye', 'greeting', 'hospital_search', 'options', 'pharmacy_search', 'thanks']


In [0]:
# Predicted probability per intent for the sample bag; columns follow the order of `classes`
print(model.predict([p]))

[[5.6332779e-11 5.1041122e-04 7.9476154e-01 3.2390458e-06 1.6157302e-06
  2.0472319e-01 1.8833681e-13 7.8192126e-11 2.9850671e-08]]
