### Packages Required

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from pathlib import Path
import string
import re
import joblib
import json
from collections import Counter
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import plot_model
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding,Dense,Flatten
from tensorflow.keras.layers import Conv1D
from tensorflow.keras.layers import MaxPooling1D
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping,ReduceLROnPlateau

### Datasets

In [2]:
path = 'C:/Users/Nithin/Downloads/Chatbot Creation/'
file = open(path + 'Datasets/Question & Answer.json')
data = json.load(file)

In [7]:
data[18]['question']

'young doc says closure by "second intention " yet my site is clean/intact! i want him to evacuate postop scrotal hematoma end w/sutures. am i wrong?'

In [8]:
question = []
tags = []
for i in range(0,25000):
    to_app = data[i]['question']
    question.append(to_app)
    to_ap = data[i]['tags'][0]
    tags.append(to_ap)

In [10]:
question_ser = pd.Series(question)
tags_ser = pd.Series(tags)

In [11]:
df_quest = pd.concat([question_ser,tags_ser],axis=1)

In [12]:
df_quest = df_quest.rename(columns={0:'question',1:'tags'})

In [14]:
answer = []
tags = []
for i in range(0,25000):
    to_app = data[i]['answer']
    answer.append(to_app)
    to_ap = data[i]['tags'][0]
    tags.append(to_ap)

In [15]:
answer_ser = pd.Series(answer)
tags_ser = pd.Series(tags)

In [16]:
df_ans = pd.concat([answer_ser,tags_ser],axis=1)

In [17]:
df_ans = df_ans.rename(columns={0:'answer',1:'tags'})

In [22]:
lemmatizer=WordNetLemmatizer()
vocab=Counter()
labels=[]
def tokenizer(entry):
    tokens=entry.split()
    re_punc=re.compile('[%s]'% re.escape(string.punctuation))
    tokens=[re_punc.sub('',w) for w in tokens]
    tokens=[word for word in tokens if word.isalpha()]
    tokens=[lemmatizer.lemmatize(w.lower()) for w in tokens]
    tokens=[word.lower() for word in tokens if len(word)>1]
    return tokens

In [23]:
def remove_stop_words(tokenizer,df,feature):
    doc_without_stopwords=[]
    for entry in df[feature]:
        tokens=tokenizer(entry)
        joblib.dump(tokens,'tokens.pkl')
        doc_without_stopwords.append(' '.join(tokens))
    df[feature]=doc_without_stopwords
    return

In [24]:
def create_vocab(tokenizer,df,feature):
    for entry in df[feature]:
        tokens=tokenizer(entry)
        vocab.update(tokens)
    joblib.dump(vocab,'vocab.pkl')
    return

In [25]:
create_vocab(tokenizer,df_quest,'question')

In [26]:
remove_stop_words(tokenizer,df_quest,'question')

In [27]:
print(vocab.most_common(20))

[('to', 12864), ('is', 12459), ('and', 11013), ('the', 9286), ('my', 8777), ('it', 8262), ('what', 8010), ('have', 6804), ('of', 5804), ('for', 5663), ('weight', 5598), ('in', 4895), ('on', 4314), ('do', 3943), ('can', 3817), ('with', 3018), ('if', 2897), ('am', 2713), ('im', 2668), ('or', 2606)]


In [28]:
vocab_size=len(vocab)
vocab_size

11153

In [29]:
test_list=list(df_quest.groupby(by='tags',as_index=False).first()['question'])
test_list

['zoloft sertraline side effect doe it make you gain or lose weight',
 'zoloft sertaline cause heart racing cant sit still is this normal',
 'zyprexa seroquel risperdal abilify all can have porential risk of raising blood sugar doe latuda lurasidone carry this risk also not to familiar with it',
 'zyprexa olanzapine is causing me metabolic syndrome need zyprexa olanzapine but not the syndrome what should do',
 'zoloft sertraline is th only antidepressant so far that have been able to tolerate but can it make me increase weight',
 'zyban bupropion question ha anyone prescribed zyban bupropion to quit smoking',
 'zinc deficiency mg for adult male and diarrheal disease zinc tablet daily help two cause constipation is there anything else to consider',
 'zoloft sertraline taken overdose double dose what symptom to look for',
 'zirconium dental implant how common is it used now is there any advantage or benefit over titanium implant con pro please thanks',
 'zit ha honey coloured crust and i

In [35]:
test_index=[]
for i,_ in enumerate(test_list):
    idx=df_quest[df_quest.question==test_list[i]].index[0]
    test_index.append(idx)
test_index

[7, 6, 23, 26, 27, 31, 67, 68, 0, 73, 8]

In [36]:
train_index=[i for i in df_quest.index if i not in test_index]

In [37]:
' '.join(list(vocab.keys()))



In [38]:
def encoder(df,feature):
    t=Tokenizer()
    entries=[entry for entry in df[feature]]
    t.fit_on_texts(entries)
    joblib.dump(t,'tokenizer_t.pkl')
    vocab_size=len(t.word_index)+1
    entries=[entry for entry in df[feature]]
    max_length=max([len(s.split())for s in entries])
    encoded=t.texts_to_sequences(entries)
    padded=pad_sequences(encoded,maxlen=max_length,padding='post')
    return padded,vocab_size

In [40]:
x,vocab_size=encoder(df_quest,'question')

In [41]:
vocab_size

11154

In [43]:
df_encoded=pd.DataFrame(x)

In [115]:
df_encoded['labels']=df_quest.tags
df_encoded.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,45,46,47,48,49,50,51,52,53,labels
0,3192,1074,891,23,294,2,6,250,52,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,dentistry
1,3192,1074,891,23,294,2,6,250,52,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,dentistry
2,3192,1074,891,23,294,2,6,250,52,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,dentistry
3,3192,1074,891,23,294,2,6,250,52,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,dentistry
4,3192,1074,891,23,294,2,6,250,52,2,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,dentistry


In [45]:
for i in range(0,2):
    dt=[0]*16
    dt.append('confused')
    dt=[dt]
    pd.DataFrame(dt).rename(columns={16:'labels'})
    df_encoded=df_encoded.append(pd.DataFrame(dt).rename(columns={16:'labels'}),ignore_index=True)

In [117]:
df_encoded.tail()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,45,46,47,48,49,50,51,52,53,labels
24997,8,237,1456,137,81,6,25,297,33,131,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,bariatrics
24998,8,237,1456,137,81,6,25,297,33,131,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,bariatrics
24999,8,1,237,1456,4585,6,736,39,111,12,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,bariatrics
25000,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,
25001,0,0,0,0,0,0,0,0,0,0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,


In [118]:
df_encoded.fillna(0,inplace=True)

In [49]:
train_index.append(422)
test_index.append(423)

In [50]:
from sklearn.preprocessing import LabelEncoder
label_enc=LabelEncoder()

In [51]:
labl=label_enc.fit_transform(df_encoded.labels)
labl

array([9, 9, 9, ..., 0, 7, 7])

In [54]:
mapper={}
for index,key in enumerate(df_encoded.labels):
    if key not in mapper.keys():
        mapper[key]=labl[index]
mapper

{'dentistry': 9,
 'cardiology': 1,
 'bariatrics': 0,
 'wound care': 11,
 'child psychiatry': 2,
 'clinical genetics': 3,
 'clinical lipidology': 4,
 'clinical psychology': 5,
 'colon and rectal surgery': 6,
 'critical care': 8,
 'dermatology': 10,
 'confused': 7}

In [56]:
df_ans.tags=df_ans.tags.map(mapper)

In [57]:
df_ans.dropna(inplace=True)

In [59]:
df_ans.tags=df_ans.tags.astype({'tags':'int32'})

In [60]:
df_ans.head()

Unnamed: 0,answer,tags
0,a majority of the dental implants placed are t...,9
1,and the data on zirconia implants is much more...,9
2,dental implants when loaded transfer stress to...,9
3,stick with what we know works -- titanium. let...,9
4,the vast majority of dental implants placed ar...,9


### Model Building

In [63]:
train=df_encoded.loc[train_index]
test=df_encoded.loc[test_index]

In [64]:
x_train=train.drop(columns=['labels'],axis=1)
y_train=train.labels
x_test=test.drop(columns=['labels'],axis=1)
y_test=test.labels

In [65]:
y_train=pd.get_dummies(y_train).values
y_test=pd.get_dummies(y_test).values

In [122]:
y_train[0].shape,y_test[0].shape

((9,), (11,))

In [67]:
print(x_train.shape,y_train.shape,x_test.shape,y_test.shape)

(24990, 54) (24990, 9) (12, 54) (12, 11)


In [124]:
max_length=x_train.shape[1]
output=9

In [98]:
early_stopping=EarlyStopping(monitor='val_loss',patience=10)
checkpoint=ModelCheckpoint("model-v1.h5",
                            monitor="val_loss",
                            mode="min",
                            save_best_only=True,
                           verbrose=2

)
reduce_lr=ReduceLROnPlateau(monitor="val_loss",factor=0.2,patience=3,verbose=1,min_delta=0.0001)
callbacks=[early_stopping,checkpoint,reduce_lr]

In [125]:
def define_model(vocab_size,max_length):
    model=Sequential()
    model.add(Embedding(vocab_size,600,input_length=max_length))
    model.add(Conv1D(filters=64,kernel_size=4,activation='relu'))
    model.add(MaxPooling1D(pool_size=8))
    model.add(Flatten())
    model.add(Dense(output,activation='softmax'))

    #compile networks
    model.compile(loss='categorical_crossentropy',
                  optimizer='adam',
                  metrics=['accuracy'])

    #summarise defined model
    model.summary()
    plot_model(model,to_file='model.png',show_shapes=True)
    return model

In [126]:
model=define_model(vocab_size,max_length)

Model: "sequential_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_7 (Embedding)     (None, 54, 600)           6692400   
                                                                 
 conv1d_7 (Conv1D)           (None, 51, 64)            153664    
                                                                 
 max_pooling1d_7 (MaxPooling  (None, 6, 64)            0         
 1D)                                                             
                                                                 
 flatten_7 (Flatten)         (None, 384)               0         
                                                                 
 dense_7 (Dense)             (None, 9)                 3465      
                                                                 
Total params: 6,849,529
Trainable params: 6,849,529
Non-trainable params: 0
____________________________________________

In [127]:
# Training the model
history=model.fit(x_train,y_train,epochs=1,callbacks=callbacks)

