# Diabetic Retinopathy Detection ChatBot

In [None]:
import json
import os

#### Load the training dataset

In [None]:
# Read the intents file that drives both training and the bot's replies.
# JSON is UTF-8 by specification, so do not rely on the platform default
# encoding (which differs between operating systems).
with open("retQA.json", encoding="utf-8") as file:
    data = json.load(file)

In [None]:
print(data)

### Building the word Vocabulary and Tokenizing the Data

In [None]:
from nltk.corpus import stopwords
import numpy as np
import re
import random
import os
import time
import json
import nltk
from nltk.stem.lancaster import LancasterStemmer
stemmer = LancasterStemmer()

In [None]:
stop_words = set(stopwords.words('english'))

In [None]:
# Tokenize every training pattern and remember which intent tag it belongs to.
labels = []
docs_x = []  # raw token list per pattern
docs_y = []  # intent tag per pattern (parallel to docs_x)
for intent in data["intents"]:
    for pattern in intent["patterns"]:
        docs_x.append(nltk.word_tokenize(pattern))
        docs_y.append(intent["tag"])

# Same patterns with stop words dropped (the check is done on the raw,
# un-lowercased token).
sentences = [
    [token for token in tokens if token not in stop_words]
    for tokens in docs_x
]

# Stemmed, lower-cased pattern tokens; the "?" token is discarded.
words = [
    stemmer.stem(token.lower())
    for sentence in sentences
    for token in sentence
    if token != "?"
]

# Response text goes through the same filtering and stemming.
responses = [
    stemmer.stem(token.lower())
    for intent in data["intents"]
    for response in intent["responses"]
    for token in nltk.word_tokenize(response)
    if token not in stop_words and token != "?"
]

# Vocabulary = sorted unique stems from patterns and responses together.
vocab = sorted(set(words + responses))

In [None]:
print('Vectorizing the word sequences.....')
# Indices start at 1; 0 is left free (it is not assigned to any word).
word_idx = {token: position for position, token in enumerate(vocab, start=1)}
print('Done.')

In [None]:
word_idx

### Word representation model using FastText

In [None]:
from keras.preprocessing.text import Tokenizer
#from gensim.models.fasttext import FastText
import numpy as np
import matplotlib.pyplot as plt
import nltk
from string import punctuation
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import sent_tokenize
from nltk import WordPunctTokenizer

import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
en_stop = set(nltk.corpus.stopwords.words('english'))

%matplotlib inline

In [None]:
import re
from nltk.stem import WordNetLemmatizer

# Use a dedicated name for the lemmatizer: rebinding the global ``stemmer``
# here would replace the LancasterStemmer created earlier, and the later
# ``stemmer.stem(...)`` calls would fail (WordNetLemmatizer has no ``stem``).
lemmatizer = WordNetLemmatizer()


def preprocess_text(document):
    """Normalize a piece of text into a space-separated string of lemmas.

    Steps, in order: replace non-word characters with spaces, drop isolated
    single letters, collapse whitespace, strip a leading ``b`` prefix,
    lower-case, lemmatize each token, then discard stop words (``en_stop``)
    and tokens of three characters or fewer.

    Args:
        document: The text to clean; it is converted with ``str()`` first.

    Returns:
        The surviving tokens joined by single spaces.
    """
    # Remove all the special characters
    document = re.sub(r'\W', ' ', str(document))

    # Remove all single characters surrounded by whitespace
    document = re.sub(r'\s+[a-zA-Z]\s+', ' ', document)

    # Remove a single character at the start of the text. The anchor must be
    # an unescaped ``^``; ``\^`` would only match a literal caret.
    document = re.sub(r'^[a-zA-Z]\s+', ' ', document)

    # Substitute multiple spaces with a single space
    document = re.sub(r'\s+', ' ', document)

    # Remove a prefixed 'b'
    document = re.sub(r'^b\s+', '', document)

    # Convert to lowercase
    document = document.lower()

    # Lemmatize, then filter stop words and very short tokens
    tokens = [lemmatizer.lemmatize(word) for word in document.split()]
    tokens = [
        word for word in tokens
        if word not in en_stop and len(word) > 3
    ]

    return ' '.join(tokens)

In [None]:
preprocess_text('what are the symptoms of diabetic retinopathy')

In [None]:
# Experimental FastText model built from the vocabulary.
# NOTE(review): no active import of ``FastText`` is visible in this notebook
# (the gensim import above is commented out), so this cell presumably raises
# NameError unless it was imported elsewhere — confirm.
# NOTE(review): ``word_idx`` is a dict mapping stem -> index, not a list of
# token lists; verify that this is what ``build_vocab`` should receive.
model10 = FastText(size=15, window=3, min_count=5)
model10.build_vocab(sentences=word_idx)

In [None]:
embedding_size = 15
window_size = 3
min_word = 5
down_sampling = 1e-2

In [None]:
%%time
# Train a skip-gram (sg=1) FastText model using the hyper-parameters defined
# in the previous cell.
# NOTE(review): 'corpus.txt' is passed as the first positional argument; if
# this is gensim's FastText, a file path probably belongs in a dedicated
# keyword argument rather than the sentences slot — confirm against the
# installed version.
fast_text_model = FastText('corpus.txt',
                      size=embedding_size,
                      window=window_size,
                      min_count=min_word,
                      sample=down_sampling,
                      sg=1,
                      iter=100)

In [None]:
from fasttext import train_unsupervised

In [None]:
lr = train_unsupervised(input='corpus.txt', epoch=2, lr=1.0, wordNgrams=3, verbose=2, minCount=1)

lr.ws

In [None]:
import fasttext

# Skipgram model :
skip_model = fasttext.train_unsupervised('corpus.txt', model='skipgram')

# or, cbow model :
cbow_model = fasttext.train_unsupervised('corpus.txt', model='cbow')

In [None]:
skip_model

In [None]:
print(skip_model.words)   # list of words in the skip-gram model's dictionary
print(cbow_model.words)   # list of words in the CBOW model's dictionary
print(skip_model['excellent']) # get the skip-gram vector of the word 'excellent'
print(cbow_model['retinopathy']) # get the CBOW vector of the word 'retinopathy'

#### Vectorizing the corpus

In [None]:
from keras.preprocessing.sequence import pad_sequences

In [None]:
def vectorize(data):
    """Convert a raw sentence into a list of vocabulary indices.

    The sentence is tokenized, stop words and "?" are dropped, and the
    remaining tokens are lower-cased and stemmed — the same steps used to
    build ``vocab``. Stems that are not in the vocabulary are skipped.

    Args:
        data: The sentence to encode.

    Returns:
        A list of integer indices taken from ``word_idx``, in token order.
    """
    tokens = [w for w in nltk.word_tokenize(data) if w not in stop_words]
    stems = [stemmer.stem(w.lower()) for w in tokens if w != "?"]
    # Keep only known stems. The earlier version popped items while
    # enumerating the list and then looked up the *removed* (unknown) stems,
    # which raised KeyError.
    return [word_idx[stem] for stem in stems if stem in word_idx]


In [None]:
sentences

In [None]:
# Encode every filtered training sentence as a list of vocabulary indices
# ("?" tokens are skipped, the rest are lower-cased and stemmed first).
corpus = [
    [word_idx[stemmer.stem(token.lower())] for token in sentence if token != "?"]
    for sentence in sentences
]
# Pad / truncate every sequence to a fixed length of 8.
X_train = pad_sequences(corpus, 8)

In [None]:
X_train

In [None]:
len(set(docs_y))

#### Vectorizing the Tags

In [None]:
# Build the tag -> integer id lookup from the distinct intent tags.
# NOTE(review): ids follow the iteration order of a set of strings, which is
# not guaranteed to be the same in another interpreter session — worth
# confirming before reusing a saved model.
y = {tag: idx for idx, tag in enumerate(set(docs_y))}

# Convert tags to their integer representation
y_train = [y[tag] for tag in docs_y]

# One-hot encode the integer ids with keras
import keras

y_train = keras.utils.to_categorical(y_train)

In [None]:
y_train

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 30% of the samples for testing, stratified on the one-hot labels,
# with a fixed seed so the split is reproducible.
# NOTE: X_train / y_train are rebound to the training portion here, so
# re-running this cell splits the already-reduced training set again.
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, 
                                                    test_size=.3,
                                                   random_state=1,
                                                   stratify=y_train)

In [None]:
print(X_train.shape)
print(y_train.shape)

### Defining the Model

In [None]:
from keras import layers
from keras.models import Sequential
from keras.layers import Embedding, Flatten, Dense

In [None]:
# Length every input sequence was padded to when X_train was built.
maxlen = 8

# Intent classifier: embedding -> bidirectional LSTM -> softmax over tags.
model_1 = Sequential()
# Input dimension is len(vocab) + 1 because word indices start at 1.
model_1.add(Embedding(len(vocab)+1, 8, input_length=maxlen))
model_1.add(layers.Bidirectional(layers.LSTM(maxlen)))
# One output unit per one-hot column of y_train, so the layer keeps matching
# the targets if intents are added to or removed from the dataset (the width
# used to be hard-coded to 12).
model_1.add(Dense(y_train.shape[1], activation='softmax'))

In [None]:
model_1.compile(optimizer='rmsprop', loss='categorical_crossentropy', metrics=['acc'])
model_1.summary()

In [None]:
history = model_1.fit(X_train, y_train, epochs=500, batch_size=8)

In [None]:
import pickle
save_path ='/Users/adara/CAPSTONEPROJECT/eyeNetBot'
# Persist the vocabulary next to the model; the context manager makes sure
# the file is flushed and closed even if pickling fails.
with open(os.path.join(save_path, "vocab.pkl"), "wb") as vocab_file:
    pickle.dump(vocab, vocab_file)

In [None]:
# Load the model if it exists; load the vocab too
from keras.models import load_model
save_path = '/Users/adara/CAPSTONEPROJECT/eyeNetBot'
model_1 = load_model(os.path.join(save_path, "chatModel.h5"))
#vocab = pickle.load(open(os.path.join(save_path, "vocab.pkl"), "rb"))

#### Saving and loading a model object

In [None]:
# Save each trained fastText model under its own file name. No variable
# called ``model`` exists in this notebook, and the next cell loads these two
# files back as skip_model / cbow_model.
skip_model.save_model("skip_model.bin")
cbow_model.save_model("cbow_model.bin")

In [None]:
skip_model = fasttext.load_model("skip_model.bin")
cbow_model = fasttext.load_model("cbow_model.bin")

### Building the ChatBot

In [None]:
def vectorize(data):
    """Encode one user sentence as a padded index sequence for ``model_1``.

    Preprocessing mirrors the training corpus: tokenize, drop stop words
    (checked on the raw token), drop "?", then lower-case and stem. Stems
    missing from the vocabulary are ignored.

    Args:
        data: The sentence typed by the user.

    Returns:
        The result of ``pad_sequences`` for a single sequence with maxlen=8.
    """
    tokens = [w for w in nltk.word_tokenize(data) if w not in stop_words]
    stems = [stemmer.stem(w.lower()) for w in tokens if w != "?"]
    # Filter instead of popping while enumerating: popping shifts the list,
    # so the element after each removed one was never checked and unknown
    # words could reach the lookup below and raise KeyError.
    indices = [word_idx[stem] for stem in stems if stem in word_idx]

    return pad_sequences([indices], maxlen = 8)

In [None]:
y

In [None]:
# Reverse lookup: integer id -> tag name, used to decode model predictions.
# It enumerates the same set of tags as the tag -> id mapping, so the ids
# line up as long as both are built in the same session.
labels = dict(enumerate(set(docs_y)))

In [None]:
labels[0]

In [None]:
n = vectorize('what causes the disease')

In [None]:
results = model_1.predict(n, 12)[0]
results_index = np.argmax(results)
print(labels[results_index])

In [None]:
import sqlite3
 
from sqlite3 import Error
 
def sql_connection(db_path='retQA.db'):
    """Open a connection to the SQLite database used to store questions.

    Args:
        db_path: Path of the database file. Defaults to ``'retQA.db'``.

    Returns:
        The open ``sqlite3`` connection, or ``None`` when the database could
        not be opened (the error is printed).
    """
    try:
        return sqlite3.connect(db_path)
    except Error as exc:
        # Print the actual error instance; printing ``Error`` itself only
        # shows the exception class and hides what went wrong.
        print(exc)
        return None

In [None]:
def sql_table(con):
    """Create the QUESTIONS table (one text column, ``Question``) if missing.

    Args:
        con: An open ``sqlite3`` connection.
    """
    cursorObj = con.cursor()

    # IF NOT EXISTS lets this cell be re-run without failing because the
    # table was already created.
    cursorObj.execute("CREATE TABLE IF NOT EXISTS QUESTIONS(Question text)")

    con.commit()

con = sql_connection()
 
sql_table(con)

In [None]:
!pip install gtts

In [None]:
import os
import tempfile
import time
import playsound
import speech_recognition as sr
from gtts import gTTS
from pathlib import Path
import sqlite3

In [None]:
def _speak(text, num):
    """Synthesize *text* with gTTS, save it as ``chat<num>.mp3`` and play it."""
    filename = 'chat{}.mp3'.format(num)
    gTTS(text = text, lang = "en", slow=False).save(filename)
    playsound.playsound(filename)


def chat():
    """Run an interactive console loop that answers with synthesized speech.

    Each line typed by the user is vectorized and classified by ``model_1``.
    When the top class probability exceeds 0.5, a random response of the
    matching intent is spoken; otherwise a fallback message is spoken.
    Typing ``quit`` (any case) says goodbye and ends the loop.
    """
    print("start talking with the bot (type quit to stop)!")
    num = 100  # counter that gives every utterance its own mp3 file name
    while True:
        inp = input("You: ")
        num += 1

        if inp.lower() == "quit":
            _speak('bye', num)
            break

        results = model_1.predict(vectorize(inp), 12)[0]
        results_index = np.argmax(results)
        output = labels[results_index]

        # Start empty so a missing or empty intent falls back to the default
        # reply instead of failing on an unbound name or an empty choice.
        replies = []
        if results[results_index] > 0.5:
            for tg in data["intents"]:
                if tg['tag'] == output:
                    replies = tg['responses']

        if replies:
            _speak(random.choice(replies), num)
        else:
            _speak("I didn't understand that, try again.", num)

In [None]:
# Store a sample question in the QUESTIONS table.
inp = 'omolewa'
con = sqlite3.connect('retQA.db')
cursor = con.cursor()
cursor.execute("""INSERT INTO QUESTIONS (question) VALUES (?)""", (inp,))
# Without a commit the inserted row is discarded when the connection closes.
con.commit()

In [None]:
cursor.execute("""SELECT * FROM QUESTIONS""")
cursor.fetchall()

In [None]:
chat()

In [None]:
import anvil.server
anvil.server.connect("here comes your anvil link code")

In [None]:
# NOTE: a bare ``@anvil.server.callable`` with no function underneath is a
# SyntaxError, so the stray decorator that stood here has been commented out.
# The decorated handler (``botChat``) is defined in the following cell.
# @anvil.server.callable


In [None]:
import anvil.server

# NOTE(review): this uplink key is a credential committed in the notebook;
# consider rotating it and loading it from the environment instead.
anvil.server.connect("35KHU4CLLVXFT6LWX4G34K5F-5RJCZTHEBWY35MGS")
@anvil.server.callable
def botChat(inp):
    """Answer one chat message; registered as an Anvil server-callable.

    Args:
        inp: The text sent by the user.

    Returns:
        ``'bye'`` for "quit" (any case); a random response of the predicted
        intent when the top class probability exceeds 0.5 and the intent has
        responses; otherwise the fallback message.
    """
    print("Making botchat call")
    if inp.lower() == "quit":
        return 'bye'

    results = model_1.predict(vectorize(inp), 12)[0]
    results_index = np.argmax(results)
    output = labels[results_index]

    # Start empty so a missing intent yields the fallback reply rather than
    # an unbound-name error.
    responses = []
    if results[results_index] > 0.5:
        for tg in data["intents"]:
            if tg['tag'] == output:
                print("Found match")
                print(tg['tag'])
                responses = tg['responses']

    # The previous loop called ``speak_text(reponse)``; neither name is
    # defined in this notebook, so every confident prediction raised
    # NameError before a reply could be returned. The text is returned to the
    # caller instead.
    if responses:
        return random.choice(responses)

    # TODO: create a database and store the question
    return "I didn't understand that, try again."

In [None]:
# Scratch experiment: run every training pattern through preprocess_text.
# x collects the cleaned patterns, y the corresponding intent tags.
# The globals ``words``, ``labels`` and ``responses`` are deliberately not
# reset here: ``labels`` is the id -> tag lookup that chat() and botChat()
# read, so replacing it with an empty list would break them.
x = []
y = []
for intent in data["intents"]:
    for pattern in intent["patterns"]:
        x.append(preprocess_text(pattern))
        y.append(intent["tag"])

In [None]:
x

In [None]:
# Flatten the cleaned patterns into one list of tokens.
ff = [token for sentence in x for token in nltk.word_tokenize(sentence)]
    

In [None]:
ff

In [None]:
# Flatten the cleaned responses of every intent into one list of tokens.
gg = [
    token
    for intent in data["intents"]
    for response in intent["responses"]
    for token in nltk.word_tokenize(preprocess_text(response))
]

In [None]:
x

In [None]:
nltk.word_tokenize(x[0])

In [None]:
sent = ['what', 'is', 'afsdguyg', 'diabetic', 'retinopathy', 'bukola']

In [None]:
# Experiment: remove out-of-vocabulary words in place. ``sent`` is mutated
# while it is being enumerated, so after each pop the following element moves
# into the freed slot and is never examined.
[sent.pop(i) for i, w in enumerate(sent) if w not in vocab]

In [None]:
# Keep only the words of ``sent`` that are present in the vocabulary.
ww = [w for w in sent if w in vocab]

In [None]:
ww

In [None]:
for w in sent:
    if w not in vocab:
        print(enumerate(sent))

In [None]:
vocab