In [910]:
#import libraries
import nltk
import random
from nltk.chat.util import Chat, reflections
import math
import numpy as np
import pandas as pd
import json
from matplotlib import pyplot as plt
import seaborn as sns
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.tree import DecisionTreeClassifier
from nltk import WordNetLemmatizer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk import FreqDist
from chatbot import Chatbot

pd.set_option("display.max_rows", 200)
pd.set_option("display.max_columns", None)

In [911]:
def import_data(): #imports dataset as ["category", "user query", "chatbot response"]
    with open("data.json") as intent: #load jsons
        data = json.load(intent)
    df = pd.DataFrame(data['intents'])
    df2 = df[['intent', "text", "responses"]] #choose specific columns
    df2 = df2.rename(columns={"text":"in", "responses":"out"}) #making it more readable

    return df2

In [912]:
def get_tree():
    t = DecisionTreeClassifier(
        criterion="gini", 
        splitter="best",
        max_depth = 4,
        min_samples_leaf = 100,
        min_samples_split = 1000,
        random_state=1)
    
    df = import_data()
    return df

In [913]:
df = import_data()
df["intent"] = df["intent"].apply(lambda x: str.lower(x))
df["in"] = df["in"].apply(lambda x: [str.lower(n) for n in x])
# df["out"] = df["out"].apply(lambda x: [str.lower(n) for n in x])
df.head(5)

Unnamed: 0,intent,in,out
0,greeting,"[hi, hi there, hola, hello, hello there, hya, ...","[Hi human, , Hello human, , Hola human, ]"
1,greetingresponse,"[my name is chatbot, this is chatbot, i am cha...","[Great! Hi! How can I help?, Good! Hi, how can..."
2,courtesygreeting,"[how are you?, hi how are you?, hello how are ...","[Hello, I am great, how are you? , Hello, how ..."
3,courtesygreetingresponse,"[good thanks! my name is chatbot, good thanks!...","[Great! Hi! How can I help?, Good! Hi, how can..."
4,currenthumanquery,"[what is my name?, what do you call me?, who d...","[You are! How can I help?, Your name is , how ..."


In [914]:
cb = Chatbot()

In [915]:
#proess "in" and "out" into array of lemmas
def to_lemmas(arr):
    rtn = []
    for i in arr:
        words = cb.lemmatize(cb.tokenize(i))
        words = cb.filter_stopwords(words)
        rtn += words
    return [*set(rtn)]

df["in"] = df["in"].apply(to_lemmas)
df.head(5)


Unnamed: 0,intent,in,out
0,greeting,"[hi, hola, hya, hello]","[Hi human, , Hello human, , Hola human, ]"
1,greetingresponse,"[chatbot, name]","[Great! Hi! How can I help?, Good! Hi, how can..."
2,courtesygreeting,"[hello, well, hi, hola, ?, hope]","[Hello, I am great, how are you? , Hello, how ..."
3,courtesygreetingresponse,"[chatbot, name, thanks, good, !, great]","[Great! Hi! How can I help?, Good! Hi, how can..."
4,currenthumanquery,"[name, think, talk, tell, call, ?]","[You are! How can I help?, Your name is , how ..."


In [916]:
arr = df["in"].to_numpy()
arr2 = []
for i in arr:
    arr2 += i
all_keywords = [*set(arr2)] #holds all lemmas for input text
for word in all_keywords:
    df[word] = [0 for x in range(0, df.shape[0])]
df.head(5)

Unnamed: 0,intent,in,out,chatbot,fuck,identify,laugh,pod,comprendo,enough,genious,cheer,tell,call,shut,know,hello,good,communicate,aware,hi,speak,n't,door,great,see,bay,shhh,prove,?,need,understand,get,conscious,help,make,adios,intelligent,talk,friend,anyone,joke,hola,open,meant,bore,think,stop,",",!,self,mean,thanks,surely,chatbotous,girl,give,gossip,shit,helpful,bye,thank,hear,hya,camera,could,ok,want,hope,real,goodbye,name,time,say,twat,quiet,'s,later,self-aware,well,clever,please
0,greeting,"[hi, hola, hya, hello]","[Hi human, , Hello human, , Hola human, ]",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,greetingresponse,"[chatbot, name]","[Great! Hi! How can I help?, Good! Hi, how can...",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,courtesygreeting,"[hello, well, hi, hola, ?, hope]","[Hello, I am great, how are you? , Hello, how ...",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,courtesygreetingresponse,"[chatbot, name, thanks, good, !, great]","[Great! Hi! How can I help?, Good! Hi, how can...",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,currenthumanquery,"[name, think, talk, tell, call, ?]","[You are! How can I help?, Your name is , how ...",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [917]:
for i, row in df.iterrows():
    for word in row["in"]:
        if word in all_keywords:
            df.loc[len(df.index)] = [row["intent"], word, row["out"], 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
df.head(5)

Unnamed: 0,intent,in,out,chatbot,fuck,identify,laugh,pod,comprendo,enough,genious,cheer,tell,call,shut,know,hello,good,communicate,aware,hi,speak,n't,door,great,see,bay,shhh,prove,?,need,understand,get,conscious,help,make,adios,intelligent,talk,friend,anyone,joke,hola,open,meant,bore,think,stop,",",!,self,mean,thanks,surely,chatbotous,girl,give,gossip,shit,helpful,bye,thank,hear,hya,camera,could,ok,want,hope,real,goodbye,name,time,say,twat,quiet,'s,later,self-aware,well,clever,please
0,greeting,"[hi, hola, hya, hello]","[Hi human, , Hello human, , Hola human, ]",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
1,greetingresponse,"[chatbot, name]","[Great! Hi! How can I help?, Good! Hi, how can...",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,courtesygreeting,"[hello, well, hi, hola, ?, hope]","[Hello, I am great, how are you? , Hello, how ...",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,courtesygreetingresponse,"[chatbot, name, thanks, good, !, great]","[Great! Hi! How can I help?, Good! Hi, how can...",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0
4,currenthumanquery,"[name, think, talk, tell, call, ?]","[You are! How can I help?, Your name is , how ...",0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [918]:
def get_intent(string): #string = user query
    arr = cb.lemmatize(cb.tokenize(string))
    highscore = 0
    intent = ""
    for word in arr:
        for i, row in df.iterrows():
            score = 0
            try:
                if df.loc[i, word] == 1:
                    score += 1
            except:
                pass
            if score > highscore:
                intent = row["intent"]
    print(intent)
    print(df.iloc[df["intent"] == intent, 2])

In [919]:
df.drop(["in", "out"], axis=1, inplace=True) #prep df for split
x = df.drop("intent", axis=1)
y = df["intent"]

In [920]:
xtrain, xtest, ytrain, ytest = train_test_split(x, y, test_size=0.3, random_state=1)

In [921]:
#make a model and fit to data
model = DecisionTreeClassifier()
model.fit(xtrain, ytrain)

In [922]:
model.score(xtest, ytest)

0.022727272727272728