In [1]:
questions = {
    "where_to_have_lunch": " ".join([
        "where is a good place to have lunch?",
        "where can i eat?",
        "what's good for lunch?",
        "I'm hungry",
        "anything nice for lunch?",
        "what's a good place for lunch"
    ]),
    "where_to_have_tea": " ".join([
        "it's 4pm, time for tea!",
        "any good coffee places",
        "where to have tea",
        "where to have coffee",
        "what's for tea",
        "I would like to go for a coffee break"
    ]),
    "time_for_bed": " ".join([
        "lights out!",
        "time to go to sleep",
        "sleep time",
        "time for bed",
        "its late, time to go to sleep"
    ])
}

output_answers = {
    "where_to_have_lunch": "here's a chinese restaurant that you always go to!",
    "where_to_have_tea": "starbucks near roppongi",
    "time_for_bed": "lights out!"
}


In [2]:

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import precision_recall_fscore_support



In [17]:
!pip install nltk spacy spacy_hunspell hunspell



In [6]:
!python -m spacy download en_core_web_sm

Collecting en_core_web_sm==2.1.0 from https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.1.0/en_core_web_sm-2.1.0.tar.gz#egg=en_core_web_sm==2.1.0
[?25l  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-2.1.0/en_core_web_sm-2.1.0.tar.gz (11.1MB)
[K    100% |████████████████████████████████| 11.1MB 2.5MB/s ta 0:00:01  5% |█▉                              | 645kB 557kB/s eta 0:00:19    16% |█████▎                          | 1.8MB 2.1MB/s eta 0:00:05    25% |████████                        | 2.8MB 719kB/s eta 0:00:12    51% |████████████████▍               | 5.7MB 1.7MB/s eta 0:00:04    55% |█████████████████▊              | 6.1MB 1.3MB/s eta 0:00:04    58% |██████████████████▋             | 6.4MB 3.5MB/s eta 0:00:02    66% |█████████████████████▍          | 7.4MB 1.5MB/s eta 0:00:03    67% |█████████████████████▋          | 7.5MB 1.1MB/s eta 0:00:04    75% |████████████████████████▏       | 8.4MB 1.8MB/s eta 0:00:02    84% 

# training

1. separate values form labels
2. turn values into vectors

In [26]:
import nltk
nltk.download("stopwords")
# exclude stop words
from nltk.corpus import stopwords
# word similarity
import spacy
import en_core_web_sm
# typo correction - https://github.com/tokestermw/spacy_hunspell
from spacy_hunspell import spaCyHunSpell

nlp = en_core_web_sm.load()
hunspell_dicts = ("/home/jovyan/hunspell_dictionaries/en_US.dic", "/home/jovyan/hunspell_dictionaries/en_US.aff")
hunspell = spaCyHunSpell(nlp, hunspell_dicts) # stupid me, this is a linux docker, don'T use mac
nlp.add_pipe(hunspell)

[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [44]:
# sanity test for spacy and hunspell
doc = nlp('I can haz cheezeburger.')

print("original:   ", doc)
spellcheck = " ".join([str(word._.hunspell_spell) for word in doc])
print("spellcheck: ", spellcheck)

haz = doc[2]
chk = haz._.hunspell_spell  # False
print("\nis the spelling 'haz' correct?", chk)
print("suggested spellings: ", ", ".join(haz._.hunspell_suggest))

chz = doc[3]
chk = chz._.hunspell_spell
print("\nis the spelling 'cheezburger' correct?", chk)
print("suggested spellings: ", ", ".join(chz._.hunspell_suggest))

original:    I can haz cheezeburger.
spellcheck:  True True False False True

is the spelling 'haz' correct? False
suggested spellings:  ha, haze, hazy, has, hat, had, hag, ham, hap, hay, haw, ha z

is the spelling 'cheezburger' correct? False
suggested spellings:  cheeseburger, vegeburger


In [136]:
# preprocess and vectorize word
training_documents = list(questions.values())
labels = list(questions.keys())

# remove stopwords (unimportant words)
en_stop_words = stopwords.words("english")

# vectorize
vectorizer = CountVectorizer(stop_words=en_stop_words) # play with vectorizer params
print(vectorizer)
X = vectorizer.fit_transform(training_documents)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=None, min_df=1,
        ngram_range=(1, 1), preprocessor=None,
        stop_words=['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs',... 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"],
        strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
        tokenizer=None, vocabulary=None)


[nltk_data] Downloading package stopwords to /home/jovyan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [119]:
training_documents

["where is a good place to have lunch? where can i eat? what's good for lunch? I'm hungry anything nice for lunch? what's a good place for lunch",
 "it's 4pm, time for tea! any good coffee places where to have tea where to have coffee what's for tea I would like to go for a coffee break",
 'lights out! time to go to sleep sleep time time for bed its late, time to go to sleep']

In [120]:
# these are our feature vectors and their labels
feature_names = vectorizer.get_feature_names()
print(len(feature_names))
print(X)
print(feature_names)


20
  (0, 13)	1
  (0, 1)	1
  (0, 8)	1
  (0, 5)	1
  (0, 12)	4
  (0, 14)	2
  (0, 7)	3
  (1, 3)	1
  (1, 6)	1
  (1, 11)	1
  (1, 19)	1
  (1, 15)	1
  (1, 4)	3
  (1, 17)	3
  (1, 18)	1
  (1, 0)	1
  (1, 7)	1
  (2, 9)	1
  (2, 2)	1
  (2, 16)	3
  (2, 10)	1
  (2, 6)	2
  (2, 18)	4
['4pm', 'anything', 'bed', 'break', 'coffee', 'eat', 'go', 'good', 'hungry', 'late', 'lights', 'like', 'lunch', 'nice', 'place', 'places', 'sleep', 'tea', 'time', 'would']


In [121]:
# train the model

classifier = MultinomialNB() # play with models
classifier.fit(X, labels)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [122]:
# predict

def predict(raw_queries):
    queries = vectorizer.transform(raw_queries)
    prob = classifier.predict_proba(queries)
    print(prob)
    return classifier.predict(queries)

raw_queries = [" it's quite noon now please", "what for eats", "hello", "night!"]

print(predict(raw_queries))

[[ 0.33333333  0.33333333  0.33333333]
 [ 0.33333333  0.33333333  0.33333333]
 [ 0.33333333  0.33333333  0.33333333]
 [ 0.33333333  0.33333333  0.33333333]]
['time_for_bed' 'time_for_bed' 'time_for_bed' 'time_for_bed']


In [123]:
predictions

array(['where_to_have_lunch', 'where_to_have_lunch', 'time_for_bed'],
      dtype='<U19')

In [124]:
# accuracy
# compare input with expected classes

tests = [" it's quite noon now please", "what for eats", "hello", "night!"]
expected = ["where_to_have_lunch", "where_to_have_lunch", "none", "time_for_bed"]
predicted = predict(tests)

# has interesting params to play with
evaluation = precision_recall_fscore_support(expected, predicted)
metrics = {}
(metrics["p"], metrics["r"], metrics["f1"], _) = evaluation

[[ 0.33333333  0.33333333  0.33333333]
 [ 0.33333333  0.33333333  0.33333333]
 [ 0.33333333  0.33333333  0.33333333]
 [ 0.33333333  0.33333333  0.33333333]]


  'precision', 'predicted', average, warn_for)


In [125]:
metrics

{'p': array([ 0.  ,  0.25,  0.  ]),
 'r': array([ 0.,  1.,  0.]),
 'f1': array([ 0. ,  0.4,  0. ])}

# challenges
1. return the answer
2. exclude stop words (unimportant words)
3. handle synonyms (e.g. lobby == front desk)
4. handle typos
5. return unknown
    - if the questions is outside the domain of the chatbot, defer
6. handle paramenter (e.g. set my check out time to 3pm)
    - extract 3pm as parameter

In [126]:
# return the answer
def answer(raw_queries):
    predictions = predict(raw_queries)
    answers = [output_answers[p] for p in predictions]
    
    print('\npredictions')
    for i, p in enumerate(predictions):
        print(p + " -> " + answers[i])

    return answers
queries = ["what's for lunch", "good day today", "evening, time for some snores"]

results = answer(queries)


[[ 0.14728275  0.71409819  0.13861906]
 [ 0.14790403  0.57368837  0.27840759]
 [ 0.63677639  0.12349603  0.23972758]]

predictions
where_to_have_lunch -> here's a chinese restaurant that you always go to!
where_to_have_lunch -> here's a chinese restaurant that you always go to!
time_for_bed -> lights out!


In [115]:
# exclude stop words above

In [None]:
# handle synonyms and typos using vector models?


# Tokenization:
# Using the spaCy library, individual words and punctuation marks were identified
# Each word was sent to the Hunspell library and if it is misspelled, the tool. provides a list of suggested replacements. If there were no replacements, the query would be forwarded to the FAQ module without any corrections.

# Similarity matching:
# The suggested replacements were converted to word vectors using spaCy.
# These vectors were then compared to the vector representation of the original misspelled word.
# The comparison generates an initial similarity score that indicates how similar the suggestion is to the original word.



In [131]:
# return unknown

#predictor returns top 10 intents
#if the score is too close together, the chatbot might not know what the actual intent is

# other challenges

1. other languages
2. conjugation and punctuation (word stemming)
3. domination of frequent words or intents
4. conversation state
5. conversation design
6. interface for data entry (customizing the chatbot)
7. dealing with what is happening when it goes wrong
8. recognizing different entities in the sentence and deciding whether the noun or article is important for this intent