In [1]:
import io
import random
import string # to process standard python strings
import warnings
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.stem import WordNetLemmatizer
import json
import pickle
import re
from colorama import Fore, Back, Style 

In [2]:
rawData = {}
raw = ""
with open("onlyTopicsData.json") as json_file:
    data = json.load(json_file)
    rawData = data
    for bigTopic, topics in data.items():
        for topic, text in topics.items():
            if text.strip() != "":
                raw += " \n " + " ".join(text.strip().split("\n"))

In [3]:
#Tokenization
sent_tokens = []
articles = []
articleToText = {}
with open("onlyTopicsData.json") as json_file:
    data = json.load(json_file)
    for category, topics in data.items():
        for topic, text in topics.items():
            if text.strip() != "" and text.strip() != " ":
                text = " ".join([w for w in text.split(" ") if w.strip() != "" and "[" not in w])
                doc = " ".join([p for p in text.strip().split("\n") if p.strip() != "" and len(p) > 5])
                sentences = nltk.sent_tokenize(doc)
                article = [topic for _ in range(len(sentences))]
                sent_tokens.extend(sentences)
                articles.extend(article)
                articleToText[topic] = text

In [4]:
lemmer = WordNetLemmatizer()

# take as input the tokens and return normalized tokens
def LemTokens(tokens):
    return [lemmer.lemmatize(token) for token in tokens]

remove_punct_dict = dict((ord(punct), None) for punct in string.punctuation)

# tokens normalized
def LemNormalize(text):
    return LemTokens(nltk.word_tokenize(text.lower().translate(remove_punct_dict)))

greeting_in = ("hello", "hi", "greetings", "sup", "yo", "hey", " what's up")
# array form because random.choice()
greeting_out = ["hi", "hey there", " hello", "I'm glad we are conversing."]

#if user types in greeting, send a greeting out
def introduction(sentence):
    for word in sentence.split():
        if word.lower() in greeting_in:
            return random.choice(greeting_out);

In [5]:
TfidfVect = TfidfVectorizer(tokenizer = LemNormalize, stop_words = 'english')
# TfidfVect = TfidfVectorizer(stop_words = 'english', ngram_range=(1,2))
tfidf = TfidfVect.fit_transform(sent_tokens)

  'stop_words.' % sorted(inconsistent))


In [21]:
#response
def get_user_input():
    print(Style.RESET_ALL + ">", end = " ")
    userinput = input().lower()
    print(Fore.RED)
    return userinput

def response (user_text):
    robo_text = ''
    sent_tokens.append(user_text)
    values = cosine_similarity(TfidfVect.transform([user_text]), tfidf)
    indexes = values.argsort()[0]
    flat = values.flatten()
    flat.sort()
    if (flat[-1] == 0):
        robo_text = robo_text + "I\'m sorry, I do not understand you. The query you have inputted is incomprehensible. \n Please try again. "
        return robo_text
    else:
        print(Fore.RED + "I found these sentences most similar to your input. Please state the article most relevant: (ex. \"article 1\")")
        for i in range(1,6): #print 5 most related sentences
            index = indexes[-i]
            print("Article {}: {}".format(i, articles[index]))
            print("Similar Sentence: " + sent_tokens[index] + "\n")
        user_resp = get_user_input()
        next_response(user_text, user_resp, indexes)
    return robo_text

def next_response(user_text, user_resp, indexes):
    i = int(re.findall(r"\d+", user_resp)[0])
    if i > 5:#TODO
        print("invalid input")
    index = indexes[-i]
    sentence = sent_tokens[index]
    articleText = articleToText[articles[index]]
    paragraphs = [p for p in articleText.split("\n") if p != ""]
    
#     for para in paragraphs:
#         if sentence in para:
#             print(para)
            
    articleSents = nltk.sent_tokenize(" ".join(paragraphs))
    articleVect = TfidfVectorizer(stop_words = "english")
    articleTfidf = articleVect.fit_transform(articleSents)
    values = cosine_similarity(articleVect.transform([sentence, user_text]), articleTfidf)
    indexes = values.argsort()[0]
    bestIndex = values.argsort()[0][-1]
#     print(values[0][0])
    while values[0][bestIndex] > 0:
        print(articleSents[bestIndex])
#         print(values[0][bestIndex])
        bestIndex += 1

In [22]:
user_exit = False
while (user_exit == False):
        user_text = get_user_input()
        # user want to leave
        if (user_text == 'bye'):
            user_exit = True
            print("KnowBot: Bye! Take care and come back soon. ")
        # replying to gratitude
        elif(user_text == 'thanks' or user_text == 'thank you'):
            print("KnowBot: You\'re welcome! Ask me another query!")
        # user needs more instructions
        elif (user_text == 'help'):
            print("KnowBot: I\'m sorry the instructions were unclear. \n I am a robot designed to answer queries you have about the following subjects: Matematics, Science, Music, Politics, History (USA), Computer Science. \n You can type in keyword(s) (i.e. multiplication, linear algebra, boolean, 1844) to learn more about that subject. \n If you would like to leave, please type \"bye\".")
        # user has typed in a greeting
        elif (introduction(user_text) != None):
            print("KnowBot: " + introduction(user_text))
        # user has typed in a keyword, generate a response
        else:
            print("KnowBot: " , end= "")
            print(response(user_text))

[0m> what is linear algebra
[31m
KnowBot: [31mI found these sentences most similar to your input. Please state the article most relevant: (ex. "article 1")
Article 1: Linear_algebra
Similar Sentence: Their theory is thus an essential part of linear algebra.

Article 2: Linear_algebra
Similar Sentence: For nonlinear systems, which cannot be modeled with linear algebra, linear algebra is often used as a first-order approximation.

Article 3: Linear_algebra
Similar Sentence: Linear algebra is the branch of mathematics concerning linear equations such as  linear functions such as and their representations through matrices and vector algebra is central to almost all areas of mathematics.

Article 4: Linear_algebra
Similar Sentence: Until the 19th century, linear algebra was introduced through systems of linear equations and matrices.

Article 5: Symbolic_computation
Similar Sentence: The same was also true for the classical algorithms from linear algebra.

[0m> article 3
[31m
Linear al

[0m> article 3
[31m
George Washington, who had led the Continental Army to victory, was the first president elected under the new constitution.

[0m> what is an apple
[31m
KnowBot: [31mI found these sentences most similar to your input. Please state the article most relevant: (ex. "article 1")
Article 1: Aristotle
Similar Sentence: When we look at an apple, for example, we see an apple, and we can also analyse a form of an apple.

Article 2: Hungarian_language
Similar Sentence: 'Which apple would you like?

Article 3: Addition
Similar Sentence: For example, in the adjacent picture, there is a combination of three apples and two apples together, making a total of five apples.

Article 4: Hungarian_language
Similar Sentence: ('It is John who sees the apple'.

Article 5: Aristotle
Similar Sentence: In this distinction, there is a particular apple and a universal form of an apple.

[0m> article 3
[31m
For example, in the adjacent picture, there is a combination of three apples and t

[0m> quit
[31m
KnowBot: [31mI found these sentences most similar to your input. Please state the article most relevant: (ex. "article 1")
Article 1: Automobile
Similar Sentence: Maybach quit DMG shortly thereafter and opened a business of his own.

Article 2: Hypnotism
Similar Sentence: The technique is often used to increase motivation for a diet, to quit smoking, or to reduce stress.

Article 3: Pop_country
Similar Sentence: (Running from 1980 to 1982 – Mandrell had to quit the show because of health reasons.)

Article 4: Polyrhythm
Similar Sentence: The Aaliyah song "Quit Hatin" uses 98 against 44 in the chorus.

Article 5: Vannevar_Bush
Similar Sentence: Bush preferred to quit rather than study a subject that did not interest subsequently enrolled in the Massachusetts Institute of Technology (MIT) electrical engineering program.

[0m> quit
[31m


IndexError: list index out of range