In [6]:
import io
import random
import string # to process standard python strings
import warnings
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import nltk
from nltk.stem import WordNetLemmatizer
import json
import pickle
import re
from colorama import Fore, Back, Style 
import spacy

In [7]:
# Load the topic corpus and flatten every non-empty article into one big string.
# The nested loops below treat the JSON as {category: {topic: article text}}.
# JSON text is UTF-8 by specification, so decode explicitly rather than relying on
# the platform default encoding (which differs between operating systems).
with open("onlyTopicsData.json", encoding="utf-8") as json_file:
    rawData = json.load(json_file)
data = rawData  # alias kept because the original cell exposed both names

# Each non-empty article contributes " \n " followed by its text with line breaks
# replaced by spaces. The pieces are joined once instead of growing a string in a loop.
raw = "".join(
    " \n " + " ".join(text.strip().split("\n"))
    for topics in rawData.values()
    for text in topics.values()
    if text.strip() != ""
)

In [8]:
# Tokenization: build three structures from the corpus.
#   sent_tokens[i]  - the i-th sentence across all articles
#   articles[i]     - title of the article that sent_tokens[i] came from
#   articleToText   - article title -> cleaned article text
sent_tokens = []
articles = []
articleToText = {}

# Decode explicitly as UTF-8 (the JSON standard) instead of the platform default,
# and release the file handle before the slow tokenization work starts.
with open("onlyTopicsData.json", encoding="utf-8") as json_file:
    data = json.load(json_file)

for category, topics in data.items():
    for topic, text in topics.items():
        # Skip articles that are empty or whitespace-only.
        if not text.strip():
            continue
        # Drop empty tokens and any token containing "[" (presumably citation
        # markers such as "[12]" - confirm against the data).
        text = " ".join(w for w in text.split(" ") if w.strip() != "" and "[" not in w)
        # Join the non-empty lines so the sentence tokenizer sees one continuous document.
        doc = " ".join(p for p in text.strip().split("\n") if p.strip() != "")
        # Keep only sentences longer than 5 characters.
        sentences = [sent for sent in nltk.sent_tokenize(doc) if len(sent) > 5]
        sent_tokens.extend(sentences)
        # One article title per sentence keeps `articles` index-aligned with `sent_tokens`.
        articles.extend([topic] * len(sentences))
        articleToText[topic] = text

In [9]:
# Single WordNet lemmatizer instance shared by the helpers below.
lemmer = WordNetLemmatizer()


def LemTokens(tokens):
    """Return a list holding the lemmatized form of every token in ``tokens``."""
    normalized = []
    for token in tokens:
        normalized.append(lemmer.lemmatize(token))
    return normalized

# Translation table for str.translate: every character of string.punctuation maps
# to None, which deletes it.
remove_punct_dict = {ord(punct): None for punct in string.punctuation}


def LemNormalize(text):
    """Lowercase ``text``, remove punctuation, tokenize it and lemmatize each token."""
    cleaned = text.lower().translate(remove_punct_dict)
    tokens = nltk.word_tokenize(cleaned)
    return LemTokens(tokens)

# Greetings recognised in user input. Entries must be lowercase with no surrounding
# whitespace; multi-word entries are matched as whole phrases by introduction().
greeting_in = ("hello", "hi", "greetings", "sup", "yo", "hey", "what's up")
# array form because random.choice()
greeting_out = ["hi", "hey there", " hello", "I'm glad we are conversing."]

# Exact replies accepted as "no" / "yes" when the bot asks a yes-or-no question.
rejection_words = ["no", "nah", "nope", "not really", "not quite"]
approval_words = ["yes", "yeah", "yea", "yep", "ya", "ye", "kinda", "a little"]


def introduction(sentence):
    """Return a random greeting if ``sentence`` contains a known greeting, else None.

    Matching is case-insensitive, ignores punctuation at the edges of each word
    (so "hi!" counts) and only matches whole words or whole phrases (so "history"
    does not count as "hi").
    """
    # Normalise to lowercase words with leading/trailing punctuation removed.
    words = [word.strip(string.punctuation) for word in sentence.lower().split()]
    # Pad with spaces so every greeting can be matched on word boundaries,
    # including multi-word greetings such as "what's up".
    padded = " " + " ".join(word for word in words if word) + " "
    for greeting in greeting_in:
        if " " + greeting + " " in padded:
            return random.choice(greeting_out)
    return None

In [22]:
import sister

# Sentence embedder from the `sister` package, configured for English.
sentEmbedder = sister.MeanEmbedding(lang="en")


def getSentEmbeddings(data):
    """Embed every sentence in ``data`` and return the vectors as one numpy array."""
    vectors = []
    for sent in data:
        vectors.append(sentEmbedder(sent))
    return np.array(vectors)

Loading model...





In [None]:
# Embed every corpus sentence once, up front. response() compares each user query
# against this array, so it stays index-aligned with `sent_tokens`.
# This can take a while.
sentEmbeddings = getSentEmbeddings(sent_tokens)

In [37]:
#response
def get_user_input():
    """Prompt the user and return the reply, lowercased and whitespace-stripped.

    Resets the terminal style before showing the "> " prompt and switches the
    foreground colour to red afterwards, so the bot's following output is red.
    Surrounding whitespace is removed because callers compare the reply against
    exact words ("yes", "no", "bye", article numbers); a stray space would
    otherwise make a valid reply fail.
    """
    print(Style.RESET_ALL + ">", end=" ")
    userinput = input().strip().lower()
    print(Fore.RED)
    return userinput

#original response
def _print_ranked_matches(indexes, first_rank, last_rank):
    """Print the article title and sentence for ranks first_rank..last_rank (1 = best)."""
    # Clamp to the corpus size so a small corpus does not raise IndexError.
    last_rank = min(last_rank, len(indexes))
    for rank in range(first_rank, last_rank + 1):
        index = indexes[-rank]  # `indexes` is ascending, so count from the end
        print("Article {}: {}".format(rank, articles[index]))
        print("Similar Sentence:" + sent_tokens[index])


def _user_is_interested():
    """Ask whether the shown articles are interesting; re-ask until yes/no is given."""
    print()
    print("Do any of these match your interest?")
    answer = get_user_input()
    while answer not in rejection_words and answer not in approval_words:
        print("I'm sorry, I did not understand whether you found these interesting. Please say yes or no!")
        answer = get_user_input()
    return answer not in rejection_words


def response(user_text):
    """Run one interactive search dialogue for ``user_text``.

    Ranks every corpus sentence by cosine similarity to the query, shows the top
    five, and, if the user rejects them, ranks 6-10. On approval the user picks an
    article number, which is handed to ``next_response``. If both batches are
    rejected the user is asked to rephrase and the search restarts with the new
    query.

    Returns an apology message when the query has zero similarity to everything,
    otherwise an empty string (all other output is printed directly).
    """
    # Loop instead of recursing for "start over": the outcome of the restarted
    # search is what gets returned, and repeated rejections cannot exhaust the
    # recursion limit.
    while True:
        values = cosine_similarity(sentEmbedder(user_text).reshape(1, -1), sentEmbeddings)
        indexes = values.argsort()[0]  # sentence indices, least to most similar

        if values.max() == 0:
            return "I\'m sorry, I do not understand you. The query you have inputted is incomprehensible. \n Please try again. "

        print("I found these articles most similar to your input.")
        _print_ranked_matches(indexes, 1, 5)
        shown = 5

        if not _user_is_interested():
            print("Let me find a few more articles.")
            _print_ranked_matches(indexes, 6, 10)
            shown = 10

            if not _user_is_interested():
                print("I'm sorry I couldn't find any good results. Could you please rephrase your inquiry or try something else?")
                user_text = get_user_input()
                continue  # starting over with the rephrased query

        print("Awesome! Please state the article number you would like to explore more.")
        user_text = get_user_input()
        # Only ranks that were actually displayed are valid choices.
        next_response(user_text, indexes, min(shown, len(indexes)))
        return ''

def next_response(user_text, indexes, rank, threshold=0.8, max_sentences=10):
    """Print an excerpt of the article the user picked by number.

    Parameters
    ----------
    user_text : str
        The user's reply; must be a whole number between 1 and ``rank``.
    indexes : sequence of int
        Corpus sentence indices sorted from least to most similar, as produced
        by ``response``.
    rank : int
        Highest article number that was shown to the user.
    threshold : float, optional
        A sentence is included only while its cosine similarity to the matched
        sentence stays above this value.
    max_sentences : int, optional
        Upper bound on the number of sentences printed.

    Prints "invalid input" and returns if the reply is not a valid article number.
    Returns None in every case.
    """
    choice = user_text.strip()
    # isdecimal() accepts only characters int() can parse; isdigit() also accepts
    # e.g. superscript digits, which would crash the conversion below.
    if not choice.isdecimal():
        print("invalid input")
        return
    i = int(choice)
    # Reject 0 as well: indexes[-0] is indexes[0], the *least* similar sentence.
    if i < 1 or i > min(rank, len(indexes)):
        print("invalid input")
        return

    index = indexes[-i]
    sentence = sent_tokens[index]
    articleText = articleToText[articles[index]]
    paragraphs = [p for p in articleText.split("\n") if p != ""]

    articleSents = nltk.sent_tokenize(" ".join(paragraphs))
    articleVectors = getSentEmbeddings(articleSents)
    # Similarity of every article sentence to the matched sentence. Only the
    # matched sentence is embedded here; the numeric reply carries no meaning.
    similarities = cosine_similarity(getSentEmbeddings([sentence]), articleVectors)[0]
    bestIndex = similarities.argsort()[-1]

    # Starting at the best match, collect consecutive sentences while they stay
    # similar enough, up to the sentence limit.
    excerpt = []
    position = bestIndex
    while (position < len(similarities)
           and similarities[position] > threshold
           and len(excerpt) < max_sentences):
        excerpt.append(articleSents[position] + " ")
        position += 1
    print("".join(excerpt))

In [None]:
# Main chat loop: read a line, handle the fixed commands, otherwise run a search.
user_exit = False
while not user_exit:
    user_text = get_user_input().strip().lower()
    # user wants to leave
    if user_text == 'bye':
        user_exit = True
        print("KnowBot: Bye! Take care and come back soon. ")
    # replying to gratitude
    elif user_text in ('thanks', 'thank you'):
        print("KnowBot: You\'re welcome! Ask me another query!")
    # user needs more instructions
    elif user_text == 'help':
        print("KnowBot: I\'m sorry the instructions were unclear. \n I am a robot designed to answer queries you have about the following subjects: Mathematics, Science, Music, Politics, History (USA), Computer Science. \n You can type in keyword(s) (i.e. multiplication, linear algebra, boolean, 1844) to learn more about that subject. \n If you would like to leave, please type \"bye\".")
    else:
        # Call introduction() once: it picks a random reply, so a second call
        # would draw a different greeting from the one that was tested.
        greeting = introduction(user_text)
        if greeting is not None:
            # user has typed in a greeting
            print("KnowBot: " + greeting)
        else:
            # user has typed in a keyword, generate a response
            print("KnowBot: ", end="")
            print(response(user_text))

[0m> Who was Michael Jackson?
[31m
KnowBot: I found these articles most similar to your input.
Article 1: Marc_Bloch
Similar Sentence:There he met Eileen Power, R. H. Tawney and Michael Postan, among others.
Article 2: Early_Middle_Ages
Similar Sentence:It was refounded by Emperor Michael III in 849.
Article 3: Orchestration
Similar Sentence:Some of the most in-demand orchestrators today (and of the past 30 years) include Jeff Atmajian, Pete Anthony, Brad Dechter (James Newton Howard, Christopher Young, Theodore Shapiro, Teddy Castellucci, Danny Elfman, John Powell, Marco Beltrami, John Debney, Marc Shaiman, Michael Giacchino), Conrad Pope (John Williams, Alexandre Desplat, Jerry Goldsmith, James Newton Howard, Alan Silvestri, James Horner, Mark Isham, John Powell, Michael Convertino, Danny Elfman, Howard Shore), Eddie Karam (John Williams, James Horner), Bruce Fowler (Hans Zimmer, Klaus Badelt, Harry Gregson-Williams, Steve Jablonsky, Mark Mancina, John Powell), John Ashton Thomas (

[0m> no
[31m
I'm sorry I couldn't find any good results. Could you please rephrase your inquiry or try something else?
[0m> data structures and algorithms
[31m
I found these articles most similar to your input.
Article 1: Theoretical_computer_science
Similar Sentence:Usually, efficient data structures are key to designing efficient algorithms.
Article 2: Selection_algorithm
Similar Sentence:Two such data structures are tree-based structures and frequency tables.
Article 3: Algorithm
Similar Sentence:Some example classes are search algorithms, sorting algorithms, merge algorithms, numerical algorithms, graph algorithms, string algorithms, computational geometric algorithms, combinatorial algorithms, medical algorithms, machine learning, cryptography, data compression algorithms and parsing techniques.
Article 4: Computational_geometry
Similar Sentence:Algorithms for problems of this type typically involve dynamic data structures.
Article 5: Data_compression
Similar Sentence:Genetics

[0m> matrix multiplication
[31m
I'm sorry, I did not understand whether you found these interesting. Please say yes or no!
[0m> no
[31m
Let me find a few more articles.
Article 6: Chemistry
Similar Sentence:The periodic table is useful in identifying periodic compound is a pure chemical substance composed of more than one element.
Article 7: Chemistry
Similar Sentence:The periodic table is arranged in groups, or columns, and periods, or rows.
Article 8: Atom
Similar Sentence:Examples include the element carbon and the organic chemical elements are often displayed in a periodic table that is laid out to display recurring chemical properties, and elements with the same number of valence electrons form a group that is aligned in the same column of the table.
Article 9: Chemistry
Similar Sentence:For example, all atoms with 6 protons in their nuclei are atoms of the chemical element carbon, but atoms of carbon may have mass numbers of 12 or standard presentation of the chemical element