In [53]:
import numpy as np
import pandas as pd
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

In [54]:
training_df = pd.read_csv('WikiQA-train.tsv', sep='\t')

In [55]:
def preprocess_text(text):

    text = text.lower()
    text = ''.join([char for char in text if char not in string.punctuation])

    words = word_tokenize(text)

    stop_words = set(stopwords.words('english'))
    
    filtered_words = [word for word in words if word not in stop_words]
    
    return ' '.join(filtered_words)



In [56]:
# Preprocess the dataset, removing labels 0
filtered_training = training_df.copy()[training_df['Label'] == 1]

In [57]:
class SimpleAssistant:
    def __init__(self, data, k=3):
        self.data = data
        self.k = k #amount of keywords to match

    def find_answer(self, question):
        question_keywords = preprocess_text(question).split()

        def contains_keywords(answer):
            matches = sum(keyword in answer['Question'] for keyword in question_keywords)
            return matches >= self.k

        matches = self.data[self.data.apply(contains_keywords, axis=1)]
        
        for _, row in matches.iterrows():
            if row['Label'] == 1:
                return row['Sentence']
            
        return 'I am not able to answer this question.'



In [58]:
assistant = SimpleAssistant(filtered_training, k=3)

In [59]:
question = 'When was'# Thomas Jefferson president?'

print(assistant.find_answer(question))

I am not able to answer this question.


In [60]:
question = 'Which languages are spoken in South Africa?'

print(assistant.find_answer(question))

South Africa has eleven official languages : Afrikaans , English , Ndebele , Northern Sotho , Sotho , Swazi , Tswana , Tsonga , Venda , Xhosa and Zulu .


In [61]:
question = 'How many muscles are there in a human body?'

print(assistant.find_answer(question))

There are approximately 642 skeletal muscles within the typical human, and almost every muscle constitutes one part of a pair of identical bilateral muscles, found on both sides, resulting in approximately 320 pairs of muscles, as presented in this article.


In [62]:
question = 'How many muscles do humans have?'

print(assistant.find_answer(question))

I am not able to answer this question.


In [63]:
assistant_2keywords = SimpleAssistant(filtered_training, k=2)


In [64]:
question = 'How many muscles are there in a human body?'

print(assistant_2keywords.find_answer(question))

Lysergic acid diethylamide, abbreviated LSD or LSD-25, also known as lysergide ( INN ) and colloquially as acid, is a semisynthetic psychedelic drug of the ergoline family, well known for its psychological effects which can include altered thinking processes, closed and open eye visuals, synesthesia , an altered sense of time and spiritual experiences , as well as for its key role in 1960s counterculture .


In [65]:
question = 'How many muscles do humans have?'

print(assistant_2keywords.find_answer(question))

There are approximately 642 skeletal muscles within the typical human, and almost every muscle constitutes one part of a pair of identical bilateral muscles, found on both sides, resulting in approximately 320 pairs of muscles, as presented in this article.


In [66]:
import tkinter as tk
from tkinter import scrolledtext

class DialogueGUI:
    def __init__(self, master):
        self.master = master
        master.title('Simple assistant')

        self.question_label = tk.Label(master, text='Ask a question:')
        self.question_label.pack()

        self.question_input = tk.Entry(master)
        self.question_input.pack()

        self.ask_button = tk.Button(master, text='Ask', command=self.get_answer)
        self.ask_button.pack()

        self.response_area = scrolledtext.ScrolledText(master, wrap=tk.WORD, width=40, height=10)
        self.response_area.pack()
        self.response_area.insert(tk.INSERT, 'Assistant: Ask a question.\n')
        self.response_area.configure(state='disabled')

    def get_answer(self):
        question = self.question_input.get()
        if question.strip() != '':
            self.response_area.configure(state='normal')
            self.response_area.insert(tk.END, 'You: ' + question + '\n')
            answer = assistant.find_answer(question)
            self.response_area.insert(tk.END, 'Assistant: ' + answer + '\n')
            self.response_area.configure(state='disabled')
            self.response_area.yview(tk.END)
            self.question_input.delete(0, tk.END)



In [67]:
assistant = SimpleAssistant(filtered_training)

root = tk.Tk()
gui = DialogueGUI(root)
root.mainloop()