In [2]:
# import argparse
import random
import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

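Question answering class: answers each test question with the answer of the most similar training question (TF-IDF vectors compared by cosine similarity).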

In [13]:
class QuestionAnswering(object):
    """Retrieval-based question answering model.

    Each test question is answered with the answer that belongs to the
    training question whose TF-IDF vector has the highest cosine similarity
    to it.

    Attributes:
        args: Parsed command-line arguments, or None when the command line
            has not been parsed (the notebook workflow).
        train_questions: Lines read from 'Questions.txt'.
        test_questions: Lines read from ``testfile`` (empty if none given).
        answers: Lines read from 'Answers.txt'; ``answers[i]`` is used as the
            answer for ``train_questions[i]``.
    """

    def __init__(self, testfile=None):
        """Load the training questions, the answers and the test questions.

        Args:
            testfile: Path of the file holding one test question per line.
                When None, no test questions are loaded.
        """
        # Command-line parsing is disabled in the notebook; call
        # parse_arguments() explicitly and assign the result to enable it.
        self.args = None

        self.train_questions = self.read_dataset('Questions.txt')
        # open(None) raises TypeError, so only read when a file was supplied.
        self.test_questions = (
            self.read_dataset(testfile) if testfile is not None else []
        )
        self.answers = self.read_dataset('Answers.txt')

    def getTask(self):
        """Return the task name from the parsed command-line arguments.

        Returns:
            The ``task`` argument, or None when no arguments were parsed.
        """
        args = getattr(self, 'args', None)
        return None if args is None else args.task

    # Works on the command line
    def parse_arguments(self):
        """Parse the command-line arguments.

        Returns:
            The ``argparse.Namespace`` holding ``task`` and ``questions_file``.
        """
        # Imported locally because the module-level import is commented out.
        import argparse

        parser = argparse.ArgumentParser()
        parser.add_argument('task', help='task name: topic or answer')
        parser.add_argument('questions_file', help='Guestions file e.g question_file.txt')
        args = parser.parse_args()

        return args

    def cos_sim(self, a, b):
        """Return the cosine similarity of vectors ``a`` and ``b``.

        The similarity is ``dot(a, b) / (|a| * |b|)``. When either vector has
        zero length the similarity is undefined; 0.0 is returned instead of
        dividing by zero.
        """
        denominator = np.linalg.norm(a) * np.linalg.norm(b)
        if denominator == 0:
            return 0.0
        return np.dot(a, b) / denominator

    def read_dataset(self, filename):
        """Read a text file and return its lines with surrounding whitespace removed.

        Args:
            filename: Path of a UTF-8 encoded text file.

        Returns:
            A list with one ``str`` per line of the file.
        """
        # Text mode yields str (not bytes), and the context manager closes
        # the handle even if reading fails.
        with open(filename, 'r', encoding='utf-8') as f:
            return [line.strip() for line in f]

    def split_to_train_test(self, test_ratio=0.20):
        """Randomly split the question/answer pairs into train and test sets.

        The instance's own lists are not modified. Randomness comes from the
        ``random`` module, so seed it for a reproducible split.

        Args:
            test_ratio: Fraction (between 0 and 1) of pairs put in the test set.

        Returns:
            A tuple ``(train_question, train_answers, test_question,
            test_answers)`` in which question ``i`` of a list matches answer
            ``i`` of the corresponding answer list.

        Raises:
            ValueError: If ``test_ratio`` is outside [0, 1] or the number of
                questions differs from the number of answers.
        """
        if not 0 <= test_ratio <= 1:
            raise ValueError("test_ratio must be between 0 and 1")

        questions = self.train_questions
        answers = self.answers
        if len(questions) != len(answers):
            raise ValueError("questions and answers must have the same length")

        num_of_train_questions = int(len(questions) * (1 - test_ratio))

        # Shuffle indices rather than the data so each question stays paired
        # with its answer and the originals are left untouched.
        indices = list(range(len(questions)))
        random.shuffle(indices)
        train_indices = indices[:num_of_train_questions]
        test_indices = indices[num_of_train_questions:]

        train_question = [questions[i] for i in train_indices]
        train_answers = [answers[i] for i in train_indices]
        test_question = [questions[i] for i in test_indices]
        test_answers = [answers[i] for i in test_indices]

        return train_question, train_answers, test_question, test_answers

    def _most_similar_index(self, train_array, test_vector):
        """Return the row index of ``train_array`` most similar to ``test_vector``.

        Ties are resolved in favour of the earliest row. Returns None when no
        row has a cosine similarity greater than zero.
        """
        best_index = None
        best_value = 0
        for index, train_vector in enumerate(train_array):
            similarity = self.cos_sim(train_vector, test_vector)
            if best_value < similarity:
                best_value = similarity
                best_index = index
        return best_index

    def answer_questions_using_cosine_sim(self, questions=None):
        """Answer questions by nearest-neighbour lookup over the training questions.

        Args:
            questions: Questions to answer. Defaults to ``self.test_questions``.

        Returns:
            A list with one answer per question, in the same order. A question
            that shares no vocabulary with any training question (similarity
            of zero everywhere) gets an empty string.
        """
        test_question = self.test_questions if questions is None else questions
        if len(test_question) == 0:
            return []

        # Fit the vocabulary on the training questions only, then project
        # both sets into that same TF-IDF space.
        vectorizer = TfidfVectorizer()
        vectorizer.fit(self.train_questions)
        train_array = vectorizer.transform(self.train_questions).toarray()
        test_array = vectorizer.transform(test_question).toarray()

        answers = []
        for test_vector in test_array:
            answer_index = self._most_similar_index(train_array, test_vector)
            answers.append("" if answer_index is None else self.answers[answer_index])
        return answers

    def write_results(self, filename, answer_list):
        """Write each item of ``answer_list`` on its own line of ``filename``.

        Args:
            filename: Path of the output file (overwritten if it exists).
            answer_list: Items to write; each is converted with ``str``.
        """
        # UTF-8 matches read_dataset and avoids platform-dependent encodings.
        with open(filename, 'w', encoding='utf-8') as f:
            for answer in answer_list:
                f.write(str(answer) + "\n")
                

In [14]:

def main():
    """Prompt for a test file and a task, then run the requested task.

    The model is built from the given test file before the task is checked.
    "topic" only prints a banner; "qa" writes the predicted answers to
    'qa_results.txt'. Any other task name raises an Exception.
    """
    test_path = input("Enter test filename (e.g. test.txt): ")
    requested = input("Enter task (e.g. qa, topic): ")
    qa_model = QuestionAnswering(testfile = test_path)

    task_name = requested.lower()
    if task_name not in ("topic", "qa"):
        raise Exception("Program does not support the requested task")

    if task_name == "topic":
        print("Topic Modeling")
        return

    predicted = qa_model.answer_questions_using_cosine_sim()
    qa_model.write_results("qa_results.txt", predicted)

# Run the interactive workflow; main() prompts for the test filename and the task via input().
main()


Enter test filename (e.g. test.txt): test_file.txt
Enter task (e.g. qa, topic): qa


