In [34]:
import time
import json
from glob import glob
from pprint import pprint

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer


In [2]:
path = "/Users/allen/Projects/cmsc723/qanta-codalab/experiments/wiki-dataset-wrangle/data/"
files = glob(path + "training_set*.json")
files

['/Users/allen/Projects/cmsc723/qanta-codalab/experiments/wiki-dataset-wrangle/data/training_set.4.json',
 '/Users/allen/Projects/cmsc723/qanta-codalab/experiments/wiki-dataset-wrangle/data/training_set.5.json',
 '/Users/allen/Projects/cmsc723/qanta-codalab/experiments/wiki-dataset-wrangle/data/training_set.2.json',
 '/Users/allen/Projects/cmsc723/qanta-codalab/experiments/wiki-dataset-wrangle/data/training_set.3.json',
 '/Users/allen/Projects/cmsc723/qanta-codalab/experiments/wiki-dataset-wrangle/data/training_set.0.json',
 '/Users/allen/Projects/cmsc723/qanta-codalab/experiments/wiki-dataset-wrangle/data/training_set.1.json',
 '/Users/allen/Projects/cmsc723/qanta-codalab/experiments/wiki-dataset-wrangle/data/training_set.6.json',
 '/Users/allen/Projects/cmsc723/qanta-codalab/experiments/wiki-dataset-wrangle/data/training_set.7.json']

# View a sample from one of the files

In [26]:
def data_generator(limit=None):
    for file in files[:limit]:
        with open(file, "r") as f:
            for line in f:
                 yield json.loads(line)

paras = [ii for ii in data_generator(1)]
pprint(paras[:2])
print(f"Total paragraphs in a single file: {len(paras)}")

[{'ans': 'Anna_Kournikova',
  'text': 'Anna Kournikova Anna Sergeyevna Kournikova (; born 7 June 1981) is '
          'a Russian former professional tennis player. Her appearance and '
          'celebrity status made her one of the best known tennis stars '
          'worldwide. At the peak of her fame, fans looking for images of '
          'Kournikova made her name one of the most common search strings on '
          'Google Search.'},
 {'ans': 'Anna_Kournikova',
  'text': 'Despite never winning a singles title, she reached No.\xa08 in the '
          'world in 2000. She achieved greater success playing doubles, where '
          'she was at times the world No.\xa01 player. With Martina Hingis as '
          'her partner, she won Grand Slam titles in Australia in 1999 and '
          '2002, and the WTA Championships in 1999 and 2000. They referred to '
          'themselves as the "Spice Girls of Tennis".'}]
Total paragraphs in a single file: 1986418


# Create and Validate data reader

In [21]:
class WikidataIterator(object):
    def __init__(self, files, limit=None):
        self.files = files
        self.limit = limit
        self.i_to_ans = {}

    def _doc_iterator(self):
        counter = 0
        for path in self.files:
            with open(path, "r") as f:
                for line in f:
                    yield json.loads(line)
                    counter += 1
                    if self.limit and counter >= self.limit:
                        return

    @property
    def docs(self):
        counter = 0
        for doc in self._doc_iterator():
            ans = doc["ans"]
            text = doc["text"]

            self.i_to_ans[counter] = ans
            counter += 1
            yield text


wikidata = WikidataIterator(files, limit=50)
docs = [item for item in wikidata.docs]
pprint(docs[0])
pprint(docs[30])
pprint(docs[49])
pprint(wikidata.i_to_ans)

('Anna Kournikova Anna Sergeyevna Kournikova (; born 7 June 1981) is a Russian '
 'former professional tennis player. Her appearance and celebrity status made '
 'her one of the best known tennis stars worldwide. At the peak of her fame, '
 'fans looking for images of Kournikova made her name one of the most common '
 'search strings on Google Search.')
('Agnosticism Agnosticism is the view that the existence of God, of the divine '
 'or the supernatural is unknown or unknowable. English biologist Thomas Henry '
 'Huxley coined the word "agnostic" in 1869, and said "It simply means that a '
 'man shall not say he knows or believes that which he has no scientific '
 'grounds for professing to know or believe." Earlier thinkers, however, had '
 'written works that promoted agnostic points of view, such as Sanjaya '
 'Belatthaputta, a 5th-century BCE Indian philosopher who expressed '
 'agnosticism about any afterlife; and Protagoras, a 5th-century BCE Greek '
 'philosopher who expressed 

# Create TF/IDF Model

In [67]:
class TFIDF():

    def __init__(self, limit=None):
        self.i_to_ans = None
        self.limit = limit

    def train(self, ngram_range=(1, 1), min_df=2, max_df=.75):
        wikidata = WikidataIterator(files[:self.limit])
        self.docs = [ii for ii in wikidata.docs]

        vectorizer_kwargs = {
            'ngram_range': ngram_range,
            'min_df': min_df,
            'max_df': max_df
        }
        start = time.time()
        self.tfidf_vectorizer = TfidfVectorizer(**vectorizer_kwargs).fit(self.docs)
        elapsed = int(time.time() - start)
        print("INFO: fit completed in {} seconds".format(elapsed))

        start = time.time()
        self.tfidf_matrix = self.tfidf_vectorizer.transform(self.docs)
        elapsed = int(time.time() - start)
        print("INFO: transform completed in {} seconds".format(elapsed))

        self.i_to_ans = wikidata.i_to_ans

    def guess(self, questions, max_n_guesses=2):
        representations = self.tfidf_vectorizer.transform(questions)
        guess_matrix = self.tfidf_matrix.dot(representations.T).T
        guess_indices = (-guess_matrix).toarray().argsort(axis=1)[:, 0:max_n_guesses]
        guesses = []
        for i in range(len(questions)):
            idxs = guess_indices[i]
            guesses.append([(self.i_to_ans[j], i, j, guess_matrix[i, j]) for j in idxs])

        return guesses

In [65]:
model = None

# Create dataset loader

In [30]:
TRAIN_FILE = "/Users/allen/Projects/cmsc723/qanta-codalab/data/qanta.train.2018.04.18.json"
TEST_FILE = "/Users/allen/Projects/cmsc723/qanta-codalab/data/qanta.test.2018.04.18.json"

def load_data(filename):
    data = list()
    with open(filename) as json_data:
        for q in json.load(json_data)["questions"]:
            yield (q['text'], q['page'])    

def test_load():
    counter = 0
    for item in load_data(TEST_FILE):
        pprint(item)
        counter += 1
        if counter == 2: break
test_load()

('One work by this author uses printing, gunpowder, and the compass as symbols '
 'of personal ambition, national ambition, and the ambition of the human race '
 'to extend its grasp. This thinker described three forms of false learning as '
 '"delicate", "contentious", and "fantastical" in categorizing the '
 '"distempers" that impede academic progress. This thinker imagined a utopian '
 "university called Salomon's House, and he likened received systems of "
 'philosophy to stage plays that misrepresent the world, and thus labeled them '
 '"idols of the theatre". This author of The New Atlantis established the '
 'doctrine of inductive, empirical methodology. For 10 points, name this '
 '17th-century English philosopher who wrote Novum Organum and spearheaded the '
 'Scientific Revolution.',
 'Francis_Bacon')
("One character in this play ignores news of his wife's fever, while repeating "
 'the line "Poor fellow!" in response to reports of the gluttony of a '
 'character who drank fo

# Evaluate TF/IDF By Paragraph on TEST data

fit model

In [68]:
model = TFIDF(limit=1)
model.train()

INFO: fit completed in 217 seconds
INFO: transform completed in 211 seconds


In [48]:
unique_answers = len(set(model.i_to_ans.values()))
print(f"unique answers: {unique_answers}")

unique answers: 419601


test model

In [71]:
def view_answers():
    counter = 0
    for question, ans in load_data(TEST_FILE):
        if ans in model.i_to_ans.values():
            guess, unk, para_index, _ = model.guess([question], max_n_guesses=1)[0][0]
            pprint({"question": question, "ans": ans, "guess": guess, "para_index": para_index, "guess_text": model.docs[para_index],})

            counter += 1
            if counter > 5: break
        

view_answers()

{'ans': 'Angular_momentum_operator',
 'guess': 'Quantity_adjustment',
 'guess_text': 'Quantity adjustment In economics, quantity adjustment is the '
               'process by which a market surplus leads to a cut-back in the '
               'quantity supplied or a market shortage causes an increase in '
               'supplied quantity. It is one possible result of supply and '
               'demand disequilibrium in a market. Quantity adjustment is '
               'complementary to pricing.',
 'para_index': 351306,
 'question': 'Calculating a Racah W-coefficient requires knowledge of six '
             'parameters corresponding to this quantity. Another set of '
             'coefficients arising from this quantity relate reduced matrix '
             'elements to the spherical tensor. The individual components of '
             'the operator corresponding to this quantity commute with its '
             'square, but not with each other. That square of the operator '
            

{'ans': 'RNA_splicing',
 'guess': 'Process_performance_index',
 'guess_text': 'Process performance index In process improvement efforts, the '
               'process performance index is an estimate of the process '
               'capability of a process during its initial set-up, "before" it '
               'has been brought into a state of statistical control. '
               'Formally, if the upper and lower specifications of the process '
               'are USL and LSL, the estimated mean of the process is '
               'formula_1, and the estimated variability of the process '
               '(expressed as a standard deviation) is formula_2, then the '
               'process performance index is defined as: formula_2 is '
               'estimated using the sample standard deviation. P may be '
               'negative if the process mean falls outside the specification '
               'limits (because the process is producing a large proportion of '
               'defe

In [75]:
from tqdm import tqdm_notebook

In [82]:
TEST_NUM = 4104

def get_accuracy():
    counter = 0
    correct = 0
    for question, ans in tqdm_notebook(load_data(TEST_FILE),  total=TEST_NUM):
        if ans in model.i_to_ans.values():
            counter += 1
            guess, unk, para_index, _ = model.guess([question], max_n_guesses=1)[0][0]
            if ans == guess:
                correct += 1
    return {"correct": correct, "num_questions": counter, "accuracy": correct / counter,}
        

get_accuracy()

HBox(children=(IntProgress(value=0, max=4104), HTML(value='')))

{'correct': 60, 'num_questions': 560, 'accuracy': 0.10714285714285714}