In [77]:
%matplotlib notebook

import sys
sys.path.append("../src")

import os
from qanta.tfidf import TfidfGuesser
from qanta.models.dan import DanGuesser, DanModel, DanEncoder, datasets
from qanta.models.timer import Timer

from numpy import dot
from math import sqrt
import pandas as pd
import matplotlib.pyplot as plt
from tqdm import tqdm_notebook, tnrange
from IPython.display import display

import nltk
from nltk import pos_tag
from nltk import word_tokenize
from nltk.tokenize import sent_tokenize
import torchtext.vocab as vocab

# Load Train/Test Sets

In [2]:
# NOTE(review): relative chdir — it resolves against whatever the kernel's current
# directory is, so re-running this cell moves the working directory again.
# Presumably datasets() reads its files relative to this directory — confirm.
os.chdir("../src/qanta")
# datasets() (imported from qanta.models.dan) is unpacked into the three splits.
train_data, dev_data, test_data = datasets()

### View single record

In [4]:
# Use the first test question as a worked example.
question = test_data[0]
question_text = question["text"]
# "page" is the field later cells print as the question's answer.
ans = question["page"]
# Bare last expression so the notebook displays the question text.
question_text

'One work by this author uses printing, gunpowder, and the compass as symbols of personal ambition, national ambition, and the ambition of the human race to extend its grasp. This thinker described three forms of false learning as "delicate", "contentious", and "fantastical" in categorizing the "distempers" that impede academic progress. This thinker imagined a utopian university called Salomon\'s House, and he likened received systems of philosophy to stage plays that misrepresent the world, and thus labeled them "idols of the theatre". This author of The New Atlantis established the doctrine of inductive, empirical methodology. For 10 points, name this 17th-century English philosopher who wrote Novum Organum and spearheaded the Scientific Revolution.'

# Load Embeddings

In [3]:
# 100-dimensional GloVe 6B vectors; in the visible cells only the sim() demo call uses them.
glove100 = vocab.GloVe(name='6B', dim=100)
print('Loaded {} words'.format(len(glove100.itos)))

2018-12-06 03:02:08,709 [INFO ]  Loading vectors from .vector_cache/glove.6B.100d.txt.pt


Loaded 400000 words


In [4]:
# 200-dimensional GloVe 6B vectors; target_word_vector() and best_similarity() read this
# module-level `glove` object directly.
glove = vocab.GloVe(name='6B', dim=200)
print('Loaded {} words'.format(len(glove.itos)))

2018-12-06 03:02:10,400 [INFO ]  Loading vectors from .vector_cache/glove.6B.200d.txt.pt


Loaded 400000 words


# Utility to find target word embedding

In [204]:
def target_word_vector(text):
    """Return the embedding of the question's target word, or None.

    The target word comes from target_word(text). None is returned when no
    target word is found or when the word is missing from the module-level
    `glove` vocabulary (`glove.stoi`).
    """
    word = target_word(text)
    if word and word in glove.stoi:
        return glove.vectors[glove.stoi[word]]
    return None
    

def target_word(text):
    """Return the noun that follows "this"/"these" in the first sentence of `text`.

    Returns None when none of "this", "these", "This", "These" occurs anywhere
    in `text` (substring check), or when no token tagged NN/NNS follows the
    first such determiner token in the first sentence.
    """
    determiners = ("this", "these", "This", "These")
    # Cheap substring pre-check so texts without a determiner skip tokenizing and tagging.
    if not any(d in text for d in determiners):
        return None

    first_sent = sent_tokenize(text)[0]
    seen_determiner = False
    for token, tag in pos_tag(word_tokenize(first_sent)):
        # Once a determiner has been seen, the first singular/plural common noun wins.
        if seen_determiner and tag in ("NN", "NNS"):
            return token
        if not seen_determiner and token in determiners:
            seen_determiner = True
    return None
            

# for q in test_data[:2]:
#     print(target_word(q["text"]))
#     print("")

In [227]:


def sim(w1, w2, embs):
    """Cosine similarity between the vectors of words `w1` and `w2`.

    `embs` must expose `stoi` (word -> row index) and `vectors` (indexable by
    that row index). A word missing from `embs.stoi` raises KeyError.
    """
    first_vec = embs.vectors[embs.stoi[w1]]
    second_vec = embs.vectors[embs.stoi[w2]]
    first_norm = sqrt(dot(first_vec, first_vec))
    second_norm = sqrt(dot(second_vec, second_vec))
    return dot(first_vec, second_vec) / (first_norm * second_norm)

sim("conflict", "rebellions", glove100)

0.3818499280654741

# Load Models

In [7]:
# Step one directory up from the current working directory. Relative, so the result
# depends on which earlier chdir cells have already run in this kernel.
os.chdir("../")

In [8]:
# Instantiate the two guessers used by the cells below.
# NOTE(review): presumably both read saved model files relative to the current
# working directory set by the previous cell — confirm.
tfidf_guesser = TfidfGuesser.load(stem=True)
dan_guesser = DanGuesser()



# Guess and Buzz code

In [21]:
BUZZ_NUM_GUESSES = 10
BUZZ_THRESHOLD = .3
# Questions with fewer space-separated words than this are answered by TF-IDF;
# longer ones are answered by the DAN model.
BUZZ_DAN_MIN_WORDS = 50


def guess_and_buzz(tfidf_model, dan_model, question_text):
    """Pick an answer for `question_text` and decide whether to buzz.

    Short questions (fewer than BUZZ_DAN_MIN_WORDS space-separated words) use
    the TF-IDF model and buzz when the top score's share of all returned
    scores reaches BUZZ_THRESHOLD. Longer questions use the DAN model and
    always buzz.

    Parameters
    ----------
    tfidf_model : object with guess(list_of_texts, max_guesses) returning, per
        text, a list of (page, score) pairs.
    dan_model : object with guess(text, max_guesses) returning a list of
        (page, score) pairs.
    question_text : str

    Returns
    -------
    (page, buzz) : the top guessed page (None if the model returned no
        guesses) and a truthy/falsy buzz decision.
    """
    question_len = len(question_text.split(" "))

    # Only run the model whose answer is actually used.
    if question_len < BUZZ_DAN_MIN_WORDS:
        tfidf_guesses = tfidf_model.guess([question_text], BUZZ_NUM_GUESSES)[0]
        if not tfidf_guesses:
            return None, False
        scores = [guess[1] for guess in tfidf_guesses]
        total = sum(scores)
        # An all-zero score list carries no signal: do not buzz (and avoid dividing by zero).
        buzz = total != 0 and scores[0] / total >= BUZZ_THRESHOLD
        return tfidf_guesses[0][0], buzz

    dan_guesses = dan_model.guess(question_text, BUZZ_NUM_GUESSES)
    if not dan_guesses:
        return None, False
    # Return the top page only, matching the TF-IDF branch (previously the
    # whole list of (page, score) pairs was returned here).
    return dan_guesses[0][0], True

In [63]:
DAN_BUZZ_NUM_GUESSES = 2
DAN_BUZZ_THRESHOLD = .6


def dan_guess_and_buzz(dan_model, question_text):
    """Return (top_page, buzz, confidence) for `question_text` using the DAN model.

    `confidence` is the top guess's score divided by the sum of the scores of
    the DAN_BUZZ_NUM_GUESSES guesses requested from `dan_model.guess`; `buzz`
    is whether that ratio reaches DAN_BUZZ_THRESHOLD.
    """
    top_guesses = dan_model.guess(question_text, DAN_BUZZ_NUM_GUESSES)
    best = top_guesses[0]
    confidence = best[1] / sum(entry[1] for entry in top_guesses)
    should_buzz = confidence >= DAN_BUZZ_THRESHOLD
    return best[0], should_buzz, confidence

# Check when we get the correct answer vs. when we buzz

In [100]:
# Disabled diagnostic (guarded by `if False`): for the first 50 test questions, feed the
# DAN model growing prefixes (30 words, then +5 each step) and report every prefix where the
# guess is correct or the model buzzes, stopping at the first buzz.
if False:
    for qidx in range(50):
        print(f"\nQuestion {qidx}")
        ans = test_data[qidx]["page"]
        words = test_data[qidx]["text"].split(" ")
        text_len = len(words)
        for num_words in range(30, text_len + 5, 5):
            text = " ".join(words[:num_words])
            guess = dan_guess_and_buzz(dan_guesser, text)
            top_page, buzzed, confidence = guess
            is_correct = top_page == ans
            if is_correct or buzzed:
                print("Words: {}, Correct: {}, Buzz: {}, Confidence: {}".format(num_words, is_correct, buzzed, confidence))
                if buzzed:
                    break

# Get multiple answers and check target word

In [110]:
import psycopg2

  """)


In [221]:
def format_type(t):
    """Turn a bracketed, underscore-separated identifier into a readable label.

    Drops the first and last character, discards everything up to and
    including the first underscore, and replaces the remaining underscores
    with spaces. Returns "" when nothing follows the first underscore.
    """
    inner = t[1:-1]
    _prefix, _sep, remainder = inner.partition("_")
    return remainder.replace("_", " ")

def query_types(text):
    """Return the formatted `object` values stored in `yagofacts` for a subject.

    Parameters
    ----------
    text : str
        Subject name without the surrounding angle brackets
        (e.g. "Dorr_Rebellion"); it is looked up as "<text>".

    Returns
    -------
    list of str
        format_type() applied to the first column of every matching row.
    """
    conn_string = "host='localhost' dbname='allen' user='allen'"
    conn = psycopg2.connect(conn_string)
    try:
        cur = conn.cursor()
        try:
            # Bind the subject as a query parameter so the driver does the quoting,
            # instead of splicing hand-escaped text into the SQL string.
            cur.execute(
                "select object from yagofacts where subject = %s",
                ("<{}>".format(text),),
            )
            results = cur.fetchall()
        finally:
            cur.close()
    finally:
        # Always release the connection, even when the query fails.
        conn.close()
    return [format_type(r[0]) for r in results]


def best_noun_from_subject(text):
    """Pick one representative word from a tokenized label.

    `text` is a list of tokens passed to pos_tag. A single-token label returns
    that token regardless of its tag. Otherwise the first token is preferred
    if tagged NNS, then the last token, then the first NNS token from left to
    right. Returns None when no token is tagged NNS.
    """
    tags = pos_tag(text)
    if len(tags) == 1:
        return tags[0][0]

    # Priority order: first token, last token, then a left-to-right scan.
    for word, tag in (tags[0], tags[-1], *tags):
        if tag == "NNS":
            return word
    return None
    

def best_similarity(target, answer):
    """Highest cosine similarity between `target` and any type noun of `answer`.

    The type labels come from query_types(answer); each label is reduced to one
    word with best_noun_from_subject, lower-cased, and compared with `target`
    using the module-level `glove` vectors. Returns None when `target` is not
    in the glove vocabulary or when no label yields a word in the vocabulary.
    """
    if target not in glove.stoi:
        return None
    target_vec = glove.vectors[glove.stoi[target]]

    def cosine(u, v):
        return dot(u, v) / (sqrt(dot(u, u)) * sqrt(dot(v, v)))

    type_nouns = [best_noun_from_subject(label.split(" ")) for label in query_types(answer)]

    scores = []
    for noun in type_nouns:
        if noun is None:
            continue
        key = noun.lower()
        if key in glove.stoi:
            scores.append(cosine(target_vec, glove.vectors[glove.stoi[key]]))

    return max(scores) if scores else None


# Smoke test of best_similarity on a known (target word, answer page) pair.
# NOTE(review): the output recorded for this cell is None, while a later cell's recorded
# output shows ~0.51 for the same ("author", "Francis_Bacon") pair — the two outputs appear
# to come from different kernel/database states; re-run from a fresh kernel to confirm.
bs = best_similarity("author", "Francis_Bacon")
print(bs)

None


In [226]:
# For the first 15 test questions: take the first 100 words, print the opening 10 words,
# the expected answer and the extracted target word, then print the DAN model's top-5
# pages followed by each page's best type similarity to the target word.
for qidx in range(15):
    print(f"\nQuestion {qidx}")
    num_words = 100
    words = test_data[qidx]["text"].split(" ")
    text = " ".join(words[:num_words])
    print(" ".join(words[:10]))

    ans = test_data[qidx]["page"]
    print(f"answer: {ans}")

    tword = target_word(text)
    print(f"target: {tword}")

    guesses = dan_guesser.guess(text, 5)
    guesses_with_sim = [
        (entry[0], entry[1], best_similarity(tword, entry[0])) for entry in guesses
    ]
    sims = [scored[2] for scored in guesses_with_sim]
    print([scored[:1] for scored in guesses_with_sim])
    print(sims)




Question 0
One work by this author uses printing, gunpowder, and the
answer: Francis_Bacon
target: author
[('Francis_Bacon',), ('Auguste_Comte',), ('Jean_Piaget',), ('George_Herbert_Mead',), ('Johann_Gottlieb_Fichte',)]
[0.5132534999309095, 0.5132534999309095, 0.5132534999309095, 0.3830038598933477, 0.5132534999309095]

Question 1
One character in this play ignores news of his wife's
answer: Tartuffe
target: play
[('Tartuffe',), ('Othello',), ('The_Imaginary_Invalid',), ('Cat_on_a_Hot_Tin_Roof',), ('The_Birthday_Party_(play)',)]
[0.7768365397377778, 0.7768365397377778, 0.7768365397377778, 0.7768365397377778, 0.7768365397377778]

Question 2
Calculating a Racah W-coefficient requires knowledge of six parameters corresponding
answer: Angular_momentum_operator
target: quantity
[('Angular_momentum',), ('Momentum',), ('Hamiltonian_(quantum_mechanics)',), ('Spin_(physics)',), ('Distance',)]
[None, None, 0.2481735758742757, 0.8429100605670035, None]

Question 3
Coinage similarities caused thi

In [217]:
query_types("Dorr_Rebellion")

['Rebellions in the United States',
 'Conflicts in 1842',
 'Conflicts in 1841',
 'conflict 100958896',
 '19th-century rebellions']

In [213]:
pos_tag('Conflicts in 1842'.split(" "))

[('Conflicts', 'NNS'), ('in', 'IN'), ('1842', 'CD')]

# Check the number of words in some questions

In [None]:
# Print the space-separated word count of each of the first 20 test questions.
for idx in range(20):
    print(len(test_data[idx]["text"].split(" ")))

<br><br><br><br><br><br><br>
# Filter database entries

In [78]:
# Pool all three splits and collect the "page" field of every record (duplicates included).
all_data = train_data + dev_data + test_data
all_pages = [p["page"] for p in all_data]

In [79]:
# Count before and after de-duplication. `all_pages` is rebound to the unique pages;
# going through set() means the resulting list order is not guaranteed.
print(len(all_pages))
all_pages = list(set(all_pages))
print(len(all_pages))

119247
26877


In [None]:
# Generate a SQL script that deletes every yagofacts row whose subject is not one of
# the pages collected in `all_pages`.
sql = """
    DELETE FROM yagofacts WHERE subject not in ({})
"""


def _as_sql_literal(page):
    """Wrap a page as a single-quoted '<page>' SQL literal, doubling embedded quotes."""
    return "'<{}>'".format(page.replace("'", "''"))


stuff = ", ".join(map(_as_sql_literal, all_pages))

# NOTE(review): hard-coded absolute, user-specific output path.
with open("/Users/allen/Desktop/delete.sql", "w") as f:
    statement = sql.format(stuff)
    f.write(statement)
