In [6]:
import xapian
import spacy
from collections import Counter
import numpy as np
import json
from allennlp.predictors.predictor import Predictor
# --- File locations (all resolved relative to the local Dropbox folder) ---
DROPBOX_PATH = "/Users/neesergparajuli/Dropbox/"
dev_SET = f"{DROPBOX_PATH}Webtext/Data/devset.json"
DATABASE = f"{DROPBOX_PATH}Webtext/Data/XxapianDatabase3"

# --- Heavy resources, loaded once and shared by every cell below ---
# AllenNLP model archive used to score (premise, hypothesis) pairs.
predictor = Predictor.from_path(f"{DROPBOX_PATH}Webtext/WSTA_fact_checker/allen3/model.tar.gz")
# Xapian index that the retrieval step queries.
db = xapian.Database(DATABASE)
# spaCy pipeline used to analyse each claim.
nlp = spacy.load('en_core_web_sm')

# Preprocessing queries and sentences

In [7]:
def preprocess(fact):
    """Turn a claim into retrieval queries and matching keywords.

    The claim is parsed with the module-level spaCy pipeline ``nlp``.

    Args:
        fact: The claim text.

    Returns:
        A ``(query, sentence, title)`` tuple of lists:
          * ``query``    - prefixed, double-quoted search terms: a ``text:`` and
            a ``title:`` term for every entity, followed by a ``text:`` term for
            every selected token.
          * ``sentence`` - the raw text of every selected token (tag ``NN`` or
            ``VBG``, or part of speech ``PROPN``).
          * ``title``    - entity texts followed by the text of every noun chunk
            whose root has the ``nsubj`` dependency.
    """
    parsed = nlp(fact)

    query = []
    title = []
    for entity in parsed.ents:
        quoted = '"' + entity.text + '"'
        query.extend(("text:" + quoted, "title:" + quoted))
        title.append(entity.text)

    # Grammatical subjects are treated as additional title candidates.
    title.extend(
        chunk.text for chunk in parsed.noun_chunks if chunk.root.dep_ == 'nsubj'
    )

    sentence = [
        tok.text
        for tok in parsed
        if tok.tag_ in ("NN", "VBG") or tok.pos_ == "PROPN"
    ]
    query.extend("text:" + '"' + word + '"' for word in sentence)

    return (query, sentence, title)

# Query the databases and process results

In [8]:
def get_unique_id(terms):
    """Extract a document id from the first term containing ``_~s~_``.

    The id is the part of that term before the first ``_~s~_``, with its
    leading character removed.

    Args:
        terms: Iterable of term strings.

    Returns:
        The extracted id, or the sentinel string ``"wtf"`` (after printing a
        debug marker) when no term contains ``_~s~_``.
    """
    marker = "_~s~_"
    tagged = next((candidate for candidate in terms if marker in candidate), None)
    if tagged is not None:
        head = tagged.partition(marker)[0]
        return head[1:]

    print("wtdf")
    return "wtf"

def my_own_queryparser(fact):
    """Concatenate query terms into one string, each followed by a space.

    Args:
        fact: Iterable of query term strings.

    Returns:
        The terms in order, every one (including the last) followed by a single
        space; an empty string when ``fact`` is empty.
    """
    return ''.join(piece + ' ' for piece in fact)

def get_sentences(docs):
    """Split each retrieved document into indexed sentences.

    Args:
        docs: Iterable of ``(doc_id, text)`` pairs.

    Returns:
        ``{doc_id: {index: sentence}}``. The text is split on ``'.\\n'`` and the
        final piece of the split is always discarded.
    """
    doc_dict = {}
    for entry in docs:
        doc_id, text = entry[0], entry[1]
        pieces = text.split('.\n')
        # Everything but the last split piece is kept, numbered from 0.
        doc_dict[doc_id] = dict(enumerate(pieces[:-1]))
    return doc_dict
def sort_doc(title, docs, processed, t_processed):
    """Score one document and its sentences against the claim keywords.

    Args:
        title: Document title; underscores are read as spaces.
        docs: Mapping ``{sentence_id: sentence_text}`` for this document.
        processed: Keywords taken from the claim.
        t_processed: Title candidates taken from the claim.

    Returns:
        ``(doc_score, sentence_counts)`` where

        * ``doc_score`` is a title bonus (20 x the fraction of the title covered
          by each title candidate found in it) plus the sum of all sentence
          counts divided by ``sqrt(number_of_sentences + 1)``;
        * ``sentence_counts`` is a list of ``(sentence_id, count)`` pairs in
          descending count order, each count being 1 plus the number of
          keywords / title candidates occurring in that sentence.
    """
    spaced_title = title.replace('_', ' ')

    title_bonus = 0
    for candidate in t_processed:
        if candidate in spaced_title:
            title_bonus += 20 * (len(candidate) / len(spaced_title))

    counts = Counter()
    for sent_id, sentence in docs.items():
        needles = processed + t_processed
        if not needles:
            # With nothing to look for, the sentence gets no entry at all.
            continue
        counts[sent_id] = 1 + sum(1 for needle in needles if needle in sentence)

    normaliser = np.sqrt(len(docs) + 1)
    return (title_bonus + sum(counts.values()) / normaliser, counts.most_common())

def get_best_sentences(querys ,docs, n_docs = 5, n_sentences = 20):
    """Pick the highest-weighted sentences from the best-scoring documents.

    Args:
        querys: The tuple produced by ``preprocess``; elements 1 and 2 (keywords
            and title candidates) are passed on to ``sort_doc``.
        docs: Mapping ``{doc_id: {sentence_id: sentence_text}}``.
        n_docs: Number of top-scoring documents to keep.
        n_sentences: Number of sentences to return.

    Returns:
        Up to ``n_sentences`` pairs ``((doc_id, sentence_id), weight)`` in
        descending weight order, where weight is the sentence count multiplied
        by its document's score.
    """
    scored = []
    for doc_id, sentences in docs.items():
        doc_score, sentence_counts = sort_doc(doc_id, sentences, querys[1], querys[2])
        scored.append((doc_score, doc_id, sentence_counts))

    # Tuples sort by score first, then by document id.
    top_docs = sorted(scored, reverse=True)[:n_docs]

    weighted = Counter()
    for doc_score, doc_id, sentence_counts in top_docs:
        for sent_id, hits in sentence_counts:
            weighted[(doc_id, sent_id)] = hits * doc_score
    return weighted.most_common(n_sentences)

In [9]:
def get_docs(querys, k_matches):
    """Retrieve candidate evidence sentences for a preprocessed claim.

    Args:
        querys: The tuple produced by ``preprocess``; element 0 holds the
            prefixed query terms, and the whole tuple is forwarded to
            ``get_best_sentences``.
        k_matches: Maximum number of matches requested from the Xapian index.

    Returns:
        A list of ``((doc_id, sentence_id), sentence_text)`` pairs in the order
        produced by ``get_best_sentences``.
    """
    parser = xapian.QueryParser()
    parser.set_stemming_strategy(xapian.QueryParser.STEM_NONE)
    parser.add_prefix("text", "")
    parser.add_prefix("title", "S")
    query = parser.parse_query(my_own_queryparser(querys[0]))

    enquire = xapian.Enquire(db)
    enquire.set_query(query)

    retrieved = []
    for hit in enquire.get_mset(0, k_matches):
        # The document id is recovered from the document's own term list.
        term_strings = [entry.term.decode("utf8") for entry in hit.document.termlist()]
        body = hit.document.get_data().decode("utf8")
        retrieved.append((get_unique_id(term_strings), body))

    docs = get_sentences(retrieved)
    return [
        ((doc_id, sent_id), docs[doc_id][sent_id])
        for (doc_id, sent_id), _ in get_best_sentences(querys, docs)
    ]

In [10]:
# Build a small practice split: keep roughly SAMPLE_FRACTION of the dev-set
# claims and save them to practice_dev.json for the prediction cell below.
SAMPLE_FRACTION = 0.05
SAMPLE_SEED = 0  # change to draw a different (but still repeatable) subset

with open(dev_SET) as file:
    data_ = json.load(file)

# A dedicated, seeded generator makes Restart & Run All select the same claims
# every time, and leaves NumPy's global random state untouched.
sampler = np.random.RandomState(SAMPLE_SEED)
data = {ob: data_[ob] for ob in data_ if sampler.uniform() < SAMPLE_FRACTION}

# The output file is only opened once sampling has succeeded, so a failure
# above cannot leave an empty practice_dev.json behind.
with open("practice_dev.json", "w") as file:
    json.dump(data, file, indent =4)

In [11]:
def get_topN(fact, n_fact, max_premises=9, threshold=0.5):
    """Collect the retrieved sentences that the model says support or refute a claim.

    Args:
        fact: The claim text; used as the hypothesis for every prediction.
        n_fact: Maximum number of index matches requested by ``get_docs``.
        max_premises: How many of the top retrieved sentences are scored by the
            model. The default of 9 matches the previous hard-coded cut-off.
        threshold: A probability must be strictly greater than this for the
            sentence to be recorded. The default of 0.5 matches the previous
            hard-coded value.

    Returns:
        ``{"SUPPORTS": [...], "REFUTES": [...]}`` where each list holds
        ``(probability, (doc_id, sentence_id))`` tuples in retrieval order.
    """
    processed = preprocess(fact)
    premises = get_docs(processed, n_fact)
    facts_prob ={"SUPPORTS":[], "REFUTES":[]}
    # Slice before predicting: the earlier loop ran the model on one extra
    # premise and only then hit its break, throwing that prediction away.
    for premise in premises[:max_premises]:
        probs = predictor.predict(premise = premise[1], hypothesis = fact)["label_probs"]
        # NOTE(review): index 1 is treated as "supports" and index 2 as
        # "refutes"; this depends on the label order of the loaded model -
        # confirm against the model's vocabulary.
        (prob1, prob2) = (probs[1], probs[2] )
        if prob1 > threshold:
            facts_prob["SUPPORTS"].append((prob1,premise[0]))
        if prob2 > threshold:
            facts_prob["REFUTES"].append((prob2, premise[0]))
    return facts_prob

In [12]:
# Label every sampled claim and write the predictions to prediction_dev.json.
m = 0          # number of claims processed
empty = 0
js_dict = {}
for ob in data:
    m += 1
    fact = data[ob]["claim"]
    premises = get_topN(fact, 100)
    supports = premises["SUPPORTS"]
    refutes = premises["REFUTES"]

    # Majority vote between the two evidence lists; a tie yields no verdict
    # and no evidence.
    if len(refutes) > len(supports):
        label, chosen = "REFUTES", refutes
    elif len(supports) > len(refutes):
        label, chosen = "SUPPORTS", supports
    else:
        label, chosen = "NOT ENOUGH INFO", []

    js_dict[ob] = {
        "claim": fact,
        "label": label,
        "evidence": [[doc_id, sent_id] for _, (doc_id, sent_id) in chosen],
    }

with open("prediction_dev.json","w") as file:
    json.dump(js_dict, file, indent=4)

In [5]:
# Smoke test: run retrieval for a single claim, requesting up to 1000 index
# matches, and print the ((doc_id, sentence_id), sentence_text) pairs returned.
# NOTE(review): this cell's execution count (In [5]) is lower than the cells
# above it, so the output below may come from an earlier kernel state - re-run.
print(get_docs(preprocess("Pharrell Williams is not a musician."), 1000))

[(('Pharrell_Williams', 7), "Williams owns a media venture that encompasses entertainment , music , fashion , and art called i am OTHER , a multimedia creative collective and record label that serves as an umbrella for all of Pharrell Williams ' endeavors , including Billionaire Boys Club "), (('Pharrell_Williams_discography', 0), 'The discography of Pharrell Williams , an American recording artist and record producer , consists of two studio albums , two extended plays -LRB- EPs -RRB- , one mixtape , 46 singles -LRB- including 38 as a featured artist -RRB- and 40 guest appearances '), (('Pharrell_Williams', 0), 'Pharrell Lanscilo Williams -LRB- -LSB- fəˈɹ̠ɛːl -RSB- born April 5 , 1973 -RRB- is an American singer-songwriter , rapper , record producer , and film producer '), (('Happy_-LRB-Pharrell_Williams_song-RRB-', 0), "`` Happy '' is a song written , produced , and performed by American singer and record producer Pharrell Williams , from the Despicable Me 2 soundtrack album "), (('G