# FEVER Document Retrieval

**Purpose**: the purpose of this notebook is to develop a baseline approach for scoring document retrieval on the FEVER dataset with Apache Lucene.

**Input**: This document requires the Lucene index, and JSON files to run.

## Setting up Lucene Query

In [78]:
import utils
import pickle
from tqdm import tqdm_notebook
from joblib import Parallel, delayed
from multiprocessing import cpu_count
import numpy as np

In [2]:
claims, labels, article_list, claim_set, claim_to_article = utils.extract_fever_jsonl_data("../train.jsonl")

Num Distinct Claims 109810
Num Data Points 125051


In [32]:
import subprocess
import string

def query_lucene(c):
    # standard query: 
    # java -cp CLASSPATH org.apache.lucene.demo.SearchFiles -query "Loki is the dad of Hel."
    
    # replace the following classpath with your local Lucene instance
    classpath = "/home/moinnadeem/Documents/UROP/lucene-7.4.0/demo/lucene-demo-7.4.0.jar"
    classpath += ":/home/moinnadeem/Documents/UROP/lucene-7.4.0/core/lucene-core-7.4.0.jar"
    classpath += ":/home/moinnadeem/Documents/UROP/lucene-7.4.0/queryparser/lucene-queryparser-7.4.0.jar"
    
    c = c.translate(str.maketrans(string.punctuation, ' '*len(string.punctuation)))
    # replace the following with the location of your index
    indexDir = "/home/moinnadeem/Documents/UROP/wiki-pages/index"
    
    
    return subprocess.check_output(["java", "-cp", classpath, "org.apache.lucene.demo.SearchFiles", "-index", indexDir, "-query", c]).decode("utf-8").split("\n")

def process_lucene_output(output):
    assert len(output)==13
    
    filenames = [o.split("/")[-1].split(".txt")[0] for o in output[2:-1]]
    return list(map(utils.preprocess_article_name, filenames))

def calculate_precision(retrieved, relevant, k=None):
    if k==None:
        k = len(retrieved)
    return len(set(retrieved[:k]).intersection(set(relevant))) / len(set(retrieved))

def calculate_recall(retrieved, relevant, k=None):
    if k==None:
        k = len(retrieved)
    return len(set(retrieved[:k]).intersection(set(relevant))) / len(set(relevant))

In [33]:
output = query_lucene(claims[0])
retrieved = process_lucene_output(output)
relevant = claim_to_article[claims[0]]

In [34]:
utils.preprocess_article_name(claims[0])

'nikolaj coster waldau worked with the fox broadcasting company '

In [35]:
query_lucene(claims[0])

['Searching for: nikolaj coster waldau worked fox broadcasting company',
 '316945 total matching documents',
 '1. /home/moinnadeem/Documents/UROP/wiki-pages/processed_pages/Ved_verdens_ende.txt',
 '2. /home/moinnadeem/Documents/UROP/wiki-pages/processed_pages/Nukaaka_Coster-Waldau.txt',
 '3. /home/moinnadeem/Documents/UROP/wiki-pages/processed_pages/A_Second_Chance_-LRB-2014_film-RRB-.txt',
 '4. /home/moinnadeem/Documents/UROP/wiki-pages/processed_pages/A_Thousand_Times_Good_Night.txt',
 '5. /home/moinnadeem/Documents/UROP/wiki-pages/processed_pages/New_Amsterdam_-LRB-TV_series-RRB-.txt',
 '6. /home/moinnadeem/Documents/UROP/wiki-pages/processed_pages/The_Baker_-LRB-film-RRB-.txt',
 '7. /home/moinnadeem/Documents/UROP/wiki-pages/processed_pages/Nikolaj.txt',
 '8. /home/moinnadeem/Documents/UROP/wiki-pages/processed_pages/Nikolaj_Coster-Waldau.txt',
 '9. /home/moinnadeem/Documents/UROP/wiki-pages/processed_pages/Coster.txt',
 '10. /home/moinnadeem/Documents/UROP/wiki-pages/processed_pag

In [46]:
calculate_precision(retrieved, relevant, 10)

0.1

In [41]:
calculate_recall(retrieved, relevant, 10)

0.5

## Applying Statistics to Dataset

In [67]:
k = [1,2, 5,10]

In [68]:
def score_claim(claim):
    cleaned_claim = claim.replace("/", " ")
    choices = query_lucene(cleaned_claim)
    retrieved = process_lucene_output(choices)
    relevant = claim_to_article[claim]
    mAP = {}
    for i in k:
        precision = calculate_precision(retrieved=retrieved, relevant=relevant, k=i)
        recall = calculate_recall(retrieved=retrieved, relevant=relevant, k=i)
        mAP[i] = {}
        mAP[i]['precision'] = precision
        mAP[i]['recall'] = recall
    return mAP

In [73]:
result = Parallel(n_jobs=8, verbose=1)(delayed(score_claim)(k) for k in list(claim_to_article.keys())[:500])

[Parallel(n_jobs=8)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=8)]: Done  34 tasks      | elapsed:    6.1s
[Parallel(n_jobs=8)]: Done 184 tasks      | elapsed:   24.2s
[Parallel(n_jobs=8)]: Done 434 tasks      | elapsed:   54.6s
[Parallel(n_jobs=8)]: Done 500 out of 500 | elapsed:  1.0min finished


In [79]:
with open("result.pkl", "wb") as f:
    pickle.dump(result, f)

In [74]:
def calculatemAP(mAP, k):
    mAP_final = {}
    
    for i in k:
        mAP_final[i] = {}
        mAP_final[i]['precision'] = []
        mAP_final[i]['recall'] = []
        
    for ap in mAP:
        for k, v in ap.items():
            mAP_final[k]['precision'].append(v['precision'])
            mAP_final[k]['recall'].append(v['recall'])

    return mAP_final

def displaymAP(mAP):
    for k, v in mAP.items():
        for k_i, v_i in v.items():
            print("{} @ {}: {}".format(k_i, k, np.mean(v_i)))

In [83]:
mAP = calculatemAP(result, k)

In [84]:
displaymAP(mAP)

recall @ 1: 0.19014285714285714
precision @ 1: 0.020200000000000003
recall @ 2: 0.2778095238095238
precision @ 2: 0.029600000000000005
recall @ 10: 0.5027619047619047
precision @ 10: 0.0542
recall @ 5: 0.41695238095238096
precision @ 5: 0.0446
