In [81]:
#!pip install pyserini==0.8.1.0
import re
import sys

# !python -m spacy download en_core_web_sm
import en_core_web_sm
import spacy
from anglicize import anglicize
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from nltk.tokenize import word_tokenize
from pyserini.search import pysearch
from spacy.lang.en import English
from spacy.tokenizer import Tokenizer

nlp = en_core_web_sm.load()
nlp.add_pipe("merge_noun_chunks")
tokenizer = Tokenizer(nlp.vocab)
import json
import time
from timeit import default_timer as timer

import numpy as np
from gensim.models.keyedvectors import KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

DATA_TRAIN = "train.jsonl"
DATA_DEV = "dev.jsonl"

In [5]:
model = KeyedVectors.load_word2vec_format(
    "enwiki_20180420_win10_100d.txt", binary=False
)
vocab = model.key_to_index.keys()

print(len(vocab))

4530030


In [114]:
tfidf_sources = [DATA_TRAIN]
# tfidf_sources = [DATA_TRAIN, DATA_DEV]

from sklearn.feature_extraction.text import TfidfVectorizer


def tfidf_tokenize(text):
    tokens = []
    for chunk in nlp(text):
        if chunk.pos_ in ["VERB", "NOUN"]:
            tokens.extend(word_tokenize(anglicize(chunk.lemma_)))
    return tokens


tfidf = TfidfVectorizer(tokenizer=tfidf_tokenize, stop_words=stopwords.words("english"))

corpus = []

for tfidf_source in tfidf_sources:
    with open(tfidf_source, "r", encoding="utf-8") as f:
        tfidf_dataset = [json.loads(line)["claim"] for line in f]

    corpus.extend(tfidf_dataset)

print("Documents in TF-IDF corpus:", len(corpus))
# print(corpus[0:10])

tfidf.fit(corpus)
del corpus

tfidf_features = np.array(tfidf.get_feature_names())

print(tfidf_features[0:10])

Documents in TF-IDF corpus: 37803




['!' '#' '$' '%' '&' "'" "''" "''hockey" "'55" "'70"]


In [115]:
tfidf_sorting = np.argsort(
    tfidf.transform(
        [
            "Tammy Garcia was born in California but currently lives in Taos, she comes from a long line of Santa Clara Pueblo artists and her great-great-great grandmother Sara Fina Tafoya was a potter."
        ]
    ).toarray()
).flatten()[::-1]

tfidf_features[tfidf_sorting][
    : len(
        tfidf_tokenize(
            "Tammy Garcia was born in California but currently lives in Taos, she comes from a long line of Santa Clara Pueblo artists and her great-great-great grandmother Sara Fina Tafoya was a potter."
        )
    )
][0:3]

array(['tafoya', 'great-great-great', 'tammy'], dtype='<U37')

In [126]:
lucene_dir = "anserini/indexes/fever/lucene-index-fever-paragraph"
searcher = pysearch.SimpleSearcher(lucene_dir)


def judge(
    dataset,
    FULL_CLAIM,
    SPACY_ENTITIES,
    CASE_ENTITIES,
    ADD_SYNONYMS,
    ADD_SIMILAR,
    LINKED_PAGES,
    CATCH_VERBS,
    TFIDF_TOP,
    FIND_ALL,
    SIMILAR_THRESHOLD=0.85,
    TFIDF_TOP_N=3,
):

    if not SPACY_ENTITIES and not CASE_ENTITIES and not LINKED_PAGES:
        FULL_CLAIM = True

    found = 0
    claims = 0

    start_timer = timer()
    for data in dataset:
        claims += 1

        claim = nlp(
            data["claim"].replace(" ", " ")
        )  # Replaces the obnoxious space character with normal space

        keywords = set()
        entities = set()

        if SPACY_ENTITIES:
            spacy_entities = [X.text for X in claim.ents]
            entities.update(spacy_entities)

        if CASE_ENTITIES:
            case_entities = set()
            chunks = claim.noun_chunks
            for chunk in chunks:
                for token in tokenizer(chunk.text):
                    if token.text[0].isupper():
                        case_entities.add(chunk.text)
                        break

            entities.update(case_entities)
            # print(case_entities)
            # print(entities)
            # sys.exit(0)

        keywords.update(entities)
        # print(keywords)

        if ADD_SYNONYMS:
            for token in claim:
                if token.is_stop:
                    continue
                synonyms = wn.synsets(token.text)
                if synonyms and token:
                    for synonym in synonyms:
                        if (
                            synonym.pos() == token.pos_[0].lower()
                            and synonym.pos() == "n"
                        ):
                            keywords.update(
                                [
                                    lemma.replace("_", " ")
                                    for lemma in synonym.lemma_names()
                                ]
                            )

        if ADD_SIMILAR:
            similar_check = [keyword for keyword in list(keywords) if keyword in vocab]

            if similar_check:
                similar_words = model.most_similar(positive=similar_check)
                for i in range(0, len(similar_words)):
                    if i == 0 or similar_words[i][1] >= SIMILAR_THRESHOLD:
                        keywords.update([similar_words[i][0]])

        if CATCH_VERBS:
            for chunk in claim:
                if (
                    chunk.pos_ == "VERB"
                    and chunk.dep_ != "case"
                    and chunk.dep_ != "prep"
                ):
                    keywords.update([chunk.lemma_])

        if TFIDF_TOP:
            tfidf_sorting = np.argsort(
                tfidf.transform([claim.text]).toarray()
            ).flatten()[::-1]
            tfidf_len = len(tfidf_tokenize(claim.text))
            keywords.update(
                tfidf_features[tfidf_sorting][0 : min([TFIDF_TOP_N, tfidf_len])]
            )

        if not FULL_CLAIM and not keywords:
            continue
        elif not FULL_CLAIM:
            search_query = anglicize(", ".join(keywords))
        else:
            search_query = anglicize(
                claim.text + " " + ", ".join(keywords) + ", ".join(entities)
            )

        try:
            lucene_hits = searcher.search(search_query, k=50)
        except:
            with open("error_pages.txt", "a", encoding="utf-8") as f:
                f.write("{}\n".format(claim.text))
            lucene_hits = None

        if not lucene_hits:
            continue

        linked_pages = set()
        if LINKED_PAGES:
            for hit in lucene_hits:
                links = re.findall(r"(?:\[\[)(.*?)(?:\|)", hit.raw)
                for link in links:
                    linked_pages.update([link.replace("_", " ").lower()])

        expected_pages = set()

        for evidence in data["evidence"]:
            for content in evidence["content"]:
                expected_pages.add(content.split("_")[0])

        # If FIND_ALL, all evidence pages must be retrieved to be considered successful.
        # Otherwise, finding at least one of the evidence pages is enough.
        if FIND_ALL:
            is_found = True
        else:
            is_found = False

        for page in expected_pages:
            expected_found = False
            for hit in lucene_hits:
                if FIND_ALL and (
                    page.lower() == hit.docid.lower() or page.lower() in linked_pages
                ):
                    expected_found = True
                    break
                elif page.lower() == hit.docid.lower() or page.lower() in linked_pages:
                    is_found = True
                    break

            if FIND_ALL and not expected_found:
                is_found = False
                break
            elif is_found:
                break

        if is_found:
            found += 1

    end_timer = timer()
    elapsed = int(end_timer - start_timer)
    elapsed_formatted = time.strftime("%H:%M:%S", time.gmtime(elapsed))
    print(
        "PARAMETERS: \nFULL_CLAIM: {}, \nSPACY_ENTITIES: {}, \nCASE_ENTITIES: {}, \nADD_SYNONYMS: {}, \nADD_SIMILAR: {}, \nLINKED_PAGES: {}, \nCATCH_VERBS: {}, \nTFIDF_TOP: {}, \nFIND_ALL: {}".format(
            FULL_CLAIM,
            SPACY_ENTITIES,
            CASE_ENTITIES,
            ADD_SYNONYMS,
            ADD_SIMILAR,
            LINKED_PAGES,
            CATCH_VERBS,
            TFIDF_TOP,
            FIND_ALL,
        )
    )
    print("Found {} out of {} claims ({}%)".format(found, claims, 100 * found / claims))
    print("Elapsed:", elapsed_formatted)
    return (found / claims), elapsed

In [36]:
dataset_path = DATA_DEV
# dataset_path = DATA_TRAIN

with open(dataset_path, "r", encoding="utf-8") as f:
    dataset = [json.loads(line) for line in f]

Base (only the claim):

In [127]:
accuracy_1, elapsed_1 = judge(
    dataset,
    FULL_CLAIM=True,
    SPACY_ENTITIES=False,
    CASE_ENTITIES=False,
    ADD_SYNONYMS=False,
    ADD_SIMILAR=False,
    LINKED_PAGES=False,
    CATCH_VERBS=False,
    TFIDF_TOP=False,
    FIND_ALL=True,
)

PARAMETERS: 
FULL_CLAIM: True, 
SPACY_ENTITIES: False, 
CASE_ENTITIES: False, 
ADD_SYNONYMS: False, 
ADD_SIMILAR: False, 
LINKED_PAGES: False, 
CATCH_VERBS: False, 
TFIDF_TOP: False, 
FIND_ALL: True
Found 6706 out of 7891 claims (84.98289190216703%)
Elapsed: 00:27:14


In [128]:
accuracy_2, elapsed_2 = judge(
    dataset,
    FULL_CLAIM=True,
    SPACY_ENTITIES=True,
    CASE_ENTITIES=True,
    ADD_SYNONYMS=False,
    ADD_SIMILAR=False,
    LINKED_PAGES=False,
    CATCH_VERBS=False,
    TFIDF_TOP=False,
    FIND_ALL=True,
)

PARAMETERS: 
FULL_CLAIM: True, 
SPACY_ENTITIES: True, 
CASE_ENTITIES: True, 
ADD_SYNONYMS: False, 
ADD_SIMILAR: False, 
LINKED_PAGES: False, 
CATCH_VERBS: False, 
TFIDF_TOP: False, 
FIND_ALL: True
Found 6764 out of 7891 claims (85.71790647573185%)
Elapsed: 00:22:49


In [129]:
accuracy_3, elapsed_3 = judge(
    dataset,
    FULL_CLAIM=True,
    SPACY_ENTITIES=True,
    CASE_ENTITIES=True,
    ADD_SYNONYMS=False,
    ADD_SIMILAR=False,
    LINKED_PAGES=False,
    CATCH_VERBS=True,
    TFIDF_TOP=False,
    FIND_ALL=True,
)

PARAMETERS: 
FULL_CLAIM: True, 
SPACY_ENTITIES: True, 
CASE_ENTITIES: True, 
ADD_SYNONYMS: False, 
ADD_SIMILAR: False, 
LINKED_PAGES: False, 
CATCH_VERBS: True, 
TFIDF_TOP: False, 
FIND_ALL: True
Found 6761 out of 7891 claims (85.67988848054746%)
Elapsed: 00:25:48


In [130]:
# Best when entities are not inserted to the query twice.
# One of the best and fastest when entities are inserted to the query twice.
accuracy_4, elapsed_4 = judge(
    dataset,
    FULL_CLAIM=True,
    SPACY_ENTITIES=True,
    CASE_ENTITIES=True,
    ADD_SYNONYMS=False,
    ADD_SIMILAR=False,
    LINKED_PAGES=True,
    CATCH_VERBS=False,
    TFIDF_TOP=False,
    FIND_ALL=True,
)

PARAMETERS: 
FULL_CLAIM: True, 
SPACY_ENTITIES: True, 
CASE_ENTITIES: True, 
ADD_SYNONYMS: False, 
ADD_SIMILAR: False, 
LINKED_PAGES: True, 
CATCH_VERBS: False, 
TFIDF_TOP: False, 
FIND_ALL: True
Found 7069 out of 7891 claims (89.58306931947789%)
Elapsed: 00:25:02


In [131]:
accuracy_5, elapsed_5 = judge(
    dataset,
    FULL_CLAIM=True,
    SPACY_ENTITIES=True,
    CASE_ENTITIES=True,
    ADD_SYNONYMS=False,
    ADD_SIMILAR=False,
    LINKED_PAGES=True,
    CATCH_VERBS=True,
    TFIDF_TOP=False,
    FIND_ALL=True,
)

PARAMETERS: 
FULL_CLAIM: True, 
SPACY_ENTITIES: True, 
CASE_ENTITIES: True, 
ADD_SYNONYMS: False, 
ADD_SIMILAR: False, 
LINKED_PAGES: True, 
CATCH_VERBS: True, 
TFIDF_TOP: False, 
FIND_ALL: True
Found 7069 out of 7891 claims (89.58306931947789%)
Elapsed: 00:27:10
