In [1]:
import os
os.chdir("..")

In [15]:
import json
from collections import Counter
import spacy
from spacy import displacy
from collections import Counter
import en_core_web_sm
nlp = en_core_web_sm.load()
import string
from os import listdir
from tqdm import tqdm
import nltk
from nltk.corpus import stopwords as get_stopwords
from nltk.tokenize import word_tokenize
from classes.constants import *
from classes.claim import Claim
from util.claim_util import *
from util.undefined_util import *
from util.evaluate_util import *
from util.wiki_util import *
from typing import List
from random import sample

In [3]:
train = load_claims_train()

In [18]:
train_no_nei = [t for t in train if t.label != Claim.NEI]

In [21]:
train_subset = sample(train_no_nei, 100)

In [23]:
Counter([t.label for t in train_subset])

Counter({'SUPPORTS': 64, 'REFUTES': 36})

In [5]:
wiki_titles = load_wikipedia_titles() 

In [6]:
c = train[0]

In [7]:
c

SUPPORTS: Nikolaj Coster-Waldau worked with the Fox Broadcasting Company.

In [8]:
claim = c.claim

In [9]:
claim

'Nikolaj Coster-Waldau worked with the Fox Broadcasting Company.'

In [10]:
def evaluate_documents_sample(train_subset: List[Claim], wiki_titles):
    documents : List[set] = []
    for claim in tqdm(train_subset):
        documents.append(convert_wikipedia_titles_to_ids(retrieve_documents(claim.claim, wiki_titles)))
    return evaluate_documents_retrieval_full([c.evidence_document_sets() for c in train_subset], documents)

In [11]:
def retrieve_documents(claim: str, wiki_titles: List[str]) -> set:
    documents = set()
    for title in wiki_titles:
        if title in claim:
            documents.add(title)
    return documents

In [39]:
def retrieve_documents(claim: str, wiki_titles: List[str]) -> set:
    documents = set()
    stopwords = get_stopwords.words("english")
    claim_entities = tokenize(get_entities(claim), stopwords)
    for title in tqdm(wiki_titles):
        if tokenize([title], stopwords)[0] in claim_entities:
            documents.add(title)
    return documents

In [45]:
stopwords = get_stopwords.words("english")

In [76]:
c = train[3]

In [77]:
c.claim

'Adrienne Bailon is an accountant.'

In [78]:
get_entities(c.claim)

{'Adrienne Bailon'}

In [79]:
c.all_evidence_documents()

{'Adrienne_Bailon'}

In [82]:
n = nlp('Calvin Harris is a person with at least one vocation.')

In [84]:
([(X.text, X.label_) for X in n.ents])

[('Calvin Harris', 'PERSON'), ('at least one', 'CARDINAL')]

In [81]:
[(t.claim, t.all_evidence_documents(), get_entities(t.claim)) for t in train_subset]

[('Everton F.C. played at Wembley Stadium.',
  {'1995_FA_Cup_Final', 'Everton_F.C.'},
  {'Everton F.C.', 'Wembley Stadium'}),
 ('Dr. Dre established his own country.', {'Dr._Dre'}, {'Dre'}),
 ('Seiko has only ever sold machines.', {'Seiko'}, set()),
 ('Calvin Harris is a person with at least one vocation.',
  {'Calvin_Harris'},
  {'Calvin Harris', 'at least one'}),
 ('Night Attack at Târgovişte was fought on a Monday.',
  {'Night_Attack_at_Târgovişte'},
  {'Monday', 'Night Attack', 'Târgovişte'}),
 ('Russell Crowe portrayed Maximus Decimus Meridius, a Roman General.',
  {'Russell_Crowe'},
  {'Maximus Decimus Meridius', 'Roman', 'Russell Crowe'}),
 ('The Lion King was only released in 1345.',
  {"Disney's_Hollywood_Studios",
   'Disney_Renaissance',
   'Don_Hahn',
   'Elton_John',
   'George_Scribner',
   'Hamlet',
   'Jeffrey_Katzenberg',
   'List_of_Walt_Disney_Animation_Studios_films',
   'Matthew_Broderick',
   'Pocahontas_(1995_film)',
   'Simba',
   'The_Lion_King',
   'The_

In [40]:
documents = retrieve_documents(claim, wiki_titles)

100%|██████████| 5416536/5416536 [01:13<00:00, 73896.64it/s]


In [41]:
documents

{'Fox Broadcasting Company', 'Nikolaj Coster-Waldau'}

In [42]:
evaluate_document_retrieval_full(c.evidence_document_sets(), convert_wikipedia_titles_to_ids(documents))

{'precision': 1.0,
 'recall': 1.0,
 'f1_score': 1.0,
 'f2_score': 1.0,
 'true_positives': 2,
 'false_positives': 0,
 'false_negatives': 0,
 'oracle_accuracy': 1}

In [24]:
evaluate_documents_sample(train_subset, wiki_titles)

100%|██████████| 100/100 [02:04<00:00,  1.25s/it]


{'average_precision': 0.08449472926139479,
 'average_recall': 0.7242811355311355,
 'average_f1_score': 0.1424154258955716,
 'average_f2_score': 0.25717014456216886,
 'oracle_accuracy': 0.77}

In [15]:
documents

['Cos',
 'Cost',
 'Fox Broadcasting Company',
 'Wald',
 'Wal',
 'W',
 'Nikola',
 'Nikolaj Coster-Waldau',
 'Nikolaj',
 'Fox',
 'Coste',
 'Coster',
 'F',
 'Bro',
 'Broa',
 'Com',
 'Comp',
 'B',
 'Broad',
 'Broadcasting',
 'N',
 'Nikol',
 'Waldau',
 'Nik',
 'Niko',
 'C',
 'Company']

In [12]:
len(documents)

27

In [9]:
tokenize(claim)

['nikolaj', 'costerwaldau', 'worked', 'fox', 'broadcasting', 'company']

In [26]:
get_entities(claim)

{'Nikolaj Coster-Waldau', 'the Fox Broadcasting Company'}

In [30]:
tokenize(get_entities(claim))

['nikolaj costerwaldau', 'fox broadcasting company']

In [30]:
from sklearn.feature_extraction.text import TfidfVectorizer
corpus = [
    'This is the first document.',
    'This document is the second document.',
    'And this is the third one.',
    'Is this the first document?',
]
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)
print(vectorizer.get_feature_names())

print(X.shape)

['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']
(4, 9)


In [168]:
X.todense()

matrix([[0.        , 0.46979139, 0.58028582, 0.38408524, 0.        ,
         0.        , 0.38408524, 0.        , 0.38408524],
        [0.        , 0.6876236 , 0.        , 0.28108867, 0.        ,
         0.53864762, 0.28108867, 0.        , 0.28108867],
        [0.51184851, 0.        , 0.        , 0.26710379, 0.51184851,
         0.        , 0.26710379, 0.51184851, 0.26710379],
        [0.        , 0.46979139, 0.58028582, 0.38408524, 0.        ,
         0.        , 0.38408524, 0.        , 0.38408524]])

In [31]:
vectorizer = TfidfVectorizer()
X = vectorizer.fit(titles)

In [36]:
vectorizer.get_feature_names()[100000:100010]

['autengruber',
 'autenrieth',
 'autenti',
 'autentico',
 'auterac',
 'auterive',
 'autero',
 'auteroche',
 'auterrive',
 'autery']

In [52]:
result = vectorizer.transform([claim])

In [70]:
result_row = np.ndarray.flatten(result.getrow(0).toarray())

In [71]:
result_row.shape

(1442247,)

In [42]:
import numpy as np

In [87]:
top_3 = np.argsort(result_row)[-3:][::-1]

In [88]:
result[0, top_3].todense()

matrix([[0.43705291, 0.43256636, 0.40342318]])

In [45]:
result[0, np.argmax(result)]

0.43705291143195724

In [48]:
vectorizer.get_feature_names()[np.argmax(result)]

'worked'

In [89]:
np.array(vectorizer.get_feature_names())[top_3]

array(['worked', 'waldau', 'nikolaj'], dtype='<U182')