In [1]:
import wikipediaapi
import pandas as pd
import nltk

import tensorflow as tf
import tensorflow_hub as hub
from scipy import spatial



2023-11-28 17:30:58.864479: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
wiki_wiki = wikipediaapi.Wikipedia('FactCheck (ociolli@ucsd.edu)', 'en')

In [3]:
#nltk.download('punkt')
from nltk.tokenize import sent_tokenize


[nltk_data] Downloading package punkt to /home/ociolli/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Load the Universal Sentence Encoder model
embed = hub.load("https://tfhub.dev/google/universal-sentence-encoder/4")

In [None]:
import spacy
spacy.cli.download("en_core_web_sm")
nlp = spacy.load("en_core_web_sm")

# Functions

In [6]:
def get_entities(text):
    doc = nlp(text)
    entities = [(ent.text, ent.label_) for ent in doc.ents]
    return entities

In [7]:
def naive_similarity(sentence1, sentence2):
    # Process the sentences using spaCy
    doc1 = nlp(sentence1)
    doc2 = nlp(sentence2)

    # Compute the similarity between the two sentences
    similarity = doc1.similarity(doc2)

    return similarity

In [8]:
def compute_similarity(sentence1, sentence2):
    
    embedding1 = embed([sentence1])[0].numpy()
    embedding2 = embed([sentence2])[0].numpy()

    # Compute the cosine similarity between the two sentence embeddings
    similarity = 1 - spatial.distance.cosine(embedding1, embedding2)

    return similarity

# Test
# sentence1 = "I love coding in Python."
# sentence2 = "Python is my favorite programming language."

# similarity_score = compute_similarity(sentence1, sentence2)
# print(f"Similarity Score: {similarity_score}")

Similarity Score: 0.7764968276023865


In [9]:
def split_into_sentences(text):
    sentences = sent_tokenize(text)
    return sentences

In [10]:
def retrieve_documents(claim):
    docs = []
    
    entities = get_entities(claim)
    
    for entity in entities:
        docs.append(wiki_wiki.page(entity[0]).text)
    
    return docs

In [11]:
def top_k_sentences(claim, split_sentences, k):
    sims = []
    
    for sentence in split_sentences: 
        sims.append((sentence, compute_similarity(sentence, claim)))

    sims = sorted(sims, key=lambda x: x[1], reverse = True)
    return [elem[0] for elem in sims[:k]]


In [12]:
def produce_evidence(claim, k):
    docs = retrieve_documents(claim)
    evidence = []
    
    for doc in docs:
        sentences = split_into_sentences(doc)
        evidence += top_k_sentences(claim, sentences, k)
        
    return evidence

In [22]:
sentence = 'Joe Biden is the president of the United States'

In [65]:
test = produce_evidence(sentence, 3)

# Model

In [1]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

In [10]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch

model = AutoModelForSequenceClassification.from_pretrained('cross-encoder/nli-roberta-base')
tokenizer = AutoTokenizer.from_pretrained('cross-encoder/nli-roberta-base')


In [32]:
features = tokenizer(['Joseph R. Biden Jr. is an American politician who is the 46th and current president of the United States. He has been president for 4 years'], ['Joseph Biden is the current president of the United States.'], 
                     padding=True, truncation=True, return_tensors="pt")

In [33]:
model.eval()
with torch.no_grad():
    scores = model(**features).logits
    label_mapping = ['contradiction', 'entailment', 'neutral']
    labels = [label_mapping[score_max] for score_max in scores.argmax(dim=1)]
    print(labels)

['entailment']


# Experiment with full pipeline

In [None]:
#premise (the ground truth), then hypothesis (claim)

In [35]:
' '.join(["sentence 1.", "sentence 2"])

'sentence 1. sentence 2'

In [50]:
def pipeline(claim, k):
    
    evidence = produce_evidence(claim, k)[:k] #just a test
    premise = ' '.join(evidence)
    
    features = tokenizer([premise], [claim], 
                     padding=True, truncation=True, return_tensors="pt")
    
    
    model.eval()
    with torch.no_grad():
        scores = model(**features).logits
        label_mapping = ['contradiction', 'entailment', 'neutral']
        labels = [label_mapping[score_max] for score_max in scores.argmax(dim=1)]
    return labels

In [51]:
pipeline('joe biden is the president of the united states', 2)

['entailment']

In [52]:
pipeline('the 2020 election was won by Donald Trump', 2)

['contradiction']

In [53]:
pipeline('the 2020 election was stolen from Donald Trump', 2)

['contradiction']

# Trying with data

In [54]:
import pandas as pd
import numpy as np
import tqdm

In [55]:
df = pd.read_csv('liar_plus/train2.tsv', delimiter='\t', header = None)
df = df.drop(columns = [0])

In [57]:
df.rename({1: 'id', 2: 'label', 3: 'statement', 4: 'subject', 5: 'speaker', 6: 'job-title',
           7: 'state_info', 8: 'party_affiliation', 9: 'barely_true_counts', 10: 'false_counts',
           11: 'half_true_counts', 12: 'mostly_true_counts', 13: 'pants_on_fire_counts', 14: 'context',
           15: 'justification'
          }, axis = 1, inplace = True)

df = df[~df['statement'].isna()]

In [61]:
df['label'] = df['label'].replace({'pants-fire': 'contradiction', 'false': 'contradiction', 
                     'barely-true': 'neutral', 
                     'half-true': 'neutral', 'mostly-true': 'entailment', 'true': 'entailment'})

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['label'] = df['label'].replace({'pants-fire': 'contradiction', 'false': 'contradiction',


In [72]:
sample = df.sample(df.shape[0] // 10)

In [73]:
sample_statements = list(sample['statement'])
    

In [78]:
from tqdm import tqdm

In [79]:
sample_preds = []

for statement in tqdm(sample_statements):
    sample_preds.append(pipeline(statement, 3))

  6%|▌         | 60/1024 [09:58<2:40:21,  9.98s/it]


KeyboardInterrupt: 

In [82]:
sample_labels = sample['label'].iloc[:60]

In [89]:
vals = [sam[0] for sam in sample_preds]

In [90]:
(np.array(vals) == sample_labels).mean()

0.2833333333333333