In [None]:
from google.colab import drive

drive.mount('/content/drive', force_remount=True)
repository = 'evaluating_factuality_word_definitions'

%cd /content/drive/My Drive/{repository}

In [None]:
!pip install datasets
!pip install peft
!pip install rank_bm25

In [1]:
from datasets import Dataset
from dataset.def_dataset import DefinitionDataset, Fact
from config import DB_URL
from transformers import AutoTokenizer
from models.evidence_selection_model import EvidenceSelectionModel
from peft import AutoPeftModelForFeatureExtraction
import torch
from fever_scorer import fever_score
from pipeline.pipeline import TestPipeline
from pipeline.pipeline import WikiPipeline
from utils import convert_document_id_to_word
from sklearn.metrics import classification_report
from tqdm import tqdm

In [2]:
dataset_query = """
select dd.id, docs.document_id, docs.text, dd.claim, dd.label, group_concat(dd.evidence_sentence_id, ';') as evidence_lines
from def_dataset dd
    join documents docs on docs.document_id = dd.evidence_wiki_url
    -- join atomic_facts af on af.claim_id = dd.id
where set_type='{set_type}' -- and length(claim) < 50 and length(docs.text) < 400
group by dd.id, evidence_annotation_id, evidence_wiki_url
limit 40
"""

raw_dataset = Dataset.from_sql(dataset_query.format(set_type='dev'), con=DB_URL)
dataset = DefinitionDataset(raw_dataset, tokenizer=None, model='claim_verification')

In [3]:
from transformers import AutoModel

device = "cuda" if torch.cuda.is_available() else "cpu"

#model_name = 'google/bigbird-roberta-large'
#model = AutoPeftModelForFeatureExtraction.from_pretrained('selection_model_intermediate_04-30_09-40')

selection_model_tokenizer = AutoTokenizer.from_pretrained('Snowflake/snowflake-arctic-embed-m-long')
model = AutoModel.from_pretrained('Snowflake/snowflake-arctic-embed-m-long', trust_remote_code=True, add_pooling_layer=False, safe_serialization=True)
selection_model = EvidenceSelectionModel(model).to(device)
#selection_model_tokenizer = AutoTokenizer.from_pretrained(model_name)

# still using base
verification_model=None
verification_model_tokenizer=None

<All keys matched successfully>


In [4]:
from utils import build_fever_instance

test_pipeline = TestPipeline(selection_model=selection_model,selection_model_tokenizer=selection_model_tokenizer)

#test_pipeline = TestPipeline()

pr_labels = []
gt_labels = []
fever_instances = []
for entry in tqdm(dataset):
    output = test_pipeline.verify(entry['document_id'], entry['claim'])
    pr_labels.extend([fact.to_factuality() for fact in output['factualities']])
    gt_labels += [Fact[entry['label']].to_factuality()] * len(output['factualities'])

    evidence = entry['evidence_lines'].split(';')
    predicted_label = output.get('factualities')[0]  # TODO add atomic fact support
    predicted_evidence = output.get('evidences')
    fever_instance = build_fever_instance(entry['label'], evidence, entry['document_id'], predicted_label, predicted_evidence)
    fever_instances.append(fever_instance)

print(classification_report(gt_labels, pr_labels, zero_division=0))
strict_score, label_accuracy, precision, recall, f1 = fever_score(fever_instances)

print(strict_score)  
print(label_accuracy) 
print(precision)  # TP / TP + FP not too important, rather at least one TP than none
print(recall)     # more important
print(f1)

100%|██████████| 36/36 [00:22<00:00,  1.62it/s]

              precision    recall  f1-score   support

          -1       0.00      0.00      0.00         0
           0       0.90      0.56      0.69        16
           1       0.84      0.93      0.88        28

    accuracy                           0.80        44
   macro avg       0.58      0.50      0.52        44
weighted avg       0.86      0.80      0.81        44

0.7777777777777778
0.8055555555555556
0.5324074074074074
0.9444444444444444
0.6809474050853361





In [5]:
print([int(label) for label in pr_labels])
print(gt_labels)

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, -1, -1, -1, 1, 1, 0, 1, -1, 1, 0, -1, -1, -1, -1]
[1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0]


In [5]:
strict_score, label_accuracy, precision, recall, f1 = fever_score(fever_instances, use_gold_labels=True)
print(strict_score)  
print(label_accuracy) 
print(precision)  
print(recall) 
print(f1)

0.9444444444444444
1.0
0.5324074074074074
0.9444444444444444
0.6809474050853361


In [7]:
pipeline = WikiPipeline(selection_model=selection_model, selection_model_tokenizer=selection_model_tokenizer)

pr_labels = []
gt_labels = []
for entry in tqdm(dataset):
    word = convert_document_id_to_word(entry['document_id'])
    
    factuality = pipeline.verify(word, entry['claim'])
    pr_labels.extend([fact.to_factuality() for fact in factuality])
    gt_labels += [Fact[entry['label']].to_factuality()] * len(factuality)

print(classification_report(gt_labels, pr_labels, zero_division=0))
print(classification_report(gt_labels, pr_labels, zero_division=0))
strict_score, label_accuracy, precision, recall, f1 = fever_score(fever_instances)

print(strict_score)  
print(label_accuracy) 
print(precision)  
print(recall)  
print(f1)

  0%|          | 0/20 [00:04<?, ?it/s]


AttributeError: 'str' object has no attribute 'to_factuality'

In [None]:
print([int(label) for label in pr_labels])
print(gt_labels)