In [1]:
from google.colab import drive

drive.mount('/content/drive', force_remount=True)
repository = 'evaluating_factuality_word_definitions'

%cd /content/drive/My Drive/{repository}

Mounted at /content/drive
/content/drive/My Drive/evaluating_factuality_word_definitions


In [2]:
!pip install datasets
!pip install peft
!pip install rank_bm25
!pip install einops

Collecting datasets
  Downloading datasets-2.19.1-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m5.3 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m8.0 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m9.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m12.8 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub>=0.21.2 (from datasets)
  Downloading huggingface_hub-0.23.0-py3-none-any

In [3]:
!python -m spacy download en_core_web_lg
!python -m spacy download de_core_news_lg

Collecting en-core-web-lg==3.7.1
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_lg-3.7.1/en_core_web_lg-3.7.1-py3-none-any.whl (587.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m587.7/587.7 MB[0m [31m2.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: en-core-web-lg
Successfully installed en-core-web-lg-3.7.1
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_lg')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.
Collecting de-core-news-lg==3.7.0
  Downloading https://github.com/explosion/spacy-models/releases/download/de_core_news_lg-3.7.0/de_core_news_lg-3.7.0-py3-none-any.whl (567.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m567.8

In [4]:
from datasets import Dataset
from dataset.def_dataset import DefinitionDataset, Fact
from config import DB_URL
from transformers import AutoTokenizer
from models.evidence_selection_model import EvidenceSelectionModel
from peft import AutoPeftModelForFeatureExtraction
import torch
from fever_scorer import fever_score
#from pipeline.pipeline import TestPipeline
#from pipeline.pipeline import WikiPipeline
from utils import convert_document_id_to_word
from sklearn.metrics import classification_report
from tqdm import tqdm

In [5]:
dataset_query = """
select dd.id, docs.document_id, docs.text, dd.claim, dd.label, group_concat(dd.evidence_sentence_id, ';') as evidence_lines
from def_dataset dd
    join documents docs on docs.document_id = dd.evidence_wiki_url
    -- join atomic_facts af on af.claim_id = dd.id
where set_type='{set_type}' -- and length(claim) < 50 and length(docs.text) < 400
group by dd.id, evidence_annotation_id, evidence_wiki_url
"""

raw_dataset = Dataset.from_sql(dataset_query.format(set_type='dev'), con=DB_URL)
dataset = DefinitionDataset(raw_dataset, tokenizer=None, model='claim_verification')

Generating train split: 0 examples [00:00, ? examples/s]

Filter:   0%|          | 0/2688 [00:00<?, ? examples/s]

# Pipeline class

In [31]:
"""Pipelines for the claim verification process."""
from typing import List, Dict, Tuple

import torch
from sklearn.metrics import accuracy_score, f1_score
from tqdm import tqdm
from transformers import BigBirdModel, AutoTokenizer, AutoModelForSequenceClassification
from torch.nn.functional import cosine_similarity

from database.db_retriever import FeverDocDB
from dataset.def_dataset import Fact, process_sentence, process_lines, split_text
from fetchers.wikipedia import Wikipedia
from models.claim_verification_model import ClaimVerificationModel
from models.evidence_selection_model import EvidenceSelectionModel
from utils import rank_docs


class Pipeline:
    """General Pipeline. Implement fetch_evidence, select_evidence, verify_claim."""

    def __init__(self):
      self.device = "cuda" if torch.cuda.is_available() else "cpu"

    def verify(self, word: str, claim: str) -> Dict:
        """
        Verify a claim related to a word.
        :param word: Word associated to the claim.
        :param claim: Claim to be verified.
        :return: dict containing factuality, atomic claim factualities and selected evidences.
        """
        ev_sents = self.fetch_evidence(word)
        selected_evidences = self.select_evidence(claim, ev_sents)   # we need to know the line and the page the info was taken from
        selected_ev_sents = [evidence[2] for evidence in selected_evidences]
        atomic_claims = self.process_claim(claim)

        total_factuality = 0
        factualities = []
        for atomic_claim in atomic_claims:
            factuality = self.verify_claim(atomic_claim, selected_ev_sents)
            total_factuality += 1 if factuality == Fact.SUPPORTS else 0
            factualities.append(factuality)

        return {'factuality': total_factuality / len(atomic_claims),
                'factualities': factualities,
                'evidences': [(evidence[0], evidence[1]) for evidence in selected_evidences]}

    @staticmethod
    def process_claim(claim: str) -> List[str]:
        """Process a claim. E.g. split it into its atomic facts."""
        return [claim]

    def fetch_evidence(self, word: str) -> List[Tuple[str, List[str], List[str]]]:
        """
        Fetch the information of the word inside the knowledge base.
        :param word: Word, for which we need information.
        :return: List of sentences, representing all information known to the word.
        """

    def select_evidence(self, claim: str, evidence_list: List[Tuple[str, List[str], List[str]]]) -> List[Tuple[str, str, str]]:
        """
        Select sentences possibly containing evidence for the claim.
        :param claim: Claim to be verified.
        :param evidence_list: Sentences to choose from. Can be from multiple sources.
        :return: List of sentences, possibly containing evidence.
        """

    def verify_claim(self, claim: str, sentences: List[str]) -> Fact:
        """
        Verify the claim using sentences as evidence.
        :param claim: Claim to be verified.
        :param sentences: Sentences to use as evidence.
        :return: either Fact.SUPPORTS, Fact.REFUTES or Fact.NOT_ENOUGH_INFO
        """


class ModelPipeline(Pipeline):
    """Pipeline using llm models."""

    def __init__(self, selection_model=None, selection_model_tokenizer=None,
                 verification_model=None, verification_model_tokenizer=None):
        super().__init__()
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

        if not selection_model:
            model_name = 'google/bigbird-roberta-large'
            model = BigBirdModel.from_pretrained(model_name)
            selection_model_tokenizer = AutoTokenizer.from_pretrained(model_name)
            selection_model = EvidenceSelectionModel(model).to(self.device)
        self.selection_model = selection_model
        self.selection_model_tokenizer = selection_model_tokenizer

        if not verification_model:
            model_name = "MoritzLaurer/mDeBERTa-v3-base-xnli-multilingual-nli-2mil7"
            verification_model_tokenizer = AutoTokenizer.from_pretrained(model_name)
            model = AutoModelForSequenceClassification.from_pretrained(model_name)
            verification_model = ClaimVerificationModel(model).to(self.device)
        self.verification_model = verification_model
        self.verification_model_tokenizer = verification_model_tokenizer

    def _build_selection_model_input(self, claim: str, sentences: List[str]):
        encoded_sequence = []
        sentence_mask = []
        for i, sentence in enumerate(sentences):
            encoded_sentence = self.selection_model_tokenizer.encode(sentence)[1:-1]  # + [1]
            encoded_sequence += encoded_sentence
            sentence_mask += [i] * len(encoded_sentence)
            # sentence_mask += [int(i)] + [-1] * (len(encoded_sentence) - 1)  # try only with cls token
            encoded_sequence.append(self.selection_model_tokenizer.sep_token_id)
            sentence_mask.append(-1)

        unique_sentence_numbers = set(sentence_mask)
        sentence_masks = []
        for num in unique_sentence_numbers:
            if num == -1:
                continue
            sentence_masks.append([1 if val == num else 0 for val in sentence_mask])

        return (self.selection_model_tokenizer(claim, return_tensors='pt').to(self.device),
                {'input_ids': torch.tensor(encoded_sequence).unsqueeze(0).to(self.device),
                 'attention_mask': torch.ones(len(encoded_sequence)).unsqueeze(0).to(self.device),
                 'sentence_mask': torch.tensor(sentence_masks).unsqueeze(0).to(self.device)})

    def verify_claim(self, claim: str, sentences: list[str]) -> Fact:
        model_inputs = self._build_verification_model_input(claim, sentences)
        with torch.no_grad():
            output = self.verification_model(**model_inputs)
            predicted = torch.softmax(output['logits'], dim=-1)
            predicted = torch.argmax(predicted, dim=-1).item()
        return Fact(predicted)

    def _build_verification_model_input(self, claim: str, sentences: list[str]):
        hypothesis = ' '.join(sentences)
        model_inputs = self.verification_model_tokenizer(hypothesis, claim)

        return {'input_ids': torch.tensor(model_inputs['input_ids']).unsqueeze(0).to(self.device),
                'attention_mask': torch.tensor(model_inputs['attention_mask']).unsqueeze(0).to(self.device)}


class TestPipeline(ModelPipeline):
    """Pipeline used for test purposes."""

    def fetch_evidence(self, word: str) -> list[tuple[str, list[str], list[str]]]:
        with FeverDocDB() as db:
            lines = db.get_doc_lines(word)

        lines = process_lines(lines)
        processed_lines = []
        line_numbers = []
        for line in lines.split('\n'):
            line = process_sentence(line)
            line_number, text = split_text(line)
            processed_lines.append(text)
            line_numbers.append(line_number)
        return [(word, line_numbers, processed_lines)]

    @staticmethod
    def process_claim(claim: str) -> list[str]:
        #with FeverDocDB() as db:
        #    facts = db.read("""SELECT DISTINCT af.fact
        #                                 FROM atomic_facts af
        #                                 JOIN def_dataset dd ON af.claim_id = dd.id
        #                                 WHERE dd.claim = ?""", params=(claim,))
        #return [fact[0] for fact in facts] if facts else [claim]
        return [claim]

    def select_evidence(self, claim: str, evidence_list: list[tuple[str, list[str], list[str]]], top_k=3) -> list[tuple[str, str, str]]:
        page, line_numbers, sentences = evidence_list[0]  # in test case we only have one page

        claim_model_input, sentences_model_input = self._build_selection_model_input(claim,
                                                                                     sentences)
        with torch.no_grad():
            claim_embedding = self.selection_model(**claim_model_input)
            sentence_embeddings = self.selection_model(**sentences_model_input)
            claim_similarities = cosine_similarity(claim_embedding, sentence_embeddings, dim=2)
            top_indices = torch.topk(claim_similarities,
                                     k=min(top_k, claim_similarities.size(1)))[1].squeeze(0)

        return [(page, line_numbers[idx], sentences[idx]) for idx in top_indices]


class WikiPipeline(ModelPipeline):
    """Pipeline using Wikipedia."""

    def __init__(self, selection_model=None, selection_model_tokenizer=None,
                 verification_model=None, verification_model_tokenizer=None):
        super().__init__(selection_model, selection_model_tokenizer, verification_model,
                         verification_model_tokenizer)
        self.wiki = Wikipedia()

    def fetch_evidence(self, word: str) -> list[tuple[str, list[str], list[str]]]:
        summaries = self.wiki.get_summaries(word, k=20)  # TODO line numbers
        return [(page, [str(i) for i in range(len(lines))], lines) for page, lines in summaries]

    def select_evidence(self, claim: str, evidence_list: list[tuple[str, list[str], list[str]]], top_k=3,
                        max_evidence_count=3) -> list[tuple[str, str, str]]:
        if len(evidence_list) > max_evidence_count:
            ranked_indices = rank_docs(claim, [" ".join(entry[2]) for entry in evidence_list],
                                       k=max_evidence_count)
            evidence_list = [evidence_list[i] for i in ranked_indices]

        sentence_similarities = []
        for page, line_numbers, sentences in evidence_list:
            claim_model_input, sentences_model_input = self._build_selection_model_input(claim,
                                                                                         sentences)
            with torch.no_grad():
                claim_embedding = self.selection_model(**claim_model_input)
                sentence_embeddings = self.selection_model(**sentences_model_input)
                claim_similarities = cosine_similarity(claim_embedding,
                                                       sentence_embeddings, dim=2).tolist()[0]
                sentence_similarity = [(page, *values) for values in zip(line_numbers, sentences, claim_similarities)]
                sentence_similarities.extend(sentence_similarity)

        sorted_sentences = sorted(sentence_similarities, key=lambda x: x[3], reverse=True)
        return [(sentence[0], sentence[1], sentence[2]) for sentence in sorted_sentences[:top_k]]

# Training

In [15]:
from transformers import AutoModel

device = "cuda" if torch.cuda.is_available() else "cpu"

#model_name = 'google/bigbird-roberta-large'
#model = AutoPeftModelForFeatureExtraction.from_pretrained('selection_model_intermediate_04-30_09-40')

selection_model_tokenizer = AutoTokenizer.from_pretrained('Snowflake/snowflake-arctic-embed-m-long')
model = AutoModel.from_pretrained('Snowflake/snowflake-arctic-embed-m-long', trust_remote_code=True, add_pooling_layer=False, safe_serialization=True)
selection_model = EvidenceSelectionModel(model).to(device)
#selection_model_tokenizer = AutoTokenizer.from_pretrained(model_name)

# still using base
verification_model=None
verification_model_tokenizer=None



tokenizer_config.json:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.60k [00:00<?, ?B/s]

configuration_hf_nomic_bert.py:   0%|          | 0.00/1.96k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/Snowflake/snowflake-arctic-embed-m-long:
- configuration_hf_nomic_bert.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_hf_nomic_bert.py:   0%|          | 0.00/52.6k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/Snowflake/snowflake-arctic-embed-m-long:
- modeling_hf_nomic_bert.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors:   0%|          | 0.00/547M [00:00<?, ?B/s]



In [32]:
from utils import build_fever_instance

test_pipeline = TestPipeline(selection_model=selection_model,selection_model_tokenizer=selection_model_tokenizer)

#test_pipeline = TestPipeline()

pr_labels = []
gt_labels = []
fever_instances = []
for entry in tqdm(dataset):
    output = test_pipeline.verify(entry['document_id'], entry['claim'])
    pr_labels.extend([fact.to_factuality() for fact in output['factualities']])
    gt_labels += [Fact[entry['label']].to_factuality()] * len(output['factualities'])

    evidence = entry['evidence_lines'].split(';')
    predicted_label = output.get('factualities')[0]  # TODO add atomic fact support
    predicted_evidence = output.get('evidences')
    fever_instance = build_fever_instance(entry['label'], evidence, entry['document_id'], predicted_label, predicted_evidence)
    fever_instances.append(fever_instance)

print(classification_report(gt_labels, pr_labels, zero_division=0))
strict_score, label_accuracy, precision, recall, f1 = fever_score(fever_instances)

print(strict_score)
print(label_accuracy)
print(precision)  # TP / TP + FP not too important, rather at least one TP than none
print(recall)     # more important
print(f1)

100%|██████████| 2313/2313 [03:07<00:00, 12.33it/s]

              precision    recall  f1-score   support

          -1       0.00      0.00      0.00         0
           0       0.97      0.87      0.92      1238
           1       0.97      0.93      0.95      1075

    accuracy                           0.90      2313
   macro avg       0.64      0.60      0.62      2313
weighted avg       0.97      0.90      0.93      2313

0.8802421098140942
0.8996973627323822
0.42080991497334974
0.9684392563769996
0.5866893420375288





In [None]:
print([int(label) for label in pr_labels])
print(gt_labels)

[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, -1, -1, -1, 1, 1, 0, 1, -1, 1, 0, -1, -1, -1, -1]
[1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0]


In [33]:
strict_score, label_accuracy, precision, recall, f1 = fever_score(fever_instances, use_gold_labels=True)
print(strict_score)
print(label_accuracy)
print(precision)
print(recall)
print(f1)

0.9684392563769996
1.0
0.42080991497334974
0.9684392563769996
0.5866893420375288


In [None]:
pipeline = WikiPipeline(selection_model=selection_model, selection_model_tokenizer=selection_model_tokenizer)

pr_labels = []
gt_labels = []
for entry in tqdm(dataset):
    word = convert_document_id_to_word(entry['document_id'])

    factuality = pipeline.verify(word, entry['claim'])
    pr_labels.extend([fact.to_factuality() for fact in factuality])
    gt_labels += [Fact[entry['label']].to_factuality()] * len(factuality)

print(classification_report(gt_labels, pr_labels, zero_division=0))
print(classification_report(gt_labels, pr_labels, zero_division=0))
strict_score, label_accuracy, precision, recall, f1 = fever_score(fever_instances)

print(strict_score)
print(label_accuracy)
print(precision)
print(recall)
print(f1)

  0%|          | 0/20 [00:04<?, ?it/s]


AttributeError: 'str' object has no attribute 'to_factuality'

In [None]:
print([int(label) for label in pr_labels])
print(gt_labels)