In [1]:
from dataset.def_dataset import DefinitionDataset, Fact
from transformers import AutoTokenizer
from models.evidence_selection_model import EvidenceSelectionModel
from models.claim_verification_model import ClaimVerificationModel
import torch
from transformers import AutoModel, AutoModelForSequenceClassification
from general_utils.fever_scorer import fever_score
from pipeline.pipeline import TestPipeline, WikiPipeline
from general_utils.utils import build_fever_instance
from general_utils.utils import convert_document_id_to_word
from sklearn.metrics import classification_report
from tqdm import tqdm
from general_utils.reader import JSONLineReader

In [2]:
# Shared JSON-lines helper; a later cell calls fh.write(path, lines) to persist
# the per-entry evaluation records.
fh = JSONLineReader()

In [14]:
# Place both models on the GPU when one is available, otherwise on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# --- Evidence selection model -------------------------------------------------
# Alternative checkpoint that was tried here instead of the one below:
#   'Snowflake/snowflake-arctic-embed-m-long'
selection_model_name = 'lukasellinger/evidence_selection_model-v2'
selection_model_tokenizer = AutoTokenizer.from_pretrained(selection_model_name)
selection_model_raw = AutoModel.from_pretrained(
    selection_model_name,
    trust_remote_code=True,
    add_pooling_layer=False,
    safe_serialization=True,
)
selection_model = EvidenceSelectionModel(selection_model_raw).to(device)

# --- Claim verification model -------------------------------------------------
# Alternative checkpoint that was tried here instead of the one below:
#   'lukasellinger/claim_verification_model-v1'
verification_model_name = 'MoritzLaurer/mDeBERTa-v3-base-xnli-multilingual-nli-2mil7'
verification_model_tokenizer = AutoTokenizer.from_pretrained(verification_model_name)
verification_model_raw = AutoModelForSequenceClassification.from_pretrained(
    verification_model_name
)
verification_model = ClaimVerificationModel(verification_model_raw).to(device)

<All keys matched successfully>


In [17]:
from datasets import load_dataset

# Evaluate the selection + verification pipeline on the 'dev' split.
raw_dataset = load_dataset("lukasellinger/fever_evidence_selection-v1", cache_dir=None).get('dev')
# dataset = DefinitionDataset(raw_dataset, tokenizer=None, model='claim_verification')
print(raw_dataset.features)

test_pipeline = TestPipeline(
    selection_model=selection_model,
    selection_model_tokenizer=selection_model_tokenizer,
    verification_model=verification_model,
    verification_model_tokenizer=verification_model_tokenizer,
)

# Parallel lists: predicted labels, ground-truth labels, and the per-entry
# instances handed to fever_score at the end.
pr_labels, gt_labels, fever_instances = [], [], []
for entry in tqdm(raw_dataset):
    word = entry.get('document_id')
    fallback_word = convert_document_id_to_word(word)

    output = test_pipeline.verify(word, entry['short_claim'], fallback_word, split_facts=False)

    # Anything other than an overall factuality of 1 counts as NOT_SUPPORTED.
    factuality = Fact.SUPPORTED if output.get('factuality') == 1 else Fact.NOT_SUPPORTED
    pr_labels.append(factuality.to_factuality())

    # Only the 'SUPPORTS' label maps to SUPPORTED; every other label is NOT_SUPPORTED.
    label = Fact.SUPPORTED if entry['label'] == 'SUPPORTS' else Fact.NOT_SUPPORTED
    gt_labels.append(label.to_factuality())

    evidence = entry['evidence_lines'].split(';')
    # TODO add atomic fact support (would use output.get('factualities')[0]
    # and output.get('evidences') directly).
    # Each predicted evidence is a 3-tuple; only its first two fields are kept.
    predicted_evidence = [(first, second) for (first, second, _third) in output.get('evidences')]
    # NOTE(review): the gold label is passed as label.name (a string) while the
    # prediction is passed as the Fact member itself - confirm that
    # build_fever_instance expects this asymmetry.
    fever_instance = build_fever_instance(
        label.name, evidence, entry['document_id'], factuality, predicted_evidence
    )
    fever_instances.append(fever_instance)

print(classification_report(gt_labels, pr_labels, zero_division=0))
strict_score, label_accuracy, precision, recall, f1 = fever_score(fever_instances)

# Printed one value per line, in this order: strict score, label accuracy,
# evidence precision, evidence recall, evidence F1.
# Precision (TP / (TP + FP)) matters less here: retrieving at least one true
# positive is preferred over retrieving none, so recall is the more important figure.
print(strict_score, label_accuracy, precision, recall, f1, sep='\n')

{'id': Value(dtype='int64', id=None), 'claim': Value(dtype='string', id=None), 'short_claim': Value(dtype='string', id=None), 'label': Value(dtype='string', id=None), 'document_id': Value(dtype='string', id=None), 'text': Value(dtype='string', id=None), 'lines': Value(dtype='string', id=None), 'evidence_lines': Value(dtype='string', id=None), 'atomic_facts': Value(dtype='string', id=None)}


  0%|          | 0/1978 [00:37<?, ?it/s]

KeyboardInterrupt



In [7]:
# Wiki-backed pipeline configured for German words ('de') that uses the offline
# wiki dump named below instead of the pipeline's default source.
offline_wiki = 'lukasellinger/wiki_dump_2024-07-08'
pipeline = WikiPipeline(
    selection_model=selection_model,
    selection_model_tokenizer=selection_model_tokenizer,
    word_lang='de',
    use_offline_wiki=offline_wiki,
)

In [17]:
from config import PROJECT_DIR
from general_utils.reader import JSONReader  # only needed by the commented-out fact_dict line below
from datasets import load_dataset

# Evaluate the wiki pipeline on the German DPR claim-verification set and write
# one JSON-lines record per entry to dataset/evaluation/<identification>_pipeline.jsonl.
identification = 'german_dpr_dissim'
dataset = load_dataset("lukasellinger/german_dpr_claim_verification_dissim-v1").get('train')
print(dataset.features)
#fact_dict = JSONReader().read(PROJECT_DIR.joinpath('dataset/openai/output_german_dpr_facts-gpt3_5-turbo.json'))

pr_labels = []      # predicted labels, only for entries that could be verified
gt_labels = []      # gold labels, aligned index-by-index with pr_labels
factualities = []   # raw pipeline output for every entry
not_in_wiki = 0     # number of entries for which the pipeline returned factuality == -1
lines = []          # per-entry records written to disk at the end
for entry in tqdm(dataset):
    word = entry.get('word')
    english_word = entry.get('english_word', word)
    search_word = entry.get('document_search_word')
    claim = entry.get('english_claim', entry['claim'])
    atomic_facts = entry['atomic_facts']
    # Atomic facts are stored as one string joined by '--;--'; empty/None means no facts.
    atomic_facts = atomic_facts.split('--;--') if atomic_facts else []
    # atomic_facts = fact_dict.get(str(entry['id']))

    factuality = pipeline.verify3(word, claim, english_word, only_intro=True, split_facts=False, search_word=search_word, atomic_claims=atomic_facts)
    factualities.append(factuality)

    verdict = factuality.get('factuality')
    # A verdict of -1 is counted as "not in wiki" and excluded from the metrics.
    in_wiki = verdict != -1
    if in_wiki:
        predicted = Fact.SUPPORTED if verdict == 1 else Fact.NOT_SUPPORTED
        pr_labels.append(predicted.to_factuality())
        gt_labels.append(Fact[entry['label']].to_factuality())
    else:
        # Reset explicitly. Previously `predicted` was left untouched in this
        # branch, so the record below either raised NameError (first entry) or
        # silently reused the prediction of the previous entry.
        predicted = None
        not_in_wiki += 1

    lines.append({
        'id': entry['id'],
        'word': entry['word'],
        'claim': claim,
        'label': entry['label'],
        # None marks entries that could not be verified (not found in the wiki).
        'predicted': predicted.name if predicted is not None else None,
        'atoms': factuality.get('factualities'),
        'selected_evidences': factuality.get('evidences')
    })

print(f'Not in wiki {not_in_wiki}')
print(classification_report(gt_labels, pr_labels, zero_division=0, digits=4))
fh.write(PROJECT_DIR.joinpath(f'dataset/evaluation/{identification}_pipeline.jsonl'), lines)

{'id': Value(dtype='int64', id=None), 'question': Value(dtype='string', id=None), 'claim': Value(dtype='string', id=None), 'english_claim': Value(dtype='string', id=None), 'fact': Value(dtype='string', id=None), 'word': Value(dtype='string', id=None), 'english_word': Value(dtype='string', id=None), 'context': Value(dtype='string', id=None), 'label': Value(dtype='string', id=None), 'document_search_word': Value(dtype='string', id=None), 'connected_claim': Value(dtype='string', id=None), 'atomic_facts_old': Value(dtype='string', id=None), 'atomic_facts': Value(dtype='string', id=None)}


100%|██████████| 168/168 [04:37<00:00,  1.65s/it]

Not in wiki 29
              precision    recall  f1-score   support

           0     0.6436    0.9420    0.7647        69
           1     0.8947    0.4857    0.6296        70

    accuracy                         0.7122       139
   macro avg     0.7692    0.7139    0.6972       139
weighted avg     0.7701    0.7122    0.6967       139






In [17]:
# Re-print the classification report for whatever gt_labels / pr_labels are
# currently held in the kernel.
# NOTE(review): the result depends on which evaluation cell ran last - re-run
# that cell first so the figures are reproducible.
print(classification_report(gt_labels, pr_labels, zero_division=0, digits=4))

              precision    recall  f1-score   support

           0     0.7532    0.8406    0.7945        69
           1     0.8226    0.7286    0.7727        70

    accuracy                         0.7842       139
   macro avg     0.7879    0.7846    0.7836       139
weighted avg     0.7882    0.7842    0.7835       139



In [29]:
# Dump the raw pipeline outputs collected by the evaluation loop (one item per
# dataset entry) for manual inspection.
print(factualities)

