In [6]:
from dataset.def_dataset import DefinitionDataset, Fact
from transformers import AutoTokenizer
from models.evidence_selection_model import EvidenceSelectionModel
from models.claim_verification_model import ClaimVerificationModel
import torch
from transformers import AutoModel, AutoModelForSequenceClassification
from general_utils.fever_scorer import fever_score
from pipeline.pipeline import TestPipeline, WikiPipeline
from general_utils.utils import build_fever_instance
from general_utils.utils import convert_document_id_to_word
from sklearn.metrics import classification_report
from tqdm import tqdm
from general_utils.reader import JSONLineReader
from datasets import load_dataset

In [2]:
fh = JSONLineReader()

In [3]:
device = "cuda" if torch.cuda.is_available() else "cpu"

selection_model_name = 'Snowflake/snowflake-arctic-embed-m-long'
selection_model_tokenizer_base = AutoTokenizer.from_pretrained(selection_model_name)
selection_model_raw_base = AutoModel.from_pretrained(selection_model_name, trust_remote_code=True, add_pooling_layer=False, safe_serialization=True)
selection_model_base = EvidenceSelectionModel(selection_model_raw_base).to(device)

selection_model_name = 'lukasellinger/evidence_selection_model-v2'
selection_model_tokenizer = AutoTokenizer.from_pretrained(selection_model_name)
selection_model_raw = AutoModel.from_pretrained(selection_model_name, trust_remote_code=True, add_pooling_layer=False, safe_serialization=True)
selection_model = EvidenceSelectionModel(selection_model_raw).to(device)

verification_model_name = 'MoritzLaurer/mDeBERTa-v3-base-xnli-multilingual-nli-2mil7'
verification_model_tokenizer_base = AutoTokenizer.from_pretrained(verification_model_name)
verification_model_raw_base = AutoModelForSequenceClassification.from_pretrained(verification_model_name)
verification_model_base = ClaimVerificationModel(verification_model_raw_base).to(device)

verification_model_name = 'lukasellinger/claim_verification_model-v1'
verification_model_tokenizer = AutoTokenizer.from_pretrained(verification_model_name)
verification_model_raw = AutoModelForSequenceClassification.from_pretrained(verification_model_name)
verification_model = ClaimVerificationModel(verification_model_raw).to(device)

<All keys matched successfully>
<All keys matched successfully>


In [17]:
raw_dataset = load_dataset("lukasellinger/fever_evidence_selection-v1", cache_dir=None).get('dev')
# dataset = DefinitionDataset(raw_dataset, tokenizer=None, model='claim_verification')
print(raw_dataset.features)

test_pipeline = TestPipeline(selection_model=selection_model,selection_model_tokenizer=selection_model_tokenizer, 
                             verification_model=verification_model, verification_model_tokenizer=verification_model_tokenizer)

pr_labels = []
gt_labels = []
fever_instances = []
for entry in tqdm(raw_dataset):
    word = entry.get('document_id')
    fallback_word = convert_document_id_to_word(word)

    output = test_pipeline.verify(word, entry['short_claim'], fallback_word, split_facts=False)
    if output.get('factuality') == 1:
        factuality = Fact.SUPPORTED
    else:
        factuality = Fact.NOT_SUPPORTED
    pr_labels.append(factuality.to_factuality())

    if entry['label'] == 'SUPPORTS':
        label = Fact.SUPPORTED
    else:
        label = Fact.NOT_SUPPORTED
    gt_labels.append(label.to_factuality())

    evidence = entry['evidence_lines'].split(';')
    #predicted_label = output.get('factualities')[0]  # TODO add atomic fact support
    #predicted_evidence = output.get('evidences')
    predicted_evidence = [(x, y) for (x, y, z) in output.get('evidences')]
    fever_instance = build_fever_instance(label.name, evidence, entry['document_id'], factuality, predicted_evidence)
    fever_instances.append(fever_instance)

print(classification_report(gt_labels, pr_labels, zero_division=0))
strict_score, label_accuracy, precision, recall, f1 = fever_score(fever_instances)

print(strict_score)
print(label_accuracy)
print(precision)  # TP / TP + FP not too important, rather at least one TP than none
print(recall)     # more important
print(f1)

{'id': Value(dtype='int64', id=None), 'claim': Value(dtype='string', id=None), 'short_claim': Value(dtype='string', id=None), 'label': Value(dtype='string', id=None), 'document_id': Value(dtype='string', id=None), 'text': Value(dtype='string', id=None), 'lines': Value(dtype='string', id=None), 'evidence_lines': Value(dtype='string', id=None), 'atomic_facts': Value(dtype='string', id=None)}


  0%|          | 0/1978 [00:37<?, ?it/s]

KeyboardInterrupt



In [4]:
offline_wiki = 'lukasellinger/wiki_dump_2024-07-08'
pipeline_base = WikiPipeline(selection_model=selection_model_base, selection_model_tokenizer=selection_model_tokenizer_base, word_lang='de', use_offline_wiki=offline_wiki)
pipeline = WikiPipeline(selection_model=selection_model, selection_model_tokenizer=selection_model_tokenizer, word_lang='de', use_offline_wiki=offline_wiki)

# GermanDPR

In [7]:
dataset = load_dataset("lukasellinger/german_dpr_claim_verification_dissim-v1").get('train')

In [8]:
print(dataset.features)

{'id': Value(dtype='int64', id=None), 'question': Value(dtype='string', id=None), 'claim': Value(dtype='string', id=None), 'english_claim': Value(dtype='string', id=None), 'fact': Value(dtype='string', id=None), 'word': Value(dtype='string', id=None), 'english_word': Value(dtype='string', id=None), 'context': Value(dtype='string', id=None), 'label': Value(dtype='string', id=None), 'document_search_word': Value(dtype='string', id=None), 'connected_claim': Value(dtype='string', id=None), 'atomic_facts_old': Value(dtype='string', id=None), 'atomic_facts': Value(dtype='string', id=None), 'factscore_facts': Value(dtype='string', id=None), 'in_wiki': Value(dtype='string', id=None)}


In [9]:
def process_factuality(factualities, not_in_wiki, lines, pr_labels, gt_labels):
    factualities.append(factuality)
    
    in_wiki = True
    if factuality.get('factuality') == 1:
        predicted = Fact.SUPPORTED
    elif factuality.get('factuality') == -1:
        not_in_wiki += 1
        in_wiki = False
    else:
        predicted = Fact.NOT_SUPPORTED
    if in_wiki:
        pr_labels.append(predicted.to_factuality())
        gt_labels.append(Fact[entry['label']].to_factuality())
    
    lines.append({
        'id': entry['id'],
        'word': entry['word'],
        'claim': claim,
        'label': entry['label'],
        'predicted': predicted.name,
        'atoms': factuality.get('factualities'),
        'selected_evidences': factuality.get('evidences')
    })

In [16]:
def get_prediction(factuality):
    in_wiki = True
    predicted = None
    if factuality.get('factuality') == 1:
        predicted = Fact.SUPPORTED
    elif factuality.get('factuality') == -1:
        in_wiki = False
    else:
        predicted = Fact.NOT_SUPPORTED
        
    return predicted, in_wiki

def build_output(entry, claim, predicted, factuality):
    return {
        'id': entry['id'],
        'word': entry['word'],
        'claim': claim,
        'label': entry['label'],
        'predicted': predicted.name if predicted else None,
        'atoms': factuality.get('factualities'),
        'selected_evidences': factuality.get('evidences')
    }

## {word} : {claim}

In [17]:
from config import PROJECT_DIR

identification = 'german_dpr_word_claim_finetuned'

pr_labels = []
gt_labels = []
factualities = []
not_in_wiki = 0
lines = []
for entry in tqdm(dataset):
    word = entry.get('word')
    english_word = entry.get('english_word', word)
    search_word = entry.get('document_search_word')
    claim = entry.get('english_claim', entry['claim'])

    factuality = pipeline.verify(word, claim, english_word, only_intro=True, split_facts=False, search_word=search_word)
    factualities.append(factuality)
    
    predicted, in_wiki = get_prediction(factuality)
    
    if in_wiki:
        pr_labels.append(predicted.to_factuality())
        gt_labels.append(Fact[entry['label']].to_factuality())
    else:
        not_in_wiki += 1
    lines.append(build_output(entry, claim, predicted, factuality))

print(f'Not in wiki {not_in_wiki}')
print(classification_report(gt_labels, pr_labels, zero_division=0, digits=4))
fh.write(PROJECT_DIR.joinpath(f'dataset/evaluation/{identification}_pipeline.jsonl'), lines)

100%|██████████| 168/168 [04:34<00:00,  1.63s/it]

Not in wiki 29
              precision    recall  f1-score   support

           0     0.7000    0.9130    0.7925        69
           1     0.8776    0.6143    0.7227        70

    accuracy                         0.7626       139
   macro avg     0.7888    0.7637    0.7576       139
weighted avg     0.7894    0.7626    0.7573       139






In [18]:
from config import PROJECT_DIR

identification = 'german_dpr_word_claim_base'

pr_labels = []
gt_labels = []
factualities = []
not_in_wiki = 0
lines = []
for entry in tqdm(dataset):
    word = entry.get('word')
    english_word = entry.get('english_word', word)
    search_word = entry.get('document_search_word')
    claim = entry.get('english_claim', entry['claim'])

    factuality = pipeline_base.verify(word, claim, english_word, only_intro=True, split_facts=False, search_word=search_word)
    factualities.append(factuality)
    
    predicted, in_wiki = get_prediction(factuality)
    
    if in_wiki:
        pr_labels.append(predicted.to_factuality())
        gt_labels.append(Fact[entry['label']].to_factuality())
    else:
        not_in_wiki += 1
    lines.append(build_output(entry, claim, predicted, factuality))

print(f'Not in wiki {not_in_wiki}')
print(classification_report(gt_labels, pr_labels, zero_division=0, digits=4))
fh.write(PROJECT_DIR.joinpath(f'dataset/evaluation/{identification}_pipeline.jsonl'), lines)

100%|██████████| 168/168 [03:51<00:00,  1.38s/it]

Not in wiki 29
              precision    recall  f1-score   support

           0     0.6739    0.8986    0.7702        69
           1     0.8511    0.5714    0.6838        70

    accuracy                         0.7338       139
   macro avg     0.7625    0.7350    0.7270       139
weighted avg     0.7631    0.7338    0.7267       139






## connected sentence

In [19]:
from config import PROJECT_DIR

identification = 'german_dpr_connected_finetuned'

pr_labels = []
gt_labels = []
factualities = []
not_in_wiki = 0
lines = []
for entry in tqdm(dataset):
    word = entry.get('word')
    english_word = entry.get('english_word', word)
    search_word = entry.get('document_search_word')
    claim = entry.get('connected_claim')

    factuality = pipeline.verify3(word, claim, english_word, only_intro=True, split_facts=False, search_word=search_word)
    factualities.append(factuality)
    
    predicted, in_wiki = get_prediction(factuality)
    
    if in_wiki:
        pr_labels.append(predicted.to_factuality())
        gt_labels.append(Fact[entry['label']].to_factuality())
    else:
        not_in_wiki += 1
    lines.append(build_output(entry, claim, predicted, factuality))

print(f'Not in wiki {not_in_wiki}')
print(classification_report(gt_labels, pr_labels, zero_division=0, digits=4))
fh.write(PROJECT_DIR.joinpath(f'dataset/evaluation/{identification}_pipeline.jsonl'), lines)

100%|██████████| 168/168 [03:58<00:00,  1.42s/it]

Not in wiki 29
              precision    recall  f1-score   support

           0     0.7356    0.9275    0.8205        69
           1     0.9038    0.6714    0.7705        70

    accuracy                         0.7986       139
   macro avg     0.8197    0.7995    0.7955       139
weighted avg     0.8203    0.7986    0.7953       139






In [20]:
from config import PROJECT_DIR

identification = 'german_dpr_connected_base'

pr_labels = []
gt_labels = []
factualities = []
not_in_wiki = 0
lines = []
for entry in tqdm(dataset):
    word = entry.get('word')
    english_word = entry.get('english_word', word)
    search_word = entry.get('document_search_word')
    claim = entry.get('connected_claim')

    factuality = pipeline_base.verify3(word, claim, english_word, only_intro=True, split_facts=False, search_word=search_word)
    factualities.append(factuality)
    
    predicted, in_wiki = get_prediction(factuality)
    
    if in_wiki:
        pr_labels.append(predicted.to_factuality())
        gt_labels.append(Fact[entry['label']].to_factuality())
    else:
        not_in_wiki += 1
    lines.append(build_output(entry, claim, predicted, factuality))

print(f'Not in wiki {not_in_wiki}')
print(classification_report(gt_labels, pr_labels, zero_division=0, digits=4))
fh.write(PROJECT_DIR.joinpath(f'dataset/evaluation/{identification}_pipeline.jsonl'), lines)

100%|██████████| 168/168 [03:58<00:00,  1.42s/it]

Not in wiki 29
              precision    recall  f1-score   support

           0     0.6818    0.8696    0.7643        69
           1     0.8235    0.6000    0.6942        70

    accuracy                         0.7338       139
   macro avg     0.7527    0.7348    0.7293       139
weighted avg     0.7532    0.7338    0.7290       139






## dissim facts

In [24]:
from config import PROJECT_DIR

identification = 'german_dpr_dissim_finetuned'

pr_labels = []
gt_labels = []
factualities = []
not_in_wiki = 0
lines = []
for entry in tqdm(dataset):
    word = entry.get('word')
    english_word = entry.get('english_word', word)
    search_word = entry.get('document_search_word')
    claim = f'{search_word}: {entry.get("english_claim", entry["claim"])}'
    atomic_facts = entry['atomic_facts']
    atomic_facts = atomic_facts.split('--;--') if atomic_facts else []

    factuality = pipeline.verify3(word, claim, english_word, only_intro=True, split_facts=False, search_word=search_word, atomic_claims=atomic_facts)
    factualities.append(factuality)
    
    predicted, in_wiki = get_prediction(factuality)
    
    if in_wiki:
        pr_labels.append(predicted.to_factuality())
        gt_labels.append(Fact[entry['label']].to_factuality())
    else:
        not_in_wiki += 1
    lines.append(build_output(entry, claim, predicted, factuality))

print(f'Not in wiki {not_in_wiki}')
print(classification_report(gt_labels, pr_labels, zero_division=0, digits=4))
fh.write(PROJECT_DIR.joinpath(f'dataset/evaluation/{identification}_pipeline.jsonl'), lines)

100%|██████████| 168/168 [04:46<00:00,  1.71s/it]

Not in wiki 29
              precision    recall  f1-score   support

           0     0.6381    0.9710    0.7701        69
           1     0.9412    0.4571    0.6154        70

    accuracy                         0.7122       139
   macro avg     0.7896    0.7141    0.6927       139
weighted avg     0.7907    0.7122    0.6922       139






In [25]:
from config import PROJECT_DIR

identification = 'german_dpr_dissim_base'

pr_labels = []
gt_labels = []
factualities = []
not_in_wiki = 0
lines = []
for entry in tqdm(dataset):
    word = entry.get('word')
    english_word = entry.get('english_word', word)
    search_word = entry.get('document_search_word')
    claim = f'{search_word}: {entry.get("english_claim", entry["claim"])}'
    atomic_facts = entry['atomic_facts']
    atomic_facts = atomic_facts.split('--;--') if atomic_facts else []

    factuality = pipeline_base.verify3(word, claim, english_word, only_intro=True, split_facts=False, search_word=search_word, atomic_claims=atomic_facts)
    factualities.append(factuality)
    
    predicted, in_wiki = get_prediction(factuality)
    
    if in_wiki:
        pr_labels.append(predicted.to_factuality())
        gt_labels.append(Fact[entry['label']].to_factuality())
    else:
        not_in_wiki += 1
    lines.append(build_output(entry, claim, predicted, factuality))

print(f'Not in wiki {not_in_wiki}')
print(classification_report(gt_labels, pr_labels, zero_division=0, digits=4))
fh.write(PROJECT_DIR.joinpath(f'dataset/evaluation/{identification}_pipeline.jsonl'), lines)

100%|██████████| 168/168 [04:35<00:00,  1.64s/it]

Not in wiki 29
              precision    recall  f1-score   support

           0     0.6465    0.9275    0.7619        69
           1     0.8750    0.5000    0.6364        70

    accuracy                         0.7122       139
   macro avg     0.7607    0.7138    0.6991       139
weighted avg     0.7616    0.7122    0.6987       139






## factscore facts

In [26]:
from config import PROJECT_DIR

identification = 'german_dpr_factscore_finetuned'

pr_labels = []
gt_labels = []
factualities = []
not_in_wiki = 0
lines = []
for entry in tqdm(dataset):
    word = entry.get('word')
    english_word = entry.get('english_word', word)
    search_word = entry.get('document_search_word')
    claim = f'{search_word}: {entry.get("english_claim", entry["claim"])}'
    atomic_facts = entry['factscore_facts']
    atomic_facts = atomic_facts.split('--;--') if atomic_facts else []

    factuality = pipeline_base.verify3(word, claim, english_word, only_intro=True, split_facts=False, search_word=search_word, atomic_claims=atomic_facts)
    factualities.append(factuality)
    
    predicted, in_wiki = get_prediction(factuality)
    
    if in_wiki:
        pr_labels.append(predicted.to_factuality())
        gt_labels.append(Fact[entry['label']].to_factuality())
    else:
        not_in_wiki += 1
    lines.append(build_output(entry, claim, predicted, factuality))

print(f'Not in wiki {not_in_wiki}')
print(classification_report(gt_labels, pr_labels, zero_division=0, digits=4))
fh.write(PROJECT_DIR.joinpath(f'dataset/evaluation/{identification}_pipeline.jsonl'), lines)

100%|██████████| 168/168 [05:28<00:00,  1.96s/it]

Not in wiki 29
              precision    recall  f1-score   support

           0     0.5877    0.9710    0.7322        69
           1     0.9200    0.3286    0.4842        70

    accuracy                         0.6475       139
   macro avg     0.7539    0.6498    0.6082       139
weighted avg     0.7551    0.6475    0.6073       139






In [27]:
from config import PROJECT_DIR

identification = 'german_dpr_factscore_base'

pr_labels = []
gt_labels = []
factualities = []
not_in_wiki = 0
lines = []
for entry in tqdm(dataset):
    word = entry.get('word')
    english_word = entry.get('english_word', word)
    search_word = entry.get('document_search_word')
    claim = f'{search_word}: {entry.get("english_claim", entry["claim"])}'
    atomic_facts = entry['factscore_facts']
    atomic_facts = atomic_facts.split('--;--') if atomic_facts else []

    factuality = pipeline_base.verify3(word, claim, english_word, only_intro=True, split_facts=False, search_word=search_word, atomic_claims=atomic_facts)
    factualities.append(factuality)
    
    predicted, in_wiki = get_prediction(factuality)
    
    if in_wiki:
        pr_labels.append(predicted.to_factuality())
        gt_labels.append(Fact[entry['label']].to_factuality())
    else:
        not_in_wiki += 1
    lines.append(build_output(entry, claim, predicted, factuality))

print(f'Not in wiki {not_in_wiki}')
print(classification_report(gt_labels, pr_labels, zero_division=0, digits=4))
fh.write(PROJECT_DIR.joinpath(f'dataset/evaluation/{identification}_pipeline.jsonl'), lines)

100%|██████████| 168/168 [05:27<00:00,  1.95s/it]

Not in wiki 29
              precision    recall  f1-score   support

           0     0.5877    0.9710    0.7322        69
           1     0.9200    0.3286    0.4842        70

    accuracy                         0.6475       139
   macro avg     0.7539    0.6498    0.6082       139
weighted avg     0.7551    0.6475    0.6073       139




