In [1]:
from general_utils.reader import JSONLineReader
from sklearn.metrics import classification_report
from dataset.def_dataset import Fact
from config import PROJECT_DIR

In [18]:
from datasets import load_dataset

dataset_name = "lukasellinger/german_dpr_claim_verification_dissim-v1"
dataset = load_dataset(dataset_name).get('train')

# Read the stored prediction records and index them by their 'id' field so the
# evaluation loop can look up the record belonging to each dataset entry.
factscore_output_path = PROJECT_DIR.joinpath('dataset/openai/output/german_dpr/output_german_dpr_factscore-gpt3_5-turbo-gtr.jsonl')
outputs = {record['id']: record for record in JSONLineReader().read(factscore_output_path)}

In [19]:
# When True, dataset entries whose 'in_wiki' field equals 'No' are left out.
exclude_not_in_wiki = True

# Pick the stored prediction record for every dataset entry that is kept.
evaluated_records = [
    outputs[entry['id']]
    for entry in dataset
    if not (exclude_not_in_wiki and entry['in_wiki'] == 'No')
]

# Both label lists are derived from the stored record ('predicted' / 'label'),
# converted through Fact[...].to_factuality().
pr_labels = [Fact[record['predicted']].to_factuality() for record in evaluated_records]
gt_labels = [Fact[record['label']].to_factuality() for record in evaluated_records]

In [20]:
# Ground truth goes first, predictions second; undefined metrics are reported as 0.
report_text = classification_report(gt_labels, pr_labels, zero_division=0, digits=4)
print(report_text)

              precision    recall  f1-score   support

           0     0.5312    0.9855    0.6904        69
           1     0.9091    0.1429    0.2469        70

    accuracy                         0.5612       139
   macro avg     0.7202    0.5642    0.4686       139
weighted avg     0.7215    0.5612    0.4670       139



In [None]:
from google.colab import drive

# Colab-only setup: mount Google Drive (forcing a remount) and switch the
# working directory to the repository folder inside "My Drive".
drive.mount('/content/drive', force_remount=True)
repository = 'evaluating_factuality_word_definitions'

# IPython expands {repository} before running the %cd line magic.
%cd /content/drive/My Drive/{repository}

In [1]:
from datasets import load_dataset

from config import PROJECT_DIR
from general_utils.utils import print_classification_report
from pipeline.statement_verifier import ModelStatementVerifier
from pipeline.evidence_selector import ModelEvidenceSelector
from pipeline.translator import OpusMTTranslator
from pipeline.sentence_connector import PhiSentenceConnector, ColonSentenceConnector
from pipeline.evidence_fetcher import WikipediaEvidenceFetcher
from pipeline.pipeline import Pipeline
from pipeline.claim_splitter import DisSimSplitter, T5SplitRephraseSplitter, FactscoreSplitter

In [2]:
# Model identifiers for the "base" pipeline variant: one for evidence selection,
# one for statement verification.
base_selection_model = 'Snowflake/snowflake-arctic-embed-m-long'
base_verification_model = 'MoritzLaurer/mDeBERTa-v3-base-xnli-multilingual-nli-2mil7'

# Model identifiers for the "finetuned" pipeline variant (same two roles).
finetuned_selection_model = 'lukasellinger/evidence_selection_model-v2'
finetuned_verification_model = 'lukasellinger/claim_verification_model-v1'

In [3]:
# Evaluation datasets and the language code handed to the pipeline for each.
# Every key doubles as the repository name below the 'lukasellinger/' namespace.
dataset_languages = {
    'german_dpr-claim_verification': 'de',
    'german-claim_verification': 'de',
    'squad-claim_verification': 'en',
}

# Load the 'test' split of each dataset, keeping the declaration order.
datasets = {
    name: {
        'dataset': load_dataset(f'lukasellinger/{name}', split='test'),
        'lang': lang_code,
    }
    for name, lang_code in dataset_languages.items()
}

In [27]:
def evaluate_pipeline(pipeline: Pipeline, dataset, batch_size=4, output_file_name='', only_intro=True):
    """Run ``pipeline.verify_test_dataset`` on ``dataset`` and print a report.

    Args:
        pipeline: Pipeline whose ``verify_test_dataset`` method is invoked.
        dataset: Dataset forwarded unchanged to the pipeline.
        batch_size: Forwarded to ``verify_test_dataset``.
        output_file_name: Forwarded to ``verify_test_dataset``.
        only_intro: Forwarded to ``verify_test_dataset``.

    Returns:
        The ``outputs`` element returned by ``verify_test_dataset``.
    """
    outputs, report, not_in_wiki = pipeline.verify_test_dataset(dataset, batch_size, output_file_name, only_intro)

    # Only entries with a non-empty 'atoms' value count towards the average.
    atom_counts = [len(entry['atoms']) for entry in outputs if entry.get('atoms')]
    if atom_counts:
        avg_claim_count = sum(atom_counts) / len(atom_counts)
    else:
        avg_claim_count = 0

    print_classification_report(report, not_in_wiki, avg_claim_count)
    return outputs

# Init models

In [5]:
# Components shared by every pipeline configuration below.
translator = OpusMTTranslator()

# Two alternative sentence connectors.
colon_sentence_connector = ColonSentenceConnector()
phi_sentence_connector = PhiSentenceConnector()

# One evidence fetcher with default settings and one created with offline=False.
offline_evid_fetcher = WikipediaEvidenceFetcher()
online_evid_fetcher = WikipediaEvidenceFetcher(offline=False)

# (selection model, verification model) per pipeline variant.
model_names_by_variant = {
    'base': (base_selection_model, base_verification_model),
    'finetuned': (finetuned_selection_model, finetuned_verification_model),
}

# Instantiate the evidence selector and statement verifier of each variant once
# so that all evaluation cells can reuse them.
pipeline_models = {
    variant: {
        'evid_selector': ModelEvidenceSelector(model_name=selection_model),
        'stm_verifier': ModelStatementVerifier(model_name=verification_model),
    }
    for variant, (selection_model, verification_model) in model_names_by_variant.items()
}

`flash-attention` package not found, consider installing for better performance: No module named 'flash_attn'.
Current `flash-attention` does not support `window_size`. Either upgrade or use `attn_implementation='eager'`.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
<All keys matched successfully>
<All keys matched successfully>


# Finetuned vs Base (Best Setup)

In [28]:
# Evaluate every dataset with both model pairs ('base' and 'finetuned'), using
# the offline evidence fetcher and no claim splitter.
output_file_base_name = PROJECT_DIR.joinpath("data/evaluation/{dataset}_{model}.jsonl")
# PROJECT_DIR.joinpath(...) presumably yields a pathlib.Path, which has no
# .format(); the placeholders are therefore filled in on its string form.
output_file_template = str(output_file_base_name)

for dataset_name, config in datasets.items():
    dataset = config['dataset']
    lang = config['lang']
    for model_name, models in pipeline_models.items():
        print(f"Evaluating {dataset_name} with pipeline {model_name}...")
        pipeline = Pipeline(translator=translator,
                            sent_connector=phi_sentence_connector,
                            claim_splitter=None,
                            evid_fetcher=offline_evid_fetcher,
                            evid_selector=models.get('evid_selector'),
                            stm_verifier=models.get('stm_verifier'),
                            lang=lang)
        # The template placeholder is `{model}`, so the keyword has to be
        # `model`; passing `model_name=` raises KeyError: 'model'.
        evaluate_pipeline(pipeline, dataset,
                          output_file_name=output_file_template.format(dataset=dataset_name, model=model_name))

Evaluating german_dpr_dataset with pipeline base...


100%|██████████| 42/42 [04:41<00:00,  6.69s/it]


################################
Not in wikipedia: 29
Avg claim count: 1.0
              precision    recall  f1-score   support

           0     0.6818    0.8696    0.7643        69
           1     0.8235    0.6000    0.6942        70

    accuracy                         0.7338       139
   macro avg     0.7527    0.7348    0.7293       139
weighted avg     0.7532    0.7338    0.7290       139

################################
Evaluating german_dpr_dataset with pipeline finetuned...


100%|██████████| 42/42 [04:28<00:00,  6.39s/it]


################################
Not in wikipedia: 29
Avg claim count: 1.0
              precision    recall  f1-score   support

           0     0.7792    0.8696    0.8219        69
           1     0.8548    0.7571    0.8030        70

    accuracy                         0.8129       139
   macro avg     0.8170    0.8134    0.8125       139
weighted avg     0.8173    0.8129    0.8124       139

################################
Evaluating german_dataset with pipeline base...


100%|██████████| 178/178 [46:54<00:00, 15.81s/it] 


################################
Not in wikipedia: 26
Avg claim count: 1.0
              precision    recall  f1-score   support

           0     0.5645    0.8772    0.6870       334
           1     0.7515    0.3543    0.4816       350

    accuracy                         0.6096       684
   macro avg     0.6580    0.6158    0.5843       684
weighted avg     0.6602    0.6096    0.5819       684

################################
Evaluating german_dataset with pipeline finetuned...


100%|██████████| 178/178 [46:35<00:00, 15.71s/it] 


################################
Not in wikipedia: 26
Avg claim count: 1.0
              precision    recall  f1-score   support

           0     0.5699    0.7934    0.6633       334
           1     0.6849    0.4286    0.5272       350

    accuracy                         0.6067       684
   macro avg     0.6274    0.6110    0.5953       684
weighted avg     0.6288    0.6067    0.5937       684

################################
Evaluating squad_dataset with pipeline base...


100%|██████████| 40/40 [03:23<00:00,  5.09s/it]


################################
Not in wikipedia: 32
Avg claim count: 1.0
              precision    recall  f1-score   support

           0     0.7500    0.8571    0.8000        63
           1     0.8333    0.7143    0.7692        63

    accuracy                         0.7857       126
   macro avg     0.7917    0.7857    0.7846       126
weighted avg     0.7917    0.7857    0.7846       126

################################
Evaluating squad_dataset with pipeline finetuned...


100%|██████████| 40/40 [03:37<00:00,  5.45s/it]

################################
Not in wikipedia: 32
Avg claim count: 1.0
              precision    recall  f1-score   support

           0     0.8000    0.8889    0.8421        63
           1     0.8750    0.7778    0.8235        63

    accuracy                         0.8333       126
   macro avg     0.8375    0.8333    0.8328       126
weighted avg     0.8375    0.8333    0.8328       126

################################





# Online Finetuned Pipeline (Best Setup)

In [None]:
# Evaluate the finetuned model pair on every dataset using the evidence fetcher
# that was created with offline=False.
output_file_base_name = PROJECT_DIR.joinpath("data/evaluation/{dataset}_finetuned_online.jsonl")
# PROJECT_DIR.joinpath(...) presumably yields a pathlib.Path, which has no
# .format(); the placeholder is therefore filled in on its string form.
output_file_template = str(output_file_base_name)

for dataset_name, config in datasets.items():
    dataset = config['dataset']
    lang = config['lang']
    print(f"Evaluating {dataset_name}...")
    pipeline = Pipeline(translator=translator,
                        sent_connector=phi_sentence_connector,
                        claim_splitter=None,
                        evid_fetcher=online_evid_fetcher,
                        evid_selector=pipeline_models['finetuned']['evid_selector'],
                        stm_verifier=pipeline_models['finetuned']['stm_verifier'],
                        lang=lang)
    evaluate_pipeline(pipeline, dataset,
                      output_file_name=output_file_template.format(dataset=dataset_name))

# Evaluate different Claim Splitters

In [None]:
# Claim splitters to compare, keyed by the label used in log messages and
# output file names. The 'None' entry maps to None, i.e. no splitter object.
claim_splitters = {
    'DisSimSplitter': DisSimSplitter(),
    'T5SplitRephraseSplitter': T5SplitRephraseSplitter(),
    'FactscoreSplitter': FactscoreSplitter(),
    'None': None
}

In this setting, the claim splits are not recomputed; the splits already stored in the dataset are reused.

In [None]:
# Evaluate the finetuned model pair on every dataset once per claim splitter.
output_file_base_name = PROJECT_DIR.joinpath("data/evaluation/{dataset}_finetuned_{splitter}.jsonl")
# PROJECT_DIR.joinpath(...) presumably yields a pathlib.Path, which has no
# .format(); the placeholders are therefore filled in on its string form.
output_file_template = str(output_file_base_name)

for dataset_name, config in datasets.items():
    dataset = config['dataset']
    lang = config['lang']
    for name, splitter in claim_splitters.items():
        print(f"Evaluating {dataset_name} with claim splitter {name}...")
        pipeline = Pipeline(translator=translator,
                            sent_connector=phi_sentence_connector,
                            claim_splitter=splitter,
                            evid_fetcher=offline_evid_fetcher,
                            evid_selector=pipeline_models['finetuned']['evid_selector'],
                            stm_verifier=pipeline_models['finetuned']['stm_verifier'],
                            lang=lang)
        evaluate_pipeline(pipeline, dataset,
                          output_file_name=output_file_template.format(dataset=dataset_name, splitter=name))

# Whole Wiki Page Run (Best setup)

In [None]:
# Evaluate the finetuned model pair on every dataset with only_intro=False.
output_file_base_name = PROJECT_DIR.joinpath("data/evaluation/{dataset}_finetuned_whole_page.jsonl")
# PROJECT_DIR.joinpath(...) presumably yields a pathlib.Path, which has no
# .format(); the placeholder is therefore filled in on its string form.
output_file_template = str(output_file_base_name)
# NOTE(review): not used anywhere in this cell; kept because later analysis
# cells may rely on it -- confirm and drop if unused.
max_intro_sent_indices = offline_evid_fetcher.get_max_intro_sent_idx()

for dataset_name, config in datasets.items():
    print(f"Evaluating {dataset_name}...")
    dataset = config['dataset']
    lang = config['lang']

    pipeline = Pipeline(translator=translator,
                        sent_connector=phi_sentence_connector,
                        claim_splitter=None,
                        evid_fetcher=offline_evid_fetcher,
                        evid_selector=pipeline_models['finetuned']['evid_selector'],
                        stm_verifier=pipeline_models['finetuned']['stm_verifier'],
                        lang=lang)
    # `outputs` is rebound on every iteration, so only the last dataset's
    # outputs remain available after the loop.
    outputs = evaluate_pipeline(pipeline, dataset,
                                output_file_name=output_file_template.format(dataset=dataset_name),
                                only_intro=False)

# Evaluate gold labels on Fever (FeverScore)


**TODO:** A dedicated test pipeline still needs to be created for this evaluation.

In [None]:
# NOTE(review): this cell is unfinished. `dataset_name` and `dataset` are not
# defined here; they are whatever the most recently executed loop left behind.
# Load the intended Fever dataset explicitly before relying on these results.
#
# A dedicated output template is used because the previous cell's template
# contains a `{dataset}` placeholder and no `{name}` placeholder.
# NOTE(review): the file name below is a placeholder choice -- confirm.
fever_output_template = str(PROJECT_DIR.joinpath("data/evaluation/{dataset}_{name}_fever.jsonl"))

for name, models in pipeline_models.items():
    print(f"Evaluating {dataset_name} with {name}...")
    pipeline = Pipeline(translator=translator,
                        sent_connector=phi_sentence_connector,
                        claim_splitter=None,
                        evid_fetcher=offline_evid_fetcher,
                        evid_selector=models.get('evid_selector'),
                        stm_verifier=models.get('stm_verifier'),
                        lang='en')
    # Pass the file name by keyword: the third positional parameter of
    # evaluate_pipeline is batch_size, not output_file_name.
    evaluate_pipeline(pipeline, dataset,
                      output_file_name=fever_output_template.format(dataset=dataset_name, name=name))