In [7]:
from general_utils.reader import JSONLineReader
from sklearn.metrics import classification_report
from dataset.def_dataset import Fact
from config import PROJECT_DIR

In [18]:
from datasets import load_dataset

dataset_name = "lukasellinger/german_dpr_claim_verification_dissim-v1"
# Only the 'train' entry of what load_dataset() returns is evaluated in this notebook.
dataset_splits = load_dataset(dataset_name)
dataset = dataset_splits.get('train')

# Stored model outputs, read as JSON lines and indexed by their 'id' field so they
# can be looked up per dataset entry.
output_path = PROJECT_DIR.joinpath('dataset/openai/output/german_dpr/output_german_dpr_factscore-gpt3_5-turbo-gtr.jsonl')
output_records = JSONLineReader().read(output_path)
outputs = {record['id']: record for record in output_records}

In [19]:
# Toggle: when True, entries whose 'in_wiki' field equals 'No' are left out.
exclude_not_in_wiki = True

gt_labels, pr_labels = [], []
for entry in dataset:
    # Keep the entry unless filtering is switched on and it is marked 'No'.
    if not exclude_not_in_wiki or entry['in_wiki'] != 'No':
        output = outputs[entry['id']]
        # Prediction and ground truth are both taken from the stored output
        # record and mapped through Fact.to_factuality().
        pr_labels.append(Fact[output['predicted']].to_factuality())
        gt_labels.append(Fact[output['label']].to_factuality())

In [20]:
# Per-class precision/recall/F1 with four decimals; zero_division=0 is passed
# through to scikit-learn's classification_report.
report_text = classification_report(gt_labels, pr_labels, zero_division=0, digits=4)
print(report_text)

              precision    recall  f1-score   support

           0     0.5312    0.9855    0.6904        69
           1     0.9091    0.1429    0.2469        70

    accuracy                         0.5612       139
   macro avg     0.7202    0.5642    0.4686       139
weighted avg     0.7215    0.5612    0.4670       139



In [None]:
# Colab-only setup: mount Google Drive and switch into the repository folder.
from google.colab import drive

# Mount Drive at /content/drive; force_remount=True is passed so an existing mount is redone.
drive.mount('/content/drive', force_remount=True)
# Name of the repository folder inside "My Drive".
repository = 'evaluating_factuality_word_definitions'

# IPython magic: change the working directory to the repository folder
# ({repository} is interpolated from the Python variable above).
# NOTE(review): presumably needed so the project-local imports in the next cell resolve - confirm.
%cd /content/drive/My Drive/{repository}

In [None]:
from datasets import load_dataset

from general_utils.utils import print_classification_report
from pipeline.statement_verifier import ModelStatementVerifier
from pipeline.evidence_selector import ModelEvidenceSelector
from pipeline.translator import OpusMTTranslator
from pipeline.sentence_connector import PhiSentenceConnector, ColonSentenceConnector
from pipeline.evidence_fetcher import WikipediaEvidenceFetcher
from pipeline.pipeline import Pipeline
from pipeline.claim_splitter import DisSimSplitter, T5SplitRephraseSplitter, FactscoreSplitter

In [None]:
# Model identifiers for the 'base' setup; passed as `model_name` to
# ModelEvidenceSelector / ModelStatementVerifier when the models are initialised.
base_selection_model = 'Snowflake/snowflake-arctic-embed-m-long'
base_verification_model = 'MoritzLaurer/mDeBERTa-v3-base-xnli-multilingual-nli-2mil7'

# Model identifiers for the 'finetuned' setup, used the same way.
finetuned_selection_model = 'lukasellinger/evidence_selection_model-v2'
finetuned_verification_model = 'lukasellinger/claim_verification_model-v1'

In [None]:
# (key, dataset identifier, language code) for every evaluation dataset.
dataset_specs = [
    ('german_dpr_dataset', 'lukasellinger/german_dpr_claim_verification_dissim-v1', 'de'),
    ('german_dataset', 'lukasellinger/german_claim_verification_dissim-v1', 'de'),
    ('squad_dataset', 'lukasellinger/squad_claim_verification_dissim-v1', 'en'),
]

# Maps each key to the loaded dataset and the language it is written in.
datasets = {
    key: {'dataset': load_dataset(identifier), 'lang': language}
    for key, identifier, language in dataset_specs
}

In [None]:
def evaluate_pipeline(pipeline, dataset, output_file_name=''):
    """Run ``pipeline`` on ``dataset`` and print its classification report.

    Args:
        pipeline: Object exposing ``verify_test_dataset(dataset, output_file_name)``,
            which must return a 3-tuple ``(outputs, report, not_in_wiki)``.
        dataset: Dataset forwarded unchanged to ``verify_test_dataset``.
        output_file_name: File name forwarded unchanged to ``verify_test_dataset``
            (defaults to an empty string).

    Returns:
        The ``outputs`` value returned by ``verify_test_dataset``.
    """
    outputs, report, not_in_wiki = pipeline.verify_test_dataset(dataset, output_file_name)

    # Mean number of 'atoms' per output; 0 when there are no outputs so the
    # division is never attempted on an empty result.
    if outputs:
        atom_total = 0
        for output in outputs:
            atom_total += len(output['atoms'])
        avg_claim_count = atom_total / len(outputs)
    else:
        avg_claim_count = 0

    print_classification_report(report, not_in_wiki, avg_claim_count)
    return outputs

# Init models

In [None]:
# Translator shared by every pipeline
translator = OpusMTTranslator()

# Sentence connectors
colon_sentence_connector = ColonSentenceConnector()
phi_sentence_connector = PhiSentenceConnector()

# Evidence fetchers: one built with the defaults, one with offline=False
offline_evid_fetcher = WikipediaEvidenceFetcher()
online_evid_fetcher = WikipediaEvidenceFetcher(offline=False)

# (selection model, verification model) identifiers per pipeline variant
variant_model_names = {
    'base': (base_selection_model, base_verification_model),
    'finetuned': (finetuned_selection_model, finetuned_verification_model),
}

# Evidence selector and statement verifier instances per variant; built in the
# order base -> finetuned, selector before verifier.
pipeline_models = {
    variant: {
        'evid_selector': ModelEvidenceSelector(model_name=selection_name),
        'stm_verifier': ModelStatementVerifier(model_name=verification_name)
    }
    for variant, (selection_name, verification_name) in variant_model_names.items()
}

# Finetuned vs Base (Best Setup)

In [None]:
# Template for the output file name handed to evaluate_pipeline(). The fields
# are named because str.format() is called with keyword arguments only; a bare
# "{}" field would raise IndexError there. Including the dataset name gives
# every (dataset, model variant) run its own name.
# NOTE(review): how verify_test_dataset uses this name (directory, extension)
# is not visible here - adjust the template to the desired location.
output_file_base_name = "{dataset_name}_{name}"
for dataset_name, config in datasets.items():
    dataset = config['dataset']
    lang = config['lang']
    # Compare the 'base' and 'finetuned' model pairs on the same dataset.
    for name, models in pipeline_models.items():
        print(f"Evaluating {dataset_name} with {name}...")
        # No claim splitter is used in this comparison (claim_splitter=None).
        pipeline = Pipeline(translator=translator,
                            sent_connector=phi_sentence_connector,
                            claim_splitter=None,
                            evid_fetcher=offline_evid_fetcher,
                            evid_selector=models['evid_selector'],
                            stm_verifier=models['stm_verifier'],
                            lang=lang)
        output_file_name = output_file_base_name.format(dataset_name=dataset_name, name=name)
        evaluate_pipeline(pipeline, dataset, output_file_name)

# Evaluate different Claim Splitters

In [None]:
# Claim splitters under comparison; each key is the label under which the
# splitter is reported in the evaluation loop.
claim_splitters = dict(
    DisSimSplitter=DisSimSplitter(),
    T5SplitRephraseSplitter=T5SplitRephraseSplitter(),
    FactscoreSplitter=FactscoreSplitter(),
)

In [None]:
# Template for the output file name handed to evaluate_pipeline(). The fields
# are named because str.format() is called with keyword arguments only; a bare
# "{}" field would raise IndexError there. Including the dataset name gives
# every (dataset, claim splitter) run its own name.
# NOTE(review): how verify_test_dataset uses this name (directory, extension)
# is not visible here - adjust the template to the desired location.
output_file_base_name = "{dataset_name}_{name}"
for dataset_name, config in datasets.items():
    dataset = config['dataset']
    lang = config['lang']
    # Vary only the claim splitter; selector and verifier stay the finetuned pair.
    for name, splitter in claim_splitters.items():
        print(f"Evaluating {dataset_name} with {name}...")
        pipeline = Pipeline(translator=translator,
                            sent_connector=phi_sentence_connector,
                            claim_splitter=splitter,
                            evid_fetcher=offline_evid_fetcher,
                            evid_selector=pipeline_models['finetuned']['evid_selector'],
                            stm_verifier=pipeline_models['finetuned']['stm_verifier'],
                            lang=lang)
        output_file_name = output_file_base_name.format(dataset_name=dataset_name, name=name)
        evaluate_pipeline(pipeline, dataset, output_file_name)