In [1]:
from general_utils.reader import JSONLineReader
from sklearn.metrics import classification_report
from dataset.def_dataset import Fact
from config import PROJECT_DIR

In [18]:
from datasets import load_dataset

dataset_name = "lukasellinger/german_dpr_claim_verification_dissim-v1"
dataset = load_dataset(dataset_name).get('train')
outputs = JSONLineReader().read(PROJECT_DIR.joinpath(
    'dataset/openai/output/german_dpr/output_german_dpr_factscore-gpt3_5-turbo-gtr.jsonl'))
outputs = {d['id']: d for d in outputs}

In [19]:
exclude_not_in_wiki = True

gt_labels = []
pr_labels = []
for entry in dataset:
    if exclude_not_in_wiki and entry['in_wiki'] == 'No':
        continue
    output = outputs[entry['id']]
    pr_labels.append(Fact[output['predicted']].to_factuality())
    gt_labels.append(Fact[output['label']].to_factuality())

In [20]:
print(classification_report(gt_labels, pr_labels, zero_division=0, digits=4))

              precision    recall  f1-score   support

           0     0.5312    0.9855    0.6904        69
           1     0.9091    0.1429    0.2469        70

    accuracy                         0.5612       139
   macro avg     0.7202    0.5642    0.4686       139
weighted avg     0.7215    0.5612    0.4670       139



# 0 Preparations
Before starting, ensure that you have cloned the repository to your Google Drive.
We will connect to this:

In [None]:
from google.colab import drive

drive.mount('/content/drive', force_remount=True)
repository = 'evaluating_factuality_word_definitions'

%cd /content/drive/My Drive/{repository}

Next, we install the packages and import the modules needed in this notebook:

In [1]:
from collections import defaultdict
from datasets import load_dataset
import json
import random
from typing import List, Dict
from tqdm import tqdm

from config import PROJECT_DIR
from general_utils.utils import print_classification_report, calc_bin_stats, rank_docs, print_fever_classification_report

from general_utils.reader import JSONLineReader, JSONReader
from pipeline_module.statement_verifier import ModelStatementVerifier
from pipeline_module.evidence_selector import ModelEvidenceSelector
from pipeline_module.translator import OpusMTTranslator
from pipeline_module.sentence_connector import PhiSentenceConnector, ColonSentenceConnector
from pipeline_module.evidence_fetcher import WikipediaEvidenceFetcher
from pipeline_module.pipeline import Pipeline, FeverPipeline
from pipeline_module.claim_splitter import DisSimSplitter, T5SplitRephraseSplitter, FactscoreSplitter

# 1 Setup: Define Models and Datasets
Now we define our models and datasets we want to evaluate:

In [2]:
# Base Models
base_selection_model = 'Snowflake/snowflake-arctic-embed-m-long'
base_verification_model = 'MoritzLaurer/mDeBERTa-v3-base-xnli-multilingual-nli-2mil7'

# Finetuned Models
finetuned_selection_model = 'lukasellinger/evidence_selection_model-v3'
finetuned_verification_model = 'lukasellinger/claim_verification_model-v3'

In [3]:
# Datasets with language information
datasets = {
    'german_dpr-claim_verification': {
        'dataset': load_dataset('lukasellinger/german_dpr-claim_verification', split='test'),
        'lang': 'de'
    },
    'german-claim_verification': {
        'dataset': load_dataset('lukasellinger/german-claim_verification', split='test'),
        'lang': 'de'
    },
    'squad-claim_verification': {
        'dataset': load_dataset('lukasellinger/squad-claim_verification', split='test'),
        'lang': 'en'
    }
}

## 1.1 Evaluation Util Functions
Next we define some helper functions

In [4]:
def evaluate_pipeline(pipeline: Pipeline, dataset, batch_size=4, output_file_name='',
                      only_intro=True):
    outputs, report, not_in_wiki = pipeline.verify_test_dataset(dataset, batch_size,
                                                                output_file_name, only_intro)

    total_claim_count = sum(len(entry['atoms']) for entry in outputs if entry.get('atoms'))
    total_entries_with_atoms = sum(1 for entry in outputs if entry.get('atoms'))

    avg_claim_count = total_claim_count / total_entries_with_atoms if total_entries_with_atoms > 0 else 0

    print_classification_report(report, not_in_wiki, avg_claim_count)

    return outputs


def calc_claim_lengths_stats(outputs: List[Dict]):
    pr_labels, gt_labels, claim_lengths = [], [], []
    for output in outputs:
        pr_labels.append(1 if output['predicted'] == 'SUPPORTED' else 0)
        gt_labels.append(1 if output['label'] == 'SUPPORTED' else 0)
        claim_lengths.append(len(output['connected_claim'].split()))
    return calc_bin_stats(pr_labels, gt_labels, claim_lengths)


def calc_additional_stats(outputs: List[Dict], output_file_name=''):
    evid_line_number_dist = defaultdict(int)
    total_wikipedia = 0
    total_wiktionary = 0
    in_intro = 0

    for output in outputs:
        evidences = output.get('evidence', [])
        for evidence in evidences:
            if evidence.get('title').endswith('(wikipedia)'):
                if evidence.get('in_intro'):
                    in_intro += 1
                total_wikipedia += 1
            else:
                total_wiktionary += 1

            evid_line_number_dist[evidence.get('line_idx')] += 1
    total_evidences = total_wikipedia + total_wiktionary

    stats =  {
        'evid_line_number_dist': dict(sorted(evid_line_number_dist.items())),
        'claim_length_stats': calc_claim_lengths_stats(outputs),
        'avg_sent_0_selected': evid_line_number_dist.get(0) / sum(evid_line_number_dist.values()),
        'avg_in_intro': in_intro / total_wikipedia if total_wikipedia > 0 else 0,
        'in_intro': in_intro,
        'avg_wikipedia': total_wikipedia / total_evidences if total_evidences > 0 else 0,
        'total_wikipedia': total_wikipedia,
        'avg_wiktionary': total_wiktionary / total_evidences if total_evidences > 0 else 0,
        'total_wiktionary': total_wiktionary
    }
    if output_file_name:
        JSONReader().write(f'{output_file_name}.json', stats)
        
    return stats

## 1.2 Initialization of the pipeline modules
Here, we initialize the pipeline modules that will be used later. 
These modules will be loaded onto your device when the first inference step is performed. 
In this notebook, the translator and sentence connector are not directly utilized, but their results are already included in the datasets. We use their names to ensure the correct output is retrieved.

In [5]:
%%capture
# Translator
translator = OpusMTTranslator()

# Sentence Connectors
colon_sentence_connector = ColonSentenceConnector()
phi_sentence_connector = PhiSentenceConnector()

# Evidence Fetcher
offline_evid_fetcher = WikipediaEvidenceFetcher()
online_evid_fetcher = WikipediaEvidenceFetcher(offline=False)

pipeline_models = {
    'base': {
        'evid_selector': ModelEvidenceSelector(model_name=base_selection_model),
        'stm_verifier': ModelStatementVerifier(model_name=base_verification_model)
    },
    'finetuned': {
        'evid_selector': ModelEvidenceSelector(model_name=finetuned_selection_model),
        'stm_verifier': ModelStatementVerifier(model_name=finetuned_verification_model)
    }
}

Exception: data did not match any variant of untagged enum PyPreTokenizerTypeWrapper at line 88 column 3

### 2 Evaluating Finetuned vs. Base Pipeline (Optimal Setup)

In this section, we assess the performance of the base and finetuned pipeline using the identified optimal setup.

#### Datasets and Models

- **Datasets:**
  - `lukasellinger/german_dpr-claim_verification`
  - `lukasellinger/german-claim_verification`
  - `lukasellinger/squad-claim_verification`

- **Base Models:**
  - Selection: `Snowflake/snowflake-arctic-embed-m-long`
  - Verification: `MoritzLaurer/mDeBERTa-v3-base-xnli-multilingual-nli-2mil7`

- **Finetuned Models:**
  - Selection: `lukasellinger/evidence_selection_model-v2`
  - Verification: `lukasellinger/claim_verification_model-v1`

#### Evaluation Strategy

- **Configuration:**
  - Using `OpusMTTranslator`
  - Using `PhiSentenceConnector`
  - No claim splitting
  - Offline Evidence Fetcher for reproducibility (state of 08.07.2024)

In [6]:
output_file_base_name = str(PROJECT_DIR / "data/evaluation/{dataset}_{model}")
for dataset_name, config in datasets.items():
    dataset = config['dataset']
    lang = config['lang']
    for model_name, models in pipeline_models.items():
        print(f"Evaluating {dataset_name} with pipeline {model_name}...")
        pipeline = Pipeline(translator=translator,
                            sent_connector=phi_sentence_connector,
                            claim_splitter=None,
                            evid_fetcher=offline_evid_fetcher,
                            evid_selector=models.get('evid_selector'),
                            stm_verifier=models.get('stm_verifier'),
                            lang=lang)
        outputs = evaluate_pipeline(pipeline, dataset,
                                    output_file_name=output_file_base_name.format(
                                        dataset=dataset_name, model=model_name))
        additional_stats = calc_additional_stats(outputs, f'{output_file_base_name.format(dataset=dataset_name, model=model_name)}_additional_stats')

NameError: name 'pipeline_models' is not defined

# 3 Evaluating Finetuned Pipeline Online (Best Setup)
In the previous section, we evaluated our finetuned pipeline using the offline evidence fetcher. We can also connect our pipeline to the Wikipedia API to retrieve the most current knowledge available.

Let's check on that:

In [None]:
output_file_base_name = str(PROJECT_DIR / "data/evaluation/{dataset}_finetuned_online")

for dataset_name, config in datasets.items():
    dataset = config['dataset']
    lang = config['lang']
    print(f"Evaluating {dataset_name}...")
    pipeline = Pipeline(translator=translator,
                        sent_connector=phi_sentence_connector,
                        claim_splitter=None,
                        evid_fetcher=online_evid_fetcher,
                        evid_selector=pipeline_models['finetuned']['evid_selector'],
                        stm_verifier=pipeline_models['finetuned']['stm_verifier'],
                        lang=lang)
    evaluate_pipeline(pipeline, dataset,
                      output_file_name=output_file_base_name.format(dataset=dataset_name))

# 4 Evaluation of Different Claim Splitters

A sentence can be split into multiple facts, where the combination of these facts represents the entire sentence.

For this evaluation, we test four different splitters:

- **`DisSimSplitter`**: Based on [DiscourseSimplification](https://github.com/Lambda-3/DiscourseSimplification)
- **`T5SplitRephraseSplitter`**: Based on [T5 Split and Rephrase](https://huggingface.co/unikei/t5-base-split-and-rephrase)
- **`FactscoreSplitter`**: Based on [FActScore](https://github.com/shmsw25/FActScore)
- **`None`**: No splitting

In [6]:
%%capture
claim_splitters = {
    'DisSimSplitter': DisSimSplitter(),
    'T5SplitRephraseSplitter': T5SplitRephraseSplitter(),
    'FactscoreSplitter': FactscoreSplitter(),
    'None': None
}

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In this setting, the splits are not calculated. Instead, they are reused as they have already been precomputed and are present in the datasets.

In [7]:
output_file_base_name = str(PROJECT_DIR / "data/evaluation/{dataset}_finetuned_{splitter}")

for dataset_name, config in datasets.items():
    dataset = config['dataset']
    lang = config['lang']
    for name, splitter in claim_splitters.items():
        print(f"Evaluating {dataset_name} with claim splitter {name}...")
        pipeline = Pipeline(translator=translator,
                            sent_connector=phi_sentence_connector,
                            claim_splitter=splitter,
                            evid_fetcher=offline_evid_fetcher,
                            evid_selector=pipeline_models['finetuned']['evid_selector'],
                            stm_verifier=pipeline_models['finetuned']['stm_verifier'],
                            lang=lang)
        evaluate_pipeline(pipeline, dataset,
                          output_file_name=output_file_base_name.format(dataset=dataset_name,
                                                                        splitter=name))

Evaluating german_dpr-claim_verification with claim splitter DisSimSplitter...


  0%|          | 0/42 [00:00<?, ?it/s]<All keys matched successfully>
 10%|▉         | 4/42 [00:57<09:10, 14.47s/it]


KeyboardInterrupt: 

# 5 Whole Wiki Page Run (Best Setup)

In our previous setting, we limited the text fetched to the intro sections of Wikipedia pages. However, we could extend this to use the entire Wikipedia page, which would provide more comprehensive information but would significantly increase the processing time.

Let us try it out:


In [None]:
output_file_base_name = str(PROJECT_DIR / "data/evaluation/{dataset}_finetuned_whole_page")

for dataset_name, config in datasets.items():
    print(f"Evaluating {dataset_name}...")
    dataset = config['dataset']
    lang = config['lang']

    pipeline = Pipeline(translator=translator,
                        sent_connector=phi_sentence_connector,
                        claim_splitter=None,
                        evid_fetcher=offline_evid_fetcher,
                        evid_selector=pipeline_models['finetuned']['evid_selector'],
                        stm_verifier=pipeline_models['finetuned']['stm_verifier'],
                        lang=lang)
    outputs = evaluate_pipeline(pipeline, dataset,
                                output_file_name=output_file_base_name.format(dataset=dataset_name),
                                only_intro=False)
    additional_stats = calc_additional_stats(outputs, f'{output_file_base_name.format(dataset=dataset_name)}_additional_stats')

# 6 Evaluate FEVER Score

Our models were trained on the FEVER dataset. To assess their performance, we use the FEVER Score, a metric specifically designed to evaluate fact-checking systems. The FEVER Score measures both the accuracy of claim verification and the relevance of the retrieved evidence.

The FEVER Score for a sample is determined as follows:
- **Score of 1**: If at least one group of evidence is correctly identified among the selected evidence sentences and the predicted label (Supported, Refuted) matches the true label.
- **Score of 0**: If the above conditions are not met.

In this section, we will evaluate and compare the FEVER Scores of our base and finetuned pipelines.

In [6]:
def evaluate_fever_pipeline(pipeline: FeverPipeline, dataset, batch_size=4, output_file_name=''):
    outputs, report, fever_report = pipeline.verify_test_dataset(dataset, batch_size, output_file_name)
    print_fever_classification_report(report, fever_report)
    return outputs

In [7]:
output_file_base_name = str(PROJECT_DIR / "data/evaluation/fever_{name}")
dataset = load_dataset("lukasellinger/fever_evidence_selection-v1", split='test')
dataset = dataset.map(lambda entry: {'label': 'SUPPORTED' if entry['label'] == 'SUPPORTS' else 'NOT_SUPPORTED'})

In [8]:
for name, models in pipeline_models.items():
    print(f"Evaluating with {name}...")
    pipeline = FeverPipeline(claim_splitter=None,
                             evid_selector=models.get('evid_selector'),
                             stm_verifier=models.get('stm_verifier'))
    evaluate_fever_pipeline(pipeline, dataset, output_file_name=output_file_base_name.format(name=name))

Evaluating with base...


  0%|          | 0/422 [00:00<?, ?it/s]A new version of the following files was downloaded from https://huggingface.co/Snowflake/snowflake-arctic-embed-m-long:
- configuration_hf_nomic_bert.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
A new version of the following files was downloaded from https://huggingface.co/Snowflake/snowflake-arctic-embed-m-long:
- modeling_hf_nomic_bert.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.
<All keys matched successfully>
100%|██████████| 422/422 [1:20:08<00:00, 11.39s/it]     


################################
FeverScore: 0.8943620178041543
Gold FeverScore: 0.9448071216617211
              precision    recall  f1-score   support

           0     0.9324    0.9462    0.9392       874
           1     0.9411    0.9260    0.9335       811

    accuracy                         0.9365      1685
   macro avg     0.9367    0.9361    0.9364      1685
weighted avg     0.9366    0.9365    0.9365      1685

################################
Evaluating with finetuned...


  0%|          | 0/422 [00:00<?, ?it/s]<All keys matched successfully>
100%|██████████| 422/422 [19:14<00:00,  2.74s/it]

################################
FeverScore: 0.913353115727003
Gold FeverScore: 0.9614243323442137
              precision    recall  f1-score   support

           0     0.9601    0.9371    0.9485       874
           1     0.9339    0.9581    0.9458       811

    accuracy                         0.9472      1685
   macro avg     0.9470    0.9476    0.9471      1685
weighted avg     0.9475    0.9472    0.9472      1685

################################





# 7 Check on incorrect predicted

Similar to the approach used in FactScore, we sample 30 incorrect predictions from our test sets, excluding those cases where no evidence is available on Wikipedia. These sampled examples are saved to a separate file for detailed manual evaluation. The analysis focuses on identifying the following types of issues:
- No direct evidence from retrieved passages
- Atomic fact is contex-dependent
- Distracted by other passages
- Wrong prediction even with right passage
- Error in pipeline (e.g translation error)
- Annotation Error

In [47]:
def sample_incorrect_predictions(dataset_names: List[str], file_pattern: str, output_file: str,
                                 sample_size: int = 30, seed: int = 42) -> None:
    """
    Samples incorrect predictions from test sets and saves them to a file for manual assessment.

    Args:
        dataset_names (List[str]): List of dataset names to process.
        file_pattern (str): Pattern to match file names for loading predictions.
        output_file (str): Path to the output file for saving the sampled incorrect predictions.
        sample_size (int): Number of incorrect predictions to sample.
        seed (int): Random seed for reproducibility.
    """
    random.seed(seed)
    all_outputs = []

    for dataset_name in dataset_names:
        file_path = PROJECT_DIR / file_pattern.format(dataset_name=dataset_name)
        outputs = JSONLineReader().read(file_path)
        all_outputs.extend(outputs)
    false_predicted_outputs = [output for output in all_outputs if
                               output['label'] != output['predicted'] and output['predicted'] != -1]

    sample_size = min(sample_size, len(false_predicted_outputs))
    sampled_outputs = random.sample(false_predicted_outputs, sample_size)

    # Print sampled outputs for review
    print(f"Sampled {len(sampled_outputs)} incorrect predictions:")
    for sample in sampled_outputs:
        print(json.dumps(sample, indent=4))
        print('############################')
    JSONLineReader().write(output_file, sampled_outputs)

In [48]:
# Sample incorrect predictions from the finetuned pipeline without Claim Splitting
sample_incorrect_predictions(
    dataset_names=list(datasets.keys()),
    file_pattern="data/evaluation/{dataset_name}_finetuned.jsonl",
    output_file=str(PROJECT_DIR / "data/evaluation/incorrect_pred_samples.jsonl")
)

# Sample incorrect predictions from models with DisSimSplitter
sample_incorrect_predictions(
    dataset_names=list(datasets.keys()),
    file_pattern="data/evaluation/{dataset_name}_finetuned_DisSimSplitter.jsonl",
    output_file=str(
        PROJECT_DIR / "data/evaluation/incorrect_pred_samples_DisSimSplitter.jsonl")
)

Sampled 30 incorrect predictions:
{
    "id": 15,
    "word": "Starbright Foundation",
    "claim": "schwer kranken Kindern hilft",
    "connected_claim": "The Starbright Foundation symbolizes the support and care provided to severely ill children.",
    "label": "SUPPORTED",
    "predicted": "NOT_SUPPORTED",
    "factuality": 0.0,
    "atoms": [
        {
            "atom": "The Starbright Foundation symbolizes the support and care provided to severely ill children.",
            "predicted": "NOT_SUPPORTED"
        }
    ],
    "evidence": [
        {
            "title": "Starlight Children's Foundation (wikipedia)",
            "line_idx": 1,
            "text": "Starlight's programs include providing hospital wear, games, and deliveries to hospitalized children.",
            "sim": 0.6311880946159363,
            "in_intro": true
        },
        {
            "title": "Starlight Children's Foundation (wikipedia)",
            "line_idx": 0,
            "text": "Starlight Chil

### 8. BM25 Selection (Whole Page vs. Intro Page) (Best Setup)

In our document selection process, we first retrieve all pages related to a specific word. Given the potentially large number of pages and the associated computation time, we use the BM25 algorithm to select the top 3 most relevant pages for the claim.

In this section, we evaluate whether using only the intro sections of these pages is sufficient for effective BM25 selection, which also reduces computation time. We will compare the results of selecting from intro sections versus the full pages to determine if the selections are consistent and if the top-ranked pages are the same in both scenarios.

In [25]:
for dataset_name, config in datasets.items():
    print(f"Evaluating {dataset_name}...")
    dataset = config['dataset']
    lang = config['lang']

    equal_evids = 0
    first_evid_match = 0
    first_intro_match = 0
    first_full_match = 0
    for entry in tqdm(dataset):
        if entry['in_wiki'] == 'No':
            continue

        _, intro_evids = offline_evid_fetcher.fetch_evidences(
            search_word=entry['document_search_word'], only_intro=True)
        _, full_evids = offline_evid_fetcher.fetch_evidences(
            search_word=entry['document_search_word'], only_intro=False)

        intro_evids_indices = rank_docs(entry['connected_claim'],
                                        [" ".join(evidence.get('lines')) for evidence in
                                         intro_evids], k=3)
        selected_intro_pages = [intro_evids[idx].get('title') for idx in intro_evids_indices]
        full_evids_indices = rank_docs(entry['connected_claim'],
                                       [" ".join(evidence.get('lines')) for evidence in full_evids],
                                       k=3)
        selected_full_pages = [full_evids[idx].get('title') for idx in full_evids_indices]

        if set(selected_intro_pages) == set(selected_full_pages):
            equal_evids += 1
        if selected_intro_pages[0] == selected_full_pages[0]:
            first_evid_match += 1
        if selected_intro_pages[0] in selected_full_pages:
            first_intro_match += 1    
        if selected_full_pages[0] in selected_intro_pages:
            first_full_match += 1   

    print(f"All Evidence Match: {round(100 * equal_evids / len(dataset), 2)}%")
    print(f"First Evidence Match: {round(100 * first_evid_match / len(dataset), 2)}%")
    print(f"First Intro in Full Pages: {round(100 * first_intro_match / len(dataset), 2)}%")
    print(f"First Full Page in Intro: {round(100 * first_full_match / len(dataset), 2)}%")

Evaluating german_dpr-claim_verification...


100%|██████████| 168/168 [06:06<00:00,  2.18s/it]


All Evidence Match: 59.52
First Evidence Match: 60.71
Evaluating german-claim_verification...


 10%|█         | 73/710 [10:33<1:32:04,  8.67s/it]


KeyboardInterrupt: 