# 0 Preparations
Before starting, make sure that you have cloned the repository to your Google Drive.
The next cell mounts the drive and changes into the repository directory (only needed when running on Google Colab):

In [None]:
# Mount Google Drive at /content/drive inside the Colab runtime.
from google.colab import drive

drive.mount('/content/drive', force_remount=True)
# Folder name of the cloned repository inside "My Drive".
repository = 'evaluating_factuality_word_definitions'

# Make the repository the working directory; IPython interpolates {repository}.
# Presumably this is what lets the project-local imports below resolve — confirm.
%cd /content/drive/My Drive/{repository}

Next, we install the packages and import the modules needed in this notebook:

In [None]:
%%capture
!pip install datasets~=2.18.0
!pip install openai~=1.35.10

In [1]:
from pathlib import Path

import numpy as np
from datasets import load_dataset
from sklearn.metrics import classification_report
from tqdm import tqdm

from config import FACT_EVULATION_OPENAI_TOKEN, PROJECT_DIR
from fetchers.openai import OpenAiFetcher
from general_utils.reader import JSONLineReader
from general_utils.utils import get_openai_prediction, parse_model_answer

# 1 Setup: Define Datasets
Now we define the models and datasets we want to evaluate:

In [13]:
# Datasets with language information.
# Each key is the dataset name used in the evaluation file paths; each value holds
# the loaded `test` split ('dataset') and the dataset language ('lang': 'de' or 'en').
datasets = {
    'german_dpr-claim_verification': {
        'dataset': load_dataset('lukasellinger/german_dpr-claim_verification', split='test'),
        'lang': 'de'
    },
    'german_wiktionary-claim_verification-mini': {
        'dataset': load_dataset('lukasellinger/german_wiktionary-claim_verification-mini', split='test'),
        'lang': 'de'
    },
    'squad-claim_verification': {
        'dataset': load_dataset('lukasellinger/squad-claim_verification', split='test'),
        'lang': 'en'
    },
    'shroom-claim_verification': {
        'dataset': load_dataset('lukasellinger/shroom-claim_verification', split='test'),
        'lang': 'en'
    },
    ## optional (contains 10k entries)
    #'german_wiktionary-claim_verification-large': {
    #    'dataset': load_dataset('lukasellinger/german_wiktionary-claim_verification-large', split='test'),
    #    'lang': 'de'
    #},
    # outdated
    #'german-claim_verification': {
    #    'dataset': load_dataset('lukasellinger/german-claim_verification', split='test'),
    #    'lang': 'de'
    #},
}

In [14]:
# Model names written into the "model" field of every batch request.
models = ['gpt-3.5-turbo', 'gpt-4o-mini', 'gpt-4o']

In [15]:
# Client used below to create batch jobs, poll their status and download results.
openai_fetcher = OpenAiFetcher(api_key=FACT_EVULATION_OPENAI_TOKEN)
# File helper used throughout the notebook to read and write the .jsonl task/result files.
fh = JSONLineReader()

In [16]:
# Root directory of all evaluation inputs/outputs; every `file_base_name` template is built on it.
EVALUATION_DIR = PROJECT_DIR / 'data/evaluation'

In [40]:
def build_task(idx, model, content, temperature=0.1, seed=42, top_logprobs=5):
    """Build one chat-completion request for a batch input file.

    Args:
        idx: Identifier appended to ``"task-"`` to form the request's ``custom_id``
            (an entry index, or ``"<entry>-<atom>"`` for atomic facts).
        model: Model name placed in the request body.
        content: Prompt text sent as the single user message.
        temperature: Sampling temperature (default 0.1, the value used so far).
        seed: Sampling seed (default 42, the value used so far).
        top_logprobs: Number of top log-probabilities requested per token
            (default 5, the value used so far).

    Returns:
        dict: The request with ``custom_id``, ``method``, ``url`` and ``body``.
    """
    return {
        "custom_id": f"task-{idx}",
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": {
            "model": model,
            "temperature": temperature,
            "messages": [
                {
                    "role": "user",
                    "content": content
                }
            ],
            "seed": seed,
            # Log-probabilities are requested so the prediction can be derived from them.
            "logprobs": True,
            "top_logprobs": top_logprobs
        }
    }


def create_tasks(dataset, model, file_name, prompt_func):
    """Write one batch request per dataset entry to ``file_name``.

    Args:
        dataset: Iterable of entries; the enumeration index becomes the task id.
        model: Model name passed to ``build_task``.
        file_name: Destination file, written through ``fh`` in mode ``'w'``.
        prompt_func: Callable turning an entry into the prompt text.
    """
    tasks = []
    for position, entry in enumerate(dataset):
        prompt = prompt_func(entry)
        tasks.append(build_task(position, model, prompt))
    fh.write(file_name, tasks, mode='w')


def create_long_german_prompt(entry):
    """Return the German verification prompt for ``entry["word"]`` and ``entry["claim"]``."""
    # NOTE(review): the wording is reproduced verbatim (it lacks "Sie" after
    # "überprüfen"); presumably existing batch outputs were generated with exactly
    # this text — confirm before correcting the grammar.
    word = entry["word"]
    claim = entry["claim"]
    return f'Bitte überprüfen die folgende Aussage über {word}. Aussage: {claim} Wahr oder Falsch?\nAusgabe:'

    
def create_long_prompt(entry):
    """Return the long English prompt built from ``entry['word']`` and ``entry['claim']``."""
    word = entry['word']
    claim = entry['claim']
    return create_long_prompt_from_claim(word, claim)


def create_connected_prompt(entry):
    """Return the English prompt that verifies ``entry["connected_claim"]`` without naming the word."""
    statement = entry["connected_claim"]
    return f'Please verify the following statement. Input: {statement} True or False?\nOutput:'


def create_long_prompt_from_claim(word, claim):
    """Return the long English prompt asking whether ``claim`` about ``word`` is true or false."""
    prompt = f'Please verify the following statement about {word}. Input: {claim} True or False?\nOutput:'
    return prompt


def create_long_translation_prompt(entry):
    """Return the long English prompt built from ``entry["english_word"]`` and ``entry["english_claim"]``."""
    word = entry["english_word"]
    claim = entry["english_claim"]
    return f'Please verify the following statement about {word}. Input: {claim} True or False?\nOutput:'


def create_short_prompt(entry):
    """Return the short prompt of the form ``Input: <word>: <claim> True or False?``."""
    word = entry["word"]
    claim = entry["claim"]
    return f'Input: {word}: {claim} True or False?\nOutput:'
    
    
def process_results(file_name, dataset, translations: bool, lang='en'):
    """Score one raw batch-output file against ``dataset`` and print the reports.

    Each result's ``custom_id`` ("task-<index>") selects the dataset entry it
    belongs to. The prediction comes from ``get_openai_prediction``; when that
    yields ``'UNKNOWN'`` the message text is parsed with ``parse_model_answer``.

    Args:
        file_name: Raw batch-output file, read through ``fh``.
        dataset: Indexable collection of entries providing ``id``, ``word``,
            ``claim``, ``label`` and ``in_wiki`` (plus ``english_word`` and
            ``english_claim`` when ``translations`` is true).
        translations: Whether to copy the English word/claim into each output.
        lang: Language passed to ``parse_model_answer`` for the text fallback.

    Returns:
        list[dict]: One output record per result, in file order.
    """
    def as_binary(value):
        # Everything other than 'SUPPORTED' counts as the negative class.
        return 1 if value == 'SUPPORTED' else 0

    outputs = []
    all_gt_labels, all_pr_labels = [], []
    wiki_gt_labels, wiki_pr_labels = [], []

    for res in fh.read(file_name):
        entry = dataset[int(res['custom_id'].split('-')[-1])]
        body = res['response']['body']

        predicted = get_openai_prediction(body)
        if predicted == 'UNKNOWN':
            # Fall back to parsing the plain-text answer.
            predicted = parse_model_answer(body['choices'][0]['message']['content'], language=lang)

        output = {
            'id': entry['id'],
            'word': entry['word'],
            'claim': entry['claim'],
            'label': entry['label'],
            'predicted': predicted,
            'in_wiki': entry['in_wiki'],
        }
        if translations:
            output.update(
                translated_word=entry['english_word'],
                translated_claim=entry['english_claim'],
            )
        outputs.append(output)

        gt_label, pr_label = as_binary(output['label']), as_binary(output['predicted'])
        all_gt_labels.append(gt_label)
        all_pr_labels.append(pr_label)
        # Entries flagged in_wiki == 'Yes' additionally feed the filtered report.
        if output['in_wiki'] == 'Yes':
            wiki_gt_labels.append(gt_label)
            wiki_pr_labels.append(pr_label)

    report_options = {'zero_division': 0, 'digits': 4}
    report = classification_report(all_gt_labels, all_pr_labels, **report_options)
    wiki_report = classification_report(wiki_gt_labels, wiki_pr_labels, **report_options)
    print('Report all entries:')
    print(report)
    print('Filtered Report for entries with evidence in wikipedia:')
    print(wiki_report)
    return outputs

# 2 Manual Batch Fetching

In [20]:
# Maps each submitted batch-input file path to its most recently fetched batch job.
# Re-running this cell discards all tracked jobs.
batch_jobs = {}

In [33]:
# Submit one prepared input file as a batch job and track it under its input path.
# Edit the path and re-run this cell once for every input file to be submitted.
input_file_name = str(EVALUATION_DIR / 'german_dpr-claim_verification/zero_shot/input/input_zero_shot_connected_prompt-german_dpr-claim_verification-gpt-4o-mini.jsonl')
batch_job = openai_fetcher.create_batch_job(input_file_name)
batch_jobs[input_file_name] = batch_job

In [34]:
# Number of batch jobs currently tracked.
len(batch_jobs)

12

In [36]:
# Refresh every tracked batch job and print its current state.
# Only existing keys are reassigned, so the set of tracked files is unchanged.
for file_name, batch_job in list(batch_jobs.items()):
    batch_jobs[file_name] = batch_job = openai_fetcher.get_batch_update(batch_job)
    print(file_name, batch_job, "_______________", sep='\n')

/Users/lukasellinger/PycharmProjects/evaluating_factuality_word_definitions/data/evaluation/squad-claim_verification/zero_shot/input/input_zero_shot_connected_prompt-squad-claim_verification-gpt-4o-mini.jsonl
Batch(id='batch_66fab332d7f081908f5352b706b12879', completion_window='24h', created_at=1727705907, endpoint='/v1/chat/completions', input_file_id='file-jMqHUJ260guskRlHM4X17dMT', object='batch', status='completed', cancelled_at=None, cancelling_at=None, completed_at=1727706044, error_file_id=None, errors=None, expired_at=None, expires_at=1727792307, failed_at=None, finalizing_at=1727706032, in_progress_at=1727705907, metadata=None, output_file_id='file-nFVtlkaKDFh6bVzNe7bjGG6v', request_counts=BatchRequestCounts(completed=158, failed=0, total=158))
_______________
/Users/lukasellinger/PycharmProjects/evaluating_factuality_word_definitions/data/evaluation/squad-claim_verification/zero_shot/input/input_zero_shot_connected_prompt-squad-claim_verification-gpt-4o.jsonl
Batch(id='batch_

In [37]:
# Download the result of every tracked batch job next to its input file.
for file_name, batch_job in batch_jobs.items():
    # str.replace rewrites every 'input' occurrence: both the 'input/' directory and
    # the 'input_' file-name prefix become 'raw_output'.
    # NOTE(review): an 'input' elsewhere in the absolute path would be rewritten too.
    output_file_name = file_name.replace('input', 'raw_output')
    openai_fetcher.get_batch_result(output_file_name, batch_job)

# 3 Zero Shot

In [22]:
# Path template for the zero-shot files; `type` is 'input', 'raw_output' or 'output'.
# `file_base_name` is reassigned in every section, so re-run this cell before the
# other cells of this section.
file_base_name = str(EVALUATION_DIR / '{dataset}/zero_shot/{type}/{type}_zero_shot-{dataset}-{model}.jsonl')

In [8]:
# Write one batch-input file per (dataset, model) pair using the long English prompt.
for dataset_name, config in tqdm(datasets.items()):
    dataset = config['dataset']
    for model in models:
        input_file = file_base_name.format(type='input', dataset=dataset_name, model=model)
        create_tasks(dataset, model, input_file, create_long_prompt)

100%|██████████| 1/1 [00:00<00:00,  3.70it/s]


Once the input files are created, go to section 2 (Manual Batch Fetching) to submit the batch jobs and fetch the outputs manually.

In [23]:
# Evaluate every (dataset, model) pair whose raw batch output has been fetched.
for dataset_name, config in datasets.items():
    dataset = config['dataset']
    print(f"{dataset_name}: _________________________")
    for model in models:
        file_name = file_base_name.format(type='raw_output', dataset=dataset_name, model=model)
        if Path(file_name).exists():
            print(f"Evaluating {dataset_name} with {model}...")
            outputs = process_results(file_name, dataset, translations=False)
            fh.write(file_base_name.format(type='output', dataset=dataset_name, model=model), outputs, mode='w')

german_dpr-claim_verification: _________________________
Evaluating german_dpr-claim_verification with gpt-3.5-turbo...
Report all entries:
              precision    recall  f1-score   support

           0     0.9474    0.4286    0.5902        84
           1     0.6308    0.9762    0.7664        84

    accuracy                         0.7024       168
   macro avg     0.7891    0.7024    0.6783       168
weighted avg     0.7891    0.7024    0.6783       168

Filtered Report for entries with evidence in wikipedia:
              precision    recall  f1-score   support

           0     0.9375    0.4348    0.5941        69
           1     0.6355    0.9714    0.7684        70

    accuracy                         0.7050       139
   macro avg     0.7865    0.7031    0.6812       139
weighted avg     0.7854    0.7050    0.6818       139

Evaluating german_dpr-claim_verification with gpt-4o-mini...
Report all entries:
              precision    recall  f1-score   support

           0  

# 4 German Prompt

In [15]:
# Path template for the German-prompt files; `type` is 'input', 'raw_output' or 'output'.
# `file_base_name` is reassigned in every section, so re-run this cell before the
# other cells of this section.
file_base_name = str(EVALUATION_DIR / '{dataset}/zero_shot/{type}/{type}_zero_shot_german_prompt-{dataset}-{model}.jsonl')

In [17]:
# Write German-prompt input files; datasets marked as English are skipped.
for dataset_name, config in tqdm(datasets.items()):
    if config['lang'] != 'en':
        dataset = config['dataset']
        for model in models:
            input_file = file_base_name.format(type='input', dataset=dataset_name, model=model)
            create_tasks(dataset, model, input_file, create_long_german_prompt)

100%|██████████| 1/1 [00:00<00:00, 10433.59it/s]


Once the input files are created, go to section 2 (Manual Batch Fetching) to submit the batch jobs and fetch the outputs manually.

In [17]:
# Evaluate every (dataset, model) pair whose raw German-prompt output has been fetched.
# Free-text answers are parsed as German (lang='de').
for dataset_name, config in datasets.items():
    dataset = config['dataset']
    print(f"{dataset_name}: _________________________")
    for model in models:
        file_name = file_base_name.format(type='raw_output', dataset=dataset_name, model=model)
        if Path(file_name).exists():
            print(f"Evaluating {dataset_name} with {model}...")
            outputs = process_results(file_name, dataset, translations=False, lang='de')
            fh.write(file_base_name.format(type='output', dataset=dataset_name, model=model), outputs, mode='w')

german_dpr-claim_verification: _________________________
Evaluating german_dpr-claim_verification with gpt-3.5-turbo...
Report all entries:
              precision    recall  f1-score   support

           0     0.8000    0.9048    0.8492        84
           1     0.8904    0.7738    0.8280        84

    accuracy                         0.8393       168
   macro avg     0.8452    0.8393    0.8386       168
weighted avg     0.8452    0.8393    0.8386       168

Filtered Report for entries with evidence in wikipedia:
              precision    recall  f1-score   support

           0     0.8243    0.8841    0.8531        69
           1     0.8769    0.8143    0.8444        70

    accuracy                         0.8489       139
   macro avg     0.8506    0.8492    0.8488       139
weighted avg     0.8508    0.8489    0.8488       139

Evaluating german_dpr-claim_verification with gpt-4o-mini...
Report all entries:
              precision    recall  f1-score   support

           0  

# 5 Connected Claim

In [38]:
# Path template for the connected-claim files; `type` is 'input', 'raw_output' or 'output'.
# `file_base_name` is reassigned in every section, so re-run this cell before the
# other cells of this section.
file_base_name = str(EVALUATION_DIR / '{dataset}/zero_shot/{type}/{type}_zero_shot_connected_prompt-{dataset}-{model}.jsonl')

In [19]:
# Write one connected-claim input file per (dataset, model) pair.
for dataset_name, config in tqdm(datasets.items()):
    dataset = config['dataset']
    for model in models:
        input_file = file_base_name.format(type='input', dataset=dataset_name, model=model)
        create_tasks(dataset, model, input_file, create_connected_prompt)

100%|██████████| 4/4 [00:00<00:00,  8.82it/s]


Once the input files are created, go to section 2 (Manual Batch Fetching) to submit the batch jobs and fetch the outputs manually.

In [39]:
# Evaluate every (dataset, model) pair whose raw connected-claim output has been fetched.
for dataset_name, config in datasets.items():
    dataset = config['dataset']
    print(f"{dataset_name}: _________________________")
    for model in models:
        file_name = file_base_name.format(type='raw_output', dataset=dataset_name, model=model)
        if not Path(file_name).exists():
            continue
        print(f"Evaluating {dataset_name} with {model}...")
        # The connected prompt is written in English ("True or False?"), so free-text
        # answers are parsed as English, like the other English-prompt evaluation
        # (the previous lang='de' was carried over from the German-prompt section).
        outputs = process_results(file_name, dataset, translations=False, lang='en')
        fh.write(file_base_name.format(type='output', dataset=dataset_name, model=model), outputs, mode='w')

german_dpr-claim_verification: _________________________
Evaluating german_dpr-claim_verification with gpt-3.5-turbo...
Report all entries:
              precision    recall  f1-score   support

           0     0.9241    0.8690    0.8957        84
           1     0.8764    0.9286    0.9017        84

    accuracy                         0.8988       168
   macro avg     0.9002    0.8988    0.8987       168
weighted avg     0.9002    0.8988    0.8987       168

Filtered Report for entries with evidence in wikipedia:
              precision    recall  f1-score   support

           0     0.9355    0.8406    0.8855        69
           1     0.8571    0.9429    0.8980        70

    accuracy                         0.8921       139
   macro avg     0.8963    0.8917    0.8917       139
weighted avg     0.8960    0.8921    0.8918       139

Evaluating german_dpr-claim_verification with gpt-4o-mini...
Report all entries:
              precision    recall  f1-score   support

           0  

# 5 Ablation Translated Claims

In [18]:
# Path template for the translated-claims ablation; `type` is 'input', 'raw_output' or 'output'.
# `file_base_name` is reassigned in every section, so re-run this cell before the
# other cells of this section.
file_base_name = str(EVALUATION_DIR / '{dataset}/ablation_translated_claims/{type}/{type}_ablation_translated_claims-{dataset}-{model}.jsonl')

In [38]:
# Write translated-claim input files; datasets marked as English are skipped.
for dataset_name, config in tqdm(datasets.items()):
    if config['lang'] != 'en':
        dataset = config['dataset']
        for model in models:
            input_file = file_base_name.format(type='input', dataset=dataset_name, model=model)
            create_tasks(dataset, model, input_file, create_long_translation_prompt)

100%|██████████| 3/3 [00:00<00:00, 19.95it/s]


Once the input files are created, go to section 2 (Manual Batch Fetching) to submit the batch jobs and fetch the outputs manually.

In [19]:
# Evaluate the translated-claims ablation for every non-English dataset.
for dataset_name, config in datasets.items():
    if config['lang'] == 'en':
        continue

    dataset = config['dataset']
    print(f"{dataset_name}: _________________________")
    for model in models:
        file_name = file_base_name.format(type='raw_output', dataset=dataset_name, model=model)
        # Skip combinations whose batch output has not been fetched yet instead of
        # failing on the missing file (same guard as the other evaluation cells).
        if not Path(file_name).exists():
            continue
        print(f"Evaluating {dataset_name} with {model}...")
        # translations=True stores the English word/claim that the prompt was built
        # from alongside the original entry in the output file.
        outputs = process_results(file_name, dataset, translations=True)
        fh.write(file_base_name.format(type='output', dataset=dataset_name, model=model), outputs, mode='w')

german_dpr-claim_verification: _________________________
Evaluating german_dpr-claim_verification with gpt-3.5-turbo...
Report all entries:
              precision    recall  f1-score   support

           0     0.9403    0.7500    0.8344        84
           1     0.7921    0.9524    0.8649        84

    accuracy                         0.8512       168
   macro avg     0.8662    0.8512    0.8497       168
weighted avg     0.8662    0.8512    0.8497       168

Filtered Report for entries with evidence in wikipedia:
              precision    recall  f1-score   support

           0     0.9259    0.7246    0.8130        69
           1     0.7765    0.9429    0.8516        70

    accuracy                         0.8345       139
   macro avg     0.8512    0.8337    0.8323       139
weighted avg     0.8507    0.8345    0.8324       139

Evaluating german_dpr-claim_verification with gpt-4o-mini...
Report all entries:
              precision    recall  f1-score   support

           0  

# 6 Zero Shot Single Facts

In [41]:
# Dataset columns that hold a claim split into atomic facts (joined by '--;--').
claim_split_types = ['DisSim_facts', 'T5SplitRephrase_facts', 'Factscore_facts']

In [42]:
# Path template for the single-fact files; besides `dataset`, `type` and `model` it
# takes `split_type` (one of `claim_split_types`).
# `file_base_name` is reassigned in every section, so re-run this cell before the
# other cells of this section.
file_base_name = str(EVALUATION_DIR / '{dataset}/zero_shot_{split_type}/{type}/{type}_zero_shot_{split_type}-{dataset}-{model}.jsonl')

In [35]:
# Write one input file per (dataset, model, split type) with one request per atomic fact.
for dataset_name, config in tqdm(datasets.items()):
    dataset = config['dataset']
    for model in models:
        for split_type in claim_split_types:
            # Task ids have the form "<entry index>-<atom index>"; the English word is
            # used when the entry provides one, otherwise the original word.
            tasks = [
                build_task(
                    f'{idx}-{aidx}',
                    model,
                    create_long_prompt_from_claim(entry.get('english_word', entry['word']), atomic_fact),
                )
                for idx, entry in enumerate(dataset)
                for aidx, atomic_fact in enumerate(entry[split_type].split('--;--'))
            ]
            fh.write(file_base_name.format(type='input', dataset=dataset_name, model=model, split_type=split_type), tasks, mode='w')

100%|██████████| 4/4 [00:02<00:00,  1.91it/s]


Once the input files are created, go to section 2 (Manual Batch Fetching) to submit the batch jobs and fetch the outputs manually.

In [43]:
def evaluate_model(dataset_name, dataset, model, split_type):
    """Evaluate one (dataset, model, split type) combination and print its reports.

    Reads the raw batch output, aggregates the atomic-fact predictions per entry,
    writes the aggregated entries to the matching 'output' file and prints a
    report over all entries and one over the entries with ``in_wiki == 'Yes'``.
    """
    print(f"Evaluating {dataset_name} with {model} - {split_type}...")

    def path_for(kind):
        # Resolve the section's path template for the given file type.
        return file_base_name.format(type=kind, dataset=dataset_name, model=model, split_type=split_type)

    results = fh.read(path_for('raw_output'))

    data_dict = init_data_dict(dataset)
    process_results(results, dataset, data_dict, split_type)
    gt_labels, pr_labels, wiki_gt_labels, wiki_pr_labels = generate_labels(data_dict)

    fh.write(path_for('output'), data_dict.values(), mode='w')
    report_options = {'zero_division': 0, 'digits': 4}
    report = classification_report(gt_labels, pr_labels, **report_options)
    wiki_report = classification_report(wiki_gt_labels, wiki_pr_labels, **report_options)
    print('Report all entries:')
    print(report)
    print('Filtered Report for entries with evidence in wikipedia:')
    print(wiki_report)

def init_data_dict(dataset):
    """Create the per-entry result records, keyed by entry id.

    Each record copies ``id``, ``word``, ``claim``, ``label`` and ``in_wiki`` from
    the entry and starts with ``predicted == -1`` and an empty ``atoms`` list.
    """
    return {
        entry['id']: {
            **{field: entry[field] for field in ('id', 'word', 'claim', 'label')},
            'predicted': -1,
            'atoms': [],
            'in_wiki': entry['in_wiki'],
        }
        for entry in dataset
    }

def process_results(results, dataset, data_dict, split_type):
    """Attach every atomic-fact prediction in ``results`` to its entry in ``data_dict``.

    NOTE: this definition shadows the four-argument ``process_results`` defined in
    section 1. After running this cell, the evaluation cells of the earlier
    sections only work again once the section 1 cell has been re-executed.

    Args:
        results: Iterable of raw batch results whose ``custom_id`` has the form
            ``task-<entry index>-<atom index>``.
        dataset: Indexable collection of entries providing ``id`` and ``split_type``.
        data_dict: Records keyed by entry id; ``atoms`` is extended in place.
        split_type: Name of the entry field holding the '--;--'-joined atomic facts.
    """
    for res in results:
        id_parts = res['custom_id'].split('-')
        index = int(id_parts[1])
        atom_index = int(id_parts[2])

        entry = dataset[index]
        atom = entry[split_type].split('--;--')[atom_index]

        body = res['response']['body']
        predicted = get_openai_prediction(body)
        # Fixed sentinel: the check used to compare against the misspelled 'UNKOWN',
        # so the text fallback below never ran.
        if predicted == 'UNKNOWN':
            txt_answer = body['choices'][0]['message']['content']
            predicted = parse_model_answer(txt_answer)

        data_dict[entry['id']]['atoms'].append({"atom": atom, "predicted": predicted})

def generate_labels(data_dict):
    """Aggregate atomic predictions per entry and return binary label lists.

    An entry is predicted ``'SUPPORTED'`` only if it has at least one atom and
    every atom is predicted ``'SUPPORTED'``; otherwise ``'NOT_SUPPORTED'``. The
    result is also stored in the entry's ``predicted`` field.

    Args:
        data_dict: Records keyed by entry id with ``label``, ``atoms`` and ``in_wiki``.

    Returns:
        tuple: ``(gt_labels, pr_labels, wiki_gt_labels, wiki_pr_labels)`` with
        1 for ``'SUPPORTED'`` and 0 otherwise; the ``wiki_*`` lists only cover
        entries with ``in_wiki == 'Yes'``.
    """
    gt_labels, wiki_gt_labels = [], []
    pr_labels, wiki_pr_labels = [], []

    for entry in data_dict.values():
        atoms = entry['atoms']
        # An entry without atoms (e.g. no results returned for it) stays
        # NOT_SUPPORTED; checking emptiness explicitly avoids averaging an empty
        # list, which triggers a numpy RuntimeWarning and yields NaN.
        all_supported = bool(atoms) and all(atom['predicted'] == 'SUPPORTED' for atom in atoms)
        entry['predicted'] = 'SUPPORTED' if all_supported else 'NOT_SUPPORTED'

        gt_label = 1 if entry['label'] == 'SUPPORTED' else 0
        pr_label = 1 if all_supported else 0
        gt_labels.append(gt_label)
        pr_labels.append(pr_label)

        if entry['in_wiki'] == 'Yes':
            wiki_gt_labels.append(gt_label)
            wiki_pr_labels.append(pr_label)

    return gt_labels, pr_labels, wiki_gt_labels, wiki_pr_labels

In [44]:
# Evaluate every dataset / model / split-type combination.
# evaluate_model reads the raw output file without checking that it exists, so all
# combinations must have been fetched (section 2) before running this cell.
for dataset_name, config in datasets.items():
    dataset = config['dataset']
    print(f"{dataset_name}: _________________________")
    for model in models:
        for split_type in claim_split_types:
            evaluate_model(dataset_name, dataset, model, split_type)

german_dpr-claim_verification: _________________________
Evaluating german_dpr-claim_verification with gpt-3.5-turbo - DisSim_facts...
Report all entries:
              precision    recall  f1-score   support

           0     0.8144    0.9405    0.8729        84
           1     0.9296    0.7857    0.8516        84

    accuracy                         0.8631       168
   macro avg     0.8720    0.8631    0.8623       168
weighted avg     0.8720    0.8631    0.8623       168

Filtered Report for entries with evidence in wikipedia:
              precision    recall  f1-score   support

           0     0.8025    0.9420    0.8667        69
           1     0.9310    0.7714    0.8438        70

    accuracy                         0.8561       139
   macro avg     0.8668    0.8567    0.8552       139
weighted avg     0.8672    0.8561    0.8551       139

Evaluating german_dpr-claim_verification with gpt-3.5-turbo - T5SplitRephrase_facts...
Report all entries:
              precision    r