# 0 Preparations
Before starting, make sure that you have cloned the repository to your Google Drive.
The next cell mounts the drive and switches into the repository folder:

In [None]:
from google.colab import drive

# Mount Google Drive at /content/drive.
drive.mount('/content/drive', force_remount=True)
# Folder name of the cloned repository inside "My Drive".
repository = 'evaluating_factuality_word_definitions'

# Switch the working directory to the repository; IPython expands {repository}.
# Presumably needed so the project-local modules imported below can be found.
%cd /content/drive/My Drive/{repository}

Next, we install the packages and import the modules needed in this notebook:

In [None]:
%%capture
!pip install datasets~=2.18.0
!pip install openai~=1.35.10

In [1]:
from datasets import load_dataset
from tqdm import tqdm
import numpy as np
from sklearn.metrics import classification_report

from fetchers.openai import OpenAiFetcher
from general_utils.reader import JSONLineReader
from general_utils.utils import parse_model_answer, get_openai_prediction
from config import FACT_EVULATION_OPENAI_TOKEN, PROJECT_DIR

# 1 Setup: Define Datasets
Now we define the models and datasets we want to evaluate:

In [2]:
# Evaluation datasets, keyed by dataset name. Each value holds the loaded
# 'test' split under 'dataset' and a language code under 'lang' ('de' or 'en').
# The translated-claims ablation further below skips datasets whose 'lang' is 'en'.
datasets = {
    'german_dpr-claim_verification': {
        'dataset': load_dataset('lukasellinger/german_dpr-claim_verification', split='test'),
        'lang': 'de'
    },
    'german_wiktionary-claim_verification-mini': {
        'dataset': load_dataset('lukasellinger/german_wiktionary-claim_verification-mini', split='test'),
        'lang': 'de'
    },
    'squad-claim_verification': {
        'dataset': load_dataset('lukasellinger/squad-claim_verification', split='test'),
        'lang': 'en'
    },
    # optional (contains 10k entries)
    #'german_wiktionary-claim_verification-large': {
    #    'dataset': load_dataset('lukasellinger/german_wiktionary-claim_verification-large', split='test'),
    #    'lang': 'de'
    #},
    # outdated
    #'german-claim_verification': {
    #    'dataset': load_dataset('lukasellinger/german-claim_verification', split='test'),
    #    'lang': 'de'
    #},
}

In [3]:
# Model names to evaluate; each one is written into the "model" field of the
# batch requests and into the input/output file names.
models = [
    'gpt-3.5-turbo',
    'gpt-4o-mini',
    'gpt-4o'
]

In [4]:
# Client used in Section 2 to create batch jobs, poll them and fetch their results.
openai_fetcher = OpenAiFetcher(api_key=FACT_EVULATION_OPENAI_TOKEN)
# File helper used throughout the notebook via fh.read(...) / fh.write(...).
fh = JSONLineReader()

In [5]:
# Root directory for all evaluation files (inputs, raw outputs, outputs).
EVALUATION_DIR = PROJECT_DIR / 'data/evaluation'

In [37]:
def build_task(idx, model, content, *, temperature=0.1, seed=42, top_logprobs=5):
    """Build one chat-completions request entry for a batch input file.

    Args:
        idx: Identifier appended to ``task-`` to form the ``custom_id``. The
            result-processing functions in this notebook parse it back out of
            the ``custom_id`` to find the matching dataset entry.
        model: Model name placed in the request body.
        content: Prompt text, sent as a single ``user`` message.
        temperature: Sampling temperature (default 0.1, the value that was
            previously hard-coded).
        seed: Seed placed in the request body (default 42).
        top_logprobs: Number of top log-probabilities requested per token
            (default 5). ``logprobs`` is always enabled.

    Returns:
        dict with the keys ``custom_id``, ``method``, ``url`` and ``body``.
    """
    body = {
        "model": model,
        "temperature": temperature,
        "messages": [
            {
                "role": "user",
                "content": content
            }
        ],
        "seed": seed,
        "logprobs": True,
        "top_logprobs": top_logprobs
    }
    return {
        "custom_id": f"task-{idx}",
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": body
    }


def create_tasks(dataset, model, file_name, prompt_func):
    """Write one batch request per dataset entry to ``file_name``.

    Args:
        dataset: Iterable of entries; the position of each entry becomes the
            task index in its ``custom_id``.
        model: Model name passed on to ``build_task``.
        file_name: Destination path handed to ``fh.write``.
        prompt_func: Callable that turns an entry into the prompt text.
    """
    requests = []
    for position, item in enumerate(dataset):
        prompt = prompt_func(item)
        requests.append(build_task(position, model, prompt))
    fh.write(file_name, requests)
    
    
def create_long_prompt(entry):
    """Return the verbose verification prompt built from ``entry['word']`` and ``entry['claim']``."""
    word = entry["word"]
    claim = entry["claim"]
    return f'Please verify the following statement about {word}. Input: {claim} True or False?\nOutput:'

def create_long_translation_prompt(entry):
    """Return the verbose verification prompt built from ``entry['english_word']`` and ``entry['english_claim']``."""
    word = entry["english_word"]
    claim = entry["english_claim"]
    return f'Please verify the following statement about {word}. Input: {claim} True or False?\nOutput:'


def create_short_prompt(entry):
    """Return the compact ``Input: <word>: <claim>`` verification prompt for ``entry``."""
    word = entry["word"]
    claim = entry["claim"]
    return f'Input: {word}: {claim} True or False?\nOutput:'
    
    
def process_results(file_name, dataset, translations: bool):
    """Score one raw batch-output file against its dataset and print reports.

    Every record read from ``file_name`` is matched to a dataset entry through
    the trailing integer of its ``custom_id``. The prediction is taken from
    ``get_openai_prediction``; when that yields ``'UNKOWN'`` the plain-text
    answer of the first choice is passed to ``parse_model_answer`` instead.

    Two classification reports are printed: one over all entries and one
    restricted to entries whose ``in_wiki`` field equals ``'Yes'``.

    Args:
        file_name: Path of the raw output file, read via ``fh.read``.
        dataset: Collection of entries indexable by integer position.
        translations: When true, ``english_word`` / ``english_claim`` of the
            entry are added to each record as ``translated_word`` /
            ``translated_claim``.

    Returns:
        List with one result dict per record, in file order.
    """
    records = []
    gt_all, pr_all = [], []
    gt_wiki, pr_wiki = [], []

    for raw in fh.read(file_name):
        position = int(raw['custom_id'].split('-')[-1])
        entry = dataset[position]

        body = raw['response']['body']
        verdict = get_openai_prediction(body)
        if verdict == 'UNKOWN':
            # 'UNKOWN' (sic) is the literal compared against here; presumably it
            # is the sentinel returned by get_openai_prediction - keep in sync.
            verdict = parse_model_answer(body['choices'][0]['message']['content'])

        record = {
            'id': entry['id'],
            'word': entry['word'],
            'claim': entry['claim'],
            'label': entry['label'],
            'predicted': verdict,
            'in_wiki': entry['in_wiki']
        }
        if translations:
            record.update(
                translated_word=entry['english_word'],
                translated_claim=entry['english_claim'],
            )
        records.append(record)

        # Binarise: SUPPORTED -> 1, anything else -> 0.
        gt = int(record['label'] == 'SUPPORTED')
        pr = int(record['predicted'] == 'SUPPORTED')
        gt_all.append(gt)
        pr_all.append(pr)
        if record['in_wiki'] == 'Yes':
            gt_wiki.append(gt)
            pr_wiki.append(pr)

    # Both reports are computed before anything is printed.
    report_all = classification_report(gt_all, pr_all, zero_division=0, digits=4)
    report_wiki = classification_report(gt_wiki, pr_wiki, zero_division=0, digits=4)
    print('Report all entries:')
    print(report_all)
    print('Filtered Report for entries with evidence in wikipedia:')
    print(report_wiki)
    return records

# 2 Manual Batch Fetching

In [9]:
# Registry of submitted batch jobs: input file name -> batch job object.
batch_jobs = {}

In [73]:
# Create a batch job for one prepared input file (edit the path to pick the file)
# and register it under its input file name so the cells below can poll it and
# fetch its result.
input_file_name = str(EVALUATION_DIR / 'squad-claim_verification/zero_shot_T5SplitRephrase_facts/input/input_zero_shot_T5SplitRephrase_facts-squad-claim_verification-gpt-4o-mini.jsonl')
batch_job = openai_fetcher.create_batch_job(input_file_name)
batch_jobs[input_file_name] = batch_job

In [80]:
# Refresh every registered batch job and print its current state.
for file_name, batch_job in batch_jobs.items():
    batch_job = openai_fetcher.get_batch_update(batch_job)
    # Overwriting the value of an existing key is safe while iterating over items().
    batch_jobs[file_name] = batch_job
    print(file_name)
    print(batch_job)
    print("_______________")

/Users/lukasellinger/PycharmProjects/evaluating_factuality_word_definitions/data/evaluation/german_dpr-claim_verification/input_zero_shot-german_dpr-claim_verification-gpt-3.5-turbo.jsonl
Batch(id='batch_D3U4N39kiLEmx6Sju3dzwsBu', completion_window='24h', created_at=1724058670, endpoint='/v1/chat/completions', input_file_id='file-aKqzmdCd40L5T1YhPoycDR5f', object='batch', status='completed', cancelled_at=None, cancelling_at=None, completed_at=1724059871, error_file_id=None, errors=None, expired_at=None, expires_at=1724145070, failed_at=None, finalizing_at=1724059861, in_progress_at=1724058671, metadata=None, output_file_id='file-GDIP8B7hSC0gg8rGcxQdNVkK', request_counts=BatchRequestCounts(completed=168, failed=0, total=168))
_______________
/Users/lukasellinger/PycharmProjects/evaluating_factuality_word_definitions/data/evaluation/german_dpr-claim_verification/input_zero_shot-german_dpr-claim_verification-gpt-4o.jsonl
Batch(id='batch_II9FiEZAsWqHzF6TjfFkKNKG', completion_window='24h', 

In [82]:
# Fetch the result of every registered batch job into a raw-output file.
for file_name, batch_job in batch_jobs.items():
    # str.replace swaps EVERY occurrence of 'input', i.e. both the directory
    # component and the file-name prefix of the path templates used in this
    # notebook. A path containing 'input' anywhere else would be changed too.
    output_file_name = file_name.replace('input', 'raw_output')
    openai_fetcher.get_batch_result(output_file_name, batch_job)

# 3 Zero Shot

In [83]:
# Path template for the zero-shot experiment. {type} is filled with 'input',
# 'raw_output' or 'output' by the cells below; {dataset} and {model} select the run.
file_base_name = str(EVALUATION_DIR / '{dataset}/zero_shot/{type}/{type}_zero_shot-{dataset}-{model}.jsonl')

In [31]:
# Write one batch input file per (dataset, model) pair using the long prompt.
for dataset_name, config in tqdm(datasets.items()):
    dataset = config['dataset']
    for model in models:
        create_tasks(
            dataset,
            model,
            file_base_name.format(type='input', dataset=dataset_name, model=model),
            create_long_prompt
        )

100%|██████████| 3/3 [00:00<00:00, 13.59it/s]


Once the input files are created, head to Section 2 (Manual Batch Fetching) to submit them and fetch the outputs manually.

In [85]:
# Evaluate every (dataset, model) pair from its raw output file and store the
# per-entry results as the 'output' file.
# NOTE: this relies on the three-argument process_results defined in Section 1.
# The "Zero Shot Single Facts" section redefines process_results with a
# different signature, so run this cell before that definition is executed.
for dataset_name, config in datasets.items():
    dataset = config['dataset']
    print(f"{dataset_name}: _________________________")
    for model in models:
        print(f"Evaluating {dataset_name} with {model}...")
        outputs = process_results(file_base_name.format(type='raw_output', dataset=dataset_name, model=model), dataset, translations=False)
        fh.write(file_base_name.format(type='output', dataset=dataset_name, model=model), outputs)

german_dpr-claim_verification: _________________________
Evaluating german_dpr-claim_verification with gpt-3.5-turbo...
Report all entries:
              precision    recall  f1-score   support

           0     0.9474    0.4286    0.5902        84
           1     0.6308    0.9762    0.7664        84

    accuracy                         0.7024       168
   macro avg     0.7891    0.7024    0.6783       168
weighted avg     0.7891    0.7024    0.6783       168

Filtered Report for entries with evidence in wikipedia:
              precision    recall  f1-score   support

           0     0.9375    0.4348    0.5941        69
           1     0.6355    0.9714    0.7684        70

    accuracy                         0.7050       139
   macro avg     0.7865    0.7031    0.6812       139
weighted avg     0.7854    0.7050    0.6818       139

Evaluating german_dpr-claim_verification with gpt-4o-mini...
Report all entries:
              precision    recall  f1-score   support

           0  

# 4 Ablation Translated Claims

In [86]:
# Path template for the translated-claims ablation. {type} is filled with
# 'input', 'raw_output' or 'output'. Note that this rebinds file_base_name.
file_base_name = str(EVALUATION_DIR / '{dataset}/ablation_translated_claims/{type}/{type}_ablation_translated_claims-{dataset}-{model}.jsonl')

In [38]:
# Write one batch input file per (non-English dataset, model) pair, prompting
# with the English translation of word and claim.
for dataset_name, config in tqdm(datasets.items()):
    # English datasets are skipped in this ablation.
    if config['lang'] == 'en':
        continue

    dataset = config['dataset']    
    for model in models:
        create_tasks(
            dataset,
            model,
            file_base_name.format(type='input', dataset=dataset_name, model=model),
            create_long_translation_prompt
        )

100%|██████████| 3/3 [00:00<00:00, 19.95it/s]


Once the input files are created, head to Section 2 (Manual Batch Fetching) to submit them and fetch the outputs manually.

In [88]:
# Evaluate the translated-claims ablation for every (non-English dataset, model)
# pair and store the per-entry results as the 'output' file.
# NOTE: this relies on the three-argument process_results defined in Section 1;
# run this cell before the "Zero Shot Single Facts" section redefines that name.
for dataset_name, config in datasets.items():
    # English datasets are skipped in this ablation.
    if config['lang'] == 'en':
        continue

    dataset = config['dataset']
    print(f"{dataset_name}: _________________________")
    for model in models:
        print(f"Evaluating {dataset_name} with {model}...")
        # translations=True records the translated word/claim that were actually
        # prompted, so the output file documents what the model was asked.
        outputs = process_results(file_base_name.format(type='raw_output', dataset=dataset_name, model=model), dataset, translations=True)
        fh.write(file_base_name.format(type='output', dataset=dataset_name, model=model), outputs)

german_dpr-claim_verification: _________________________
Evaluating german_dpr-claim_verification with gpt-3.5-turbo...
Report all entries:
              precision    recall  f1-score   support

           0     0.9403    0.7500    0.8344        84
           1     0.7921    0.9524    0.8649        84

    accuracy                         0.8512       168
   macro avg     0.8662    0.8512    0.8497       168
weighted avg     0.8662    0.8512    0.8497       168

Filtered Report for entries with evidence in wikipedia:
              precision    recall  f1-score   support

           0     0.9259    0.7246    0.8130        69
           1     0.7765    0.9429    0.8516        70

    accuracy                         0.8345       139
   macro avg     0.8512    0.8337    0.8323       139
weighted avg     0.8507    0.8345    0.8324       139

Evaluating german_dpr-claim_verification with gpt-4o-mini...
Report all entries:
              precision    recall  f1-score   support

           0  

# 5 Zero Shot Single Facts

In [33]:
# Dataset fields holding a claim split into atomic facts. Each field value is a
# single string whose facts are separated by '--;--' (see the cells below).
claim_split_types = [
    'DisSim_facts',
    'T5SplitRephrase_facts',
    'Factscore_facts'
]

In [89]:
# Path template for the single-facts experiment; adds {split_type} to the
# placeholders used by the earlier templates. Note that this rebinds file_base_name.
file_base_name = str(EVALUATION_DIR / '{dataset}/zero_shot_{split_type}/{type}/{type}_zero_shot_{split_type}-{dataset}-{model}.jsonl')

In [35]:
# Write one batch input file per (dataset, model, split type) with one request
# per atomic fact.
for dataset_name, config in tqdm(datasets.items()):
    dataset = config['dataset']
    for model in models:
        for split_type in claim_split_types:
            tasks = []
            for idx, entry in enumerate(dataset):
                word = entry['word']
                # Atomic facts are stored as one string joined by '--;--'.
                atomic_facts = entry[split_type].split('--;--')
    
                for aidx, atomic_fact in enumerate(atomic_facts):
                    # The custom_id becomes 'task-<entry index>-<atom index>'.
                    tasks.append(build_task(f'{idx}-{aidx}', model, f'Input: {word}: {atomic_fact} True or False?\nOutput:'))
            fh.write(file_base_name.format(type='input', dataset=dataset_name, model=model, split_type=split_type), tasks)

100%|██████████| 3/3 [00:00<00:00,  3.91it/s]


Once the input files are created, head to Section 2 (Manual Batch Fetching) to submit them and fetch the outputs manually.

In [95]:
def evaluate_model(dataset_name, dataset, model, split_type):
    """Evaluate one (dataset, model, split type) run of the single-facts experiment.

    Reads the raw batch output, attaches each atomic-fact prediction to its
    entry, derives one prediction per entry, writes the per-entry results to
    the 'output' file and prints a classification report.

    Args:
        dataset_name: Dataset key used in the file path and log line.
        dataset: Collection of entries indexable by integer position.
        model: Model name used in the file path and log line.
        split_type: Name of the entry field holding the atomic facts.
    """
    print(f"Evaluating {dataset_name} with {model} - {split_type}...")

    def path_for(kind):
        # Fill the module-level path template for the given file type.
        return file_base_name.format(type=kind, dataset=dataset_name, model=model, split_type=split_type)

    raw_results = fh.read(path_for('raw_output'))

    per_entry = init_data_dict(dataset)
    process_results(raw_results, dataset, per_entry, split_type)
    gt_labels, pr_labels = generate_labels(per_entry)

    fh.write(path_for('output'), per_entry.values())
    print(classification_report(gt_labels, pr_labels, zero_division=0, digits=4))

def init_data_dict(dataset):
    """Create the per-entry result skeleton, keyed by entry id.

    Each value copies ``id``, ``word``, ``claim``, ``label`` and ``in_wiki``
    from the entry and starts with ``predicted`` set to ``-1`` and an empty
    ``atoms`` list.
    """
    return {
        row['id']: {
            'id': row['id'],
            'word': row['word'],
            'claim': row['claim'],
            'label': row['label'],
            'predicted': -1,
            'atoms': [],
            'in_wiki': row['in_wiki']
        }
        for row in dataset
    }

def process_results(results, dataset, data_dict, split_type):
    """Attach every atomic-fact prediction in ``results`` to its entry in ``data_dict``.

    NOTE: this definition reuses the name of the whole-claim ``process_results``
    defined earlier in the notebook and has a different signature; once this
    cell has run, the earlier function is no longer reachable under that name.

    Args:
        results: Iterable of raw batch-output records whose ``custom_id`` has
            the form ``task-<entry index>-<atom index>``.
        dataset: Collection of entries indexable by integer position.
        data_dict: Mapping of entry id to result dict; each ``atoms`` list is
            extended in place.
        split_type: Name of the entry field holding the ``'--;--'``-joined
            atomic facts.
    """
    for raw in results:
        id_parts = raw['custom_id'].split('-')
        entry_pos, atom_pos = int(id_parts[1]), int(id_parts[2])

        entry = dataset[entry_pos]
        atom_text = entry[split_type].split('--;--')[atom_pos]

        body = raw['response']['body']
        verdict = get_openai_prediction(body)
        if verdict == 'UNKOWN':
            # 'UNKOWN' (sic) is the literal compared against here; presumably it
            # is the sentinel returned by get_openai_prediction - keep in sync.
            verdict = parse_model_answer(body['choices'][0]['message']['content'])

        data_dict[entry['id']]['atoms'].append({"atom": atom_text, "predicted": verdict})

def generate_labels(data_dict):
    """Derive one prediction per entry from its atomic-fact predictions.

    An entry is predicted ``'SUPPORTED'`` only if it has at least one atom and
    every atom was predicted ``'SUPPORTED'``; otherwise ``'NOT_SUPPORTED'``.
    The result is stored in place under the entry's ``'predicted'`` key.

    Entries without atoms are handled explicitly instead of averaging an empty
    list, which produced NaN together with a NumPy RuntimeWarning; the
    resulting label (``'NOT_SUPPORTED'``) is unchanged.

    Args:
        data_dict: Mapping of entry id to result dict with the keys
            ``'label'`` and ``'atoms'`` (list of dicts with ``'predicted'``).

    Returns:
        Tuple ``(gt_labels, pr_labels)`` of equally long lists of 0/1 ints in
        the iteration order of ``data_dict`` (1 means ``'SUPPORTED'``).
    """
    gt_labels = []
    pr_labels = []

    for entry in data_dict.values():
        atom_supported = [atom['predicted'] == 'SUPPORTED' for atom in entry['atoms']]
        # all([]) is True, so an entry without atoms must be ruled out explicitly.
        is_supported = bool(atom_supported) and all(atom_supported)
        entry['predicted'] = 'SUPPORTED' if is_supported else 'NOT_SUPPORTED'

        gt_labels.append(1 if entry['label'] == 'SUPPORTED' else 0)
        pr_labels.append(1 if is_supported else 0)

    return gt_labels, pr_labels

In [96]:
for dataset_name, config in datasets.items():
    dataset = config['dataset']
    print(f"{dataset_name}: _________________________")
    for model in models:
        for split_type in claim_split_types:
            evaluate_model(dataset_name, dataset, model, split_type)

german_dpr-claim_verification: _________________________
Evaluating german_dpr-claim_verification with gpt-3.5-turbo - DisSim_facts...
              precision    recall  f1-score   support

           0     0.8395    0.8095    0.8242        84
           1     0.8161    0.8452    0.8304        84

    accuracy                         0.8274       168
   macro avg     0.8278    0.8274    0.8273       168
weighted avg     0.8278    0.8274    0.8273       168

Evaluating german_dpr-claim_verification with gpt-3.5-turbo - T5SplitRephrase_facts...
              precision    recall  f1-score   support

           0     0.8987    0.8452    0.8712        84
           1     0.8539    0.9048    0.8786        84

    accuracy                         0.8750       168
   macro avg     0.8763    0.8750    0.8749       168
weighted avg     0.8763    0.8750    0.8749       168

Evaluating german_dpr-claim_verification with gpt-3.5-turbo - Factscore_facts...
              precision    recall  f1-score