# 0 Preparations
Before starting, ensure that you have cloned the repository to your Google Drive.
We will connect to this:

In [None]:
from google.colab import drive

drive.mount('/content/drive', force_remount=True)
repository = 'evaluating_factuality_word_definitions'

%cd /content/drive/My Drive/{repository}

Next, we install the packages and import the modules needed in this notebook:

In [None]:
%%capture
!pip install datasets~=2.18.0
!pip install openai~=1.35.10

In [1]:
from datasets import load_dataset
from tqdm import tqdm
import numpy as np
from sklearn.metrics import classification_report

from fetchers.openai import OpenAiFetcher
from general_utils.reader import JSONLineReader
from general_utils.utils import parse_model_answer, get_openai_prediction
from config import FACT_EVULATION_OPENAI_TOKEN, PROJECT_DIR

# 1 Setup: Define Datasets
Now we define our models and datasets we want to evaluate:

In [2]:
# Datasets with language information
datasets = {
    'german_dpr-claim_verification': {
        'dataset': load_dataset('lukasellinger/german_dpr-claim_verification', split='test'),
        'lang': 'de'
    },
    #'german-claim_verification': {
    #    'dataset': load_dataset('lukasellinger/german-claim_verification', split='test'),
    #    'lang': 'de'
    #},
    #'squad-claim_verification': {
    #    'dataset': load_dataset('lukasellinger/squad-claim_verification', split='test'),
    #    'lang': 'en'
    #}
}

In [34]:
models = [
    #'gpt-3.5-turbo',
    'gpt-4o-mini',
    #'gpt-4o'
]

In [4]:
openai_fetcher = OpenAiFetcher(api_key=FACT_EVULATION_OPENAI_TOKEN)
fh = JSONLineReader()

In [None]:
EVALUATION_DIR = PROJECT_DIR / 'data/evaluation'

In [21]:
def build_task(idx, model, content):
    return {
            "custom_id": f"task-{idx}",
            "method": "POST",
            "url": "/v1/chat/completions",
            "body": {
                "model": model,
                "temperature": 0.1,
                "messages": [
                    {
                        "role": "user",
                        "content": content
                    }
                ],
                "seed": 42,
                "logprobs": True,
                "top_logprobs": 5
            }
        }


def create_tasks(dataset, model, file_name, prompt_func):
    tasks = [build_task(idx, model, prompt_func(entry)) for idx, entry in enumerate(dataset)]
    fh.write(file_name, tasks)
    
    
def create_long_prompt(entry):
    return f'Please verify the following statement about {entry["word"]}. Input: {entry["claim"]} True or False?\nOutput:'


def create_short_prompt(entry):
    return f'Input: {entry["word"]}: {entry["claim"]} True or False?\nOutput:'
    
    
def process_results(file_name, dataset, translations: bool):
    results = fh.read(file_name)
    outputs, all_gt_labels, all_pr_labels, wiki_gt_labels, wiki_pr_labels = [], [], [], [], []

    for res in results:
        task_id = res['custom_id']
        index = int(task_id.split('-')[-1])
        entry = dataset[index]
        claim = entry['claim']
        word = entry['word']
        predicted = get_openai_prediction(res['response']['body'])
        if predicted == 'UNKOWN':
            txt_answer = res['response']['body']['choices'][0]['message']['content']
            predicted = parse_model_answer(txt_answer)
        
        output = {
            'id': entry['id'],
            'word': word,
            'claim': claim,
            'label': entry['label'],
            'predicted': predicted,
            'in_wiki': entry['in_wiki']
        }
        
        if translations:
            output['translated_word'] = entry['english_word']
            output['translated_claim'] = entry['english_claim']
        
        outputs.append(output)
        gt_label = 1 if output['label'] == 'SUPPORTED' else 0
        pr_label = 1 if output['predicted'] == 'SUPPORTED' else 0
        all_gt_labels.append(gt_label)
        all_pr_labels.append(pr_label)
        if output['in_wiki'] == 'Yes':
            wiki_gt_labels.append(gt_label)
            wiki_pr_labels.append(pr_label)
        
    report = classification_report(all_gt_labels, all_pr_labels, zero_division=0, digits=4)
    wiki_report = classification_report(wiki_gt_labels, wiki_pr_labels, zero_division=0, digits=4)
    print('Report all entries:')
    print(report)
    print('Filtered Report for entries with evidence in wikipedia:')
    print(wiki_report)
    return outputs

# 2 Manual Batch Fetching

In [37]:
input_file_name = ''
batch_job = openai_fetcher.create_batch_job(input_file_name)

In [50]:
batch_job = openai_fetcher.get_batch_update(batch_job)
print(batch_job)

Batch(id='batch_UU1sMQdpHdo2n0zriICWf6zH', completion_window='24h', created_at=1722948274, endpoint='/v1/chat/completions', input_file_id='file-hT6LKEVDf1hJiChFgeE5yQna', object='batch', status='completed', cancelled_at=None, cancelling_at=None, completed_at=1722951636, error_file_id=None, errors=None, expired_at=None, expires_at=1723034674, failed_at=None, finalizing_at=1722951623, in_progress_at=1722948275, metadata=None, output_file_id='file-AZ4mVWMglAxBGakwWJvowL4h', request_counts=BatchRequestCounts(completed=168, failed=0, total=168))


In [51]:
output_file_name = input_file_name.replace('input', 'raw_output')
openai_fetcher.get_batch_result(output_file_name, batch_job)

'raw_output_zero_shot-german_dpr-claim_verification-gpt-4o-mini_2.jsonl'

# 3 Zero Shot

In [35]:
file_base_name = str(EVALUATION_DIR / '{dataset}/{type}_zero_shot-{dataset}-{model}.jsonl')

In [36]:
for dataset_name, config in tqdm(datasets.items()):
    dataset = config['dataset']
    for model in models:
        create_tasks(
            dataset,
            model,
            file_base_name.format(type='input', dataset=dataset_name, model=model),
            create_long_prompt
        )

100%|██████████| 1/1 [00:00<00:00, 26.22it/s]


Once input files are created, head to 2, to manually fetch your outputs

In [52]:
for dataset_name, config in tqdm(datasets.items()):
    dataset = config['dataset']
    for model in models:
        print(f"Evaluating {dataset_name} with {model}...")
        outputs = process_results(file_base_name.format(type='raw_output', dataset=dataset_name, model=model), dataset, translations=False)
        fh.write(file_base_name.format(type='output', dataset=dataset_name, model=model), outputs)

  0%|          | 0/1 [00:00<?, ?it/s]

Evaluating german_dpr-claim_verification with gpt-4o-mini...


100%|██████████| 1/1 [00:00<00:00,  2.87it/s]

Report all entries:
              precision    recall  f1-score   support

           0     0.9508    0.6905    0.8000        84
           1     0.7570    0.9643    0.8482        84

    accuracy                         0.8274       168
   macro avg     0.8539    0.8274    0.8241       168
weighted avg     0.8539    0.8274    0.8241       168

Filtered Report for entries with evidence in wikipedia:
              precision    recall  f1-score   support

           0     0.9375    0.6522    0.7692        69
           1     0.7363    0.9571    0.8323        70

    accuracy                         0.8058       139
   macro avg     0.8369    0.8047    0.8008       139
weighted avg     0.8362    0.8058    0.8010       139






# 4 Ablation Translated Claims

In [14]:
file_base_name = str(EVALUATION_DIR / '{dataset}/{type}_ablation_translated_claims-{dataset}-{model}.jsonl')
for dataset_name, config in tqdm(datasets.items()):
    if config['lang'] == 'en':
        continue

    dataset = config['dataset']    
    for model in models:
        create_tasks(
            dataset,
            model,
            file_base_name.format(type='input', dataset=dataset_name, model=model),
            create_long_prompt
        )

100%|██████████| 1/1 [00:00<00:00, 26.63it/s]


Once input files are created, head to 2, to manually fetch your outputs

In [20]:
for dataset_name, config in tqdm(datasets.items()):
    dataset = config['dataset']
    for model in models:
        print(f"Evaluating {dataset_name} with {model}...")
        outputs = process_results(file_base_name.format(type='raw_output', dataset=dataset_name, model=model), dataset, translations=False)
        fh.write(file_base_name.format(type='output', dataset=dataset_name, model=model), outputs)

100%|██████████| 1/1 [00:00<00:00, 13.82it/s]

Evaluating german_dpr-claim_verification with gpt-3.5-turbo...
Report all entries:
              precision    recall  f1-score   support

           0     0.9394    0.3690    0.5299        84
           1     0.6074    0.9762    0.7489        84

    accuracy                         0.6726       168
   macro avg     0.7734    0.6726    0.6394       168
weighted avg     0.7734    0.6726    0.6394       168

Filtered Report for entries with evidence in wikipedia:
              precision    recall  f1-score   support

           0     0.9231    0.3478    0.5053        69
           1     0.6018    0.9714    0.7432        70

    accuracy                         0.6619       139
   macro avg     0.7624    0.6596    0.6242       139
weighted avg     0.7613    0.6619    0.6251       139






# 5 Zero Shot Single Facts

In [None]:
claim_split_types = [
    'DisSim_facts',
    'T5SplitRephrase_facts',
    'Factscore_facts'
]

In [None]:
file_base_name = str(EVALUATION_DIR / '{model}/{type}_zero_shot_{split_type}-{dataset}-{model}.jsonl')

for dataset_name, config in tqdm(datasets.items()):
    dataset = config['dataset']
    for model in models:
        for split_type in claim_split_types:
            tasks = []
            for idx, entry in tqdm(enumerate(dataset)):
                word = entry['word']
                atomic_facts = entry[split_type].split('--;--')
    
                for pidx, atomic_fact in enumerate(atomic_facts):
                    tasks.append(build_task(f'{idx}-{pidx}', model, f'Input: {word}: {atomic_fact} True or False?\nOutput:'))
            fh.write(file_base_name.format(type='input', dataset=dataset_name, model=model, split_type=split_type), tasks)

Once input files are created, head to 2, to manually fetch your outputs

In [None]:
def evaluate_model(dataset_name, dataset, model, split_type):
    print(f"Evaluating {dataset_name} with {model} - {split_type}...")
    results = fh.read(file_base_name.format(type='output', dataset=dataset_name, model=model, split_type=split_type))
    
    data_dict = initialize_data_dict(dataset)
    process_results(results, dataset, data_dict, split_type)
    gt_labels, pr_labels = generate_labels(data_dict)
    
    fh.write(file_base_name.format(type='output', dataset=dataset_name, model=model, split_type=split_type), data_dict.values())
    print(classification_report(gt_labels, pr_labels, zero_division=0, digits=4))

def initialize_data_dict(dataset):
    data_dict = {}
    for entry in dataset:
        data_dict[entry['id']] = {
            'id': entry['id'],
            'word': entry['word'],
            'claim': entry['claim'],
            'label': entry['label'],
            'predicted': -1,
            'atoms': [],
            'in_wiki': entry['in_wiki']
        }
    return data_dict

def process_results(results, dataset, data_dict, split_type):
    for res in results:
        task_id = res['custom_id']
        index = int(task_id.split('-')[1])
        atom_index = int(task_id.split('-')[2])
        
        entry = dataset[index]
        atom = entry[split_type].split('--;--')[atom_index]
        
        predicted = get_openai_prediction(res['body'])
        if predicted == 'UNKOWN':
            txt_answer = res['response']['body']['choices'][0]['message']['content']
            predicted = parse_model_answer(txt_answer)
        
        data_dict[entry['id']]['atoms'].append({"atom": atom, "predicted": predicted})

def generate_labels(data_dict):
    gt_labels = []
    pr_labels = []
    
    for entry_id, entry in data_dict.items():
        all_predictions = [decision['predicted'] == 'SUPPORTED' for decision in entry['atoms']]
        average_is_supported = np.mean(all_predictions)
        data_dict[entry_id]['predicted'] = 'SUPPORTED' if average_is_supported == 1 else 'NOT_SUPPORTED'
        
        gt_labels.append(1 if entry['label'] == 'SUPPORTED' else 0)
        pr_labels.append(1 if entry['predicted'] == 'SUPPORTED' else 0)
    
    return gt_labels, pr_labels

In [None]:
for dataset_name, config in tqdm(datasets.items()):
    dataset = config['dataset']
    for model in models:
        for split_type in claim_split_types:
            evaluate_model(dataset_name, dataset, model, split_type)