# 0 Preparations
Before starting, make sure that you have cloned the repository to your Google Drive.
The next cell mounts your Drive and changes into the repository folder:

In [None]:
from google.colab import drive

# Mount Google Drive at /content/drive inside the Colab runtime.
drive.mount('/content/drive', force_remount=True)
# Folder name of the cloned repository; interpolated into the %cd path below.
repository = 'evaluating_factuality_word_definitions'

# Change the working directory to the repository folder on the mounted Drive
# (presumably so the project-local modules imported further down resolve).
%cd /content/drive/My Drive/{repository}

Next, we install the packages and import the modules needed in this notebook:

In [None]:
%%capture
# Install the pinned dependencies; `~=` is a compatible-release pin, so only
# patch-level updates of the listed versions are accepted.
!pip install datasets~=2.18.0
!pip install openai~=1.35.10

In [None]:
from datasets import load_dataset
from tqdm import tqdm
import numpy as np
from sklearn.metrics import classification_report

from fetchers.openai import OpenAiFetcher
from general_utils.reader import JSONLineReader
from general_utils.utils import parse_model_answer, get_openai_prediction

# 1 Setup: Define Datasets
Now we define the models and datasets we want to evaluate:

In [None]:
# Evaluation datasets keyed by short name. Each value holds the loaded test
# split ('dataset') and a language tag ('lang'); the translated-claims ablation
# uses the tag to skip datasets marked 'en'.
datasets = {
    name: {
        'dataset': load_dataset(f'lukasellinger/{name}', split='test'),
        'lang': lang
    }
    for name, lang in (
        ('german_dpr-claim_verification', 'de'),
        ('german-claim_verification', 'de'),
        ('squad-claim_verification', 'en'),
    )
}

In [None]:
# OpenAI chat models to evaluate; each name is placed in the "model" field of
# the batch tasks and in the generated file names.
models = [
    'gpt-3.5-turbo',
    'gpt-4o-mini',
    'gpt-4o'
]

In [None]:
# Shared helpers for the rest of the notebook: `openai_fetcher` is used to
# create, poll and download batch jobs; `fh` reads and writes the task and
# result files.
openai_fetcher = OpenAiFetcher()
fh = JSONLineReader()

In [None]:
def build_task(idx, model, content):
    """Build one chat-completion request entry for a batch input file.

    Args:
        idx: Identifier appended to ``task-`` to form the task's ``custom_id``.
        model: Model name placed in the request body.
        content: Text of the single user message.

    Returns:
        dict: Request with the keys ``custom_id``, ``method``, ``url`` and
        ``body``; the body uses temperature 0.1 and seed 42.
    """
    user_message = {
        "role": "user",
        "content": content
    }
    request_body = {
        "model": model,
        "temperature": 0.1,
        "messages": [user_message],
        "seed": 42
    }
    return {
        "custom_id": f"task-{idx}",
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": request_body
    }


def create_tasks(dataset, model, file_name, statement_func):
    """Write one True/False task per dataset entry to a batch input file.

    Args:
        dataset: Iterable of entries; the position of an entry becomes its task index.
        model: Model name forwarded to ``build_task``.
        file_name: Target file, written through ``fh.write``.
        statement_func: Callable mapping an entry to the statement to be judged.
    """
    tasks = []
    for idx, entry in enumerate(dataset):
        statement = statement_func(entry)
        prompt = f'Input: {statement} True or False?\nOutput:'
        tasks.append(build_task(idx, model, prompt))
    fh.write(file_name, tasks)
    

def process_results(file_name, dataset, translations: bool):
    """Join raw batch results with their dataset entries and score the predictions.

    Args:
        file_name: File with the raw batch results, read through ``fh.read``.
        dataset: Dataset the tasks were built from; a ``custom_id`` of the form
            ``task-<i>`` is mapped back to ``dataset[i]``.
        translations: When true, ``english_word`` and ``english_claim`` of the
            entry are copied into the output as ``translated_word`` and
            ``translated_claim``.

    Returns:
        tuple: ``(outputs, report)`` where ``outputs`` is a list with one dict
        per result and ``report`` is the ``classification_report`` text
        comparing the dataset labels with the model predictions.
    """
    results = fh.read(file_name)
    outputs, gt_labels, pr_labels = [], [], []

    for res in results:
        task_id = res['custom_id']
        # The index is the last dash-separated part of "task-<index>".
        index = int(task_id.split('-')[-1])
        entry = dataset[index]
        claim = entry['claim']
        word = entry['word']
        # NOTE(review): the helper receives res['body'] while the fallback below
        # reads res['response']['body'] - confirm which shape the result rows have.
        predicted = get_openai_prediction(res['body'])
        # 'UNKOWN' (sic): presumably the sentinel spelling returned by the helper;
        # verify against get_openai_prediction before correcting it.
        if predicted == 'UNKOWN':
            txt_answer = res['response']['body']['choices'][0]['message']['content']
            predicted = parse_model_answer(txt_answer)

        output = {
            'id': entry['id'],
            'word': word,
            'claim': claim,
            'label': entry['label'],
            'predicted': predicted,
            'in_wiki': entry['in_wiki']
        }

        if translations:
            output['translated_word'] = entry['english_word']
            output['translated_claim'] = entry['english_claim']

        outputs.append(output)
        gt_labels.append(1 if entry['label'] == 'SUPPORTED' else 0)
        # Score the model's prediction; the dataset entry itself only carries
        # the ground-truth label.
        pr_labels.append(1 if predicted == 'SUPPORTED' else 0)

    report = classification_report(gt_labels, pr_labels, zero_division=0, digits=4)
    return outputs, report

# 2 Manual Batch Fetching

In [None]:
# Placeholder: set this to the batch input file written by one of the
# task-creation cells in the later sections before running the cell. The name
# should contain 'input', since the download cell derives the result name from it.
input_file_name = ''
batch_job = openai_fetcher.create_batch_job(input_file_name)

In [None]:
# Re-fetch the batch job through the fetcher and print it; re-run this cell to
# poll (presumably until the job reports completion).
batch_job = openai_fetcher.get_batch_update(batch_job)
print(batch_job)

In [None]:
# Derive the result file name by swapping 'input' for 'raw_output', which is the
# type the evaluation cells read. str.replace rewrites every occurrence of
# 'input' in the name, not only the prefix.
output_file_name = input_file_name.replace('input', 'raw_output')
openai_fetcher.get_batch_result(output_file_name, batch_job)

# 3 Zero Shot

In [None]:
# File-name template for this section; {type} is filled with 'input',
# 'raw_output' or 'output' by the cells below.
file_base_name = '{type}_zero_shot-{dataset}-{model}'

In [None]:
# Write one zero-shot batch input file per (dataset, model) combination.
for dataset_name, config in tqdm(datasets.items()):
    dataset = config['dataset']
    for model in models:
        create_tasks(
            dataset=dataset,
            model=model,
            file_name=file_base_name.format(type='input', dataset=dataset_name, model=model),
            # The statement shown to the model is "<word>: <claim>".
            statement_func=lambda entry: f"{entry['word']}: {entry['claim']}",
        )

Once the input files have been created, go back to Section 2 (Manual Batch Fetching) and run it for each input file to fetch the raw outputs manually.

In [None]:
# Score every (dataset, model) combination from its raw batch output and store
# the per-claim records next to it.
for dataset_name, config in tqdm(datasets.items()):
    dataset = config['dataset']
    for model in models:
        print(f"Evaluating {dataset_name} with {model}...")
        outputs, report = process_results(
            file_base_name.format(type='raw_output', dataset=dataset_name, model=model),
            dataset,
            translations=False,
        )
        fh.write(
            file_base_name.format(type='output', dataset=dataset_name, model=model),
            outputs,
        )
        print(report)

# 4 Ablation Translated Claims

In [None]:
# File-name template for the translated-claims ablation.
file_base_name = '{type}_ablation_translated_claims-{dataset}-{model}'
for dataset_name, config in tqdm(datasets.items()):
    # Datasets tagged 'en' are left out of this ablation.
    if config['lang'] != 'en':
        dataset = config['dataset']
        for model in models:
            create_tasks(
                dataset=dataset,
                model=model,
                file_name=file_base_name.format(type='input', dataset=dataset_name, model=model),
                # The statement is built from the English word and claim columns.
                statement_func=lambda entry: f"{entry['english_word']}: {entry['english_claim']}",
            )

Once the input files have been created, go back to Section 2 (Manual Batch Fetching) and run it for each input file to fetch the raw outputs manually.

In [None]:
# Score the translated-claims ablation. Datasets tagged 'en' are skipped here as
# well: the task-creation cell of this section writes no input file for them, so
# there is no raw output to read.
for dataset_name, config in tqdm(datasets.items()):
    if config['lang'] == 'en':
        continue

    dataset = config['dataset']
    for model in models:
        print(f"Evaluating {dataset_name} with {model}...")
        # translations=True stores the English word/claim the prompt was built
        # from alongside the original entry.
        outputs, report = process_results(file_base_name.format(type='raw_output', dataset=dataset_name, model=model), dataset, translations=True)
        fh.write(file_base_name.format(type='output', dataset=dataset_name, model=model), outputs)
        print(report)

# 5 Zero Shot Single Facts

In [None]:
# Dataset columns that hold a claim's atomic facts, one column per splitting
# approach (going by the names); each column stores the facts as a single
# string joined with '--;--'.
claim_split_types = [
    'DisSim_facts',
    'T5SplitRephrase_facts',
    'Factscore_facts'
]

In [None]:
# File-name template for the single-facts runs; {split_type} is one of the
# entries of claim_split_types.
file_base_name = '{type}_zero_shot_{split_type}-{dataset}-{model}'

# Write one batch input file per (dataset, model, split type). Every atomic fact
# becomes its own task with the id "<entry index>-<atom index>".
for dataset_name, config in tqdm(datasets.items()):
    dataset = config['dataset']
    for model in models:
        for split_type in claim_split_types:
            tasks = []
            # Wrap the dataset (not the enumerate object) so tqdm can read its
            # length and show a total.
            for idx, entry in enumerate(tqdm(dataset)):
                word = entry['word']
                atomic_facts = entry[split_type].split('--;--')

                for pidx, atomic_fact in enumerate(atomic_facts):
                    tasks.append(build_task(f'{idx}-{pidx}', model, f'Input: {word}: {atomic_fact} True or False?\nOutput:'))
            fh.write(file_base_name.format(type='input', dataset=dataset_name, model=model, split_type=split_type), tasks)

Once the input files have been created, go back to Section 2 (Manual Batch Fetching) and run it for each input file to fetch the raw outputs manually.

In [None]:
def evaluate_model(dataset_name, dataset, model, split_type):
    """Aggregate atomic-fact predictions for one configuration and print its report.

    Reads the raw batch results, attaches every atom prediction to its claim,
    derives a claim-level prediction, writes the aggregated records to the
    'output' file and prints the classification report.

    Args:
        dataset_name: Dataset key used in the file names.
        dataset: Dataset the tasks were built from.
        model: Model name used in the file names.
        split_type: Dataset column holding the atomic facts.
    """
    print(f"Evaluating {dataset_name} with {model} - {split_type}...")
    # The downloaded batch results carry the 'raw_output' type; 'output' is the
    # aggregated file written at the end of this function.
    results = fh.read(file_base_name.format(type='raw_output', dataset=dataset_name, model=model, split_type=split_type))

    data_dict = initialize_data_dict(dataset)
    process_results(results, dataset, data_dict, split_type)
    gt_labels, pr_labels = generate_labels(data_dict)

    # Pass a real list, as the other evaluation cells do, rather than a dict view.
    fh.write(file_base_name.format(type='output', dataset=dataset_name, model=model, split_type=split_type), list(data_dict.values()))
    print(classification_report(gt_labels, pr_labels, zero_division=0, digits=4))

def initialize_data_dict(dataset):
    """Create the per-claim result records, keyed by entry id.

    Args:
        dataset: Iterable of entries providing ``id``, ``word``, ``claim``,
            ``label`` and ``in_wiki``.

    Returns:
        dict: Maps each entry id to a record that copies those fields and
        starts with ``predicted`` set to -1 and an empty ``atoms`` list.
    """
    return {
        row['id']: {
            'id': row['id'],
            'word': row['word'],
            'claim': row['claim'],
            'label': row['label'],
            'predicted': -1,
            'atoms': [],
            'in_wiki': row['in_wiki']
        }
        for row in dataset
    }

def process_results(results, dataset, data_dict, split_type):
    """Attach every atomic-fact prediction to the record of its claim.

    NOTE(review): this definition reuses the name of the three-argument
    ``process_results`` defined earlier in the notebook and replaces it once
    this cell has run; the earlier evaluation cells then need that definition
    re-run first.

    Args:
        results: Result rows whose ``custom_id`` has the form
            ``task-<entry index>-<atom index>``.
        dataset: Dataset the tasks were built from.
        data_dict: Records keyed by entry id; their ``atoms`` lists are extended in place.
        split_type: Dataset column holding the atomic facts joined by ``'--;--'``.
    """
    for res in results:
        id_parts = res['custom_id'].split('-')
        index = int(id_parts[1])
        atom_index = int(id_parts[2])

        entry = dataset[index]
        atoms = entry[split_type].split('--;--')
        atom = atoms[atom_index]

        # NOTE(review): the helper receives res['body'] while the fallback below
        # reads res['response']['body'] - confirm which shape the result rows have.
        predicted = get_openai_prediction(res['body'])
        # 'UNKOWN' (sic): presumably the sentinel spelling returned by the helper.
        if predicted == 'UNKOWN':
            message = res['response']['body']['choices'][0]['message']
            predicted = parse_model_answer(message['content'])

        record = data_dict[entry['id']]
        record['atoms'].append({"atom": atom, "predicted": predicted})

def generate_labels(data_dict):
    """Derive claim-level predictions from the atoms and build the label lists.

    A record is predicted ``'SUPPORTED'`` only when it has at least one atom and
    every atom was predicted ``'SUPPORTED'``; otherwise it is
    ``'NOT_SUPPORTED'``. The decision is written back to the record's
    ``predicted`` field.

    Args:
        data_dict: Records keyed by entry id, each with ``label`` and ``atoms``.

    Returns:
        tuple: ``(gt_labels, pr_labels)``, two lists of 1 (supported) / 0 in
        the iteration order of ``data_dict``.
    """
    gt_labels = []
    pr_labels = []

    for entry in data_dict.values():
        atom_flags = [decision['predicted'] == 'SUPPORTED' for decision in entry['atoms']]
        # The bool() guard keeps records without atoms at NOT_SUPPORTED
        # (all([]) alone would be True) without averaging an empty list.
        all_supported = bool(atom_flags) and all(atom_flags)
        entry['predicted'] = 'SUPPORTED' if all_supported else 'NOT_SUPPORTED'

        gt_labels.append(1 if entry['label'] == 'SUPPORTED' else 0)
        pr_labels.append(1 if all_supported else 0)

    return gt_labels, pr_labels

In [None]:
# Evaluate every (dataset, model, split type) combination of the single-facts runs.
for dataset_name, config in tqdm(datasets.items()):
    dataset = config['dataset']
    for model, split_type in [(m, s) for m in models for s in claim_split_types]:
        evaluate_model(dataset_name, dataset, model, split_type)