# 0 Preparations
Before starting, ensure that you have cloned the repository to your Google Drive.
The next cell mounts your Drive and changes into the repository folder:

In [None]:
from google.colab import drive

# Mount Google Drive into the Colab VM (force_remount=True requests a fresh mount).
drive.mount('/content/drive', force_remount=True)
# Folder name of the cloned repository inside "My Drive".
repository = 'evaluating_factuality_word_definitions'

# Make the repository the working directory; presumably this is what lets the
# project-local imports below (config, general_utils, fetchers, ...) resolve.
%cd /content/drive/My Drive/{repository}

Next, we install the packages and import the modules needed in this notebook:

In [None]:
%%capture
# %%capture silences pip's output; versions are pinned with the compatible-release operator (~=).
!pip install datasets~=2.18.0
!pip install openai~=1.35.10

In [1]:
import string
from datasets import load_dataset
from general_utils.reader import JSONLineReader
from config import PROJECT_DIR
from fetchers.wikipedia import Wikipedia
import numpy as np
from rank_bm25 import BM25Okapi
from general_utils.utils import parse_model_answer
from collections import defaultdict
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM
from fetchers.openai import OpenAiFetcher
from general_utils.atomic_facts import FactScoreFactGenerator
from tqdm import tqdm
from config import FACT_EVULATION_OPENAI_TOKEN
from sentence_transformers import SentenceTransformer
from datasets import DatasetDict
from config import HF_WRITE_TOKEN
from dataset.def_dataset import Fact
from general_utils.utils import print_classification_report
from sklearn.metrics import classification_report

# 1 Setup: Define Datasets
Now we define the datasets we want to evaluate, together with the language of each dataset:

In [33]:
# Datasets with language information
# Each dataset is loaded from the `lukasellinger/` namespace on the Hugging Face hub
# (test split only) and stored together with its language code.
datasets = {
    name: {
        'dataset': load_dataset(f'lukasellinger/{name}', split='test'),
        'lang': lang,
    }
    for name, lang in (
        ('german_dpr-claim_verification', 'de'),
        ('german_wiktionary-claim_verification-mini', 'de'),
        ('squad-claim_verification', 'en'),
        ('shroom-claim_verification', 'en'),
        # optional (contains 10k entries)
        # ('german_wiktionary-claim_verification-large', 'de'),
        # outdated
        # ('german-claim_verification', 'de'),
    )
}

Downloading readme:   0%|          | 0.00/810 [00:00<?, ?B/s]

Downloading data: 100%|██████████| 232k/232k [00:00<00:00, 242kB/s]


Generating test split:   0%|          | 0/563 [00:00<?, ? examples/s]

In [3]:
# OpenAI client wrapper; used below for the completion requests and the batch jobs.
openai_fetcher = OpenAiFetcher(api_key=FACT_EVULATION_OPENAI_TOKEN)
# Helper used below to read and write the .jsonl files.
fh = JSONLineReader()
# Hub dataset used as offline Wikipedia source (the name suggests a dump from 2024-09-27).
offline_wiki = 'lukasellinger/wiki_dump_2024-09-27'
wiki = Wikipedia(use_dataset=offline_wiki)

Downloading readme:   0%|          | 0.00/437 [00:00<?, ?B/s]

Downloading data: 100%|██████████| 166M/166M [00:28<00:00, 5.75MB/s] 


Generating train split:   0%|          | 0/35566 [00:00<?, ? examples/s]



In [4]:
# Root directory of the evaluation files; `file_base_name` below is built on top of it.
EVALUATION_DIR = PROJECT_DIR / 'data/evaluation'

# 2 Create Factscore Facts
Batched requests are not supported with gpt-3.5-turbo-instruct, so the facts are generated with one synchronous request per claim.

In [5]:
# Builds the prompts that split a claim into atomic facts, using the demos in factscore/demos.
af_prompt_generator = FactScoreFactGenerator(PROJECT_DIR.joinpath('factscore/demos'), is_bio=False)
# Completion settings used by the fact-generation loop in the next cell.
model = "gpt-3.5-turbo-instruct"
temperature = 0.7
max_tokens = 512

In [6]:
# Generate the atomic facts for every claim, attach them to the dataset as the
# 'Factscore_facts' column and push the extended dataset to the hub.
for dataset_name, config in datasets.items():
    dataset = config['dataset']

    # One string per entry: the entry's atomic facts joined with '--;--'.
    fact_column = []
    for entry in tqdm(dataset):
        claim = entry['connected_claim']
        prompt = af_prompt_generator.get_prompt_for_sentence(claim)
        response = openai_fetcher.client.completions.create(
            model=model,
            temperature=temperature,
            max_tokens=max_tokens,
            prompt=prompt,
            seed=42
        )
        generated_answer = response.choices[0].text
        facts = af_prompt_generator.get_facts_from_response(claim, generated_answer)
        fact_column.append('--;--'.join(facts))

    dataset = dataset.add_column('Factscore_facts', fact_column)
    # add_column returns a new dataset object. Store it back so that the later
    # cells, which read config['dataset'], see the 'Factscore_facts' column
    # without having to re-run the dataset-loading cell after the push.
    config['dataset'] = dataset
    data_dict = DatasetDict()
    data_dict['test'] = dataset
    data_dict.push_to_hub(dataset_name, token=HF_WRITE_TOKEN)

100%|██████████| 563/563 [06:53<00:00,  1.36it/s]


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

# 3 Calc OpenAi Prediction

In [7]:
# Path template: `dataset` is the dataset name, `type` is one of 'input', 'raw_output' or 'output'.
file_base_name = str(EVALUATION_DIR / '{dataset}/factscore/{type}/{type}_factscore-{dataset}.jsonl')

## 3.1 Helper functions

In [8]:
def get_bm25_passages_indices(query, passages, k):
    """Rank passage texts against ``query`` with BM25 and return the best ``k`` indices.

    Args:
        query: Query string; tokenized by whitespace.
        passages: Non-empty list of passage strings. ``<s>`` / ``</s>`` markers
            are removed before whitespace tokenization.
        k: Maximum number of indices to return.

    Returns:
        Array of indices into ``passages``, highest BM25 score first.

    Raises:
        AssertionError: If ``passages`` is empty.
    """
    assert len(passages) > 0, f'passages are empty for {query}'

    tokenized_corpus = []
    for passage in passages:
        cleaned = passage.replace("<s>", "").replace("</s>", "")
        tokenized_corpus.append(cleaned.split())

    ranker = BM25Okapi(tokenized_corpus)
    relevance = ranker.get_scores(query.split())
    # Negating the scores turns the ascending argsort into a descending ranking.
    best_first = np.argsort(-relevance)
    return best_first[:k]


def get_gtr_passages_indices(retrieval_query, passages, k, encoder, batch_size=8):
    """Rank passages against a query by embedding inner product and return the best ``k`` indices.

    Args:
        retrieval_query: Query string to embed.
        passages: List of dicts with ``"title"`` and ``"text"`` keys. Each passage
            is embedded as ``title + " " + text`` with ``<s>`` / ``</s>`` removed
            from the text.
        k: Maximum number of indices to return.
        encoder: Object with an ``encode(texts, batch_size=..., device=...)``
            method and a ``device`` attribute.
        batch_size: Batch size forwarded to ``encoder.encode``.

    Returns:
        Array of indices into ``passages``, highest inner product first.
    """
    encode_kwargs = {'batch_size': batch_size, 'device': encoder.device}

    documents = []
    for passage in passages:
        body = passage["text"].replace("<s>", "").replace("</s>", "")
        documents.append(passage["title"] + " " + body)

    # Passages are encoded before the query, as in the original call order.
    document_embeddings = encoder.encode(documents, **encode_kwargs)
    query_embedding = encoder.encode([retrieval_query], **encode_kwargs)[0]

    similarity = np.inner(query_embedding, document_embeddings)
    return np.argsort(-similarity)[:k]


def get_passages(topic, fallback_topic, question, search_word, k=5, only_intro=True, word_lang='de', retrieval='bm25', encoder=None):
    """Fetch the wiki passages for a topic and keep the ``k`` most relevant to ``question``.

    Args:
        topic, fallback_topic, search_word, word_lang, only_intro: Forwarded to
            ``wiki.get_pages`` (always called with ``split_level='passage'``).
        question: Text the passages are ranked against.
        k: Maximum number of passages to keep.
        retrieval: ``'bm25'`` or ``'gtr'``.
        encoder: Embedding model, only used for ``retrieval='gtr'``.

    Returns:
        ``{'word': <word returned by the wiki>, 'passages': [{'title', 'text'}, ...]}``
        with the passages in ranked order, or ``{}`` if the wiki returned no texts.

    Raises:
        AssertionError: If ``retrieval`` is not a supported method.
    """
    assert retrieval in ('bm25', 'gtr'), 'retrieval method not supported'
    pages, wiki_word = wiki.get_pages(topic, fallback_topic, word_lang, only_intro=only_intro,
                                      split_level='passage', search_word=search_word)
    if not pages:
        return {}

    candidates = []
    for page in pages:
        # Keep only the part of the title in front of a '(wik...' suffix.
        title = str(page[0]).split('(wik')[0]
        candidates.append({'title': title, 'text': page[1]})

    if retrieval == 'bm25':
        corpus = [candidate.get('text') for candidate in candidates]
        top_indices = get_bm25_passages_indices(question, corpus, k)
    else:
        top_indices = get_gtr_passages_indices(question, candidates, k, encoder)

    return {'word': wiki_word, 'passages': [candidates[i] for i in top_indices]}


def build_prompts(topic, fallback_topic, search_word, atomic_facts, retrieval='gtr', encoder=None):
    """Build one True/False verification prompt per atomic fact that has retrievable context.

    Args:
        topic, fallback_topic, search_word: Forwarded to ``get_passages``.
        atomic_facts: Iterable of fact strings; each is stripped before use.
        retrieval: Retrieval method forwarded to ``get_passages``.
        encoder: Embedding model forwarded to ``get_passages``.

    Returns:
        ``(prompts, not_in_context)``: the prompts for the facts that had
        context, and the number of facts skipped because ``get_passages``
        returned no word or no passages.
    """
    prompts = []
    not_in_context = 0
    for raw_atom in atomic_facts:
        fact = raw_atom.strip()
        retrieved = get_passages(topic, fallback_topic, fact, search_word, k=5,
                                 retrieval=retrieval, encoder=encoder)
        word, passages = retrieved.get('word'), retrieved.get('passages')
        if not word or not passages:
            not_in_context += 1
            continue

        # Passages are written in reverse of the order get_passages returned them.
        context = "".join(
            "Title: {}\nText: {}\n\n".format(psg["title"], psg["text"].replace("<s>", "").replace("</s>", ""))
            for psg in reversed(passages)
        )
        definition = "Answer the question about {} based on the given context.\n\n".format(word) + context.strip()
        # Terminate the context with a period unless it already ends in punctuation.
        if definition[-1] not in string.punctuation:
            definition += "."
        prompts.append("{}\n\nInput: {} True or False?\nOutput:".format(definition.strip(), fact.strip()))
    return prompts, not_in_context


def create_task(custom_id, prompt, model="gpt-3.5-turbo", temperature=0.7, max_tokens=2048):
    """Return one batch request line: a POST to the chat-completions endpoint with ``prompt`` as the user message.

    Args:
        custom_id: Identifier stored in the task.
        prompt: Content of the single user message.
        model: Model name placed in the request body.
        temperature: Sampling temperature placed in the request body.
        max_tokens: Token limit placed in the request body.

    Returns:
        Dict with the keys ``custom_id``, ``method``, ``url`` and ``body``.
    """
    request_body = {
        "model": model,
        "temperature": temperature,
        "max_tokens": max_tokens,
        "messages": [{"role": "user", "content": prompt}],
    }
    return dict(custom_id=custom_id, method="POST", url="/v1/chat/completions", body=request_body)


def process_dataset(dataset_name, config, encoder):
    """Create the batch tasks (one per verification prompt) for a whole dataset.

    Args:
        dataset_name: Name used only in the printed summary line.
        config: Dict whose ``'dataset'`` value is an iterable of entries. Each
            entry needs ``'word'`` and ``'Factscore_facts'`` (facts joined by
            ``'--;--'``); ``'english_word'`` and ``'document_search_word'`` are
            optional.
        encoder: Embedding model forwarded to ``build_prompts``.

    Returns:
        List of task dicts with ids of the form ``task-<entry index>-<prompt index>``.
    """
    dataset = config['dataset']
    tasks = []
    total_not_in_context = 0

    # Wrap the dataset itself rather than the enumerate() iterator: tqdm can then
    # read its length and show a total and ETA instead of a bare counter.
    for idx, entry in enumerate(tqdm(dataset)):
        topic = entry['word']
        atomic_facts = entry['Factscore_facts'].split('--;--')
        prompts, not_in_context = build_prompts(topic, entry.get('english_word', topic), entry.get('document_search_word'), atomic_facts, encoder=encoder)

        # Counts entries (not atoms) that had at least one fact without context.
        if not_in_context > 0:
            total_not_in_context += 1

        # NOTE(review): pidx indexes the *returned* prompts. The aggregation cell
        # later uses it as an index into the entry's atomic facts; both agree only
        # while build_prompts skips either none or all facts of an entry.
        for pidx, prompt in enumerate(prompts):
            tasks.append(create_task(f"task-{idx}-{pidx}", prompt))
    print(f'Not in context for {dataset_name}: {total_not_in_context}')
    return tasks

## 3.2 Get Openai Outputs

In [9]:
# GTR sentence encoder used for passage retrieval: moved to the GPU when one is
# available, otherwise kept on the CPU, and switched to eval mode.
encoder = (
    SentenceTransformer("sentence-transformers/gtr-t5-large")
    .to("cuda" if torch.cuda.is_available() else "cpu")
    .eval()
)



In [14]:
# Build the verification tasks of every dataset and write them to that dataset's
# 'input' .jsonl file (the file later handed to the batch job).
for dataset_name, config in datasets.items():
    tasks = process_dataset(dataset_name, config, encoder)
    fh.write(file_base_name.format(dataset=dataset_name, type='input'), tasks)

114it [14:01, 20.81s/it]Token indices sequence length is longer than the specified maximum sequence length for this model (631 > 512). Running this sequence through the model will result in indexing errors
563it [59:54,  6.38s/it]

Not in context for shroom-claim_verification: 21





Now manually initialize the batch processing for each created input file:

In [15]:
# Maps batch input file path -> most recently fetched batch job object.
batch_jobs = {}

In [16]:
# Submits a batch job for a single, hard-coded input file (here: shroom). Per the
# note above, this cell is meant to be re-run manually with the path of each input file.
input_file_name = str(EVALUATION_DIR / 'shroom-claim_verification/factscore/input/input_factscore-shroom-claim_verification.jsonl')
batch_job = openai_fetcher.create_batch_job(input_file_name)
# Remember the job under its input path so the following cells can poll and download it.
batch_jobs[input_file_name] = batch_job

In [25]:
# Refresh every submitted batch job and print its current state; re-run this
# cell until the printed status is 'completed'.
for file_name, batch_job in batch_jobs.items():
    batch_job = openai_fetcher.get_batch_update(batch_job)
    # Overwriting the value of an existing key is safe while iterating over the dict.
    batch_jobs[file_name] = batch_job
    print(file_name)
    print(batch_job)
    print("_______________")

/Users/lukasellinger/PycharmProjects/evaluating_factuality_word_definitions/data/evaluation/shroom-claim_verification/factscore/input/input_factscore-shroom-claim_verification.jsonl
Batch(id='batch_66f99a26eb4081908f7bcbd2c09fe5e0', completion_window='24h', created_at=1727633959, endpoint='/v1/chat/completions', input_file_id='file-VxPPbGWg5QJb5o7SdKVQ3T7o', object='batch', status='completed', cancelled_at=None, cancelling_at=None, completed_at=1727634564, error_file_id=None, errors=None, expired_at=None, expires_at=1727720359, failed_at=None, finalizing_at=1727634433, in_progress_at=1727634021, metadata=None, output_file_id='file-TQTq8n26pgaPT97Gftm6lGgs', request_counts=BatchRequestCounts(completed=1478, failed=0, total=1478))
_______________


In [27]:
# Download the result of every batch job next to its input file.
for file_name, batch_job in batch_jobs.items():
    # str.replace swaps *every* 'input' in the path, i.e. both the directory and
    # the file-name prefix, which yields the 'raw_output' variant of file_base_name.
    # NOTE(review): an 'input' substring elsewhere in the path (e.g. in the
    # project directory) would be rewritten as well.
    output_file_name = file_name.replace('input', 'raw_output')
    openai_fetcher.get_batch_result(output_file_name, batch_job)

# 4 Calc Factscore

In [28]:
# Same path template as in section 3: `dataset` is the dataset name and `type`
# is one of 'input', 'raw_output' or 'output'.
file_base_name = str(EVALUATION_DIR / '{dataset}/factscore/{type}/{type}_factscore-{dataset}.jsonl')

## 4.1 Helper functions

In [29]:
def softmax(x):
    """Return the softmax of ``x``, normalized over all of its elements.

    The maximum is subtracted before exponentiating so that large inputs do
    not overflow.
    """
    shifted_exp = np.exp(x - np.max(x))
    return shifted_exp / shifted_exp.sum()

class NPM:
    """Wrapper around a ``facebook/npm*`` masked-LM checkpoint used to score atomic facts.

    ``get_probabilty`` masks the tokens of a fact one at a time and measures how
    strongly the masked position matches occurrences of the same token in
    retrieved wiki passages.
    """

    def __init__(self, model_name):
        """Load the tokenizer and the stopword ids; the model is loaded later by ``load_model``.

        Args:
            model_name: Checkpoint name below ``facebook/``; must start with ``"npm"``.
        """
        assert model_name.startswith("npm")
        self.model_name = model_name
        # Stays None until the first ``encode`` call triggers ``load_model``.
        self.model = None

        self.tokenizer = AutoTokenizer.from_pretrained("facebook/" + self.model_name)
        self.mask_id = self.tokenizer.mask_token_id
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

        # One integer token id per line; these ids are never masked in ``get_probabilty``.
        with open(PROJECT_DIR.joinpath("factscore/roberta_stopwords.txt"), "r") as f:
            self.stopwords = set()
            for line in f:
                self.stopwords.add(int(line.strip()))

    def load_model(self):
        """Load the masked-LM weights, move them to ``self.device`` and switch to eval mode."""
        self.model = AutoModelForMaskedLM.from_pretrained("facebook/" + self.model_name)
        self.model.to(self.device)
        self.model.eval()

    def tokenize(self, texts, skip_special_tokens=False, padding=True):
        """Tokenize a list of texts.

        Args:
            texts: List of strings.
            skip_special_tokens: If True, drop the first and last id of every
                sequence (asserted to be 0 and 2).
            padding: If False, return the plain list of id lists.

        Returns:
            With ``padding=False`` a list of id lists; otherwise a pair of
            LongTensors ``(input_ids, attention_mask)`` right-padded to the
            longest sequence.
        """
        assert type(texts)==list
        all_input_ids = self.tokenizer(texts)["input_ids"]
        if skip_special_tokens:
            for i, input_ids in enumerate(all_input_ids):
                assert input_ids[0]==0 and input_ids[-1]==2
                all_input_ids[i] = input_ids[1:-1]
        if not padding:
            return all_input_ids
        max_length = np.max([len(_ids) for _ids in all_input_ids])
        _all_input_ids = []
        _all_attention_mask = []
        for i, input_ids in enumerate(all_input_ids):
            n_valid = len(input_ids)
            n_masks = max_length - n_valid
            # Padding uses id 0 (masked out via the attention mask); ``encode``
            # later drops ids 0 and 2, which also removes these padded positions.
            _all_input_ids.append(input_ids + [0 for _ in range(n_masks)])
            _all_attention_mask.append([1 for _ in range(n_valid)] + [0 for _ in range(n_masks)])
        return torch.LongTensor(_all_input_ids), torch.LongTensor(_all_attention_mask)

    def decode(self, input_ids):
        """Turn a list of token ids back into text with the tokenizer."""
        return self.tokenizer.decode(input_ids)

    def encode(self, texts, skip_special_tokens=False, gt_input_ids=None):
        """Run the model on ``texts`` and return one result tuple per text.

        Args:
            texts: List of strings.
            skip_special_tokens: Forwarded to ``tokenize``.
            gt_input_ids: Original token id for every text that contains a mask
                token; must have the same length as ``texts`` when given.

        Returns:
            A list with one tuple per text:
            * text contains the mask token: ``(probability of the ground-truth
              id at the first mask position, last-layer hidden state at that
              position)``;
            * otherwise: ``(token ids, last-layer hidden states)`` with the
              positions whose id is 0 or 2 removed.
        """
        assert type(texts)==list
        if self.model is None:
            self.load_model()
        if gt_input_ids is not None:
            assert len(texts)==len(gt_input_ids)
        all_input_ids, all_attention_mask = self.tokenize(texts, skip_special_tokens=skip_special_tokens)

        with torch.no_grad():
            outputs = self.model(all_input_ids.to(self.device),
                                 all_attention_mask.to(self.device),
                                 output_hidden_states=True,
                                 return_dict=True)
            all_logits = outputs["logits"].detach().cpu().numpy()
            # [-1] selects the hidden states of the last layer.
            all_hidden_states = outputs["hidden_states"][-1].detach().cpu().numpy()

        results = []
        for i, (text, input_ids, logits, hidden_states) in enumerate(zip(texts, all_input_ids, all_logits, all_hidden_states)):
            input_ids = input_ids.numpy().tolist()
            if self.mask_id in input_ids:
                # Only the first mask position of a text is evaluated.
                idx = input_ids.index(self.mask_id)
                assert gt_input_ids is not None
                prob = softmax(logits[idx])[gt_input_ids[i]]
                results.append((prob, hidden_states[idx]))
            else:
                _input_ids = [_id for _id in input_ids if _id not in [0, 2]]
                _hidden_states = [h for _id, h in zip(input_ids, hidden_states) if _id not in [0, 2]]
                results.append((_input_ids, _hidden_states))

        return results

    def get_probabilty(self, topic, fallback_topic, question, search_word):
        """Score how well the tokens of ``question`` are backed by the retrieved passages.

        Args:
            topic, fallback_topic, search_word: Forwarded to ``get_passages``.
            question: The fact to score; also used as retrieval query.

        Returns:
            Mean over all masked (non-stopword) tokens of the share of the
            passage-token scores that belongs to passage tokens with the same id
            (0 for tokens that do not occur in the passages).
        """
        # NOTE(review): word_lang is hard-coded to 'de' and the retrieval method is
        # left at get_passages' default ('bm25'), independent of the dataset's
        # language or of the method used to build the prompts - confirm this is intended.
        retrieved = get_passages(topic, fallback_topic, question, search_word, k=3, only_intro=True, word_lang='de')
        # NOTE(review): get_passages returns {} when nothing is found; .get('passages')
        # is then None and this comprehension raises a TypeError.
        passages = [p["text"].strip() for p in retrieved.get('passages')]

        encoded = self.encode(passages, skip_special_tokens=True)
        # Flatten all passages into one token list and one matrix of token vectors.
        stacked_passage_tokens, stacked_passage_vectors = [], []
        for input_ids, vectors in encoded:
            stacked_passage_tokens += input_ids
            if len(vectors)>0:
                stacked_passage_vectors.append(vectors)
        stacked_passage_vectors = np.concatenate(stacked_passage_vectors, 0)

        question_input_ids = self.tokenize(["Fact: " + question], skip_special_tokens=False, padding=False)[0]
        # Cut at the first id 2 (if present) and drop the leading special token.
        if 2 in question_input_ids:
            question_input_ids = question_input_ids[:question_input_ids.index(2)]
        question_input_ids = question_input_ids[1:]

        triples = []
        batch = []
        gt_input_ids = []
        prefix = True
        for i, input_id in enumerate(question_input_ids):
            if prefix:
                # Skip everything up to and including token id 35, which closes the
                # "Fact:" prefix (presumably the id of ':' - confirm against the tokenizer).
                if input_id==35: # the end of prefix
                    prefix = False
                continue
            if input_id in [0, 2] or input_id in self.stopwords:
                continue
            # One copy of the question per remaining token, with that token replaced by the mask.
            batch.append(self.decode(question_input_ids[:i] + [self.mask_id] + question_input_ids[i+1:]))
            gt_input_ids.append(input_id)
        for (prob, vector), gt_input_id in zip(self.encode(batch, gt_input_ids=gt_input_ids), gt_input_ids):
            triples.append((prob, vector, gt_input_id))

        stacked_question_vectors = np.stack([v for _, v, _ in triples], 0)
        # exp(inner product / sqrt(vector dimension)) between each masked position and each passage token.
        all_scores = np.exp(np.inner(stacked_question_vectors, stacked_passage_vectors) / np.sqrt(stacked_passage_vectors.shape[-1]))

        probs = []
        for (softmax_prob, vector, input_id), scores in zip(triples, all_scores):
            assert len(stacked_passage_tokens)==len(scores)
            if input_id not in stacked_passage_tokens:
                probs.append(0)
            else:
                # Sum the scores per passage token id, then normalize by the total.
                aggregated_scores = defaultdict(list)
                for token, score in zip(stacked_passage_tokens, scores):
                    aggregated_scores[token].append(score)
                tot = np.sum([np.sum(v) for v in aggregated_scores.values()])
                prob = np.sum(aggregated_scores[input_id]) / tot
                probs.append(prob)
        return np.mean(probs)

In [30]:
# Shared NPM scorer used by calc_factscore; its model weights are only loaded on the first encode() call.
npm = NPM('npm-single')

def calc_factscore(topic, fallback_topic, search_word, generated_answer, atom, use_npm=True, npm_threshold=0.3):
    """Decide whether an atomic fact is supported, optionally double-checked with NPM.

    Args:
        topic, fallback_topic, search_word: Forwarded to ``npm.get_probabilty``.
        generated_answer: Model answer, interpreted by ``parse_model_answer``.
        atom: The atomic fact that was verified.
        use_npm: If True, an answer parsed as ``'SUPPORTED'`` must also pass the
            NPM probability check.
        npm_threshold: The NPM probability has to be strictly greater than this
            value for the fact to stay ``'SUPPORTED'`` (default 0.3, the value
            that was previously hard-coded).

    Returns:
        The label from ``parse_model_answer``, downgraded to ``'NOT_SUPPORTED'``
        when the NPM check fails.
    """
    is_supported = parse_model_answer(generated_answer)
    if is_supported == 'SUPPORTED' and use_npm:
        npprob = npm.get_probabilty(topic, fallback_topic, atom, search_word)
        # Written as "not >" so that a NaN probability also counts as not supported.
        if not npprob > npm_threshold:
            is_supported = 'NOT_SUPPORTED'
    return is_supported

## 4.2 Get Factscore Output

In [31]:
# Turn the raw batch outputs into one prediction per claim and write them to the 'output' file.
for dataset_name, config in datasets.items():
    dataset = config['dataset']
    results = fh.read(file_base_name.format(dataset=dataset_name, type='raw_output'))
    data_dict = {}

    # Filter out entries not in the wiki and prepare the data_dict
    for entry in dataset:
        if entry['in_wiki'] == 'No':
            continue

        data_dict[entry['id']] = {
            'id': entry['id'],
            'word': entry['word'],
            'claim': entry['claim'],
            'label': entry['label'],
            'predicted': -1,
            'atoms': [],
            'selected_evidences': []
        }

    # Process each result and update data_dict
    for res in tqdm(results):
        # custom_id has the form "task-<entry index>-<prompt index>".
        task_id = res['custom_id']
        index, atom_index = map(int, task_id.split('-')[1:3])

        entry = dataset[index]
        if entry['id'] not in data_dict:
            # The entry was filtered out above (in_wiki == 'No'). Skip its results
            # instead of failing with a KeyError, and before the costly NPM check.
            continue

        generated_answer = res['response']['body']['choices'][0]['message']['content'].lower()
        atom = entry['Factscore_facts'].split('--;--')[atom_index]

        predicted = calc_factscore(entry['word'], entry.get('english_word', entry['word']), entry['document_search_word'], generated_answer, atom, use_npm=True)
        data_dict[entry['id']]['atoms'].append({"atom": atom, "predicted": predicted})

    # Calculate the final prediction for each entry
    for entry_id, entry in data_dict.items():
        all_predictions = [decision['predicted'] == 'SUPPORTED' for decision in entry['atoms']]
        # A claim is SUPPORTED only if it has at least one atom and every atom is
        # supported. Entries without atoms end up NOT_SUPPORTED exactly as before,
        # but without np.mean([]) emitting a RuntimeWarning and producing NaN.
        is_fully_supported = bool(all_predictions) and all(all_predictions)

        entry['predicted'] = Fact.SUPPORTED.name if is_fully_supported else Fact.NOT_SUPPORTED.name

    # Write the output to a file
    fh.write(file_base_name.format(dataset=dataset_name, type='output'), data_dict.values())

100%|██████████| 1478/1478 [52:25<00:00,  2.13s/it]   


In [36]:
# Print a classification report (ground truth vs. Factscore prediction) per dataset.
for dataset_name in datasets:
    print(f'Evaluating {dataset_name} with Factscore...')
    file_name = file_base_name.format(dataset=dataset_name, type='output')
    # Entries whose prediction is still the -1 placeholder are left out of the report.
    scored_entries = [entry for entry in fh.read(file_name) if entry['predicted'] != -1]
    gt_labels = [Fact[entry['label']].to_factuality() for entry in scored_entries]
    pr_labels = [Fact[entry['predicted']].to_factuality() for entry in scored_entries]
    report = classification_report(gt_labels, pr_labels, zero_division=0, digits=4)
    print_classification_report(report)

Evaluating german_dpr-claim_verification with Factscore...
################################
              precision    recall  f1-score   support

           0     0.5354    0.9855    0.6939        69
           1     0.9167    0.1571    0.2683        70

    accuracy                         0.5683       139
   macro avg     0.7260    0.5713    0.4811       139
weighted avg     0.7274    0.5683    0.4796       139

################################
Evaluating german_wiktionary-claim_verification-mini with Factscore...
################################
              precision    recall  f1-score   support

           0     0.5226    1.0000    0.6864        81
           1     1.0000    0.0633    0.1190        79

    accuracy                         0.5375       160
   macro avg     0.7613    0.5316    0.4027       160
weighted avg     0.7583    0.5375    0.4063       160

################################
Evaluating squad-claim_verification with Factscore...
##############################