In [1]:
import string
from datasets import load_dataset
from general_utils.reader import JSONLineReader
import json
from config import PROJECT_DIR
from fetchers.wikipedia import Wikipedia
import numpy as np
from rank_bm25 import BM25Okapi
from general_utils.utils import parse_model_answer
from collections import defaultdict
import torch
from transformers import AutoTokenizer, AutoModelForMaskedLM
from fetchers.openai import OpenAiFetcher
from general_utils.atomic_facts import FactScoreFactGenerator
from tqdm import tqdm
from config import FACT_EVULATION_OPENAI_TOKEN
from sentence_transformers import SentenceTransformer

In [2]:
# Shared helpers for the rest of the notebook: the OpenAI client wrapper used for
# batch jobs and direct completions, and the reader/writer used for the .jsonl files.
openai_fetcher = OpenAiFetcher(api_key=FACT_EVULATION_OPENAI_TOKEN)
fh = JSONLineReader()

In [7]:
# Evaluation data: the `test` split of the claim-verification dataset named below.
dataset_name = "lukasellinger/german_wiktionary-claim_verification-mini"
dataset = load_dataset(dataset_name, split='test')
# NOTE(review): `use_dataset` presumably makes the Wikipedia fetcher read pages from
# this dump instead of the live API — confirm in fetchers.wikipedia.
offline_wiki = 'lukasellinger/wiki_dump_2024-08-14'
wiki = Wikipedia(use_dataset=offline_wiki)



# Calculate Atomic Facts

In [30]:
# Build one Batch-API request per dataset entry. Each request asks the completion
# model to break the entry's `connected_claim` into atomic facts, using the prompt
# produced by `FactScoreFactGenerator.get_prompt_for_sentence`.
#
# NOTE: the batch created from these requests was rejected with `model_not_found`
# ("gpt-3.5-turbo-instruct is not supported by the Batch API", see the printed batch
# status further down). The facts are therefore generated with direct completion
# calls in a later cell; this cell is kept only as a record of the attempt.
af_prompt_generator = FactScoreFactGenerator(PROJECT_DIR.joinpath('factscore/demos'), is_bio=False)
model = "gpt-3.5-turbo-instruct"
temperature = 0.7
max_tokens = 512

# `custom_id` carries the dataset index so results can be mapped back to entries.
tasks = [
    {
        "custom_id": f"task-{idx}",
        "method": "POST",
        "url": "/v1/completions",
        "body": {
            "model": model,
            "temperature": temperature,
            "max_tokens": max_tokens,
            "prompt": af_prompt_generator.get_prompt_for_sentence(entry['connected_claim']),
        },
    }
    for idx, entry in enumerate(dataset)
]

In [31]:
# Serialise the request list as JSON Lines: one request object per line.
identification = 'german_dpr_factscore_facts-gpt3_5-turbo'
file_name = PROJECT_DIR.joinpath(f'dataset/openai/batch_{identification}.jsonl')

with open(file_name, 'w') as batch_file:
    batch_file.writelines(json.dumps(request) + '\n' for request in tasks)

In [32]:
# Submit the request file as a batch job targeting the `/v1/completions` endpoint.
batch_job = openai_fetcher.create_batch_job(file_name, endpoint="/v1/completions")

In [34]:
# Fetch the latest state of the batch job and print it (status, errors, file ids).
batch_job = openai_fetcher.get_batch_update(batch_job)
print(batch_job)

Batch(id='batch_Z7ddZ8ocZzTbGFj5b52vu51f', completion_window='24h', created_at=1721146201, endpoint='/v1/completions', input_file_id='file-1yX3GMHuHT6sDMXoTLI0Xetd', object='batch', status='failed', cancelled_at=None, cancelling_at=None, completed_at=None, error_file_id=None, errors=Errors(data=[BatchError(code='model_not_found', line=1, message="The provided model 'gpt-3.5-turbo-instruct' is not supported by the Batch API.", param='body.model'), BatchError(code='model_not_found', line=2, message="The provided model 'gpt-3.5-turbo-instruct' is not supported by the Batch API.", param='body.model'), BatchError(code='model_not_found', line=3, message="The provided model 'gpt-3.5-turbo-instruct' is not supported by the Batch API.", param='body.model'), BatchError(code='model_not_found', line=4, message="The provided model 'gpt-3.5-turbo-instruct' is not supported by the Batch API.", param='body.model'), BatchError(code='model_not_found', line=5, message="The provided model 'gpt-3.5-turbo-i

In [28]:
# Retrieve the batch output and keep the path it was stored under.
# NOTE(review): the traceback in the next cell shows `None` reaching `fh.read`, so
# this call presumably returns None when the batch produced no output — confirm in
# OpenAiFetcher.get_batch_result.
result_file_name = openai_fetcher.get_batch_result(identification, batch_job)

In [29]:
# Load the batch results through the JSON-lines reader helper.
results = fh.read(result_file_name)

TypeError: expected str, bytes or os.PathLike object, not NoneType

In [9]:
# Generate the atomic facts with direct (synchronous) completion calls, one request
# per dataset entry. The facts of an entry are stored as a single string joined by
# '--;--' so they fit into one dataset column.
af_prompt_generator = FactScoreFactGenerator(PROJECT_DIR.joinpath('factscore/demos'), is_bio=False)
model, temperature, max_tokens = "gpt-3.5-turbo-instruct", 0.7, 512

fact_column = []
for entry in tqdm(dataset):
    claim = entry['connected_claim']
    response = openai_fetcher.client.completions.create(
        model=model,
        temperature=temperature,
        max_tokens=max_tokens,
        prompt=af_prompt_generator.get_prompt_for_sentence(claim),
        seed=42,
    )
    # Only the first choice is used; the generator parses it into a list of facts.
    facts = af_prompt_generator.get_facts_from_response(claim, response.choices[0].text)
    fact_column.append('--;--'.join(facts))

100%|██████████| 200/200 [02:51<00:00,  1.17it/s]


In [12]:
from datasets import DatasetDict
from config import HF_WRITE_TOKEN

# Attach the generated facts to the dataset and publish the updated `test` split.
#
# The column is named 'factscore_facts' (lowercase) because that is the key the
# prompt-building and scoring cells read via entry['factscore_facts'].
# A previously stored column of the same name is dropped first so that re-running
# this cell (or running it on a dataset that was already pushed) does not fail with
# a duplicate-column error in `add_column`.
if 'factscore_facts' in dataset.column_names:
    dataset = dataset.remove_columns('factscore_facts')
dataset = dataset.add_column('factscore_facts', fact_column)

# Push with an explicit split name instead of relying on the split metadata that
# happens to be attached to the in-memory dataset.
dataset.push_to_hub(dataset_name, split='test', token=HF_WRITE_TOKEN)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/752 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/lukasellinger/german_wiktionary-claim_verification-mini/commit/81dde5817f7fbbbbe56620c3bd1e768a7809207a', commit_message='Upload dataset', commit_description='', oid='81dde5817f7fbbbbe56620c3bd1e768a7809207a', pr_url=None, pr_revision=None, pr_num=None)

In [13]:
# Display the generated fact strings (one '--;--'-joined string per entry) for a manual check.
fact_column

['Reanalysis is a term.--;--Reanalysis means empirical research.--;--Empirical research involves re-evaluating a study.--;--Reanalysis uses original data for its research.',
 'A gold prospector is a person.--;--A gold prospector digs.--;--A gold prospector digs for gold.',
 'A war painting represents a painting.--;--It can be on the face or the whole body.--;--It is carried by indigenous peoples.--;--It is used before or during wartime conflicts.',
 'Endophytic medicine is a type of medicine.--;--It is related to biology.--;--It involves growing inward.',
 "Walking is a means of transportation.--;--Walking involves using one's feet.--;--Walking is a form of exercise.",
 'Hearing disruption is a medical term.--;--Hearing disruption can occur suddenly.--;--Hearing disruption can have no identifiable cause (idiopathic).--;--Hearing disruption often affects only one ear.--;--Hearing disruption can cause a sensation of sound disorder.',
 'Polynesia is a series of archipelagos in the Pacific

# Calculate OpenAI Prediction

In [34]:
def get_bm25_passages_indices(query, passages, k):
    """Rank passage texts against a query with BM25 and return the best indices.

    Args:
        query: Query string; tokenised by whitespace.
        passages: Non-empty list of passage strings. The markers "<s>" and "</s>"
            are removed before whitespace tokenisation.
        k: Maximum number of indices to return.

    Returns:
        Array of at most ``k`` indices into ``passages``, highest score first.

    Raises:
        AssertionError: If ``passages`` is empty.
    """
    assert len(passages) > 0, f'passages are empty for {query}'
    tokenized_corpus = []
    for passage in passages:
        cleaned = passage.replace("<s>", "").replace("</s>", "")
        tokenized_corpus.append(cleaned.split())
    ranker = BM25Okapi(tokenized_corpus)
    relevance = ranker.get_scores(query.split())
    # Negate so that argsort's ascending order puts the highest score first.
    descending = np.argsort(-relevance)
    return descending[:k]


def get_gtr_passages_indices(retrieval_query, passages, k, encoder, batch_size=8):
    """Rank passages against a query by inner product of encoder embeddings.

    Args:
        retrieval_query: Query string to embed.
        passages: List of dicts with "title" and "text" keys. Each passage is
            embedded as "<title> <text>" with "<s>" / "</s>" removed from the text.
        k: Maximum number of indices to return.
        encoder: Object exposing ``encode(texts, batch_size=..., device=...)`` and a
            ``device`` attribute (a SentenceTransformer in this notebook).
        batch_size: Batch size forwarded to ``encoder.encode``.

    Returns:
        Array of at most ``k`` indices into ``passages``, highest score first.
    """
    texts = []
    for passage in passages:
        body = passage["text"].replace("<s>", "").replace("</s>", "")
        texts.append(passage["title"] + " " + body)

    passage_embeddings = encoder.encode(texts, batch_size=batch_size, device=encoder.device)
    query_batch = encoder.encode([retrieval_query], batch_size=batch_size, device=encoder.device)
    query_embedding = query_batch[0]

    similarity = np.inner(query_embedding, passage_embeddings)
    # Negate so that argsort's ascending order puts the highest score first.
    return np.argsort(-similarity)[:k]


def get_passages(topic, fallback_topic, question, search_word, k=5, only_intro=True, word_lang='de', retrieval='bm25', encoder=None):
    """Fetch wiki passages for a topic and keep the ``k`` most relevant to a question.

    Pages are requested from the module-level ``wiki`` fetcher, split at passage
    level. The passages are then ranked against ``question`` with BM25 or with the
    embedding-based ranker.

    Args:
        topic: Word to look up.
        fallback_topic: Alternative word handed to ``wiki.get_pages``.
        question: Text the passages are ranked against.
        search_word: Forwarded to ``wiki.get_pages`` as ``search_word``.
        k: Maximum number of passages to return.
        only_intro: Forwarded to ``wiki.get_pages``.
        word_lang: Language code forwarded to ``wiki.get_pages``.
        retrieval: Either 'bm25' or 'gtr'.
        encoder: Embedding model, used only when ``retrieval`` is 'gtr'.

    Returns:
        ``{'word': <wiki word>, 'passages': [{'title': ..., 'text': ...}, ...]}``
        ordered best-first, or ``{}`` when the fetcher returned no texts.

    Raises:
        AssertionError: If ``retrieval`` is not one of the supported methods.
    """
    assert retrieval in ('bm25', 'gtr'), 'retrieval method not supported'
    texts, wiki_word = wiki.get_pages(topic, fallback_topic, word_lang, only_intro=only_intro, split_level='passage', search_word=search_word)
    if not texts:
        return {}

    candidates = []
    for text in texts:
        # Keep only the part of the title before the first '(wik'.
        title = str(text[0]).split('(wik')[0]
        candidates.append({'title': title, 'text': text[1]})

    if retrieval == 'bm25':
        order = get_bm25_passages_indices(question, [candidate.get('text') for candidate in candidates], k)
    else:
        order = get_gtr_passages_indices(question, candidates, k, encoder)
    return {'word': wiki_word, 'passages': [candidates[i] for i in order]}


def build_prompts(topic, fallback_topic, search_word, atomic_facts, retrieval='gtr', encoder=None):
    """Create one True/False verification prompt per atomic fact.

    For each fact the five best passages are retrieved with ``get_passages``. They
    are written into the prompt in reverse rank order, so the best passage ends up
    closest to the question. Facts for which no word or no passages come back are
    skipped and counted.

    Args:
        topic: Word the facts are about.
        fallback_topic: Alternative word forwarded to ``get_passages``.
        search_word: Search word forwarded to ``get_passages``.
        atomic_facts: Iterable of fact strings; each is stripped before use.
        retrieval: Retrieval method forwarded to ``get_passages``.
        encoder: Embedding model forwarded to ``get_passages``.

    Returns:
        Tuple ``(prompts, not_in_context)``: the prompts of the facts that had
        context (in input order) and the number of skipped facts.
    """
    prompts = []
    not_in_context = 0
    for raw_atom in atomic_facts:
        fact = raw_atom.strip()
        retrieved = get_passages(topic, fallback_topic, fact, search_word, k=5, retrieval=retrieval, encoder=encoder)
        word, passages = retrieved.get('word'), retrieved.get('passages')

        if not word or not passages:
            not_in_context += 1
            continue

        sections = [
            "Title: {}\nText: {}\n\n".format(passage["title"], passage["text"].replace("<s>", "").replace("</s>", ""))
            for passage in reversed(passages)
        ]
        header = "Answer the question about {} based on the given context.\n\n".format(word)
        definition = header + "".join(sections).strip()
        # Make sure the context ends with punctuation before the question follows.
        if definition[-1] not in string.punctuation:
            definition += "."
        prompts.append("{}\n\nInput: {} True or False?\nOutput:".format(definition.strip(), fact))
    return prompts, not_in_context

In [57]:
# Build the chat-completion batch requests that ask the model whether each atomic
# fact is supported by the retrieved wiki passages.
encoder = SentenceTransformer("sentence-transformers/gtr-t5-large")
encoder = encoder.to("cuda" if torch.cuda.is_available() else "cpu")
encoder = encoder.eval()

model = "gpt-3.5-turbo"
temperature = 0.7
max_tokens = 2048
tasks = []
total_not_in_context = 0  # number of entries with at least one fact lacking context

# `enumerate(tqdm(...))` rather than `tqdm(enumerate(...))` so tqdm sees the dataset
# length and can show a real progress bar with an ETA.
for idx, entry in enumerate(tqdm(dataset)):
    topic = entry['word']
    atomic_facts = entry['factscore_facts'].split('--;--')
    entry_has_missing_context = False

    # Prompts are built one fact at a time so `custom_id` can carry the fact's index
    # in `atomic_facts`. The scoring cell looks the fact up by that index. Numbering
    # the prompts of a filtered list instead would shift every fact that follows a
    # skipped one.
    for atom_idx, atom in enumerate(atomic_facts):
        prompts, not_in_context = build_prompts(topic, entry.get('english_word', topic), entry.get('document_search_word'), [atom], encoder=encoder)
        if not_in_context > 0:
            entry_has_missing_context = True
        for prompt in prompts:
            task = {
                "custom_id": f"task-{idx}-{atom_idx}",
                "method": "POST",
                "url": "/v1/chat/completions",
                "body": {
                    "model": model,
                    "temperature": temperature,
                    "max_tokens": max_tokens,
                    "messages": [
                        {
                            "role": "user",
                            "content": prompt
                        }
                    ],
                }
            }
            tasks.append(task)

    total_not_in_context += 1 if entry_has_missing_context else 0
print(f'Not found {total_not_in_context}')

168it [1:01:18, 21.90s/it]

Not found 29





In [58]:
# Serialise the verification requests as JSON Lines: one request object per line.
identification = 'german_dpr_factscore-gpt3_5-turbo-gtr'
file_name = PROJECT_DIR.joinpath(f'dataset/openai/input/german_dpr/batch_{identification}.jsonl')
with open(file_name, 'w') as batch_file:
    batch_file.writelines(json.dumps(request) + '\n' for request in tasks)

In [59]:
# Submit the request file as a batch job (no endpoint argument is passed here, so
# the fetcher's default is used).
batch_job = openai_fetcher.create_batch_job(file_name)

In [64]:
# Fetch the latest state of the batch job and print it (status, request counts, file ids).
batch_job = openai_fetcher.get_batch_update(batch_job)
print(batch_job)

Batch(id='batch_dC6P1N9kcyiI2AAsTzmjK8Bi', completion_window='24h', created_at=1721254761, endpoint='/v1/chat/completions', input_file_id='file-8XnNUPaWHyPLrfcQVFA0XStR', object='batch', status='completed', cancelled_at=None, cancelling_at=None, completed_at=1721254840, error_file_id=None, errors=None, expired_at=None, expires_at=1721341161, failed_at=None, finalizing_at=1721254823, in_progress_at=1721254762, metadata=None, output_file_id='file-rzuK81AAKAZaB1FTrVvIlc0h', request_counts=BatchRequestCounts(completed=495, failed=0, total=495))


In [65]:
# Retrieve the output of the batch and keep the path it was stored under;
# `identification` names the run.
result_file_name = openai_fetcher.get_batch_result(f'{identification}', batch_job)

In [48]:
#result_file_name = PROJECT_DIR.joinpath('dataset/openai/output/german_dpr/output_german_dpr_factscore-gpt3_5-turbo-bm25-raw.jsonl')

# Calculate FActScore

In [66]:
# Load the batch responses from the saved result file. Each record is later read
# via its 'custom_id' and 'response' keys.
results = fh.read(result_file_name)

In [67]:
def softmax(x):
    """Return the softmax of ``x`` computed over all of its elements.

    The maximum is subtracted before exponentiation so that large inputs do not
    overflow.
    """
    shifted_exp = np.exp(x - np.max(x))
    return shifted_exp / shifted_exp.sum()

class NPM:
    """Scores a fact against retrieved wiki passages with a `facebook/npm*` masked LM.

    The tokenizer and the stopword ids are loaded on construction. The model weights
    are loaded lazily on the first call to ``encode``.
    """

    def __init__(self, model_name):
        """Prepare tokenizer, device and stopword ids for ``facebook/<model_name>``.

        Args:
            model_name: Model name without the "facebook/" prefix; must start
                with "npm".

        Raises:
            AssertionError: If ``model_name`` does not start with "npm".
        """
        assert model_name.startswith("npm")
        self.model_name = model_name
        self.model = None  # set by load_model() on first use

        self.tokenizer = AutoTokenizer.from_pretrained("facebook/" + self.model_name)
        self.mask_id = self.tokenizer.mask_token_id
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

        # The file holds one integer token id per line; these ids are never masked
        # when a fact is scored.
        with open(PROJECT_DIR.joinpath("factscore/roberta_stopwords.txt"), "r") as f:
            self.stopwords = set()
            for line in f:
                self.stopwords.add(int(line.strip()))

    def load_model(self):
        """Load the masked LM, move it to ``self.device`` and put it in eval mode."""
        self.model = AutoModelForMaskedLM.from_pretrained("facebook/" + self.model_name)
        self.model.to(self.device)
        self.model.eval()

    def tokenize(self, texts, skip_special_tokens=False, padding=True):
        """Tokenize a list of texts, optionally stripping special tokens and padding.

        Args:
            texts: List of strings.
            skip_special_tokens: If True, drop the first and last id of every
                sequence (asserted to be 0 and 2).
            padding: If False, return the plain list of id lists.

        Returns:
            With ``padding=False``: a list of token-id lists. Otherwise a tuple
            ``(input_ids, attention_mask)`` of LongTensors, right-padded with id 0
            to the longest sequence; the mask is 1 for real tokens and 0 for padding.
        """
        assert type(texts)==list
        all_input_ids = self.tokenizer(texts)["input_ids"]
        if skip_special_tokens:
            for i, input_ids in enumerate(all_input_ids):
                assert input_ids[0]==0 and input_ids[-1]==2
                all_input_ids[i] = input_ids[1:-1]
        if not padding:
            return all_input_ids
        max_length = np.max([len(_ids) for _ids in all_input_ids])
        _all_input_ids = []
        _all_attention_mask = []
        for i, input_ids in enumerate(all_input_ids):
            n_valid = len(input_ids)
            n_masks = max_length - n_valid
            _all_input_ids.append(input_ids + [0 for _ in range(n_masks)])
            _all_attention_mask.append([1 for _ in range(n_valid)] + [0 for _ in range(n_masks)])
        return torch.LongTensor(_all_input_ids), torch.LongTensor(_all_attention_mask)

    def decode(self, input_ids):
        """Turn a list of token ids back into text with the tokenizer."""
        return self.tokenizer.decode(input_ids)

    def encode(self, texts, skip_special_tokens=False, gt_input_ids=None):
        """Run the model on ``texts`` and return one result tuple per text.

        Args:
            texts: List of strings.
            skip_special_tokens: Forwarded to ``tokenize``.
            gt_input_ids: Required for texts that contain the mask token: the
                original token id at the masked position, one per text.

        Returns:
            A list with one tuple per text. For a text containing the mask token:
            ``(prob, hidden)``, where ``prob`` is the softmax probability of the
            ground-truth id at the first mask position and ``hidden`` is the
            last-layer hidden state there. For any other text:
            ``(input_ids, hidden_states)`` with every position whose id is 0 or 2
            (which includes the padding) removed.
        """
        assert type(texts)==list
        if self.model is None:
            self.load_model()
        if gt_input_ids is not None:
            assert len(texts)==len(gt_input_ids)
        all_input_ids, all_attention_mask = self.tokenize(texts, skip_special_tokens=skip_special_tokens)

        with torch.no_grad():
            outputs = self.model(all_input_ids.to(self.device),
                                 all_attention_mask.to(self.device),
                                 output_hidden_states=True,
                                 return_dict=True)
            all_logits = outputs["logits"].detach().cpu().numpy()
            all_hidden_states = outputs["hidden_states"][-1].detach().cpu().numpy()

        results = []
        for i, (text, input_ids, logits, hidden_states) in enumerate(zip(texts, all_input_ids, all_logits, all_hidden_states)):
            input_ids = input_ids.numpy().tolist()
            if self.mask_id in input_ids:
                idx = input_ids.index(self.mask_id)
                assert gt_input_ids is not None
                prob = softmax(logits[idx])[gt_input_ids[i]]
                results.append((prob, hidden_states[idx]))
            else:
                _input_ids = [_id for _id in input_ids if _id not in [0, 2]]
                _hidden_states = [h for _id, h in zip(input_ids, hidden_states) if _id not in [0, 2]]
                results.append((_input_ids, _hidden_states))

        return results

    def get_probabilty(self, topic, fallback_topic, question, search_word):
        """Estimate how strongly the tokens of a fact are backed by wiki passages.

        Steps:
          1. Retrieve up to three intro passages with ``get_passages`` and encode
             them into token ids and hidden states.
          2. Tokenize "Fact: " + question and, for every token after the prefix
             that is not a stopword, build a copy with that token masked.
          3. Compare the hidden state at each mask with every passage token via
             ``exp(inner / sqrt(dim))``. A token's probability is the share of that
             score mass falling on passage positions with the same token id, or 0
             if the token does not occur in the passages.

        Args:
            topic: Word to look up.
            fallback_topic: Alternative word forwarded to ``get_passages``.
            question: The fact to score.
            search_word: Search word forwarded to ``get_passages``.

        Returns:
            The mean token probability. 0.0 when no passages were retrieved or the
            passages contain no usable tokens, because then nothing can back the fact.

        Raises:
            ValueError: From ``np.stack`` if every token of the fact is skipped
                (prefix or stopword), so there is nothing to score.
        """
        retrieved = get_passages(topic, fallback_topic, question, search_word, k=3, only_intro=True, word_lang='de')
        # get_passages returns {} when nothing was found, so 'passages' may be absent.
        passages = [p["text"].strip() for p in (retrieved.get('passages') or [])]
        if not passages:
            return 0.0

        encoded = self.encode(passages, skip_special_tokens=True)
        stacked_passage_tokens, stacked_passage_vectors = [], []
        for input_ids, vectors in encoded:
            stacked_passage_tokens += input_ids
            if len(vectors)>0:
                stacked_passage_vectors.append(vectors)
        if not stacked_passage_vectors:
            # Every passage was empty after tokenisation; np.concatenate would raise.
            return 0.0
        stacked_passage_vectors = np.concatenate(stacked_passage_vectors, 0)

        question_input_ids = self.tokenize(["Fact: " + question], skip_special_tokens=False, padding=False)[0]
        # Cut at the first id 2 and drop the leading id, leaving only content tokens.
        if 2 in question_input_ids:
            question_input_ids = question_input_ids[:question_input_ids.index(2)]
        question_input_ids = question_input_ids[1:]

        triples = []
        batch = []
        gt_input_ids = []
        prefix = True
        for i, input_id in enumerate(question_input_ids):
            if prefix:
                # NOTE(review): 35 is presumably the id of ":" ending "Fact:" — confirm.
                if input_id==35: # the end of prefix
                    prefix = False
                continue
            if input_id in [0, 2] or input_id in self.stopwords:
                continue
            batch.append(self.decode(question_input_ids[:i] + [self.mask_id] + question_input_ids[i+1:]))
            gt_input_ids.append(input_id)
        for (prob, vector), gt_input_id in zip(self.encode(batch, gt_input_ids=gt_input_ids), gt_input_ids):
            triples.append((prob, vector, gt_input_id))

        stacked_question_vectors = np.stack([v for _, v, _ in triples], 0)
        all_scores = np.exp(np.inner(stacked_question_vectors, stacked_passage_vectors) / np.sqrt(stacked_passage_vectors.shape[-1]))

        probs = []
        for (softmax_prob, vector, input_id), scores in zip(triples, all_scores):
            assert len(stacked_passage_tokens)==len(scores)
            if input_id not in stacked_passage_tokens:
                probs.append(0)
            else:
                # Group the scores by passage token id, then take this token's share.
                aggregated_scores = defaultdict(list)
                for token, score in zip(stacked_passage_tokens, scores):
                    aggregated_scores[token].append(score)
                tot = np.sum([np.sum(v) for v in aggregated_scores.values()])
                prob = np.sum(aggregated_scores[input_id]) / tot
                probs.append(prob)
        return np.mean(probs)

In [68]:
# Create the NPM scorer. The constructor loads only the tokenizer and the stopword
# ids; the model weights are loaded on the first `encode` call.
npm = NPM('npm-single')

def calc_factscore(topic, fallback_topic, search_word, generated_answer, atom, use_npm=True, npm_threshold=0.3):
    """Decide whether one atomic fact counts as supported.

    The model's answer is parsed first. A 'SUPPORTED' verdict is then optionally
    double-checked with the module-level NPM scorer and downgraded when the NPM
    probability does not exceed the threshold.

    Args:
        topic: Word the fact is about.
        fallback_topic: Alternative word forwarded to the NPM scorer.
        search_word: Search word forwarded to the NPM scorer.
        generated_answer: Raw model answer, passed to ``parse_model_answer``.
        atom: The atomic fact text.
        use_npm: If False, the parsed verdict is returned unchanged.
        npm_threshold: An NPM probability must be strictly greater than this value
            for a 'SUPPORTED' verdict to stand. Defaults to 0.3.

    Returns:
        The parsed verdict, or 'NOT_SUPPORTED' if the NPM check overruled it.
    """
    verdict = parse_model_answer(generated_answer)
    if verdict == 'SUPPORTED' and use_npm:
        npprob = npm.get_probabilty(topic, fallback_topic, atom, search_word)
        # `not (p > t)` rather than `p <= t` so that a NaN probability is rejected too.
        if not npprob > npm_threshold:
            verdict = 'NOT_SUPPORTED'
    return verdict

In [69]:
from dataset.def_dataset import Fact

# Combine the per-fact verdicts into one prediction per dataset entry and write the
# result as JSON lines. An entry is predicted SUPPORTED only if it has at least one
# scored fact and every one of its facts is SUPPORTED.

# One output record per entry; entries marked as not in the wiki are excluded.
data_dict = {}
for entry in dataset:
    if entry['in_wiki'] == 'No':
        continue

    data_dict[entry['id']] = {
        'id': entry['id'],
        'word': entry['word'],
        'claim': entry['claim'],
        'label': entry['label'],
        'predicted': -1,
        'atoms': [],
        'selected_evidences': []
    }

for res in tqdm(results):
    # custom_id has the form "task-<dataset index>-<atom index>".
    id_parts = res['custom_id'].split('-')
    index = int(id_parts[1])
    atom_index = int(id_parts[2])

    entry = dataset[index]
    if entry['id'] not in data_dict:
        # The entry was excluded above. Skip it before the expensive NPM scoring
        # instead of failing with a KeyError part-way through the loop.
        continue

    generated_answer = res['response']['body']['choices'][0]['message']['content'].lower()
    atom = entry['factscore_facts'].split('--;--')[atom_index]
    predicted = calc_factscore(entry['word'], entry['english_word'], entry['document_search_word'], generated_answer, atom, use_npm=True)
    data_dict[entry['id']]['atoms'].append({"atom": atom, "predicted": predicted})

for record in data_dict.values():
    atoms = record['atoms']
    # Same outcome as `np.mean(...) == 1`, but without the NaN and RuntimeWarning
    # that np.mean gives for an entry with no scored facts.
    fully_supported = bool(atoms) and all(decision['predicted'] == 'SUPPORTED' for decision in atoms)
    record['predicted'] = Fact.SUPPORTED.name if fully_supported else Fact.NOT_SUPPORTED.name

fh.write(PROJECT_DIR.joinpath(f'dataset/openai/output_{identification}.jsonl'), data_dict.values())

100%|██████████| 495/495 [11:27<00:00,  1.39s/it]


In [30]:
# Display the per-entry records (label, prediction and per-fact verdicts) for inspection.
data_dict.values()

dict_values([{'id': 1, 'word': 'Elektromotor', 'claim': 'elektrische Leistung in mechanische Leistung umwandelt', 'label': 'SUPPORTED', 'predicted': 'SUPPORTED', 'atoms': [{'atom': 'An electric motor represents electrical power.', 'predicted': 'SUPPORTED'}, {'atom': 'Electrical power is converted into mechanical power.', 'predicted': 'SUPPORTED'}, {'atom': 'Electrical power is converted into mechanical power.', 'predicted': 'SUPPORTED'}], 'selected_evidences': []}, {'id': 2, 'word': 'Zensur', 'claim': 'im engeren Wortsinn das Eingreifen staatlicher oder sonstiger machtgestützter Institutionen, durch die im Sinne dieser Institutionen unerwünschte Inhalte von der Medienöffentlichkeit ferngehalten werden sollen', 'label': 'SUPPORTED', 'predicted': 'NOT_SUPPORTED', 'atoms': [{'atom': 'Censorship refers to intervention by state or power-based institutions.', 'predicted': 'NOT_SUPPORTED'}, {'atom': 'Censorship aims to keep unwanted content away from the media public.', 'predicted': 'NOT_SUPP