In [1]:
# Mount Google Drive inside the Colab VM (force_remount=True requests a fresh mount).
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

# Switch the working directory to the Drive root ...
%cd /content/drive/My Drive/

# ... and then into the repository checkout; IPython expands {repository} to the
# value of the Python variable. Presumably this is what lets the project-local
# imports further down (fetchers, general_utils, pipeline_module) resolve.
repository = 'evaluating_factuality_word_definitions'
%cd {repository}

Mounted at /content/drive
/content/drive/My Drive
/content/drive/My Drive/evaluating_factuality_word_definitions


In [2]:
%%capture
# Install the notebook's dependencies; %%capture hides the installer output.
# NOTE(review): flash_attn and accelerate are installed unpinned, unlike the
# packages below that use a compatible-release (~=) pin.
!pip install flash_attn
!pip install accelerate
!pip install datasets~=2.18.0
!pip install einops~=0.8.0
!pip install rank_bm25~=0.2.2
!pip install openai~=1.35.10
!pip install sacremoses~=0.1.1
# NOTE(review): both git installs take whatever the default branch currently
# points to; consider pinning a commit for reproducibility.
!pip install git+https://github.com/hdaSprachtechnologie/odenet
!pip install git+https://github.com/tatuylonen/wiktextract.git

# Download the large English and German spaCy models.
!python -m spacy download en_core_web_lg
!python -m spacy download de_core_news_lg

In [3]:
import nltk
# Fetch the WordNet corpus into the local nltk_data directory (see the log line below).
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [4]:
from collections import defaultdict
from datasets import load_dataset, concatenate_datasets, Dataset, DatasetDict
import random
from tqdm import tqdm

from fetchers.wikipedia import Wikipedia

from general_utils.word_replacer import WordReplacer

from pipeline_module.translator import OpusMTTranslator
from pipeline_module.sentence_connector import PhiSentenceConnector
from pipeline_module.evidence_fetcher import WikipediaEvidenceFetcher
from pipeline_module.pipeline import Pipeline
from pipeline_module.claim_splitter import DisSimSplitter, T5SplitRephraseSplitter, FactscoreSplitter
import os
import numpy as np
import torch

In [5]:
# cuBLAS workspace configuration; the PyTorch reproducibility notes require it
# for deterministic algorithms on recent CUDA versions.
os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"

# One seed shared by every random number generator used in this notebook.
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)

# Trade speed for reproducibility: no cuDNN autotuning, deterministic cuDNN
# kernels, and deterministic algorithms requested from PyTorch.
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True
torch.use_deterministic_algorithms(True)

In [22]:
# Hub repository processed by steps 1-6: its 'test' split is loaded here and
# pushed back to the same repository at the end of each step.
# NOTE(review): several recorded CommitInfo outputs below point at other
# repositories, so they presumably stem from earlier runs with a different name.
dataset_name = 'lukasellinger/squad-claim_verification'
dataset = load_dataset(dataset_name, split='test')
# Batch size for the translation and sentence-connection loops; the splitter
# cells (step 4 onwards) overwrite it.
batch_size = 16

Downloading readme:   0%|          | 0.00/783 [00:00<?, ?B/s]

Downloading data: 100%|██████████| 71.5k/71.5k [00:00<00:00, 221kB/s]


Generating test split:   0%|          | 0/158 [00:00<?, ? examples/s]

In [23]:
# Sanity check: number of rows in the loaded split.
len(dataset)

158

# 1 Add negatives

In [24]:
# Supplies get_replacement(word, word_set), which the next cell uses to pick
# the substitute word for every negative example.
word_replacer = WordReplacer()



In [25]:
# Build one NOT_SUPPORTED counterpart per original row: the claim is kept and
# only the word is swapped for a replacement chosen by `word_replacer`.
negatives = []
# Sorted so the candidate pool has the same order in every kernel session.
# Iterating a raw set of strings depends on the per-process hash seed, which
# would make any seeded sampling from the pool irreproducible.
word_set = sorted(set(dataset['word']))
stats = defaultdict(int)  # replacement source -> number of negatives built from it
dataset = dataset.add_column('neg_source', len(dataset) * [None])

for entry in tqdm(dataset):
  replacement, stat = word_replacer.get_replacement(entry['word'], word_set)
  neg_entry = {'id': 0,  # placeholder, ids are reassigned after concatenation
               'claim': entry['claim'],
               'label': 'NOT_SUPPORTED',
               'word': replacement,
               'neg_source': stat}
  stats[stat] += 1
  negatives.append(neg_entry)

neg_dataset = Dataset.from_list(negatives)
dataset = concatenate_datasets([dataset, neg_dataset])

# Renumber all rows (originals first, then negatives) with 1-based ids.
dataset = dataset.map(lambda examples, idx: {'id': idx + 1}, with_indices=True)

data_dict = DatasetDict()
data_dict['test'] = dataset

data_dict.push_to_hub(dataset_name)
print(stats)

100%|██████████| 158/158 [00:01<00:00, 106.09it/s]

defaultdict(<class 'int'>, {'word_set': 78, 'english_antonym': 1})





# 2 Add Translations

In [None]:
# Translator that the next cell calls on batches of row dicts; the progress
# bars below suggest instantiation downloads its tokenizer/config files.
translator = OpusMTTranslator()

tokenizer_config.json:   0%|          | 0.00/42.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/1.38k [00:00<?, ?B/s]

source.spm:   0%|          | 0.00/797k [00:00<?, ?B/s]

target.spm:   0%|          | 0.00/768k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.27M [00:00<?, ?B/s]

In [None]:
# The translator receives plain row dicts in which the claim is stored under 'text'.
renamed_dataset = dataset.rename_column('claim', 'text')
list_dataset = renamed_dataset.to_list()

word_translations = []
claim_translations = []
for start in tqdm(range(0, len(dataset), batch_size)):
    translated_batch = translator(list_dataset[start:start + batch_size])
    # Each translated row carries the translated word and the translated claim text.
    for translation in translated_batch:
        word_translations.append(translation.get('word'))
        claim_translations.append(translation.get('text'))

# Store the translations next to the untouched original columns.
dataset = dataset.add_column('english_word', word_translations)
dataset = dataset.add_column('english_claim', claim_translations)

data_dict = DatasetDict({'test': dataset})
data_dict.push_to_hub(dataset_name)

100%|██████████| 313/313 [26:09<00:00,  5.01s/it]


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/10 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/455 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/lukasellinger/german_wiktionary-claim-verification-large/commit/4e77c7532ec2a5aa76be4961e63c2b70c62178f3', commit_message='Upload dataset', commit_description='', oid='4e77c7532ec2a5aa76be4961e63c2b70c62178f3', pr_url=None, pr_revision=None, pr_num=None)

# 3 Add Connected Sentence

In [None]:
# Sentence connector called on row batches in the next cell; flash attention is
# explicitly switched off via the constructor argument.
sent_connector = PhiSentenceConnector(use_flash_attn=False)

In [None]:
# Hand the connector the English word under 'word' and the claim under 'text';
# the original 'word' column is dropped from this view first to free the name.
renamed_dataset = (
    dataset
    .remove_columns(['word'])
    .rename_column('claim', 'text')
    .rename_column('english_word', 'word')
)
list_dataset = renamed_dataset.to_list()

connected_claims = []
for start in tqdm(range(0, len(dataset), batch_size)):
    connected_batch = sent_connector(list_dataset[start:start + batch_size])
    connected_claims += [entry.get('text') for entry in connected_batch]

# The connected sentence is added to the unrenamed dataset as a new column.
dataset = dataset.add_column('connected_claim', connected_claims)

data_dict = DatasetDict({'test': dataset})
data_dict.push_to_hub(dataset_name)

  0%|          | 0/36 [00:00<?, ?it/s]Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


Downloading shards:   0%|          | 0/2 [00:00<?, ?it/s]

model-00001-of-00002.safetensors:  30%|###       | 1.51G/4.97G [00:00<?, ?B/s]

model-00002-of-00002.safetensors:   0%|          | 0.00/2.67G [00:00<?, ?B/s]

You are attempting to use Flash Attention 2.0 with a model not initialized on GPU. Make sure to move the model to GPU after initializing it on CPU with `model.to('cuda')`.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

generation_config.json:   0%|          | 0.00/181 [00:00<?, ?B/s]

The `seen_tokens` attribute is deprecated and will be removed in v4.41. Use the `cache_position` model input instead.
 28%|██▊       | 10/36 [03:52<08:39, 19.99s/it]You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
100%|██████████| 36/36 [12:21<00:00, 20.61s/it]


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/631 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/lukasellinger/shroom-claim_verification/commit/99c8fc140e5dcb722661e79512047c60afd60055', commit_message='Upload dataset', commit_description='', oid='99c8fc140e5dcb722661e79512047c60afd60055', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
# Debug aid: show the columns of the renamed view built in the previous cell.
renamed_dataset.column_names

# 4 Add T5SplitRephrase Facts

In [None]:
# Claim splitter used by the next cell; it is called on a list of sentences and
# each result exposes its parts under 'splits'.
splitter = T5SplitRephraseSplitter()

tokenizer_config.json:   0%|          | 0.00/286 [00:00<?, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/21.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/141 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.40M [00:00<?, ?B/s]

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In [None]:
# From here on the splitter loops run with a batch size of 32.
batch_size = 32
renamed_dataset = dataset.rename_column('connected_claim', 'text')
list_dataset = renamed_dataset.to_list()

splitted_claims = []
for start in tqdm(range(0, len(dataset), batch_size)):
    texts = [row['text'] for row in list_dataset[start:start + batch_size]]
    # The facts of one claim are flattened into a single string joined by '--;--'.
    for result in splitter(texts):
        splitted_claims.append('--;--'.join(result.get('splits')))

dataset = dataset.add_column('T5SplitRephrase_facts', splitted_claims)

data_dict = DatasetDict({'test': dataset})
data_dict.push_to_hub(dataset_name)

  0%|          | 0/18 [00:00<?, ?it/s]

config.json:   0%|          | 0.00/852 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/892M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/133 [00:00<?, ?B/s]

100%|██████████| 18/18 [01:55<00:00,  6.40s/it]


Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/719 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/lukasellinger/shroom-claim_verification/commit/aeaab347e9bc166bbb2cb3ea1dfde4eeb8aa89a3', commit_message='Upload dataset', commit_description='', oid='aeaab347e9bc166bbb2cb3ea1dfde4eeb8aa89a3', pr_url=None, pr_revision=None, pr_num=None)

In [None]:
# Upload the current data_dict to dataset_name again.
# NOTE(review): the preceding cell already ends with the same push, so this
# looks like a leftover manual re-run and is probably redundant.
data_dict.push_to_hub(dataset_name)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/10 [00:00<?, ?ba/s]

CommitInfo(commit_url='https://huggingface.co/datasets/lukasellinger/german_wiktionary-claim-verification-large/commit/ad6bfa77e29fdd08aedd35ab29c1bb397d64b34e', commit_message='Upload dataset', commit_description='', oid='ad6bfa77e29fdd08aedd35ab29c1bb397d64b34e', pr_url=None, pr_revision=None, pr_num=None)

# 5 Add DisSim Facts

In [None]:
# Replaces the previous splitter with the DisSim-based one.
# NOTE(review): the recorded run of the next cell failed with
# FileNotFoundError: 'mvn', so presumably this splitter needs Maven on PATH.
splitter = DisSimSplitter()

In [None]:
# The batch size equals the dataset size, so the splitter is called once for all rows.
batch_size = len(dataset)
renamed_dataset = dataset.rename_column('connected_claim', 'text')
list_dataset = renamed_dataset.to_list()

splitted_claims = []
for start in tqdm(range(0, len(dataset), batch_size)):
    texts = [row['text'] for row in list_dataset[start:start + batch_size]]
    # The facts of one claim are flattened into a single string joined by '--;--'.
    for result in splitter(texts):
        splitted_claims.append('--;--'.join(result.get('splits')))

dataset = dataset.add_column('DisSim_facts', splitted_claims)

data_dict = DatasetDict({'test': dataset})
data_dict.push_to_hub(dataset_name)

  0%|          | 0/1 [00:00<?, ?it/s]


FileNotFoundError: [Errno 2] No such file or directory: 'mvn'

# 6 (Optional) Factscore Facts
This step is expensive to run.

In [None]:
# Replaces the previous splitter with the Factscore-based one (optional step).
splitter = FactscoreSplitter()

In [None]:
# The batch size equals the dataset size, so the splitter is called once for all rows.
batch_size = len(dataset)
renamed_dataset = dataset.rename_column('connected_claim', 'text')
list_dataset = renamed_dataset.to_list()

splitted_claims = []
for start in tqdm(range(0, len(dataset), batch_size)):
    texts = [row['text'] for row in list_dataset[start:start + batch_size]]
    # The facts of one claim are flattened into a single string joined by '--;--'.
    for result in splitter(texts):
        splitted_claims.append('--;--'.join(result.get('splits')))

dataset = dataset.add_column('Factscore_facts', splitted_claims)

data_dict = DatasetDict({'test': dataset})
data_dict.push_to_hub(dataset_name)

# 7 Download Wiki related

In [None]:
# Hub repository id -> language code passed to the wiki lookup for its words.
# Every id carries the owner namespace: the id is used both to load the data
# and, in the cells below, as the push target, so the two must not diverge.
dataset_langs = {
    'lukasellinger/german_wiktionary-claim_verification-large': 'de',
    'lukasellinger/german_dpr-claim_verification': 'de',
    'lukasellinger/german_wiktionary-claim_verification-mini': 'de',
    'lukasellinger/german-claim_verification': 'de',
    'lukasellinger/squad-claim_verification': 'en',
    'lukasellinger/shroom-claim_verification': 'en',
}

# repo id -> {'dataset': loaded 'test' split, 'lang': language code}
datasets = {
    repo_id: {'dataset': load_dataset(repo_id, split='test'), 'lang': lang}
    for repo_id, lang in dataset_langs.items()
}

## 7.1 Download wiki data + set document_search_word

In [None]:
# Every wiki page fetched by download(), one dict per page.
all_docs = []
# (word, fallback_word, word_lang) -> document_search_word of lookups already done.
processed_words = {}


def download(example):
    """Fetch the wiki pages for one dataset row and record its search word.

    Relies on two notebook globals that the driver cell sets before mapping:
    ``wiki`` (object providing ``get_pages``) and ``word_lang`` (language of
    the dataset currently being processed).

    Side effects: appends one entry per fetched page to ``all_docs`` and
    memoises the resolved search word in ``processed_words``.

    Args:
        example: Row mapping with a 'word' key and optionally 'english_word',
            which is used as the fallback word (defaults to the word itself).

    Returns:
        The same ``example`` with 'document_search_word' set.
    """
    word = example['word']
    fallback_word = example.get('english_word', word)

    # Key on every input of the lookup. A tuple avoids the ambiguity of string
    # concatenation ('ab' + 'c' == 'a' + 'bc'), and including the language keeps
    # equal words from datasets in different languages apart.
    cache_key = (word, fallback_word, word_lang)
    if cache_key in processed_words:
        example['document_search_word'] = processed_words[cache_key]
        return example

    full_docs, _ = wiki.get_pages(word, fallback_word, word_lang, only_intro=False, return_raw=True)
    intro_docs, document_search_word = wiki.get_pages(word, fallback_word, word_lang, only_intro=True, return_raw=True)

    example['document_search_word'] = document_search_word
    processed_words[cache_key] = document_search_word

    # Both results are iterables of (title, text) pairs; index them by title.
    full_by_title = dict(full_docs)
    intro_by_title = dict(intro_docs)

    # One record per full page; the intro text is None when no intro was returned.
    all_docs.extend({'search_word': document_search_word,
                     'title': title,
                     'raw_full_text': full_text,
                     'raw_intro_text': intro_by_title.get(title)}
                    for title, full_text in full_by_title.items())
    return example

In [None]:
# Fail fast, before any download starts, if a dataset has no usable 'word' value.
for dataset_name, dataset_info in datasets.items():
    assert dataset_info['dataset'][0].get('word'), f'No word in dataset {dataset_name}'


wiki = Wikipedia()
for dataset_name, dataset_info in tqdm(datasets.items()):
    word_lang = dataset_info['lang']  # read as a global inside download()
    # download() works through side effects (it fills all_docs), so a cached
    # map result must never be reused in place of actually running it.
    dataset = dataset_info['dataset'].map(download, load_from_cache_file=False)
    # Keep the enriched dataset: later cells read 'document_search_word' from
    # this dict, which the freshly loaded split may not contain.
    dataset_info['dataset'] = dataset

    data_dict = DatasetDict()
    data_dict['test'] = dataset

    data_dict.push_to_hub(dataset_name)

In [None]:
# Turn the collected pages into a dataset, number them from 1 and publish them.
# NOTE(review): the next section loads 'lukasellinger/wiki_dump_2024-08-14',
# not this repository; confirm which dump is the intended one.
all_docs_dataset = Dataset.from_list(all_docs).map(
    lambda examples, idx: {'id': idx + 1},
    with_indices=True,
)
all_docs_dataset.push_to_hub('lukasellinger/wiki_dump_2024-08-15')

## 7.2 Add Intro End to Wiki dump

In [None]:
dataset_name = 'lukasellinger/wiki_dump_2024-08-14'
dataset = load_dataset(dataset_name).get('train')

# Distinct search words occurring in the dump, taken via pandas.
df = dataset.to_pandas()
unique_search_words = df['search_word'].unique()

# Only the evidence fetcher is supplied; every other pipeline stage is passed as None.
fetcher = WikipediaEvidenceFetcher()
unset_stages = dict.fromkeys(
    ('translator', 'sent_connector', 'claim_splitter', 'evid_selector', 'stm_verifier', 'lang')
)
pipeline = Pipeline(evid_fetcher=fetcher, **unset_stages)

batch_size = 4
intro_ends = {}
for offset in range(0, len(unique_search_words), batch_size):
    search_words = unique_search_words[offset:offset + batch_size]
    batch = [{'document_search_word': search_word} for search_word in search_words]
    # Each call's result is merged into one lookup table.
    # NOTE(review): add_intro_end looks entries up by page title, so the keys
    # returned here are presumably titles; confirm against the pipeline.
    intro_ends.update(pipeline.mark_summary_sents_test_batch(batch))


def add_intro_end(entry):
    """Attach the intro-end sentence index of the entry's page, or -1 if unknown.

    Looks up ``entry['title']`` in the global ``intro_ends`` table, writes the
    result to ``entry['intro_end_sent_idx']`` and returns the same entry.
    """
    title = entry['title']
    entry['intro_end_sent_idx'] = intro_ends[title] if title in intro_ends else -1
    return entry


# Add the 'intro_end_sent_idx' column to every page and upload the result to
# the repository it was loaded from (dataset_name).
dataset = dataset.map(add_intro_end)
dataset.push_to_hub(dataset_name)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/430 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/lukasellinger/wiki_dump_2024-09-02_shroom/commit/f3056144b5703d4eb3b4e582cb498f9229441dd5', commit_message='Upload dataset', commit_description='', oid='f3056144b5703d4eb3b4e582cb498f9229441dd5', pr_url=None, pr_revision=None, pr_num=None)

## 7.3 Add in_wiki column

In [None]:
for dataset_name, dataset_info in tqdm(datasets.items()):
    dataset = dataset_info['dataset']
    word_lang = dataset_info['lang']

    # Despite its name, this counter tallies all three outcomes: 'Yes', 'No intro', 'No'.
    not_in_wiki = defaultdict(int)
    in_wiki_col = []
    for entry in tqdm(dataset):
        search_word = entry['document_search_word']
        texts, _ = wiki.get_pages('', '', only_intro=True, return_raw=True,
                                  search_word=search_word)
        if texts:
            status = 'Yes'
        else:
            # No intro text: query the full pages to tell "page exists but has
            # no intro" apart from "nothing found at all".
            texts_long, _ = wiki.get_pages('', '', only_intro=False, return_raw=True,
                                           search_word=search_word)
            status = 'No intro' if texts_long else 'No'
        in_wiki_col.append(status)
        not_in_wiki[status] += 1
    dataset = dataset.add_column('in_wiki', in_wiki_col)

    data_dict = DatasetDict({'test': dataset})
    data_dict.push_to_hub(dataset_name)
    print(f'{dataset_name} - not in Wiki: {not_in_wiki}')