In [1]:
from datasets import load_dataset, concatenate_datasets
from tqdm import tqdm
from pipeline_module.evidence_fetcher import WikipediaEvidenceFetcher

In [6]:
dataset = load_dataset("lukasellinger/fever_evidence_selection-v1")
combined_dataset = concatenate_datasets([dataset['train'], dataset['dev'], dataset['test']])

In [10]:
combined_dataset[3]['evidence_lines']

'0;1;6;7;14;16;15'

In [15]:
# Count entries with more than 3 evidence lines
count = 0
for entry in combined_dataset:
    evidences = entry['evidence_lines'].split(';')
    min_evidence = 4
    for evidence in evidences:
        evidence_len = len(evidence.split(','))
        if min_evidence > evidence_len:
            min_evidence = evidence_len
    if min_evidence > 3:
        count += 1
        
print(f"Number of entries with evidence lines > 3: {count}")

Number of entries with evidence lines > 3: 9


In [13]:
len(combined_dataset)

32900

In [2]:
# Datasets with language information
datasets = {
    'german_dpr-claim_verification': {
        'dataset': load_dataset('lukasellinger/german_dpr-claim_verification', split='test'),
        'lang': 'de'
    },
    'german_wiktionary-claim_verification-mini': {
        'dataset': load_dataset('lukasellinger/german_wiktionary-claim_verification-mini', split='test'),
        'lang': 'de'
    },
    'squad-claim_verification': {
        'dataset': load_dataset('lukasellinger/squad-claim_verification', split='test'),
        'lang': 'en'
    },
    'shroom-claim_verification': {
        'dataset': load_dataset('lukasellinger/shroom-claim_verification', split='test'),
        'lang': 'en'
    }
    # optional (contains 10k entries)
    #'german_wiktionary-claim_verification-large': {
    #    'dataset': load_dataset('lukasellinger/german_wiktionary-claim_verification-large', split='test'),
    #    'lang': 'de'
    #},
    # outdated
    #'german-claim_verification': {
    #    'dataset': load_dataset('lukasellinger/german-claim_verification', split='test'),
    #    'lang': 'de'
    #},
}

fever = load_dataset('lukasellinger/filtered_fever-claim_verification')

In [10]:
for dataset_name, config in datasets.items():
    dataset = config['dataset']
    data_dict = {}
    
    not_in_wiki = 0
    avg_claim_count_wiki = {'DisSim_facts': 0,
                            'Factscore_facts': 0,
                            'T5SplitRephrase_facts': 0}
    # Filter out entries not in the wiki and prepare the data_dict
    for entry in dataset:
        if entry['in_wiki'] == 'No':
            not_in_wiki += 1
        else:
            for key in avg_claim_count_wiki.keys():
                avg_claim_count_wiki[key] += len(entry[key].split('--;--'))        
    for key, value in avg_claim_count_wiki.items():
        avg_claim_count_wiki[key] = round(value / (len(dataset) - not_in_wiki), 2)
    print(f'{dataset_name}: {1 - round(not_in_wiki / len(dataset), 4)}, {not_in_wiki}')
    print(avg_claim_count_wiki)
    print('-----------------')

german_dpr-claim_verification: 0.8274, 29
{'DisSim_facts': 1.87, 'Factscore_facts': 3.56, 'T5SplitRephrase_facts': 1.47}
-----------------
german_wiktionary-claim_verification-mini: 0.8, 40
{'DisSim_facts': 1.7, 'Factscore_facts': 3.76, 'T5SplitRephrase_facts': 1.62}
-----------------
squad-claim_verification: 0.7975, 32
{'DisSim_facts': 1.12, 'Factscore_facts': 2.39, 'T5SplitRephrase_facts': 1.06}
-----------------
shroom-claim_verification: 0.9627, 21
{'DisSim_facts': 1.27, 'Factscore_facts': 2.73, 'T5SplitRephrase_facts': 1.22}
-----------------


In [9]:
dpr = 3.56
wiki = 3.76
squad = 2.39
shroom = 2.73

result = (136 * dpr + 160 * wiki + 126 * squad + 542 * shroom) / 967
print(result)

2.9643846949327823


In [7]:
for dataset_name, config in datasets.items():
    dataset = config['dataset']
    
    def claim_word_length(example):
        # Split the claim into words and count the number of words
        return {"word_length": len(example['claim'].split())}
    
    # Apply the function to the entire dataset using map
    dataset_with_lengths = dataset.map(claim_word_length)
    
    # Now compute the average word length
    avg_word_length = sum(dataset_with_lengths['word_length']) / len(dataset_with_lengths)
    
    # Print the result
    print(f"{dataset_name}: Avg Claim Length: {avg_word_length:.2f}")


german_dpr-claim_verification: Avg Claim Length: 11.93
german_wiktionary-claim_verification-mini: Avg Claim Length: 11.19
squad-claim_verification: Avg Claim Length: 3.32
shroom-claim_verification: Avg Claim Length: 6.48


In [10]:
for dataset_name, dataset in fever.items():    
    def claim_word_length(example):
        # Split the claim into words and count the number of words
        return {"word_length": len(example['claim'].split())}
    
    # Apply the function to the entire dataset using map
    dataset_with_lengths = dataset.map(claim_word_length)
    
    # Now compute the average word length
    avg_word_length = sum(dataset_with_lengths['word_length']) / len(dataset_with_lengths)
    
    # Print the result
    print(f"{dataset_name}: Avg Claim Length: {avg_word_length:.2f}")

train: Avg Claim Length: 7.23
dev: Avg Claim Length: 7.25
test: Avg Claim Length: 7.35


In [6]:
offline_evid_fetcher = WikipediaEvidenceFetcher(offline=True)

for dataset_name, config in datasets.items():
    dataset = config['dataset']    
    evidence_presence = {'in_both': 0,
                         'only_wikipedia': 0,
                         'only_wiktionary': 0,
                         'in_none': 0}
    for entry in tqdm(dataset):
        if entry['in_wiki'] == 'No':
            evidence_presence['in_none'] += 1
            continue
            
        _, evidences = offline_evid_fetcher.fetch_evidences(search_word=entry['document_search_word'])
        
        in_wikipedia = False
        in_wiktionary = False
        for evidence in evidences:
            if in_wikipedia and in_wiktionary:
                break
            
            if evidence.get('title').endswith('(wikipedia)'):
                in_wikipedia = True
            else:
                in_wiktionary = True
        
        assert in_wikipedia or in_wiktionary, f"Evidence must be in wikipedia or wiktionary. But found else {entry}"
        if in_wikipedia and in_wiktionary:
            evidence_presence['in_both'] += 1
        elif in_wikipedia:
            evidence_presence['only_wikipedia'] += 1
        elif in_wiktionary:
            evidence_presence['only_wiktionary'] += 1
        
    print(f'{dataset_name}:')
    print(evidence_presence)
    print('-----------------')

100%|██████████| 168/168 [00:30<00:00,  5.58it/s]


german_dpr-claim_verification:
{'in_both': 89, 'only_wikipedia': 47, 'only_wiktionary': 3, 'in_none': 29}
-----------------


 12%|█▏        | 23/200 [00:03<00:36,  4.86it/s]

Imam/German/noun: DEBUG: unrecognized sense qualifier: Twelver Shiism at ['Imam']
imam/English/noun: DEBUG: unrecognized sense qualifier: Twelver Shi'ism at ['imam']


 30%|██▉       | 59/200 [00:06<00:08, 17.42it/s]

unterbinden/German/verb: DEBUG: unrecognized sense qualifier: transitive or dative reflexive, dated or regional at ['unterbinden']


 32%|███▏      | 64/200 [00:07<00:13,  9.71it/s]

Tüte/German/proper noun: DEBUG: unrecognized sense qualifier: Bielefeld, colloquial at ['Tüte']


 34%|███▍      | 68/200 [00:08<00:12, 10.38it/s]

request/English/verb: DEBUG: unrecognized sense qualifier: transitive or with a subjunctive clause at ['request']


 64%|██████▍   | 128/200 [00:15<00:11,  6.12it/s]

Verbesserung/German/noun: DEBUG: unrecognized sense qualifier: homework at ['Verbesserung']


 68%|██████▊   | 137/200 [00:21<00:45,  1.39it/s]

unterbinden/German/verb: DEBUG: unrecognized sense qualifier: transitive or dative reflexive, dated or regional at ['unterbinden']


 89%|████████▉ | 178/200 [00:26<00:03,  5.82it/s]

Imam/German/noun: DEBUG: unrecognized sense qualifier: Twelver Shiism at ['Imam']
imam/English/noun: DEBUG: unrecognized sense qualifier: Twelver Shi'ism at ['imam']


 98%|█████████▊| 195/200 [00:28<00:00, 10.75it/s]

specific/English/adjective: DEBUG: unrecognized sense qualifier: bioscience, taxonomy at ['specific']


100%|██████████| 200/200 [00:28<00:00,  7.02it/s]


german_wiktionary-claim_verification-mini:
{'in_both': 120, 'only_wikipedia': 18, 'only_wiktionary': 22, 'in_none': 40}
-----------------


100%|██████████| 158/158 [00:15<00:00, 10.01it/s]


squad-claim_verification:
{'in_both': 82, 'only_wikipedia': 40, 'only_wiktionary': 4, 'in_none': 32}
-----------------


  5%|▌         | 29/563 [00:00<00:12, 43.72it/s]

piss/English/noun: DEBUG: gloss may contain unhandled list items: 1999, Tin House #2 (→ISBN, Win McCormack, Rob Spillman, Elissa Schappell), page 170: at ['piss']


 59%|█████▊    | 330/563 [00:19<00:15, 15.26it/s]

demosophy/English/noun: DEBUG: unrecognized sense qualifier: folkloristics, sociology, rare at ['demosophy']


100%|██████████| 563/563 [00:33<00:00, 16.97it/s]

shroom-claim_verification:
{'in_both': 147, 'only_wikipedia': 0, 'only_wiktionary': 395, 'in_none': 21}
-----------------



