In [2]:
from datasets import load_dataset, concatenate_datasets

In [6]:
dataset = load_dataset("lukasellinger/fever_evidence_selection-v1")
combined_dataset = concatenate_datasets([dataset['train'], dataset['dev'], dataset['test']])

In [10]:
combined_dataset[3]['evidence_lines']

'0;1;6;7;14;16;15'

In [15]:
# Count entries with more than 3 evidence lines
count = 0
for entry in combined_dataset:
    evidences = entry['evidence_lines'].split(';')
    min_evidence = 4
    for evidence in evidences:
        evidence_len = len(evidence.split(','))
        if min_evidence > evidence_len:
            min_evidence = evidence_len
    if min_evidence > 3:
        count += 1
        
print(f"Number of entries with evidence lines > 3: {count}")

Number of entries with evidence lines > 3: 9


In [13]:
len(combined_dataset)

32900

In [8]:
# Datasets with language information
datasets = {
    'german_dpr-claim_verification': {
        'dataset': load_dataset('lukasellinger/german_dpr-claim_verification', split='test'),
        'lang': 'de'
    },
    'german_wiktionary-claim_verification-mini': {
        'dataset': load_dataset('lukasellinger/german_wiktionary-claim_verification-mini', split='test'),
        'lang': 'de'
    },
    'squad-claim_verification': {
        'dataset': load_dataset('lukasellinger/squad-claim_verification', split='test'),
        'lang': 'en'
    },
    'shroom-claim_verification': {
        'dataset': load_dataset('lukasellinger/shroom-claim_verification', split='test'),
        'lang': 'en'
    }
    # optional (contains 10k entries)
    #'german_wiktionary-claim_verification-large': {
    #    'dataset': load_dataset('lukasellinger/german_wiktionary-claim_verification-large', split='test'),
    #    'lang': 'de'
    #},
    # outdated
    #'german-claim_verification': {
    #    'dataset': load_dataset('lukasellinger/german-claim_verification', split='test'),
    #    'lang': 'de'
    #},
}

fever = load_dataset('lukasellinger/filtered_fever-claim_verification')

In [10]:
for dataset_name, config in datasets.items():
    dataset = config['dataset']
    data_dict = {}
    
    not_in_wiki = 0
    avg_claim_count_wiki = {'DisSim_facts': 0,
                            'Factscore_facts': 0,
                            'T5SplitRephrase_facts': 0}
    # Filter out entries not in the wiki and prepare the data_dict
    for entry in dataset:
        if entry['in_wiki'] == 'No':
            not_in_wiki += 1
        else:
            for key in avg_claim_count_wiki.keys():
                avg_claim_count_wiki[key] += len(entry[key].split('--;--'))        
    for key, value in avg_claim_count_wiki.items():
        avg_claim_count_wiki[key] = round(value / (len(dataset) - not_in_wiki), 2)
    print(f'{dataset_name}: {1 - round(not_in_wiki / len(dataset), 4)}, {not_in_wiki}')
    print(avg_claim_count_wiki)
    print('-----------------')

german_dpr-claim_verification: 0.8274, 29
{'DisSim_facts': 1.87, 'Factscore_facts': 3.56, 'T5SplitRephrase_facts': 1.47}
-----------------
german_wiktionary-claim_verification-mini: 0.8, 40
{'DisSim_facts': 1.7, 'Factscore_facts': 3.76, 'T5SplitRephrase_facts': 1.62}
-----------------
squad-claim_verification: 0.7975, 32
{'DisSim_facts': 1.12, 'Factscore_facts': 2.39, 'T5SplitRephrase_facts': 1.06}
-----------------
shroom-claim_verification: 0.9627, 21
{'DisSim_facts': 1.27, 'Factscore_facts': 2.73, 'T5SplitRephrase_facts': 1.22}
-----------------


In [12]:
dpr = 82.01
wiki = 70.63
squad = 87.30
shroom = 69.37

result = (136 * dpr + 160 * wiki + 126 * squad + 542 * shroom) / 967
print(result)

73.47724922440538


In [7]:
for dataset_name, config in datasets.items():
    dataset = config['dataset']
    
    def claim_word_length(example):
        # Split the claim into words and count the number of words
        return {"word_length": len(example['claim'].split())}
    
    # Apply the function to the entire dataset using map
    dataset_with_lengths = dataset.map(claim_word_length)
    
    # Now compute the average word length
    avg_word_length = sum(dataset_with_lengths['word_length']) / len(dataset_with_lengths)
    
    # Print the result
    print(f"{dataset_name}: Avg Claim Length: {avg_word_length:.2f}")


german_dpr-claim_verification: Avg Claim Length: 11.93
german_wiktionary-claim_verification-mini: Avg Claim Length: 11.19
squad-claim_verification: Avg Claim Length: 3.32
shroom-claim_verification: Avg Claim Length: 6.48


In [10]:
for dataset_name, dataset in fever.items():    
    def claim_word_length(example):
        # Split the claim into words and count the number of words
        return {"word_length": len(example['claim'].split())}
    
    # Apply the function to the entire dataset using map
    dataset_with_lengths = dataset.map(claim_word_length)
    
    # Now compute the average word length
    avg_word_length = sum(dataset_with_lengths['word_length']) / len(dataset_with_lengths)
    
    # Print the result
    print(f"{dataset_name}: Avg Claim Length: {avg_word_length:.2f}")

train: Avg Claim Length: 7.23
dev: Avg Claim Length: 7.25
test: Avg Claim Length: 7.35
