# 0 Preparations
Before starting, ensure that you have cloned the repository to your Google Drive.
We will connect to this:

In [1]:
from google.colab import drive

drive.mount('/content/drive', force_remount=True)
repository = 'evaluating_factuality_word_definitions'

%cd /content/drive/My Drive/{repository}

Mounted at /content/drive
/content/drive/My Drive/evaluating_factuality_word_definitions


Next, we install the packages and import the modules needed in this notebook:

In [2]:
%%capture
!pip install datasets~=2.18.0
!pip install einops~=0.8.0
!pip install rank_bm25~=0.2.2
!pip install openai~=1.35.10
!pip install git+https://github.com/tatuylonen/wiktextract.git

!python -m spacy download en_core_web_lg
!python -m spacy download de_core_news_lg

In [1]:
from collections import defaultdict
from typing import Dict, List

import json
import random
import os
import numpy as np
import torch
from datasets import load_dataset
from tqdm import tqdm

from config import PROJECT_DIR
from general_utils.reader import JSONLineReader, JSONReader
from general_utils.utils import (calc_bin_stats, print_classification_report,
                                 print_fever_classification_report, rank_docs)
from pipeline_module.claim_splitter import (DisSimSplitter, FactscoreSplitter,
                                            T5SplitRephraseSplitter)
from pipeline_module.evidence_fetcher import WikipediaEvidenceFetcher
from pipeline_module.evidence_selector import ModelEvidenceSelector
from pipeline_module.pipeline import FeverPipeline, Pipeline
from pipeline_module.sentence_connector import (ColonSentenceConnector,
                                                PhiSentenceConnector)
from pipeline_module.statement_verifier import ModelStatementVerifier
from pipeline_module.translator import OpusMTTranslator

# 1 Setup: Define Models and Datasets
Now we set the seed and define our models and datasets we want to evaluate:

In [2]:
seed = 42
torch.manual_seed(seed)
torch.cuda.manual_seed_all(seed)
np.random.seed(seed)
random.seed(seed)

# to be deterministic
torch.use_deterministic_algorithms(True)
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False
os.environ["CUBLAS_WORKSPACE_CONFIG"] = ":4096:8"

In [3]:
# Base Models
base_selection_model = 'Snowflake/snowflake-arctic-embed-m-long'
base_verification_model = 'MoritzLaurer/mDeBERTa-v3-base-xnli-multilingual-nli-2mil7'

# Finetuned Models
finetuned_selection_model = 'lukasellinger/evidence-selection-model'
finetuned_verification_model = 'lukasellinger/claim-verification-model-top_last'

In [4]:
# Datasets with language information
datasets = {
    'german_dpr-claim_verification': {
        'dataset': load_dataset('lukasellinger/german_dpr-claim_verification', split='test'),
        'lang': 'de'
    },
    'german_wiktionary-claim_verification-mini': {
        'dataset': load_dataset('lukasellinger/german_wiktionary-claim_verification-mini', split='test'),
        'lang': 'de'
    },
    'squad-claim_verification': {
        'dataset': load_dataset('lukasellinger/squad-claim_verification', split='test'),
        'lang': 'en'
    },
    'shroom-claim_verification': {
        'dataset': load_dataset('lukasellinger/shroom-claim_verification', split='test'),
        'lang': 'en'
    }
    # outdated
    #'german-claim_verification': {
    #    'dataset': load_dataset('lukasellinger/german-claim_verification', split='test'),
    #    'lang': 'de'
    #},
}

optional_datasets = {
    'german_wiktionary-claim_verification-large': {
        'dataset': load_dataset('lukasellinger/german_wiktionary-claim_verification-large', split='test'),
        'lang': 'de'
    }
}

In [5]:
EVALUATION_DIR = PROJECT_DIR / 'data/evaluation'

## 1.1 Evaluation Util Functions
Next we define some helper functions

In [9]:
def evaluate_pipeline(pipeline: Pipeline, dataset, batch_size=4, output_file_name='',
                      only_intro=True, max_evidence_count: int = 3, top_k: int = 3, only_wikipedia: bool = False):
    outputs, report, not_in_wiki = pipeline.verify_test_dataset(dataset, batch_size,
                                                                output_file_name, only_intro, max_evidence_count, top_k, only_wikipedia)

    total_claim_count = sum(len(entry['atoms']) for entry in outputs if entry.get('atoms'))
    total_entries_with_atoms = sum(1 for entry in outputs if entry.get('atoms'))

    avg_claim_count = total_claim_count / total_entries_with_atoms if total_entries_with_atoms > 0 else 0

    print_classification_report(report, not_in_wiki, avg_claim_count)

    return outputs


def calc_claim_lengths_stats(outputs: List[Dict]):
    pr_labels, gt_labels, claim_lengths = [], [], []
    for output in outputs:
        pr_labels.append(1 if output['predicted'] == 'SUPPORTED' else 0)
        gt_labels.append(1 if output['label'] == 'SUPPORTED' else 0)
        claim_lengths.append(len(output['connected_claim'].split()))
    return calc_bin_stats(pr_labels, gt_labels, claim_lengths)


def calc_additional_stats(outputs: List[Dict], output_file_name=''):
    evid_line_number_dist = defaultdict(int)
    total_wikipedia = 0
    total_wiktionary = 0
    in_intro = 0

    for output in outputs:
        evidences = output.get('evidence', [])
        for evidence in evidences:
            if evidence.get('title').endswith('(wikipedia)'):
                if evidence.get('in_intro'):
                    in_intro += 1
                total_wikipedia += 1
            else:
                total_wiktionary += 1

            evid_line_number_dist[evidence.get('line_idx')] += 1
    total_evidences = total_wikipedia + total_wiktionary

    stats =  {
        'evid_line_number_dist': dict(sorted(evid_line_number_dist.items())),
        'claim_length_stats': calc_claim_lengths_stats(outputs),
        'avg_sent_0_selected': evid_line_number_dist.get(0) / sum(evid_line_number_dist.values()),
        'avg_in_intro': in_intro / total_wikipedia if total_wikipedia > 0 else 0,
        'in_intro': in_intro,
        'avg_wikipedia': total_wikipedia / total_evidences if total_evidences > 0 else 0,
        'total_wikipedia': total_wikipedia,
        'avg_wiktionary': total_wiktionary / total_evidences if total_evidences > 0 else 0,
        'total_wiktionary': total_wiktionary
    }
    if output_file_name:
        JSONReader().write(f'{output_file_name}.json', stats)

    return stats

## 1.2 Initialization of the pipeline modules
Here, we initialize the pipeline modules that will be used later.
These modules will be loaded onto your device when the first inference step is performed.
In this notebook, the translator and sentence connector are not directly utilized, but their results are already included in the datasets. We use their names to ensure the correct output is retrieved.

In [7]:
%%capture
# Translator
translator = OpusMTTranslator()

# Sentence Connectors
colon_sentence_connector = ColonSentenceConnector()
phi_sentence_connector = PhiSentenceConnector()

# Evidence Fetcher
offline_evid_fetcher = WikipediaEvidenceFetcher()
online_evid_fetcher = WikipediaEvidenceFetcher(offline=False)

In [8]:
pipeline_models = {
    'base': {
        'evid_selector': ModelEvidenceSelector(model_name=base_selection_model, min_similarity=0, evidence_selection='top'),
        'stm_verifier': ModelStatementVerifier(model_name=base_verification_model, premise_sent_order='keep')
    },
    'finetuned_wo_threshold': {
        'evid_selector': ModelEvidenceSelector(model_name=finetuned_selection_model, min_similarity=0, evidence_selection='top'),
        'stm_verifier': ModelStatementVerifier(model_name=finetuned_verification_model, premise_sent_order='top_last')
    },
    'finetuned_soft_threshold': { # best performing
        'evid_selector': ModelEvidenceSelector(model_name=finetuned_selection_model, min_similarity=0.5, evidence_selection='top'),
        'stm_verifier': ModelStatementVerifier(model_name=finetuned_verification_model, premise_sent_order='top_last')
    },
    'finetuned': {
        'evid_selector': ModelEvidenceSelector(model_name=finetuned_selection_model, min_similarity=0.585, evidence_selection='top'),
        'stm_verifier': ModelStatementVerifier(model_name=finetuned_verification_model, premise_sent_order='top_last')
    }
}

tokenizer_config.json:   0%|          | 0.00/1.36k [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/16.3M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/23.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/971 [00:00<?, ?B/s]

# 2 Evaluating Finetuned vs. Base Pipeline (Optimal Setup)

In this section, we assess the performance of the base and finetuned pipeline using the identified optimal setup.

#### Datasets and Models 
**Datasets:**
- [`lukasellinger/german_dpr-claim_verification`](https://huggingface.co/datasets/lukasellinger/german_dpr-claim_verification)
- [`lukasellinger/german_wiktionary-claim_verification-mini`](https://huggingface.co/datasets/lukasellinger/german_wiktionary-claim_verification-mini)
- [`lukasellinger/squad-claim_verification`](https://huggingface.co/datasets/lukasellinger/squad-claim_verification)
- [`lukasellinger/shroom-claim_verification`](https://huggingface.co/datasets/lukasellinger/shroom-claim_verification)
- [`lukasellinger/german_wiktionary-claim_verification-large`](https://huggingface.co/datasets/lukasellinger/german_wiktionary-claim_verification-large)

**Base Models:**
- Selection: [`Snowflake/snowflake-arctic-embed-m-long`](https://huggingface.co/Snowflake/snowflake-arctic-embed-m-long)
- Verification: [`MoritzLaurer/mDeBERTa-v3-base-xnli-multilingual-nli-2mil7`](https://huggingface.co/MoritzLaurer/mDeBERTa-v3-base-xnli-multilingual-nli-2mil7)

**Finetuned Models:**
- Selection: [`lukasellinger/evidence-selection-model`](https://huggingface.co/lukasellinger/evidence-selection-model)
- Verification: [`lukasellinger/claim-verification-model-top_last`](https://huggingface.co/lukasellinger/claim-verification-model-top_last)


#### Evaluation Strategy

- **Configuration:**
  - Using `OpusMTTranslator`
  - Using `PhiSentenceConnector`
  - No claim splitting
  - Offline Evidence Fetcher for reproducibility (state of 08.07.2024)

## 2.1 Evaluation of Datasets
First we test our different pipeline versions on our test datasets.

In [None]:
output_file_base_name = str(EVALUATION_DIR / "{dataset}/pipeline/{dataset}_{model}")
for dataset_name, config in datasets.items():
    dataset = config['dataset']
    lang = config['lang']
    for model_name, models in pipeline_models.items():
        print(f"Evaluating {dataset_name} with pipeline {model_name}...")
        pipeline = Pipeline(translator=translator,
                            sent_connector=phi_sentence_connector,
                            claim_splitter=None,
                            evid_fetcher=offline_evid_fetcher,
                            evid_selector=models.get('evid_selector'),
                            stm_verifier=models.get('stm_verifier'),
                            lang=lang)
        outputs = evaluate_pipeline(pipeline, dataset,
                                    output_file_name=output_file_base_name.format(
                                        dataset=dataset_name, model=model_name))
        additional_stats = calc_additional_stats(outputs, f'{output_file_base_name.format(dataset=dataset_name, model=model_name)}_additional_stats')

Evaluating german_dpr-claim_verification with pipeline base...


100%|██████████| 42/42 [01:13<00:00,  1.76s/it]


################################
Not in wikipedia: 29
Avg claim count: 1.0
              precision    recall  f1-score   support

           0     0.6915    0.9420    0.7975        69
           1     0.9111    0.5857    0.7130        70

    accuracy                         0.7626       139
   macro avg     0.8013    0.7639    0.7553       139
weighted avg     0.8021    0.7626    0.7550       139

################################
Evaluating german_dpr-claim_verification with pipeline finetuned_wo_threshold...


100%|██████████| 42/42 [01:13<00:00,  1.75s/it]


################################
Not in wikipedia: 29
Avg claim count: 1.0
              precision    recall  f1-score   support

           0     0.8451    0.8696    0.8571        69
           1     0.8676    0.8429    0.8551        70

    accuracy                         0.8561       139
   macro avg     0.8564    0.8562    0.8561       139
weighted avg     0.8564    0.8561    0.8561       139

################################
Evaluating german_dpr-claim_verification with pipeline finetuned_soft_threshold...


100%|██████████| 42/42 [01:05<00:00,  1.56s/it]


################################
Not in wikipedia: 29
Avg claim count: 1.0
              precision    recall  f1-score   support

           0     0.8356    0.8841    0.8592        69
           1     0.8788    0.8286    0.8529        70

    accuracy                         0.8561       139
   macro avg     0.8572    0.8563    0.8560       139
weighted avg     0.8574    0.8561    0.8560       139

################################
Evaluating german_dpr-claim_verification with pipeline finetuned...


100%|██████████| 42/42 [01:13<00:00,  1.76s/it]


################################
Not in wikipedia: 29
Avg claim count: 1.0
              precision    recall  f1-score   support

           0     0.8125    0.9420    0.8725        69
           1     0.9322    0.7857    0.8527        70

    accuracy                         0.8633       139
   macro avg     0.8724    0.8639    0.8626       139
weighted avg     0.8728    0.8633    0.8625       139

################################
Evaluating german_wiktionary-claim_verification-mini with pipeline base...


 12%|█▏        | 6/50 [00:10<01:19,  1.81s/it]

Imam/German/noun: DEBUG: unrecognized sense qualifier: Twelver Shiism at ['Imam']
imam/English/noun: DEBUG: unrecognized sense qualifier: Twelver Shi'ism at ['imam']


 28%|██▊       | 14/50 [00:17<00:28,  1.27it/s]

unterbinden/German/verb: DEBUG: unrecognized sense qualifier: transitive or dative reflexive, dated or regional at ['unterbinden']


 32%|███▏      | 16/50 [00:19<00:30,  1.10it/s]

Tüte/German/proper noun: DEBUG: unrecognized sense qualifier: Bielefeld, colloquial at ['Tüte']


 34%|███▍      | 17/50 [00:20<00:27,  1.19it/s]

request/English/verb: DEBUG: unrecognized sense qualifier: transitive or with a subjunctive clause at ['request']


 64%|██████▍   | 32/50 [00:41<00:24,  1.38s/it]

Verbesserung/German/noun: DEBUG: unrecognized sense qualifier: homework at ['Verbesserung']


 68%|██████▊   | 34/50 [00:47<00:38,  2.40s/it]

unterbinden/German/verb: DEBUG: unrecognized sense qualifier: transitive or dative reflexive, dated or regional at ['unterbinden']


 88%|████████▊ | 44/50 [01:03<00:09,  1.67s/it]

Imam/German/noun: DEBUG: unrecognized sense qualifier: Twelver Shiism at ['Imam']
imam/English/noun: DEBUG: unrecognized sense qualifier: Twelver Shi'ism at ['imam']


 98%|█████████▊| 49/50 [01:08<00:01,  1.17s/it]

specific/English/adjective: DEBUG: unrecognized sense qualifier: bioscience, taxonomy at ['specific']


100%|██████████| 50/50 [01:09<00:00,  1.39s/it]


################################
Not in wikipedia: 40
Avg claim count: 1.0
              precision    recall  f1-score   support

           0     0.6667    0.9630    0.7879        81
           1     0.9302    0.5063    0.6557        79

    accuracy                         0.7375       160
   macro avg     0.7984    0.7346    0.7218       160
weighted avg     0.7968    0.7375    0.7226       160

################################
Evaluating german_wiktionary-claim_verification-mini with pipeline finetuned_wo_threshold...


 12%|█▏        | 6/50 [00:06<00:49,  1.13s/it]

Imam/German/noun: DEBUG: unrecognized sense qualifier: Twelver Shiism at ['Imam']
imam/English/noun: DEBUG: unrecognized sense qualifier: Twelver Shi'ism at ['imam']


 28%|██▊       | 14/50 [00:16<00:40,  1.14s/it]

unterbinden/German/verb: DEBUG: unrecognized sense qualifier: transitive or dative reflexive, dated or regional at ['unterbinden']


 32%|███▏      | 16/50 [00:20<00:54,  1.59s/it]

Tüte/German/proper noun: DEBUG: unrecognized sense qualifier: Bielefeld, colloquial at ['Tüte']


 34%|███▍      | 17/50 [00:21<00:47,  1.43s/it]

request/English/verb: DEBUG: unrecognized sense qualifier: transitive or with a subjunctive clause at ['request']


 64%|██████▍   | 32/50 [00:38<00:25,  1.40s/it]

Verbesserung/German/noun: DEBUG: unrecognized sense qualifier: homework at ['Verbesserung']


 68%|██████▊   | 34/50 [00:48<00:53,  3.33s/it]

unterbinden/German/verb: DEBUG: unrecognized sense qualifier: transitive or dative reflexive, dated or regional at ['unterbinden']


 88%|████████▊ | 44/50 [01:00<00:06,  1.15s/it]

Imam/German/noun: DEBUG: unrecognized sense qualifier: Twelver Shiism at ['Imam']
imam/English/noun: DEBUG: unrecognized sense qualifier: Twelver Shi'ism at ['imam']


 98%|█████████▊| 49/50 [01:05<00:01,  1.05s/it]

specific/English/adjective: DEBUG: unrecognized sense qualifier: bioscience, taxonomy at ['specific']


100%|██████████| 50/50 [01:06<00:00,  1.33s/it]


################################
Not in wikipedia: 40
Avg claim count: 1.0
              precision    recall  f1-score   support

           0     0.7143    0.8642    0.7821        81
           1     0.8226    0.6456    0.7234        79

    accuracy                         0.7562       160
   macro avg     0.7684    0.7549    0.7528       160
weighted avg     0.7678    0.7562    0.7531       160

################################
Evaluating german_wiktionary-claim_verification-mini with pipeline finetuned_soft_threshold...


 12%|█▏        | 6/50 [00:10<01:16,  1.73s/it]

Imam/German/noun: DEBUG: unrecognized sense qualifier: Twelver Shiism at ['Imam']
imam/English/noun: DEBUG: unrecognized sense qualifier: Twelver Shi'ism at ['imam']


 28%|██▊       | 14/50 [00:18<00:29,  1.23it/s]

unterbinden/German/verb: DEBUG: unrecognized sense qualifier: transitive or dative reflexive, dated or regional at ['unterbinden']


 32%|███▏      | 16/50 [00:20<00:31,  1.07it/s]

Tüte/German/proper noun: DEBUG: unrecognized sense qualifier: Bielefeld, colloquial at ['Tüte']


 34%|███▍      | 17/50 [00:21<00:28,  1.17it/s]

request/English/verb: DEBUG: unrecognized sense qualifier: transitive or with a subjunctive clause at ['request']


 64%|██████▍   | 32/50 [00:41<00:30,  1.69s/it]

Verbesserung/German/noun: DEBUG: unrecognized sense qualifier: homework at ['Verbesserung']


 68%|██████▊   | 34/50 [00:47<00:41,  2.60s/it]

unterbinden/German/verb: DEBUG: unrecognized sense qualifier: transitive or dative reflexive, dated or regional at ['unterbinden']


 88%|████████▊ | 44/50 [01:00<00:09,  1.52s/it]

Imam/German/noun: DEBUG: unrecognized sense qualifier: Twelver Shiism at ['Imam']
imam/English/noun: DEBUG: unrecognized sense qualifier: Twelver Shi'ism at ['imam']


 98%|█████████▊| 49/50 [01:06<00:01,  1.27s/it]

specific/English/adjective: DEBUG: unrecognized sense qualifier: bioscience, taxonomy at ['specific']


100%|██████████| 50/50 [01:07<00:00,  1.36s/it]


################################
Not in wikipedia: 40
Avg claim count: 1.0
              precision    recall  f1-score   support

           0     0.7200    0.8889    0.7956        81
           1     0.8500    0.6456    0.7338        79

    accuracy                         0.7688       160
   macro avg     0.7850    0.7672    0.7647       160
weighted avg     0.7842    0.7688    0.7651       160

################################
Evaluating german_wiktionary-claim_verification-mini with pipeline finetuned...


 12%|█▏        | 6/50 [00:07<00:51,  1.18s/it]

Imam/German/noun: DEBUG: unrecognized sense qualifier: Twelver Shiism at ['Imam']
imam/English/noun: DEBUG: unrecognized sense qualifier: Twelver Shi'ism at ['imam']


 28%|██▊       | 14/50 [00:14<00:30,  1.20it/s]

unterbinden/German/verb: DEBUG: unrecognized sense qualifier: transitive or dative reflexive, dated or regional at ['unterbinden']


 32%|███▏      | 16/50 [00:17<00:41,  1.21s/it]

Tüte/German/proper noun: DEBUG: unrecognized sense qualifier: Bielefeld, colloquial at ['Tüte']


 34%|███▍      | 17/50 [00:18<00:38,  1.16s/it]

request/English/verb: DEBUG: unrecognized sense qualifier: transitive or with a subjunctive clause at ['request']


 64%|██████▍   | 32/50 [00:36<00:19,  1.07s/it]

Verbesserung/German/noun: DEBUG: unrecognized sense qualifier: homework at ['Verbesserung']


 68%|██████▊   | 34/50 [00:44<00:41,  2.62s/it]

unterbinden/German/verb: DEBUG: unrecognized sense qualifier: transitive or dative reflexive, dated or regional at ['unterbinden']


 88%|████████▊ | 44/50 [00:59<00:09,  1.51s/it]

Imam/German/noun: DEBUG: unrecognized sense qualifier: Twelver Shiism at ['Imam']
imam/English/noun: DEBUG: unrecognized sense qualifier: Twelver Shi'ism at ['imam']


 98%|█████████▊| 49/50 [01:03<00:00,  1.13it/s]

specific/English/adjective: DEBUG: unrecognized sense qualifier: bioscience, taxonomy at ['specific']


100%|██████████| 50/50 [01:03<00:00,  1.27s/it]


################################
Not in wikipedia: 40
Avg claim count: 1.0
              precision    recall  f1-score   support

           0     0.6727    0.9136    0.7749        81
           1     0.8600    0.5443    0.6667        79

    accuracy                         0.7312       160
   macro avg     0.7664    0.7289    0.7208       160
weighted avg     0.7652    0.7312    0.7214       160

################################
Evaluating squad-claim_verification with pipeline base...


100%|██████████| 40/40 [00:53<00:00,  1.35s/it]


################################
Not in wikipedia: 32
Avg claim count: 1.0
              precision    recall  f1-score   support

           0     0.7564    0.9365    0.8369        63
           1     0.9167    0.6984    0.7928        63

    accuracy                         0.8175       126
   macro avg     0.8365    0.8175    0.8148       126
weighted avg     0.8365    0.8175    0.8148       126

################################
Evaluating squad-claim_verification with pipeline finetuned_wo_threshold...


100%|██████████| 40/40 [00:43<00:00,  1.09s/it]


################################
Not in wikipedia: 32
Avg claim count: 1.0
              precision    recall  f1-score   support

           0     0.8182    0.8571    0.8372        63
           1     0.8500    0.8095    0.8293        63

    accuracy                         0.8333       126
   macro avg     0.8341    0.8333    0.8332       126
weighted avg     0.8341    0.8333    0.8332       126

################################
Evaluating squad-claim_verification with pipeline finetuned_soft_threshold...


100%|██████████| 40/40 [00:47<00:00,  1.18s/it]


################################
Not in wikipedia: 32
Avg claim count: 1.0
              precision    recall  f1-score   support

           0     0.8116    0.8889    0.8485        63
           1     0.8772    0.7937    0.8333        63

    accuracy                         0.8413       126
   macro avg     0.8444    0.8413    0.8409       126
weighted avg     0.8444    0.8413    0.8409       126

################################
Evaluating squad-claim_verification with pipeline finetuned...


100%|██████████| 40/40 [00:41<00:00,  1.04s/it]


################################
Not in wikipedia: 32
Avg claim count: 1.0
              precision    recall  f1-score   support

           0     0.7867    0.9365    0.8551        63
           1     0.9216    0.7460    0.8246        63

    accuracy                         0.8413       126
   macro avg     0.8541    0.8413    0.8398       126
weighted avg     0.8541    0.8413    0.8398       126

################################
Evaluating shroom-claim_verification with pipeline base...


  5%|▍         | 7/141 [00:06<01:46,  1.26it/s]

piss/English/noun: DEBUG: gloss may contain unhandled list items: 1999, Tin House #2 (→ISBN, Win McCormack, Rob Spillman, Elissa Schappell), page 170: at ['piss']


 58%|█████▊    | 82/141 [01:16<00:40,  1.47it/s]

demosophy/English/noun: DEBUG: unrecognized sense qualifier: folkloristics, sociology, rare at ['demosophy']


100%|██████████| 141/141 [02:12<00:00,  1.06it/s]


################################
Not in wikipedia: 21
Avg claim count: 1.0
              precision    recall  f1-score   support

           0     0.7426    0.6331    0.6835       278
           1     0.6656    0.7689    0.7135       264

    accuracy                         0.6993       542
   macro avg     0.7041    0.7010    0.6985       542
weighted avg     0.7051    0.6993    0.6981       542

################################
Evaluating shroom-claim_verification with pipeline finetuned_wo_threshold...


  5%|▍         | 7/141 [00:05<01:39,  1.35it/s]

piss/English/noun: DEBUG: gloss may contain unhandled list items: 1999, Tin House #2 (→ISBN, Win McCormack, Rob Spillman, Elissa Schappell), page 170: at ['piss']


 58%|█████▊    | 82/141 [01:25<00:53,  1.09it/s]

demosophy/English/noun: DEBUG: unrecognized sense qualifier: folkloristics, sociology, rare at ['demosophy']


100%|██████████| 141/141 [02:22<00:00,  1.01s/it]


################################
Not in wikipedia: 21
Avg claim count: 1.0
              precision    recall  f1-score   support

           0     0.7614    0.4820    0.5903       278
           1     0.6066    0.8409    0.7048       264

    accuracy                         0.6568       542
   macro avg     0.6840    0.6615    0.6475       542
weighted avg     0.6860    0.6568    0.6461       542

################################
Evaluating shroom-claim_verification with pipeline finetuned_soft_threshold...


  5%|▍         | 7/141 [00:03<01:06,  2.02it/s]

piss/English/noun: DEBUG: gloss may contain unhandled list items: 1999, Tin House #2 (→ISBN, Win McCormack, Rob Spillman, Elissa Schappell), page 170: at ['piss']


 58%|█████▊    | 82/141 [01:13<01:02,  1.07s/it]

demosophy/English/noun: DEBUG: unrecognized sense qualifier: folkloristics, sociology, rare at ['demosophy']


100%|██████████| 141/141 [02:08<00:00,  1.10it/s]


################################
Not in wikipedia: 21
Avg claim count: 1.0
              precision    recall  f1-score   support

           0     0.7062    0.5360    0.6094       278
           1     0.6103    0.7652    0.6790       264

    accuracy                         0.6476       542
   macro avg     0.6582    0.6506    0.6442       542
weighted avg     0.6595    0.6476    0.6433       542

################################
Evaluating shroom-claim_verification with pipeline finetuned...


  5%|▍         | 7/141 [00:06<01:42,  1.31it/s]

piss/English/noun: DEBUG: gloss may contain unhandled list items: 1999, Tin House #2 (→ISBN, Win McCormack, Rob Spillman, Elissa Schappell), page 170: at ['piss']


 58%|█████▊    | 82/141 [01:10<00:39,  1.48it/s]

demosophy/English/noun: DEBUG: unrecognized sense qualifier: folkloristics, sociology, rare at ['demosophy']


100%|██████████| 141/141 [02:13<00:00,  1.06it/s]

################################
Not in wikipedia: 21
Avg claim count: 1.0
              precision    recall  f1-score   support

           0     0.6468    0.6259    0.6362       278
           1     0.6190    0.6402    0.6294       264

    accuracy                         0.6328       542
   macro avg     0.6329    0.6330    0.6328       542
weighted avg     0.6333    0.6328    0.6329       542

################################





## 2.2 Evaluation using only Wikipedia
Now we test our finetuned pipeline when using only Wikipedia Evidences (excluding Wiktionary).

In [14]:
output_file_base_name = str(EVALUATION_DIR / "{dataset}/pipeline/{dataset}_finetuned_soft_threshold_only_wikipedia")

for dataset_name, config in datasets.items():
    dataset = config['dataset']
    lang = config['lang']
    print(f"Evaluating {dataset_name}...")
    pipeline = Pipeline(translator=translator,
                        sent_connector=phi_sentence_connector,
                        claim_splitter=None,
                        evid_fetcher=offline_evid_fetcher,
                        evid_selector=pipeline_models['finetuned_soft_threshold']['evid_selector'],
                        stm_verifier=pipeline_models['finetuned_soft_threshold']['stm_verifier'],
                        lang=lang)
    outputs = evaluate_pipeline(pipeline, dataset, only_wikipedia=True,
                                output_file_name=output_file_base_name.format(dataset=dataset_name))

Evaluating german_dpr-claim_verification...


100%|██████████| 42/42 [08:20<00:00, 11.91s/it]


################################
Not in wikipedia: 32
Avg claim count: 1.0
              precision    recall  f1-score   support

           0     0.8310    0.8806    0.8551        67
           1     0.8769    0.8261    0.8507        69

    accuracy                         0.8529       136
   macro avg     0.8540    0.8533    0.8529       136
weighted avg     0.8543    0.8529    0.8529       136

################################
Evaluating german_wiktionary-claim_verification-mini...


 12%|█▏        | 6/50 [00:20<02:36,  3.55s/it]

Imam/German/noun: DEBUG: unrecognized sense qualifier: Twelver Shiism at ['Imam']
imam/English/noun: DEBUG: unrecognized sense qualifier: Twelver Shi'ism at ['imam']


 28%|██▊       | 14/50 [00:43<01:22,  2.28s/it]

unterbinden/German/verb: DEBUG: unrecognized sense qualifier: transitive or dative reflexive, dated or regional at ['unterbinden']


 32%|███▏      | 16/50 [00:49<01:39,  2.91s/it]

Tüte/German/proper noun: DEBUG: unrecognized sense qualifier: Bielefeld, colloquial at ['Tüte']


 34%|███▍      | 17/50 [00:52<01:28,  2.67s/it]

request/English/verb: DEBUG: unrecognized sense qualifier: transitive or with a subjunctive clause at ['request']


 64%|██████▍   | 32/50 [01:33<01:00,  3.34s/it]

Verbesserung/German/noun: DEBUG: unrecognized sense qualifier: homework at ['Verbesserung']


 68%|██████▊   | 34/50 [01:43<01:13,  4.61s/it]

unterbinden/German/verb: DEBUG: unrecognized sense qualifier: transitive or dative reflexive, dated or regional at ['unterbinden']


 88%|████████▊ | 44/50 [02:13<00:18,  3.06s/it]

Imam/German/noun: DEBUG: unrecognized sense qualifier: Twelver Shiism at ['Imam']
imam/English/noun: DEBUG: unrecognized sense qualifier: Twelver Shi'ism at ['imam']


 98%|█████████▊| 49/50 [02:25<00:02,  2.43s/it]

specific/English/adjective: DEBUG: unrecognized sense qualifier: bioscience, taxonomy at ['specific']


100%|██████████| 50/50 [02:27<00:00,  2.94s/it]


################################
Not in wikipedia: 62
Avg claim count: 1.0
              precision    recall  f1-score   support

           0     0.6957    0.9143    0.7901        70
           1     0.8696    0.5882    0.7018        68

    accuracy                         0.7536       138
   macro avg     0.7826    0.7513    0.7459       138
weighted avg     0.7813    0.7536    0.7466       138

################################
Evaluating squad-claim_verification...


100%|██████████| 40/40 [02:13<00:00,  3.35s/it]


################################
Not in wikipedia: 36
Avg claim count: 1.0
              precision    recall  f1-score   support

           0     0.8485    0.9180    0.8819        61
           1     0.9107    0.8361    0.8718        61

    accuracy                         0.8770       122
   macro avg     0.8796    0.8770    0.8768       122
weighted avg     0.8796    0.8770    0.8768       122

################################
Evaluating shroom-claim_verification...


  5%|▍         | 7/141 [00:05<01:14,  1.80it/s]

piss/English/noun: DEBUG: gloss may contain unhandled list items: 1999, Tin House #2 (→ISBN, Win McCormack, Rob Spillman, Elissa Schappell), page 170: at ['piss']


 58%|█████▊    | 82/141 [01:26<01:00,  1.03s/it]

demosophy/English/noun: DEBUG: unrecognized sense qualifier: folkloristics, sociology, rare at ['demosophy']


100%|██████████| 141/141 [02:40<00:00,  1.14s/it]


################################
Not in wikipedia: 416
Avg claim count: 1.0
              precision    recall  f1-score   support

           0     0.6842    0.6420    0.6624        81
           1     0.5915    0.6364    0.6131        66

    accuracy                         0.6395       147
   macro avg     0.6379    0.6392    0.6378       147
weighted avg     0.6426    0.6395    0.6403       147

################################


## 2.3 Evaluation using ColonSentenceConnector
In our best performance we used PhiSentenceConnector. 
However this models takes up over 90% of our total parameters. 
Therefore, we test the perfomance when adding Word and Definition with: `{word}: {definition}`

In [None]:
output_file_base_name = str(EVALUATION_DIR / "{dataset}/pipeline/{dataset}_{model}_colon_connector")
for dataset_name, config in datasets.items():
    dataset = config['dataset']
    lang = config['lang']
    for model_name, models in pipeline_models.items():
        print(f"Evaluating {dataset_name} with pipeline {model_name}...")
        pipeline = Pipeline(translator=translator,
                            sent_connector=colon_sentence_connector,
                            claim_splitter=None,
                            evid_fetcher=offline_evid_fetcher,
                            evid_selector=models.get('evid_selector'),
                            stm_verifier=models.get('stm_verifier'),
                            lang=lang)
        outputs = evaluate_pipeline(pipeline, dataset,
                                    output_file_name=output_file_base_name.format(
                                        dataset=dataset_name, model=model_name))
        additional_stats = calc_additional_stats(outputs, f'{output_file_base_name.format(dataset=dataset_name, model=model_name)}_additional_stats')

## 2.4 Evaluation of Wiktionary-Large Dataset (optional)
Next we test our finetuned pipeline on the Wiktionary-Large Dataset containing 10k entries.

In [None]:
output_file_base_name = str(EVALUATION_DIR / "{dataset}/pipeline/{dataset}_finetuned_soft_threshold")
dataset_name = 'german_wiktionary-claim_verification-large'
dataset = optional_datasets[dataset_name]['dataset']
lang = optional_datasets[dataset_name]['lang']
print(f"Evaluating {dataset_name}...")
pipeline = Pipeline(translator=translator,
                    sent_connector=phi_sentence_connector,
                    claim_splitter=None,
                    evid_fetcher=offline_evid_fetcher,
                    evid_selector=pipeline_models['finetuned_soft_threshold']['evid_selector'],
                    stm_verifier=pipeline_models['finetuned_soft_threshold']['stm_verifier'],
                    lang=lang)
outputs = evaluate_pipeline(pipeline, dataset,
                            output_file_name=output_file_base_name.format(dataset=dataset_name))

Evaluating german_wiktionary-claim_verification-large...


  0%|          | 6/2500 [00:06<46:00,  1.11s/it]

Imam/German/noun: DEBUG: unrecognized sense qualifier: Twelver Shiism at ['Imam']
imam/English/noun: DEBUG: unrecognized sense qualifier: Twelver Shi'ism at ['imam']


  1%|          | 14/2500 [00:15<42:09,  1.02s/it]

unterbinden/German/verb: DEBUG: unrecognized sense qualifier: transitive or dative reflexive, dated or regional at ['unterbinden']


  1%|          | 16/2500 [00:17<42:04,  1.02s/it]

Tüte/German/proper noun: DEBUG: unrecognized sense qualifier: Bielefeld, colloquial at ['Tüte']


  1%|          | 17/2500 [00:18<37:26,  1.11it/s]

request/English/verb: DEBUG: unrecognized sense qualifier: transitive or with a subjunctive clause at ['request']


  2%|▏         | 51/2500 [01:08<1:06:22,  1.63s/it]

cross/English/verb: DEBUG: unrecognized sense qualifier: cricket, reciprocally at ['cross']


  2%|▏         | 52/2500 [01:10<1:11:53,  1.76s/it]

not/English/adverb: DEBUG: unrecognized sense qualifier: litotes at ['not']


  5%|▌         | 125/2500 [03:09<47:36,  1.20s/it]

Errors parsing wiktionary page for magic word: recursively_extract: unhandled kind NodeKind.MAGIC_WORD. Skipping page.


  5%|▌         | 127/2500 [03:11<52:03,  1.32s/it]

Imam/German/noun: DEBUG: unrecognized sense qualifier: Twelver Shiism at ['Imam']
imam/English/noun: DEBUG: unrecognized sense qualifier: Twelver Shi'ism at ['imam']


  7%|▋         | 166/2500 [04:04<57:10,  1.47s/it]

port/English/nouns: DEBUG: usually in singular in section nouns at ['port']
port/English/noun: DEBUG: unrecognized sense qualifier: computing, BSD at ['port']
port/English/noun: DEBUG: unrecognized sense qualifier: Queensland at ['port']


  7%|▋         | 184/2500 [04:33<1:16:28,  1.98s/it]

split/English/noun: DEBUG: unrecognized sense qualifier: athletics, speedrunning at ['split']


  9%|▊         | 214/2500 [05:16<52:13,  1.37s/it]  

split/English/noun: DEBUG: unrecognized sense qualifier: athletics, speedrunning at ['split']


 10%|█         | 261/2500 [06:19<52:57,  1.42s/it]

record/English/noun: DEBUG: gloss may contain unhandled list items: A data structure similar to a struct, in some programming languages such as C# and Java based on classes and designed for storing immutable data. at ['record']


 11%|█▏        | 284/2500 [06:58<52:18,  1.42s/it]

aufgeben/German/verb: DEBUG: unrecognized sense qualifier: transitive, homework at ['aufgeben']


 11%|█▏        | 285/2500 [07:00<58:48,  1.59s/it]

pothole/English/noun: DEBUG: unrecognized sense qualifier: fandom slang, TV Tropes at ['pothole']
pothole/English/noun: DEBUG: unrecognized sense qualifier: fandom slang, TV Tropes at ['pothole']


 12%|█▏        | 288/2500 [07:04<48:37,  1.32s/it]

Errors parsing wiktionary page for magic word: recursively_extract: unhandled kind NodeKind.MAGIC_WORD. Skipping page.


 12%|█▏        | 292/2500 [07:08<37:50,  1.03s/it]

drive/English/noun: DEBUG: unrecognized sense qualifier: philanthropy at ['drive']


 12%|█▏        | 311/2500 [07:32<41:26,  1.14s/it]

chop/English/verb: DEBUG: unrecognized sense qualifier: computing, transitive, Perl at ['chop']


 13%|█▎        | 318/2500 [07:40<31:34,  1.15it/s]

crank/English/noun: DEBUG: unrecognized sense qualifier: archaic, baseball, slang, 1800s at ['crank']


 13%|█▎        | 323/2500 [07:45<39:55,  1.10s/it]

Verbesserung/German/noun: DEBUG: unrecognized sense qualifier: homework at ['Verbesserung']


 14%|█▎        | 338/2500 [08:06<42:29,  1.18s/it]

bayer: DEBUG: HTML tag <i> not properly closed at ['bayer'] parsing bayer/French/Etymology/Pronunciation/Verb/Conjugation
started on line 38, detected on line 50


 15%|█▍        | 372/2500 [09:07<40:42,  1.15s/it]

Abriss/German/noun: DEBUG: unrecognized sense qualifier: wikis at ['Abriss']


 18%|█▊        | 449/2500 [11:01<1:10:57,  2.08s/it]

local/English/adjective: DEBUG: unrecognized head form: 1 at ['local']
local/English/adjective: DEBUG: unrecognized head form: 2 at ['local']
local/English/adjective: DEBUG: unrecognized sense qualifier: 3 at ['local']
local/English/adjective: DEBUG: unrecognized sense qualifier: 3 at ['local']
local/English/adjective: DEBUG: unrecognized head form: 1 at ['local']
local/English/adjective: DEBUG: unrecognized head form: 2 at ['local']
local/English/adjective: DEBUG: unrecognized sense qualifier: 3 at ['local']
local/English/adjective: DEBUG: unrecognized sense qualifier: 3 at ['local']


 19%|█▉        | 473/2500 [11:37<39:13,  1.16s/it]

march/English/noun: DEBUG: unrecognized sense qualifier: euchre at ['march']
split/English/noun: DEBUG: unrecognized sense qualifier: athletics, speedrunning at ['split']


 20%|█▉        | 498/2500 [12:05<50:05,  1.50s/it]

green/English/nouns: DEBUG: usually in singular in section nouns at ['green']


 22%|██▏       | 559/2500 [13:50<37:32,  1.16s/it]

past/English/adjective: DEBUG: unrecognized sense qualifier: postmodifier at ['past']


 23%|██▎       | 566/2500 [13:58<34:52,  1.08s/it]

wet/English/verb: DEBUG: unrecognized sense qualifier: US, MLE, MTE, slang at ['wet']


 23%|██▎       | 570/2500 [14:02<33:32,  1.04s/it]

dishwasher/English/noun: DEBUG: unrecognized sense qualifier: UK, dialect, Wiltshire at ['dishwasher']


 25%|██▌       | 625/2500 [15:23<1:00:50,  1.95s/it]

football/English/noun: DEBUG: unrecognized sense qualifier: Australia, New South Wales, Queensland, uncountable at ['football']


 25%|██▌       | 633/2500 [15:38<51:22,  1.65s/it]

roll/English/noun: DEBUG: unrecognized sense qualifier: US, paddlesport at ['roll']
roll/English/noun: DEBUG: unrecognized sense qualifier: paddlesport at ['roll']


 25%|██▌       | 636/2500 [15:40<29:14,  1.06it/s]

fool/English/noun: DEBUG: unrecognized sense qualifier: slang, chiefly African-American Vernacular, Hispanic at ['fool']


 26%|██▌       | 653/2500 [16:00<30:44,  1.00it/s]

order/English/noun: DEBUG: unrecognized sense qualifier: i at ['order']
order/English/noun: DEBUG: unrecognized sense qualifier: ii at ['order']


 26%|██▋       | 658/2500 [16:09<44:08,  1.44s/it]

ranken/German/verb: DEBUG: unrecognized sense qualifier: intransitive or reflexive, sich ranken at ['ranken']


 27%|██▋       | 683/2500 [16:40<40:35,  1.34s/it]

port/English/nouns: DEBUG: usually in singular in section nouns at ['port']
port/English/noun: DEBUG: unrecognized sense qualifier: computing, BSD at ['port']
port/English/noun: DEBUG: unrecognized sense qualifier: Queensland at ['port']


 27%|██▋       | 685/2500 [16:43<44:07,  1.46s/it]

Gspusi/German/noun: DEBUG: unrecognized sense qualifier: semi-secret at ['Gspusi']


 28%|██▊       | 692/2500 [16:50<30:56,  1.03s/it]

run/English/noun: DEBUG: unrecognized sense qualifier: video games, speedrunning at ['run']
run/English/noun: DEBUG: unrecognized sense qualifier: video games, speedrunning at ['run']


 28%|██▊       | 695/2500 [17:01<1:13:10,  2.43s/it]

one/English/pronoun: DEBUG: unrecognized linkage prefix: (Singlish and Manglish particles): ah, hor, know, lah, leh, liao, lor, mah, meh, sia, what desc=Singlish and Manglish particles rest=ah, hor, know, lah, leh, liao, lor, mah, meh, sia, what cls=romanization cls2=romanization e1=False e2=False at ['one']


 28%|██▊       | 698/2500 [17:08<1:01:11,  2.04s/it]

pound/English/noun: DEBUG: gloss may contain unhandled list items: The symbol # (octothorpe, hash, number sign) at ['pound']


 28%|██▊       | 705/2500 [17:18<38:55,  1.30s/it]

Gspusi/German/noun: DEBUG: unrecognized sense qualifier: semi-secret at ['Gspusi']


 29%|██▉       | 735/2500 [18:04<47:33,  1.62s/it]

beef/English/verb: DEBUG: unrecognized sense qualifier: chiefly African-American Vernacular, MLE, MTE, intransitive, slang at ['beef']


 30%|██▉       | 745/2500 [18:20<45:14,  1.55s/it]

Tüte/German/proper noun: DEBUG: unrecognized sense qualifier: Bielefeld, colloquial at ['Tüte']


 30%|██▉       | 749/2500 [18:27<50:44,  1.74s/it]

wacker/German/adjective: DEBUG: unrecognized sense qualifier: dated, literary, except in sich wacker schlagen at ['wacker']


 31%|███       | 772/2500 [18:58<35:05,  1.22s/it]

beta/English/noun: DEBUG: unrecognized sense qualifier: slang, manosphere, masculism at ['beta']


 32%|███▏      | 805/2500 [19:42<40:12,  1.42s/it]

Tüte/German/proper noun: DEBUG: unrecognized sense qualifier: Bielefeld, colloquial at ['Tüte']


 33%|███▎      | 828/2500 [20:10<41:39,  1.49s/it]

Ringkampf/German/noun: DEBUG: unrecognized sense qualifier: olympic at ['Ringkampf']


 34%|███▍      | 845/2500 [20:33<49:02,  1.78s/it]

record/English/noun: DEBUG: gloss may contain unhandled list items: A data structure similar to a struct, in some programming languages such as C# and Java based on classes and designed for storing immutable data. at ['record']


 35%|███▍      | 869/2500 [21:12<49:01,  1.80s/it]

churl/English/noun: DEBUG: unrecognized sense qualifier: Theodism at ['churl']


 35%|███▌      | 876/2500 [21:17<20:25,  1.32it/s]

Ringkampf/German/noun: DEBUG: unrecognized sense qualifier: olympic at ['Ringkampf']


 35%|███▌      | 885/2500 [21:30<35:44,  1.33s/it]

top/English/noun: DEBUG: unrecognized sense qualifier: slang, vulgar, African-American Vernacular, MLE, MTE at ['top']


 37%|███▋      | 919/2500 [22:13<1:03:47,  2.42s/it]

waste/English/adjective: DEBUG: unrecognized sense qualifier: MTE, slang, derogatory at ['waste']


 38%|███▊      | 939/2500 [22:36<17:31,  1.48it/s]

wacker/German/adjective: DEBUG: unrecognized sense qualifier: dated, literary, except in sich wacker schlagen at ['wacker']


 38%|███▊      | 943/2500 [22:41<32:55,  1.27s/it]

leprosy/English/noun: DEBUG: suspicious alt_of/form_of with '. ': murine leprosy. and feline leprosy at ['leprosy']
leprosy/English/noun: DEBUG: suspicious alt_of/form_of with '. ': murine leprosy. and feline leprosy at ['leprosy']


 38%|███▊      | 953/2500 [22:59<37:32,  1.46s/it]

green/English/nouns: DEBUG: usually in singular in section nouns at ['green']


 39%|███▊      | 965/2500 [23:27<43:19,  1.69s/it]

bear/English/verb: DEBUG: gloss may contain unhandled list items: 1732–4, Alexander Pope, An Essay on Man, Longmans, Green & Co, 1879, bear%20him%20company%20pope&hl=de&pg=PA10#v=onepage&q&f=false p. 10: at ['bear']


 40%|███▉      | 990/2500 [24:03<38:29,  1.53s/it]

Neugier/German/noun: DEBUG: unrecognized sense qualifier: inquisitiveness; tendency to ask questions, investigate, or explore at ['Neugier']


 41%|████      | 1021/2500 [24:43<34:39,  1.41s/it]

non-smoking: DEBUG: no corresponding start tag found for </ref> at ['non-smoking'] parsing non-smoking/English/Etymology/Adjective/Further reading


 41%|████      | 1031/2500 [24:57<33:49,  1.38s/it]

Tüte/German/proper noun: DEBUG: unrecognized sense qualifier: Bielefeld, colloquial at ['Tüte']


 41%|████▏     | 1035/2500 [25:01<30:07,  1.23s/it]

bear/English/verb: DEBUG: gloss may contain unhandled list items: 1732–4, Alexander Pope, An Essay on Man, Longmans, Green & Co, 1879, bear%20him%20company%20pope&hl=de&pg=PA10#v=onepage&q&f=false p. 10: at ['bear']


 42%|████▏     | 1049/2500 [25:13<17:37,  1.37it/s]

aufgeben/German/verb: DEBUG: unrecognized sense qualifier: transitive, homework at ['aufgeben']


 42%|████▏     | 1062/2500 [25:35<43:47,  1.83s/it]

100%/English/noun: DEBUG: unrecognized sense qualifier: speedrunning at ['100%']


 43%|████▎     | 1073/2500 [25:49<30:38,  1.29s/it]

mod/English/adjective: DEBUG: unrecognized sense qualifier: MTE, slang at ['mod']


 43%|████▎     | 1087/2500 [26:14<37:10,  1.58s/it]

Abriss/German/noun: DEBUG: unrecognized sense qualifier: wikis at ['Abriss']


 44%|████▍     | 1106/2500 [26:38<32:23,  1.39s/it]

run/English/noun: DEBUG: unrecognized sense qualifier: video games, speedrunning at ['run']
run/English/noun: DEBUG: unrecognized sense qualifier: video games, speedrunning at ['run']


 45%|████▌     | 1130/2500 [27:21<46:18,  2.03s/it]

coal/English/noun: DEBUG: unrecognized sense qualifier: Internet slang, 4chan at ['coal']
coal/English/noun: DEBUG: unrecognized sense qualifier: military slang, World War I– World War II at ['coal']
coal/English/adjective: DEBUG: unrecognized sense qualifier: Internet slang, soyjak.party slang, 4chan at ['coal']
coal/English/verb: DEBUG: unrecognized sense qualifier: Internet slang, 4chan at ['coal']


 46%|████▌     | 1144/2500 [27:41<28:35,  1.27s/it]

churl/English/noun: DEBUG: unrecognized sense qualifier: Theodism at ['churl']


 50%|████▉     | 1249/2500 [30:05<39:30,  1.90s/it]

one/English/pronoun: DEBUG: unrecognized linkage prefix: (Singlish and Manglish particles): ah, hor, know, lah, leh, liao, lor, mah, meh, sia, what desc=Singlish and Manglish particles rest=ah, hor, know, lah, leh, liao, lor, mah, meh, sia, what cls=romanization cls2=romanization e1=False e2=False at ['one']


 50%|█████     | 1257/2500 [30:18<36:29,  1.76s/it]

Verbesserung/German/noun: DEBUG: unrecognized sense qualifier: homework at ['Verbesserung']


 51%|█████     | 1264/2500 [30:27<25:18,  1.23s/it]

allow/English/verb: DEBUG: unrecognized sense qualifier: transitive, MTE, MLE at ['allow']


 51%|█████     | 1273/2500 [30:39<19:46,  1.03it/s]

churl/English/noun: DEBUG: unrecognized sense qualifier: Theodism at ['churl']


 52%|█████▏    | 1301/2500 [31:15<44:37,  2.23s/it]

Verbesserung/German/noun: DEBUG: unrecognized sense qualifier: homework at ['Verbesserung']


 52%|█████▏    | 1311/2500 [31:23<19:11,  1.03it/s]

beef/English/verb: DEBUG: unrecognized sense qualifier: chiefly African-American Vernacular, MLE, MTE, intransitive, slang at ['beef']


 53%|█████▎    | 1328/2500 [31:53<37:24,  1.91s/it]

leprosy/English/noun: DEBUG: suspicious alt_of/form_of with '. ': murine leprosy. and feline leprosy at ['leprosy']
leprosy/English/noun: DEBUG: suspicious alt_of/form_of with '. ': murine leprosy. and feline leprosy at ['leprosy']


 54%|█████▍    | 1350/2500 [32:19<27:35,  1.44s/it]

dishwasher/English/noun: DEBUG: unrecognized sense qualifier: UK, dialect, Wiltshire at ['dishwasher']


 55%|█████▍    | 1363/2500 [32:35<26:27,  1.40s/it]

glatt/English/adjective: DEBUG: unrecognized sense qualifier: Yinglish, of an animal, Judaism at ['glatt']
glatt/English/adjective: DEBUG: unrecognized sense qualifier: Yinglish, by extension, of food, Judaism at ['glatt']
glatt/English/adjective: DEBUG: unrecognized sense qualifier: Yinglish, of an animal, Judaism at ['glatt']
glatt/English/adjective: DEBUG: unrecognized sense qualifier: Yinglish, by extension, of food, Judaism at ['glatt']


 55%|█████▍    | 1371/2500 [32:44<17:38,  1.07it/s]

glatt/English/adjective: DEBUG: unrecognized sense qualifier: Yinglish, of an animal, Judaism at ['glatt']
glatt/English/adjective: DEBUG: unrecognized sense qualifier: Yinglish, by extension, of food, Judaism at ['glatt']
glatt/English/adjective: DEBUG: unrecognized sense qualifier: Yinglish, of an animal, Judaism at ['glatt']
glatt/English/adjective: DEBUG: unrecognized sense qualifier: Yinglish, by extension, of food, Judaism at ['glatt']


 55%|█████▍    | 1373/2500 [32:50<38:48,  2.07s/it]

lift/English/verb: DEBUG: suspicious related form tags ['canonical']: 'lift c. 1490' in 'lift (third-person singular simple present lifts, present participle lifting, simple past lifted or (rare, regional, obsolete) lift, past participle lifted or (rare, regional, obsolete) lift or (obsolete) yleft)c. 1490, Of Penance and Confession be master Jhon Yrlandː Liftand (lifting) thy hands and thy eyen to Heaven.(transitive) to cause to move upwards.To try to raise something; to exert the strength for raising or bearing.' at ['lift']


 58%|█████▊    | 1453/2500 [34:47<35:49,  2.05s/it]

Verbesserung/German/noun: DEBUG: unrecognized sense qualifier: homework at ['Verbesserung']


 59%|█████▉    | 1481/2500 [35:31<16:29,  1.03it/s]

port/English/nouns: DEBUG: usually in singular in section nouns at ['port']
port/English/noun: DEBUG: unrecognized sense qualifier: computing, BSD at ['port']
port/English/noun: DEBUG: unrecognized sense qualifier: Queensland at ['port']


 60%|█████▉    | 1495/2500 [35:55<30:36,  1.83s/it]

split/English/noun: DEBUG: unrecognized sense qualifier: athletics, speedrunning at ['split']


 60%|██████    | 1509/2500 [36:16<31:52,  1.93s/it]

beef/English/verb: DEBUG: unrecognized sense qualifier: chiefly African-American Vernacular, MLE, MTE, intransitive, slang at ['beef']


 60%|██████    | 1512/2500 [36:20<24:52,  1.51s/it]

line/English/noun: DEBUG: unrecognized sense qualifier: baseball, slang, 1800s, with "the" at ['line']


 61%|██████    | 1527/2500 [36:52<32:49,  2.02s/it]

dishwasher/English/noun: DEBUG: unrecognized sense qualifier: UK, dialect, Wiltshire at ['dishwasher']


 61%|██████    | 1528/2500 [36:53<27:04,  1.67s/it]

peaceful/English/noun: DEBUG: unrecognized sense qualifier: India, Islam, Islamophobic, Internet slang, offensive at ['peaceful']


 62%|██████▏   | 1551/2500 [37:25<17:33,  1.11s/it]

up/English/adverb: DEBUG: unrecognized sense qualifier: US, bartending at ['up']
up/English/adjective: DEBUG: unrecognized sense qualifier: US, bartending at ['up']
up/English/adverb: DEBUG: unrecognized sense qualifier: US, bartending at ['up']
up/English/adjective: DEBUG: unrecognized sense qualifier: US, bartending at ['up']


 62%|██████▏   | 1556/2500 [37:32<17:23,  1.11s/it]

past/English/adjective: DEBUG: unrecognized sense qualifier: postmodifier at ['past']


 63%|██████▎   | 1568/2500 [37:45<13:24,  1.16it/s]

churl/English/noun: DEBUG: unrecognized sense qualifier: Theodism at ['churl']


 63%|██████▎   | 1580/2500 [37:59<13:46,  1.11it/s]

west/English/adjective: DEBUG: unrecognized sense qualifier: ecclesiastial at ['west']


 64%|██████▍   | 1599/2500 [38:22<15:20,  1.02s/it]

Verbesserung/German/noun: DEBUG: unrecognized sense qualifier: homework at ['Verbesserung']


 64%|██████▍   | 1600/2500 [38:24<19:32,  1.30s/it]

Verbesserung/German/noun: DEBUG: unrecognized sense qualifier: homework at ['Verbesserung']


 65%|██████▍   | 1620/2500 [38:54<27:26,  1.87s/it]

micro/English/noun: DEBUG: unrecognized sense qualifier: fetishism at ['micro']


 65%|██████▌   | 1632/2500 [39:11<19:09,  1.32s/it]

football/English/noun: DEBUG: unrecognized sense qualifier: Australia, New South Wales, Queensland, uncountable at ['football']


 66%|██████▌   | 1652/2500 [39:35<21:50,  1.55s/it]

up/English/adverb: DEBUG: unrecognized sense qualifier: US, bartending at ['up']
up/English/adjective: DEBUG: unrecognized sense qualifier: US, bartending at ['up']
up/English/adverb: DEBUG: unrecognized sense qualifier: US, bartending at ['up']
up/English/adjective: DEBUG: unrecognized sense qualifier: US, bartending at ['up']


 67%|██████▋   | 1667/2500 [40:01<25:51,  1.86s/it]

Verbesserung/German/noun: DEBUG: unrecognized sense qualifier: homework at ['Verbesserung']


 67%|██████▋   | 1684/2500 [40:27<21:10,  1.56s/it]

past/English/adjective: DEBUG: unrecognized sense qualifier: postmodifier at ['past']


 68%|██████▊   | 1688/2500 [40:32<15:27,  1.14s/it]

not/English/adverb: DEBUG: unrecognized sense qualifier: litotes at ['not']


 68%|██████▊   | 1700/2500 [40:53<24:14,  1.82s/it]

Abriss/German/noun: DEBUG: unrecognized sense qualifier: wikis at ['Abriss']


 69%|██████▉   | 1736/2500 [41:44<14:19,  1.13s/it]

green/English/nouns: DEBUG: usually in singular in section nouns at ['green']


 70%|███████   | 1754/2500 [42:09<15:23,  1.24s/it]

split/English/noun: DEBUG: unrecognized sense qualifier: athletics, speedrunning at ['split']


 71%|███████   | 1765/2500 [42:22<14:08,  1.15s/it]

up/English/adverb: DEBUG: unrecognized sense qualifier: US, bartending at ['up']
up/English/adjective: DEBUG: unrecognized sense qualifier: US, bartending at ['up']
up/English/adverb: DEBUG: unrecognized sense qualifier: US, bartending at ['up']
up/English/adjective: DEBUG: unrecognized sense qualifier: US, bartending at ['up']


 71%|███████   | 1770/2500 [42:32<15:54,  1.31s/it]

Tüte/German/proper noun: DEBUG: unrecognized sense qualifier: Bielefeld, colloquial at ['Tüte']


 72%|███████▏  | 1788/2500 [42:54<12:04,  1.02s/it]

Tüte/German/proper noun: DEBUG: unrecognized sense qualifier: Bielefeld, colloquial at ['Tüte']


 72%|███████▏  | 1806/2500 [43:15<13:40,  1.18s/it]

drive/English/noun: DEBUG: unrecognized sense qualifier: philanthropy at ['drive']


 72%|███████▏  | 1810/2500 [43:23<19:50,  1.72s/it]

Imam/German/noun: DEBUG: unrecognized sense qualifier: Twelver Shiism at ['Imam']
imam/English/noun: DEBUG: unrecognized sense qualifier: Twelver Shi'ism at ['imam']


 72%|███████▏  | 1812/2500 [43:26<16:46,  1.46s/it]

insert/English/noun: DEBUG: unrecognized sense qualifier: childcare, informal at ['insert']


 73%|███████▎  | 1820/2500 [43:39<17:10,  1.52s/it]

pothole/English/noun: DEBUG: unrecognized sense qualifier: fandom slang, TV Tropes at ['pothole']
pothole/English/noun: DEBUG: unrecognized sense qualifier: fandom slang, TV Tropes at ['pothole']


 73%|███████▎  | 1824/2500 [43:42<08:57,  1.26it/s]

Verbesserung/German/noun: DEBUG: unrecognized sense qualifier: homework at ['Verbesserung']


 73%|███████▎  | 1828/2500 [43:46<11:48,  1.05s/it]

heavy/English/adjective: DEBUG: unrecognized sense qualifier: dated, late 1960s, 1970s, US at ['heavy']


 73%|███████▎  | 1829/2500 [43:49<19:27,  1.74s/it]

wet/English/verb: DEBUG: unrecognized sense qualifier: US, MLE, MTE, slang at ['wet']


 74%|███████▎  | 1838/2500 [44:04<15:26,  1.40s/it]

dishwasher/English/noun: DEBUG: unrecognized sense qualifier: UK, dialect, Wiltshire at ['dishwasher']


 75%|███████▌  | 1883/2500 [45:07<09:14,  1.11it/s]

churl/English/noun: DEBUG: unrecognized sense qualifier: Theodism at ['churl']


 75%|███████▌  | 1887/2500 [45:11<09:26,  1.08it/s]

line/English/noun: DEBUG: unrecognized sense qualifier: baseball, slang, 1800s, with "the" at ['line']


 76%|███████▌  | 1889/2500 [45:14<12:34,  1.23s/it]

run/English/noun: DEBUG: unrecognized sense qualifier: video games, speedrunning at ['run']
run/English/noun: DEBUG: unrecognized sense qualifier: video games, speedrunning at ['run']


 76%|███████▌  | 1894/2500 [45:27<20:26,  2.02s/it]

line/English/noun: DEBUG: unrecognized sense qualifier: baseball, slang, 1800s, with "the" at ['line']


 77%|███████▋  | 1916/2500 [45:57<12:26,  1.28s/it]

pound/English/noun: DEBUG: gloss may contain unhandled list items: The symbol # (octothorpe, hash, number sign) at ['pound']


 77%|███████▋  | 1931/2500 [46:16<11:48,  1.25s/it]

local/English/adjective: DEBUG: unrecognized head form: 1 at ['local']
local/English/adjective: DEBUG: unrecognized head form: 2 at ['local']
local/English/adjective: DEBUG: unrecognized sense qualifier: 3 at ['local']
local/English/adjective: DEBUG: unrecognized sense qualifier: 3 at ['local']
local/English/adjective: DEBUG: unrecognized head form: 1 at ['local']
local/English/adjective: DEBUG: unrecognized head form: 2 at ['local']
local/English/adjective: DEBUG: unrecognized sense qualifier: 3 at ['local']
local/English/adjective: DEBUG: unrecognized sense qualifier: 3 at ['local']


 78%|███████▊  | 1939/2500 [46:25<11:57,  1.28s/it]

Abriss/German/noun: DEBUG: unrecognized sense qualifier: wikis at ['Abriss']


 78%|███████▊  | 1940/2500 [46:28<16:44,  1.79s/it]

line/English/noun: DEBUG: unrecognized sense qualifier: baseball, slang, 1800s, with "the" at ['line']


 78%|███████▊  | 1950/2500 [46:41<13:42,  1.50s/it]

Imam/German/noun: DEBUG: unrecognized sense qualifier: Twelver Shiism at ['Imam']
imam/English/noun: DEBUG: unrecognized sense qualifier: Twelver Shi'ism at ['imam']


 79%|███████▉  | 1975/2500 [47:21<19:14,  2.20s/it]

leprosy/English/noun: DEBUG: suspicious alt_of/form_of with '. ': murine leprosy. and feline leprosy at ['leprosy']
leprosy/English/noun: DEBUG: suspicious alt_of/form_of with '. ': murine leprosy. and feline leprosy at ['leprosy']


 79%|███████▉  | 1977/2500 [47:23<14:04,  1.61s/it]

insert/English/noun: DEBUG: unrecognized sense qualifier: childcare, informal at ['insert']


 81%|████████  | 2024/2500 [48:43<11:03,  1.39s/it]

wacker/German/adjective: DEBUG: unrecognized sense qualifier: dated, literary, except in sich wacker schlagen at ['wacker']


 82%|████████▏ | 2039/2500 [48:58<08:39,  1.13s/it]

lösen/German/verb: DEBUG: unrecognized sense qualifier: etwas at ['lösen']


 82%|████████▏ | 2048/2500 [49:09<08:19,  1.11s/it]

heavy/English/adjective: DEBUG: unrecognized sense qualifier: dated, late 1960s, 1970s, US at ['heavy']


 82%|████████▏ | 2050/2500 [49:14<13:07,  1.75s/it]

wacker/German/adjective: DEBUG: unrecognized sense qualifier: dated, literary, except in sich wacker schlagen at ['wacker']


 84%|████████▎ | 2092/2500 [50:11<14:07,  2.08s/it]

mod/English/adjective: DEBUG: unrecognized sense qualifier: MTE, slang at ['mod']


 84%|████████▍ | 2110/2500 [50:47<07:53,  1.21s/it]

Errors parsing wiktionary page for magic word: recursively_extract: unhandled kind NodeKind.MAGIC_WORD. Skipping page.


 85%|████████▌ | 2135/2500 [51:31<09:04,  1.49s/it]

dishwasher/English/noun: DEBUG: unrecognized sense qualifier: UK, dialect, Wiltshire at ['dishwasher']


 86%|████████▌ | 2140/2500 [51:40<08:06,  1.35s/it]

Abriss/German/noun: DEBUG: unrecognized sense qualifier: wikis at ['Abriss']


 86%|████████▌ | 2144/2500 [51:43<05:22,  1.10it/s]

waste/English/adjective: DEBUG: unrecognized sense qualifier: MTE, slang, derogatory at ['waste']


 86%|████████▋ | 2160/2500 [52:10<11:41,  2.06s/it]

100%/English/noun: DEBUG: unrecognized sense qualifier: speedrunning at ['100%']


 87%|████████▋ | 2168/2500 [52:19<06:16,  1.13s/it]

furniture/English/noun: DEBUG: unrecognized sense qualifier: bookselling at ['furniture']


 89%|████████▊ | 2217/2500 [53:20<07:15,  1.54s/it]

ranken/German/verb: DEBUG: unrecognized sense qualifier: intransitive or reflexive, sich ranken at ['ranken']


 89%|████████▉ | 2219/2500 [53:23<06:13,  1.33s/it]

ranken/German/verb: DEBUG: unrecognized sense qualifier: intransitive or reflexive, sich ranken at ['ranken']


 89%|████████▉ | 2228/2500 [53:37<08:30,  1.88s/it]

coal/English/noun: DEBUG: unrecognized sense qualifier: Internet slang, 4chan at ['coal']
coal/English/noun: DEBUG: unrecognized sense qualifier: military slang, World War I– World War II at ['coal']
coal/English/adjective: DEBUG: unrecognized sense qualifier: Internet slang, soyjak.party slang, 4chan at ['coal']
coal/English/verb: DEBUG: unrecognized sense qualifier: Internet slang, 4chan at ['coal']


 91%|█████████ | 2281/2500 [54:48<05:26,  1.49s/it]

Imam/German/noun: DEBUG: unrecognized sense qualifier: Twelver Shiism at ['Imam']
imam/English/noun: DEBUG: unrecognized sense qualifier: Twelver Shi'ism at ['imam']


 92%|█████████▏| 2299/2500 [55:11<03:29,  1.04s/it]

dishwasher/English/noun: DEBUG: unrecognized sense qualifier: UK, dialect, Wiltshire at ['dishwasher']


 92%|█████████▏| 2302/2500 [55:15<04:22,  1.33s/it]

beef/English/verb: DEBUG: unrecognized sense qualifier: chiefly African-American Vernacular, MLE, MTE, intransitive, slang at ['beef']


 93%|█████████▎| 2315/2500 [55:34<06:20,  2.06s/it]

Verbesserung/German/noun: DEBUG: unrecognized sense qualifier: homework at ['Verbesserung']


 94%|█████████▍| 2348/2500 [56:08<03:18,  1.31s/it]

insert/English/noun: DEBUG: unrecognized sense qualifier: childcare, informal at ['insert']


 94%|█████████▍| 2351/2500 [56:13<03:39,  1.47s/it]

beta/English/noun: DEBUG: unrecognized sense qualifier: slang, manosphere, masculism at ['beta']


 95%|█████████▍| 2368/2500 [56:35<03:17,  1.49s/it]

pothole/English/noun: DEBUG: unrecognized sense qualifier: fandom slang, TV Tropes at ['pothole']
pothole/English/noun: DEBUG: unrecognized sense qualifier: fandom slang, TV Tropes at ['pothole']


 95%|█████████▌| 2376/2500 [56:47<02:26,  1.18s/it]

score/English/verb: DEBUG: unrecognized sense qualifier: transitiveb at ['score']


 95%|█████████▌| 2382/2500 [57:03<04:35,  2.33s/it]

what/English/determiner: DEBUG: unrecognized sense qualifier: exclamative at ['what']


 96%|█████████▌| 2401/2500 [57:28<02:04,  1.26s/it]

100%/English/noun: DEBUG: unrecognized sense qualifier: speedrunning at ['100%']


 96%|█████████▋| 2407/2500 [57:37<02:08,  1.38s/it]

Verbesserung/German/noun: DEBUG: unrecognized sense qualifier: homework at ['Verbesserung']


 98%|█████████▊| 2439/2500 [58:24<01:27,  1.43s/it]

ranken/German/verb: DEBUG: unrecognized sense qualifier: intransitive or reflexive, sich ranken at ['ranken']


 98%|█████████▊| 2449/2500 [58:34<00:47,  1.08it/s]

Errors parsing wiktionary page for magic word: recursively_extract: unhandled kind NodeKind.MAGIC_WORD. Skipping page.


 98%|█████████▊| 2457/2500 [58:41<00:34,  1.26it/s]

implode/English/verb: DEBUG: unrecognized sense qualifier: computing, programming, PHP at ['implode']


 99%|█████████▉| 2479/2500 [59:11<00:34,  1.63s/it]

100%/English/noun: DEBUG: unrecognized sense qualifier: speedrunning at ['100%']


 99%|█████████▉| 2487/2500 [59:21<00:12,  1.03it/s]

split/English/noun: DEBUG: unrecognized sense qualifier: athletics, speedrunning at ['split']


100%|██████████| 2500/2500 [59:41<00:00,  1.43s/it]


################################
Not in wikipedia: 2170
Avg claim count: 1.0
              precision    recall  f1-score   support

           0     0.6854    0.8720    0.7675      3938
           1     0.8213    0.5951    0.6901      3892

    accuracy                         0.7344      7830
   macro avg     0.7534    0.7335    0.7288      7830
weighted avg     0.7530    0.7344    0.7291      7830

################################


# 3 Evaluating Finetuned Pipeline Online (Best Setup)
In the previous section, we evaluated our finetuned pipeline using the offline evidence fetcher. We can also connect our pipeline to the Wikipedia API to retrieve the most current knowledge available.

Let's check on that:

In [None]:
output_file_base_name = str(EVALUATION_DIR / "{dataset}/pipeline/{dataset}_finetuned_soft_threshold_online")

for dataset_name, config in datasets.items():
    dataset = config['dataset']
    lang = config['lang']
    print(f"Evaluating {dataset_name}...")
    pipeline = Pipeline(translator=translator,
                        sent_connector=phi_sentence_connector,
                        claim_splitter=None,
                        evid_fetcher=online_evid_fetcher,
                        evid_selector=pipeline_models['finetuned_soft_threshold']['evid_selector'],
                        stm_verifier=pipeline_models['finetuned_soft_threshold']['stm_verifier'],
                        lang=lang)
    evaluate_pipeline(pipeline, dataset,
                      output_file_name=output_file_base_name.format(dataset=dataset_name))

Evaluating german_dpr-claim_verification...


100%|██████████| 42/42 [02:34<00:00,  3.67s/it]


################################
Not in wikipedia: 29
Avg claim count: 1.0
              precision    recall  f1-score   support

           0     0.8219    0.8696    0.8451        69
           1     0.8636    0.8143    0.8382        70

    accuracy                         0.8417       139
   macro avg     0.8428    0.8419    0.8417       139
weighted avg     0.8429    0.8417    0.8416       139

################################
Evaluating german_wiktionary-claim_verification-mini...


 12%|█▏        | 6/50 [00:28<03:31,  4.80s/it]

Imam/German/noun: DEBUG: unrecognized sense qualifier: Twelver Shiism at ['Imam']
imam/English/noun: DEBUG: unrecognized sense qualifier: Twelver Shi'ism at ['imam']
imam/English/noun: DEBUG: unrecognized sense qualifier: Twelver Shi'ism at ['imam']


 28%|██▊       | 14/50 [01:01<01:58,  3.30s/it]

unterbinden/German/verb: DEBUG: unrecognized sense qualifier: transitive or dative reflexive, dated or regional at ['unterbinden']


 32%|███▏      | 16/50 [01:10<02:11,  3.86s/it]

Tüte/German/proper noun: DEBUG: unrecognized sense qualifier: Bielefeld, colloquial at ['Tüte']


 34%|███▍      | 17/50 [01:12<01:52,  3.42s/it]

request/English/verb: DEBUG: unrecognized sense qualifier: transitive or with a subjunctive clause at ['request']


 64%|██████▍   | 32/50 [02:04<00:56,  3.11s/it]

Verbesserung/German/noun: DEBUG: unrecognized sense qualifier: homework at ['Verbesserung']


 68%|██████▊   | 34/50 [02:18<01:28,  5.53s/it]

unterbinden/German/verb: DEBUG: unrecognized sense qualifier: transitive or dative reflexive, dated or regional at ['unterbinden']


 88%|████████▊ | 44/50 [02:44<00:16,  2.72s/it]

Imam/German/noun: DEBUG: unrecognized sense qualifier: Twelver Shiism at ['Imam']
imam/English/noun: DEBUG: unrecognized sense qualifier: Twelver Shi'ism at ['imam']
imam/English/noun: DEBUG: unrecognized sense qualifier: Twelver Shi'ism at ['imam']


 98%|█████████▊| 49/50 [02:55<00:02,  2.44s/it]

specific/English/adjective: DEBUG: unrecognized sense qualifier: bioscience, taxonomy at ['specific']


100%|██████████| 50/50 [02:58<00:00,  3.56s/it]


################################
Not in wikipedia: 40
Avg claim count: 1.0
              precision    recall  f1-score   support

           0     0.7200    0.8889    0.7956        81
           1     0.8500    0.6456    0.7338        79

    accuracy                         0.7688       160
   macro avg     0.7850    0.7672    0.7647       160
weighted avg     0.7842    0.7688    0.7651       160

################################
Evaluating squad-claim_verification...


100%|██████████| 40/40 [01:47<00:00,  2.69s/it]


################################
Not in wikipedia: 32
Avg claim count: 1.0
              precision    recall  f1-score   support

           0     0.7971    0.8730    0.8333        63
           1     0.8596    0.7778    0.8167        63

    accuracy                         0.8254       126
   macro avg     0.8284    0.8254    0.8250       126
weighted avg     0.8284    0.8254    0.8250       126

################################
Evaluating shroom-claim_verification...


  5%|▍         | 7/141 [00:24<06:43,  3.01s/it]

piss/English/noun: DEBUG: gloss may contain unhandled list items: 1999, Tin House #2 (→ISBN, Win McCormack, Rob Spillman, Elissa Schappell), page 170: at ['piss']


 58%|█████▊    | 82/141 [05:15<04:13,  4.30s/it]

demosophy/English/noun: DEBUG: unrecognized sense qualifier: folkloristics, sociology, rare at ['demosophy']


100%|██████████| 141/141 [08:55<00:00,  3.80s/it]


################################
Not in wikipedia: 21
Avg claim count: 1.0
              precision    recall  f1-score   support

           0     0.7042    0.5396    0.6110       278
           1     0.6109    0.7614    0.6779       264

    accuracy                         0.6476       542
   macro avg     0.6576    0.6505    0.6445       542
weighted avg     0.6588    0.6476    0.6436       542

################################


# 4 Evaluation of Different Claim Splitters

A sentence can be split into multiple facts, where the combination of these facts represents the entire sentence.

For this evaluation, we test four different splitters:

- **`DisSimSplitter`**: Based on [DiscourseSimplification](https://github.com/Lambda-3/DiscourseSimplification)
- **`T5SplitRephraseSplitter`**: Based on [T5 Split and Rephrase](https://huggingface.co/unikei/t5-base-split-and-rephrase)
- **`FactscoreSplitter`**: Based on [FActScore](https://github.com/shmsw25/FActScore)
- **`None`**: No splitting

In [None]:
%%capture
claim_splitters = {
    'DisSimSplitter': DisSimSplitter(),
    'T5SplitRephraseSplitter': T5SplitRephraseSplitter(),
    'FactscoreSplitter': FactscoreSplitter()
}

You are using the default legacy behaviour of the <class 'transformers.models.t5.tokenization_t5.T5Tokenizer'>. This is expected, and simply means that the `legacy` (previous) behavior will be used so nothing changes for you. If you want to use the new behaviour, set `legacy=False`. This should only be set if you understand what it means, and thoroughly read the reason why this was added as explained in https://github.com/huggingface/transformers/pull/24565


In this setting, the splits are not calculated. Instead, they are reused as they have already been precomputed and are present in the datasets.

In [None]:
output_file_base_name = str(EVALUATION_DIR / "{dataset}/pipeline/{dataset}_finetuned_soft_threshold_{splitter}")

for dataset_name, config in datasets.items():
    dataset = config['dataset']
    lang = config['lang']
    for name, splitter in claim_splitters.items():
        print(f"Evaluating {dataset_name} with claim splitter {name}...")
        pipeline = Pipeline(translator=translator,
                            sent_connector=phi_sentence_connector,
                            claim_splitter=splitter,
                            evid_fetcher=offline_evid_fetcher,
                            evid_selector=pipeline_models['finetuned_soft_threshold']['evid_selector'],
                            stm_verifier=pipeline_models['finetuned_soft_threshold']['stm_verifier'],
                            lang=lang)
        evaluate_pipeline(pipeline, dataset,
                          output_file_name=output_file_base_name.format(dataset=dataset_name,
                                                                        splitter=name))

Evaluating german_dpr-claim_verification with claim splitter DisSimSplitter...


100%|██████████| 42/42 [00:50<00:00,  1.21s/it]


################################
Not in wikipedia: 29
Avg claim count: 1.870503597122302
              precision    recall  f1-score   support

           0     0.7283    0.9710    0.8323        69
           1     0.9574    0.6429    0.7692        70

    accuracy                         0.8058       139
   macro avg     0.8429    0.8069    0.8008       139
weighted avg     0.8437    0.8058    0.8005       139

################################
Evaluating german_dpr-claim_verification with claim splitter T5SplitRephraseSplitter...


100%|██████████| 42/42 [00:49<00:00,  1.17s/it]


################################
Not in wikipedia: 29
Avg claim count: 1.474820143884892
              precision    recall  f1-score   support

           0     0.7158    0.9855    0.8293        69
           1     0.9773    0.6143    0.7544        70

    accuracy                         0.7986       139
   macro avg     0.8465    0.7999    0.7918       139
weighted avg     0.8475    0.7986    0.7916       139

################################
Evaluating german_dpr-claim_verification with claim splitter FactscoreSplitter...


100%|██████████| 42/42 [00:51<00:00,  1.22s/it]


################################
Not in wikipedia: 29
Avg claim count: 3.5611510791366907
              precision    recall  f1-score   support

           0     0.6296    0.9855    0.7684        69
           1     0.9677    0.4286    0.5941        70

    accuracy                         0.7050       139
   macro avg     0.7987    0.7070    0.6812       139
weighted avg     0.7999    0.7050    0.6806       139

################################
Evaluating german_wiktionary-claim_verification-mini with claim splitter DisSimSplitter...


 12%|█▏        | 6/50 [00:07<00:50,  1.15s/it]

Imam/German/noun: DEBUG: unrecognized sense qualifier: Twelver Shiism at ['Imam']
imam/English/noun: DEBUG: unrecognized sense qualifier: Twelver Shi'ism at ['imam']


 28%|██▊       | 14/50 [00:13<00:26,  1.37it/s]

unterbinden/German/verb: DEBUG: unrecognized sense qualifier: transitive or dative reflexive, dated or regional at ['unterbinden']


 32%|███▏      | 16/50 [00:16<00:38,  1.13s/it]

Tüte/German/proper noun: DEBUG: unrecognized sense qualifier: Bielefeld, colloquial at ['Tüte']


 34%|███▍      | 17/50 [00:17<00:35,  1.09s/it]

request/English/verb: DEBUG: unrecognized sense qualifier: transitive or with a subjunctive clause at ['request']


 64%|██████▍   | 32/50 [00:32<00:22,  1.22s/it]

Verbesserung/German/noun: DEBUG: unrecognized sense qualifier: homework at ['Verbesserung']


 68%|██████▊   | 34/50 [00:38<00:38,  2.40s/it]

unterbinden/German/verb: DEBUG: unrecognized sense qualifier: transitive or dative reflexive, dated or regional at ['unterbinden']


 88%|████████▊ | 44/50 [00:49<00:07,  1.26s/it]

Imam/German/noun: DEBUG: unrecognized sense qualifier: Twelver Shiism at ['Imam']
imam/English/noun: DEBUG: unrecognized sense qualifier: Twelver Shi'ism at ['imam']


 98%|█████████▊| 49/50 [00:53<00:00,  1.18it/s]

specific/English/adjective: DEBUG: unrecognized sense qualifier: bioscience, taxonomy at ['specific']


100%|██████████| 50/50 [00:54<00:00,  1.08s/it]


################################
Not in wikipedia: 40
Avg claim count: 1.7
              precision    recall  f1-score   support

           0     0.6583    0.9753    0.7861        81
           1     0.9500    0.4810    0.6387        79

    accuracy                         0.7312       160
   macro avg     0.8042    0.7282    0.7124       160
weighted avg     0.8023    0.7312    0.7133       160

################################
Evaluating german_wiktionary-claim_verification-mini with claim splitter T5SplitRephraseSplitter...


 12%|█▏        | 6/50 [00:06<00:46,  1.06s/it]

Imam/German/noun: DEBUG: unrecognized sense qualifier: Twelver Shiism at ['Imam']
imam/English/noun: DEBUG: unrecognized sense qualifier: Twelver Shi'ism at ['imam']


 28%|██▊       | 14/50 [00:14<00:30,  1.19it/s]

unterbinden/German/verb: DEBUG: unrecognized sense qualifier: transitive or dative reflexive, dated or regional at ['unterbinden']


 32%|███▏      | 16/50 [00:16<00:31,  1.07it/s]

Tüte/German/proper noun: DEBUG: unrecognized sense qualifier: Bielefeld, colloquial at ['Tüte']


 34%|███▍      | 17/50 [00:17<00:27,  1.18it/s]

request/English/verb: DEBUG: unrecognized sense qualifier: transitive or with a subjunctive clause at ['request']


 64%|██████▍   | 32/50 [00:32<00:18,  1.04s/it]

Verbesserung/German/noun: DEBUG: unrecognized sense qualifier: homework at ['Verbesserung']


 68%|██████▊   | 34/50 [00:37<00:32,  2.05s/it]

unterbinden/German/verb: DEBUG: unrecognized sense qualifier: transitive or dative reflexive, dated or regional at ['unterbinden']


 88%|████████▊ | 44/50 [00:48<00:05,  1.01it/s]

Imam/German/noun: DEBUG: unrecognized sense qualifier: Twelver Shiism at ['Imam']
imam/English/noun: DEBUG: unrecognized sense qualifier: Twelver Shi'ism at ['imam']


 98%|█████████▊| 49/50 [00:52<00:00,  1.31it/s]

specific/English/adjective: DEBUG: unrecognized sense qualifier: bioscience, taxonomy at ['specific']


100%|██████████| 50/50 [00:52<00:00,  1.06s/it]


################################
Not in wikipedia: 40
Avg claim count: 1.61875
              precision    recall  f1-score   support

           0     0.6393    0.9630    0.7685        81
           1     0.9211    0.4430    0.5983        79

    accuracy                         0.7063       160
   macro avg     0.7802    0.7030    0.6834       160
weighted avg     0.7784    0.7063    0.6844       160

################################
Evaluating german_wiktionary-claim_verification-mini with claim splitter FactscoreSplitter...


 12%|█▏        | 6/50 [00:08<01:00,  1.36s/it]

Imam/German/noun: DEBUG: unrecognized sense qualifier: Twelver Shiism at ['Imam']
imam/English/noun: DEBUG: unrecognized sense qualifier: Twelver Shi'ism at ['imam']


 28%|██▊       | 14/50 [00:14<00:26,  1.37it/s]

unterbinden/German/verb: DEBUG: unrecognized sense qualifier: transitive or dative reflexive, dated or regional at ['unterbinden']


 32%|███▏      | 16/50 [00:16<00:29,  1.16it/s]

Tüte/German/proper noun: DEBUG: unrecognized sense qualifier: Bielefeld, colloquial at ['Tüte']


 34%|███▍      | 17/50 [00:17<00:29,  1.13it/s]

request/English/verb: DEBUG: unrecognized sense qualifier: transitive or with a subjunctive clause at ['request']


 64%|██████▍   | 32/50 [00:32<00:17,  1.04it/s]

Verbesserung/German/noun: DEBUG: unrecognized sense qualifier: homework at ['Verbesserung']


 68%|██████▊   | 34/50 [00:39<00:42,  2.66s/it]

unterbinden/German/verb: DEBUG: unrecognized sense qualifier: transitive or dative reflexive, dated or regional at ['unterbinden']


 88%|████████▊ | 44/50 [00:49<00:06,  1.03s/it]

Imam/German/noun: DEBUG: unrecognized sense qualifier: Twelver Shiism at ['Imam']
imam/English/noun: DEBUG: unrecognized sense qualifier: Twelver Shi'ism at ['imam']


 98%|█████████▊| 49/50 [00:54<00:01,  1.03s/it]

specific/English/adjective: DEBUG: unrecognized sense qualifier: bioscience, taxonomy at ['specific']


100%|██████████| 50/50 [00:55<00:00,  1.10s/it]


################################
Not in wikipedia: 40
Avg claim count: 3.7625
              precision    recall  f1-score   support

           0     0.6045    1.0000    0.7535        81
           1     1.0000    0.3291    0.4952        79

    accuracy                         0.6687       160
   macro avg     0.8022    0.6646    0.6244       160
weighted avg     0.7998    0.6687    0.6260       160

################################
Evaluating squad-claim_verification with claim splitter DisSimSplitter...


100%|██████████| 40/40 [00:34<00:00,  1.15it/s]


################################
Not in wikipedia: 32
Avg claim count: 1.119047619047619
              precision    recall  f1-score   support

           0     0.7126    0.9841    0.8267        63
           1     0.9744    0.6032    0.7451        63

    accuracy                         0.7937       126
   macro avg     0.8435    0.7937    0.7859       126
weighted avg     0.8435    0.7937    0.7859       126

################################
Evaluating squad-claim_verification with claim splitter T5SplitRephraseSplitter...


100%|██████████| 40/40 [00:35<00:00,  1.14it/s]


################################
Not in wikipedia: 32
Avg claim count: 1.0634920634920635
              precision    recall  f1-score   support

           0     0.7294    0.9841    0.8378        63
           1     0.9756    0.6349    0.7692        63

    accuracy                         0.8095       126
   macro avg     0.8525    0.8095    0.8035       126
weighted avg     0.8525    0.8095    0.8035       126

################################
Evaluating squad-claim_verification with claim splitter FactscoreSplitter...


100%|██████████| 40/40 [00:35<00:00,  1.14it/s]


################################
Not in wikipedia: 32
Avg claim count: 2.388888888888889
              precision    recall  f1-score   support

           0     0.7159    1.0000    0.8344        63
           1     1.0000    0.6032    0.7525        63

    accuracy                         0.8016       126
   macro avg     0.8580    0.8016    0.7935       126
weighted avg     0.8580    0.8016    0.7935       126

################################
Evaluating shroom-claim_verification with claim splitter DisSimSplitter...


  5%|▍         | 7/141 [00:05<01:15,  1.78it/s]

piss/English/noun: DEBUG: gloss may contain unhandled list items: 1999, Tin House #2 (→ISBN, Win McCormack, Rob Spillman, Elissa Schappell), page 170: at ['piss']


 58%|█████▊    | 82/141 [01:00<00:36,  1.62it/s]

demosophy/English/noun: DEBUG: unrecognized sense qualifier: folkloristics, sociology, rare at ['demosophy']


100%|██████████| 141/141 [01:45<00:00,  1.34it/s]


################################
Not in wikipedia: 21
Avg claim count: 1.2730627306273063
              precision    recall  f1-score   support

           0     0.6980    0.6151    0.6539       278
           1     0.6397    0.7197    0.6774       264

    accuracy                         0.6661       542
   macro avg     0.6688    0.6674    0.6656       542
weighted avg     0.6696    0.6661    0.6653       542

################################
Evaluating shroom-claim_verification with claim splitter T5SplitRephraseSplitter...


  5%|▍         | 7/141 [00:04<01:21,  1.65it/s]

piss/English/noun: DEBUG: gloss may contain unhandled list items: 1999, Tin House #2 (→ISBN, Win McCormack, Rob Spillman, Elissa Schappell), page 170: at ['piss']


 58%|█████▊    | 82/141 [01:02<00:36,  1.60it/s]

demosophy/English/noun: DEBUG: unrecognized sense qualifier: folkloristics, sociology, rare at ['demosophy']


100%|██████████| 141/141 [01:48<00:00,  1.29it/s]


################################
Not in wikipedia: 21
Avg claim count: 1.2177121771217712
              precision    recall  f1-score   support

           0     0.7108    0.6367    0.6717       278
           1     0.6553    0.7273    0.6894       264

    accuracy                         0.6808       542
   macro avg     0.6831    0.6820    0.6806       542
weighted avg     0.6838    0.6808    0.6803       542

################################
Evaluating shroom-claim_verification with claim splitter FactscoreSplitter...


  5%|▍         | 7/141 [00:03<01:01,  2.18it/s]

piss/English/noun: DEBUG: gloss may contain unhandled list items: 1999, Tin House #2 (→ISBN, Win McCormack, Rob Spillman, Elissa Schappell), page 170: at ['piss']


 58%|█████▊    | 82/141 [01:01<00:46,  1.27it/s]

demosophy/English/noun: DEBUG: unrecognized sense qualifier: folkloristics, sociology, rare at ['demosophy']


100%|██████████| 141/141 [01:46<00:00,  1.33it/s]

################################
Not in wikipedia: 21
Avg claim count: 2.726937269372694
              precision    recall  f1-score   support

           0     0.6667    0.7554    0.7083       278
           1     0.7004    0.6023    0.6477       264

    accuracy                         0.6808       542
   macro avg     0.6836    0.6788    0.6780       542
weighted avg     0.6831    0.6808    0.6787       542

################################





# 5 Whole Wiki Page Run (Best Setup)

In our previous setting, we limited the text fetched to the intro sections of Wikipedia pages. However, we could extend this to use the entire Wikipedia page, which would provide more information but would significantly increase the processing time.

Let us try it out:


In [None]:
output_file_base_name = str(EVALUATION_DIR / "{dataset}/pipeline/{dataset}_finetuned_soft_threshold_whole_page")

for dataset_name, config in datasets.items():
    print(f"Evaluating {dataset_name}...")
    dataset = config['dataset']
    lang = config['lang']

    pipeline = Pipeline(translator=translator,
                        sent_connector=phi_sentence_connector,
                        claim_splitter=None,
                        evid_fetcher=offline_evid_fetcher,
                        evid_selector=pipeline_models['finetuned_soft_threshold']['evid_selector'],
                        stm_verifier=pipeline_models['finetuned_soft_threshold']['stm_verifier'],
                        lang=lang)
    outputs = evaluate_pipeline(pipeline, dataset,
                                output_file_name=output_file_base_name.format(dataset=dataset_name),
                                only_intro=False)
    additional_stats = calc_additional_stats(outputs, f'{output_file_base_name.format(dataset=dataset_name)}_additional_stats')

Evaluating german_dpr-claim_verification...


  0%|          | 0/42 [00:00<?, ?it/s]

config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

configuration_hf_nomic_bert.py:   0%|          | 0.00/1.96k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/Snowflake/snowflake-arctic-embed-m-long:
- configuration_hf_nomic_bert.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_hf_nomic_bert.py:   0%|          | 0.00/52.6k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/Snowflake/snowflake-arctic-embed-m-long:
- modeling_hf_nomic_bert.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors:   0%|          | 0.00/547M [00:00<?, ?B/s]



config.json:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

100%|██████████| 42/42 [07:13<00:00, 10.32s/it]


################################
Not in wikipedia: 29
Avg claim count: 1.0
              precision    recall  f1-score   support

           0     0.6566    0.9420    0.7738        69
           1     0.9000    0.5143    0.6545        70

    accuracy                         0.7266       139
   macro avg     0.7783    0.7282    0.7142       139
weighted avg     0.7792    0.7266    0.7137       139

################################
Evaluating german_wiktionary-claim_verification-mini...


 12%|█▏        | 6/50 [00:51<05:55,  8.08s/it]

Imam/German/noun: DEBUG: unrecognized sense qualifier: Twelver Shiism at ['Imam']
imam/English/noun: DEBUG: unrecognized sense qualifier: Twelver Shi'ism at ['imam']


 28%|██▊       | 14/50 [01:31<03:05,  5.14s/it]

unterbinden/German/verb: DEBUG: unrecognized sense qualifier: transitive or dative reflexive, dated or regional at ['unterbinden']


 32%|███▏      | 16/50 [01:46<03:45,  6.65s/it]

Tüte/German/proper noun: DEBUG: unrecognized sense qualifier: Bielefeld, colloquial at ['Tüte']


 34%|███▍      | 17/50 [01:49<03:00,  5.47s/it]

request/English/verb: DEBUG: unrecognized sense qualifier: transitive or with a subjunctive clause at ['request']


 64%|██████▍   | 32/50 [02:52<01:47,  5.96s/it]

Verbesserung/German/noun: DEBUG: unrecognized sense qualifier: homework at ['Verbesserung']


 68%|██████▊   | 34/50 [03:32<03:56, 14.81s/it]

unterbinden/German/verb: DEBUG: unrecognized sense qualifier: transitive or dative reflexive, dated or regional at ['unterbinden']


 88%|████████▊ | 44/50 [04:24<00:36,  6.04s/it]

Imam/German/noun: DEBUG: unrecognized sense qualifier: Twelver Shiism at ['Imam']
imam/English/noun: DEBUG: unrecognized sense qualifier: Twelver Shi'ism at ['imam']


 98%|█████████▊| 49/50 [04:42<00:03,  3.71s/it]

specific/English/adjective: DEBUG: unrecognized sense qualifier: bioscience, taxonomy at ['specific']


100%|██████████| 50/50 [04:44<00:00,  5.69s/it]


################################
Not in wikipedia: 40
Avg claim count: 1.0
              precision    recall  f1-score   support

           0     0.6697    0.9012    0.7684        81
           1     0.8431    0.5443    0.6615        79

    accuracy                         0.7250       160
   macro avg     0.7564    0.7228    0.7150       160
weighted avg     0.7553    0.7250    0.7156       160

################################
Evaluating squad-claim_verification...


100%|██████████| 40/40 [04:38<00:00,  6.97s/it]


################################
Not in wikipedia: 32
Avg claim count: 1.0
              precision    recall  f1-score   support

           0     0.6977    0.9524    0.8054        63
           1     0.9250    0.5873    0.7184        63

    accuracy                         0.7698       126
   macro avg     0.8113    0.7698    0.7619       126
weighted avg     0.8113    0.7698    0.7619       126

################################
Evaluating shroom-claim_verification...


  5%|▍         | 7/141 [00:06<01:37,  1.38it/s]

piss/English/noun: DEBUG: gloss may contain unhandled list items: 1999, Tin House #2 (→ISBN, Win McCormack, Rob Spillman, Elissa Schappell), page 170: at ['piss']


 58%|█████▊    | 82/141 [03:05<02:00,  2.04s/it]

demosophy/English/noun: DEBUG: unrecognized sense qualifier: folkloristics, sociology, rare at ['demosophy']


100%|██████████| 141/141 [05:37<00:00,  2.39s/it]


################################
Not in wikipedia: 21
Avg claim count: 1.0
              precision    recall  f1-score   support

           0     0.6875    0.5540    0.6135       278
           1     0.6101    0.7348    0.6667       264

    accuracy                         0.6421       542
   macro avg     0.6488    0.6444    0.6401       542
weighted avg     0.6498    0.6421    0.6394       542

################################


# 6 Evaluate FEVER Score

Here, we test our fine-tuning process with the FEVER dataset using three evaluation metrics: FeverScore, Gold Label, and Gold Evidence. 

1. **FeverScore**: This metric evaluates strict label accuracy. It considers an entry to be correctly classified only if at least one relevant piece of evidence is present in the selected evidence sentences. This ensures that a correct classification without supporting evidence is not counted as correctly classified. Entries labeled *Not Enough Info* are considered correctly classified without evidence, as no annotated evidence is available. Similarly, claims classified as unsupported due to topic modeling are considered correct as no evidence is selected.

2. **Gold Label**: In this metric, we provide the gold labels to the FeverScore. This allows us to measure the pipeline's ability to retrieve correct evidence without considering downstream statement classification.

3. **Gold Evidence**: We provide the gold evidence directly to the statement classification component for this metric. This allows us to assess its accuracy independently of the evidence retrieval step.

Important: Gold Label + Gold Evidence does not necessarly lead a Gold Label socre of 1 as there exist entries which have evidence groups that are larger than 3 and thus cannot be fully captured as we do only select 3 sentences.

In [None]:
def evaluate_fever_pipeline(pipeline: FeverPipeline, dataset, batch_size=4, output_file_name='', gold_evidence=False):
    outputs, report, fever_report = pipeline.verify_test_dataset(dataset, batch_size, output_file_name, gold_evidence=gold_evidence)
    print_fever_classification_report(report, fever_report)
    return outputs

## 6.1 Including Not Enough Info entries

We report the results on the FEVER dataset when including Not Enough Info entries.
In training we excluded them, as this improved performance.

In [None]:
output_file_base_name = str(EVALUATION_DIR / "fever/fever_{name}")
dataset = load_dataset("lukasellinger/filtered_fever-claim_verification", split='test')
# dataset = dataset.filter(lambda x: x['label'] != 'NOT_ENOUGH_INFO')

Downloading readme:   0%|          | 0.00/782 [00:00<?, ?B/s]

Downloading data: 100%|██████████| 53.7M/53.7M [00:02<00:00, 25.7MB/s]
Downloading data: 100%|██████████| 1.48M/1.48M [00:00<00:00, 3.79MB/s]
Downloading data: 100%|██████████| 1.56M/1.56M [00:00<00:00, 1.90MB/s]


Generating train split:   0%|          | 0/38466 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/2936 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2469 [00:00<?, ? examples/s]

In [None]:
for name, models in pipeline_models.items():
    print(f"Evaluating with {name}...")
    pipeline = FeverPipeline(claim_splitter=None,
                             evid_selector=models.get('evid_selector'),
                             stm_verifier=models.get('stm_verifier'))
    evaluate_fever_pipeline(pipeline, dataset, output_file_name=output_file_base_name.format(name=name), gold_evidence=False)
    print(f'Evaluating with {name} and gold evidence....')
    evaluate_fever_pipeline(pipeline, dataset, output_file_name=output_file_base_name.format(name=name), gold_evidence=True)

Evaluating with base...


  0%|          | 0/618 [00:00<?, ?it/s]

config.json:   0%|          | 0.00/1.60k [00:00<?, ?B/s]

configuration_hf_nomic_bert.py:   0%|          | 0.00/1.96k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/Snowflake/snowflake-arctic-embed-m-long:
- configuration_hf_nomic_bert.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_hf_nomic_bert.py:   0%|          | 0.00/52.6k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/Snowflake/snowflake-arctic-embed-m-long:
- modeling_hf_nomic_bert.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors:   0%|          | 0.00/547M [00:00<?, ?B/s]



config.json:   0%|          | 0.00/1.09k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

100%|██████████| 618/618 [05:39<00:00,  1.82it/s]


################################
FeverScore: 0.8833535844471446
Gold Label: 0.9623329283110571
              precision    recall  f1-score   support

           0     0.9562    0.9089    0.9320      1658
           1     0.8309    0.9149    0.8709       811

    accuracy                         0.9109      2469
   macro avg     0.8936    0.9119    0.9014      2469
weighted avg     0.9151    0.9109    0.9119      2469

################################
Evaluating with base and gold evidence....


100%|██████████| 618/618 [01:27<00:00,  7.05it/s]


################################
FeverScore: 0.9376854599406528
Gold Label: 0.998813056379822
              precision    recall  f1-score   support

           0     0.9345    0.9474    0.9409       874
           1     0.9424    0.9285    0.9354       811

    accuracy                         0.9383      1685
   macro avg     0.9385    0.9379    0.9382      1685
weighted avg     0.9383    0.9383    0.9383      1685

################################
Evaluating with finetuned_wo_threshold...


  0%|          | 0/618 [00:00<?, ?it/s]

config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/547M [00:00<?, ?B/s]



config.json:   0%|          | 0.00/1.08k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.12G [00:00<?, ?B/s]

100%|██████████| 618/618 [06:01<00:00,  1.71it/s]


################################
FeverScore: 0.8712029161603888
Gold Label: 0.9777237748076144
              precision    recall  f1-score   support

           0     0.9799    0.8534    0.9123      1658
           1     0.7629    0.9642    0.8519       811

    accuracy                         0.8898      2469
   macro avg     0.8714    0.9088    0.8821      2469
weighted avg     0.9086    0.8898    0.8925      2469

################################
Evaluating with finetuned_wo_threshold and gold evidence....


100%|██████████| 618/618 [01:27<00:00,  7.05it/s]


################################
FeverScore: 0.9459940652818991
Gold Label: 0.998813056379822
              precision    recall  f1-score   support

           0     0.9689    0.9279    0.9480       874
           1     0.9257    0.9679    0.9464       811

    accuracy                         0.9472      1685
   macro avg     0.9473    0.9479    0.9472      1685
weighted avg     0.9481    0.9472    0.9472      1685

################################
Evaluating with finetuned_soft_threshold...


100%|██████████| 618/618 [05:33<00:00,  1.85it/s]


################################
FeverScore: 0.8732280275415147
Gold Label: 0.97893884163629
              precision    recall  f1-score   support

           0     0.9806    0.8546    0.9133      1658
           1     0.7646    0.9655    0.8534       811

    accuracy                         0.8910      2469
   macro avg     0.8726    0.9101    0.8834      2469
weighted avg     0.9097    0.8910    0.8936      2469

################################
Evaluating with finetuned_soft_threshold and gold evidence....


100%|██████████| 618/618 [01:31<00:00,  6.79it/s]


################################
FeverScore: 0.9459940652818991
Gold Label: 0.998813056379822
              precision    recall  f1-score   support

           0     0.9689    0.9279    0.9480       874
           1     0.9257    0.9679    0.9464       811

    accuracy                         0.9472      1685
   macro avg     0.9473    0.9479    0.9472      1685
weighted avg     0.9481    0.9472    0.9472      1685

################################
Evaluating with finetuned...


100%|██████████| 618/618 [05:38<00:00,  1.83it/s]


################################
FeverScore: 0.8683677602268125
Gold Label: 0.97893884163629
              precision    recall  f1-score   support

           0     0.9661    0.8601    0.9100      1658
           1     0.7664    0.9383    0.8437       811

    accuracy                         0.8858      2469
   macro avg     0.8662    0.8992    0.8768      2469
weighted avg     0.9005    0.8858    0.8882      2469

################################
Evaluating with finetuned and gold evidence....


100%|██████████| 618/618 [01:40<00:00,  6.14it/s]


################################
FeverScore: 0.9459940652818991
Gold Label: 0.998813056379822
              precision    recall  f1-score   support

           0     0.9689    0.9279    0.9480       874
           1     0.9257    0.9679    0.9464       811

    accuracy                         0.9472      1685
   macro avg     0.9473    0.9479    0.9472      1685
weighted avg     0.9481    0.9472    0.9472      1685

################################


## 6.2 Excluding Not Enough Info Entries

Next, we report the results on the FEVER dataset when we exlcude the Not Enough Info entries.
In our training we also excluded them.

In [None]:
output_file_base_name = str(EVALUATION_DIR / "fever/fever_filtered_{name}")
dataset = load_dataset("lukasellinger/filtered_fever-claim_verification", split='test')
dataset = dataset.filter(lambda x: x['label'] != 'NOT_ENOUGH_INFO')

In [None]:
for name, models in pipeline_models.items():
    print(f"Evaluating with {name}...")
    pipeline = FeverPipeline(claim_splitter=None,
                             evid_selector=models.get('evid_selector'),
                             stm_verifier=models.get('stm_verifier'))
    evaluate_fever_pipeline(pipeline, dataset, output_file_name=output_file_base_name.format(name=name), gold_evidence=False)
    print(f'Evaluating with {name} and gold evidence....')
    evaluate_fever_pipeline(pipeline, dataset, output_file_name=output_file_base_name.format(name=name), gold_evidence=True)

Evaluating with base...


  0%|          | 0/422 [00:00<?, ?it/s]

config.json:   0%|          | 0.00/1.60k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/547M [00:00<?, ?B/s]



config.json:   0%|          | 0.00/1.09k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

100%|██████████| 422/422 [04:02<00:00,  1.74it/s]


################################
FeverScore: 0.8925816023738873
Gold Label: 0.9448071216617211
              precision    recall  f1-score   support

           0     0.9232    0.9497    0.9363       874
           1     0.9440    0.9149    0.9292       811

    accuracy                         0.9329      1685
   macro avg     0.9336    0.9323    0.9328      1685
weighted avg     0.9332    0.9329    0.9329      1685

################################
Evaluating with base and gold evidence....


100%|██████████| 422/422 [01:27<00:00,  4.80it/s]


################################
FeverScore: 0.9376854599406528
Gold Label: 0.998813056379822
              precision    recall  f1-score   support

           0     0.9345    0.9474    0.9409       874
           1     0.9424    0.9285    0.9354       811

    accuracy                         0.9383      1685
   macro avg     0.9385    0.9379    0.9382      1685
weighted avg     0.9383    0.9383    0.9383      1685

################################
Evaluating with finetuned_wo_threshold...


100%|██████████| 422/422 [03:57<00:00,  1.78it/s]


################################
FeverScore: 0.9169139465875371
Gold Label: 0.9673590504451038
              precision    recall  f1-score   support

           0     0.9654    0.9256    0.9451       874
           1     0.9233    0.9642    0.9433       811

    accuracy                         0.9442      1685
   macro avg     0.9443    0.9449    0.9442      1685
weighted avg     0.9451    0.9442    0.9442      1685

################################
Evaluating with finetuned_wo_threshold and gold evidence....


100%|██████████| 422/422 [01:27<00:00,  4.82it/s]


################################
FeverScore: 0.9459940652818991
Gold Label: 0.998813056379822
              precision    recall  f1-score   support

           0     0.9689    0.9279    0.9480       874
           1     0.9257    0.9679    0.9464       811

    accuracy                         0.9472      1685
   macro avg     0.9473    0.9479    0.9472      1685
weighted avg     0.9481    0.9472    0.9472      1685

################################
Evaluating with finetuned_soft_threshold...


100%|██████████| 422/422 [03:44<00:00,  1.88it/s]


################################
FeverScore: 0.9186943620178042
Gold Label: 0.9691394658753709
              precision    recall  f1-score   support

           0     0.9665    0.9256    0.9456       874
           1     0.9233    0.9655    0.9439       811

    accuracy                         0.9448      1685
   macro avg     0.9449    0.9456    0.9448      1685
weighted avg     0.9458    0.9448    0.9448      1685

################################
Evaluating with finetuned_soft_threshold and gold evidence....


100%|██████████| 422/422 [01:28<00:00,  4.79it/s]


################################
FeverScore: 0.9459940652818991
Gold Label: 0.998813056379822
              precision    recall  f1-score   support

           0     0.9689    0.9279    0.9480       874
           1     0.9257    0.9679    0.9464       811

    accuracy                         0.9472      1685
   macro avg     0.9473    0.9479    0.9472      1685
weighted avg     0.9481    0.9472    0.9472      1685

################################
Evaluating with finetuned...


100%|██████████| 422/422 [03:48<00:00,  1.84it/s]


################################
FeverScore: 0.9068249258160237
Gold Label: 0.9691394658753709
              precision    recall  f1-score   support

           0     0.9419    0.9268    0.9343       874
           1     0.9224    0.9383    0.9303       811

    accuracy                         0.9323      1685
   macro avg     0.9321    0.9326    0.9323      1685
weighted avg     0.9325    0.9323    0.9324      1685

################################
Evaluating with finetuned and gold evidence....


100%|██████████| 422/422 [01:29<00:00,  4.71it/s]


################################
FeverScore: 0.9459940652818991
Gold Label: 0.998813056379822
              precision    recall  f1-score   support

           0     0.9689    0.9279    0.9480       874
           1     0.9257    0.9679    0.9464       811

    accuracy                         0.9472      1685
   macro avg     0.9473    0.9479    0.9472      1685
weighted avg     0.9481    0.9472    0.9472      1685

################################


## 6.1 MMR vs Top Evidence Selection
We check on our development set, which evidence selection method works best. We test:
- Maximal Marginal Relevance (MMR)
- Maximum Similarity

In [None]:
output_file_base_name = str(EVALUATION_DIR / "fever/fever_{name}_selection")
dataset = load_dataset("lukasellinger/filtered_fever-claim_verification", split='dev')
dataset = dataset.filter(lambda x: x['label'] != 'NOT_ENOUGH_INFO')

Downloading readme:   0%|          | 0.00/782 [00:00<?, ?B/s]

Downloading data: 100%|██████████| 53.7M/53.7M [00:00<00:00, 67.1MB/s]
Downloading data: 100%|██████████| 1.48M/1.48M [00:00<00:00, 4.10MB/s]
Downloading data: 100%|██████████| 1.56M/1.56M [00:00<00:00, 7.47MB/s]


Generating train split:   0%|          | 0/38466 [00:00<?, ? examples/s]

Generating dev split:   0%|          | 0/2936 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/2469 [00:00<?, ? examples/s]

Filter:   0%|          | 0/2936 [00:00<?, ? examples/s]

In [None]:
pipeline = FeverPipeline(claim_splitter=None,
                         evid_selector=ModelEvidenceSelector(model_name=finetuned_selection_model, min_similarity=0, evidence_selection='mmr'),
                         stm_verifier= ModelStatementVerifier(model_name=base_verification_model, premise_sent_order='top_last'))

tokenizer_config.json:   0%|          | 0.00/1.42k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/712k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/695 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/467 [00:00<?, ?B/s]

spm.model:   0%|          | 0.00/4.31M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/16.3M [00:00<?, ?B/s]

added_tokens.json:   0%|          | 0.00/23.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/173 [00:00<?, ?B/s]



In [None]:
evidence_selections = ['mmr', 'top']
for evidence_selection in evidence_selections:
    print(f"Evaluating with {evidence_selection}...")
    pipeline.evid_selector.set_evidence_selection(evidence_selection)
    evaluate_fever_pipeline(pipeline, dataset, output_file_name=output_file_base_name.format(name=evidence_selection))

Evaluating with mmr...


  0%|          | 0/495 [00:00<?, ?it/s]

config.json:   0%|          | 0.00/1.62k [00:00<?, ?B/s]

configuration_hf_nomic_bert.py:   0%|          | 0.00/1.96k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/Snowflake/snowflake-arctic-embed-m-long:
- configuration_hf_nomic_bert.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


modeling_hf_nomic_bert.py:   0%|          | 0.00/52.6k [00:00<?, ?B/s]

A new version of the following files was downloaded from https://huggingface.co/Snowflake/snowflake-arctic-embed-m-long:
- modeling_hf_nomic_bert.py
. Make sure to double-check they do not contain any added malicious code. To avoid downloading new versions of the code file, you can pin a revision.


model.safetensors:   0%|          | 0.00/547M [00:00<?, ?B/s]



config.json:   0%|          | 0.00/1.09k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/558M [00:00<?, ?B/s]

100%|██████████| 495/495 [04:19<00:00,  1.91it/s]


################################
FeverScore: 0.9277047522750252
Gold Label: 0.9691607684529828
              precision    recall  f1-score   support

           0     0.9426    0.9640    0.9532      1056
           1     0.9577    0.9328    0.9451       922

    accuracy                         0.9494      1978
   macro avg     0.9501    0.9484    0.9491      1978
weighted avg     0.9496    0.9494    0.9494      1978

################################
Evaluating with top...


100%|██████████| 495/495 [04:07<00:00,  2.00it/s]

################################
FeverScore: 0.9322548028311426
Gold Label: 0.9732052578361982
              precision    recall  f1-score   support

           0     0.9470    0.9650    0.9559      1056
           1     0.9590    0.9382    0.9485       922

    accuracy                         0.9525      1978
   macro avg     0.9530    0.9516    0.9522      1978
weighted avg     0.9526    0.9525    0.9524      1978

################################





## 6.2 Find Sentence Ordering
Next, we check on our development set, which sentence ordering in the premise of our statement classifier works best. We test:
- Original (keep)
- Reversed (reverse) 
- Most promising at End (top_last)

In [None]:
dataset = load_dataset("lukasellinger/filtered_fever-claim_verification", split='dev')
#dataset = dataset.filter(lambda x: x['label'] != 'NOT_ENOUGH_INFO')
output_file_base_name = str(EVALUATION_DIR / "fever/fever_{name}_order")

In [None]:
model_name = 'lukasellinger/claim-verification-model'
pipeline_models = {
    'keep': {
        'evid_selector': ModelEvidenceSelector(model_name=finetuned_selection_model, min_similarity=0, evidence_selection='top'),
        'stm_verifier': ModelStatementVerifier(model_name=f'{model_name}-keep', premise_sent_order='keep')
    },
    'reverse': {
        'evid_selector': ModelEvidenceSelector(model_name=finetuned_selection_model, min_similarity=0, evidence_selection='top'),
        'stm_verifier': ModelStatementVerifier(model_name=f'{model_name}-reverse', premise_sent_order='reverse')
    },
    'top_last': {
        'evid_selector': ModelEvidenceSelector(model_name=finetuned_selection_model, min_similarity=0, evidence_selection='top'),
        'stm_verifier': ModelStatementVerifier(model_name=f'{model_name}-top_last', premise_sent_order='top_last')
    }
}

In [None]:
for name, models in pipeline_models.items():
    print(f"Evaluating with sentence order: {name}...")
    pipeline = FeverPipeline(claim_splitter=None,
                             evid_selector=models.get('evid_selector'),
                             stm_verifier=models.get('stm_verifier'))
    evaluate_fever_pipeline(pipeline, dataset, output_file_name=output_file_base_name.format(name=name))

Evaluating with sentence order: keep...


100%|██████████| 734/734 [06:10<00:00,  1.98it/s]


################################
FeverScore: 0.8746594005449592
Gold Label: 0.9819482288828338
              precision    recall  f1-score   support

           0     0.9817    0.8540    0.9134      2014
           1     0.7517    0.9653    0.8452       922

    accuracy                         0.8890      2936
   macro avg     0.8667    0.9097    0.8793      2936
weighted avg     0.9095    0.8890    0.8920      2936

################################
Evaluating with sentence order: reverse...


100%|██████████| 734/734 [06:14<00:00,  1.96it/s]


################################
FeverScore: 0.8763623978201635
Gold Label: 0.9819482288828338
              precision    recall  f1-score   support

           0     0.9780    0.8600    0.9152      2014
           1     0.7579    0.9577    0.8462       922

    accuracy                         0.8907      2936
   macro avg     0.8680    0.9088    0.8807      2936
weighted avg     0.9089    0.8907    0.8935      2936

################################
Evaluating with sentence order: top_last...


100%|██████████| 734/734 [06:08<00:00,  1.99it/s]


################################
FeverScore: 0.877724795640327
Gold Label: 0.9819482288828338
              precision    recall  f1-score   support

           0     0.9775    0.8630    0.9167      2014
           1     0.7617    0.9566    0.8481       922

    accuracy                         0.8924      2936
   macro avg     0.8696    0.9098    0.8824      2936
weighted avg     0.9097    0.8924    0.8951      2936

################################


## 6.3 Find Threshold
Finally, we test on our development set, which sentence similarity threshold should be used in our Evidence Selection (Topic Modeling)

In [None]:
dataset = load_dataset("lukasellinger/filtered_fever-claim_verification", split='dev')
#dataset = dataset.filter(lambda x: x['label'] != 'NOT_ENOUGH_INFO')
output_file_base_name = str(EVALUATION_DIR / "fever/fever_{threshold}_threshold")

In [None]:
model_name = 'lukasellinger/claim-verification-model-top_last'
pipeline = FeverPipeline(claim_splitter=None,
                         evid_selector= ModelEvidenceSelector(model_name=finetuned_selection_model, min_similarity=0, evidence_selection='top'),
                         stm_verifier= ModelStatementVerifier(model_name=model_name, premise_sent_order='top_last'))

In [None]:
start = 480
stop = 536
step = 5  # corresponds to 0.005 increments

for threshold_int in range(start, stop, step):
    print(f"Evaluating with threshold: {threshold_int}...")
    threshold = threshold_int / 1000.0  # Convert back to the floating-point value
    pipeline.evid_selector.set_min_similarity(threshold)
    evaluate_fever_pipeline(pipeline, dataset, output_file_name=output_file_base_name.format(threshold=threshold_int))

Evaluating with threshold: 480...


100%|██████████| 734/734 [06:06<00:00,  2.00it/s]


################################
FeverScore: 0.877724795640327
Gold Label: 0.9816076294277929
              precision    recall  f1-score   support

           0     0.9759    0.8649    0.9171      2014
           1     0.7637    0.9534    0.8480       922

    accuracy                         0.8927      2936
   macro avg     0.8698    0.9092    0.8826      2936
weighted avg     0.9093    0.8927    0.8954      2936

################################
Evaluating with threshold: 485...


100%|██████████| 734/734 [06:01<00:00,  2.03it/s]


################################
FeverScore: 0.877724795640327
Gold Label: 0.9816076294277929
              precision    recall  f1-score   support

           0     0.9754    0.8654    0.9171      2014
           1     0.7641    0.9523    0.8479       922

    accuracy                         0.8927      2936
   macro avg     0.8698    0.9089    0.8825      2936
weighted avg     0.9090    0.8927    0.8954      2936

################################
Evaluating with threshold: 490...


100%|██████████| 734/734 [06:01<00:00,  2.03it/s]


################################
FeverScore: 0.8780653950953679
Gold Label: 0.9819482288828338
              precision    recall  f1-score   support

           0     0.9754    0.8654    0.9171      2014
           1     0.7641    0.9523    0.8479       922

    accuracy                         0.8927      2936
   macro avg     0.8698    0.9089    0.8825      2936
weighted avg     0.9090    0.8927    0.8954      2936

################################
Evaluating with threshold: 495...


100%|██████████| 734/734 [06:02<00:00,  2.02it/s]


################################
FeverScore: 0.8780653950953679
Gold Label: 0.9819482288828338
              precision    recall  f1-score   support

           0     0.9754    0.8654    0.9171      2014
           1     0.7641    0.9523    0.8479       922

    accuracy                         0.8927      2936
   macro avg     0.8698    0.9089    0.8825      2936
weighted avg     0.9090    0.8927    0.8954      2936

################################
Evaluating with threshold: 500...


100%|██████████| 734/734 [06:01<00:00,  2.03it/s]


################################
FeverScore: 0.877724795640327
Gold Label: 0.9816076294277929
              precision    recall  f1-score   support

           0     0.9754    0.8654    0.9171      2014
           1     0.7641    0.9523    0.8479       922

    accuracy                         0.8927      2936
   macro avg     0.8698    0.9089    0.8825      2936
weighted avg     0.9090    0.8927    0.8954      2936

################################
Evaluating with threshold: 505...


100%|██████████| 734/734 [06:01<00:00,  2.03it/s]


################################
FeverScore: 0.8784059945504087
Gold Label: 0.9816076294277929
              precision    recall  f1-score   support

           0     0.9754    0.8659    0.9174      2014
           1     0.7648    0.9523    0.8483       922

    accuracy                         0.8931      2936
   macro avg     0.8701    0.9091    0.8829      2936
weighted avg     0.9093    0.8931    0.8957      2936

################################
Evaluating with threshold: 510...


100%|██████████| 734/734 [06:03<00:00,  2.02it/s]


################################
FeverScore: 0.8780653950953679
Gold Label: 0.9816076294277929
              precision    recall  f1-score   support

           0     0.9748    0.8659    0.9172      2014
           1     0.7646    0.9512    0.8478       922

    accuracy                         0.8927      2936
   macro avg     0.8697    0.9086    0.8825      2936
weighted avg     0.9088    0.8927    0.8954      2936

################################
Evaluating with threshold: 515...


100%|██████████| 734/734 [06:04<00:00,  2.02it/s]


################################
FeverScore: 0.8780653950953679
Gold Label: 0.9816076294277929
              precision    recall  f1-score   support

           0     0.9743    0.8664    0.9172      2014
           1     0.7651    0.9501    0.8476       922

    accuracy                         0.8927      2936
   macro avg     0.8697    0.9083    0.8824      2936
weighted avg     0.9086    0.8927    0.8954      2936

################################
Evaluating with threshold: 520...


100%|██████████| 734/734 [06:02<00:00,  2.02it/s]


################################
FeverScore: 0.8784059945504087
Gold Label: 0.9816076294277929
              precision    recall  f1-score   support

           0     0.9743    0.8669    0.9175      2014
           1     0.7657    0.9501    0.8480       922

    accuracy                         0.8931      2936
   macro avg     0.8700    0.9085    0.8828      2936
weighted avg     0.9088    0.8931    0.8957      2936

################################
Evaluating with threshold: 525...


100%|██████████| 734/734 [06:02<00:00,  2.03it/s]


################################
FeverScore: 0.8780653950953679
Gold Label: 0.9812670299727521
              precision    recall  f1-score   support

           0     0.9743    0.8669    0.9175      2014
           1     0.7657    0.9501    0.8480       922

    accuracy                         0.8931      2936
   macro avg     0.8700    0.9085    0.8828      2936
weighted avg     0.9088    0.8931    0.8957      2936

################################
Evaluating with threshold: 530...


100%|██████████| 734/734 [06:02<00:00,  2.02it/s]


################################
FeverScore: 0.8784059945504087
Gold Label: 0.9812670299727521
              precision    recall  f1-score   support

           0     0.9738    0.8679    0.9178      2014
           1     0.7669    0.9490    0.8483       922

    accuracy                         0.8934      2936
   macro avg     0.8703    0.9085    0.8831      2936
weighted avg     0.9088    0.8934    0.8960      2936

################################
Evaluating with threshold: 535...


100%|██████████| 734/734 [06:02<00:00,  2.03it/s]


################################
FeverScore: 0.8780653950953679
Gold Label: 0.9812670299727521
              precision    recall  f1-score   support

           0     0.9733    0.8679    0.9176      2014
           1     0.7667    0.9479    0.8477       922

    accuracy                         0.8931      2936
   macro avg     0.8700    0.9079    0.8827      2936
weighted avg     0.9084    0.8931    0.8956      2936

################################


In [None]:
start = 540
stop = 601
step = 5  # corresponds to 0.005 increments

for threshold_int in range(start, stop, step):
    print(f"Evaluating with threshold: {threshold_int}...")
    threshold = threshold_int / 1000.0  # Convert back to the floating-point value
    pipeline.evid_selector.set_min_similarity(threshold)
    evaluate_fever_pipeline(pipeline, dataset, output_file_name=output_file_base_name.format(threshold=threshold_int))

Evaluating with threshold: 540...


100%|██████████| 734/734 [05:59<00:00,  2.04it/s]


################################
FeverScore: 0.8787465940054496
Gold Label: 0.9816076294277929
              precision    recall  f1-score   support

           0     0.9722    0.8694    0.9180      2014
           1     0.7683    0.9458    0.8478       922

    accuracy                         0.8934      2936
   macro avg     0.8703    0.9076    0.8829      2936
weighted avg     0.9082    0.8934    0.8959      2936

################################
Evaluating with threshold: 545...


100%|██████████| 734/734 [05:56<00:00,  2.06it/s]


################################
FeverScore: 0.8787465940054496
Gold Label: 0.9822888283378747
              precision    recall  f1-score   support

           0     0.9712    0.8699    0.9178      2014
           1     0.7686    0.9436    0.8471       922

    accuracy                         0.8931      2936
   macro avg     0.8699    0.9068    0.8824      2936
weighted avg     0.9075    0.8931    0.8956      2936

################################
Evaluating with threshold: 550...


100%|██████████| 734/734 [05:59<00:00,  2.04it/s]


################################
FeverScore: 0.8787465940054496
Gold Label: 0.9816076294277929
              precision    recall  f1-score   support

           0     0.9712    0.8709    0.9183      2014
           1     0.7699    0.9436    0.8480       922

    accuracy                         0.8937      2936
   macro avg     0.8706    0.9073    0.8831      2936
weighted avg     0.9080    0.8937    0.8962      2936

################################
Evaluating with threshold: 555...


100%|██████████| 734/734 [05:56<00:00,  2.06it/s]


################################
FeverScore: 0.8780653950953679
Gold Label: 0.9812670299727521
              precision    recall  f1-score   support

           0     0.9707    0.8709    0.9181      2014
           1     0.7697    0.9425    0.8474       922

    accuracy                         0.8934      2936
   macro avg     0.8702    0.9067    0.8827      2936
weighted avg     0.9076    0.8934    0.8959      2936

################################
Evaluating with threshold: 560...


100%|██████████| 734/734 [05:59<00:00,  2.04it/s]


################################
FeverScore: 0.8787465940054496
Gold Label: 0.9816076294277929
              precision    recall  f1-score   support

           0     0.9702    0.8719    0.9184      2014
           1     0.7709    0.9414    0.8477       922

    accuracy                         0.8937      2936
   macro avg     0.8705    0.9067    0.8830      2936
weighted avg     0.9076    0.8937    0.8962      2936

################################
Evaluating with threshold: 565...


100%|██████████| 734/734 [05:55<00:00,  2.07it/s]


################################
FeverScore: 0.8784059945504087
Gold Label: 0.9819482288828338
              precision    recall  f1-score   support

           0     0.9686    0.8724    0.9180      2014
           1     0.7709    0.9382    0.8464       922

    accuracy                         0.8931      2936
   macro avg     0.8698    0.9053    0.8822      2936
weighted avg     0.9065    0.8931    0.8955      2936

################################
Evaluating with threshold: 570...


100%|██████████| 734/734 [05:57<00:00,  2.06it/s]


################################
FeverScore: 0.8790871934604905
Gold Label: 0.9826294277929155
              precision    recall  f1-score   support

           0     0.9675    0.8734    0.9181      2014
           1     0.7719    0.9360    0.8461       922

    accuracy                         0.8931      2936
   macro avg     0.8697    0.9047    0.8821      2936
weighted avg     0.9061    0.8931    0.8955      2936

################################
Evaluating with threshold: 575...


100%|██████████| 734/734 [05:54<00:00,  2.07it/s]


################################
FeverScore: 0.8801089918256131
Gold Label: 0.9822888283378747
              precision    recall  f1-score   support

           0     0.9671    0.8754    0.9189      2014
           1     0.7745    0.9349    0.8472       922

    accuracy                         0.8941      2936
   macro avg     0.8708    0.9051    0.8831      2936
weighted avg     0.9066    0.8941    0.8964      2936

################################
Evaluating with threshold: 580...


100%|██████████| 734/734 [05:51<00:00,  2.09it/s]


################################
FeverScore: 0.8797683923705722
Gold Label: 0.9826294277929155
              precision    recall  f1-score   support

           0     0.9666    0.8754    0.9187      2014
           1     0.7743    0.9338    0.8466       922

    accuracy                         0.8937      2936
   macro avg     0.8704    0.9046    0.8827      2936
weighted avg     0.9062    0.8937    0.8961      2936

################################
Evaluating with threshold: 585...


100%|██████████| 734/734 [05:51<00:00,  2.09it/s]


################################
FeverScore: 0.8814713896457765
Gold Label: 0.9826294277929155
              precision    recall  f1-score   support

           0     0.9661    0.8784    0.9202      2014
           1     0.7783    0.9328    0.8485       922

    accuracy                         0.8954      2936
   macro avg     0.8722    0.9056    0.8844      2936
weighted avg     0.9071    0.8954    0.8977      2936

################################
Evaluating with threshold: 590...


100%|██████████| 734/734 [05:50<00:00,  2.09it/s]


################################
FeverScore: 0.8790871934604905
Gold Label: 0.9819482288828338
              precision    recall  f1-score   support

           0     0.9625    0.8793    0.9190      2014
           1     0.7783    0.9252    0.8454       922

    accuracy                         0.8937      2936
   macro avg     0.8704    0.9023    0.8822      2936
weighted avg     0.9047    0.8937    0.8959      2936

################################
Evaluating with threshold: 595...


100%|██████████| 734/734 [05:48<00:00,  2.11it/s]


################################
FeverScore: 0.8804495912806539
Gold Label: 0.9822888283378747
              precision    recall  f1-score   support

           0     0.9621    0.8813    0.9199      2014
           1     0.7809    0.9241    0.8465       922

    accuracy                         0.8948      2936
   macro avg     0.8715    0.9027    0.8832      2936
weighted avg     0.9052    0.8948    0.8969      2936

################################
Evaluating with threshold: 600...


100%|██████████| 734/734 [05:46<00:00,  2.12it/s]


################################
FeverScore: 0.8780653950953679
Gold Label: 0.9812670299727521
              precision    recall  f1-score   support

           0     0.9580    0.8833    0.9191      2014
           1     0.7822    0.9154    0.8436       922

    accuracy                         0.8934      2936
   macro avg     0.8701    0.8994    0.8814      2936
weighted avg     0.9028    0.8934    0.8954      2936

################################


# 7 Check on incorrect predicted

Similar to the approach used in FactScore, we sample 30 incorrect predictions from our test sets, excluding those cases where no evidence is available on Wikipedia. These sampled examples are saved to a separate file for detailed manual evaluation. The analysis focuses on identifying the following types of issues:
- Predicted as Supported
- Annotation
- No Evidence in Wiki
- No Evidence in Selection
- Pre-processing (e.g translation error)

In [None]:
def sample_incorrect_predictions(dataset_names: List[str], file_pattern: str, output_file: str,
                                 sample_size: int = 30, seed: int = 42) -> None:
    """
    Samples incorrect predictions from test sets and saves them to a file for manual assessment.

    Args:
        dataset_names (List[str]): List of dataset names to process.
        file_pattern (str): Pattern to match file names for loading predictions.
        output_file (str): Path to the output file for saving the sampled incorrect predictions.
        sample_size (int): Number of incorrect predictions to sample.
        seed (int): Random seed for reproducibility.
    """
    random.seed(seed)
    all_outputs = []

    for dataset_name in dataset_names:
        file_path = EVALUATION_DIR / file_pattern.format(dataset_name=dataset_name)
        outputs = JSONLineReader().read(file_path)
        all_outputs.extend(outputs)
    false_predicted_outputs = [output for output in all_outputs if
                               output['label'] != output['predicted'] and output['predicted'] != -1]

    sample_size = min(sample_size, len(false_predicted_outputs))
    sampled_outputs = random.sample(false_predicted_outputs, sample_size)

    # Print sampled outputs for review
    print(f"Sampled {len(sampled_outputs)} incorrect predictions:")
    for sample in sampled_outputs:
        print(json.dumps(sample, indent=4))
        print('############################')
    JSONLineReader().write(output_file, sampled_outputs)

In [None]:
# Sample incorrect predictions from the fine-tuned pipeline without Claim Splitting
sample_incorrect_predictions(
    dataset_names=list(datasets.keys()),
    file_pattern="{dataset_name}/pipeline/{dataset_name}_finetuned_soft_threshold.jsonl",
    output_file=str(EVALUATION_DIR / "incorrect_pred_samples.jsonl")
)

# Sample incorrect predictions from models with DisSimSplitter
sample_incorrect_predictions(
    dataset_names=list(datasets.keys()),
    file_pattern="{dataset_name}/pipeline/{dataset_name}_finetuned_soft_threshold_DisSimSplitter.jsonl",
    output_file=str(EVALUATION_DIR / "incorrect_pred_samples_DisSimSplitter.jsonl")
)

Sampled 30 incorrect predictions:
{
    "id": 6,
    "word": "dharmas",
    "claim": "phenomena",
    "connected_claim": "Dharmas signifies phenomena.",
    "label": "SUPPORTED",
    "predicted": "NOT_SUPPORTED",
    "factuality": 0.0,
    "atoms": [
        {
            "atom": "Dharmas signifies phenomena.",
            "predicted": "NOT_SUPPORTED"
        }
    ],
    "evidence": [
        {
            "title": "Abhidharma (wikipedia)",
            "line_idx": 7,
            "text": "These texts developed out of early Buddhist lists or matrices (m\u0101t\u1e5bk\u0101s) of key teachings.",
            "sim": 0.5447394847869873,
            "in_intro": true
        },
        {
            "title": "Abhidharma (wikipedia)",
            "line_idx": 0,
            "text": "The Abhidharma are a collection of Buddhist texts dating from the 3rd century BCE onwards, which contain detailed scholastic presentations of doctrinal material appearing in the canonical Buddhist scriptures and com

# 8 BM25 Selection (Whole Page vs. Intro Page) (Best Setup)

In our document selection process, we first retrieve all pages related to a specific word. Given the potentially large number of pages and the associated computation time, we use the BM25 algorithm to select the top 3 most relevant pages for the claim.

In this section, we evaluate whether using only the intro sections of these pages is sufficient for effective BM25 selection, which also reduces computation time. We will compare the results of selecting from intro sections versus the full pages to determine if the selections are consistent and if the top-ranked pages are the same in both scenarios.


We check on:
- All Evidence Match
- First Evidence Match
- First Summary Evidence in Whole Page Evidences
- First Whole Page Evidence in in Summary Evidences

In [11]:
for dataset_name, config in datasets.items():
    print(f"Evaluating {dataset_name}...")
    dataset = config['dataset']
    lang = config['lang']

    equal_evids = 0
    first_evid_match = 0
    first_intro_match = 0
    first_full_match = 0
    for entry in tqdm(dataset):
        if entry['in_wiki'] == 'No':
            continue

        _, intro_evids = offline_evid_fetcher.fetch_evidences(
            search_word=entry['document_search_word'], only_intro=True)
        _, full_evids = offline_evid_fetcher.fetch_evidences(
            search_word=entry['document_search_word'], only_intro=False)

        intro_evids_indices = rank_docs(entry['connected_claim'],
                                        [" ".join(evidence.get('lines')) for evidence in
                                         intro_evids], k=3)
        selected_intro_pages = [intro_evids[idx].get('title') for idx in intro_evids_indices]
        full_evids_indices = rank_docs(entry['connected_claim'],
                                       [" ".join(evidence.get('lines')) for evidence in full_evids],
                                       k=3)
        selected_full_pages = [full_evids[idx].get('title') for idx in full_evids_indices]

        if set(selected_intro_pages) == set(selected_full_pages):
            equal_evids += 1
        if selected_intro_pages[0] == selected_full_pages[0]:
            first_evid_match += 1
        if selected_intro_pages[0] in selected_full_pages:
            first_intro_match += 1
        if selected_full_pages[0] in selected_intro_pages:
            first_full_match += 1

    print(f"All Evidence Match: {round(100 * equal_evids / len(dataset), 2)}%")
    print(f"First Evidence Match: {round(100 * first_evid_match / len(dataset), 2)}%")
    print(f"First Intro in Full Pages: {round(100 * first_intro_match / len(dataset), 2)}%")
    print(f"First Full Page in Intro: {round(100 * first_full_match / len(dataset), 2)}%")

Evaluating german_dpr-claim_verification...


100%|██████████| 168/168 [05:34<00:00,  1.99s/it]


All Evidence Match: 58.93%
First Evidence Match: 56.55%
First Intro in Full Pages: 78.57%
First Full Page in Intro: 76.19%
Evaluating german_wiktionary-claim_verification-mini...


 12%|█▏        | 23/200 [00:42<06:37,  2.25s/it]

Imam/German/noun: DEBUG: unrecognized sense qualifier: Twelver Shiism at ['Imam']
imam/English/noun: DEBUG: unrecognized sense qualifier: Twelver Shi'ism at ['imam']
Imam/German/noun: DEBUG: unrecognized sense qualifier: Twelver Shiism at ['Imam']
imam/English/noun: DEBUG: unrecognized sense qualifier: Twelver Shi'ism at ['imam']


 30%|██▉       | 59/200 [01:15<01:40,  1.40it/s]

unterbinden/German/verb: DEBUG: unrecognized sense qualifier: transitive or dative reflexive, dated or regional at ['unterbinden']
unterbinden/German/verb: DEBUG: unrecognized sense qualifier: transitive or dative reflexive, dated or regional at ['unterbinden']


 32%|███▏      | 64/200 [01:25<03:15,  1.44s/it]

Tüte/German/proper noun: DEBUG: unrecognized sense qualifier: Bielefeld, colloquial at ['Tüte']
Tüte/German/proper noun: DEBUG: unrecognized sense qualifier: Bielefeld, colloquial at ['Tüte']


 34%|███▍      | 69/200 [01:30<02:36,  1.19s/it]

request/English/verb: DEBUG: unrecognized sense qualifier: transitive or with a subjunctive clause at ['request']
request/English/verb: DEBUG: unrecognized sense qualifier: transitive or with a subjunctive clause at ['request']


 64%|██████▍   | 129/200 [02:22<01:33,  1.32s/it]

Verbesserung/German/noun: DEBUG: unrecognized sense qualifier: homework at ['Verbesserung']
Verbesserung/German/noun: DEBUG: unrecognized sense qualifier: homework at ['Verbesserung']


 68%|██████▊   | 137/200 [03:07<08:22,  7.98s/it]

unterbinden/German/verb: DEBUG: unrecognized sense qualifier: transitive or dative reflexive, dated or regional at ['unterbinden']
unterbinden/German/verb: DEBUG: unrecognized sense qualifier: transitive or dative reflexive, dated or regional at ['unterbinden']


 89%|████████▉ | 178/200 [03:45<00:14,  1.50it/s]

Imam/German/noun: DEBUG: unrecognized sense qualifier: Twelver Shiism at ['Imam']
imam/English/noun: DEBUG: unrecognized sense qualifier: Twelver Shi'ism at ['imam']
Imam/German/noun: DEBUG: unrecognized sense qualifier: Twelver Shiism at ['Imam']
imam/English/noun: DEBUG: unrecognized sense qualifier: Twelver Shi'ism at ['imam']


 98%|█████████▊| 196/200 [03:58<00:01,  2.05it/s]

specific/English/adjective: DEBUG: unrecognized sense qualifier: bioscience, taxonomy at ['specific']
specific/English/adjective: DEBUG: unrecognized sense qualifier: bioscience, taxonomy at ['specific']


100%|██████████| 200/200 [03:59<00:00,  1.20s/it]


All Evidence Match: 64.0%
First Evidence Match: 61.0%
First Intro in Full Pages: 77.5%
First Full Page in Intro: 75.5%
Evaluating squad-claim_verification...


100%|██████████| 158/158 [03:22<00:00,  1.28s/it]


All Evidence Match: 60.13%
First Evidence Match: 51.27%
First Intro in Full Pages: 71.52%
First Full Page in Intro: 75.32%
Evaluating shroom-claim_verification...


  5%|▌         | 29/563 [00:03<00:34, 15.26it/s]

piss/English/noun: DEBUG: gloss may contain unhandled list items: 1999, Tin House #2 (→ISBN, Win McCormack, Rob Spillman, Elissa Schappell), page 170: at ['piss']
piss/English/noun: DEBUG: gloss may contain unhandled list items: 1999, Tin House #2 (→ISBN, Win McCormack, Rob Spillman, Elissa Schappell), page 170: at ['piss']


 58%|█████▊    | 329/563 [02:17<01:15,  3.08it/s]

demosophy/English/noun: DEBUG: unrecognized sense qualifier: folkloristics, sociology, rare at ['demosophy']
demosophy/English/noun: DEBUG: unrecognized sense qualifier: folkloristics, sociology, rare at ['demosophy']


100%|██████████| 563/563 [04:13<00:00,  2.22it/s]

All Evidence Match: 87.21%
First Evidence Match: 84.01%
First Intro in Full Pages: 93.43%
First Full Page in Intro: 93.25%





# 9 GermanDPR: Analysis of Sentence and Document Selection on Accuracy
We assumed that a word definition can be effectively evaluated using up to three sentences. Additionally, we select the top three Wiki articles using the BM25 algorithm before passing them to the evidence selector. 

Here, we test the accuracy across combinations of selected sentences and documents, with selections ranging from one to five.

In [None]:
output_file_base_name = str(EVALUATION_DIR / "german_dpr-claim_verification/hyperparam/german_dpr-claim_verification_finetuned_soft_threshold_evd{max_evidence}_top{top_k}")

for max_evidence_count in range(1, 6):
    for top_k in range(1, 6):
        print(f"Evaluating with max evidence: {max_evidence_count}, top k sentences: {top_k}...")
        config = datasets.get('german_dpr-claim_verification')
        dataset =config['dataset']
        lang = config['lang']
        pipeline = Pipeline(translator=translator,
                            sent_connector=phi_sentence_connector,
                            claim_splitter=None,
                            evid_fetcher=offline_evid_fetcher,
                            evid_selector=pipeline_models['finetuned_soft_threshold']['evid_selector'],
                            stm_verifier=pipeline_models['finetuned_soft_threshold']['stm_verifier'],
                            lang=lang)
        outputs = evaluate_pipeline(pipeline, dataset, only_intro=True, max_evidence_count=max_evidence_count, top_k=top_k, output_file_name=output_file_base_name.format(max_evidence=max_evidence_count, top_k=top_k))

Evaluating with max evidence: 1, top k sentences: 1...


100%|██████████| 42/42 [00:46<00:00,  1.12s/it]


################################
Not in wikipedia: 29
Avg claim count: 1.0
              precision    recall  f1-score   support

           0     0.6465    0.9275    0.7619        69
           1     0.8750    0.5000    0.6364        70

    accuracy                         0.7122       139
   macro avg     0.7607    0.7138    0.6991       139
weighted avg     0.7616    0.7122    0.6987       139

################################
Evaluating with max evidence: 1, top k sentences: 2...


100%|██████████| 42/42 [00:48<00:00,  1.16s/it]


################################
Not in wikipedia: 29
Avg claim count: 1.0
              precision    recall  f1-score   support

           0     0.7647    0.9420    0.8442        69
           1     0.9259    0.7143    0.8065        70

    accuracy                         0.8273       139
   macro avg     0.8453    0.8282    0.8253       139
weighted avg     0.8459    0.8273    0.8252       139

################################
Evaluating with max evidence: 1, top k sentences: 3...


100%|██████████| 42/42 [00:46<00:00,  1.12s/it]


################################
Not in wikipedia: 29
Avg claim count: 1.0
              precision    recall  f1-score   support

           0     0.7821    0.8841    0.8299        69
           1     0.8689    0.7571    0.8092        70

    accuracy                         0.8201       139
   macro avg     0.8255    0.8206    0.8195       139
weighted avg     0.8258    0.8201    0.8195       139

################################
Evaluating with max evidence: 1, top k sentences: 4...


100%|██████████| 42/42 [00:46<00:00,  1.12s/it]


################################
Not in wikipedia: 29
Avg claim count: 1.0
              precision    recall  f1-score   support

           0     0.8000    0.8696    0.8333        69
           1     0.8594    0.7857    0.8209        70

    accuracy                         0.8273       139
   macro avg     0.8297    0.8276    0.8271       139
weighted avg     0.8299    0.8273    0.8271       139

################################
Evaluating with max evidence: 1, top k sentences: 5...


100%|██████████| 42/42 [00:46<00:00,  1.12s/it]


################################
Not in wikipedia: 29
Avg claim count: 1.0
              precision    recall  f1-score   support

           0     0.8108    0.8696    0.8392        69
           1     0.8615    0.8000    0.8296        70

    accuracy                         0.8345       139
   macro avg     0.8362    0.8348    0.8344       139
weighted avg     0.8364    0.8345    0.8344       139

################################
Evaluating with max evidence: 2, top k sentences: 1...


100%|██████████| 42/42 [00:50<00:00,  1.21s/it]


################################
Not in wikipedia: 29
Avg claim count: 1.0
              precision    recall  f1-score   support

           0     0.6809    0.9275    0.7853        69
           1     0.8889    0.5714    0.6957        70

    accuracy                         0.7482       139
   macro avg     0.7849    0.7495    0.7405       139
weighted avg     0.7856    0.7482    0.7401       139

################################
Evaluating with max evidence: 2, top k sentences: 2...


100%|██████████| 42/42 [00:50<00:00,  1.20s/it]


################################
Not in wikipedia: 29
Avg claim count: 1.0
              precision    recall  f1-score   support

           0     0.8125    0.9420    0.8725        69
           1     0.9322    0.7857    0.8527        70

    accuracy                         0.8633       139
   macro avg     0.8724    0.8639    0.8626       139
weighted avg     0.8728    0.8633    0.8625       139

################################
Evaluating with max evidence: 2, top k sentences: 3...


100%|██████████| 42/42 [00:50<00:00,  1.19s/it]


################################
Not in wikipedia: 29
Avg claim count: 1.0
              precision    recall  f1-score   support

           0     0.8333    0.8696    0.8511        69
           1     0.8657    0.8286    0.8467        70

    accuracy                         0.8489       139
   macro avg     0.8495    0.8491    0.8489       139
weighted avg     0.8496    0.8489    0.8489       139

################################
Evaluating with max evidence: 2, top k sentences: 4...


100%|██████████| 42/42 [00:50<00:00,  1.20s/it]


################################
Not in wikipedia: 29
Avg claim count: 1.0
              precision    recall  f1-score   support

           0     0.8551    0.8551    0.8551        69
           1     0.8571    0.8571    0.8571        70

    accuracy                         0.8561       139
   macro avg     0.8561    0.8561    0.8561       139
weighted avg     0.8561    0.8561    0.8561       139

################################
Evaluating with max evidence: 2, top k sentences: 5...


100%|██████████| 42/42 [00:50<00:00,  1.21s/it]


################################
Not in wikipedia: 29
Avg claim count: 1.0
              precision    recall  f1-score   support

           0     0.8788    0.8406    0.8593        69
           1     0.8493    0.8857    0.8671        70

    accuracy                         0.8633       139
   macro avg     0.8641    0.8631    0.8632       139
weighted avg     0.8639    0.8633    0.8632       139

################################
Evaluating with max evidence: 3, top k sentences: 1...


100%|██████████| 42/42 [00:53<00:00,  1.27s/it]


################################
Not in wikipedia: 29
Avg claim count: 1.0
              precision    recall  f1-score   support

           0     0.6809    0.9275    0.7853        69
           1     0.8889    0.5714    0.6957        70

    accuracy                         0.7482       139
   macro avg     0.7849    0.7495    0.7405       139
weighted avg     0.7856    0.7482    0.7401       139

################################
Evaluating with max evidence: 3, top k sentences: 2...


100%|██████████| 42/42 [00:54<00:00,  1.29s/it]


################################
Not in wikipedia: 29
Avg claim count: 1.0
              precision    recall  f1-score   support

           0     0.8228    0.9420    0.8784        69
           1     0.9333    0.8000    0.8615        70

    accuracy                         0.8705       139
   macro avg     0.8781    0.8710    0.8700       139
weighted avg     0.8785    0.8705    0.8699       139

################################
Evaluating with max evidence: 3, top k sentences: 3...


100%|██████████| 42/42 [00:53<00:00,  1.27s/it]


################################
Not in wikipedia: 29
Avg claim count: 1.0
              precision    recall  f1-score   support

           0     0.8356    0.8841    0.8592        69
           1     0.8788    0.8286    0.8529        70

    accuracy                         0.8561       139
   macro avg     0.8572    0.8563    0.8560       139
weighted avg     0.8574    0.8561    0.8560       139

################################
Evaluating with max evidence: 3, top k sentences: 4...


100%|██████████| 42/42 [00:53<00:00,  1.27s/it]


################################
Not in wikipedia: 29
Avg claim count: 1.0
              precision    recall  f1-score   support

           0     0.8451    0.8696    0.8571        69
           1     0.8676    0.8429    0.8551        70

    accuracy                         0.8561       139
   macro avg     0.8564    0.8562    0.8561       139
weighted avg     0.8564    0.8561    0.8561       139

################################
Evaluating with max evidence: 3, top k sentences: 5...


100%|██████████| 42/42 [00:52<00:00,  1.26s/it]


################################
Not in wikipedia: 29
Avg claim count: 1.0
              precision    recall  f1-score   support

           0     0.8806    0.8551    0.8676        69
           1     0.8611    0.8857    0.8732        70

    accuracy                         0.8705       139
   macro avg     0.8709    0.8704    0.8704       139
weighted avg     0.8708    0.8705    0.8705       139

################################
Evaluating with max evidence: 4, top k sentences: 1...


100%|██████████| 42/42 [00:57<00:00,  1.36s/it]


################################
Not in wikipedia: 29
Avg claim count: 1.0
              precision    recall  f1-score   support

           0     0.6774    0.9130    0.7778        69
           1     0.8696    0.5714    0.6897        70

    accuracy                         0.7410       139
   macro avg     0.7735    0.7422    0.7337       139
weighted avg     0.7742    0.7410    0.7334       139

################################
Evaluating with max evidence: 4, top k sentences: 2...


100%|██████████| 42/42 [00:55<00:00,  1.32s/it]


################################
Not in wikipedia: 29
Avg claim count: 1.0
              precision    recall  f1-score   support

           0     0.8205    0.9275    0.8707        69
           1     0.9180    0.8000    0.8550        70

    accuracy                         0.8633       139
   macro avg     0.8693    0.8638    0.8629       139
weighted avg     0.8696    0.8633    0.8628       139

################################
Evaluating with max evidence: 4, top k sentences: 3...


100%|██████████| 42/42 [00:55<00:00,  1.33s/it]


################################
Not in wikipedia: 29
Avg claim count: 1.0
              precision    recall  f1-score   support

           0     0.8356    0.8841    0.8592        69
           1     0.8788    0.8286    0.8529        70

    accuracy                         0.8561       139
   macro avg     0.8572    0.8563    0.8560       139
weighted avg     0.8574    0.8561    0.8560       139

################################
Evaluating with max evidence: 4, top k sentences: 4...


100%|██████████| 42/42 [00:56<00:00,  1.35s/it]


################################
Not in wikipedia: 29
Avg claim count: 1.0
              precision    recall  f1-score   support

           0     0.8451    0.8696    0.8571        69
           1     0.8676    0.8429    0.8551        70

    accuracy                         0.8561       139
   macro avg     0.8564    0.8562    0.8561       139
weighted avg     0.8564    0.8561    0.8561       139

################################
Evaluating with max evidence: 4, top k sentences: 5...


100%|██████████| 42/42 [00:55<00:00,  1.31s/it]


################################
Not in wikipedia: 29
Avg claim count: 1.0
              precision    recall  f1-score   support

           0     0.8806    0.8551    0.8676        69
           1     0.8611    0.8857    0.8732        70

    accuracy                         0.8705       139
   macro avg     0.8709    0.8704    0.8704       139
weighted avg     0.8708    0.8705    0.8705       139

################################
Evaluating with max evidence: 5, top k sentences: 1...


100%|██████████| 42/42 [00:58<00:00,  1.38s/it]


################################
Not in wikipedia: 29
Avg claim count: 1.0
              precision    recall  f1-score   support

           0     0.6774    0.9130    0.7778        69
           1     0.8696    0.5714    0.6897        70

    accuracy                         0.7410       139
   macro avg     0.7735    0.7422    0.7337       139
weighted avg     0.7742    0.7410    0.7334       139

################################
Evaluating with max evidence: 5, top k sentences: 2...


100%|██████████| 42/42 [00:57<00:00,  1.36s/it]


################################
Not in wikipedia: 29
Avg claim count: 1.0
              precision    recall  f1-score   support

           0     0.8205    0.9275    0.8707        69
           1     0.9180    0.8000    0.8550        70

    accuracy                         0.8633       139
   macro avg     0.8693    0.8638    0.8629       139
weighted avg     0.8696    0.8633    0.8628       139

################################
Evaluating with max evidence: 5, top k sentences: 3...


100%|██████████| 42/42 [00:58<00:00,  1.38s/it]


################################
Not in wikipedia: 29
Avg claim count: 1.0
              precision    recall  f1-score   support

           0     0.8356    0.8841    0.8592        69
           1     0.8788    0.8286    0.8529        70

    accuracy                         0.8561       139
   macro avg     0.8572    0.8563    0.8560       139
weighted avg     0.8574    0.8561    0.8560       139

################################
Evaluating with max evidence: 5, top k sentences: 4...


100%|██████████| 42/42 [00:57<00:00,  1.38s/it]


################################
Not in wikipedia: 29
Avg claim count: 1.0
              precision    recall  f1-score   support

           0     0.8451    0.8696    0.8571        69
           1     0.8676    0.8429    0.8551        70

    accuracy                         0.8561       139
   macro avg     0.8564    0.8562    0.8561       139
weighted avg     0.8564    0.8561    0.8561       139

################################
Evaluating with max evidence: 5, top k sentences: 5...


100%|██████████| 42/42 [00:57<00:00,  1.37s/it]

################################
Not in wikipedia: 29
Avg claim count: 1.0
              precision    recall  f1-score   support

           0     0.8806    0.8551    0.8676        69
           1     0.8611    0.8857    0.8732        70

    accuracy                         0.8705       139
   macro avg     0.8709    0.8704    0.8704       139
weighted avg     0.8708    0.8705    0.8705       139

################################





# 10 Sentence Ordering / Selection on Testsets
The performance of individual hyperparameters showed minimal variation. Therefore, we evaluate the impact of different sentence selection and ordering methods on our test sets by testing each possible combination. The evidence selection threshold is fixed at 0.5 throughout.

In [None]:
sentence_selection = ['mmr', 'top']
sentence_order = ['reverse', 'keep', 'top_last']
pipeline = Pipeline(translator=translator,
                    sent_connector=phi_sentence_connector,
                    claim_splitter=None,
                    evid_fetcher=offline_evid_fetcher,
                    evid_selector=pipeline_models['finetuned_soft_threshold']['evid_selector'],
                    stm_verifier=pipeline_models['finetuned_soft_threshold']['stm_verifier'],
                    lang='')

In [None]:
output_file_base_name = str(EVALUATION_DIR / "{dataset}/pipeline/{dataset}_finetuned_soft_threshold_{selection}_{order}")

for selection in sentence_selection:
    for order in sentence_order:
        print(f"Evaluating with sentence selection: {selection}, sentence order: {order}...")
        pipeline.evid_selector.set_evidence_selection(selection)
        pipeline.stm_verifier.set_premise_sent_order(order)
        for dataset_name, config in datasets.items():
            print(f"Evaluating {dataset_name}...")
            dataset = config['dataset']
            pipeline.lang = config['lang']
            evaluate_pipeline(pipeline, dataset, only_intro=True, output_file_name=output_file_base_name.format(dataset=dataset_name, selection=selection, order=order))

Evaluating with sentence selection: mmr, sentence order: reverse...
Evaluating german_dpr-claim_verification...


100%|██████████| 42/42 [00:55<00:00,  1.33s/it]


################################
Not in wikipedia: 29
Avg claim count: 1.0
              precision    recall  f1-score   support

           0     0.8289    0.9130    0.8690        69
           1     0.9048    0.8143    0.8571        70

    accuracy                         0.8633       139
   macro avg     0.8669    0.8637    0.8631       139
weighted avg     0.8671    0.8633    0.8630       139

################################
Evaluating german_wiktionary-claim_verification-mini...


 12%|█▏        | 6/50 [00:06<00:47,  1.08s/it]

Imam/German/noun: DEBUG: unrecognized sense qualifier: Twelver Shiism at ['Imam']
imam/English/noun: DEBUG: unrecognized sense qualifier: Twelver Shi'ism at ['imam']


 28%|██▊       | 14/50 [00:15<00:35,  1.01it/s]

unterbinden/German/verb: DEBUG: unrecognized sense qualifier: transitive or dative reflexive, dated or regional at ['unterbinden']


 32%|███▏      | 16/50 [00:17<00:33,  1.01it/s]

Tüte/German/proper noun: DEBUG: unrecognized sense qualifier: Bielefeld, colloquial at ['Tüte']


 34%|███▍      | 17/50 [00:17<00:29,  1.13it/s]

request/English/verb: DEBUG: unrecognized sense qualifier: transitive or with a subjunctive clause at ['request']


 64%|██████▍   | 32/50 [00:34<00:23,  1.28s/it]

Verbesserung/German/noun: DEBUG: unrecognized sense qualifier: homework at ['Verbesserung']


 68%|██████▊   | 34/50 [00:39<00:36,  2.25s/it]

unterbinden/German/verb: DEBUG: unrecognized sense qualifier: transitive or dative reflexive, dated or regional at ['unterbinden']


 88%|████████▊ | 44/50 [00:52<00:07,  1.17s/it]

Imam/German/noun: DEBUG: unrecognized sense qualifier: Twelver Shiism at ['Imam']
imam/English/noun: DEBUG: unrecognized sense qualifier: Twelver Shi'ism at ['imam']


 98%|█████████▊| 49/50 [00:55<00:00,  1.18it/s]

specific/English/adjective: DEBUG: unrecognized sense qualifier: bioscience, taxonomy at ['specific']


100%|██████████| 50/50 [00:56<00:00,  1.13s/it]


################################
Not in wikipedia: 40
Avg claim count: 1.0
              precision    recall  f1-score   support

           0     0.7059    0.8889    0.7869        81
           1     0.8448    0.6203    0.7153        79

    accuracy                         0.7562       160
   macro avg     0.7754    0.7546    0.7511       160
weighted avg     0.7745    0.7562    0.7516       160

################################
Evaluating squad-claim_verification...


100%|██████████| 40/40 [00:39<00:00,  1.01it/s]


################################
Not in wikipedia: 32
Avg claim count: 1.0
              precision    recall  f1-score   support

           0     0.7887    0.8889    0.8358        63
           1     0.8727    0.7619    0.8136        63

    accuracy                         0.8254       126
   macro avg     0.8307    0.8254    0.8247       126
weighted avg     0.8307    0.8254    0.8247       126

################################
Evaluating shroom-claim_verification...


  5%|▍         | 7/141 [00:04<01:11,  1.87it/s]

piss/English/noun: DEBUG: gloss may contain unhandled list items: 1999, Tin House #2 (→ISBN, Win McCormack, Rob Spillman, Elissa Schappell), page 170: at ['piss']


 58%|█████▊    | 82/141 [01:02<00:37,  1.57it/s]

demosophy/English/noun: DEBUG: unrecognized sense qualifier: folkloristics, sociology, rare at ['demosophy']


100%|██████████| 141/141 [01:49<00:00,  1.28it/s]


################################
Not in wikipedia: 21
Avg claim count: 1.0
              precision    recall  f1-score   support

           0     0.6887    0.5252    0.5959       278
           1     0.6000    0.7500    0.6667       264

    accuracy                         0.6347       542
   macro avg     0.6443    0.6376    0.6313       542
weighted avg     0.6455    0.6347    0.6304       542

################################
Evaluating with sentence selection: mmr, sentence order: keep...
Evaluating german_dpr-claim_verification...


100%|██████████| 42/42 [00:53<00:00,  1.28s/it]


################################
Not in wikipedia: 29
Avg claim count: 1.0
              precision    recall  f1-score   support

           0     0.8108    0.8696    0.8392        69
           1     0.8615    0.8000    0.8296        70

    accuracy                         0.8345       139
   macro avg     0.8362    0.8348    0.8344       139
weighted avg     0.8364    0.8345    0.8344       139

################################
Evaluating german_wiktionary-claim_verification-mini...


 12%|█▏        | 6/50 [00:08<01:09,  1.59s/it]

Imam/German/noun: DEBUG: unrecognized sense qualifier: Twelver Shiism at ['Imam']
imam/English/noun: DEBUG: unrecognized sense qualifier: Twelver Shi'ism at ['imam']


 28%|██▊       | 14/50 [00:15<00:26,  1.37it/s]

unterbinden/German/verb: DEBUG: unrecognized sense qualifier: transitive or dative reflexive, dated or regional at ['unterbinden']


 32%|███▏      | 16/50 [00:17<00:29,  1.14it/s]

Tüte/German/proper noun: DEBUG: unrecognized sense qualifier: Bielefeld, colloquial at ['Tüte']


 34%|███▍      | 17/50 [00:17<00:26,  1.24it/s]

request/English/verb: DEBUG: unrecognized sense qualifier: transitive or with a subjunctive clause at ['request']


 64%|██████▍   | 32/50 [00:34<00:19,  1.07s/it]

Verbesserung/German/noun: DEBUG: unrecognized sense qualifier: homework at ['Verbesserung']


 68%|██████▊   | 34/50 [00:41<00:42,  2.68s/it]

unterbinden/German/verb: DEBUG: unrecognized sense qualifier: transitive or dative reflexive, dated or regional at ['unterbinden']


 88%|████████▊ | 44/50 [00:52<00:06,  1.05s/it]

Imam/German/noun: DEBUG: unrecognized sense qualifier: Twelver Shiism at ['Imam']
imam/English/noun: DEBUG: unrecognized sense qualifier: Twelver Shi'ism at ['imam']


 98%|█████████▊| 49/50 [00:57<00:01,  1.11s/it]

specific/English/adjective: DEBUG: unrecognized sense qualifier: bioscience, taxonomy at ['specific']


100%|██████████| 50/50 [00:58<00:00,  1.17s/it]


################################
Not in wikipedia: 40
Avg claim count: 1.0
              precision    recall  f1-score   support

           0     0.7087    0.9012    0.7935        81
           1     0.8596    0.6203    0.7206        79

    accuracy                         0.7625       160
   macro avg     0.7842    0.7607    0.7570       160
weighted avg     0.7833    0.7625    0.7575       160

################################
Evaluating squad-claim_verification...


100%|██████████| 40/40 [00:39<00:00,  1.02it/s]


################################
Not in wikipedia: 32
Avg claim count: 1.0
              precision    recall  f1-score   support

           0     0.7808    0.9048    0.8382        63
           1     0.8868    0.7460    0.8103        63

    accuracy                         0.8254       126
   macro avg     0.8338    0.8254    0.8243       126
weighted avg     0.8338    0.8254    0.8243       126

################################
Evaluating shroom-claim_verification...


  5%|▍         | 7/141 [00:03<01:03,  2.12it/s]

piss/English/noun: DEBUG: gloss may contain unhandled list items: 1999, Tin House #2 (→ISBN, Win McCormack, Rob Spillman, Elissa Schappell), page 170: at ['piss']


 58%|█████▊    | 82/141 [01:02<00:53,  1.09it/s]

demosophy/English/noun: DEBUG: unrecognized sense qualifier: folkloristics, sociology, rare at ['demosophy']


100%|██████████| 141/141 [01:48<00:00,  1.30it/s]


################################
Not in wikipedia: 21
Avg claim count: 1.0
              precision    recall  f1-score   support

           0     0.6991    0.5432    0.6113       278
           1     0.6104    0.7538    0.6746       264

    accuracy                         0.6458       542
   macro avg     0.6548    0.6485    0.6430       542
weighted avg     0.6559    0.6458    0.6421       542

################################
Evaluating with sentence selection: mmr, sentence order: top_last...
Evaluating german_dpr-claim_verification...


100%|██████████| 42/42 [00:54<00:00,  1.29s/it]


################################
Not in wikipedia: 29
Avg claim count: 1.0
              precision    recall  f1-score   support

           0     0.8158    0.8986    0.8552        69
           1     0.8889    0.8000    0.8421        70

    accuracy                         0.8489       139
   macro avg     0.8523    0.8493    0.8486       139
weighted avg     0.8526    0.8489    0.8486       139

################################
Evaluating german_wiktionary-claim_verification-mini...


 12%|█▏        | 6/50 [00:08<00:54,  1.24s/it]

Imam/German/noun: DEBUG: unrecognized sense qualifier: Twelver Shiism at ['Imam']
imam/English/noun: DEBUG: unrecognized sense qualifier: Twelver Shi'ism at ['imam']


 28%|██▊       | 14/50 [00:14<00:25,  1.41it/s]

unterbinden/German/verb: DEBUG: unrecognized sense qualifier: transitive or dative reflexive, dated or regional at ['unterbinden']


 32%|███▏      | 16/50 [00:17<00:35,  1.05s/it]

Tüte/German/proper noun: DEBUG: unrecognized sense qualifier: Bielefeld, colloquial at ['Tüte']


 34%|███▍      | 17/50 [00:18<00:34,  1.04s/it]

request/English/verb: DEBUG: unrecognized sense qualifier: transitive or with a subjunctive clause at ['request']


 64%|██████▍   | 32/50 [00:34<00:22,  1.24s/it]

Verbesserung/German/noun: DEBUG: unrecognized sense qualifier: homework at ['Verbesserung']


 68%|██████▊   | 34/50 [00:42<00:44,  2.81s/it]

unterbinden/German/verb: DEBUG: unrecognized sense qualifier: transitive or dative reflexive, dated or regional at ['unterbinden']


 88%|████████▊ | 44/50 [00:52<00:07,  1.26s/it]

Imam/German/noun: DEBUG: unrecognized sense qualifier: Twelver Shiism at ['Imam']
imam/English/noun: DEBUG: unrecognized sense qualifier: Twelver Shi'ism at ['imam']


 98%|█████████▊| 49/50 [00:58<00:01,  1.09s/it]

specific/English/adjective: DEBUG: unrecognized sense qualifier: bioscience, taxonomy at ['specific']


100%|██████████| 50/50 [00:58<00:00,  1.18s/it]


################################
Not in wikipedia: 40
Avg claim count: 1.0
              precision    recall  f1-score   support

           0     0.7100    0.8765    0.7845        81
           1     0.8333    0.6329    0.7194        79

    accuracy                         0.7562       160
   macro avg     0.7717    0.7547    0.7520       160
weighted avg     0.7709    0.7562    0.7524       160

################################
Evaluating squad-claim_verification...


100%|██████████| 40/40 [00:38<00:00,  1.04it/s]


################################
Not in wikipedia: 32
Avg claim count: 1.0
              precision    recall  f1-score   support

           0     0.7887    0.8889    0.8358        63
           1     0.8727    0.7619    0.8136        63

    accuracy                         0.8254       126
   macro avg     0.8307    0.8254    0.8247       126
weighted avg     0.8307    0.8254    0.8247       126

################################
Evaluating shroom-claim_verification...


  5%|▍         | 7/141 [00:03<01:02,  2.14it/s]

piss/English/noun: DEBUG: gloss may contain unhandled list items: 1999, Tin House #2 (→ISBN, Win McCormack, Rob Spillman, Elissa Schappell), page 170: at ['piss']


 58%|█████▊    | 82/141 [01:04<00:51,  1.14it/s]

demosophy/English/noun: DEBUG: unrecognized sense qualifier: folkloristics, sociology, rare at ['demosophy']


100%|██████████| 141/141 [01:50<00:00,  1.28it/s]


################################
Not in wikipedia: 21
Avg claim count: 1.0
              precision    recall  f1-score   support

           0     0.6938    0.5216    0.5955       278
           1     0.6006    0.7576    0.6700       264

    accuracy                         0.6365       542
   macro avg     0.6472    0.6396    0.6327       542
weighted avg     0.6484    0.6365    0.6318       542

################################
Evaluating with sentence selection: top, sentence order: reverse...
Evaluating german_dpr-claim_verification...


100%|██████████| 42/42 [00:54<00:00,  1.30s/it]


################################
Not in wikipedia: 29
Avg claim count: 1.0
              precision    recall  f1-score   support

           0     0.8714    0.8841    0.8777        69
           1     0.8841    0.8714    0.8777        70

    accuracy                         0.8777       139
   macro avg     0.8777    0.8777    0.8777       139
weighted avg     0.8778    0.8777    0.8777       139

################################
Evaluating german_wiktionary-claim_verification-mini...


 12%|█▏        | 6/50 [00:06<00:47,  1.08s/it]

Imam/German/noun: DEBUG: unrecognized sense qualifier: Twelver Shiism at ['Imam']
imam/English/noun: DEBUG: unrecognized sense qualifier: Twelver Shi'ism at ['imam']


 28%|██▊       | 14/50 [00:14<00:34,  1.05it/s]

unterbinden/German/verb: DEBUG: unrecognized sense qualifier: transitive or dative reflexive, dated or regional at ['unterbinden']


 32%|███▏      | 16/50 [00:17<00:43,  1.29s/it]

Tüte/German/proper noun: DEBUG: unrecognized sense qualifier: Bielefeld, colloquial at ['Tüte']


 34%|███▍      | 17/50 [00:18<00:37,  1.14s/it]

request/English/verb: DEBUG: unrecognized sense qualifier: transitive or with a subjunctive clause at ['request']


 64%|██████▍   | 32/50 [00:34<00:27,  1.54s/it]

Verbesserung/German/noun: DEBUG: unrecognized sense qualifier: homework at ['Verbesserung']


 68%|██████▊   | 34/50 [00:40<00:38,  2.40s/it]

unterbinden/German/verb: DEBUG: unrecognized sense qualifier: transitive or dative reflexive, dated or regional at ['unterbinden']


 88%|████████▊ | 44/50 [00:52<00:08,  1.42s/it]

Imam/German/noun: DEBUG: unrecognized sense qualifier: Twelver Shiism at ['Imam']
imam/English/noun: DEBUG: unrecognized sense qualifier: Twelver Shi'ism at ['imam']


 98%|█████████▊| 49/50 [00:56<00:00,  1.13it/s]

specific/English/adjective: DEBUG: unrecognized sense qualifier: bioscience, taxonomy at ['specific']


100%|██████████| 50/50 [00:57<00:00,  1.15s/it]


################################
Not in wikipedia: 40
Avg claim count: 1.0
              precision    recall  f1-score   support

           0     0.7245    0.8765    0.7933        81
           1     0.8387    0.6582    0.7376        79

    accuracy                         0.7688       160
   macro avg     0.7816    0.7674    0.7654       160
weighted avg     0.7809    0.7688    0.7658       160

################################
Evaluating squad-claim_verification...


100%|██████████| 40/40 [00:39<00:00,  1.02it/s]


################################
Not in wikipedia: 32
Avg claim count: 1.0
              precision    recall  f1-score   support

           0     0.8000    0.8889    0.8421        63
           1     0.8750    0.7778    0.8235        63

    accuracy                         0.8333       126
   macro avg     0.8375    0.8333    0.8328       126
weighted avg     0.8375    0.8333    0.8328       126

################################
Evaluating shroom-claim_verification...


  5%|▍         | 7/141 [00:05<01:39,  1.35it/s]

piss/English/noun: DEBUG: gloss may contain unhandled list items: 1999, Tin House #2 (→ISBN, Win McCormack, Rob Spillman, Elissa Schappell), page 170: at ['piss']


 58%|█████▊    | 82/141 [01:04<00:38,  1.53it/s]

demosophy/English/noun: DEBUG: unrecognized sense qualifier: folkloristics, sociology, rare at ['demosophy']


100%|██████████| 141/141 [01:53<00:00,  1.25it/s]


################################
Not in wikipedia: 21
Avg claim count: 1.0
              precision    recall  f1-score   support

           0     0.7056    0.5432    0.6138       278
           1     0.6128    0.7614    0.6791       264

    accuracy                         0.6494       542
   macro avg     0.6592    0.6523    0.6464       542
weighted avg     0.6604    0.6494    0.6456       542

################################
Evaluating with sentence selection: top, sentence order: keep...
Evaluating german_dpr-claim_verification...


100%|██████████| 42/42 [00:54<00:00,  1.30s/it]


################################
Not in wikipedia: 29
Avg claim count: 1.0
              precision    recall  f1-score   support

           0     0.8026    0.8841    0.8414        69
           1     0.8730    0.7857    0.8271        70

    accuracy                         0.8345       139
   macro avg     0.8378    0.8349    0.8342       139
weighted avg     0.8381    0.8345    0.8342       139

################################
Evaluating german_wiktionary-claim_verification-mini...


 12%|█▏        | 6/50 [00:06<00:47,  1.07s/it]

Imam/German/noun: DEBUG: unrecognized sense qualifier: Twelver Shiism at ['Imam']
imam/English/noun: DEBUG: unrecognized sense qualifier: Twelver Shi'ism at ['imam']


 28%|██▊       | 14/50 [00:15<00:31,  1.14it/s]

unterbinden/German/verb: DEBUG: unrecognized sense qualifier: transitive or dative reflexive, dated or regional at ['unterbinden']


 32%|███▏      | 16/50 [00:17<00:32,  1.05it/s]

Tüte/German/proper noun: DEBUG: unrecognized sense qualifier: Bielefeld, colloquial at ['Tüte']


 34%|███▍      | 17/50 [00:17<00:28,  1.17it/s]

request/English/verb: DEBUG: unrecognized sense qualifier: transitive or with a subjunctive clause at ['request']


 64%|██████▍   | 32/50 [00:34<00:21,  1.19s/it]

Verbesserung/German/noun: DEBUG: unrecognized sense qualifier: homework at ['Verbesserung']


 68%|██████▊   | 34/50 [00:40<00:35,  2.24s/it]

unterbinden/German/verb: DEBUG: unrecognized sense qualifier: transitive or dative reflexive, dated or regional at ['unterbinden']


 88%|████████▊ | 44/50 [00:52<00:06,  1.10s/it]

Imam/German/noun: DEBUG: unrecognized sense qualifier: Twelver Shiism at ['Imam']
imam/English/noun: DEBUG: unrecognized sense qualifier: Twelver Shi'ism at ['imam']


 98%|█████████▊| 49/50 [00:56<00:00,  1.21it/s]

specific/English/adjective: DEBUG: unrecognized sense qualifier: bioscience, taxonomy at ['specific']


100%|██████████| 50/50 [00:57<00:00,  1.14s/it]


################################
Not in wikipedia: 40
Avg claim count: 1.0
              precision    recall  f1-score   support

           0     0.7048    0.9136    0.7957        81
           1     0.8727    0.6076    0.7164        79

    accuracy                         0.7625       160
   macro avg     0.7887    0.7606    0.7561       160
weighted avg     0.7877    0.7625    0.7566       160

################################
Evaluating squad-claim_verification...


100%|██████████| 40/40 [00:39<00:00,  1.00it/s]


################################
Not in wikipedia: 32
Avg claim count: 1.0
              precision    recall  f1-score   support

           0     0.7703    0.9048    0.8321        63
           1     0.8846    0.7302    0.8000        63

    accuracy                         0.8175       126
   macro avg     0.8274    0.8175    0.8161       126
weighted avg     0.8274    0.8175    0.8161       126

################################
Evaluating shroom-claim_verification...


  5%|▍         | 7/141 [00:05<01:13,  1.82it/s]

piss/English/noun: DEBUG: gloss may contain unhandled list items: 1999, Tin House #2 (→ISBN, Win McCormack, Rob Spillman, Elissa Schappell), page 170: at ['piss']


 58%|█████▊    | 82/141 [01:02<00:37,  1.58it/s]

demosophy/English/noun: DEBUG: unrecognized sense qualifier: folkloristics, sociology, rare at ['demosophy']


100%|██████████| 141/141 [01:49<00:00,  1.28it/s]


################################
Not in wikipedia: 21
Avg claim count: 1.0
              precision    recall  f1-score   support

           0     0.6944    0.5396    0.6073       278
           1     0.6074    0.7500    0.6712       264

    accuracy                         0.6421       542
   macro avg     0.6509    0.6448    0.6392       542
weighted avg     0.6520    0.6421    0.6384       542

################################
Evaluating with sentence selection: top, sentence order: top_last...
Evaluating german_dpr-claim_verification...


100%|██████████| 42/42 [00:54<00:00,  1.29s/it]


################################
Not in wikipedia: 29
Avg claim count: 1.0
              precision    recall  f1-score   support

           0     0.8356    0.8841    0.8592        69
           1     0.8788    0.8286    0.8529        70

    accuracy                         0.8561       139
   macro avg     0.8572    0.8563    0.8560       139
weighted avg     0.8574    0.8561    0.8560       139

################################
Evaluating german_wiktionary-claim_verification-mini...


 12%|█▏        | 6/50 [00:06<00:45,  1.04s/it]

Imam/German/noun: DEBUG: unrecognized sense qualifier: Twelver Shiism at ['Imam']
imam/English/noun: DEBUG: unrecognized sense qualifier: Twelver Shi'ism at ['imam']


 28%|██▊       | 14/50 [00:15<00:30,  1.18it/s]

unterbinden/German/verb: DEBUG: unrecognized sense qualifier: transitive or dative reflexive, dated or regional at ['unterbinden']


 32%|███▏      | 16/50 [00:17<00:31,  1.09it/s]

Tüte/German/proper noun: DEBUG: unrecognized sense qualifier: Bielefeld, colloquial at ['Tüte']


 34%|███▍      | 17/50 [00:17<00:27,  1.20it/s]

request/English/verb: DEBUG: unrecognized sense qualifier: transitive or with a subjunctive clause at ['request']


 64%|██████▍   | 32/50 [00:34<00:21,  1.20s/it]

Verbesserung/German/noun: DEBUG: unrecognized sense qualifier: homework at ['Verbesserung']


 68%|██████▊   | 34/50 [00:39<00:35,  2.19s/it]

unterbinden/German/verb: DEBUG: unrecognized sense qualifier: transitive or dative reflexive, dated or regional at ['unterbinden']


 88%|████████▊ | 44/50 [00:52<00:06,  1.14s/it]

Imam/German/noun: DEBUG: unrecognized sense qualifier: Twelver Shiism at ['Imam']
imam/English/noun: DEBUG: unrecognized sense qualifier: Twelver Shi'ism at ['imam']


 98%|█████████▊| 49/50 [00:56<00:00,  1.19it/s]

specific/English/adjective: DEBUG: unrecognized sense qualifier: bioscience, taxonomy at ['specific']


100%|██████████| 50/50 [00:56<00:00,  1.13s/it]


################################
Not in wikipedia: 40
Avg claim count: 1.0
              precision    recall  f1-score   support

           0     0.7200    0.8889    0.7956        81
           1     0.8500    0.6456    0.7338        79

    accuracy                         0.7688       160
   macro avg     0.7850    0.7672    0.7647       160
weighted avg     0.7842    0.7688    0.7651       160

################################
Evaluating squad-claim_verification...


100%|██████████| 40/40 [00:39<00:00,  1.01it/s]


################################
Not in wikipedia: 32
Avg claim count: 1.0
              precision    recall  f1-score   support

           0     0.8088    0.8730    0.8397        63
           1     0.8621    0.7937    0.8264        63

    accuracy                         0.8333       126
   macro avg     0.8354    0.8333    0.8331       126
weighted avg     0.8354    0.8333    0.8331       126

################################
Evaluating shroom-claim_verification...


  5%|▍         | 7/141 [00:05<01:15,  1.77it/s]

piss/English/noun: DEBUG: gloss may contain unhandled list items: 1999, Tin House #2 (→ISBN, Win McCormack, Rob Spillman, Elissa Schappell), page 170: at ['piss']


 58%|█████▊    | 82/141 [01:02<00:37,  1.57it/s]

demosophy/English/noun: DEBUG: unrecognized sense qualifier: folkloristics, sociology, rare at ['demosophy']


100%|██████████| 141/141 [01:50<00:00,  1.28it/s]

################################
Not in wikipedia: 21
Avg claim count: 1.0
              precision    recall  f1-score   support

           0     0.7028    0.5360    0.6082       278
           1     0.6091    0.7614    0.6768       264

    accuracy                         0.6458       542
   macro avg     0.6560    0.6487    0.6425       542
weighted avg     0.6572    0.6458    0.6416       542

################################



