# Fact Checking
A simple example on how to use the vector database for a fact-checking system with Wikidata statements

In [23]:
import requests

HEADERS = {
    'User-Agent': 'Fact-Checker/1.0 (embeddings@wikimedia.de)'
}
LANG = 'en'
INCLUDE_EXTERNAL_IDS = False

# Define the claim to be checked
claim = 'Albert Einstein was a theoretical physicist who developed the theory of relativity.'

### Get from the vector database the Wikidata items and properties that are relevant to the query

In [37]:
# Get relevant Wikidata items
items = requests.get(
    'https://wd-vectordb.wmcloud.org/item/query',
    params={'query': claim, 'lang': LANG},
    headers=HEADERS,
)
items = items.json()

# Get relevant Wikidata properties
properties = requests.get(
    'https://wd-vectordb.wmcloud.org/property/query',
    params={'query': claim, 'lang': LANG, 'exclude_external_ids': True},
    headers=HEADERS,
)
properties = properties.json()

### Get all statements of each item

In [None]:
def get_statements(qid, pids):
    params = {
        'id': qid,
        'pids': pids,
        'external_ids': INCLUDE_EXTERNAL_IDS,
        'format': 'json'
    }

    url = "https://wd-textify.toolforge.org"
    results = requests.get(url, params=params, headers=HEADERS)
    results.raise_for_status()

    text = results.json()
    return text

qids = [q['QID'] for q in items]
pids = [p['PID'] for p in properties]
items_info = get_statements(','.join(qids), ','.join(pids))

for i in range(len(items)):
    items[i]['label'] = items_info[items[i]['QID']]['label']
    items[i]['claims'] = items_info[items[i]['QID']]['claims']

### Sort statements by vector similarity

In [None]:
result_statements = []
for item in items:
    for property in properties:
        for statement in item['claims']:
            if property['PID'] == statement['PID']:
                result_statements.append({
                    'statement': {
                        **statement,
                        'QID': item['QID'],
                        'item_label': item['label'],
                    },
                    'similarity_score': item['similarity_score'] * property['similarity_score']
                })

# Sort by similarity score
result_statements = sorted(result_statements, key=lambda x: x['similarity_score'], reverse=True)

### Prepare NLI model for textual entailment detection

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device = torch.device("cpu")

model_name = "MoritzLaurer/DeBERTa-v3-large-mnli-fever-anli-ling-wanli"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
label_names = ["entailment", "neutral", "contradiction"]

def predict_entailment(claim, wd_fact):
    label_names = ["entailment", "neutral", "contradiction"]
    input = tokenizer(wd_fact, claim, truncation=True, return_tensors="pt")
    output = model(input["input_ids"].to(device))
    prediction = torch.softmax(output["logits"][0], -1).tolist()
    prediction = {name: round(float(pred) * 100, 1) for pred, name in zip(prediction, label_names)}
    return prediction

def predict_entailment_bulk(claim, wd_facts, batch_size=8):
    label_names = ["entailment", "neutral", "contradiction"]
    results = []
    with torch.no_grad():
        for i in range(0, len(wd_facts), batch_size):
            batch_premises = wd_facts[i:i + batch_size]

            # Premise varies, hypothesis (claim) is the same
            enc = tokenizer(
                batch_premises,
                [claim] * len(batch_premises),
                truncation=True,
                padding=True,
                return_tensors="pt",
            )

            enc = {k: v.to(device) for k, v in enc.items()}

            out = model(**enc)
            probs = torch.softmax(out.logits, dim=-1).cpu()

            for row in probs:
                row_dict = {
                    name: round(float(p) * 100, 1)
                    for name, p in zip(label_names, row)
                }
                results.append(row_dict)

    return results

### Prepare hypothesis from Wikidata statements

In [None]:
def value_to_string(value):
    if isinstance(value, str):
        return value
    if 'string' in value:
        return value['string']
    elif 'label' in value:
        return value['label']
    elif 'time' in value:
        return value['time']
    return str(value)

def prepare_hypothesis(result):
    hypothesis = ""
    for statement in result['values']:
        hypothesis += f"{result['item_label']}: {result['property_label']}: {value_to_string(statement['value'])}"
        if 'qualifiers' in statement:
            for qualifier in statement['qualifiers']:
                values = ', '.join([
                    value_to_string(v['value']) for v in qualifier['values']
                ])
                hypothesis += f" | {qualifier['property_label']}: {values}"
        hypothesis += "\n"
    return hypothesis.strip()

for i in range(len(result_statements)):
    result_statements[i]['hypothesis'] = prepare_hypothesis(result_statements[i]['statement'])

### Predict Entailment per Wikidata statement

In [None]:
predictions = predict_entailment_bulk(
    claim,
    [result_statements[i]['hypothesis'] for i in range(len(result_statements))],
    batch_size=8
)
for i in range(len(result_statements)):
    result_statements[i]['entailment'] = predictions[i]

# Sort by similarity score
result_statements = sorted(result_statements, key=lambda x: x['entailment']['neutral'])

In [None]:
print(result_statements[0]['hypothesis'])
print(result_statements[0]['entailment'])

In [None]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import requests

class FactChecker:
    def __init__(self, include_external_ids=False, lang='en', device='cpu'):
        self.include_external_ids = include_external_ids
        self.lang = lang
        self.headers = {
            'User-Agent': 'Fact-Checker/1.0 (embeddings@wikimedia.de)'
        }

        model_name = "MoritzLaurer/DeBERTa-v3-large-mnli-fever-anli-ling-wanli"

        self.device = torch.device(device)
        self.nli_labels = ["entailment", "neutral", "contradiction"]
        self.nli_tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.nli_model = AutoModelForSequenceClassification.from_pretrained(model_name)

    def check_claim(self, claim, verbose=False):
        if verbose:
            print(f"Vector Search...")
        items, properties = self.vector_search(claim)

        if verbose:
            print(f"Getting Statements Text...")
        statements = self.prepare_statements(items, properties)

        if verbose:
            print(f"Found {len(statements)} statements")

        for i in range(len(statements)):
            statements[i]['hypothesis'] = self.statement_to_string(
                statements[i]['statement']
            )

        if verbose:
            print(f"Predicting Entailment...")
        predictions = self.predict_entailment(
            claim,
            [statements[i]['hypothesis'] for i in range(len(statements))],
            batch_size=8
        )
        for i in range(len(statements)):
            statements[i]['entailment'] = predictions[i]

        # Sort by neutrality
        statements = sorted(statements, key=lambda x: x['entailment']['neutral'])

        return statements

    def vector_search(self, claim):
        # Get relevant Wikidata items
        items = requests.get(
            'https://wd-vectordb.wmcloud.org/item/query',
            params={'query': claim, 'lang': self.lang},
            headers=self.headers,
        )
        items = items.json()

        # Get relevant Wikidata properties
        properties = requests.get(
            'https://wd-vectordb.wmcloud.org/property/query',
            params={'query': claim, 'lang': self.lang, 'exclude_external_ids': True},
            headers=self.headers,
        )
        properties = properties.json()

        return items, properties

    def get_item_statements(self, qids, pids):
        params = {
            'id': qids,
            'pids': pids,
            'external_ids': self.include_external_ids,
            'format': 'json'
        }

        url = "https://wd-textify.toolforge.org"
        results = requests.get(url, params=params, headers=self.headers)
        results.raise_for_status()

        text = results.json()
        return text

    def statement_to_string(self, statement):
        hypothesis = f"{statement['item_label']}: {statement['property_label']}: "
        for svalue in statement['values']:
            hypothesis += f"{self.statement_value_to_string(svalue['value'])}"

            if 'qualifiers' in svalue:
                for qualifier in svalue['qualifiers']:
                    values = ', '.join([
                        self.statement_value_to_string(v['value']) for v in qualifier['values']
                    ])
                    hypothesis += f" ({qualifier['property_label']}: {values})"

            hypothesis += ", "
        return hypothesis.strip().rstrip(',')

    def statement_value_to_string(self, value):
        if isinstance(value, str):
            return value
        if 'string' in value:
            return value['string']
        elif 'label' in value:
            return value['label']
        elif 'time' in value:
            return value['time']
        return str(value)

    def prepare_statements(self, items, properties):
        qids = [q['QID'] for q in items]
        pids = [p['PID'] for p in properties]
        items_info = self.get_item_statements(','.join(qids), ','.join(pids))

        for i in range(len(items)):
            items[i]['label'] = items_info[items[i]['QID']]['label']
            items[i]['claims'] = items_info[items[i]['QID']]['claims']

        result_statements = []
        for item in items:
            for property in properties:
                for statement in item['claims']:
                    if property['PID'] == statement['PID']:
                        result_statements.append({
                            'statement': {
                                **statement,
                                'QID': item['QID'],
                                'item_label': item['label'],
                            },
                            'similarity_score': item['similarity_score'] * property['similarity_score']
                        })

        # Sort by similarity score
        result_statements = sorted(result_statements, key=lambda x: x['similarity_score'], reverse=True)
        return result_statements

    def predict_entailment(self, claim, premises, batch_size = 8):
        results = []

        self.nli_model.eval()
        with torch.no_grad():
            for i in range(0, len(premises), batch_size):
                batch_premises = premises[i:i + batch_size]

                enc = self.nli_tokenizer(
                    batch_premises,
                    [claim] * len(batch_premises),
                    truncation=True,
                    padding=True,
                    return_tensors="pt",
                )

                enc = {k: v.to(self.device) for k, v in enc.items()}

                out = self.nli_model(**enc)
                probs = torch.softmax(out.logits, dim=-1).cpu()

                for row in probs:
                    row_dict = {
                        name: round(float(p) * 100, 1)
                        for name, p in zip(self.nli_labels, row)
                    }
                    results.append(row_dict)

        return results

In [4]:
claim = 'Elon Musk is the founder of Tesla, Inc.'

fact_checker = FactChecker(include_external_ids=False, lang='en', device='cpu')
results = fact_checker.check_claim(claim)

for result in results:
    print(result['hypothesis'])
    print(result['entailment'])
    print()



['Q478214', 'Q317521', 'Q4920135', 'Q20718232', 'Q46845259', 'Q86349924', 'Q107749604', 'Q17085152', 'Q40008974', 'Q16900721', 'Q78163092', 'Q28221306', 'Q109334610', 'Q115619373', 'Q75327597', 'Q54959143', 'Q55642234', 'Q156238', 'Q58811108', 'Q4645801', 'Q5367351', 'Q80088527', 'Q59773555', 'Q30873', 'Q111043538', 'Q107394758', 'Q20687182', 'Q5357530', 'Q105966435', 'Q98356352', 'Q54173', 'Q20529164', 'Q123565578', 'Q57060073', 'Q42417740', 'Q7875985', 'Q39328518', 'Q1960330', 'Q17068357', 'Q97286528', 'Q16993862', 'Q753571', 'Q1326458', 'Q18891264', 'Q4645564', 'Q5374879', 'Q12482376', 'Q5357420', 'Q1326392', 'Q6883039']
['P618', 'P112', 'P169', 'P8324', 'P12642', 'P1079', 'P127', 'P4140', 'P8340', 'P7936', 'P2010', 'P2137', 'P1951', 'P3320', 'P2200', 'P622', 'P2009', 'P3362', 'P375', 'P2226', 'P12621', 'P516', 'P2228', 'P61', 'P1789', 'P1056', 'P2403', 'P1071', 'P450', 'P930', 'P2295', 'P2160', 'P2201', 'P2436', 'P2791', 'P2139', 'P1830', 'P859', 'P6589', 'P619', 'P12221', 'P4519',

In [3]:
import json

with open('trex-dataset-sample.json', 'r') as f:
    eval_data = json.load(f)

eval_data[0]['text']

'The Austroasiatic languages, in recent classifications synonymous with Mon–Khmer, are a large language family of continental Southeast Asia, also scattered throughout India, Bangladesh, Nepal and the southern border of China. The name Austroasiatic comes from the Latin words for "south" and "Asia", hence "South Asia". Of these languages, only Vietnamese, Khmer, and Mon have a long-established recorded history, and only Vietnamese and Khmer have official status (in Vietnam and Cambodia, respectively). The rest of the languages are spoken by minority groups. Ethnologue identifies 168 Austroasiatic languages. These form thirteen established families (plus perhaps Shompen, which is poorly attested, as a fourteenth), which have traditionally been grouped into two, as Mon–Khmer and Munda. However, one recent classification posits three groups (Munda, Nuclear Mon-Khmer and Khasi-Khmuic) while another has abandoned Mon–Khmer as a taxon altogether, making it synonymous with the larger family. 

In [None]:
from collections import defaultdict
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk import pos_tag

nltk.download("punkt")
nltk.download("averaged_perceptron_tagger")

def is_pronoun(text):
    if not text:
        return False
    tokens = word_tokenize(text)
    tags = pos_tag(tokens)
    return any(tag in {"PRP", "PRP$", "WP", "WP$"} for _, tag in tags)

def map_triples_grouped_by_sentence(entry):
    text = entry["text"]
    sentences = sent_tokenize(text)

    # Compute sentence spans (char indices)
    spans = []
    start = 0
    for sent in sentences:
        end = start + len(sent)
        spans.append((start, end))
        start = end + 1  # assume 1 char separator

    # Dict to collect all triples per sentence
    sent_to_triples = defaultdict(list)

    for triple in entry["triples"]:
        subj_id = triple["subject"]["uri"].split("/")[-1]
        pred_id = triple["predicate"]["uri"].split("/")[-1]
        obj_id = triple["object"]["uri"].split("/")[-1]

        s_pos = triple["subject"].get("boundaries") or []
        p_pos = triple["predicate"].get("boundaries") or []
        o_pos = triple["object"].get("boundaries") or []

        # Extract surface strings to filter pronouns
        if s_pos:
            subj_text = text[s_pos[0]:s_pos[1]]
            if is_pronoun(subj_text):
                continue
        if o_pos:
            obj_text = text[o_pos[0]:o_pos[1]]
            if is_pronoun(obj_text):
                continue

        if not (s_pos or p_pos or o_pos):
            continue

        min_pos = min(p[0] for p in [s_pos, p_pos, o_pos] if p)
        max_pos = max(p[1] for p in [s_pos, p_pos, o_pos] if p)

        for idx, (s_start, s_end) in enumerate(spans):
            if min_pos >= s_start and max_pos <= s_end:
                sentence = sentences[idx]
                sent_to_triples[sentence].append({
                    "subject_id": subj_id,
                    "predicate_id": pred_id,
                    "object_id": obj_id
                })
                break

    # Format into list of {"sentence": ..., "triples": [...]}
    result = [
        {"sentence": sent, "triples": triples}
        for sent, triples in sent_to_triples.items()
    ]

    return result

data = []
for entry in eval_data:
    data.extend(map_triples_grouped_by_sentence(entry))

[nltk_data] Downloading package punkt to
[nltk_data]     /home/philippe.saade/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/philippe.saade/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [62]:
from tqdm import tqdm
import pickle

fact_checker = FactChecker(include_external_ids=False, lang='en', device='cpu')

for i in tqdm(range(len(data))):
    sentence = data[i]['sentence']
    results = fact_checker.check_claim(sentence)
    data[i]['fact_checking_results'] = results

    with open('fact_checking_data_with_results.pkl', 'wb') as f:
        pickle.dump(data, f)

  0%|          | 16/22364 [04:54<114:06:19, 18.38s/it]


HTTPError: 500 Server Error: Internal Server Error for url: https://wd-textify.toolforge.org/?id=Q868%2CQ130191396%2CQ189506%2CQ1296719%2CQ21536495%2CQ30745180%2CQ8250872%2CQ794394%2CQ7723955%2CQ8508268%2CQ11943224%2CQ61904342%2CQ179541%2CQ8508269%2CQ61008298%2CQ97417348%2CQ16730135%2CQ6242292%2CQ4766262%2CQ49827839%2CQ41980%2CQ28174218%2CQ859%2CQ6242233%2CQ118868646%2CQ5362575%2CQ91784771%2CQ992271%2CQ4418716%2CQ24916215%2CQ4791109%2CQ108094966%2CQ106603322%2CQ20003016%2CQ111942367%2CQ49875965%2CQ66404134%2CQ3855427%2CQ3893430%2CQ6242325%2CQ97388086%2CQ1079293%2CQ6242372%2CQ2528551%2CQ65629570%2CQ2844594%2CQ106752478%2CQ18783854%2CQ29648227%2CQ6991694&pids=P7613%2CP737%2CP10059%2CP9106%2CP9563%2CP1935%2CP3126%2CP9686%2CP140%2CP10700%2CP5390%2CP10782%2CP5088%2CP6302%2CP12782%2CP1142%2CP6223%2CP3123%2CP802%2CP1463%2CP91%2CP8403%2CP12780%2CP7962%2CP3235%2CP9199%2CP184%2CP6488%2CP9212%2CP5004%2CP3732%2CP10717%2CP9625%2CP611%2CP9678%2CP9929%2CP135%2CP7378%2CP10670%2CP9430%2CP2080%2CP3413%2CP3232%2CP25%2CP10535%2CP5550%2CP863%2CP185%2CP3205%2CP7663&external_ids=False&format=json

In [39]:
fact_checker = FactChecker(include_external_ids=False, lang='en', device='cpu')
results = fact_checker.check_claim("J.K. Rowling wrote the Harry Potter series.", verbose=True)

for result in results:
    print(result['hypothesis'])
    print(result['entailment'])
    print()



Vector Search...
['Q46758', 'Q17146193', 'Q8337', 'Q16011965', 'Q5410773', 'Q63113217', 'Q216930', 'Q336868', 'Q30739117', 'Q23891009', 'Q9011908', 'Q19975156', 'Q25934721', 'Q97867314', 'Q6130792', 'Q7372135', 'Q5364984', 'Q25752955', 'Q33129413', 'Q5727166', 'Q6107028', 'Q24302382', 'Q16466028', 'Q55720664', 'Q5606148', 'Q9105175', 'Q19974768', 'Q26691299', 'Q7791407', 'Q96707327', 'Q5482228', 'Q5423930', 'Q97738405', 'Q47467069', 'Q56559893', 'Q5671633', 'Q21113146', 'Q8003021', 'Q28016364', 'Q16030405', 'Q5671961', 'Q55153252', 'Q5673360', 'Q1414034', 'Q65620454', 'Q21167874', 'Q116291112', 'Q16066739', 'Q16066325', 'Q8001865']
['P8371', 'P3818', 'P8359', 'P8360', 'P2629', 'P5940', 'P3650', 'P6658', 'P3132', 'P2860', 'P2637', 'P2540', 'P2913', 'P9191', 'P747', 'P4437', 'P2125', 'P6166', 'P1104', 'P10663', 'P1445', 'P1240', 'P577', 'P2635', 'P1434', 'P1433', 'P6886', 'P3306', 'P2679', 'P1080', 'P1922', 'P110', 'P9866', 'P1574', 'P5970', 'P872', 'P9215', 'P7937', 'P4584', 'P6524', 'P