# Fact Checking
A simple example on how to use the vector database for a fact-checking system with Wikidata statements

In [6]:
import requests

HEADERS = {
    'User-Agent': 'Fact-Checker/1.0 (embeddings@wikimedia.de)'
}
LANG = 'en'
INCLUDE_EXTERNAL_IDS = False

# Define the claim to be checked
claim = 'Albert Einstein was a theoretical physicist who developed the theory of relativity.'

### Get from the vector database the Wikidata items and properties that are relevant to the query

In [7]:
# Get relevant Wikidata items
items = requests.get(
    'https://wd-vectordb.wmcloud.org/item/query',
    params={'query': claim, 'lang': LANG},
    headers=HEADERS,
)
items = items.json()

# Get relevant Wikidata properties
properties = requests.get(
    'https://wd-vectordb.wmcloud.org/property/query',
    params={'query': claim, 'lang': LANG},
    headers=HEADERS,
)
properties = properties.json()

### Get all statements of each item

In [10]:
def get_statements(qid, pids):
    params = {
        'id': qid,
        'pids': pids,
        'external_ids': INCLUDE_EXTERNAL_IDS,
        'format': 'json'
    }

    url = "https://wd-textify.toolforge.org"
    results = requests.get(url, params=params, headers=HEADERS)
    results.raise_for_status()

    text = results.json()
    return text

pids = [p['PID'] for p in properties]
for i in range(len(items)):
    item_info = get_statements(items[i]['QID'], pids)
    items[i]['label'] = item_info['label']
    items[i]['claims'] = item_info['claims']

In [17]:
def get_entity(qid):
    results = {}
    r = requests.get(
        "https://www.wikidata.org/w/api.php",
        params={
            "action": "wbgetentities",
            "ids": qid,
            "props": 'labels|descriptions|aliases|claims',
            "format": "json",
        }, headers=HEADERS
    )
    r.raise_for_status()
    results = r.json()
    return results

get_entity('Q42')

{'entities': {'Q42': {'type': 'item',
   'id': 'Q42',
   'labels': {'ar': {'language': 'ar', 'value': 'دوغلاس آدمز'},
    'ary': {'language': 'ary', 'value': 'دوڭلاس أدامز'},
    'arz': {'language': 'arz', 'value': 'دوجلاس ادامز'},
    'az': {'language': 'az', 'value': 'Duqlas Adams'},
    'azb': {'language': 'azb', 'value': 'داقلاس آدامز'},
    'ba': {'language': 'ba', 'value': 'Дуглас Адамс'},
    'be': {'language': 'be', 'value': 'Дуглас Адамс'},
    'be-tarask': {'language': 'be-tarask', 'value': 'Дуглас Адамз'},
    'bg': {'language': 'bg', 'value': 'Дъглас Адамс'},
    'bho': {'language': 'bho', 'value': 'डगलस एडम्स'},
    'bn': {'language': 'bn', 'value': 'ডগলাস অ্যাডামস'},
    'ckb': {'language': 'ckb', 'value': 'دەگلاس ئادمز'},
    'el': {'language': 'el', 'value': 'Ντάγκλας Άνταμς'},
    'fa': {'language': 'fa', 'value': 'داگلاس آدامز'},
    'gu': {'language': 'gu', 'value': 'ડગ્લાસ એડમ્સ'},
    'he': {'language': 'he', 'value': 'דאגלס אדמס'},
    'hi': {'language': 'hi', 'va

In [13]:
def get_specific_claims(qid, pid):
    results = {}
    r = requests.get(
        "https://www.wikidata.org/w/api.php",
        params={
            "action": "wbgetclaims",
            "entity": qid,
            "property": pid,
            "format": "json",
        }, headers=HEADERS
    )
    r.raise_for_status()
    claims = r.json().get("claims", {}).get(pid, [])
    results[pid] = claims
    return results

get_specific_claims('Q42', 'P31')

{'P31': [{'mainsnak': {'snaktype': 'value',
    'property': 'P31',
    'hash': 'ad7d38a03cdd40cdc373de0dc4e7b7fcbccb31d9',
    'datavalue': {'value': {'entity-type': 'item',
      'numeric-id': 5,
      'id': 'Q5'},
     'type': 'wikibase-entityid'},
    'datatype': 'wikibase-item'},
   'type': 'statement',
   'qualifiers': {'P805': [{'snaktype': 'value',
      'property': 'P805',
      'hash': 'eed7690a3685dc7a62a579e7222b5e9a367d8192',
      'datavalue': {'value': {'entity-type': 'item',
        'numeric-id': 42395533,
        'id': 'Q42395533'},
       'type': 'wikibase-entityid'},
      'datatype': 'wikibase-item'}]},
   'qualifiers-order': ['P805'],
   'id': 'Q42$F078E5B3-F9A8-480E-B7AC-D97778CBBEF9',
   'rank': 'normal',
   'references': [{'hash': 'a4d108601216cffd2ff1819ccf12b483486b62e7',
     'snaks': {'P248': [{'snaktype': 'value',
        'property': 'P248',
        'hash': 'def9f19d84b65167a2a17ce38364d264c16127fc',
        'datavalue': {'value': {'entity-type': 'item',
   

### Sort statements by vector similarity

In [None]:
result_statements = []
for item in items:
    for property in properties:
        for statement in item['claims']:
            if property['PID'] == statement['PID']:
                result_statements.append({
                    'statement': {
                        **statement,
                        'QID': item['QID'],
                        'item_label': item['label'],
                    },
                    'similarity_score': item['similarity_score'] * property['similarity_score']
                })

# Sort by similarity score
result_statements = sorted(result_statements, key=lambda x: x['similarity_score'], reverse=True)

### Prepare NLI model for textual entailment detection

In [9]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

model_name = "MoritzLaurer/DeBERTa-v3-large-mnli-fever-anli-ling-wanli"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

def predict_entailment(premise, hypothesis):
    input = tokenizer(premise, hypothesis, truncation=True, return_tensors="pt")
    output = model(input["input_ids"].to(device))
    prediction = torch.softmax(output["logits"][0], -1).tolist()
    label_names = ["entailment", "neutral", "contradiction"]
    prediction = {name: round(float(pred) * 100, 1) for pred, name in zip(prediction, label_names)}
    return prediction



### Prepare hypothesis from Wikidata statements

In [None]:
def value_to_string(value):
    if isinstance(value, str):
        return value
    if 'string' in value:
        return value['string']
    elif 'label' in value:
        return value['label']
    elif 'time' in value:
        return value['time']
    return str(value)

def prepare_hypothesis(result):
    hypothesis = ""
    for statement in result['values']:
        hypothesis += f"{result['item_label']}: {result['property_label']}: {value_to_string(statement['value'])}"
        if 'qualifiers' in statement:
            for qualifier in statement['qualifiers']:
                values = ', '.join([
                    value_to_string(v['value']) for v in qualifier['values']
                ])
                hypothesis += f" | {qualifier['property_label']}: {values}"
        hypothesis += "\n"
    return hypothesis.strip()

for i in range(len(result_statements)):
    result_statements[i]['hypothesis'] = prepare_hypothesis(result_statements[i]['statement'])

### Predict Entailment per Wikidata statement

In [None]:
for i in range(len(result_statements)):
    result_statements[i]['entailment'] = predict_entailment(claim, result_statements[i]['hypothesis'])

# Sort by similarity score
result_statements = sorted(result_statements, key=lambda x: x['entailment']['neutral'])

In [None]:
print(result_statements[0]['hypothesis'])
print(result_statements[0]['entailment'])

theory of relativity: discoverer or inventor: Albert Einstein
{'entailment': 98.1, 'neutral': 1.8, 'contradiction': 0.2}


In [16]:
import requests

HEADERS = {
    'User-Agent': 'Fact-Checker/1.0 (embeddings@wikimedia.de)'
}
LANG = 'en'
INCLUDE_EXTERNAL_IDS = False

def get_statements(qid):
    params = {
        'id': qid,
        'external_ids': INCLUDE_EXTERNAL_IDS,
        'format': 'json'
    }

    url = "https://wd-textify.toolforge.org"
    results = requests.get(url, params=params, headers=HEADERS)
    results.raise_for_status()

    text = results.json()
    return text

from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

model_name = "MoritzLaurer/DeBERTa-v3-large-mnli-fever-anli-ling-wanli"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

def predict_entailment(premise, hypothesis):
    input = tokenizer(premise, hypothesis, truncation=True, return_tensors="pt")
    output = model(input["input_ids"].to(device))
    prediction = torch.softmax(output["logits"][0], -1).tolist()
    label_names = ["entailment", "neutral", "contradiction"]
    prediction = {name: round(float(pred) * 100, 1) for pred, name in zip(prediction, label_names)}
    return prediction

def value_to_string(value):
    if isinstance(value, str):
        return value
    if 'string' in value:
        return value['string']
    elif 'label' in value:
        return value['label']
    elif 'time' in value:
        return value['time']
    return str(value)

def prepare_hypothesis(result):
    hypothesis = ""
    for statement in result['values']:
        hypothesis += f"{result['item_label']}: {result['property_label']}: {value_to_string(statement['value'])}"
        if 'qualifiers' in statement:
            for qualifier in statement['qualifiers']:
                values = ', '.join([
                    value_to_string(v['value']) for v in qualifier['values']
                ])
                hypothesis += f" | {qualifier['property_label']}: {values}"
        hypothesis += "\n"
    return hypothesis.strip()

def fact_check_claim(claim):
    # Get relevant Wikidata items
    print('Get items')
    items = requests.get(
        'https://wd-vectordb.wmcloud.org/item/query',
        params={'query': claim, 'lang': LANG},
        headers=HEADERS,
    )
    items = items.json()

    # Get relevant Wikidata properties
    print('Get properties')
    properties = requests.get(
        'https://wd-vectordb.wmcloud.org/property/query',
        params={'query': claim, 'lang': LANG},
        headers=HEADERS,
    )
    properties = properties.json()

    print("Prepare Statements")
    for i in range(len(items)):
        item_info = get_statements(items[i]['QID'])
        items[i]['label'] = item_info['label']
        items[i]['claims'] = item_info['claims']

    result_statements = []
    for item in items:
        for property in properties:
            for statement in item['claims']:
                if property['PID'] == statement['PID']:
                    result_statements.append({
                        'statement': {
                            **statement,
                            'QID': item['QID'],
                            'item_label': item['label'],
                        },
                        'similarity_score': item['similarity_score'] * property['similarity_score']
                    })

    # Sort by similarity score
    result_statements = sorted(result_statements, key=lambda x: x['similarity_score'], reverse=True)

    for i in range(len(result_statements)):
        result_statements[i]['hypothesis'] = prepare_hypothesis(result_statements[i]['statement'])

    # for i in range(len(result_statements)):
    #     result_statements[i]['entailment'] = predict_entailment(claim, result_statements[i]['hypothesis'])

    # Sort by similarity score
    result_statements = sorted(result_statements, key=lambda x: x['entailment']['neutral'])
    return result_statements

In [17]:
import json

with open('WDV_JSON.json', 'r+') as f:
    eval_data = json.load(f)

statements = fact_check_claim(eval_data[0]['verbalisation_unk_replaced'])

Get items
Get properties
Prepare Statements


KeyError: 0