# Fact Checking
A simple example on how to use the vector database for a fact-checking system with Wikidata statements

In [44]:
import requests

HEADERS = {
    'User-Agent': 'Fact-Checker/1.0 (embeddings@wikimedia.de)'
}
LANG = 'en'
INCLUDE_EXTERNAL_IDS = False

# Define the claim to be checked
claim = 'Albert Einstein was a theoretical physicist who developed the theory of relativity.'

### Get from the vector database the Wikidata items and properties that are relevant to the query

In [45]:
# Get relevant Wikidata items
items = requests.get(
    'https://wd-vectordb.wmcloud.org/item/query',
    params={'query': claim, 'lang': LANG},
    headers=HEADERS,
)
items = items.json()

# Get relevant Wikidata properties
properties = requests.get(
    'https://wd-vectordb.wmcloud.org/property/query',
    params={'query': claim, 'lang': LANG},
    headers=HEADERS,
)
properties = properties.json()

### Get all statements of each item

In [46]:
def get_statements(qid, pids):
    params = {
        'id': qid,
        'pids': pids,
        'external_ids': INCLUDE_EXTERNAL_IDS,
        'format': 'json'
    }

    url = "https://wd-textify.toolforge.org"
    results = requests.get(url, params=params, headers=HEADERS)
    results.raise_for_status()

    text = results.json()
    return text

pids = [p['PID'] for p in properties]
for i in range(len(items)):
    item_info = get_statements(items[i]['QID'], pids)
    items[i]['label'] = item_info['label']
    items[i]['claims'] = item_info['claims']

### Sort statements by vector similarity

In [47]:
result_statements = []
for item in items:
    for property in properties:
        for statement in item['claims']:
            if property['PID'] == statement['PID']:
                result_statements.append({
                    'statement': {
                        **statement,
                        'QID': item['QID'],
                        'item_label': item['label'],
                    },
                    'similarity_score': item['similarity_score'] * property['similarity_score']
                })

# Sort by similarity score
result_statements = sorted(result_statements, key=lambda x: x['similarity_score'], reverse=True)

### Prepare NLI model for textual entailment detection

In [51]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
device = torch.device("cpu")

model_name = "MoritzLaurer/DeBERTa-v3-large-mnli-fever-anli-ling-wanli"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

def predict_entailment(premise, hypothesis):
    input = tokenizer(premise, hypothesis, truncation=True, return_tensors="pt")
    output = model(input["input_ids"].to(device))
    prediction = torch.softmax(output["logits"][0], -1).tolist()
    label_names = ["entailment", "neutral", "contradiction"]
    prediction = {name: round(float(pred) * 100, 1) for pred, name in zip(prediction, label_names)}
    return prediction



### Prepare hypothesis from Wikidata statements

In [52]:
def value_to_string(value):
    if isinstance(value, str):
        return value
    if 'string' in value:
        return value['string']
    elif 'label' in value:
        return value['label']
    elif 'time' in value:
        return value['time']
    return str(value)

def prepare_hypothesis(result):
    hypothesis = ""
    for statement in result['values']:
        hypothesis += f"{result['item_label']}: {result['property_label']}: {value_to_string(statement['value'])}"
        if 'qualifiers' in statement:
            for qualifier in statement['qualifiers']:
                values = ', '.join([
                    value_to_string(v['value']) for v in qualifier['values']
                ])
                hypothesis += f" | {qualifier['property_label']}: {values}"
        hypothesis += "\n"
    return hypothesis.strip()

for i in range(len(result_statements)):
    result_statements[i]['hypothesis'] = prepare_hypothesis(result_statements[i]['statement'])

### Predict Entailment per Wikidata statement

In [53]:
for i in range(len(result_statements)):
    result_statements[i]['entailment'] = predict_entailment(claim, result_statements[i]['hypothesis'])

# Sort by similarity score
result_statements = sorted(result_statements, key=lambda x: x['entailment']['neutral'])

In [54]:
print(result_statements[0]['hypothesis'])
print(result_statements[0]['entailment'])

Ernst Hammer: doctoral student: Alfred Berroth
Ernst Hammer: doctoral student: Alfred Egerer
{'entailment': 0.1, 'neutral': 1.9, 'contradiction': 98.0}


In [None]:
for r in result_statements:
    print(r['hypothesis'])

[{'statement': {'PID': 'P185',
   'property_label': 'doctoral student',
   'datatype': 'wikibase-item',
   'values': [{'value': {'QID': 'Q2644706', 'label': 'Alfred Berroth'},
     'rank': 'normal'},
    {'value': {'QID': 'Q95249864', 'label': 'Alfred Egerer'},
     'rank': 'normal'}],
   'QID': 'Q1358500',
   'item_label': 'Ernst Hammer'},
  'similarity_score': 0.42545586685689996,
  'hypothesis': 'Ernst Hammer: doctoral student: Alfred Berroth\nErnst Hammer: doctoral student: Alfred Egerer',
  'entailment': {'entailment': 0.1, 'neutral': 1.9, 'contradiction': 98.0}},
 {'statement': {'PID': 'P184',
   'property_label': 'doctoral advisor',
   'datatype': 'wikibase-item',
   'values': [{'value': {'QID': 'Q62108',
      'label': 'Gustav Heinrich Wiedemann'},
     'qualifiers': [{'PID': 'P3831',
       'property_label': 'object of statement has role',
       'datatype': 'wikibase-item',
       'values': [{'value': {'QID': 'Q26236695',
          'label': 'first doctoral advisor'}}]},
     

In [16]:
import requests

HEADERS = {
    'User-Agent': 'Fact-Checker/1.0 (embeddings@wikimedia.de)'
}
LANG = 'en'
INCLUDE_EXTERNAL_IDS = False

def get_statements(qid):
    params = {
        'id': qid,
        'external_ids': INCLUDE_EXTERNAL_IDS,
        'format': 'json'
    }

    url = "https://wd-textify.toolforge.org"
    results = requests.get(url, params=params, headers=HEADERS)
    results.raise_for_status()

    text = results.json()
    return text

from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

model_name = "MoritzLaurer/DeBERTa-v3-large-mnli-fever-anli-ling-wanli"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)

def predict_entailment(premise, hypothesis):
    input = tokenizer(premise, hypothesis, truncation=True, return_tensors="pt")
    output = model(input["input_ids"].to(device))
    prediction = torch.softmax(output["logits"][0], -1).tolist()
    label_names = ["entailment", "neutral", "contradiction"]
    prediction = {name: round(float(pred) * 100, 1) for pred, name in zip(prediction, label_names)}
    return prediction

def value_to_string(value):
    if isinstance(value, str):
        return value
    if 'string' in value:
        return value['string']
    elif 'label' in value:
        return value['label']
    elif 'time' in value:
        return value['time']
    return str(value)

def prepare_hypothesis(result):
    hypothesis = ""
    for statement in result['values']:
        hypothesis += f"{result['item_label']}: {result['property_label']}: {value_to_string(statement['value'])}"
        if 'qualifiers' in statement:
            for qualifier in statement['qualifiers']:
                values = ', '.join([
                    value_to_string(v['value']) for v in qualifier['values']
                ])
                hypothesis += f" | {qualifier['property_label']}: {values}"
        hypothesis += "\n"
    return hypothesis.strip()

def fact_check_claim(claim):
    # Get relevant Wikidata items
    print('Get items')
    items = requests.get(
        'https://wd-vectordb.wmcloud.org/item/query',
        params={'query': claim, 'lang': LANG},
        headers=HEADERS,
    )
    items = items.json()

    # Get relevant Wikidata properties
    print('Get properties')
    properties = requests.get(
        'https://wd-vectordb.wmcloud.org/property/query',
        params={'query': claim, 'lang': LANG},
        headers=HEADERS,
    )
    properties = properties.json()

    print("Prepare Statements")
    for i in range(len(items)):
        item_info = get_statements(items[i]['QID'])
        items[i]['label'] = item_info['label']
        items[i]['claims'] = item_info['claims']

    result_statements = []
    for item in items:
        for property in properties:
            for statement in item['claims']:
                if property['PID'] == statement['PID']:
                    result_statements.append({
                        'statement': {
                            **statement,
                            'QID': item['QID'],
                            'item_label': item['label'],
                        },
                        'similarity_score': item['similarity_score'] * property['similarity_score']
                    })

    # Sort by similarity score
    result_statements = sorted(result_statements, key=lambda x: x['similarity_score'], reverse=True)

    for i in range(len(result_statements)):
        result_statements[i]['hypothesis'] = prepare_hypothesis(result_statements[i]['statement'])

    # for i in range(len(result_statements)):
    #     result_statements[i]['entailment'] = predict_entailment(claim, result_statements[i]['hypothesis'])

    # Sort by similarity score
    result_statements = sorted(result_statements, key=lambda x: x['entailment']['neutral'])
    return result_statements

In [17]:
import json

with open('WDV_JSON.json', 'r+') as f:
    eval_data = json.load(f)

statements = fact_check_claim(eval_data[0]['verbalisation_unk_replaced'])

Get items
Get properties
Prepare Statements


KeyError: 0