# Fact Checking
A simple example of how to use the Wikidata vector database to build a fact-checking system on top of Wikidata statements.

In [32]:
import requests

# The claim that the rest of the notebook verifies against Wikidata.
claim = 'Albert Einstein was a theoretical physicist who developed the theory of relativity.'

# Language code sent to the vector database with every query.
LANG = 'en'

# Forwarded as the `external_ids` parameter when fetching statement text.
INCLUDE_EXTERNAL_IDS = False

# Sent with every HTTP request so the services can identify this client.
HEADERS = {'User-Agent': 'Fact-Checker/1.0 (embeddings@wikimedia.de)'}

### Retrieve from the vector database the Wikidata items and properties that are relevant to the claim

In [33]:
# Get relevant Wikidata items
items_response = requests.get(
    'https://wd-vectordb.wmcloud.org/item/query',
    params={'query': claim, 'lang': LANG},
    headers=HEADERS,
)
# Fail here on an HTTP error rather than later, while indexing into an
# error payload that does not have the expected shape.
items_response.raise_for_status()
items = items_response.json()

# Get relevant Wikidata properties
properties_response = requests.get(
    'https://wd-vectordb.wmcloud.org/property/query',
    params={
        'query': claim,
        'lang': LANG,
        # Derived from the config flag so both services agree on whether
        # external identifiers are part of the search.
        'exclude_external_ids': not INCLUDE_EXTERNAL_IDS,
    },
    headers=HEADERS,
)
properties_response.raise_for_status()
properties = properties_response.json()

### Get all statements of each item

In [34]:
def get_statements(qid, pids):
    """Fetch the textified statements of one or more Wikidata items.

    Args:
        qid: Item ID(s), sent as the `id` query parameter (the caller in
            this notebook passes a comma-separated string of QIDs).
        pids: Property ID(s), sent as the `pids` query parameter (also a
            comma-separated string in this notebook).

    Returns:
        The decoded JSON body of the response.

    Raises:
        requests.HTTPError: If the service answers with an error status.
    """
    response = requests.get(
        "https://wd-textify.toolforge.org",
        params={
            'id': qid,
            'pids': pids,
            'external_ids': INCLUDE_EXTERNAL_IDS,
            'format': 'json',
        },
        headers=HEADERS,
    )
    response.raise_for_status()
    return response.json()

# Fetch the statements of every retrieved item in a single request.
qids = [entry['QID'] for entry in items]
pids = [entry['PID'] for entry in properties]
items_info = get_statements(','.join(qids), ','.join(pids))

# Copy the label and claims returned for each QID onto the matching search hit.
for item in items:
    info = items_info[item['QID']]
    item['label'] = info['label']
    item['claims'] = info['claims']

### Sort statements by vector similarity

In [35]:
# Pair every retrieved item with every retrieved property and keep the
# statements whose property matches. The loop variable is deliberately not
# called `property`: at notebook top level that would rebind the builtin
# `property` for every cell executed afterwards.
result_statements = []
for item in items:
    for prop in properties:
        for statement in item['claims']:
            if prop['PID'] == statement['PID']:
                result_statements.append({
                    'statement': {
                        **statement,
                        'QID': item['QID'],
                        'item_label': item['label'],
                    },
                    # Combined relevance: item similarity times property similarity.
                    'similarity_score': item['similarity_score'] * prop['similarity_score']
                })

# Sort by similarity score, most similar first
result_statements = sorted(result_statements, key=lambda x: x['similarity_score'], reverse=True)

### Prepare NLI model for textual entailment detection

In [36]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
# Inference is pinned to the CPU by default (this is what the notebook
# effectively did before). Set USE_GPU = True to run on CUDA when available.
USE_GPU = False
device = torch.device("cuda" if USE_GPU and torch.cuda.is_available() else "cpu")

model_name = "MoritzLaurer/DeBERTa-v3-large-mnli-fever-anli-ling-wanli"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(model_name)
# The prediction helpers move their inputs to `device`, so the weights must
# live on the same device or the forward pass fails with a device mismatch.
model.to(device)
# Inference only: make sure dropout and similar layers are disabled.
model.eval()
# Names assigned, in order, to the three class probabilities.
# NOTE(review): assumed to match the checkpoint's label order — confirm against model.config.id2label.
label_names = ["entailment", "neutral", "contradiction"]

def predict_entailment(claim, wd_fact):
    """Score a single Wikidata fact against the claim with the NLI model.

    The fact is passed as the first sequence (premise) and the claim as the
    second (hypothesis), exactly as in `predict_entailment_bulk`.

    Args:
        claim: The claim being checked.
        wd_fact: Text form of one Wikidata statement.

    Returns:
        A dict mapping "entailment", "neutral" and "contradiction" to a
        percentage rounded to one decimal place.
    """
    label_names = ["entailment", "neutral", "contradiction"]
    encoded = tokenizer(wd_fact, claim, truncation=True, return_tensors="pt")
    # Forward the complete encoding (not just input_ids), as the bulk helper
    # does, so both code paths feed the model the same inputs.
    encoded = {key: tensor.to(device) for key, tensor in encoded.items()}
    # Pure inference: skip building the autograd graph.
    with torch.no_grad():
        logits = model(**encoded).logits[0]
    probabilities = torch.softmax(logits, dim=-1).tolist()
    return {
        name: round(float(prob) * 100, 1)
        for name, prob in zip(label_names, probabilities)
    }

def predict_entailment_bulk(claim, wd_facts, batch_size=8):
    """Score many Wikidata facts against one claim, a batch at a time.

    Args:
        claim: The claim being checked; used as the second sequence of every pair.
        wd_facts: List of statement texts; each is the first sequence of a pair.
        batch_size: Number of pairs tokenized and scored per forward pass.

    Returns:
        One dict per fact, in input order, mapping "entailment", "neutral"
        and "contradiction" to a percentage rounded to one decimal place.
    """
    labels = ["entailment", "neutral", "contradiction"]
    scored = []

    with torch.no_grad():
        for start in range(0, len(wd_facts), batch_size):
            chunk = wd_facts[start:start + batch_size]

            # The fact changes per pair; the claim is repeated for each one.
            encoded = tokenizer(
                chunk,
                [claim] * len(chunk),
                truncation=True,
                padding=True,
                return_tensors="pt",
            )
            encoded = {key: tensor.to(device) for key, tensor in encoded.items()}

            logits = model(**encoded).logits
            for probabilities in torch.softmax(logits, dim=-1).cpu():
                scored.append({
                    label: round(float(prob) * 100, 1)
                    for label, prob in zip(labels, probabilities)
                })

    return scored



### Prepare the text form of each Wikidata statement (the NLI premise, stored under the `hypothesis` key)

In [37]:
def value_to_string(value):
    """Return a printable form of a statement or qualifier value.

    Args:
        value: Either a plain string, a dict carrying one of the keys
            'string', 'label' or 'time', or any other object.

    Returns:
        The string itself; for a dict, the entry under the first of
        'string', 'label', 'time' that is present; otherwise `str(value)`.
    """
    if isinstance(value, str):
        return value
    # Only dicts are probed for keys. Numbers, None and other objects used to
    # raise TypeError on the `in` test and never reached the str() fallback.
    if isinstance(value, dict):
        for key in ('string', 'label', 'time'):
            if key in value:
                return value[key]
    return str(value)

def prepare_hypothesis(result):
    """Render a statement record as text, one line per value.

    Each line reads "<item label>: <property label>: <value>", followed by
    " | <qualifier label>: <v1>, <v2>" for every qualifier on that value.

    Args:
        result: Statement dict with 'values', 'item_label' and
            'property_label'; each value may carry a 'qualifiers' list.

    Returns:
        The lines joined by newlines, with surrounding whitespace stripped.
    """
    lines = []
    for entry in result['values']:
        line = f"{result['item_label']}: {result['property_label']}: {value_to_string(entry['value'])}"
        if 'qualifiers' in entry:
            for qualifier in entry['qualifiers']:
                rendered = [value_to_string(item['value']) for item in qualifier['values']]
                line += f" | {qualifier['property_label']}: {', '.join(rendered)}"
        lines.append(line)
    return "\n".join(lines).strip()

# Store the text form of every candidate statement next to its record.
for entry in result_statements:
    entry['hypothesis'] = prepare_hypothesis(entry['statement'])

### Predict Entailment per Wikidata statement

In [38]:
# Score every statement text against the claim.
predictions = predict_entailment_bulk(
    claim,
    [entry['hypothesis'] for entry in result_statements],
    batch_size=8
)
for entry, prediction in zip(result_statements, predictions):
    entry['entailment'] = prediction

# Sort by the 'neutral' score, lowest first, so the statements the model is
# most decisive about (entailment or contradiction) come first.
result_statements = sorted(
    result_statements,
    key=lambda entry: entry['entailment']['neutral'],
)

In [39]:
# Show the first statement after sorting (lowest 'neutral' score) and its scores.
top_statement = result_statements[0]
print(top_statement['hypothesis'])
print(top_statement['entailment'])

special relativity: discoverer or inventor: Albert Einstein | time of discovery or invention: September 26, 1905
{'entailment': 98.6, 'neutral': 1.4, 'contradiction': 0.0}


In [40]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch
import requests

class FactChecker:
    """Check a natural-language claim against Wikidata statements.

    `check_claim` runs the whole pipeline: a vector search for relevant items
    and properties, a lookup of the matching statements as text, and an NLI
    model that scores every statement against the claim.
    """

    def __init__(self, include_external_ids=False, lang='en', device='cpu', rerank_model=None):
        """Load the NLI model and store the configuration.

        Args:
            include_external_ids: Whether external identifiers take part in
                the property search and the statement lookup.
            lang: Language code sent to the vector database.
            device: Anything accepted by `torch.device`; the NLI model and
                its inputs are placed there.
            rerank_model: Optional reranker used only by `rank_statements`.
                It must expose `compute_score(pairs, max_length=...)`.
        """
        self.include_external_ids = include_external_ids
        self.lang = lang
        self.headers = {
            'User-Agent': 'Fact-Checker/1.0 (embeddings@wikimedia.de)'
        }

        model_name = "MoritzLaurer/DeBERTa-v3-large-mnli-fever-anli-ling-wanli"

        self.device = torch.device(device)
        # Names assigned, in order, to the three class probabilities.
        # NOTE(review): assumed to match the checkpoint's label order — confirm against model.config.id2label.
        self.nli_labels = ["entailment", "neutral", "contradiction"]
        self.nli_tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.nli_model = AutoModelForSequenceClassification.from_pretrained(model_name)
        # Inputs are moved to self.device in predict_entailment, so the
        # weights must be there too or any non-CPU device fails.
        self.nli_model.to(self.device)
        self.rerank_model = rerank_model

    def check_claim(self, claim, verbose=False, batch_size=8):
        """Retrieve Wikidata statements related to `claim` and score them.

        Args:
            claim: The sentence to verify.
            verbose: Print a progress line for each pipeline stage.
            batch_size: Number of statements scored per NLI forward pass.

        Returns:
            A list of dicts with keys 'statement', 'similarity_score',
            'hypothesis' (the statement as text) and 'entailment' (label to
            percentage), sorted by ascending 'neutral' score.
        """
        if verbose:
            print("Vector Search...")
        items, properties = self.vector_search(claim)

        if verbose:
            print("Getting Statements Text...")
        statements = self.prepare_statements(items, properties)

        if verbose:
            print(f"Found {len(statements)} statements")

        for entry in statements:
            entry['hypothesis'] = self.statement_to_string(entry['statement'])

        if verbose:
            print("Predicting Entailment...")
        predictions = self.predict_entailment(
            claim,
            [entry['hypothesis'] for entry in statements],
            batch_size=batch_size
        )
        for entry, prediction in zip(statements, predictions):
            entry['entailment'] = prediction

        # Sort by neutrality: the most decisive statements come first.
        statements = sorted(statements, key=lambda x: x['entailment']['neutral'])

        return statements

    def vector_search(self, claim):
        """Query the vector database for items and properties related to `claim`.

        Returns:
            A tuple `(items, properties)` holding the decoded JSON of the
            two responses.

        Raises:
            requests.HTTPError: If either service answers with an error status.
        """
        # Get relevant Wikidata items
        items_response = requests.get(
            'https://wd-vectordb.wmcloud.org/item/query',
            params={'query': claim, 'lang': self.lang},
            headers=self.headers,
        )
        # Surface HTTP errors here instead of failing later on an error payload.
        items_response.raise_for_status()
        items = items_response.json()

        # Get relevant Wikidata properties
        properties_response = requests.get(
            'https://wd-vectordb.wmcloud.org/property/query',
            params={
                'query': claim,
                'lang': self.lang,
                # Follow the instance setting instead of a hard-coded True.
                'exclude_external_ids': not self.include_external_ids,
            },
            headers=self.headers,
        )
        properties_response.raise_for_status()
        properties = properties_response.json()

        return items, properties

    def get_item_statements(self, qids, pids):
        """Fetch the textified statements for the given items and properties.

        Args:
            qids: Item ID(s), sent as the `id` parameter (`prepare_statements`
                passes a comma-separated string).
            pids: Property ID(s), sent as the `pids` parameter (same format).

        Returns:
            The decoded JSON body of the response.

        Raises:
            requests.HTTPError: If the service answers with an error status.
        """
        params = {
            'id': qids,
            'pids': pids,
            'external_ids': self.include_external_ids,
            'format': 'json'
        }

        url = "https://wd-textify.toolforge.org"
        results = requests.get(url, params=params, headers=self.headers)
        results.raise_for_status()

        return results.json()

    def statement_to_string(self, statement):
        """Render a statement as "<item>: <property>: <v1> (<qualifier>: ...), <v2>".

        Args:
            statement: Dict with 'item_label', 'property_label' and 'values';
                each value may carry a 'qualifiers' list.

        Returns:
            The single-line text, without a trailing comma.
        """
        hypothesis = f"{statement['item_label']}: {statement['property_label']}: "
        for svalue in statement['values']:
            hypothesis += f"{self.statement_value_to_string(svalue['value'])}"

            if 'qualifiers' in svalue:
                for qualifier in svalue['qualifiers']:
                    values = ', '.join([
                        self.statement_value_to_string(v['value']) for v in qualifier['values']
                    ])
                    hypothesis += f" ({qualifier['property_label']}: {values})"

            hypothesis += ", "
        # Drop the separator left behind by the last value.
        return hypothesis.strip().rstrip(',')

    def statement_value_to_string(self, value):
        """Return a printable form of a statement or qualifier value.

        Strings are returned as-is; for a dict, the entry under the first of
        'string', 'label', 'time' that is present is returned; anything else
        falls back to `str(value)`.
        """
        if isinstance(value, str):
            return value
        # Only dicts are probed for keys. Numbers, None and other objects used
        # to raise TypeError on the `in` test and never reached str().
        if isinstance(value, dict):
            for key in ('string', 'label', 'time'):
                if key in value:
                    return value[key]
        return str(value)

    def prepare_statements(self, items, properties):
        """Collect the statements of `items` whose property is in `properties`.

        Each entry of `items` gains 'label' and 'claims' keys as a side effect.

        Args:
            items: Item hits, each with 'QID' and 'similarity_score'.
            properties: Property hits, each with 'PID' and 'similarity_score'.

        Returns:
            A list of dicts with keys 'statement' and 'similarity_score',
            sorted by descending score. The score is the product of the item
            score and the property score.
        """
        # Without items there is nothing to look up, so skip the request.
        if not items:
            return []

        qids = [q['QID'] for q in items]
        pids = [p['PID'] for p in properties]
        items_info = self.get_item_statements(','.join(qids), ','.join(pids))

        for item in items:
            info = items_info[item['QID']]
            item['label'] = info['label']
            item['claims'] = info['claims']

        result_statements = []
        for item in items:
            # Named `prop` so the builtin `property` is not shadowed.
            for prop in properties:
                for statement in item['claims']:
                    if prop['PID'] == statement['PID']:
                        result_statements.append({
                            'statement': {
                                **statement,
                                'QID': item['QID'],
                                'item_label': item['label'],
                            },
                            'similarity_score': item['similarity_score'] * prop['similarity_score']
                        })

        # Sort by similarity score, most similar first
        result_statements.sort(key=lambda x: x['similarity_score'], reverse=True)
        return result_statements

    def predict_entailment(self, claim, premises, batch_size = 8):
        """Score every premise against `claim` with the NLI model.

        Args:
            claim: Used as the second sequence (hypothesis) of every pair.
            premises: List of statement texts, each the first sequence of a pair.
            batch_size: Number of pairs per forward pass.

        Returns:
            One dict per premise, in input order, mapping each name in
            `self.nli_labels` to a percentage rounded to one decimal place.
        """
        results = []

        self.nli_model.eval()
        with torch.no_grad():
            for i in range(0, len(premises), batch_size):
                batch_premises = premises[i:i + batch_size]

                enc = self.nli_tokenizer(
                    batch_premises,
                    [claim] * len(batch_premises),
                    truncation=True,
                    padding=True,
                    return_tensors="pt",
                )

                enc = {k: v.to(self.device) for k, v in enc.items()}

                out = self.nli_model(**enc)
                probs = torch.softmax(out.logits, dim=-1).cpu()

                for row in probs:
                    row_dict = {
                        name: round(float(p) * 100, 1)
                        for name, p in zip(self.nli_labels, row)
                    }
                    results.append(row_dict)

        return results

    def rank_statements(self, claim, statements, batch_size=8):
        """Score `[statement, claim]` pairs with the configured reranker.

        Args:
            claim: The claim paired with every statement.
            statements: List of statement texts.
            batch_size: Number of pairs passed to the reranker per call.

        Returns:
            A flat list with one score per statement, in input order.

        Raises:
            RuntimeError: If no `rerank_model` was passed to the constructor.
        """
        if self.rerank_model is None:
            raise RuntimeError(
                "rank_statements requires a rerank_model; pass one to FactChecker(...)"
            )

        results = []

        # Not every reranker wrapper exposes eval(); call it only when present.
        switch_to_eval = getattr(self.rerank_model, 'eval', None)
        if callable(switch_to_eval):
            switch_to_eval()

        with torch.no_grad():
            for i in range(0, len(statements), batch_size):
                batch_pairs = [[statement, claim] for statement in statements[i:i + batch_size]]

                # Use the instance's reranker, not a global `model`.
                scores = self.rerank_model.compute_score(batch_pairs, max_length=1024)
                # Tolerate a bare number being returned for a single pair.
                if isinstance(scores, (int, float)):
                    scores = [scores]
                results.extend(scores)

        return results

In [41]:
# End-to-end demo: run the full pipeline on a second claim.
claim = 'Elon Musk is the founder of Tesla, Inc.'

fact_checker = FactChecker(include_external_ids=False, lang='en', device='cpu')
results = fact_checker.check_claim(claim, verbose=True)

# One block per statement: its text, its scores, then a blank line.
for result in results:
    print(result['hypothesis'], result['entailment'], sep="\n", end="\n\n")



Vector Search...
Getting Statements Text...
Found 88 statements
Predicting Entailment...
Tesla, Inc.: founded by: Martin Eberhard, Marc Tarpenning
{'entailment': 0.1, 'neutral': 0.2, 'contradiction': 99.7}

Tesla Electric Light & Manufacturing: founded by: Nikola Tesla
{'entailment': 0.0, 'neutral': 0.3, 'contradiction': 99.7}

Elon Musk: owner of: Tesla, Inc., X.com, Elon Musk's Tesla Roadster, SpaceX, The Boring Company, X (start time: April 2022), X Holdings I, Inc., X Holdings II, Inc., X Holdings III, LLC
{'entailment': 96.9, 'neutral': 3.1, 'contradiction': 0.1}

Dell: chief executive officer: Michael Dell
{'entailment': 1.5, 'neutral': 22.9, 'contradiction': 75.6}

Mapbox: chief executive officer: Peter Sirota (start time: March 1, 2021) (reason for preferred rank: most recent value)
{'entailment': 0.8, 'neutral': 29.6, 'contradiction': 69.5}

Woz U: owned by: Steve Wozniak
{'entailment': 0.7, 'neutral': 48.5, 'contradiction': 50.7}

Dell: owner of: Alienware, Dell Wyse, Perot S