In [None]:
import os
os.environ["LANGUAGE"] = 'en' # Specify the language of the textified entities.

from src.wikidataEmbed import WikidataTextifier
from src.wikidataEntityDB import WikidataProperty, Session
from src.wikidataLangDB import create_wikidatalang_db
from src.wikidataRetriever import AstraDBConnect

import json
from tqdm import tqdm
import os
import requests
from datetime import datetime

# --- Run configuration -------------------------------------------------------
# Each setting is read from an environment variable, falling back to the
# default given as the second argument to os.getenv.
MODEL = os.getenv("MODEL", "jinaapi")
EMBED_BATCH_SIZE = int(os.getenv("EMBED_BATCH_SIZE", 10))
QUERY_BATCH_SIZE = int(os.getenv("QUERY_BATCH_SIZE", 1000))
OFFSET = int(os.getenv("OFFSET", 0))
# Note: the environment variable is called API_KEY, not API_KEY_FILENAME.
API_KEY_FILENAME = os.getenv("API_KEY", "datastax_wikidata2.json")
COLLECTION_NAME = os.getenv("COLLECTION_NAME", "v2_09_2025")
LANGUAGE = os.getenv("LANGUAGE", 'en')
TEXTIFIER_LANGUAGE = os.getenv("TEXTIFIER_LANGUAGE", None)
DUMPDATE = os.getenv("DUMPDATE", '09/18/2024')

# COLLECTION_NAME and API_KEY_FILENAME are deliberately NOT reassigned here:
# hard-coding them after the getenv calls would silently discard any value
# supplied through the environment. The defaults above carry the same values.

# Load the Astra DB credentials; the with-block closes the file handle.
with open(f"../API_tokens/{API_KEY_FILENAME}") as token_file:
    datastax_token = json.load(token_file)

textifier = WikidataTextifier(language=LANGUAGE, langvar_filename=TEXTIFIER_LANGUAGE)

# `db_filname` (sic) is the keyword this notebook has always passed; it is
# presumably the spelling used by create_wikidatalang_db — do not "fix" it here.
WikidataLang = create_wikidatalang_db(db_filname=f"sqlite_{LANGUAGE}wiki.db")
graph_store = AstraDBConnect(datastax_token,
                             COLLECTION_NAME,
                             model=MODEL,
                             batch_size=EMBED_BATCH_SIZE
                            )

In [None]:
# Expected number of property rows. In this notebook it is only passed to tqdm
# as the progress-bar total, so a stale value affects the display only (the bar
# will also overshoot when OFFSET is non-zero).
total_entities = 12711

# OFFSET is intentionally not reassigned here: it comes from the configuration
# cell (environment variable OFFSET, default 0). Resetting it to 0 in this cell
# would make that setting a no-op.


def get_property(session):
    """
    Lazily iterate over the WikidataProperty rows.

    Parameters:
        session: An open database session exposing ``query(...)``; the caller
            owns it and is responsible for closing it after iteration.

    Yields:
        WikidataProperty rows, skipping the first ``OFFSET`` rows and fetched
        in batches of ``QUERY_BATCH_SIZE`` via ``yield_per``.
    """
    rows = session.query(WikidataProperty).offset(OFFSET).yield_per(QUERY_BATCH_SIZE)
    yield from rows

In [None]:
def get_items_using_property(property_id, limit=100, timeout=30):
    """
    Fetch the titles of Wikidata item pages that link to a property page.

    Issues a single MediaWiki ``list=backlinks`` query restricted to the main
    namespace (``blnamespace=0``). Only one page of results is requested; no
    continuation is followed, so at most ``limit`` titles are returned.

    Parameters:
        property_id (str): The Wikidata property ID (e.g., 'P31').
        limit (int): Maximum number of items to return (sent as ``bllimit``).
        timeout (float): Seconds to wait for the HTTP request before giving up.

    Returns:
        List[str]: Page titles (QIDs) that link to the property page. An empty
        list is returned when the request fails or answers with a non-200
        status.
    """
    url = "https://www.wikidata.org/w/api.php"
    # `srprop` was dropped: it is a parameter of list=search, not of
    # list=backlinks, and had no effect on this query.
    params = {
        "action": "query",
        "list": "backlinks",
        "bltitle": f"Property:{property_id}",
        "blnamespace": 0,
        "bllimit": limit,
        "format": "json",
    }

    try:
        # A timeout keeps one stalled connection from hanging the whole run.
        response = requests.get(url, params=params, timeout=timeout)
    except requests.RequestException as error:
        # Treat network failures like HTTP errors: report and return nothing.
        print("Error fetching backlinks:", error)
        return []

    if response.status_code != 200:
        print("Error fetching backlinks:", response.status_code)
        return []

    data = response.json()
    backlinks = data.get("query", {}).get("backlinks", [])
    return [link["title"] for link in backlinks]

# Embed every property as one document whose text is built from the property
# itself plus example items that use it.
property_session = Session()
try:
    # Loop variables avoid the name `property`, which is a Python builtin and
    # was previously rebound to two different row types inside the loop.
    for property_row in tqdm(get_property(property_session), total=total_entities):
        pid = property_row.id
        examples = []
        with WikidataLang.get_session() as session:
            # Look the property up in the language-specific database; IDs that
            # are absent there are skipped.
            lang_property = session.query(WikidataLang)\
                            .filter(WikidataLang.id==pid)\
                            .first()

            if lang_property:
                # Pages linking to the property are only candidates: keep the
                # ones whose claims actually contain the property.
                qids = get_items_using_property(pid, limit=100)
                for qid in qids:
                    subject = WikidataLang.get_entity(qid)
                    if subject and (pid in subject.claims):
                        examples.append(subject)

                text = textifier.property_to_text(lang_property, examples)

                # "novalue"/"somevalue" snaks carry no 'datavalue'; skip them
                # instead of raising KeyError.
                item_instanceof = [
                    claim['mainsnak']['datavalue']['value']['id']
                    for claim in lang_property.claims.get("P31", [])
                    if 'datavalue' in claim.get('mainsnak', {})
                ]

                # Truncate the text to the span covered by the first
                # max_token_size tokens, using the tokenizer's character offsets.
                tokens = graph_store.tokenizer(text, add_special_tokens=False, return_offsets_mapping=True)
                token_ids, offsets = tokens['input_ids'], tokens['offset_mapping']
                if len(token_ids) >= graph_store.max_token_size:
                    start, end = offsets[0][0], offsets[graph_store.max_token_size - 1][1]
                    text = text[start:end]

                metadata = {
                    "Label": lang_property.label,
                    "Description": lang_property.description,
                    "Date": datetime.now().isoformat(),
                    "PID": pid,
                    "ChunkID": 1,
                    "Language": LANGUAGE,
                    "InstanceOf": item_instanceof,
                    "IsItem": pid.startswith('Q'),
                    "IsProperty": pid.startswith('P'),
                    "DumpDate": DUMPDATE
                }

                # NOTE(review): the id suffix is 0 while ChunkID is 1 — confirm
                # which numbering the collection expects before changing either.
                graph_store.add_document(
                    id=f"{pid}_{LANGUAGE}_0",
                    text=text,
                    metadata=metadata
                )
finally:
    # Release the property-table session even if the loop fails midway.
    property_session.close()

# Flush whatever is still buffered in the final, partially filled batch.
graph_store.push_batch()

In [None]:
# Embed every labelled property, split into as many chunks as the textifier
# produces for the store's token limit.
property_session = Session()
try:
    # Loop variables avoid the name `property`, which is a Python builtin and
    # was previously rebound to two different row types inside the loop.
    for property_row in tqdm(get_property(property_session), total=total_entities):
        pid = property_row.id
        with WikidataLang.get_session() as session:
            lang_property = session.query(WikidataLang)\
                            .filter(WikidataLang.id==pid)\
                            .first()

            # Properties missing from the language database, or without a
            # label, are skipped.
            if lang_property and lang_property.label:
                # "novalue"/"somevalue" snaks carry no 'datavalue'; skip them
                # instead of raising KeyError.
                item_instanceof = [
                    claim['mainsnak']['datavalue']['value']['id']
                    for claim in lang_property.claims.get("P31", [])
                    if 'datavalue' in claim.get('mainsnak', {})
                ]

                chunks = textifier.chunk_text(
                    lang_property,
                    graph_store.tokenizer,
                    max_length=graph_store.max_token_size
                )

                # Chunks are numbered from 1; the same number is used in the
                # document id and in the ChunkID metadata field (previously
                # ChunkID was stuck at 1 for every chunk).
                for chunk_number, chunk in enumerate(chunks, start=1):
                    db_id = f"{pid}_{LANGUAGE}_{chunk_number}"
                    metadata = {
                        "Label": lang_property.label,
                        "Description": lang_property.description,
                        "Date": datetime.now().isoformat(),
                        "PID": pid,
                        "ChunkID": chunk_number,
                        "Language": LANGUAGE,
                        "InstanceOf": item_instanceof,
                        "IsItem": pid.startswith('Q'),
                        "IsProperty": pid.startswith('P'),
                        "DumpDate": DUMPDATE
                    }

                    graph_store.add_document(
                        id=db_id,
                        text=chunk,
                        metadata=metadata
                    )
finally:
    # Release the property-table session even if the loop fails midway.
    property_session.close()

# Flush whatever is still buffered in the final, partially filled batch.
graph_store.push_batch()

In [None]:
def get_entities_api(ids, timeout=30):
    """
    Fetch entity records from the Wikidata ``wbgetentities`` API.

    Parameters:
        ids (str | list | tuple | set): Either a ready-made ``'|'``-separated
            ID string, or a collection of entity IDs (e.g. ['P31', 'Q42']).
            Collections are de-duplicated, keeping first-seen order. The API
            limits how many IDs one request may carry, so callers should pass
            small batches.
        timeout (float): Seconds to wait for the HTTP request before giving up.

    Returns:
        dict: The ``entities`` object of the response, mapping each entity ID
        to its record (labels, descriptions, aliases, claims, sitelinks). An
        empty dict is returned when the request fails, so callers can always
        iterate with ``.items()``.
    """
    if isinstance(ids, (list, tuple, set, frozenset)):
        # dict.fromkeys removes duplicates while keeping a deterministic order.
        ids = '|'.join(dict.fromkeys(ids))

    url = "https://www.wikidata.org/w/api.php"
    params = {
        "action": "wbgetentities",
        "ids": ids,
        "props": "labels|descriptions|aliases|claims|sitelinks",
        "format": "json",
    }

    try:
        # A timeout keeps one stalled connection from hanging the whole run.
        response = requests.get(url, params=params, timeout=timeout)
    except requests.RequestException as error:
        print("Error fetching entities:", error)
        return {}

    if response.status_code != 200:
        print("Error fetching entities:", response.status_code)
        return {}

    data = response.json()
    return data.get("entities", {})

# Re-download every property from the live Wikidata API and store the
# normalised records in the language database.

# Number of IDs sent per wbgetentities request (the API caps IDs per call).
API_BATCH_SIZE = 50

# Collect all property IDs first; the session is closed as soon as the
# listing is complete.
property_session = Session()
try:
    pids = [
        property_row.id
        for property_row in tqdm(get_property(property_session), total=total_entities)
    ]
finally:
    property_session.close()

data_batch = []
for i in tqdm(range(0, len(pids), API_BATCH_SIZE)):
    items = get_entities_api(pids[i:i+API_BATCH_SIZE])
    # A failed request yields an empty (falsy) result; `or {}` skips that batch
    # instead of crashing on a missing .values().
    for item in (items or {}).values():
        # Only records that come back with a 'labels' field are stored.
        if 'labels' in item:
            data_batch.append(WikidataLang.normalise_item(item))

WikidataLang.add_bulk_entities(data_batch)