# How to read and process the Wikidata dump file.

In [None]:
import os
# LANGUAGE is exported before the `src` imports below; the later cells also read
# it back through os.getenv("LANGUAGE", 'en').
# NOTE(review): presumably the `src` modules read LANGUAGE at import time, which
# would make this ordering significant — confirm before reordering.
os.environ["LANGUAGE"] = 'prop' # Specify the language of the entity labels, description, and aliases.

from src.wikidataDumpReader import WikidataDumpReader
from src.wikidataLangDB import create_wikidatalang_db
from src.wikidataEntityDB import WikidataEntity
from multiprocessing import Manager
import time
import os
import json

# WikidataLang is the handle the later cells use to filter, normalise and
# bulk-insert entities (is_in_wikipedia / normalise_item / add_bulk_entities).
# NOTE(review): looks like it is bound to the SQLite file named here — confirm
# against create_wikidatalang_db.
WikidataLang = create_wikidatalang_db(db_filname=f"sqlite_propwiki.db")

#### Reading the compressed Wikidata dump file (.json.bz2) and saving all labels and descriptions to SQLite

In [None]:
# Settings for the labels/descriptions pass. Every value can be overridden
# through an environment variable of the same name.
_env = os.environ.get

FILEPATH = _env("FILEPATH", '../data/Wikidata/latest-all.json.bz2')  # dump file handed to the reader
PUSH_SIZE = int(_env("PUSH_SIZE", 20000))  # pending-row count above which a bulk insert is attempted
QUEUE_SIZE = int(_env("QUEUE_SIZE", 15000))  # forwarded to WikidataDumpReader(queue_size=...)
NUM_PROCESSES = int(_env("NUM_PROCESSES", 4))  # forwarded to WikidataDumpReader(num_processes=...)
SKIPLINES = int(_env("SKIPLINES", 0))  # forwarded to WikidataDumpReader(skiplines=...)
LANGUAGE = _env("LANGUAGE", 'en')

In [None]:
def save_items_to_sqlite(item, data_batch, sqlitDBlock):
    """Queue one dump entity's labels/descriptions and flush a full batch to SQLite.

    Args:
        item: Entity dict from the dump reader (its 'id', 'labels' and
            'descriptions' keys are read), or None, in which case the call
            is a no-op.
        data_batch: Shared list of pending rows. The new row is appended and,
            after a successful bulk insert, the inserted rows are removed.
        sqlitDBlock: Lock used as a context manager around the bulk insert.
    """
    if item is None:
        return

    entity_id = item['id']
    labels = WikidataEntity.clean_label_description(item['labels'])
    descriptions = WikidataEntity.clean_label_description(item['descriptions'])

    data_batch.append({
        'id': entity_id,
        # Compact separators keep the stored JSON free of padding whitespace.
        'labels': json.dumps(labels, separators=(',', ':')),
        'descriptions': json.dumps(descriptions, separators=(',', ':')),
        'in_wikipedia': WikidataEntity.is_in_wikipedia(item),
        # The entity kind is encoded by the ID *prefix* ("P…" / "Q…"), so test
        # the prefix rather than searching for the letter anywhere in the ID.
        'is_property': entity_id.startswith('P'),
        'is_item': entity_id.startswith('Q'),
    })

    with sqlitDBlock:
        if len(data_batch) > PUSH_SIZE:
            # Slicing already yields a detached list; no extra copy is needed.
            chunk = data_batch[:PUSH_SIZE]
            # Rows are dropped only after a successful insert; on failure they
            # stay queued and are retried by a later call.
            if WikidataEntity.add_bulk_items(chunk):
                del data_batch[:PUSH_SIZE]

# Manager-backed lock and list that the callback receives on every call.
multiprocess_manager = Manager()
sqlitDBlock = multiprocess_manager.Lock()
data_batch = multiprocess_manager.list()

wikidata = WikidataDumpReader(FILEPATH, num_processes=NUM_PROCESSES, queue_size=QUEUE_SIZE, skiplines=SKIPLINES)
wikidata.run(lambda item: save_items_to_sqlite(item, data_batch, sqlitDBlock), max_iterations=None, verbose=True)

# Flush whatever is still queued once the reader has finished.
# The rows sent to the database and the rows removed from the queue must be the
# same slice: sending the whole queue while deleting only PUSH_SIZE rows would
# re-insert the tail on the next iteration whenever the backlog exceeds
# PUSH_SIZE. max(..., 1) keeps the loop finite even if PUSH_SIZE is 0.
flush_size = max(PUSH_SIZE, 1)
while len(data_batch) > 0:
    pending = data_batch[:flush_size]
    if WikidataEntity.add_bulk_items(pending):
        del data_batch[:len(pending)]
    else:
        # Insert failed: keep the rows queued and retry after a short pause.
        time.sleep(1)

### Adding entities (label, description, claims, and aliases) of items connected to Wikipedia

In [None]:
# Settings for the entity pass (smaller batch and queue defaults than the
# labels pass). Every value can be overridden through an environment variable
# of the same name.
_env = os.environ.get

FILEPATH = _env("FILEPATH", '../data/Wikidata/latest-all.json.bz2')  # dump file handed to the reader
PUSH_SIZE = int(_env("PUSH_SIZE", 2000))  # pending-entity count above which a bulk insert is attempted
QUEUE_SIZE = int(_env("QUEUE_SIZE", 1500))  # forwarded to WikidataDumpReader(queue_size=...)
NUM_PROCESSES = int(_env("NUM_PROCESSES", 4))  # forwarded to WikidataDumpReader(num_processes=...)
SKIPLINES = int(_env("SKIPLINES", 0))  # forwarded to WikidataDumpReader(skiplines=...)
LANGUAGE = _env("LANGUAGE", 'en')

In [None]:
def save_entities_to_sqlite(item, data_batch, sqlitDBlock):
    """Queue a Wikipedia-linked entity and flush a full batch to SQLite.

    Args:
        item: Entity dict from the dump reader, or None. None and entities for
            which WikidataLang.is_in_wikipedia(...) is falsy are ignored.
        data_batch: Shared list of pending normalised entities; appended to
            and, after a successful bulk insert, trimmed in place.
        sqlitDBlock: Lock used as a context manager around the bulk insert.
    """
    if item is None or not WikidataLang.is_in_wikipedia(item, language=LANGUAGE):
        return

    entity = WikidataLang.normalise_item(item, language=LANGUAGE)
    data_batch.append(entity)

    with sqlitDBlock:
        if len(data_batch) <= PUSH_SIZE:
            return
        head = list(data_batch[:PUSH_SIZE])
        inserted = WikidataLang.add_bulk_entities(head)
        if inserted:
            # Only drop the rows once the insert has been reported successful.
            del data_batch[:PUSH_SIZE]

# Manager-backed lock and list that the callback receives on every call.
multiprocess_manager = Manager()
sqlitDBlock = multiprocess_manager.Lock()
data_batch = multiprocess_manager.list()

wikidata = WikidataDumpReader(FILEPATH, num_processes=NUM_PROCESSES, queue_size=QUEUE_SIZE, skiplines=SKIPLINES)
wikidata.run(lambda item: save_entities_to_sqlite(item, data_batch, sqlitDBlock), max_iterations=None, verbose=True)

# Flush whatever is still queued once the reader has finished.
# The entities sent to the database and the entities removed from the queue
# must be the same slice: sending the whole queue while deleting only PUSH_SIZE
# entries would re-insert the tail on the next iteration whenever the backlog
# exceeds PUSH_SIZE. max(..., 1) keeps the loop finite even if PUSH_SIZE is 0.
flush_size = max(PUSH_SIZE, 1)
while len(data_batch) > 0:
    pending = data_batch[:flush_size]
    if WikidataLang.add_bulk_entities(pending):
        del data_batch[:len(pending)]
    else:
        # Insert failed: keep the entities queued and retry after a short pause.
        time.sleep(1)

In [None]:
# The property pass always normalises with English, regardless of the
# LANGUAGE value read from the environment in the earlier cells.
LANGUAGE = 'en'

def save_properties_to_sqlite(item, data_batch, sqlitDBlock):
    """Queue a property entity and flush a full batch to SQLite.

    Args:
        item: Entity dict from the dump reader, or None. None and entities
            whose 'id' does not start with 'P' are ignored.
        data_batch: Shared list of pending normalised entities; appended to
            and, after a successful bulk insert, trimmed in place.
        sqlitDBlock: Lock used as a context manager around the bulk insert.
    """
    # Properties are identified by the ID *prefix* ("P…"); a substring test
    # would also accept any other ID that merely contains the letter.
    if item is None or not item['id'].startswith('P'):
        return

    data_batch.append(WikidataLang.normalise_item(item, language=LANGUAGE))

    with sqlitDBlock:
        if len(data_batch) > PUSH_SIZE:
            # Slicing already yields a detached list; no extra copy is needed.
            chunk = data_batch[:PUSH_SIZE]
            # Rows are dropped only after a successful insert; on failure they
            # stay queued and are retried by a later call.
            if WikidataLang.add_bulk_entities(chunk):
                del data_batch[:PUSH_SIZE]

# Manager-backed lock and list that the callback receives on every call.
multiprocess_manager = Manager()
sqlitDBlock = multiprocess_manager.Lock()
data_batch = multiprocess_manager.list()

wikidata = WikidataDumpReader(FILEPATH, num_processes=NUM_PROCESSES, queue_size=QUEUE_SIZE, skiplines=SKIPLINES)
wikidata.run(lambda item: save_properties_to_sqlite(item, data_batch, sqlitDBlock), max_iterations=None, verbose=True)

# Flush whatever is still queued once the reader has finished.
# The properties sent to the database and the properties removed from the queue
# must be the same slice: sending the whole queue while deleting only PUSH_SIZE
# entries would re-insert the tail on the next iteration whenever the backlog
# exceeds PUSH_SIZE. max(..., 1) keeps the loop finite even if PUSH_SIZE is 0.
flush_size = max(PUSH_SIZE, 1)
while len(data_batch) > 0:
    pending = data_batch[:flush_size]
    if WikidataLang.add_bulk_entities(pending):
        del data_batch[:len(pending)]
    else:
        # Insert failed: keep the properties queued and retry after a short pause.
        time.sleep(1)