In [1]:
import time
from pathlib import Path
from tqdm import tqdm
import pprint
import pandas as pd
from qwikidata.entity import WikidataItem
from qwikidata.json_dump import WikidataJsonDump

In [3]:
# Wikidata entity dumps can be downloaded from
# https://archive.org/details/wikimediadownloads?and[]="Wikidata+entity+dumps"
# The dump must be the bz2-compressed JSON file; this notebook was run against
# wikidata-20201130-all.json.bz2.
date = 'wikidata-20201130-all'
filename = f'{date}.json.bz2'

# Directory holding the ticker CSVs read below and receiving the exported
# triplet files. The later cells use `data_dir`, but no cell in this notebook
# defined it, so a fresh-kernel "Run All" failed with NameError.
# NOTE(review): the original location is not recorded anywhere in the
# notebook; the current directory is an assumption - point this at the folder
# that contains NASDAQ_wiki.csv and NYSE_wiki.csv.
data_dir = Path('.')

# Create the dump reader. The dump itself is still opened by the bare file
# name, i.e. relative to the working directory, exactly as before.
wjd_dump_path = filename
wjd = WikidataJsonDump(wjd_dump_path)

In [4]:
# One use of Wikidata is building an entity-to-entity knowledge graph.
# Here the entities of interest are the companies listed on NYSE and NASDAQ,
# so load the ticker -> Wikidata id mapping of each exchange (header-less CSVs)
# and tag every row with the market it came from.
nasdaq_tickers = pd.read_csv(
    data_dir/'NASDAQ_wiki.csv', names=['ticker', 'wikidataid']
).assign(mkt='NASDAQ')
nyse_tickers = pd.read_csv(
    data_dir/'NYSE_wiki.csv', names=['ticker', 'wikidataid']
).assign(mkt='NYSE')

# Stack both exchanges row-wise (NYSE first); no rows may be lost on the way.
tickers = pd.concat([nyse_tickers, nasdaq_tickers])
assert len(nasdaq_tickers) + len(nyse_tickers) == len(tickers)

In [9]:
# Scan the whole dump once and collect company -> item relationship triplets.
#
# This runs for a long time (the recorded run visited ~90.9M entities in about
# 19 hours). Entities in the dump are either 'item' or 'property'; we keep the
# items whose id belongs to one of the listed companies. Each such item has a
# 'claims' mapping of property id -> list of statements. A statement is turned
# into a triplet only when its main snak has datatype 'wikibase-item', because
# claims can also point at other kinds of values (quantities, strings, ...).
company_list = list(tickers.wikidataid)

# First listing row per Wikidata id (the same row the previous `.values[0]`
# look-ups selected). A dict makes the membership test O(1) per entity instead
# of a list scan, and avoids re-filtering `tickers` three times per triplet.
listing_by_id = {}
for ticker_row in tickers.itertuples(index=False):
    listing_by_id.setdefault(ticker_row.wikidataid, ticker_row)


def company_item_triplets(company, listing):
    """Return the triplet rows for one company entity.

    Parameters
    ----------
    company : dict
        Entity record yielded by the dump for a listed company.
    listing : namedtuple
        Row of ``tickers`` for that company; its ``ticker``, ``wikidataid``
        and ``mkt`` fields are copied onto every returned row.

    Returns
    -------
    list of dict
        One row per statement whose main snak has datatype 'wikibase-item'
        and actually carries a value. If the record lacks an expected key
        (for example there is no English label), a single placeholder row
        with 'NIL' as name, relationship and target is returned instead, so
        the company is still represented in the output.
    """
    listing_fields = {'ticker': listing.ticker, 'id': listing.wikidataid, 'mkt': listing.mkt}
    try:
        rows = []
        for relationship, statements in company['claims'].items():
            for statement in statements:
                mainsnak = statement['mainsnak']
                if mainsnak.get('datatype') != 'wikibase-item':
                    continue
                # "unknown value" / "no value" statements carry no datavalue.
                # Skip just that statement; previously the resulting KeyError
                # discarded every remaining claim of the company.
                if 'datavalue' not in mainsnak:
                    continue
                rows.append({
                    'source': company['id'],
                    'source_name': company['labels']['en']['value'],
                    'relationship': relationship,
                    'target': 'Q' + str(mainsnak['datavalue']['value']['numeric-id']),
                    **listing_fields,
                })
        return rows
    except (KeyError, TypeError):
        # Only malformed/incomplete records are tolerated here. Anything else,
        # notably KeyboardInterrupt during the long scan, now propagates
        # instead of being swallowed by a bare `except:`.
        return [{'source': company['id'], 'source_name': 'NIL', 'relationship': 'NIL',
                 'target': 'NIL', **listing_fields}]


company_triplets = []
for entity_dict in tqdm(wjd):
    if entity_dict["type"] != "item":
        continue
    matched_listing = listing_by_id.get(entity_dict["id"])
    if matched_listing is not None:
        company_triplets.extend(company_item_triplets(entity_dict, matched_listing))

90880584it [19:18:58, 1306.91it/s]


In [10]:
# One DataFrame row per collected triplet dict; the dict keys become columns.
full_company_triplets = pd.DataFrame(data=company_triplets)

In [11]:
# Persist the triplets twice, tagging both file names with the dump date:
# a plain CSV (without the index column) and an HDF5 store under key 'data'.
csv_filename = f'full_company_relationships_triplets_{date}.csv'
hdf_filename = f'full_company_relationships_triplets_keydata_{date}.h5'
full_company_triplets.to_csv(data_dir / csv_filename, index=False)
full_company_triplets.to_hdf(data_dir / hdf_filename, key='data')