In [2]:
import os
import time

import pandas as pd
from tqdm import tqdm
from pathlib import Path
from qwikidata.linked_data_interface import get_entity_dict_from_api
from qwikidata.entity import WikidataItem, WikidataProperty, WikidataLexeme

In [3]:
# Read one ticker -> Wikidata-ID mapping file per exchange. Column names are
# supplied explicitly, and each frame is tagged with the exchange it came from.
nyse_tickers = pd.read_csv('NYSE_wiki.csv', names=['ticker', 'wikidataid']).assign(mkt='NYSE')
nasdaq_tickers = pd.read_csv('NASDAQ_wiki.csv', names=['ticker', 'wikidataid']).assign(mkt='NASDAQ')

# Stack NYSE rows first, then NASDAQ rows; each frame keeps its own row labels.
tickers = pd.concat([nyse_tickers, nasdaq_tickers], axis=0)

# Sanity check: stacking must neither drop nor duplicate rows.
assert len(tickers) == len(nyse_tickers) + len(nasdaq_tickers)

In [4]:
# Keep only the rows whose Wikidata ID is not the literal placeholder 'unknown'.
tickers = tickers.loc[tickers['wikidataid'].ne('unknown')]

In [8]:
def _fetch_company(entity_id, remaining, retries=1, delay=5):
    """Download one Wikidata entity, retrying after a pause when the call raises.

    Parameters
    ----------
    entity_id : str
        Wikidata ID handed to ``get_entity_dict_from_api``.
    remaining : int
        Number of IDs still queued; only used in the progress messages.
    retries : int, default 1
        Extra attempts made after the first failure.
    delay : float, default 5
        Seconds to sleep before each retry.

    Returns
    -------
    dict or None
        The entity dict, or ``None`` when every attempt raised.
    """
    for attempt in range(retries + 1):
        try:
            entity = get_entity_dict_from_api(entity_id)
        except Exception:
            # Catch ``Exception`` rather than using a bare ``except``: a kernel
            # interrupt (KeyboardInterrupt) must stop this long-running loop
            # instead of being counted as a failed download.
            if attempt == retries:
                return None
            time.sleep(delay)
            print(f'Failed in retrieving: {entity_id} | Number of tickers left: {remaining}')
            print('Retrying')
        else:
            print(f'Succeeded in retrieving: {entity_id} | Number of tickers left: {remaining}')
            return entity
    return None


def _item_triplets(company, listing):
    """Build one row per claim of ``company`` whose main snak references an item.

    Parameters
    ----------
    company : dict
        Entity dict as returned by ``get_entity_dict_from_api``; its ``'id'``
        and ``'claims'`` keys are read, and ``labels.en.value`` when present.
    listing : dict
        The ``'ticker'``, ``'id'`` and ``'mkt'`` values copied into every row.

    Returns
    -------
    list of dict
        Rows with keys source, source_name, relationship, target, ticker, id,
        mkt. Empty when the entity has no item-valued claims.

    Raises
    ------
    KeyError
        If ``company`` lacks ``'id'`` or ``'claims'``, or a statement lacks
        ``'mainsnak'``.
    """
    # An entity without an English label still has usable relationships, so
    # fall back to the 'NIL' placeholder instead of discarding all of them.
    source_name = company.get('labels', {}).get('en', {}).get('value', 'NIL')

    rows = []
    for prop, statements in company['claims'].items():
        for statement in statements:
            snak = statement['mainsnak']
            if snak.get('datatype') != 'wikibase-item':
                continue
            # "No value" / "unknown value" snaks carry no 'datavalue'; skip just
            # that claim rather than aborting the whole entity.
            if 'datavalue' not in snak:
                continue
            rows.append({
                'source': company['id'],
                'source_name': source_name,
                'relationship': prop,
                'target': 'Q' + str(snak['datavalue']['value']['numeric-id']),
                'ticker': listing['ticker'],
                'id': listing['id'],
                'mkt': listing['mkt'],
            })
    return rows


# Walk every Wikidata ID in `tickers`, download the entity and collect its
# item-valued claims as (source, relationship, target) rows.
company_triplets = []
tickers_list = list(tickers.wikidataid.values)
failed_tickers = []  # IDs whose download raised on both attempts
print(f'Total number of tickers: {len(tickers_list)}')

while tickers_list:
    sel_ticker = tickers_list.pop()
    print(f'Trying to retrieve: {sel_ticker}')

    company = _fetch_company(sel_ticker, remaining=len(tickers_list))
    if company is None:
        failed_tickers.append(sel_ticker)
        continue

    # Look the listing up once per entity, by the ID that was requested. The
    # API may answer with a different (redirected) ID, which would not match
    # any row of `tickers`; 'source' keeps the returned ID so redirects can
    # still be spotted later as rows where source != id. When one ID maps to
    # several tickers, the first matching row is used.
    match = tickers.loc[tickers.wikidataid == sel_ticker].iloc[0]
    listing = {'ticker': match['ticker'], 'id': match['wikidataid'], 'mkt': match['mkt']}

    try:
        rows = _item_triplets(company, listing)
    except Exception:
        # Unexpected entity structure: record a single placeholder row so the
        # company is still represented. Rows are only appended after the whole
        # entity has been processed, so a failure never leaves partial output.
        rows = [{'source': company.get('id', sel_ticker), 'source_name': 'NIL',
                 'relationship': 'NIL', 'target': 'NIL',
                 'ticker': listing['ticker'], 'id': listing['id'], 'mkt': listing['mkt']}]
    company_triplets.extend(rows)

Total number of tickers: 1109
Trying to retrieve: Q8075252
Succeeded in retrieving: Q8075252 | Number of tickers left: 1108
Trying to retrieve: Q8072579
Succeeded in retrieving: Q8072579 | Number of tickers left: 1107
Trying to retrieve: Q7089136
Succeeded in retrieving: Q7089136 | Number of tickers left: 1106
Trying to retrieve: Q2443609
Succeeded in retrieving: Q2443609 | Number of tickers left: 1105
Trying to retrieve: Q8046104
Succeeded in retrieving: Q8046104 | Number of tickers left: 1104
Trying to retrieve: Q8046013
Succeeded in retrieving: Q8046013 | Number of tickers left: 1103
Trying to retrieve: Q5281
Succeeded in retrieving: Q5281 | Number of tickers left: 1102
Trying to retrieve: Q834888
Succeeded in retrieving: Q834888 | Number of tickers left: 1101
Trying to retrieve: Q1046482
Succeeded in retrieving: Q1046482 | Number of tickers left: 1100
Trying to retrieve: Q2274089
Succeeded in retrieving: Q2274089 | Number of tickers left: 1099
Trying to retrieve: Q8033550
Succeeded

In [9]:
# Display the Wikidata IDs that were appended to `failed_tickers` because
# retrieval raised on both the first attempt and the retry.
failed_tickers

['Q7447744', 'Q20656950', 'Q6030036', 'Q16976069', 'Q5332702', 'Q6394386']

In [23]:
# Turn the accumulated list of row dicts into a single table (one row per dict).
all_company_triplets = pd.DataFrame(data=company_triplets)

In [21]:
# One-column table of the IDs that could not be retrieved.
# NOTE(review): the column is labelled 'ticker' but `failed_tickers` holds
# Wikidata IDs; the label is kept as-is so the exported file layout is unchanged.
latest_failed_tickers = pd.DataFrame(
    data=failed_tickers,
    columns=['ticker'],
)

In [22]:
# Display the table built from `failed_tickers` (single column named 'ticker').
latest_failed_tickers

Unnamed: 0,ticker
0,Q7447744
1,Q20656950
2,Q6030036
3,Q16976069
4,Q5332702
5,Q6394386


In [24]:
# Persist the triplet table twice: as CSV (without the row index) and as HDF5
# under the key 'data'.
# NOTE(review): `data_dir` is not defined in any cell visible here; it must
# already exist in the kernel (presumably a pathlib.Path - confirm).
triplets_csv_path = data_dir / 'full_company_relationships_triplets_Jun2021.csv'
triplets_hdf_path = data_dir / 'full_company_relationships_triplets_Jun2021_keydata.h5'
all_company_triplets.to_csv(triplets_csv_path, index=False)
all_company_triplets.to_hdf(triplets_hdf_path, key='data')

In [25]:
# Write the failed-ID table to CSV without the row index.
# NOTE(review): `data_dir` is not defined in any cell visible here; it must
# already exist in the kernel (presumably a pathlib.Path - confirm).
failed_csv_path = data_dir / 'failed_tickers_Jun2021.csv'
latest_failed_tickers.to_csv(failed_csv_path, index=False)