In [1]:
import json
import pickle
from collections import defaultdict

import networkx as nx
import requests

In [2]:
visited = set()  # Wikidata class ids that add_to_hierarchy has already expanded


def add_to_hierarchy(c):
    """Add class ``c`` and, recursively, all of its superclasses to ``hierarchy``.

    The entity ``c`` is fetched from the Wikidata ``wbgetentities`` API. For
    every ``P279`` claim that carries a ``datavalue``, the superclass is added
    as a node together with two directed edges: ``c -> super`` with
    ``type='is_subclass_of'`` and ``super -> c`` with
    ``type='is_superclass_of'``. The superclass is then expanded the same way.

    Uses the module-level ``hierarchy`` graph and ``visited`` set. Ids already
    in ``visited`` are skipped without issuing a request.

    Parameters
    ----------
    c : str
        Wikidata entity id such as ``'Q5'``.

    Raises
    ------
    requests.RequestException
        If the HTTP request fails, times out or returns an error status.
    KeyError
        If the response holds no entry for ``c``.
    """
    if c in visited:
        return

    # Mark the class before recursing: if the P279 chain contains a cycle,
    # marking it only afterwards would recurse until RecursionError.
    visited.add(c)
    try:
        hierarchy.add_node(c)
        base_url = 'https://www.wikidata.org/w/api.php'
        payload = {'action': 'wbgetentities', 'ids': c, 'languages': 'en', 'format': 'json'}
        # A timeout keeps one stalled connection from hanging the whole scan.
        r = requests.get(base_url, params=payload, timeout=30)
        r.raise_for_status()
        d = r.json()['entities'][c]
        if 'claims' in d and 'P279' in d['claims']:
            for rel_inst in d['claims']['P279']:
                mainsnak = rel_inst['mainsnak']
                if 'datavalue' not in mainsnak:
                    # Claim without a concrete value: nothing to link to.
                    continue
                super_c = 'Q' + str(mainsnak['datavalue']['value']['numeric-id'])
                hierarchy.add_node(super_c)
                # Keyword attributes are accepted by networkx 1.x and 2.x+;
                # a positional attribute dict raises TypeError on 2.x+.
                hierarchy.add_edge(c, super_c, type='is_subclass_of')
                hierarchy.add_edge(super_c, c, type='is_superclass_of')
                add_to_hierarchy(super_c)
    except Exception:
        # Leave the class unmarked so a later call retries it instead of
        # treating a half-expanded class as done.
        visited.discard(c)
        raise

In [3]:
# Accumulators filled while scanning the dump.
all_properties_set = set() # ids of all properties: relationships and attributes
all_types_set = set() # ids of all types that some instance has
all_instances_set = set() # ids of all instances
all_instances_and_types_dict = defaultdict(set) # instance id -> set of its type ids
hierarchy = nx.DiGraph() # full type hierarchy as a directed graph

In [4]:
%%time

total = 0  # lines read from the dump
count = 0  # lines processed to the end without an exception
error = 0  # lines that raised while being parsed or processed
failed_lines = []  # (1-based line number, repr of the exception) per failure

# JSON text is UTF-8 by specification; do not depend on the locale default.
with open('data/wikidata-20150907-star_trek.json', encoding='utf-8') as f:
    for line in f:
        try:
            total += 1
            # Assumes one JSON entity per line, separated by a trailing comma.
            # Strip only an actual comma, so a line without one (e.g. the
            # last entity) is not truncated into invalid JSON.
            cleaned = line.strip().rstrip(',')
            d = json.loads(cleaned)
            ide = d['id']
            all_instances_set.add(ide)
            for prop in d['claims']:
                all_properties_set.add(prop)
                if prop == 'P31':
                    for prop_inst in d['claims']['P31']:
                        mainsnak = prop_inst['mainsnak']
                        if 'datavalue' not in mainsnak:
                            # No concrete type value: skip this claim, as
                            # add_to_hierarchy does for P279, rather than
                            # discarding the whole entity.
                            continue
                        value = 'Q' + str(mainsnak['datavalue']['value']['numeric-id'])
                        all_types_set.add(value)
                        all_instances_and_types_dict[ide].add(value)
                        add_to_hierarchy(value)
            count += 1
        except Exception as e:
            # Best effort: keep scanning, but record what failed so the
            # error count can be investigated afterwards.
            error += 1
            failed_lines.append((total, repr(e)))
print('Total:', total)
print('Count:', count)
print('Error:', error)

Total: 1630
Count: 1628
Error: 2
CPU times: user 7.77 s, sys: 319 ms, total: 8.09 s
Wall time: 1min 1s


In [5]:
# Number of distinct property ids found in the claims of the scanned entities.
len(all_properties_set)

177

In [6]:
# Number of distinct class ids that occur as a P31 value.
len(all_types_set)

67

In [7]:
# Number of entities whose line parsed far enough to yield an id.
len(all_instances_set)

1628

In [8]:
# Number of classes in the type hierarchy.
hierarchy.number_of_nodes()

234

In [9]:
# Number of directed edges in the type hierarchy.
hierarchy.number_of_edges()

640

In [10]:
# Sanity check: the graph's nodes are exactly the classes marked as visited.
visited == set(hierarchy.nodes())

True

In [11]:
# Lists (rather than sets) so the ids can be sliced into fixed-size chunks.
all_properties_list = list(all_properties_set)
all_types_list = list(all_types_set)
all_instances_list = list(all_instances_set)

In [12]:
# Lookup tables: entity id -> English label ('' when no label is available).
all_properties_dict = {}
all_types_dict = {}
all_instances_dict = {}

In [13]:
%%time

base_url = 'https://www.wikidata.org/w/api.php'
limit = 50  # ids per request; presumably the API's per-request cap -- TODO confirm


def _collect_english_labels(ids, labels, chunk_size=limit):
    """Store the English label of every id in ``ids`` into ``labels``.

    The ids are requested from the Wikidata ``wbgetentities`` API in chunks of
    ``chunk_size``. Each entity key found in a response is written to
    ``labels``; an entity without a usable English label is stored as ``''``.

    Parameters
    ----------
    ids : list of str
        Entity ids to look up.
    labels : dict
        Mapping updated in place (entity id -> label). Entries are written
        chunk by chunk, so results of completed chunks survive a later failure.
    chunk_size : int, optional
        Number of ids sent per request.

    Raises
    ------
    requests.RequestException
        If a request fails, times out or returns an error status.
    KeyError
        If a response has no ``'entities'`` member.
    """
    for start in range(0, len(ids), chunk_size):
        chunk = ids[start:start + chunk_size]
        payload = {'action': 'wbgetentities', 'ids': '|'.join(chunk), 'languages': 'en', 'format': 'json'}
        # A timeout keeps one stalled connection from hanging the cell.
        r = requests.get(base_url, params=payload, timeout=30)
        r.raise_for_status()
        entities = r.json()['entities']
        for ide, entity in entities.items():
            try:
                labels[ide] = entity['labels']['en']['value']
            except (KeyError, TypeError):
                # No English label, or the labels member is not a mapping.
                labels[ide] = ''


_collect_english_labels(all_properties_list, all_properties_dict)
_collect_english_labels(all_types_list, all_types_dict)
_collect_english_labels(all_instances_list, all_instances_dict)

CPU times: user 1.8 s, sys: 79.1 ms, total: 1.88 s
Wall time: 27 s


In [14]:
# Number of property ids that received a label entry (possibly '').
len(all_properties_dict)

177

In [15]:
# Number of type ids that received a label entry (possibly '').
len(all_types_dict)

67

In [16]:
# Number of instance ids that received a label entry (possibly '').
len(all_instances_dict)

1628

In [17]:
# Number of instances with at least one P31 type; an instance without one
# never gets a key, so this can be smaller than the number of instances.
len(all_instances_and_types_dict)

1394

In [18]:
# Persist each lookup table and the hierarchy graph to its own pickle file.
pickle_targets = (
    ('pickle/star-trek_all_properties_dict.pickle', all_properties_dict),
    ('pickle/star-trek_all_types_dict.pickle', all_types_dict),
    ('pickle/star-trek_all_instances_dict.pickle', all_instances_dict),
    # Stored as a plain dict rather than a defaultdict.
    ('pickle/star-trek_all_instances_and_types_dict.pickle', dict(all_instances_and_types_dict)),
    ('pickle/star-trek_hierarchy_graph.pickle', hierarchy),
)
for path, obj in pickle_targets:
    with open(path, 'wb') as f:
        pickle.dump(obj, f)