In [1]:
import json
import pickle
from collections import defaultdict

import networkx as nx
import requests

In [2]:
# Module-level accumulators; they are filled in by the dump-parsing cell
# further down and by add_to_hierarchy().
all_relationships_set = set() # every claim property id seen on any entity in the dump
all_types_set = set() # every Q-id that appears as the target of a P31 claim
all_instances_set = set() # the id of every entity parsed from the dump
all_instances_and_types_dict = defaultdict(set) # entity id -> set of its P31 target Q-ids
hierarchy = nx.DiGraph() # class graph; add_to_hierarchy() stores each P279 link as two opposite typed edges

In [3]:
# Class ids that add_to_hierarchy() has already processed; used to avoid
# fetching the same entity twice.
visited = set()

In [4]:
def add_to_hierarchy(c):
    """Add class ``c`` and, recursively, all of its superclasses to ``hierarchy``.

    The entity ``c`` is fetched from the Wikidata ``wbgetentities`` API and
    every P279 claim that carries a value is followed. Each link is stored
    as two directed edges: ``c -> super`` with ``type='is_subclass_of'`` and
    ``super -> c`` with ``type='is_superclass_of'``.

    Args:
        c: Wikidata item id as a string (e.g. ``'Q5'``).

    Raises:
        requests.RequestException: on network failure, timeout or a non-2xx
            HTTP status.
        KeyError: if the API response does not contain ``c``.

    Side effects:
        Mutates the module-level ``hierarchy`` graph and ``visited`` set. If
        anything fails, ``c`` is removed from ``visited`` again so that a later
        call retries it.
    """
    if c in visited:
        return

    # Mark *before* recursing: P279 links can form a cycle, and marking only
    # after the recursion would then recurse without bound.
    visited.add(c)
    try:
        hierarchy.add_node(c)
        base_url = 'https://www.wikidata.org/w/api.php'
        payload = {'action': 'wbgetentities', 'ids': c, 'props': 'claims',
                   'languages': 'en', 'format': 'json'}
        # A timeout keeps one stalled connection from hanging the whole crawl.
        r = requests.get(base_url, params=payload, timeout=30)
        r.raise_for_status()
        d = r.json()['entities'][c]
        if 'claims' in d and 'P279' in d['claims']:
            for rel_inst in d['claims']['P279']:
                # Snaks without 'datavalue' have no target item; skip them.
                if 'datavalue' in rel_inst['mainsnak']:
                    super_c = 'Q' + str(rel_inst['mainsnak']['datavalue']['value']['numeric-id'])
                    hierarchy.add_node(super_c)
                    # Keyword attributes work on networkx 1.x and 2.x+; a
                    # positional attribute dict is rejected from 2.0 on.
                    hierarchy.add_edge(c, super_c, type='is_subclass_of')
                    hierarchy.add_edge(super_c, c, type='is_superclass_of')
                    add_to_hierarchy(super_c)
    except BaseException:
        # Leave c retryable: an incomplete node must not look finished.
        visited.discard(c)
        raise

In [5]:
%%time
# Stream the dump one line at a time; each line is expected to hold one JSON
# entity, optionally followed by a comma.
total = 0  # lines read
count = 0  # lines processed without an exception
error = 0  # lines that raised (unparseable lines, malformed entities, API failures)
failed_lines = []  # (1-based line number, repr(exception)) for every line counted in `error`
with open('data/wikidata-20150907-star_trek.json', encoding='utf-8') as f:
    for line in f:
        total += 1
        try:
            # Remove only a trailing comma. Dropping the last character
            # unconditionally would corrupt a record that has no comma.
            cleaned = line.strip().rstrip(',')
            d = json.loads(cleaned)
            ide = d['id']
            all_instances_set.add(ide)
            for rel in d['claims']:
                all_relationships_set.add(rel)
                if rel == 'P31':
                    for rel_inst in d['claims']['P31']:
                        mainsnak = rel_inst['mainsnak']
                        # Same guard as add_to_hierarchy(): a snak without
                        # 'datavalue' names no type, so skip just that snak
                        # instead of abandoning the whole entity.
                        if 'datavalue' not in mainsnak:
                            continue
                        typ = 'Q' + str(mainsnak['datavalue']['value']['numeric-id'])
                        all_types_set.add(typ)
                        all_instances_and_types_dict[ide].add(typ)
                        add_to_hierarchy(typ)
            count += 1
        except Exception as e:
            # Best effort: keep going, but remember what failed so it can be
            # inspected afterwards instead of vanishing into a counter.
            error += 1
            failed_lines.append((total, repr(e)))
print('Total:', total)
print('Count:', count)
print('Error:', error)

Total: 1630
Count: 1628
Error: 2
CPU times: user 5.68 s, sys: 93.6 ms, total: 5.78 s
Wall time: 1min 3s


In [6]:
# Number of distinct claim property ids collected from the dump.
len(all_relationships_set)

177

In [7]:
# Number of distinct Q-ids that occur as a P31 target in the dump.
len(all_types_set)

67

In [8]:
# Number of distinct entity ids parsed from the dump.
len(all_instances_set)

1628

In [9]:
# Number of classes in the hierarchy graph (P31 targets plus the superclasses
# reached through P279).
len(hierarchy.nodes(data=False))

234

In [10]:
# Number of directed edges; add_to_hierarchy() adds two (one per direction)
# for every P279 link it follows.
len(hierarchy.edges(data=True))

640

In [11]:
# Sanity check: the set of graph nodes and the set of classes processed by
# add_to_hierarchy() should be identical.
set(hierarchy.nodes(data=False)) == visited

True

In [12]:
# Materialise the id sets as lists so they can be sliced into fixed-size
# batches for the label lookups.
all_relationships_list = list(all_relationships_set)
all_types_list = list(all_types_set)
all_instances_list = list(all_instances_set)

In [13]:
# id -> English label mappings, filled by the label lookup cell.
all_relationships_dict = {}
all_types_dict = {}
all_instances_dict = {}

In [14]:
base_url = 'https://www.wikidata.org/w/api.php'
# Ids sent per wbgetentities request.
# NOTE(review): presumably chosen to match the API's per-request id cap - confirm.
limit = 50


def fetch_english_labels(entity_ids, batch_size=limit):
    """Look up the English label of every id in ``entity_ids``.

    The ids are sent to the Wikidata ``wbgetentities`` API in batches of
    ``batch_size``. Only the ``labels`` property is requested, because that is
    the only part of the response that is read.

    Args:
        entity_ids: list of Wikidata ids (items or properties) as strings.
        batch_size: maximum number of ids per HTTP request.

    Returns:
        dict mapping each id returned by the API to its English label, or to
        ``''`` when the entity has no English label.

    Raises:
        requests.RequestException: on network failure, timeout or a non-2xx
            HTTP status.
        KeyError: if a response carries no ``'entities'`` member.
    """
    labels = {}
    for start in range(0, len(entity_ids), batch_size):
        batch = entity_ids[start:start + batch_size]
        payload = {'action': 'wbgetentities', 'ids': '|'.join(batch), 'props': 'labels',
                   'languages': 'en', 'format': 'json'}
        r = requests.get(base_url, params=payload, timeout=30)
        r.raise_for_status()
        for ide, entity in r.json()['entities'].items():
            try:
                labels[ide] = entity['labels']['en']['value']
            except (KeyError, TypeError):
                # No English label, or 'labels' is not a mapping.
                labels[ide] = ''
    return labels


# One lookup per id family; the helper replaces three pasted copies of the loop.
all_relationships_dict.update(fetch_english_labels(all_relationships_list))
all_types_dict.update(fetch_english_labels(all_types_list))
all_instances_dict.update(fetch_english_labels(all_instances_list))

In [15]:
# Number of property ids for which the API returned an entry.
len(all_relationships_dict)

177

In [16]:
# Number of type ids for which the API returned an entry.
len(all_types_dict)

67

In [17]:
# Number of instance ids for which the API returned an entry.
len(all_instances_dict)

1628

In [18]:
# Number of instances with at least one recorded P31 type; keys are created
# only when a type is added, so untyped instances do not appear here.
len(all_instances_and_types_dict)

1394

In [19]:
# Persist every collected structure; each entry pairs a destination path with
# the object written to it. The 'pickle/' directory must already exist.
pickle_targets = [
    ('pickle/star-trek_all_relationships_dict.pickle', all_relationships_dict),
    ('pickle/star-trek_all_types_dict.pickle', all_types_dict),
    ('pickle/star-trek_all_instances_dict.pickle', all_instances_dict),
    ('pickle/star-trek_all_instances_and_types_dict.pickle', all_instances_and_types_dict),
    ('pickle/star-trek_hierarchy_graph.pickle', hierarchy),
]

for target_path, payload_obj in pickle_targets:
    with open(target_path, 'wb') as f:
        pickle.dump(payload_obj, f)