In [1]:
import json
import pickle

import networkx as nx
import requests

In [2]:
def get_value_and_type(prop_inst):
    """Reduce a Wikidata claim or qualifier snak to a ``(value, type)`` pair.

    Args:
        prop_inst: Either a claim (the snak is nested under ``'mainsnak'``)
            or a qualifier entry (the dict is the snak itself).

    Returns:
        A ``(value, typ)`` tuple, or ``(None, None)`` when the snak has no
        ``'datavalue'``. ``value`` is always a string and ``typ`` is one of
        ``'object'``, ``'string'``, ``'date'`` or ``'number'``.

    Raises:
        Exception: If the datavalue type is not one of the handled types.
    """
    # Claims wrap their snak under 'mainsnak'; qualifiers are the snak itself.
    main_info = prop_inst['mainsnak'] if 'mainsnak' in prop_inst else prop_inst
    if 'datavalue' not in main_info:
        return (None, None)

    datavalue = main_info['datavalue']
    prop_type = datavalue['type']
    if prop_type == 'wikibase-entityid':
        entity = datavalue['value']
        # Property-valued claims must yield a 'P' id; prefixing everything
        # with 'Q' would point at an unrelated item. Anything that is not
        # explicitly a property keeps the historical 'Q' prefix.
        prefix = 'P' if entity.get('entity-type') == 'property' else 'Q'
        value = '{}{}'.format(prefix, entity['numeric-id'])
        typ = 'object'
    elif prop_type == 'string':
        value = str(datavalue['value'])
        typ = 'string'
    elif prop_type == 'time':
        # Characters 1..10 of the time string: skips the first character
        # and keeps the next ten.
        value = str(datavalue['value']['time'][1:11])
        typ = 'date'
    elif prop_type == 'monolingualtext':
        value = str(datavalue['value']['text'])
        typ = 'string'
    elif prop_type == 'quantity':
        value = str(datavalue['value']['amount'])
        typ = 'number'
    elif prop_type == 'globecoordinate':
        latitude = datavalue['value']['latitude']
        longitude = datavalue['value']['longitude']
        value = '({}, {})'.format(latitude, longitude)
        typ = 'string'
    else:
        raise Exception('Unknown property type')
    return (value, typ)

In [3]:
def get_property_info(d):
    """Return the English label and description of an entity record.

    Args:
        d: Entity dict; ``'labels'`` and ``'descriptions'`` are optional.

    Returns:
        ``{'name': ..., 'description': ...}``; a field is ``''`` when the
        record has no English entry for it.
    """
    # Start from the empty defaults and overwrite whatever is available.
    info = {'name': '', 'description': ''}
    if 'labels' in d and 'en' in d['labels']:
        info['name'] = d['labels']['en']['value']
    if 'descriptions' in d and 'en' in d['descriptions']:
        info['description'] = d['descriptions']['en']['value']
    return info

def get_superclasses(d):
    """Collect the ids referenced by the ``P279`` claims of an entity.

    Args:
        d: Entity dict; ``'claims'`` is optional.

    Returns:
        Set of id strings, empty when the entity has no ``P279`` claims.
    """
    if 'claims' not in d or 'P279' not in d['claims']:
        return set()
    superclasses = set()
    for prop_inst in d['claims']['P279']:
        value = get_value_and_type(prop_inst)[0]
        # A snak without a datavalue yields None; keeping it would break
        # later '|'.join(...) calls and graph edge creation.
        if value is not None:
            superclasses.add(value)
    return superclasses

def get_types(d):
    """Collect the ids referenced by the ``P31`` claims of an entity.

    Args:
        d: Entity dict; ``'claims'`` is optional.

    Returns:
        Set of id strings, empty when the entity has no ``P31`` claims.
    """
    if 'claims' not in d or 'P31' not in d['claims']:
        return set()
    types = set()
    for prop_inst in d['claims']['P31']:
        value = get_value_and_type(prop_inst)[0]
        # A snak without a datavalue yields None; keeping it would break
        # later '|'.join(...) calls on the collected type ids.
        if value is not None:
            types.add(value)
    return types

def get_instance_info(d):
    """Return the English label, enwiki page title and description of an entity.

    Args:
        d: Entity dict; ``'labels'``, ``'sitelinks'`` and ``'descriptions'``
            are all optional.

    Returns:
        ``{'name': ..., 'wikilink': ..., 'description': ...}``; a field is
        ``''`` when the record does not carry the corresponding entry.
    """
    def pick(section, key, field):
        # d[section][key][field] when both levels are present, else ''.
        if section in d and key in d[section]:
            return d[section][key][field]
        return ''

    return {
        'name': pick('labels', 'en', 'value'),
        'wikilink': pick('sitelinks', 'enwiki', 'title'),
        'description': pick('descriptions', 'en', 'value'),
    }

In [4]:
# Accumulators filled while scanning the dump:
#   all_properties_set - all properties (relationships and attributes)
#   first_types_set    - types of source instances
#   all_instances_set  - all instances
all_properties_set, first_types_set, all_instances_set = set(), set(), set()

In [5]:
# Source instances keyed by id; each value holds info, types and properties.
first_instances_dict = dict()

In [6]:
%%time

total = 0
count = 0
error = 0
with open('data/wikidata-20150907-star_trek.json') as f:
    for line in f:
        try:
            total += 1
            cleaned = line.strip()[:-1]
            d = json.loads(cleaned)
            ide = d['id']
            info = get_instance_info(d)
            first_instances_dict[ide] = info
            first_instances_dict[ide]['types'] = set()
            first_instances_dict[ide]['properties'] = {}
            all_instances_set.add(ide)
            for prop in d['claims']:
                all_properties_set.add(prop)
                p_values = []
                for prop_inst in d['claims'][prop]:
                    p_value, p_type = get_value_and_type(prop_inst)
                    if p_type == 'object':
                        all_instances_set.add(p_value)
                        if prop == 'P31':
                            first_types_set.add(p_value)
                            first_instances_dict[ide]['types'].add(p_value)
                    qualifiers = {}
                    if 'qualifiers' in prop_inst:
                        for q in prop_inst['qualifiers']:
                            all_properties_set.add(q)
                            q_values = []
                            for q_inst in prop_inst['qualifiers'][q]:
                                q_value, q_type = get_value_and_type(q_inst)
                                q_values.append({'value': q_value, 'type': q_type})
                                if q_type == 'object':
                                    all_instances_set.add(q_value)
                            qualifiers[q] = q_values
                    p_values.append({'value': p_value, 'type': p_type, 'qualifiers': qualifiers})
                first_instances_dict[ide]['properties'][prop] = p_values
            count += 1
        except Exception as e:
            error += 1
print('Total:', total)
print('Count:', count)
print('Error:', error)

Total: 1630
Count: 1628
Error: 2
CPU times: user 292 ms, sys: 12.1 ms, total: 304 ms
Wall time: 304 ms


In [7]:
# Number of distinct property ids seen as claim keys or qualifier keys.
len(all_properties_set)

182

In [8]:
# Number of distinct entity ids used as P31 values by the dump entities.
len(first_types_set)

67

In [9]:
# Dump entities plus every entity id referenced as a claim or qualifier value.
len(all_instances_set)

2602

In [10]:
# Number of entities stored from the dump.
len(first_instances_dict)

1628

In [11]:
# Referenced entity ids that are not themselves records of the dump.
rest_instances_set = {ide for ide in all_instances_set if ide not in first_instances_dict}

In [12]:
# Number of referenced entities that still have to be looked up.
len(rest_instances_set)

974

In [13]:
# Lists are needed because the id collections are sliced into batches later.
all_properties_list, rest_instances_list = list(all_properties_set), list(rest_instances_set)

In [14]:
# Lookup tables keyed by entity id, filled from the API responses.
all_properties_dict, rest_instances_dict = dict(), dict()

In [15]:
%%time

base_url = 'https://www.wikidata.org/w/api.php'
limit = 50

chunks = [all_properties_list[x:x+limit] for x in range(0, len(all_properties_list), limit)]
for chunk in chunks:
    property_ids = '|'.join(chunk)
    payload = {'action': 'wbgetentities', 'ids': property_ids, 'languages': 'en', 'format': 'json'}
    r = requests.get(base_url, params=payload)
    response = r.json()
    entities = response['entities']
    for ide in entities:
        all_properties_dict[ide] = get_property_info(entities[ide])

chunks = [rest_instances_list[x:x+limit] for x in range(0, len(rest_instances_list), limit)]
for chunk in chunks:
    instance_ids = '|'.join(chunk)
    payload = {'action': 'wbgetentities', 'ids': instance_ids, 'languages': 'en', 'format': 'json'}
    r = requests.get(base_url, params=payload)
    response = r.json()
    entities = response['entities']
    for ide in entities:
        rest_instances_dict[ide] = get_instance_info(entities[ide])
        rest_instances_dict[ide]['types'] = get_types(entities[ide])
        rest_instances_dict[ide]['properties'] = {}

CPU times: user 1.48 s, sys: 52.7 ms, total: 1.53 s
Wall time: 21.8 s


In [16]:
# Sanity check: number of properties returned by the API.
len(all_properties_dict)

182

In [17]:
# Sanity check: number of referenced entities returned by the API.
len(rest_instances_dict)

974

In [18]:
# Union of the 'types' sets of all referenced (non-dump) entities.
rest_types_set = set().union(*(info['types'] for info in rest_instances_dict.values()))

In [19]:
# Number of distinct types among the referenced (non-dump) entities.
len(rest_types_set)

170

In [20]:
# Types of the dump entities together with types of the referenced entities.
all_types_set = first_types_set | rest_types_set

In [21]:
# A list is needed because the type ids are sliced into batches later.
all_types_list = [type_id for type_id in all_types_set]

In [22]:
# Type records keyed by entity id, filled from the API responses.
all_types_dict = dict()

In [23]:
%%time

base_url = 'https://www.wikidata.org/w/api.php'
limit = 50

chunks = [all_types_list[x:x+limit] for x in range(0, len(all_types_list), limit)]
for chunk in chunks:
    type_ids = '|'.join(chunk)
    payload = {'action': 'wbgetentities', 'ids': type_ids, 'languages': 'en', 'format': 'json'}
    r = requests.get(base_url, params=payload)
    response = r.json()
    entities = response['entities']
    for ide in entities:
        all_types_dict[ide] = get_instance_info(entities[ide])
        all_types_dict[ide]['superclasses'] = get_superclasses(entities[ide])    

CPU times: user 273 ms, sys: 3.84 ms, total: 277 ms
Wall time: 2.85 s


In [24]:
# Number of type records returned by the API.
len(all_types_dict)

217

In [25]:
# Ids of the superclasses of the known types.
supertypes_set = set([])

In [26]:
# Merge the 'superclasses' set of every known type into supertypes_set.
supertypes_set = supertypes_set.union(*(info['superclasses'] for info in all_types_dict.values()))

In [27]:
# Number of distinct superclass ids referenced by the known types.
len(supertypes_set)

225

In [28]:
# A list is needed because the superclass ids are sliced into batches later.
supertypes_list = [type_id for type_id in supertypes_set]

In [29]:
# Superclass records keyed by entity id, filled from the API responses.
supertypes_dict = dict()

In [30]:
%%time

base_url = 'https://www.wikidata.org/w/api.php'
limit = 50

chunks = [supertypes_list[x:x+limit] for x in range(0, len(supertypes_list), limit)]
for chunk in chunks:
    type_ids = '|'.join(chunk)
    payload = {'action': 'wbgetentities', 'ids': type_ids, 'languages': 'en', 'format': 'json'}
    r = requests.get(base_url, params=payload)
    response = r.json()
    entities = response['entities']
    for ide in entities:
        supertypes_dict[ide] = get_instance_info(entities[ide])
        supertypes_dict[ide]['superclasses'] = get_superclasses(entities[ide])    

CPU times: user 260 ms, sys: 16.4 ms, total: 276 ms
Wall time: 3.03 s


In [31]:
# Number of superclass records returned by the API.
len(supertypes_dict)

225

In [32]:
# Fold the superclass records into the type table; an id present in both is
# overwritten by the superclass record.
for type_id, info in supertypes_dict.items():
    all_types_dict[type_id] = info

In [33]:
# Number of type records after the superclass records were merged in.
len(all_types_dict)

389

In [34]:
# Shallow merge of dump entities and referenced entities into a new dict;
# on a shared id the referenced-entity record wins.
all_instances_dict = {}
for source in (first_instances_dict, rest_instances_dict):
    all_instances_dict.update(source)

In [35]:
# Number of entities in the merged instance table.
len(all_instances_dict)

2602

In [36]:
# Empty directed graph that will hold the type ids as nodes.
hierarchy = nx.DiGraph() # full type hierarchy

In [37]:
# One node per known type and one edge from each type to each of its
# superclasses; superclass ids without a record of their own become nodes too.
for ide, info in all_types_dict.items():
    hierarchy.add_node(ide)
    hierarchy.add_edges_from((ide, parent) for parent in info['superclasses'])

In [38]:
# Number of nodes in the graph, including ids that only appear as edge targets.
len(hierarchy)

492

In [39]:
# Component count of the hierarchy with edge direction ignored: the directed
# graph is first converted to an undirected nx.Graph.
nx.number_connected_components(nx.Graph(hierarchy))

23

In [40]:
# Summary
summary = (
    ('Properties:', all_properties_dict),
    ('Types:', all_types_dict),
    ('Instances:', all_instances_dict),
    ('Classes:', hierarchy),  # len() of the graph is its node count
)
for label, collection in summary:
    print(label, len(collection))

Properties: 182
Types: 389
Instances: 2602
Classes: 492


In [41]:
# Persist the four result objects, one pickle file each.
outputs = (
    ('pickle/star-trek_all_properties_dict.pickle', all_properties_dict),
    ('pickle/star-trek_all_types_dict.pickle', all_types_dict),
    ('pickle/star-trek_all_instances_dict.pickle', all_instances_dict),
    ('pickle/star-trek_hierarchy_graph.pickle', hierarchy),
)
for path, obj in outputs:
    with open(path, 'wb') as f:
        pickle.dump(obj, f)