In [1]:
import json
import pickle

import requests

In [2]:
def get_value_and_type(prop_inst):
    """Reduce a Wikidata claim or qualifier to a simple ``(value, type)`` pair.

    Parameters
    ----------
    prop_inst : dict
        Either a claim, whose snak sits under the ``'mainsnak'`` key, or a
        qualifier, where the dict itself is the snak.

    Returns
    -------
    tuple
        ``(value, typ)``. ``value`` is a string and ``typ`` is one of
        ``'object'``, ``'string'``, ``'date'`` or ``'number'``. Both are
        ``None`` when the snak has no ``'datavalue'`` entry.

    Raises
    ------
    ValueError
        If the datavalue type is not one of the kinds handled below.
        (``ValueError`` is an ``Exception``, so ``except Exception`` callers
        keep working.)
    """
    # Claims wrap their snak in 'mainsnak'; qualifiers are bare snaks.
    main_info = prop_inst['mainsnak'] if 'mainsnak' in prop_inst else prop_inst
    if 'datavalue' not in main_info:
        return (None, None)

    datavalue = main_info['datavalue']
    prop_type = datavalue['type']

    if prop_type == 'wikibase-entityid':
        entity = datavalue['value']
        # An entity reference may point at a property as well as an item.
        # Only property references get the 'P' prefix; anything else
        # (including a missing 'entity-type') keeps the original 'Q' prefix.
        prefix = 'P' if entity.get('entity-type') == 'property' else 'Q'
        value = '{}{}'.format(prefix, entity['numeric-id'])
        typ = 'object'
    elif prop_type == 'string':
        value = str(datavalue['value'])
        typ = 'string'
    elif prop_type == 'time':
        # Skip the first character and keep the next ten -- presumably the
        # sign followed by YYYY-MM-DD; assumes a four-digit year (TODO confirm).
        value = str(datavalue['value']['time'][1:11])
        typ = 'date'
    elif prop_type == 'monolingualtext':
        value = str(datavalue['value']['text'])
        typ = 'string'
    elif prop_type == 'quantity':
        value = str(datavalue['value']['amount'])
        typ = 'number'
    elif prop_type == 'globecoordinate':
        coordinate = datavalue['value']
        value = '({}, {})'.format(coordinate['latitude'], coordinate['longitude'])
        typ = 'string'
    else:
        # Name the offending type so the failure can be diagnosed.
        raise ValueError('Unknown property type: {!r}'.format(prop_type))
    return (value, typ)

In [3]:
def get_info(d):
    """Collect the English label, English Wikipedia title and English
    description of an entity.

    Parameters
    ----------
    d : dict
        Entity record; the ``'labels'``, ``'sitelinks'`` and
        ``'descriptions'`` sections are each optional.

    Returns
    -------
    dict
        Keys ``'name'``, ``'wikilink'`` and ``'description'``; a value is
        the empty string when its section or entry is absent.
    """
    def lookup(section, entry, field):
        # Same membership tests as a chained "in" check, so a section that
        # is present but does not contain the entry also falls back to ''.
        container = d[section] if section in d else ()
        if entry in container:
            return container[entry][field]
        return ''

    return {
        'name': lookup('labels', 'en', 'value'),
        'wikilink': lookup('sitelinks', 'enwiki', 'title'),
        'description': lookup('descriptions', 'en', 'value'),
    }

In [4]:
all_properties_set = set() # ids of every property seen as a claim key or a qualifier key
first_types_set = set() # ids of the P31 values (types) of the entities read from the dump
all_instances_set = set() # ids of the dump's entities plus every entity-valued claim/qualifier value

In [5]:
first_instances_dict = {} # entity id -> get_info() dict plus a 'types' set, one entry per dump entity

In [6]:
# all_properties_dict = {} # all properties (relationships and attributes)
# first_types_dict = {} # types of source instances and their superclasses
# first_instances_dict = {} # source instances and their types

In [7]:
%%time

total = 0  # lines read
count = 0  # lines processed completely
error = 0  # lines that raised while being processed
error_log = []  # (line number, repr of the exception) for every failed line
# Read the file as UTF-8 explicitly so parsing does not depend on the platform's
# default encoding (assumes the dump is UTF-8, the standard JSON encoding -- confirm).
with open('data/wikidata-20150907-star_trek.json', encoding='utf-8') as f:
    for line in f:
        total += 1
        try:
            # Presumably every entity line ends with a separating comma. Strip only
            # real trailing commas so that a line without one (e.g. the last entity)
            # is not truncated into invalid JSON.
            cleaned = line.strip().rstrip(',')
            d = json.loads(cleaned)
            ide = d['id']
            info = get_info(d)
            info['types'] = set()
            first_instances_dict[ide] = info
            all_instances_set.add(ide)
            for prop in d['claims']:
                all_properties_set.add(prop)
                for prop_inst in d['claims'][prop]:
                    value, typ = get_value_and_type(prop_inst)
                    if typ == 'object':
                        all_instances_set.add(value)
                        # P31 values are recorded as the types of this entity.
                        if prop == 'P31':
                            first_types_set.add(value)
                            first_instances_dict[ide]['types'].add(value)
                    if 'qualifiers' in prop_inst:
                        qualifiers = prop_inst['qualifiers']
                        for q in qualifiers:
                            all_properties_set.add(q)
                            for q_inst in qualifiers[q]:
                                q_value, q_type = get_value_and_type(q_inst)
                                if q_type == 'object':
                                    all_instances_set.add(q_value)
            count += 1
        except Exception as e:
            # Best effort: keep reading, but remember what failed so it can be
            # inspected afterwards instead of being silently discarded.
            error += 1
            error_log.append((total, repr(e)))
print('Total:', total)
print('Count:', count)
print('Error:', error)

Total: 1630
Count: 1628
Error: 2
CPU times: user 242 ms, sys: 0 ns, total: 242 ms
Wall time: 240 ms


In [8]:
# Number of distinct property ids seen as claim keys or qualifier keys.
len(all_properties_set)

182

In [9]:
# Number of distinct entity ids that appear as a P31 value of a dump entity.
len(first_types_set)

67

In [10]:
# Number of distinct ids: dump entities plus every entity referenced by a claim or qualifier.
len(all_instances_set)

2602

In [11]:
# Number of dump entities that were registered while reading the file.
len(first_instances_dict)

1628

In [12]:
# Ids that were referenced somewhere but have no entry of their own in first_instances_dict.
rest_instances_set = {ide for ide in all_instances_set if ide not in first_instances_dict}

In [13]:
# Number of referenced ids that are missing from first_instances_dict.
len(rest_instances_set)

974

In [49]:
# Convert the id sets to lists: sets cannot be sliced, and the ids are later
# cut into fixed-size chunks for the API requests.
all_properties_list = list(all_properties_set)
first_types_list = list(first_types_set)
rest_instances_list = list(rest_instances_set)

In [50]:
# Result tables, each mapping an entity id to the dict returned by get_info();
# they are filled from the wbgetentities API responses.
all_properties_dict = {}
all_types_dict = {}
other_instances_dict = {}

In [51]:
# %%time

base_url = 'https://www.wikidata.org/w/api.php'
limit = 50  # ids per request; wbgetentities caps the 'ids' parameter (50 for regular clients)


def fetch_entity_info(ids, url=base_url, chunk_size=limit):
    """Look up label, enwiki title and description for a list of entity ids.

    Parameters
    ----------
    ids : list of str
        Entity ids; they are sent in chunks of ``chunk_size`` ids per request.
    url : str
        API endpoint that receives the ``wbgetentities`` requests.
    chunk_size : int
        Maximum number of ids sent in one request.

    Returns
    -------
    dict
        Maps every entity id key of the responses to the dict built by
        ``get_info`` for that entity.

    Raises
    ------
    requests.HTTPError
        If the server answers with an HTTP error status.
    RuntimeError
        If a JSON response has no ``'entities'`` key.
    """
    info_by_id = {}
    for start in range(0, len(ids), chunk_size):
        chunk = ids[start:start + chunk_size]
        payload = {'action': 'wbgetentities', 'ids': '|'.join(chunk), 'languages': 'en', 'format': 'json'}
        # requests applies no timeout by default; without one a stalled
        # connection would hang the cell indefinitely.
        r = requests.get(url, params=payload, timeout=30)
        r.raise_for_status()
        response = r.json()
        if 'entities' not in response:
            # Fail with the API's own message rather than a bare KeyError.
            raise RuntimeError('wbgetentities failed: {}'.format(response.get('error', response)))
        entities = response['entities']
        for ide in entities:
            info_by_id[ide] = get_info(entities[ide])
    return info_by_id


# Fill the previously created result dicts in place, one lookup per id list.
all_properties_dict.update(fetch_entity_info(all_properties_list))
# The type and instance ids live in first_types_list / rest_instances_list; the
# names all_types_list / other_instances_list that this cell used before are not
# defined in any visible cell and raise NameError on a fresh kernel.
all_types_dict.update(fetch_entity_info(first_types_list))
other_instances_dict.update(fetch_entity_info(rest_instances_list))

In [54]:
# all_instances_dict is not created in any visible cell, so calling .update() on it
# fails on a fresh kernel. Build it here from the entities read from the dump
# (shallow copy, so first_instances_dict itself is left untouched) and add the
# entities fetched from the API.
# NOTE(review): the recorded size 2602 = 1628 + 974 matches this construction -- confirm.
all_instances_dict = dict(first_instances_dict)
all_instances_dict.update(other_instances_dict)

In [52]:
# Number of property ids for which info was stored.
len(all_properties_dict)

182

In [53]:
# Number of type ids for which info was stored.
len(all_types_dict)

67

In [55]:
# Number of entries in the combined instance table.
len(all_instances_dict)

2602

In [18]:
# Persist the lookup tables as pickle files under pickle/ (open() does not create
# the directory, so it must already exist).
with open('pickle/star-trek_all_properties_dict.pickle', 'wb') as f:
    pickle.dump(all_properties_dict, f)

with open('pickle/star-trek_all_types_dict.pickle', 'wb') as f:
    pickle.dump(all_types_dict, f)

with open('pickle/star-trek_all_instances_dict.pickle', 'wb') as f:
    pickle.dump(all_instances_dict, f)

# NOTE(review): all_instances_and_types_dict and hierarchy are not defined in any
# visible cell; unless they are created elsewhere, the two dumps below raise
# NameError on a fresh kernel (after the file has already been opened for
# writing) -- confirm where they come from.
with open('pickle/star-trek_all_instances_and_types_dict.pickle', 'wb') as f:
    pickle.dump(dict(all_instances_and_types_dict), f)

with open('pickle/star-trek_hierarchy_graph.pickle', 'wb') as f:
    pickle.dump(hierarchy, f)

In [58]:
# Load an earlier properties table to compare against the current one.
# NOTE(review): the '-old' file is not written by any visible cell -- presumably a
# manually kept copy of a previous run; confirm. Only unpickle files you trust.
with open('pickle/star-trek_all_properties_dict-old.pickle', 'rb') as f:
    old = pickle.load(f)

In [59]:
# Number of entries in the previously saved properties table.
len(old)

177

In [67]:
# Show every property id that is not a key of the old table, each followed by a blank line.
new_property_ids = all_properties_set.difference(old)
for p in new_property_ids:
    print(p, all_properties_dict[p], end='\n\n')

P554 {'wikilink': '', 'description': 'username on a website that the person or organization has an account on (only use as qualifier of P553)', 'name': 'website username'}

P453 {'wikilink': '', 'description': 'specific role played or filled by subject -- use only as qualifier of "cast member" (P161) or "voice actor" (P725)', 'name': 'character role'}

P794 {'wikilink': '', 'description': 'generic qualifier', 'name': 'as'}

P642 {'wikilink': '', 'description': 'qualifiers stating that a statement applies within the scope of a particular item', 'name': 'of'}

P585 {'wikilink': '', 'description': 'time and date something took place, existed or a statement was true', 'name': 'point in time'}



In [71]:
# for k in all_properties_dict:
#     print(k, all_properties_dict[k]['wikilink'])