In [1]:
import json
import pickle
from collections import defaultdict

import networkx as nx
import requests

In [3]:
with open('pickle/star-trek_all_instances_and_types_dict.pickle', 'rb') as f:
    all_instances_and_types_dict = pickle.load(f)

with open('pickle/star-trek_only_relevant_relationships_dict.pickle', 'rb') as f:
    only_relevant_relationships_dict = pickle.load(f)

with open('pickle/star-trek_only_relevant_types_dict.pickle', 'rb') as f:
    only_relevant_types_dict = pickle.load(f)

with open('pickle/star-trek_hierarchy_graph.pickle', 'rb') as f:
    hierarchy = pickle.load(f)

In [4]:
new_types_set = set() # new types for relevant relationships
new_instances_set = set() # new instances for relevant relationships
new_instances_and_types_dict = defaultdict(set) # new instances for relevant relationships and their types

In [5]:
def get_types(ide):
    types = []
    base_url = 'https://www.wikidata.org/w/api.php'
    payload = {'action': 'wbgetentities', 'ids': ide, 'languages': 'en', 'format': 'json'}
    r = requests.get(base_url, params=payload)
    response = r.json()
    d = response['entities'][ide]
    for rel in d['claims']:
        if rel == 'P31':
            for rel_inst in d['claims']['P31']:
                value = 'Q' + str(rel_inst['mainsnak']['datavalue']['value']['numeric-id'])
                types.append(value)
    return types

In [6]:
visited = set(hierarchy.nodes(data=False))

In [7]:
def add_to_hierarchy(c):
    if c not in visited:
        hierarchy.add_node(c)
        base_url = 'https://www.wikidata.org/w/api.php'
        payload = {'action': 'wbgetentities', 'ids': c, 'languages': 'en', 'format': 'json'}
        r = requests.get(base_url, params=payload)
        response = r.json()
        d = response['entities'][c]
        if 'claims' in d and 'P279' in d['claims']:
            for rel_inst in d['claims']['P279']:
                if 'datavalue' in rel_inst['mainsnak']:
                    super_c = 'Q' + str(rel_inst['mainsnak']['datavalue']['value']['numeric-id'])
                    hierarchy.add_node(super_c)
                    hierarchy.add_edge(c, super_c, {'type': 'is_subclass_of'})
                    hierarchy.add_edge(super_c, c, {'type': 'is_superclass_of'})
                    add_to_hierarchy(super_c)
        visited.add(c)

In [8]:
%%time
total = 0
count = 0
error = 0
with open('data/wikidata-20150907-star_trek.json') as f:
    for line in f:
        try:
            total += 1
            cleaned = line.strip()[:-1]
            d = json.loads(cleaned)
            for rel in d['claims']:
                if rel in only_relevant_relationships_dict:
                    for rel_inst in d['claims'][rel]:
                        type_rel = rel_inst['mainsnak']['datavalue']['type']
                        if type_rel == 'wikibase-entityid':
                            value = 'Q' + str(rel_inst['mainsnak']['datavalue']['value']['numeric-id'])
                            if value not in all_instances_and_types_dict and value not in new_instances_set:
                                new_instances_set.add(value)
                                for t in get_types(value):
                                    if t not in only_relevant_types_dict and t not in new_types_set:
                                        new_types_set.add(t)
            count += 1
        except Exception as e:
            error += 1
print('Total:', total)
print('Count:', count)
print('Error:', error)

Total: 1630
Count: 1628
Error: 2
CPU times: user 5.65 s, sys: 178 ms, total: 5.82 s
Wall time: 1min 2s


In [9]:
len(new_types_set)

72

In [10]:
len(new_instances_set)

233

In [11]:
new_types_list = list(new_types_set)
new_instances_set = list(new_instances_set)

In [12]:
new_types_dict = {}
new_instances_dict = {}

In [13]:
base_url = 'https://www.wikidata.org/w/api.php'
limit = 50

chunks = [new_types_list[x:x+limit] for x in range(0, len(new_types_list), limit)]
for chunk in chunks:
    type_ids = '|'.join(chunk)
    payload = {'action': 'wbgetentities', 'ids': type_ids, 'languages': 'en', 'format': 'json'}
    r = requests.get(base_url, params=payload)
    response = r.json()
    entities = response['entities']
    for ide in entities:
        try:
            new_types_dict[ide] = entities[ide]['labels']['en']['value']
        except Exception as e:
            new_types_dict[ide] = ''

chunks = [new_instances_set[x:x+limit] for x in range(0, len(new_instances_set), limit)]
for chunk in chunks:
    type_ids = '|'.join(chunk)
    payload = {'action': 'wbgetentities', 'ids': type_ids, 'languages': 'en', 'format': 'json'}
    r = requests.get(base_url, params=payload)
    response = r.json()
    entities = response['entities']
    for ide in entities:
        try:
            new_instances_dict[ide] = entities[ide]['labels']['en']['value']
        except Exception as e:
            new_instances_dict[ide] = ''

In [14]:
len(new_types_dict)

72

In [15]:
len(new_instances_dict)

233

In [20]:
new_instances_dict

{'Q1044478': "shoot 'em up",
 'Q1055307': 'fictional technology',
 'Q10676': 'Sega Mega Drive',
 'Q10677': 'PlayStation',
 'Q10680': 'PlayStation 2',
 'Q10683': 'PlayStation 3',
 'Q1093941': 'Toronto City Hall',
 'Q110154': 'George Takei',
 'Q1114502': 'American comic book',
 'Q11266439': 'Wikimedia template',
 'Q1140363': 'educational game',
 'Q11424': 'film',
 'Q1143118': 'interactive fiction',
 'Q1150710': 'strategy game',
 'Q116553': 'Jean Piccard',
 'Q1190578': 'space warfare',
 'Q120877': 'cloning',
 'Q1226311': 'Emergency Medical Hologram',
 'Q1257934': 'clip show',
 'Q132020': 'Xbox',
 'Q13406463': 'Wikimedia list article',
 'Q134556': 'single',
 'Q13915': 'xkcd',
 'Q13974': 'NBC',
 'Q1406': 'Microsoft Windows',
 'Q14116': 'OS X',
 'Q1436734': 'adventure',
 'Q145': 'United Kingdom',
 'Q14623646': 'fictional organization',
 'Q14637321': 'fictional spacecraft',
 'Q1475303': 'Microvision',
 'Q14864330': 'Teen',
 'Q14915515': 'PEGI 12',
 'Q14920387': 'USK 0',
 'Q14920392': 'USK 12'

In [16]:
len(all_instances_and_types_dict)

1394

In [17]:
all_instances_and_types_dict.update(new_instances_dict)

In [18]:
len(all_instances_and_types_dict)

1627

In [19]:
with open('pickle/star-trek_new_types_dict.pickle', 'wb') as f:
    pickle.dump(new_types_dict, f)
    
with open('pickle/star-trek_all_instances_and_types_dict.pickle', 'wb') as f:
    pickle.dump(all_instances_and_types_dict, f)