In [1]:
import json
import os
import pickle
import re

import requests

In [2]:
# Accumulators filled while scanning the dump; duplicates are kept at this
# stage. The first holds every key found under an entity's 'claims', the
# second holds every 'Q…' ID referenced by a P31 claim.
all_relationships_list, all_types_list = [], []

In [3]:
%%time
total = 0
count = 0
error = 0
with open('data/wikidata-20150907-star_trek.json') as f:
    for line in f:
        try:
            total += 1
            cleaned = line.strip()[:-1]
            d = json.loads(cleaned)
            for rel in d['claims']:
                all_relationships_list.append(rel)
                if rel == 'P31':
                    types = d['claims']['P31']
                    for t in types:
                        typ = 'Q' + str(t['mainsnak']['datavalue']['value']['numeric-id'])
                        all_types_list.append(typ)
            count += 1
        except Exception as e:
            error += 1
print('Total:', total)
print('Count:', count)
print('Error:', error)

Total: 1630
Count: 1628
Error: 2
CPU times: user 224 ms, sys: 682 µs, total: 225 ms
Wall time: 200 ms


In [4]:
# Number of property keys collected across all entities (duplicates included).
len(all_relationships_list)

9647

In [5]:
# Number of P31 targets collected across all entities (duplicates included).
len(all_types_list)

1408

In [6]:
# Collapse both lists to their unique IDs. sorted() is used instead of
# list(set(...)) because set iteration order for strings changes between
# kernel restarts (hash randomisation); sorting keeps the list order, and
# therefore the request chunks built from it, reproducible across runs.
all_relationships_list = sorted(set(all_relationships_list))
all_types_list = sorted(set(all_types_list))

In [7]:
# Number of distinct property IDs left after de-duplication.
len(all_relationships_list)

177

In [8]:
# Number of distinct P31 target IDs left after de-duplication.
len(all_types_list)

67

In [9]:
# Lookup tables from entity ID to its English label: one for the property
# IDs, one for the P31 target IDs. Both start empty and are filled from the
# Wikidata API.
all_relationships_dict, all_types_dict = {}, {}

In [10]:
%%time
base_url = 'https://www.wikidata.org/w/api.php'
limit = 50

chunks = [all_relationships_list[x:x+limit] for x in range(0, len(all_relationships_list), limit)]
for chunk in chunks:
    relationship_ids = '|'.join(chunk)
    payload = {'action': 'wbgetentities', 'ids': relationship_ids, 'languages': 'en', 'format': 'json'}
    r = requests.get(base_url, params=payload)
    response = r.json()
    entities = response['entities']
    for ide in entities:
        all_relationships_dict[ide] = entities[ide]['labels']['en']['value']

chunks = [all_types_list[x:x+limit] for x in range(0, len(all_types_list), limit)]
for chunk in chunks:
    type_ids = '|'.join(chunk)
    payload = {'action': 'wbgetentities', 'ids': type_ids, 'languages': 'en', 'format': 'json'}
    r = requests.get(base_url, params=payload)
    response = r.json()
    entities = response['entities']
    for ide in entities:
        all_types_dict[ide] = entities[ide]['labels']['en']['value']

CPU times: user 211 ms, sys: 4.24 ms, total: 215 ms
Wall time: 3.3 s


In [11]:
# Number of property IDs for which a label was stored.
len(all_relationships_dict)

177

In [12]:
# Number of P31 target IDs for which a label was stored.
len(all_types_dict)

67

In [13]:
# Display the property ID -> English label mapping.
all_relationships_dict

{'P1005': 'PTBNP identifier',
 'P1006': 'NTA identifier (Netherlands)',
 'P101': 'field of work',
 'P1019': 'feed URL',
 'P103': 'native language',
 'P106': 'occupation',
 'P1074': 'fictional analog of',
 'P1080': 'from fictional universe',
 'P110': 'illustrator',
 'P1104': 'number of pages',
 'P1151': "topic's main Wikimedia portal",
 'P1165': 'home world',
 'P1196': 'manner of death',
 'P1204': "Wikimedia portal's main topic",
 'P1207': 'NUKAT (WarsawU) authorities',
 'P122': 'basic form of government',
 'P123': 'publisher',
 'P1233': 'ISFDB author ID',
 'P1235': 'ISFDB series ID',
 'P1237': 'Box Office Mojo film ID',
 'P1245': 'OmegaWiki Defined Meaning',
 'P1258': 'Rotten Tomatoes identifier',
 'P1265': 'AlloCiné movie ID',
 'P1273': 'CANTIC',
 'P1284': 'Munzinger IBA',
 'P131': 'located in the administrative territorial entity',
 'P135': 'movement',
 'P136': 'genre',
 'P137': 'operator',
 'P138': 'named after',
 'P140': 'religion',
 'P1407': 'MusicBrainz series ID',
 'P1412': 'lan

In [14]:
# Display the P31 target ID -> English label mapping.
all_types_dict

{'Q1055307': 'fictional technology',
 'Q1114502': 'American comic book',
 'Q11266439': 'Wikimedia template',
 'Q1137046': 'Star Trek: The Next Generation episode',
 'Q11424': 'film',
 'Q1226311': 'Emergency Medical Hologram',
 'Q1257934': 'clip show',
 'Q13406463': 'Wikimedia list article',
 'Q134556': 'single',
 'Q14623646': 'fictional organization',
 'Q14637321': 'fictional spacecraft',
 'Q15053464': 'fictional material',
 'Q15238535': 'fictional government or government body',
 'Q15416': 'television program',
 'Q15632617': 'fictional human',
 'Q15633582': 'MediaWiki site',
 'Q15707583': 'fictional taxon',
 'Q15773317': 'television character',
 'Q15773347': 'film character',
 'Q15831596': 'fictional class',
 'Q1659605': 'asteroids in fiction',
 'Q1700422': 'extraterrestrial from Star Trek',
 'Q17633526': 'Wikinews article',
 'Q18039177': 'fictional spacecraft class',
 'Q18533645': 'spacecraft in Star Trek',
 'Q18572946': 'fictional space station',
 'Q186117': 'timeline',
 'Q19595246'

In [15]:
# Persist both label lookups so they can be reloaded without querying the
# API again.
# open(..., 'wb') does not create missing directories, so make sure the
# output folder exists before writing.
os.makedirs('pickle', exist_ok=True)

for path, labels in (
    ('pickle/star-trek_all_relationships_dict.pickle', all_relationships_dict),
    ('pickle/star-trek_all_types_dict.pickle', all_types_dict),
):
    with open(path, 'wb') as f:
        pickle.dump(labels, f)