In [36]:
import json
import pickle

import requests

In [18]:
with open('pickle/star-trek_only_relevant_relationships_dict.pickle', 'rb') as f:
    only_relevant_relationships_dict = pickle.load(f)

with open('pickle/star-trek_only_relevant_types_dict.pickle', 'rb') as f:
    only_relevant_types_dict = pickle.load(f)

In [19]:
only_relevant_relationships_dict

{'P103': 'native language',
 'P1074': 'fictional analog of',
 'P1080': 'from fictional universe',
 'P1235': 'ISFDB series ID',
 'P1237': 'Box Office Mojo film ID',
 'P1258': 'Rotten Tomatoes identifier',
 'P1265': 'AlloCiné movie ID',
 'P135': 'movement',
 'P136': 'genre',
 'P138': 'named after',
 'P1407': 'MusicBrainz series ID',
 'P1412': 'languages spoken or published',
 'P1434': 'describes the fictional universe',
 'P144': 'based on',
 'P1441': 'present in work',
 'P1445': 'fictional universe described in',
 'P1476': 'title',
 'P155': 'follows',
 'P156': 'followed by',
 'P1562': 'AllMovie movie ID',
 'P1651': 'YouTube video identifier',
 'P17': 'country',
 'P1712': 'Metacritic ID',
 'P179': 'series',
 'P180': 'depicts',
 'P1804': 'Danish National Filmography ID',
 'P1811': 'list of episodes',
 'P1813': 'short name',
 'P1970': 'MovieMeter movie ID',
 'P2002': 'Twitter username',
 'P2003': 'Instagram username',
 'P2013': 'Facebook ID',
 'P212': 'ISBN-13',
 'P243': 'OCLC control numbe

In [20]:
only_relevant_types_dict

{'Q1114502': 'American comic book',
 'Q1137046': 'Star Trek: The Next Generation episode',
 'Q11424': 'film',
 'Q1257934': 'clip show',
 'Q134556': 'single',
 'Q15416': 'television program',
 'Q186117': 'timeline',
 'Q196600': 'media franchise',
 'Q1983062': 'episode',
 'Q24856': 'film series',
 'Q24862': 'short film',
 'Q278379': 'a priori language',
 'Q3252662': 'series finale',
 'Q3297186': 'limited series',
 'Q33215': 'constructed language',
 'Q3464665': 'television season',
 'Q35127': 'website',
 'Q386724': 'work',
 'Q41298': 'magazine',
 'Q4176708': 'soundtrack album',
 'Q4234303': 'WikiProject',
 'Q482994': 'album',
 'Q5398426': 'television series',
 'Q559618': 'fictional universe',
 'Q571': 'book',
 'Q581714': 'animated series',
 'Q7889': 'video game',
 'Q838795': 'comic strip'}

In [375]:
class Tree(object):
    def __init__(self, data):
        self.node = data
        self.children = []
    
    def __repr__(self):
        return str(self.paths())
    
    def add_child(self, tree):
        self.children.append(tree)
    
    def inorder(self):
        result = [self.node]
        for ch in self.children:
            result.extend(ch.inorder())
        return result
    
    def search(self, data):
        return data in self.inorder()
    
    def paths(self):
        if not self.children:
            ps = [[self.node]]
        else:
            ps = [[self.node] + p for ch in self.children for p in ch.paths()]
        return ps

In [376]:
x=Tree(5)
y=Tree(4)
z=Tree(3)
a=Tree(1)
b=Tree(2)
y.add_child(a)
y.add_child(b)
x.add_child(y)
x.add_child(z)

In [377]:
x

[[5, 4, 1], [5, 4, 2], [5, 3]]

In [378]:
x.inorder()

[5, 4, 1, 2, 3]

In [373]:
def get_superclasses(c):
    superclasses = Tree(c)
    base_url = 'https://www.wikidata.org/w/api.php'
    payload = {'action': 'wbgetentities', 'ids': c, 'languages': 'en', 'format': 'json'}
    r = requests.get(base_url, params=payload)
    response = r.json()
    d = response['entities'][c]
    if 'claims' in d and 'P279' in d['claims']:
        for rel_inst in d['claims']['P279']:
            if 'datavalue' in rel_inst['mainsnak']:
                super_c = 'Q' + str(rel_inst['mainsnak']['datavalue']['value']['numeric-id'])
                superclasses.add_child(get_superclasses(super_c))
    return superclasses

In [381]:
%%time
total = 0
count = 0
error = 0
all_types_only_rr_tree_dict = {} # all types for only relevant relationships tree dict
with open('data/wikidata-20150907-star_trek.json') as f:
    for line in f:
#         try:
        if not line.startswith('[') and not line.startswith(']'):
            total += 1
            cleaned = line.strip()[:-1]
            d = json.loads(cleaned)
            for rel in d['claims']:
                if rel in only_relevant_relationships_dict:
                    for rel_inst in d['claims'][rel]:
                        type_rel = rel_inst['mainsnak']['datavalue']['type']
                        if type_rel == 'wikibase-entityid':
                            typ = 'Q' + str(rel_inst['mainsnak']['datavalue']['value']['numeric-id'])
                            all_types_only_rr_tree_dict[typ] = get_superclasses(typ)
            count += 1
#         except Exception as e:
#             print(e)
#             error += 1
        print(total)
        if(typ and typ in all_types_only_rr_tree_dict):
            print(all_types_only_rr_tree_dict[typ])
print('Total:', total)
print('Count:', count)
print('Error:', error)

0
1
[['Q866']]
2
[['Q278379', 'Q33215', 'Q34770', 'Q17376908', 'Q11028', 'Q853614', 'Q7184903', 'Q488383', 'Q35120'], ['Q278379', 'Q33215', 'Q34770', 'Q17376908', 'Q7184903', 'Q488383', 'Q35120'], ['Q278379', 'Q33215', 'Q17537576', 'Q15621286', 'Q386724', 'Q16686448', 'Q488383', 'Q35120']]
3
[['Q150', 'Q37351', 'Q19814', 'Q19860', 'Q771861']]
4
[['Q14637321', 'Q18670171', 'Q6619802', 'Q14897293', 'Q17537576', 'Q15621286', 'Q386724', 'Q16686448', 'Q488383', 'Q35120']]
5
[['Q14637321', 'Q18670171', 'Q6619802', 'Q14897293', 'Q17537576', 'Q15621286', 'Q386724', 'Q16686448', 'Q488383', 'Q35120']]
6
[['Q20922']]
7
[['Q15632617', 'Q95074', 'Q14897293', 'Q17537576', 'Q15621286', 'Q386724', 'Q16686448', 'Q488383', 'Q35120'], ['Q15632617', 'Q95074', 'Q15619164', 'Q830077', 'Q35120'], ['Q15632617', 'Q95074', 'Q18336849'], ['Q15632617', 'Q6619679', 'Q95074', 'Q14897293', 'Q17537576', 'Q15621286', 'Q386724', 'Q16686448', 'Q488383', 'Q35120'], ['Q15632617', 'Q6619679', 'Q95074', 'Q15619164', 'Q83007

KeyboardInterrupt: 

In [208]:


def is_subclass(c1, c2):
    return get_superclasses(c1).search(c2)

def get_all_types(ide):
    types = set()
    with open('data/wikidata-20150907-star_trek.json') as f:
        for line in f:
            try:
                cleaned = line.strip()[:-1]
                d = json.loads(cleaned)
                if ide == d['id']:
                    for rel_inst in d['claims']['P31']:
                        typ = 'Q' + str(rel_inst['mainsnak']['datavalue']['value']['numeric-id'])
                        types.add(typ)
                    break
            except Exception as e:
                pass
    return types

def get_type(ide):
    types = get_all_types(ide)
    if len(types) == 1:
        typ = types.pop()
    else:
        superclasses_tree_list = [get_superclasses(t) for t in types]
        common_classes = 
            
        
        
        types_aux = set(types)
        for x in types:
            for y in types:
                if x != y and is_subclass(x, y):
                    try:
                        types_aux.remove(y)
                    except KeyError as e:
                        pass
        if len(types_aux) == 0:
            typ = None
        elif len(types_aux) == 1:
            typ = types_aux.pop()
        else:
            print('MARRON')
            print(ide)
            typ = None
    return typ

In [209]:
# %%time
# Check if the types of the target of the relevant relationships are included in the relevant types
total = 0
count = 0
error = 0
not_included_types = set()
with open('data/wikidata-20150907-star_trek.json') as f:
    for line in f:
#         try:
        if not line.startswith('[') and not line.startswith(']'):
            total += 1
            cleaned = line.strip()[:-1]
            d = json.loads(cleaned)
            for rel in d['claims']:
                if rel in only_relevant_relationships_dict:
                    for rel_inst in d['claims'][rel]:
                        type_rel = rel_inst['mainsnak']['datavalue']['type']
                        if type_rel == 'wikibase-entityid':
                            val = rel_inst['mainsnak']['datavalue']['value']['numeric-id']
                            id2 = 'Q' + str(val)
                            get_type(id2)
                            get_superclass(id2)
            count += 1
#         except Exception as e:
#             print(e)
#             error += 1
        print(total)
print('Total:', total)
print('Count:', count)
print('Error:', error)

0
MAS DE UNA SUPERCLASE Q196600
1
MAS DE UNA SUPERCLASE Q2623733


KeyboardInterrupt: 

In [None]:
s={1,2,3}

In [42]:
%%time
base_url = 'https://www.wikidata.org/w/api.php'
limit = 50

# chunks = [all_relationships_list[x:x+limit] for x in range(0, len(all_relationships_list), limit)]
# for chunk in chunks:
# relationship_ids = '|'.join(chunk)
payload = {'action': 'wbgetentities', 'ids': 'Q1137046', 'languages': 'en', 'format': 'json'}
r = requests.get(base_url, params=payload)
response = r.json()
# entities = response['entities']
# for ide in entities:
#     all_relationships_dict[ide] = entities[ide]['labels']['en']['value']

CPU times: user 15.4 ms, sys: 7.96 ms, total: 23.3 ms
Wall time: 266 ms


In [46]:
{'id': 'Q1137046$484f9602-478e-27de-ced5-7cd2c33d07ea',
    'mainsnak': {'datatype': 'wikibase-item',
     'datavalue': {'type': 'wikibase-entityid',
      'value': {'entity-type': 'item', 'numeric-id': 1983062}},
     'property': 'P279',
     'snaktype': 'value'},
    'rank': 'normal',
    'type': 'statement'}

{'id': 'Q1137046$484f9602-478e-27de-ced5-7cd2c33d07ea',
 'mainsnak': {'datatype': 'wikibase-item',
  'datavalue': {'type': 'wikibase-entityid',
   'value': {'entity-type': 'item', 'numeric-id': 1983062}},
  'property': 'P279',
  'snaktype': 'value'},
 'rank': 'normal',
 'type': 'statement'}

In [None]:
# Double check
total = 0
count = 0
error = 0
with open('wikidata-20150907-star_trek-relevant-rels.json') as f:
    for line in f:
        try:
            total += 1
            cleaned = line.strip()[:-1]
            d = json.loads(cleaned)
            count += 1
        except Exception as e:
            error += 1
print('Total:', total)
print('Count:', count)
print('Error:', error)

In [None]:
# # Report
# import csv
# 
# total = 0
# count = 0
# error = 0
# with open('wikidata-20150907-star_trek-relevant-rels.json') as f1,open('star_trek-all_types.csv', 'w') as f2:
#     csv_writer = csv.writer(
#         f2,
#         delimiter=',',
#         quotechar='"',
#         quoting=csv.QUOTE_MINIMAL
#     )
#     for line in f1:
#         ide = None
#         title = None
#         type_id = None
#         type_title = None
#         try:
#             total += 1
#             cleaned = line.strip()[:-1]
#             d = json.loads(cleaned)
#             ide = d['id']
#             if 'en' in d['labels']:
#                 title = d['labels']['en']['value']
#             if 'P31' in d['claims']:
#                 type_id = 'Q' + str(d['claims']['P31'][0]['mainsnak']['datavalue']['value']['numeric-id'])
#                 if type_id in all_related_types_dict:
#                     type_title = all_related_types_dict[type_id]
#             csv_writer.writerow([ide, title, type_id, type_title])
#             count += 1
#         except Exception as e:
#             error += 1
# print('Total:', total)
# print('Count:', count)
# print('Error:', error)