In [43]:
import json
import pickle

import networkx as nx

In [44]:
with open('pickle/star-trek_only_relevant_relationships_dict.pickle', 'rb') as f:
    only_relevant_relationships_dict = pickle.load(f)

In [45]:
with open('pickle/star-trek_only_relevant_types_dict.pickle', 'rb') as f:
    only_relevant_types_dict = pickle.load(f)

In [46]:
with open('pickle/star-trek_hierarchy_graph.pickle', 'rb') as f:
    hierarchy = pickle.load(f)

In [47]:
only_relevant_relationships_dict

{'P103': 'native language',
 'P1074': 'fictional analog of',
 'P1080': 'from fictional universe',
 'P1235': 'ISFDB series ID',
 'P1237': 'Box Office Mojo film ID',
 'P1258': 'Rotten Tomatoes identifier',
 'P1265': 'AlloCiné movie ID',
 'P135': 'movement',
 'P136': 'genre',
 'P138': 'named after',
 'P1407': 'MusicBrainz series ID',
 'P1412': 'languages spoken or published',
 'P1434': 'describes the fictional universe',
 'P144': 'based on',
 'P1441': 'present in work',
 'P1445': 'fictional universe described in',
 'P1476': 'title',
 'P155': 'follows',
 'P156': 'followed by',
 'P1562': 'AllMovie movie ID',
 'P1651': 'YouTube video identifier',
 'P17': 'country',
 'P1712': 'Metacritic ID',
 'P179': 'series',
 'P180': 'depicts',
 'P1804': 'Danish National Filmography ID',
 'P1811': 'list of episodes',
 'P1813': 'short name',
 'P1970': 'MovieMeter movie ID',
 'P2002': 'Twitter username',
 'P2003': 'Instagram username',
 'P2013': 'Facebook ID',
 'P212': 'ISBN-13',
 'P243': 'OCLC control numbe

In [48]:
only_relevant_types_dict

{'Q1114502': 'American comic book',
 'Q1137046': 'Star Trek: The Next Generation episode',
 'Q11424': 'film',
 'Q1257934': 'clip show',
 'Q134556': 'single',
 'Q15416': 'television program',
 'Q186117': 'timeline',
 'Q196600': 'media franchise',
 'Q1983062': 'episode',
 'Q24856': 'film series',
 'Q24862': 'short film',
 'Q278379': 'a priori language',
 'Q3252662': 'series finale',
 'Q3297186': 'limited series',
 'Q33215': 'constructed language',
 'Q3464665': 'television season',
 'Q35127': 'website',
 'Q386724': 'work',
 'Q41298': 'magazine',
 'Q4176708': 'soundtrack album',
 'Q4234303': 'WikiProject',
 'Q482994': 'album',
 'Q5398426': 'television series',
 'Q559618': 'fictional universe',
 'Q571': 'book',
 'Q581714': 'animated series',
 'Q7889': 'video game',
 'Q838795': 'comic strip'}

In [49]:
hierarchy

<networkx.classes.digraph.DiGraph at 0x7fe4e37950f0>

In [50]:
hierarchy.nodes()[:10]

['Q20912',
 'Q4840668',
 'Q43229',
 'Q7731917',
 'Q11424',
 'Q1928841',
 'Q25372',
 'Q7601035',
 'Q4658168',
 'Q284742']

In [51]:
hierarchy.edges(data=True)[:10]

[('Q43229', 'Q618123', {'type': 'subclass'}),
 ('Q43229', 'Q2088357', {'type': 'superclass'}),
 ('Q43229', 'Q874405', {'type': 'subclass'}),
 ('Q11424', 'Q319221', {'type': 'superclass'}),
 ('Q11424', 'Q2431196', {'type': 'subclass'}),
 ('Q11424', 'Q17537576', {'type': 'subclass'}),
 ('Q11424', 'Q202866', {'type': 'superclass'}),
 ('Q11424', 'Q838948', {'type': 'subclass'}),
 ('Q11424', 'Q24862', {'type': 'superclass'}),
 ('Q25372', 'Q7725634', {'type': 'subclass'})]

In [63]:
# get_superclasses('Q43229')

In [61]:
def get_direct_superclasses(c):
    all_neighbors = hierarchy.neighbors(c)
    direct_superclasses = list(filter(lambda x: hierarchy[c][x]['type'] == 'subclass', all_neighbors))
    return direct_superclasses

def get_superclasses(c):
    direct_superclasses = get_direct_superclasses(c)
    if not direct_superclasses:
        superclasses = [[c]]
    else:
        superclasses = [[c] + spclsss for c2 in direct_superclasses for spclsss in get_superclasses(c2)]
    return superclasses

            
    

def is_subclass(c1, c2):
    return get_superclasses(c1).search(c2)

def get_all_types(ide):
    types = set()
    with open('data/wikidata-20150907-star_trek.json') as f:
        for line in f:
            try:
                cleaned = line.strip()[:-1]
                d = json.loads(cleaned)
                if ide == d['id']:
                    for rel_inst in d['claims']['P31']:
                        typ = 'Q' + str(rel_inst['mainsnak']['datavalue']['value']['numeric-id'])
                        types.add(typ)
                    break
            except Exception as e:
                pass
    return types

def get_type(ide):
    types = get_all_types(ide)
    if len(types) == 1:
        typ = types.pop()
    else:
        superclasses_tree_list = [get_superclasses(t) for t in types]
#         common_classes = 
            
        
        
        types_aux = set(types)
        for x in types:
            for y in types:
                if x != y and is_subclass(x, y):
                    try:
                        types_aux.remove(y)
                    except KeyError as e:
                        pass
        if len(types_aux) == 0:
            typ = None
        elif len(types_aux) == 1:
            typ = types_aux.pop()
        else:
            print('MARRON')
            print(ide)
            typ = None
    return typ

In [209]:
# %%time
# Check if the types of the target of the relevant relationships are included in the relevant types
total = 0
count = 0
error = 0
not_included_types = set()
with open('data/wikidata-20150907-star_trek.json') as f:
    for line in f:
#         try:
        if not line.startswith('[') and not line.startswith(']'):
            total += 1
            cleaned = line.strip()[:-1]
            d = json.loads(cleaned)
            for rel in d['claims']:
                if rel in only_relevant_relationships_dict:
                    for rel_inst in d['claims'][rel]:
                        type_rel = rel_inst['mainsnak']['datavalue']['type']
                        if type_rel == 'wikibase-entityid':
                            val = rel_inst['mainsnak']['datavalue']['value']['numeric-id']
                            id2 = 'Q' + str(val)
                            get_type(id2)
                            get_superclass(id2)
            count += 1
#         except Exception as e:
#             print(e)
#             error += 1
        print(total)
print('Total:', total)
print('Count:', count)
print('Error:', error)

0
MAS DE UNA SUPERCLASE Q196600
1
MAS DE UNA SUPERCLASE Q2623733


KeyboardInterrupt: 

In [None]:
s={1,2,3}

In [42]:
%%time
base_url = 'https://www.wikidata.org/w/api.php'
limit = 50

# chunks = [all_relationships_list[x:x+limit] for x in range(0, len(all_relationships_list), limit)]
# for chunk in chunks:
# relationship_ids = '|'.join(chunk)
payload = {'action': 'wbgetentities', 'ids': 'Q1137046', 'languages': 'en', 'format': 'json'}
r = requests.get(base_url, params=payload)
response = r.json()
# entities = response['entities']
# for ide in entities:
#     all_relationships_dict[ide] = entities[ide]['labels']['en']['value']

CPU times: user 15.4 ms, sys: 7.96 ms, total: 23.3 ms
Wall time: 266 ms


In [46]:
{'id': 'Q1137046$484f9602-478e-27de-ced5-7cd2c33d07ea',
    'mainsnak': {'datatype': 'wikibase-item',
     'datavalue': {'type': 'wikibase-entityid',
      'value': {'entity-type': 'item', 'numeric-id': 1983062}},
     'property': 'P279',
     'snaktype': 'value'},
    'rank': 'normal',
    'type': 'statement'}

{'id': 'Q1137046$484f9602-478e-27de-ced5-7cd2c33d07ea',
 'mainsnak': {'datatype': 'wikibase-item',
  'datavalue': {'type': 'wikibase-entityid',
   'value': {'entity-type': 'item', 'numeric-id': 1983062}},
  'property': 'P279',
  'snaktype': 'value'},
 'rank': 'normal',
 'type': 'statement'}

In [None]:
# Double check
total = 0
count = 0
error = 0
with open('wikidata-20150907-star_trek-relevant-rels.json') as f:
    for line in f:
        try:
            total += 1
            cleaned = line.strip()[:-1]
            d = json.loads(cleaned)
            count += 1
        except Exception as e:
            error += 1
print('Total:', total)
print('Count:', count)
print('Error:', error)

In [None]:
# # Report
# import csv
# 
# total = 0
# count = 0
# error = 0
# with open('wikidata-20150907-star_trek-relevant-rels.json') as f1,open('star_trek-all_types.csv', 'w') as f2:
#     csv_writer = csv.writer(
#         f2,
#         delimiter=',',
#         quotechar='"',
#         quoting=csv.QUOTE_MINIMAL
#     )
#     for line in f1:
#         ide = None
#         title = None
#         type_id = None
#         type_title = None
#         try:
#             total += 1
#             cleaned = line.strip()[:-1]
#             d = json.loads(cleaned)
#             ide = d['id']
#             if 'en' in d['labels']:
#                 title = d['labels']['en']['value']
#             if 'P31' in d['claims']:
#                 type_id = 'Q' + str(d['claims']['P31'][0]['mainsnak']['datavalue']['value']['numeric-id'])
#                 if type_id in all_related_types_dict:
#                     type_title = all_related_types_dict[type_id]
#             csv_writer.writerow([ide, title, type_id, type_title])
#             count += 1
#         except Exception as e:
#             error += 1
# print('Total:', total)
# print('Count:', count)
# print('Error:', error)