In [2]:
def _load_pickle(path):
    """Open `path` in binary mode and return the object unpickled from it."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Lookup tables and the class-hierarchy graph used by the cells below.
only_relevant_properties_dict = _load_pickle('pickle/star-trek_only_relevant_properties_dict.pickle')
only_relevant_types_dict = _load_pickle('pickle/star-trek_only_relevant_types_dict.pickle')
all_instances_and_types_dict = _load_pickle('pickle/star-trek_all_instances_and_types_dict.pickle')
hierarchy = _load_pickle('pickle/star-trek_hierarchy_graph.pickle')

In [3]:
def get_direct_superclasses(c):
    """Return the neighbours of `c` that `c` is a direct subclass of.

    A neighbour qualifies when the edge from `c` to it in the module-level
    `hierarchy` graph carries the attribute 'type' == 'is_subclass_of'.
    The result is a list in the order the graph reports its neighbours.
    """
    return [
        neighbor
        for neighbor in hierarchy.neighbors(c)
        if hierarchy[c][neighbor]['type'] == 'is_subclass_of'
    ]

def get_superclasses(c, _path=()):
    """Return every chain of direct-superclass links that starts at `c`.

    Each chain is a list whose first element is `c`, followed by one of its
    direct superclasses, then one of that class's direct superclasses, and so
    on until a class is reached that has no further direct superclass to
    follow.  A class without direct superclasses yields `[[c]]`.

    Parameters
    ----------
    c : node of the hierarchy
        The class to start from.
    _path : tuple, internal
        Classes already on the chain being built.  Callers should not pass
        it; it exists so that a cyclic (or self-referential) subclass
        relation ends the chain instead of recursing without bound.

    Returns
    -------
    list of list
        All chains, in the order the direct superclasses are reported.
    """
    on_path = _path + (c,)
    # Never walk back into a class that is already part of the current chain.
    parents = [p for p in get_direct_superclasses(c) if p not in on_path]
    if not parents:
        return [[c]]
    return [
        [c] + chain
        for parent in parents
        for chain in get_superclasses(parent, on_path)
    ]

def is_subclass_of(c1, c2):
    """Tell whether `c2` appears on any superclass chain of `c1`.

    Every chain returned by `get_superclasses(c1)` is inspected, so the
    answer is also True when `c2` is a member of a chain because it equals
    `c1` itself.
    """
    for chain in get_superclasses(c1):
        if c2 in chain:
            return True
    return False

def get_closest_class(c, cs):
    """Return the member of `cs` with the shortest path from `c` in `hierarchy`.

    Parameters
    ----------
    c : node of the hierarchy
        Class to measure distances from.
    cs : dict or other iterable of nodes
        Candidate classes; when a dict is given its keys are the candidates.

    Returns
    -------
    The candidate with the smallest `nx.shortest_path_length(hierarchy, c, x)`.
    Candidates for which networkx reports no path are ignored.  Returns None
    when `cs` is empty or when no candidate is reachable from `c` (instead of
    failing or handing back an arbitrary unreachable class).

    WARNING: if several candidates are equally close, only the first one in
    iteration order of `cs` is returned.
    """
    closest = None
    closest_length = None
    for candidate in cs:
        try:
            length = nx.shortest_path_length(hierarchy, c, candidate)
        except nx.exception.NetworkXNoPath:
            continue  # unreachable candidates can never be the closest one
        # Strict comparison keeps the earliest candidate on ties.
        if closest_length is None or length < closest_length:
            closest, closest_length = candidate, length
    return closest

def get_closest_relevant_class(c):
    """Return the relevant type closest to `c`.

    Delegates to `get_closest_class`, using `only_relevant_types_dict` as the
    collection of candidate classes.
    """
    candidates = only_relevant_types_dict
    return get_closest_class(c, candidates)

def is_subclass_of_any_relevant_class(c):
    """Tell whether any relevant type appears on a superclass chain of `c`.

    The chains of `c` are computed once with `get_superclasses` and flattened
    into a set, which is then compared against the keys of
    `only_relevant_types_dict`.  This gives the same answer as testing
    `is_subclass_of(c, x)` for every relevant type `x`, without rebuilding
    the chains of `c` for each of them.

    Returns
    -------
    bool
        True when at least one relevant type is on a chain of `c` (including
        `c` itself, since every chain starts with `c`), otherwise False.
    """
    ancestors = set()
    for chain in get_superclasses(c):
        ancestors.update(chain)
    return not ancestors.isdisjoint(only_relevant_types_dict)

def get_relevant_type(ide):
    """Return the relevant type to file the entity `ide` under, or None.

    The types recorded for `ide` in `all_instances_and_types_dict` are
    examined one by one; each type that passes
    `is_subclass_of_any_relevant_class` contributes the result of
    `get_closest_relevant_class` as a candidate.

    Parameters
    ----------
    ide : str
        Entity id to look up.

    Returns
    -------
    One candidate, or None when `ide` is unknown, has no types, or none of
    its types leads to a relevant class.

    WARNING: if there is more than one candidate, only one of them is
    returned.  The smallest candidate by string order is chosen so that the
    result is the same on every run (taking an arbitrary set element is not).
    The stored set of types is only read, never modified.
    """
    if ide not in all_instances_and_types_dict:
        return None
    candidates = set()
    for t in all_instances_and_types_dict[ide]:
        if is_subclass_of_any_relevant_class(t):
            closest = get_closest_relevant_class(t)
            if closest is not None:
                candidates.add(closest)
    if not candidates:
        return None
    return min(candidates, key=str)

In [4]:
def _format_wikidata_date(timestamp):
    """Reduce a timestamp such as '+2015-09-07T00:00:00Z' to '2015-09-07'.

    The date part is everything before the 'T'.  A leading '-' (BCE year) is
    kept, a leading '+' is dropped, and the year is normalised to at least
    four digits so zero-padded long years give the same result as short ones.
    Text that does not split into year-month-day falls back to the plain
    ten-character slice after the sign.
    """
    sign = '-' if timestamp.startswith('-') else ''
    date_part = timestamp.lstrip('+-').split('T', 1)[0]
    pieces = date_part.rsplit('-', 2)
    if len(pieces) != 3:
        return timestamp[1:11]
    year, month, day = pieces
    year = year.lstrip('0').zfill(4)
    return '{}{}-{}-{}'.format(sign, year, month, day)


def get_value_and_type(prop_inst):
    """Extract a `(value, type)` pair from a statement or a qualifier.

    Parameters
    ----------
    prop_inst : dict
        Either a statement, whose data sits under 'mainsnak', or a qualifier,
        which carries 'datavalue' directly.

    Returns
    -------
    tuple of (str, str)
        * 'wikibase-entityid'  -> (entity id, 'object'); the id is the
          numeric id prefixed with 'P' when 'entity-type' is 'property' and
          with 'Q' otherwise
        * 'string'             -> (text, 'string')
        * 'time'               -> (date as [-]YYYY-MM-DD, 'date')
        * 'monolingualtext'    -> (text, 'string')

    Raises
    ------
    KeyError
        If the snak has no 'datavalue' entry or an expected field is missing.
    Exception
        'Unknown property type' for any other datavalue type.
    """
    # Statements wrap their snak in 'mainsnak'; qualifiers are the snak itself.
    main_info = prop_inst.get('mainsnak', prop_inst)
    datavalue = main_info['datavalue']
    prop_type = datavalue['type']
    raw = datavalue['value']
    if prop_type == 'wikibase-entityid':
        prefix = 'P' if raw.get('entity-type') == 'property' else 'Q'
        return (prefix + str(raw['numeric-id']), 'object')
    if prop_type == 'string':
        return (str(raw), 'string')
    if prop_type == 'time':
        return (_format_wikidata_date(str(raw['time'])), 'date')
    if prop_type == 'monolingualtext':
        return (str(raw['text']), 'string')
    # WARNING: Unknown property type
    raise Exception('Unknown property type')

In [5]:
# Accumulator filled by the extraction loop: entity id -> extracted record.
only_relevant_instances_dict = dict()

In [6]:
%%time

total = 0
count = 0
error = 0
with open('data/wikidata-20150907-star_trek.json') as f:
    for line in f:
        try:
            total += 1
            cleaned = line.strip()[:-1]
            d = json.loads(cleaned) # keys: ['type', 'labels', 'descriptions', 'claims', 'aliases', 'sitelinks', 'id']
            ide = d['id']
            typ = get_relevant_type(ide)
            if typ:
                if 'en' in d['labels']:
                    name = d['labels']['en']['value']
                else:
                    name = ''
                if 'enwiki' in d['sitelinks']:
                    wikilink = d['sitelinks']['enwiki']['title']
                else:
                    wikilink = ''
                if 'en' in d['descriptions']:
                    description = d['descriptions']['en']['value']
                else:
                    description = ''
                info = {
                    'id': ide,
                    'type': typ,
                    'name': name,
                    'wikilink': wikilink,
                    'description': description,
                    'properties': {},
                }
                for prop in d['claims']:
                    if prop in only_relevant_properties_dict and prop != 'P31' and prop != 'P279':
                        values = []
                        for prop_inst in d['claims'][prop]:
                            p_value_and_type = get_value_and_type(prop_inst)
                            p_value = p_value_and_type[0]
                            p_type = p_value_and_type[1]
                            qualifiers = {}
                            if 'qualifiers' in prop_inst:
                                for q in prop_inst['qualifiers']:
                                    if q in only_relevant_properties_dict:
                                        q_values = []
                                        for q_inst in prop_inst['qualifiers'][q]:
                                            q_value_and_type = get_value_and_type(q_inst)
                                            q_value = q_value_and_type[0]
                                            q_type = q_value_and_type[1]
                                            q_values.append({'value': q_value, 'type': q_type})
                                        qualifiers[q] = q_values
                            values.append({'value': p_value, 'type': p_type, 'qualifiers': qualifiers})
                        info['properties'][prop] = values
                only_relevant_instances_dict[ide] = info
                count += 1
        except Exception as e:
            print(e)
            error += 1
print('Total:', total)
print('Count:', count)
print('Error:', error)

Expecting value: line 1 column 1 (char 0)
Expecting value: line 1 column 1 (char 0)
Total: 1630
Count: 915
Error: 2
CPU times: user 6.25 s, sys: 11.7 ms, total: 6.26 s
Wall time: 6.26 s


In [7]:
# Number of entities kept by the extraction loop.
len(only_relevant_instances_dict)

915

In [10]:
# Preview the first two entries (ordered by id) rather than echoing the whole
# dictionary into the notebook output.
{ide: only_relevant_instances_dict[ide] for ide in sorted(only_relevant_instances_dict)[:2]}

{'Q10134': {'description': 'constructed language',
  'id': 'Q10134',
  'name': 'Klingon language',
  'properties': {'P1080': [{'qualifiers': {},
     'type': 'object',
     'value': 'Q18043309'}],
   'P373': [{'qualifiers': {}, 'type': 'string', 'value': 'Klingon language'}],
   'P646': [{'qualifiers': {}, 'type': 'string', 'value': '/m/049ph'}],
   'P910': [{'qualifiers': {}, 'type': 'object', 'value': 'Q8574920'}]},
  'type': 'Q33215',
  'wikilink': 'Klingon language'},
 'Q1016685': {'description': 'Star Trek: Voyager episode',
  'id': 'Q1016685',
  'name': 'The Cloud',
  'properties': {'P155': [{'qualifiers': {},
     'type': 'object',
     'value': 'Q661255'}],
   'P156': [{'qualifiers': {}, 'type': 'object', 'value': 'Q5422700'}],
   'P179': [{'qualifiers': {}, 'type': 'object', 'value': 'Q156329'}],
   'P345': [{'qualifiers': {}, 'type': 'string', 'value': 'tt0708978'}],
   'P361': [{'qualifiers': {}, 'type': 'object', 'value': 'Q3465791'}],
   'P364': [{'qualifiers': {}, 'type':

In [8]:
# Sanity check: every stored record has a relevant type and only relevant properties.
for ide, info in only_relevant_instances_dict.items():
    assert info['type'] in only_relevant_types_dict
    assert all(prop in only_relevant_properties_dict for prop in info['properties'])

In [9]:
# Write the extracted records to disk as a pickle.
with open('pickle/star-trek_only_relevant_instances_dict.pickle', 'wb') as out_file:
    pickle.dump(only_relevant_instances_dict, out_file)