In [67]:
import json
from collections import defaultdict

In [68]:
final_relationship_dict = {
    'P1080': 'from fictional universe',
    'P1235': 'ISFDB series ID',
    'P1237': 'Box Office Mojo film ID',
    'P1258': 'Rotten Tomatoes identifier',
    'P1265': 'AlloCiné movie ID',
    'P136': 'genre',
    'P1407': 'MusicBrainz series ID',
    'P1434': 'describes the fictional universe',
    'P144': 'based on',
    'P1445': 'fictional universe described in',
    'P1476': 'title',
    'P155': 'follows',
    'P156': 'followed by',
    'P1562': 'AllMovie movie ID',
    'P1651': 'YouTube video identifier',
    'P17': 'country',
    'P1712': 'Metacritic ID',
    'P179': 'series',
    'P180': 'depicts',
    'P1804': 'Danish National Filmography ID',
    'P1811': 'list of episodes',
    'P1970': 'MovieMeter movie ID',
    'P2002': 'Twitter username',
    'P2003': 'Instagram username',
    'P2013': 'Facebook ID',
    'P212': 'ISBN-13',
    'P243': 'OCLC control number',
    'P244': 'LCAuth identifier',
    'P275': 'license',
    'P279': 'subclass of',
    'P31': 'instance of',
    'P345': 'IMDb identifier',
    'P348': 'software version',
    'P357': '(OBSOLETE) title (use P1476, "title")',
    'P361': 'part of',
    'P364': 'original language of work',
    'P373': 'Commons category',
    'P400': 'platform',
    'P433': 'issue',
    'P437': 'distribution',
    'P449': 'original network',
    'P480': 'FilmAffinity identifier',
    'P495': 'country of origin',
    'P508': 'PIJAS',
    'P553': 'website account on',
    'P577': 'publication date',
    'P580': 'start time',
    'P582': 'end time',
    'P646': 'Freebase identifier',
    'P852': 'ESRB rating',
    'P854': 'reference URL',
    'P856': 'official website',
    'P905': 'PORT film ID',
    'P908': 'PEGI rating',
    'P910': "topic's main category",
    'P914': 'USK rating',
    'P915': 'filming location',
    'P921': 'main subject',
    'P941': 'inspired by',
    'P957': 'ISBN-10'
}

In [173]:
final_type_dict = {
    'Q1114502': 'American comic book',
    'Q1137046': 'Star Trek: The Next Generation episode',
    'Q11424': 'film',
    'Q1257934': 'clip show',
    'Q134556': 'single',
    'Q15416': 'television program',
    'Q186117': 'timeline',
    'Q196600': 'media franchise',
    'Q1983062': 'episode',
    'Q24856': 'film series',
    'Q24862': 'short film',
    'Q278379': 'a priori language',
    'Q3252662': 'series finale',
    'Q3297186': 'limited series',
    'Q33215': 'constructed language',
    'Q3464665': 'television season',
    'Q35127': 'website',
    'Q386724': 'work',
    'Q41298': 'magazine',
    'Q4176708': 'soundtrack album',
    'Q4234303': 'WikiProject',
    'Q482994': 'album',
    'Q5398426': 'television series',
    'Q559618': 'fictional universe',
    'Q571': 'book',
    'Q581714': 'animated series',
    'Q7889': 'video game',
    'Q838795': 'comic strip',
    # new types
    'Q15633582': 'MediaWiki site',
    'Q2623733': 'fictional language',
    'Q653928': 'pinball',
}

In [174]:
wikidata_to_sylva_types = {
    'Q1114502': ('American comic book', 'Comic'),
    'Q1137046': ('Star Trek: The Next Generation episode', 'Episode'),
    'Q11424': ('film', 'Film'),
    'Q1257934': ('clip show', 'Episode'),
    'Q134556': ('single', 'Single'),
    'Q15416': ('television program', 'TV_program'),
    'Q186117': ('timeline', 'Timeline'),
    'Q196600': ('media franchise', 'Series'),
    'Q1983062': ('episode', 'Episode'),
    'Q24856': ('film series', 'Film_series'),
    'Q24862': ('short film', 'Film'),
    'Q278379': ('a priori language', 'Language'),
    'Q3252662': ('series finale', 'Episode'),
    'Q3297186': ('limited series', 'Comic_series'),
    'Q33215': ('constructed language', 'Language'),
    'Q3464665': ('television season', 'TV_season'),
    'Q35127': ('website', 'Website'),
    'Q386724': ('work', 'Word'),
    'Q41298': ('magazine', 'Magazine'),
    'Q4176708': ('soundtrack album', 'Soundtrack_album'),
    'Q4234303': ('WikiProject', 'Website'),
    'Q482994': ('album', 'Album'),
    'Q5398426': ('television series', 'TV_series'),
    'Q559618': ('fictional universe', 'Fictional_universe'),
    'Q571': ('book', 'Book'),
    'Q581714': ('animated series', 'Animated_series'),
    'Q7889': ('video game', 'Video_game'),
    'Q838795': ('comic strip', 'Comic'),
    'Q15633582': ('MediaWiki site', 'Website'),
    'Q2623733': ('fictional language', 'Language'),
    'Q653928': ('pinball', 'Video_game'),
}

In [175]:
# Classes on Sylva
ts = set()
for k, v in wikidata_to_sylva_types.items():
    ts.add(v[1])
ts

{'Album',
 'Animated_series',
 'Book',
 'Comic',
 'Comic_series',
 'Episode',
 'Fictional_universe',
 'Film',
 'Film_series',
 'Language',
 'Magazine',
 'Series',
 'Single',
 'Soundtrack_album',
 'TV_program',
 'TV_season',
 'TV_series',
 'Timeline',
 'Video_game',
 'Website',
 'Word'}

In [133]:
%%time
total = 0
count = 0
error = 0
graph = defaultdict(list)
with open('wikidata-20150907-star_trek-relevant-rels-and-types.json') as f:
    for line in f:
        try:
            total += 1
            cleaned = line.strip()[:-1]
            d = json.loads(cleaned)
            id1 = d['id']
            for rel in d['claims']:
                for rel_inst in d['claims'][rel]:
                    type_rel = rel_inst['mainsnak']['datavalue']['type']
                    id2 = None
                    if type_rel == 'wikibase-entityid':
                        value = rel_inst['mainsnak']['datavalue']['value']['numeric-id']
                        id2 = 'Q' + str(value)
                    graph[rel].append((id1, id2))
                    count += 1
        except Exception as e:
            error += 1
print('Total:', total)
print('Count:', count)
print('Error:', error)

Total: 918
Count: 6612
Error: 2
CPU times: user 87.5 ms, sys: 0 ns, total: 87.5 ms
Wall time: 86.8 ms


In [134]:
len(graph)

60

In [110]:
def get_types(ide):
    types = []
    with open('wikidata-20150907-star_trek-relevant-rels-and-types.json') as f:
        for line in f:
            try:
                cleaned = line.strip()[:-1]
                d = json.loads(cleaned)
                if ide == d['id']:
                    for rel_inst in d['claims']['P31']:
                        typ = 'Q' + str(rel_inst['mainsnak']['datavalue']['value']['numeric-id'])
                        types.append(typ)
                    break
            except Exception as e:
                pass
    return types

In [168]:
%%time
domains_and_ranges_sets = {}
count = 0
for k in graph:
    ids1 = set([x for (x, y) in graph[k]])
    types1 = set([item for sublist in map(get_types, ids1) for item in sublist])
    ids2 = set([y for (x, y) in graph[k]])
    types2 = set([item for sublist in map(get_types, ids2) for item in sublist])
    domains_and_ranges_sets[k] = (types1, types2)
    count += 1
    if count % 10 == 0:
        print(count, 'relationships processed')

10 relationships processed
20 relationships processed
30 relationships processed
40 relationships processed
50 relationships processed
60 relationships processed
CPU times: user 4min 51s, sys: 4.69 s, total: 4min 56s
Wall time: 4min 55s


In [176]:
ok = set()
err = set()
for k in domains_and_ranges_sets:
    ds, rs = domains_and_ranges_sets[k]
    for d in ds:
        if d not in final_type_dict:
            err.add(d)
        else:
            ok.add(d)
    for r in rs:
        if r not in final_type_dict:
            err.add(r)
        else:
            ok.add(r)

In [177]:
err

set()

In [178]:
yes = []
no = []
for k in domains_and_ranges_sets:
    if k == 'P17'  or k == 'P577':
        yes.append((k, (final_relationship_dict[k], 'attribute')))
    elif domains_and_ranges_sets[k][1]:
        yes.append((k, (final_relationship_dict[k], 'relationship')))
    else:
        no.append((k, (final_relationship_dict[k], 'attribute')))

In [179]:
yes

[('P921', ('main subject', 'relationship')),
 ('P1434', ('describes the fictional universe', 'relationship')),
 ('P361', ('part of', 'relationship')),
 ('P1445', ('fictional universe described in', 'relationship')),
 ('P144', ('based on', 'relationship')),
 ('P156', ('followed by', 'relationship')),
 ('P17', ('country', 'attribute')),
 ('P941', ('inspired by', 'relationship')),
 ('P1080', ('from fictional universe', 'relationship')),
 ('P136', ('genre', 'relationship')),
 ('P577', ('publication date', 'attribute')),
 ('P155', ('follows', 'relationship')),
 ('P179', ('series', 'relationship'))]

In [180]:
properties = dict(yes)
properties

{'P1080': ('from fictional universe', 'relationship'),
 'P136': ('genre', 'relationship'),
 'P1434': ('describes the fictional universe', 'relationship'),
 'P144': ('based on', 'relationship'),
 'P1445': ('fictional universe described in', 'relationship'),
 'P155': ('follows', 'relationship'),
 'P156': ('followed by', 'relationship'),
 'P17': ('country', 'attribute'),
 'P179': ('series', 'relationship'),
 'P361': ('part of', 'relationship'),
 'P577': ('publication date', 'attribute'),
 'P921': ('main subject', 'relationship'),
 'P941': ('inspired by', 'relationship')}

In [160]:
# properties = {
#     'P1080': ('from fictional universe', 'relationship'),
#     'P136': ('genre', 'relationship'),
#     'P1434': ('describes the fictional universe', 'relationship'),
#     'P144': ('based on', 'relationship'),
#     'P1445': ('fictional universe described in', 'relationship'),
#     'P155': ('follows', 'relationship'),
#     'P156': ('followed by', 'relationship'),
#     'P17': ('country', 'attribute'),
#     'P179': ('series', 'relationship'),
#     'P361': ('part of', 'relationship'),
#     'P577': ('publication date', 'attribute'),
#     'P921': ('main subject', 'relationship'),
#     'P941': ('inspired by', 'relationship')
# }

In [161]:
%%time
domains_and_ranges = {}
count = 0
for k in graph:
    if k in properties:
        domains_and_ranges[k] = [(get_types(id1), get_types(id2)) for (id1, id2) in graph[k]]
    count += 1
    if count % 10 == 0:
        print(count, 'relationships processed')

10 relationships processed
20 relationships processed
30 relationships processed
40 relationships processed
50 relationships processed
60 relationships processed
CPU times: user 3min 26s, sys: 3.61 s, total: 3min 29s
Wall time: 3min 29s


In [157]:
filtered_graph = {}
for k in graph:
    if k in properties:
        filtered_graph[k] = graph[k]

In [159]:
# filtered_graph

In [182]:
properties_tuple3 = set()
for k in filtered_graph:
    drs = domains_and_ranges[k]
    for dr in drs:
        ds = dr[0]
        rs = dr[1]
        for d in ds:
            for r in rs:
                dom = wikidata_to_sylva_types[d][1]
                if properties[k][1] == 'relationship':
                    ran = wikidata_to_sylva_types[r][1]
                else:
                    ran = 'string'
                rel = dom.lower() + '_' + properties[k][0].replace(' ', '_')
                properties_tuple3.add((dom, rel, ran))

In [184]:
properties_tuple3

{('Animated_series', 'animated_series_followed_by', 'TV_series'),
 ('Animated_series', 'animated_series_follows', 'TV_series'),
 ('Animated_series', 'animated_series_part_of', 'Series'),
 ('Book', 'book_based_on', 'Film'),
 ('Book', 'book_based_on', 'TV_series'),
 ('Book', 'book_describes_the_fictional_universe', 'Fictional_universe'),
 ('Book', 'book_followed_by', 'Book'),
 ('Book', 'book_follows', 'Book'),
 ('Book', 'book_genre', 'TV_series'),
 ('Book', 'book_main_subject', 'TV_series'),
 ('Book', 'book_part_of', 'Series'),
 ('Book', 'book_series', 'TV_series'),
 ('Comic', 'comic_describes_the_fictional_universe', 'Fictional_universe'),
 ('Comic', 'comic_part_of', 'Series'),
 ('Episode', 'episode_followed_by', 'Episode'),
 ('Episode', 'episode_follows', 'Episode'),
 ('Episode', 'episode_from_fictional_universe', 'Fictional_universe'),
 ('Episode', 'episode_part_of', 'TV_season'),
 ('Episode', 'episode_part_of', 'TV_series'),
 ('Episode', 'episode_series', 'Animated_series'),
 ('Episo

In [188]:
import os
import shutil

In [193]:
shutil.rmtree('nodes')
shutil.rmtree('relationships')
os.makedirs('nodes')
os.makedirs('relationships')