In [1]:
import json
import pickle

In [2]:
# Load the pickled graph from disk (presumably a NetworkX graph, given the
# nodes()/edges() calls made on it below).
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only open pickles that were produced by this project.
with open('pickle/star-trek_network_graph.pickle', 'rb') as graph_file:
    network = pickle.load(graph_file)

## Classes' names

In [3]:
# Node attributes are stored as (value, datatype) pairs, so the Wikidata type
# label is the first element of each node's 'type' entry.
wikidata_types = {
    node_data['type'][0] for _, node_data in network.nodes(data=True)
}

In [4]:
# Number of distinct Wikidata type labels found on the graph's nodes.
len(wikidata_types)

44

In [5]:
# Display the distinct type labels; the mapping to Sylva classes is written by hand from this list.
wikidata_types

{'American comic book',
 'MediaWiki site',
 'WikiProject',
 'Wikimedia portal',
 'agglutinative language',
 'album',
 'animated series',
 'book',
 'clip show',
 'comic strip',
 'comic strip series',
 'computing platform',
 'country',
 'episode',
 'fictional language',
 'fictional universe',
 'film',
 'film genre',
 'genre',
 'handheld game console',
 'language',
 'limited series',
 'literary genre',
 'magazine',
 'media franchise',
 'musical ensemble',
 'natural language',
 'no type',
 'release',
 'remix album',
 'science fiction genre',
 'series',
 'series finale',
 'single',
 'soundtrack album',
 'television genre',
 'television program',
 'television season',
 'television series',
 'timeline',
 'video game',
 'video game console',
 'video game genre',
 'web portal'}

In [6]:
# Manually assigned: maps each Wikidata type label (key) to the Sylva class
# (value) used for nodes of that type. Several labels share one class, e.g.
# every album/single label becomes 'Music'. The mapping lists more labels than
# occur in this graph; the unused entries are simply never looked up.
sylva_classes = {
    'American comic book': 'Comic',
    'MediaWiki site': 'Web',
    'WikiProject': 'Web',
    'Wikimedia portal': 'Web',
    'a priori language': 'Language',
    'agglutinative language': 'Language',
    'album': 'Music',
    'animated series': 'Animated series',
    'book': 'Book',
    'cinematography': 'Cinematography',
    'clip show': 'Episode',
    'comic strip': 'Comic',
    'comic strip series': 'Comic series',
    'computing platform': 'Video game console',
    'constructed language': 'Language',
    'country': 'Country',
    'episode': 'Episode',
    'fictional language': 'Language',
    'fictional universe': 'Fictional Universe',
    'film': 'Film',
    'film genre': 'Genre',
    'film series': 'Film series',
    'genre': 'Genre',
    'handheld game console': 'Video game console',
    'language': 'Language',
    'limited series': 'Series',
    'literary genre': 'Genre',
    'magazine': 'Magazine',
    'media franchise': 'Series',
    'musical ensemble': 'Music',
    'natural language': 'Language',
    # 'no type' is the label carried by nodes without a Wikidata type.
    'no type': 'No class',
    'release': 'Release',
    'remix album': 'Music',
    'science fiction genre': 'Genre',
    'series': 'Series',
    'series finale': 'Episode',
    'short film': 'Film',
    'single': 'Music',
    'soundtrack album': 'Music',
    'television genre': 'Genre',
    'television program': 'TV program',
    'television season': 'TV season',
    'television series': 'TV series',
    'timeline': 'Timeline',
    'video game': 'Video game',
    'video game console': 'Video game console',
    'video game genre': 'Genre',
    'web portal': 'Web',
    'web series': 'Web series',
    'webcomic': 'Web',
    'website': 'Web',
}

In [7]:
# Number of Wikidata type labels covered by the hand-written mapping.
len(sylva_classes)

52

In [8]:
# Sanity check: type labels present in the graph but missing from the mapping.
# An empty set means every node type can be translated.
set(wikidata_types) - set(sylva_classes)

set()

In [9]:
# Sylva classes actually needed for this graph: translate every type label
# that occurs and let the set collapse labels sharing a class.
unique_sylva_classes = {
    sylva_classes[wikidata_type] for wikidata_type in wikidata_types
}

In [10]:
# Number of distinct Sylva classes used by this graph.
len(unique_sylva_classes)

22

In [11]:
# Display the Sylva classes that the graph's nodes map to.
unique_sylva_classes

{'Animated series',
 'Book',
 'Comic',
 'Comic series',
 'Country',
 'Episode',
 'Fictional Universe',
 'Film',
 'Genre',
 'Language',
 'Magazine',
 'Music',
 'No class',
 'Release',
 'Series',
 'TV program',
 'TV season',
 'TV series',
 'Timeline',
 'Video game',
 'Video game console',
 'Web'}

In [12]:
# Persist the full type -> class mapping (not just the entries used here).
with open('pickle/star-trek_sylva_classes_dict.pickle', 'wb') as classes_file:
    pickle.dump(sylva_classes, classes_file)

## Relationships' names

In [13]:
# Edge attributes use the same (value, datatype) pairs as nodes, so the
# relationship label is the first element of each edge's 'label' entry.
wikidata_relationships = {
    edge_data['label'][0] for _, _, edge_data in network.edges(data=True)
}

In [14]:
# Display the distinct relationship labels found on the graph's edges.
wikidata_relationships

{'ESRB rating',
 'PEGI rating',
 'USK rating',
 'based on',
 'country',
 'country of origin',
 'depicts',
 'describes the fictional universe',
 'distribution',
 'fictional universe described in',
 'filming location',
 'followed by',
 'follows',
 'from fictional universe',
 'genre',
 'inspired by',
 'license',
 'list of episodes',
 'main subject',
 'original language of work',
 'original network',
 'part of',
 'platform',
 'series',
 "topic's main category",
 'website account on'}

In [15]:
# Manually assigned: maps each Wikidata relationship label (key) to the Sylva
# relationship name (value). Most labels map to themselves; the exceptions
# merge near-synonyms ('country of origin' -> 'country', 'inspired by' ->
# 'based on', 'original language of work' -> 'language').
sylva_relationships = {
    'ESRB rating': 'ESRB rating',
    'PEGI rating': 'PEGI rating',
    'USK rating': 'USK rating',
    'based on': 'based on',
    'country': 'country',
    'country of origin': 'country',
    'depicts': 'depicts',
    'describes the fictional universe': 'describes the fictional universe',
    'distribution': 'distribution',
    'fictional universe described in': 'fictional universe described in',
    'filming location': 'filming location',
    'followed by': 'followed by',
    'follows': 'follows',
    'from fictional universe': 'from fictional universe',
    'genre': 'genre',
    'inspired by': 'based on',
    'language': 'language',
    'license': 'license',
    'list of episodes': 'list of episodes',
    'main subject': 'main subject',
    'original language of work': 'language',
    'original network': 'original network',
    'part of': 'part of',
    'platform': 'platform',
    'series': 'series',
    "topic's main category": "topic's main category",
    'website account on': 'website account on',
}

In [16]:
# Number of Wikidata relationship labels covered by the hand-written mapping.
len(sylva_relationships)

27

In [17]:
# Sylva relationship names actually needed for this graph; merged synonyms
# collapse into a single entry.
unique_sylva_relationships = {
    sylva_relationships[wikidata_label] for wikidata_label in wikidata_relationships
}

In [18]:
# Number of distinct Sylva relationship names used by this graph.
len(unique_sylva_relationships)

24

In [19]:
# Display the Sylva relationship names that the graph's edges map to.
unique_sylva_relationships

{'ESRB rating',
 'PEGI rating',
 'USK rating',
 'based on',
 'country',
 'depicts',
 'describes the fictional universe',
 'distribution',
 'fictional universe described in',
 'filming location',
 'followed by',
 'follows',
 'from fictional universe',
 'genre',
 'language',
 'license',
 'list of episodes',
 'main subject',
 'original network',
 'part of',
 'platform',
 'series',
 "topic's main category",
 'website account on'}

In [20]:
# Persist the full relationship-label mapping next to the class mapping.
with open('pickle/star-trek_sylva_relationships_dict.pickle', 'wb') as relationships_file:
    pickle.dump(sylva_relationships, relationships_file)

## Classes' attributes

In [21]:
from collections import defaultdict

In [22]:
# Sylva class -> set of (attribute name, datatype) pairs seen on its nodes;
# a class looked up for the first time starts with an empty set.
class_attributes = defaultdict(set)

In [23]:
# Collect, for every Sylva class, the set of (attribute name, datatype) pairs
# carried by any node of that class.

# Bookkeeping pairs present on nodes that are not attributes of the class.
_node_bookkeeping = {('id', None), ('type', None)}

# Accumulate into a fresh local mapping so this cell can be re-run on its own:
# the final dict() conversion would otherwise leave a plain dict bound to
# `class_attributes`, and the next run would fail with KeyError on the first
# class it touches.
_collected_class_attributes = defaultdict(set)
for _, info in network.nodes(data=True):
    sylva_class = sylva_classes[info['type'][0]]
    # Update in place instead of rebuilding the set with union() per node.
    # Filtering up front also avoids the KeyError that set.remove() raises
    # when a node happens to lack one of the bookkeeping pairs.
    _collected_class_attributes[sylva_class].update(
        (label, typ)
        for label, (value, typ) in info.items()
        if (label, typ) not in _node_bookkeeping
    )

# Freeze into a plain dict so later lookups of unknown classes fail loudly
# instead of silently creating empty entries.
class_attributes = dict(_collected_class_attributes)

In [24]:
# Display the (attribute name, datatype) pairs collected for each Sylva class.
class_attributes

{'Animated series': {('Commons category', 'string'),
  ('Freebase identifier', 'string'),
  ('IMDb identifier', 'string'),
  ('description', 'string'),
  ('name', 'string'),
  ('publication date', 'date'),
  ('wikilink', 'string')},
 'Book': {('Freebase identifier', 'string'),
  ('ISBN-10', 'string'),
  ('ISBN-13', 'string'),
  ('OCLC control number', 'string'),
  ('description', 'string'),
  ('name', 'string'),
  ('publication date', 'date'),
  ('wikilink', 'string')},
 'Comic': {('description', 'string'),
  ('issue', 'string'),
  ('name', 'string'),
  ('publication date', 'date'),
  ('reference URL', 'string'),
  ('wikilink', 'string')},
 'Comic series': {('description', 'string'),
  ('name', 'string'),
  ('wikilink', 'string')},
 'Country': {('description', 'string'),
  ('name', 'string'),
  ('wikilink', 'string')},
 'Episode': {('Freebase identifier', 'string'),
  ('IMDb identifier', 'string'),
  ('YouTube video identifier', 'string'),
  ('description', 'string'),
  ('name', 'strin

## Relationships' attributes, domain, and range

In [25]:
# Two independent accumulators keyed by Sylva relationship name:
#   relationship_attributes      -> set of (attribute name, datatype) pairs
#   relationship_domains_ranges  -> set of (source class, target class) pairs
# Both hand out an empty set for a relationship seen for the first time.
relationship_attributes, relationship_domains_ranges = (
    defaultdict(set) for _ in range(2)
)

In [26]:
# For every Sylva relationship collect (a) the (attribute name, datatype)
# pairs found on its edges and (b) the (source class, target class) pairs it
# connects.

# Resolve each node's Sylva class once, via nodes(data=True). This avoids the
# `network.node[...]` accessor, which no longer exists on NetworkX >= 2.4
# (assuming `network` is a NetworkX graph), while still working on older
# releases.
_class_of_node = {
    node_id: sylva_classes[node_info['type'][0]]
    for node_id, node_info in network.nodes(data=True)
}

# Fresh local accumulators keep the cell re-runnable on its own: the dict()
# conversions at the end would otherwise leave plain dicts behind and the next
# run would raise KeyError on the first relationship.
_collected_attributes = defaultdict(set)
_collected_domains_ranges = defaultdict(set)
for source_id, target_id, info in network.edges(data=True):
    relationship = sylva_relationships[info['label'][0]]

    # ('label', None) names the relationship itself and is not one of its
    # attributes. Filtering instead of set.remove() tolerates edges that
    # lack that exact pair; update() still creates the (possibly empty)
    # entry for the relationship.
    _collected_attributes[relationship].update(
        (label, typ)
        for label, (value, typ) in info.items()
        if (label, typ) != ('label', None)
    )

    _collected_domains_ranges[relationship].add(
        (_class_of_node[source_id], _class_of_node[target_id])
    )

relationship_attributes = dict(_collected_attributes)
relationship_domains_ranges = dict(_collected_domains_ranges)

In [27]:
# Display the (attribute name, datatype) pairs collected for each relationship.
relationship_attributes

{'ESRB rating': set(),
 'PEGI rating': set(),
 'USK rating': set(),
 'based on': set(),
 'country': {('publication date', 'date')},
 'depicts': set(),
 'describes the fictional universe': set(),
 'distribution': set(),
 'fictional universe described in': set(),
 'filming location': set(),
 'followed by': set(),
 'follows': set(),
 'from fictional universe': set(),
 'genre': set(),
 'language': set(),
 'license': set(),
 'list of episodes': set(),
 'main subject': set(),
 'original network': set(),
 'part of': set(),
 'platform': {('Metacritic ID', 'string')},
 'series': set(),
 "topic's main category": set(),
 'website account on': set()}

In [28]:
# Display the (source class, target class) pairs each relationship connects.
relationship_domains_ranges

{'ESRB rating': {('Video game', 'No class')},
 'PEGI rating': {('Video game', 'No class')},
 'USK rating': {('Video game', 'No class')},
 'based on': {('Book', 'Film'),
  ('Book', 'No class'),
  ('Book', 'TV series'),
  ('Film', 'Episode'),
  ('Film', 'Film'),
  ('Film', 'Series'),
  ('Video game', 'TV series')},
 'country': {('Animated series', 'Country'),
  ('Episode', 'Country'),
  ('Film', 'Country'),
  ('Series', 'Country'),
  ('TV program', 'Country'),
  ('TV season', 'Country'),
  ('TV series', 'Country')},
 'depicts': {('Comic', 'No class')},
 'describes the fictional universe': {('Book', 'Fictional Universe'),
  ('Comic', 'Fictional Universe'),
  ('Series', 'Fictional Universe')},
 'distribution': {('Video game', 'No class')},
 'fictional universe described in': {('Fictional Universe', 'Book'),
  ('Fictional Universe', 'Comic'),
  ('Fictional Universe', 'Film'),
  ('Fictional Universe', 'No class'),
  ('Fictional Universe', 'Series'),
  ('Fictional Universe', 'TV series')},
 '

## Schema

In [29]:
# Build the Sylva schema:
#   schema['allowedEdges'] - one entry per (relationship, source class,
#                            target class) combination seen in the graph;
#   schema['nodeTypes']    - one entry per class, mapping each attribute
#                            name to its property descriptor.

# Single-letter datatype codes used in the schema. Any datatype not listed
# here (including None) falls back to 'u'.
_DATATYPE_CODES = {'number': 'n', 'date': 'd'}


def _property_spec(attr, typ):
    """Return the schema descriptor for one property.

    Args:
        attr: Property name; also used as the slug.
        typ: Datatype label collected from the graph ('number', 'date' or
            anything else).

    Returns:
        A new dict on every call, so descriptors are never shared between
        schema entries. 'null', 'true' and 'false' are kept as strings here;
        they are turned into JSON literals when schema.json is written.
    """
    return {
        'auto': 'null',
        'datatype': _DATATYPE_CODES.get(typ, 'u'),
        'default': '',
        'description': '',
        'display': 'true',
        'required': 'false',
        'slug': attr,
        'validation': 'null',
        'value': ''
    }


def _in_stable_order(pairs):
    """Return 2-tuples sorted so the schema does not depend on set ordering.

    The second element may be None, hence the str() in the sort key.
    """
    return sorted(pairs, key=lambda pair: (pair[0], str(pair[1])))


schema = {'allowedEdges': [], 'nodeTypes': {}}

for relationship, domain_range_pairs in relationship_domains_ranges.items():
    for source_class, target_class in _in_stable_order(domain_range_pairs):
        schema['allowedEdges'].append({
            'label': relationship,
            'properties': {
                attr: _property_spec(attr, typ)
                for attr, typ in _in_stable_order(relationship_attributes[relationship])
            },
            'source': source_class,
            'target': target_class,
        })

for sylva_class, attribute_pairs in class_attributes.items():
    schema['nodeTypes'][sylva_class] = {
        attr: _property_spec(attr, typ)
        for attr, typ in _in_stable_order(attribute_pairs)
    }

In [30]:
# Write the schema to sylva/schema.json.
#
# Property descriptors may hold the placeholder strings 'true', 'false' and
# 'null' under the keys below; the file must contain the real JSON literals.
# Converting only those fields (instead of text-replacing every '"true"',
# '"false"' and '"null"' in the serialised file) leaves genuine strings and
# keys with those spellings intact, and needs a single write rather than
# dump -> re-read -> patch -> rewrite.
_PLACEHOLDER_KEYS = {'auto', 'display', 'required', 'validation'}
_PLACEHOLDER_VALUES = {'true': True, 'false': False, 'null': None}


def _decode_placeholders(value):
    """Return a copy of ``value`` with placeholder strings made real JSON values.

    Dicts and lists are walked recursively. A string is converted only when
    it sits directly under one of ``_PLACEHOLDER_KEYS`` and spells one of
    ``_PLACEHOLDER_VALUES``; everything else (including values that are
    already booleans or None) is returned unchanged.
    """
    if isinstance(value, dict):
        decoded = {}
        for key, item in value.items():
            is_placeholder = (
                key in _PLACEHOLDER_KEYS
                and isinstance(item, str)
                and item in _PLACEHOLDER_VALUES
            )
            decoded[key] = (
                _PLACEHOLDER_VALUES[item] if is_placeholder
                else _decode_placeholders(item)
            )
        return decoded
    if isinstance(value, list):
        return [_decode_placeholders(item) for item in value]
    return value


# `schema` itself is left untouched; only the serialised text is decoded.
content = json.dumps(_decode_placeholders(schema))

with open('sylva/schema.json', 'w') as f:
    f.write(content)

## File system

In [31]:
import csv
import os
import shutil

In [32]:
# Start from empty output directories. A directory is removed only if it
# exists, so the cell also works on a fresh checkout where the export has
# never been run (shutil.rmtree raises on a missing path). A genuine removal
# failure still raises instead of being silently ignored.
for _output_dir in ('sylva/nodes', 'sylva/relationships'):
    if os.path.isdir(_output_dir):
        shutil.rmtree(_output_dir)
    os.makedirs(_output_dir)

In [33]:
# Create one empty CSV per Sylva class and per Sylva relationship. The `with`
# block closes each handle immediately; the bare open() calls used before
# left every file open until garbage collection.
for _directory, _names in (
    ('sylva/nodes', unique_sylva_classes),
    ('sylva/relationships', unique_sylva_relationships),
):
    for _name in _names:
        with open(os.path.join(_directory, _name + '.csv'), 'w'):
            pass

## Data

In [34]:
# Flatten every node to {attribute: value} (dropping the datatype half of each
# stored pair), replace its Wikidata type with the Sylva class, and group the
# resulting records by that class.
nodes = defaultdict(list)
for _, node_info in network.nodes(data=True):
    record = {attr: pair[0] for attr, pair in node_info.items()}
    sylva_class = sylva_classes[record['type']]
    record['type'] = sylva_class
    nodes[sylva_class].append(record)
nodes = dict(nodes)

In [35]:
# Flatten every edge to a record holding its endpoints plus {attribute: value}
# for each stored pair, replace the Wikidata label with the Sylva relationship
# name, and group the records by that name.
edges = defaultdict(list)
for source_id, target_id, edge_info in network.edges(data=True):
    record = {'source id': source_id, 'target id': target_id}
    # Edge attributes are applied after the endpoints, as before, so an
    # attribute with the same name would take precedence.
    record.update({attr: pair[0] for attr, pair in edge_info.items()})
    relationship = sylva_relationships[record['label']]
    record['label'] = relationship
    edges[relationship].append(record)
edges = dict(edges)

In [36]:
# Fill the per-class and per-relationship CSV files: a header row followed by
# one fully quoted row per record.


def _write_records(path, header, records):
    """Write ``records`` to ``path`` as CSV, with ``header`` as the first row.

    Args:
        path: Destination file; it is replaced, not appended to, so running
            the export twice does not duplicate the header and rows.
        header: Column names, in output order.
        records: Iterable of dicts; a column missing from a record is written
            as an empty string.
    """
    # newline='' is what the csv module asks for; without it the writer's
    # '\r\n' line endings are translated again on Windows.
    with open(path, 'w', newline='') as csv_file:
        writer = csv.writer(
            csv_file,
            delimiter=',',
            quotechar='"',
            quoting=csv.QUOTE_ALL
        )
        writer.writerow(header)
        for record in records:
            writer.writerow([record.get(column, '') for column in header])


def _export_directory(directory, leading_columns, attributes_by_name, records_by_name):
    """Write one CSV per ``<name>.csv`` entry found in ``directory``.

    Args:
        directory: Folder holding the (possibly empty) CSV files to fill.
        leading_columns: Fixed columns placed first in every file.
        attributes_by_name: name -> set of (attribute name, datatype) pairs.
        records_by_name: name -> list of record dicts.
    """
    for filename in sorted(os.listdir(directory)):
        name, extension = os.path.splitext(filename)
        if extension != '.csv':
            # Skip stray entries such as editor or OS metadata files.
            continue
        # Sort the attribute names so the column order is the same on every
        # run instead of following set iteration order.
        attribute_columns = sorted(
            {attr_name for attr_name, _ in attributes_by_name[name]}
        )
        _write_records(
            os.path.join(directory, filename),
            list(leading_columns) + attribute_columns,
            records_by_name[name],
        )


_export_directory('sylva/nodes', ['id', 'type'], class_attributes, nodes)
_export_directory(
    'sylva/relationships',
    ['source id', 'target id', 'label'],
    relationship_attributes,
    edges,
)