In [1]:
import json
import pickle

import networkx as nx
import nltk
from nltk.corpus import wordnet as wn

In [2]:
with open('pickle/star-trek_all_properties_dict.pickle', 'rb') as f:
    all_properties_dict = pickle.load(f)

with open('pickle/star-trek_all_types_dict.pickle', 'rb') as f:
    all_types_dict = pickle.load(f)

with open('pickle/star-trek_all_instances_dict.pickle', 'rb') as f:
    all_instances_dict = pickle.load(f)

with open('pickle/star-trek_hierarchy_graph.pickle', 'rb') as f:
    hierarchy = pickle.load(f)

with open('selected/star-trek_property_ids.pickle', 'rb') as f:
    property_ids = pickle.load(f)

with open('selected/star-trek_type_ids.pickle', 'rb') as f:
    type_ids = pickle.load(f)

In [3]:
relevant_properties_dict = {}
for ide in property_ids:
    relevant_properties_dict[ide] = all_properties_dict[ide]

In [4]:
len(relevant_properties_dict)

80

In [5]:
relevant_properties_dict

{'P103': {'description': 'language that a person learned natively',
  'name': 'native language'},
 'P1074': {'description': 'used to link a class of items appearing in a creative work with the analogous class of objects in the real world',
  'name': 'fictional analog of'},
 'P1080': {'description': "subject's fictional entity is in the object narrative. See also P1441 and P1445",
  'name': 'from fictional universe'},
 'P1235': {'description': 'identifier for a series, periodical or franchise in the Internet Speculative Fiction Database',
  'name': 'ISFDB series ID'},
 'P1237': {'description': 'identifier for a movie at the website Box Office Mojo',
  'name': 'Box Office Mojo film ID'},
 'P1258': {'description': 'identifier on Rotten Tomatoes: must use prefix "m/" for movies, "tv/" for TV series (remove /s number on end unless the item is a specific season), "celebrity/" for celebrities, "critic/" for film critics, "source-" for review sources',
  'name': 'Rotten Tomatoes identifier'},


In [6]:
keywords_from_domain = [
    'book', 'novel', 'fiction', 'movie', 'film', 'tv', 'television', 'videogame', 'video game',
    'theme park', 'amusement', 'park', 'play', 'broadway', 'theater', 'website', 'volume'
]

In [7]:
keywords_from_types = [
    'WikiProject', 'album', 'animated', 'animation', 'arcade', 'audio', 'band', 'book', 'broadcast', 'broadcasting',
    'chronicle', 'cinematography', 'clip', 'comic', 'console', 'country', 'episode', 'fantastic', 'fantasy',
    'fiction', 'fictional', 'film', 'franchise', 'game', 'genre', 'language', 'languoid', 'literary', 'magazine',
    'media', 'music', 'musical', 'periodical', 'platform', 'program', 'radio', 'record', 'season', 'series', 'show',
    'single', 'song', 'soundtrack', 'television', 'timeline', 'toy', 'universe', 'web', 'webcomic', 'website',
]

In [8]:
keywords = set(keywords_from_domain).union(set(keywords_from_types))

In [9]:
def synonyms(word):
    syns = set([word])
    for ss in wn.synsets(word):
        syns = syns.union(set(ss.lemma_names()))
    return syns

In [10]:
type_keyword_synonyms = set([s.replace('_', ' ') for w in keywords for s in synonyms(w) if len(s) > 1])

In [11]:
provisional_types_dict = {}
for ide in all_types_dict:
    name_words = set(nltk.word_tokenize(all_types_dict[ide]['name']))
    description_words = set(nltk.word_tokenize(all_types_dict[ide]['description']))
    if type_keyword_synonyms.intersection(name_words) or type_keyword_synonyms.intersection(name_words):
        provisional_types_dict[ide] = all_types_dict[ide]

In [12]:
len(provisional_types_dict)

155

In [13]:
provisional_types_dict

{'Q1004': {'description': 'creative work in which pictures and text convey information such as narratives',
  'name': 'comic',
  'superclasses': {'Q386724'},
  'wikilink': 'Comics'},
 'Q103495': {'description': "war affecting most of the world's most powerful and populous countries",
  'name': 'world war',
  'superclasses': {'Q198'},
  'wikilink': 'World war'},
 'Q1055307': {'description': '',
  'name': 'fictional technology',
  'superclasses': {'Q14897293'},
  'wikilink': 'Fictional technology'},
 'Q107390': {'description': 'territorial and constitutional community forming part of a federal union',
  'name': 'federated state',
  'superclasses': {'Q7275'},
  'wikilink': 'Federated state'},
 'Q1076968': {'description': '',
  'name': 'digital media',
  'superclasses': {'Q1209283'},
  'wikilink': 'Digital media'},
 'Q1078597': {'description': '',
  'name': 'Artistic language',
  'superclasses': {'Q33215'},
  'wikilink': 'Artistic language'},
 'Q10864048': {'description': 'administrative d

In [14]:
# Manually selected
# relevant_types_dict = {

In [15]:
# Manually selected
relevant_types_dict = {
 'Q1004': {'description': 'creative work in which pictures and text convey information such as narratives',
  'name': 'comic',
  'superclasses': {'Q386724'},
  'wikilink': 'Comics'},
 'Q1076968': {'description': '',
  'name': 'digital media',
  'superclasses': {'Q1209283'},
  'wikilink': 'Digital media'},
 'Q1078597': {'description': '',
  'name': 'Artistic language',
  'superclasses': {'Q33215'},
  'wikilink': 'Artistic language'},
 'Q1092563': {'description': 'published work',
  'name': 'periodical literature',
  'superclasses': {'Q11033', 'Q732577'},
  'wikilink': 'Periodical literature'},
 'Q1114502': {'description': 'thin periodical containing primarily comics content',
  'name': 'American comic book',
  'superclasses': {'Q1760610'},
  'wikilink': 'American comic book'},
 'Q11410': {'description': 'structured playing, usually undertaken for enjoyment',
  'name': 'game',
  'superclasses': {'Q173799', 'Q17538258'},
  'wikilink': 'Game'},
 'Q11424': {'description': 'sequence of images that give the impression of movement',
  'name': 'film',
  'superclasses': {'Q17537576', 'Q2431196', 'Q838948'},
  'wikilink': 'Film'},
 'Q1186952': {'description': '',
  'name': 'interactive media',
  'superclasses': {'Q340169'},
  'wikilink': 'Interactive media'},
 'Q1188693': {'description': '',
  'name': 'mind sport',
  'superclasses': {'Q31629'},
  'wikilink': 'Mind sport'},
 'Q1194240': {'description': '',
  'name': 'science fiction genre',
  'superclasses': {'Q20076756', 'Q24925'},
  'wikilink': ''},
 'Q1257934': {'description': 'episode of a television series that consists primarily of excerpts from previous episodes',
  'name': 'clip show',
  'superclasses': {'Q1983062'},
  'wikilink': 'Clip show'},
 'Q131436': {'description': 'game that involves counters or pieces moved or placed on a pre-marked surface or "board", according to a set of rules',
  'name': 'board game',
  'superclasses': {'Q3244175'},
  'wikilink': 'Board game'},
 'Q134556': {'description': 'type of music release usually containing one or two tracks',
  'name': 'single',
  'superclasses': {'Q2031291', 'Q207628', 'Q7366'},
  'wikilink': 'Single (music)'},
 'Q1427969': {'description': '',
  'name': 'flectional language',
  'superclasses': {'Q34770'},
  'wikilink': ''},
 'Q14406742': {'description': '',
  'name': 'comic book series',
  'superclasses': {'Q1004', 'Q15980953', 'Q277759'},
  'wikilink': ''},
 'Q15416': {'description': 'segment of content intended for broadcast on television',
  'name': 'television program',
  'superclasses': {'Q19968906'},
  'wikilink': 'Television program'},
 'Q15610833': {'description': 'work manifested in internet',
  'name': 'internet-based work',
  'superclasses': {'Q340169', 'Q386724'},
  'wikilink': ''},
 'Q15633582': {'description': 'site whose software engine is MediaWiki',
  'name': 'MediaWiki site',
  'superclasses': {'Q171'},
  'wikilink': ''},
 'Q15654425': {'description': '',
  'name': 'electronic toy',
  'superclasses': {'Q11422', 'Q581105'},
  'wikilink': ''},
 'Q15709880': {'description': '',
  'name': 'radio or TV series',
  'superclasses': {'Q7725310'},
  'wikilink': 'Serial (radio and television)'},
 'Q15961987': {'description': 'genre',
  'name': 'television genre',
  'superclasses': {'Q15416', 'Q483394'},
  'wikilink': ''},
 'Q171263': {'description': '',
  'name': 'agglutinative language',
  'superclasses': {'Q1427969'},
  'wikilink': 'Agglutinative language'},
 'Q17376908': {'description': 'language, variety of a language, or group of languages',
  'name': 'languoid',
  'superclasses': {'Q11028', 'Q7184903'},
  'wikilink': ''},
 'Q1760610': {'description': 'publication of comics art',
  'name': 'comic book',
  'superclasses': {'Q1004', 'Q571'},
  'wikilink': 'Comic book'},
 'Q185363': {'description': 'historical account of facts and events ranged in chronological order',
  'name': 'chronicle',
  'superclasses': {'Q2267856', 'Q571'},
  'wikilink': 'Chronicle'},
 'Q186117': {'description': 'way of displaying a list of events in chronological order',
  'name': 'timeline',
  'superclasses': {'Q185363'},
  'wikilink': 'Timeline'},
 'Q186165': {'description': 'website that integrates applications, processes and services',
  'name': 'web portal',
  'superclasses': {'Q1668024', 'Q35127'},
  'wikilink': 'Web portal'},
 'Q192851': {'description': 'coin-operated entertainment machine',
  'name': 'arcade game machine',
  'superclasses': {'Q2249149', 'Q241317'},
  'wikilink': 'Arcade game'},
 'Q196600': {'description': 'use of a creative work across several different media',
  'name': 'media franchise',
  'superclasses': {'Q171947', 'Q386724'},
  'wikilink': 'Media franchise'},
 'Q1983062': {'description': 'part of a dramatic work such as a serial television or radio program',
  'name': 'episode',
  'superclasses': {'Q15416', 'Q36649'},
  'wikilink': 'Episode'},
 'Q19968906': {'description': 'segment of content intended for broadcast',
  'name': 'broadcasting program',
  'superclasses': {'Q386724'},
  'wikilink': ''},
 'Q20076756': {'description': '',
  'name': 'fantastique genre',
  'superclasses': {'Q223393', 'Q2387832'},
  'wikilink': ''},
 'Q201658': {'description': 'refers to the method based on similarities in the narrative elements from which films are constructed',
  'name': 'film genre',
  'superclasses': {'Q1792379'},
  'wikilink': 'Film genre'},
 'Q202866': {'description': 'film for the cinema, television or computer screen that is made by using drawings',
  'name': 'animated film',
  'superclasses': {'Q11424', 'Q11425'},
  'wikilink': ''},
 'Q207628': {'description': 'aesthetic ordering and disposing of musical information',
  'name': 'musical composition',
  'superclasses': {'Q2188189'},
  'wikilink': 'Musical composition'},
 'Q2088357': {'description': 'group of people who perform music',
  'name': 'musical ensemble',
  'superclasses': {'Q16334295', 'Q43229'},
  'wikilink': 'Musical ensemble'},
 'Q21191134': {'description': '',
  'name': 'comic strip series',
  'superclasses': {'Q14406742', 'Q7725310'},
  'wikilink': ''},
 'Q213369': {'description': 'comics published online',
  'name': 'webcomic',
  'superclasses': {'Q1004'},
  'wikilink': 'Webcomic'},
 'Q215380': {'description': 'group of people who perform instrumental or vocal music',
  'name': 'band',
  'superclasses': {'Q2088357'},
  'wikilink': ''},
 'Q217199': {'description': 'recorded music accompanying and synchronized to the images of a motion picture, book, television program or video game',
  'name': 'soundtrack',
  'superclasses': {'Q2188189'},
  'wikilink': 'Soundtrack'},
 'Q2188189': {'description': '',
  'name': 'musical work',
  'superclasses': {'Q838948'},
  'wikilink': ''},
 'Q223393': {'description': 'category of literary composition',
  'name': 'literary genre',
  'superclasses': {'Q483394'},
  'wikilink': 'Literary genre'},
 'Q2249149': {'description': '',
  'name': 'electronic game',
  'superclasses': {'Q11410', 'Q7397'},
  'wikilink': 'Electronic game'},
 'Q241317': {'description': 'hardware architecture and software framework',
  'name': 'computing platform',
  'superclasses': {'Q15411548'},
  'wikilink': 'Computing platform'},
 'Q24856': {'description': 'collection of related films in succession',
  'name': 'film series',
  'superclasses': {'Q36649', 'Q7725310'},
  'wikilink': 'Film series'},
 'Q24862': {'description': 'any film not long enough to be considered a feature film',
  'name': 'short film',
  'superclasses': {'Q11424'},
  'wikilink': 'Short film'},
 'Q24925': {'description': 'genre of fiction',
  'name': 'science fiction',
  'superclasses': {'Q5240628'},
  'wikilink': 'Science fiction'},
 'Q2623733': {'description': 'language in fictional stories',
  'name': 'fictional language',
  'superclasses': {'Q1078597', 'Q14897293', 'Q315'},
  'wikilink': 'Fictional language'},
 'Q2755547': {'description': '',
  'name': 'individual sport',
  'superclasses': {'Q31629'},
  'wikilink': 'Individual sport'},
 'Q278379': {'description': '',
  'name': 'a priori language',
  'superclasses': {'Q33215'},
  'wikilink': 'A priori (languages)'},
 'Q2917506': {'description': 'Wikipedia disambiguation page',
  'name': 'series',
  'superclasses': set(),
  'wikilink': 'Series'},
 'Q315': {'description': 'capacity to communicate using signs such as words or gestures',
  'name': 'language',
  'superclasses': {'Q17376908', 'Q20811739'},
  'wikilink': 'Language'},
 'Q3244175': {'description': 'game played on a flat surface',
  'name': 'tabletop game',
  'superclasses': {'Q11410', 'Q2424752', 'Q28877'},
  'wikilink': 'Tabletop game'},
 'Q3252662': {'description': 'last installment of a series',
  'name': 'series finale',
  'superclasses': {'Q1983062'},
  'wikilink': 'Series finale'},
 'Q3297186': {'description': 'comic book series of predetermined length',
  'name': 'limited series',
  'superclasses': {'Q14406742'},
  'wikilink': 'Limited series (comics)'},
 'Q33215': {'description': '',
  'name': 'constructed language',
  'superclasses': {'Q17537576', 'Q34770'},
  'wikilink': 'Constructed language'},
 'Q33742': {'description': 'language naturally spoken by humans, as opposed to "formal" or "built" languages',
  'name': 'natural language',
  'superclasses': {'Q34770'},
  'wikilink': 'Natural language'},
 'Q3464665': {'description': 'set of episodes produced for a television series',
  'name': 'television season',
  'superclasses': {'Q7725310'},
  'wikilink': ''},
 'Q34770': {'description': 'particular system of communication',
  'name': 'language',
  'superclasses': {'Q17376908'},
  'wikilink': ''},
 'Q35127': {'description': 'set of related web pages served from a single web domain',
  'name': 'website',
  'superclasses': {'Q1076968', 'Q1186952', 'Q15610833'},
  'wikilink': 'Website'},
 'Q41298': {'description': 'publication type',
  'name': 'magazine',
  'superclasses': {'Q1092563'},
  'wikilink': 'Magazine'},
 'Q4176708': {'description': 'album containing music from another work, such as a film or opera',
  'name': 'soundtrack album',
  'superclasses': {'Q217199', 'Q482994'},
  'wikilink': 'Soundtrack album'},
 'Q4234303': {'description': 'Working groups on Wikimedia projects',
  'name': 'WikiProject',
  'superclasses': {'Q14204246'},
  'wikilink': 'Wikipedia:WikiProject'},
 'Q482994': {'description': 'collection of music',
  'name': 'album',
  'superclasses': {'Q2031291', 'Q2188189', 'Q354004', 'Q4650799'},
  'wikilink': 'Album'},
 'Q483394': {'description': 'category of creative works based on stylistic criteria',
  'name': 'genre',
  'superclasses': {'Q5962346'},
  'wikilink': 'Genre'},
 'Q526877': {'description': 'video serial broadcast on the internet',
  'name': 'web series',
  'superclasses': {'Q24862', 'Q7725310', 'Q841645'},
  'wikilink': 'Web series'},
 'Q5398426': {'description': 'connected set of television program episodes that run under the same title, possibly spanning many seasons',
  'name': 'television series',
  'superclasses': {'Q15416', 'Q15709880', 'Q2431196'},
  'wikilink': ''},
 'Q559618': {'description': 'self-consistent fictional setting with elements that differ from the real world',
  'name': 'fictional universe',
  'superclasses': {'Q14897293'},
  'wikilink': 'Fictional universe'},
 'Q571': {'description': 'medium for a collection of words and/or pictures to represent knowledge, often manifested in bound paper and ink, or in e-books',
  'name': 'book',
  'superclasses': {'Q18593264', 'Q2342494', 'Q340169', 'Q386724', 'Q49848'},
  'wikilink': 'Book'},
 'Q581714': {'description': 'TV genre; set of regularly presented animated television programs',
  'name': 'animated series',
  'superclasses': {'Q202866', 'Q5398426'},
  'wikilink': 'Animated series'},
 'Q590870': {'description': 'art or science of motion picture photography',
  'name': 'cinematography',
  'superclasses': {'Q11633'},
  'wikilink': 'Cinematography'},
 'Q6256': {'description': 'distinct region in geography; a broad term that can include political divisions or regions associated with distinct political characteristics',
  'name': 'country',
  'superclasses': {'Q56061'},
  'wikilink': 'Country'},
 'Q659563': {'description': 'categorization of video games',
  'name': 'video game genre',
  'superclasses': {'Q483394', 'Q7889'},
  'wikilink': 'Video game genre'},
 'Q7366': {'description': 'composition for voice(s)',
  'name': 'song',
  'superclasses': {'Q207628', 'Q6942562'},
  'wikilink': 'Song'},
 'Q7889': {'description': 'class of electronic game',
  'name': 'video game',
  'superclasses': {'Q166142', 'Q2249149'},
  'wikilink': 'Video game'},
 'Q8076': {'description': 'interactive entertainment computer or customized computer system to display a video game',
  'name': 'video game console',
  'superclasses': {'Q15654425', 'Q241317'},
  'wikilink': 'Video game console'},
 'Q838795': {'description': 'short serialized comics',
  'name': 'comic strip',
  'superclasses': {'Q1004'},
  'wikilink': 'Comic strip'},
 'Q841645': {'description': '',
  'name': 'Internet television',
  'superclasses': {'Q269415'},
  'wikilink': 'Internet television'},
 'Q941818': {'description': 'lightweight, portable electronic device used for gaming',
  'name': 'handheld game console',
  'superclasses': {'Q5082128', 'Q8076'},
  'wikilink': 'Handheld game console'},
 'Q963099': {'description': 'album consisting mostly of remixes or re-recorded versions of earlier released material',
  'name': 'remix album',
  'superclasses': {'Q482994'},
  'wikilink': 'Remix album'}
}

In [16]:
len(relevant_types_dict)

80

In [17]:
def get_closest_type(t, ts):
    ts = list(ts.keys())
    lengths = []
    for t2 in ts:
        try:
            # WARNING: if there is more than one shortest path, then it returns only one of them
            length = nx.shortest_path_length(hierarchy, t, t2)
        except nx.exception.NetworkXNoPath as e:
            length = 999
        lengths.append(length)
    min_length = min(lengths)
    closest = ts[lengths.index(min_length)]
    count_min = lengths.count(min_length)
    return (closest, min_length)

def get_relevant_type(ide):
    relevant_type = None
    types = all_instances_dict[ide]['types']
    candidates = set()
    for t in types:
        closest_relevant_type = get_closest_type(t, relevant_types_dict)
        if closest_relevant_type:
            candidates.add(closest_relevant_type)
    if candidates:
        # WARNING: if there is more than one candidate, then it returns only one of them
        possible_type, distance = candidates.pop(); candidates.add(possible_type)
        if distance < 999:
            relevant_type = possible_type
    return relevant_type

In [18]:
relevant_instances_dict = {}

In [19]:
for ide in all_instances_dict:
    t = get_relevant_type(ide)
    if t:
        relevant_instances_dict[ide] = all_instances_dict[ide]
        relevant_instances_dict[ide]['type'] = t

In [20]:
len(relevant_instances_dict)

1011

In [21]:
with open('pickle/star-trek_relevant_properties_dict.pickle', 'wb') as f:
    pickle.dump(relevant_properties_dict, f)

with open('pickle/star-trek_relevant_types_dict.pickle', 'wb') as f:
    pickle.dump(relevant_types_dict, f)

with open('pickle/star-trek_relevant_instances_dict.pickle', 'wb') as f:
    pickle.dump(relevant_instances_dict, f)