In [1]:
rdf_path = "C:\\Users\\melania\\Documents\\Master-sеmester3\\Thesis - graph embeddings\\ontologies\\genre.rdf"


# Merging the tags dataset to an ontology of genres. 

Merging a folksonomy (dataset of tags) to an formal domain ontology is a common method for word sense disambiguation of tags.     

In this notebook, we link the tags to The CWRC Genre Ontology Specification 0.3: http://sparql.cwrc.ca/ontologies/genre-2019-07-09.html 

##  Link tags to genre instances

In [1]:
import pandas as pd
import numpy as np
import json
from fuzzywuzzy import fuzz
from fuzzywuzzy import process
import pickle



In [17]:
# ID schema (node2vec needs unique int IDs of nodes
# - **[0, 100,000]** - Goodreads book IDs
# - **[100,000 - 200,000]** - Tag IDs
# - **[200,000 - 300,000]** - DBPedia Nodes
# - **[1,000,000 - 1,100,000]** - Genre Nodes
# - **[1,100,000 - ....]** - Genre Classes Nodes
TAGS_ID_BEGIN = 100000
GENRES_ID_BEGIN = 1000000
GENRE_CLASSES_ID_BEGIN = 1100000
# Weight of the hard links
HARD_WEIGHT = 1000000

In [5]:
#Open genre instanses file and extract all genres
with open("onto_files/all_genres.json") as f:
    data = json.load(f)
genres = dict()
for res in data['results']['bindings']: 
    genres[res['label']['value']] =  res['genre']['value']

In [6]:
genres

{'realist': 'http://sparql.cwrc.ca/ontologies/genre#realist',
 'fairytale': 'http://sparql.cwrc.ca/ontologies/genre#fairytale',
 'guidebook': 'http://sparql.cwrc.ca/ontologies/genre#guidebook',
 'treatise': 'http://sparql.cwrc.ca/ontologies/genre#treatise',
 'novella': 'http://sparql.cwrc.ca/ontologies/genre#novella',
 'polemic': 'http://sparql.cwrc.ca/ontologies/genre#polemic',
 'magic realist': 'http://sparql.cwrc.ca/ontologies/genre#magicRealist',
 'panegyric': 'http://sparql.cwrc.ca/ontologies/genre#panegyric',
 'slave narrative': 'http://sparql.cwrc.ca/ontologies/genre#slaveNarrative',
 'dramatic monologue': 'http://sparql.cwrc.ca/ontologies/genre#dramaticMonologue',
 'popular': 'http://sparql.cwrc.ca/ontologies/genre#popular',
 'genealogy': 'http://sparql.cwrc.ca/ontologies/genre#genealogy',
 'bergamasque': 'http://sparql.cwrc.ca/ontologies/genre#bergamasque',
 'dissertation': 'http://sparql.cwrc.ca/ontologies/genre#dissertation',
 'epitaph': 'http://sparql.cwrc.ca/ontologies/gen

In [7]:
genres["fantasy"]

'http://sparql.cwrc.ca/ontologies/genre#fantasy'

In [8]:
print("Total number of genre istances: ", len(genres))

Total number of genre istances:  284


In [15]:
# Load selected tags
df_tags = pd.read_csv("data/tags_selected.csv")
df_tags.head()

Unnamed: 0,tag_id,book_count,count,tag_name
0,0,7,24,-
1,1,2,6,--1-
2,15,2,6,--6-
3,21,2,19,-calif--
4,22,3,27,-d-c--


In [10]:
def clean_tag(tag): 
    tag = tag.replace("-", " ").replace("-", " ") #remove "-_"
    tag = " ".join(tag.split()) #remove multiple whitespaces
 
    return tag

In [11]:
df_tags["cleaned_tag"] = df_tags["tag_name"].apply(clean_tag)
df_tags.head()

Unnamed: 0,tag_id,book_count,count,tag_name,cleaned_tag
0,0,7,24,-,
1,1,2,6,--1-,1
2,15,2,6,--6-,6
3,21,2,19,-calif--,calif
4,22,3,27,-d-c--,d c


In [12]:
def create_tag_id(tag_id):
    """Helper function for creating tag ids"""
    return tag_id + TAGS_IDS_BEGIN

In [13]:
df_tags["tag_id"] = df_tags["tag_id"].apply(create_tag_id)

NameError: name 'TAGS_IDS_BEGIN' is not defined

In [14]:
def choose_genre(tag):
    choices = genres.keys()
#     tag = "german philosophy"
    candidate = process.extractOne(tag, choices, scorer = fuzz.token_sort_ratio)
    #token sort ratio is the best for now
    if candidate[1] >= 75: 
        return candidate[0]
    else: 
        return None 

In [15]:
df_tags["genre"] = df_tags["cleaned_tag"].apply(choose_genre)



In [16]:
df_genres = df_tags.dropna()
df_genres

Unnamed: 0,tag_id,book_count,count,tag_name,cleaned_tag,genre
18,98,48,1000,02-fantasy,02 fantasy,fantasy
198,571,4,5,2-non-fiction,2 non fiction,non-fiction
470,1318,6,11,a-fiction,a fiction,fiction
472,1322,4,30,a-g-riddle,a g riddle,riddle
494,1401,3,10,aa-fiction,aa fiction,fiction
...,...,...,...,...,...,...
15641,33134,4,9,young-adult-series,young adult series,young adult writing
15642,33136,4,13,young-adult-teen,young adult teen,young adult writing
15657,33166,34,285,youth-fiction,youth fiction,non-fiction
15697,33311,43,1337,Ýa-fiction,Ýa fiction,fiction


In [27]:
pd.set_option('display.max_rows', 1000)
display(df_genres[0:100])

Unnamed: 0,tag_id,book_count,count,tag_name,cleaned_tag,genre,genre_id,genre_uri
18,98,48,1000,02-fantasy,02 fantasy,fantasy,1000199,http://sparql.cwrc.ca/ontologies/genre#fantasy
198,571,4,5,2-non-fiction,2 non fiction,non-fiction,1000264,http://sparql.cwrc.ca/ontologies/genre#nonFiction
470,1318,6,11,a-fiction,a fiction,fiction,1000028,http://sparql.cwrc.ca/ontologies/genre#fiction
472,1322,4,30,a-g-riddle,a g riddle,riddle,1000180,http://sparql.cwrc.ca/ontologies/genre#riddle
494,1401,3,10,aa-fiction,aa fiction,fiction,1000028,http://sparql.cwrc.ca/ontologies/genre#fiction
547,1525,2,20,ace,ace,farce,1000174,http://sparql.cwrc.ca/ontologies/genre#farce
552,1540,2172,68941,action,action,fiction,1000028,http://sparql.cwrc.ca/ontologies/genre#fiction
575,1584,17,62,adaptation,adaptation,adaptation,1000212,http://sparql.cwrc.ca/ontologies/genre#adaptation
576,1586,5,7,adaptations,adaptations,adaptation,1000212,http://sparql.cwrc.ca/ontologies/genre#adaptation
579,1599,97,2243,addiction,addiction,fiction,1000028,http://sparql.cwrc.ca/ontologies/genre#fiction


In [None]:
wrong: 
    - a g riddle - riddle 
    - ace 	 farce
    - action - fiction 
    - adorable - parable 
    - adult literature - conduct literature
    - ancient literature - childrens literature
    - animation - imitation
    - archeology - theology
    - atheist - ma thesis 
    - athiesm - anthem
    - atonement - announcement
    - autobiographical fiction - biographical dictionary
    - award - war
    - ballet - ballad 
    - baseball fiction - bisexual fiction 
    - biker romance - literary romance
    - canadian literature - conduct literature
    - cape - camp 
    - caribbean literature - childrens literature
    - chemistry-history 
    - church fiction - courtship fiction 
    - clamp - clamp

22 wrong out of first 100. So we will map to genres, using embeddings instead of fuzzy match. 

In [9]:
df_genres = pd.read_csv("data/tags_stacked_embeddings_genres.csv")
df_genres

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,tag_id,book_count,count,tag_name,clean_tag,stacked_embed,genre_name
0,573,575,1584,17,62,adaptation,adaptation,[ 0.02827154 -0.39966932 -0.4262851 ... 0.13...,adaptation
1,797,799,2112,2,20,allegories,allegories,[-0.7937539 -0.0359518 -0.28845835 ... 0.14...,allegory
2,798,800,2113,33,1577,allegory,allegory,[-7.8196716e-01 2.9064909e-01 -2.5628287e-01 ...,allegory
3,1460,1462,3586,25,2005,autism,autism,[-0.51119936 -0.59016585 0.05876559 ... -0.12...,paranormal
4,1467,1469,3596,265,3130,auto-biography,autobiography,[-0.02404164 -0.08033854 -0.32767 ... 0.20...,autobiography
...,...,...,...,...,...,...,...,...,...
107,14675,14677,31141,121,1509,translations,translations,[-0.36979428 -0.16403413 -0.32276735 ... 0.14...,translation
108,14932,14934,31667,11,268,utopia,utopia,[-0.3878106 -0.21069324 -0.4752291 ... 0.23...,utopia
109,15155,15157,32130,1044,66056,war,war,[ 0.5518989 0.50877196 -0.49524078 ... 0.10...,war
110,15156,15158,32131,2,5,war-,war,[ 0.5518989 0.50877196 -0.49524078 ... 0.10...,war


In [7]:
#Create int IDs  of the genres for node2vec algorithm
genre_ids = dict()
i = GENRES_ID_BEGIN
for genre in genres.keys():
    genre_ids[genre] = i
    i += 1
    
print(i)
print(GENRES_ID_BEGIN)

1000284
1000000


In [11]:
df_genres["genre_id"]= df_genres["genre_name"].apply(lambda x: genre_ids[x])
df_genres

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,tag_id,book_count,count,tag_name,clean_tag,stacked_embed,genre_name,genre_id
0,573,575,1584,17,62,adaptation,adaptation,[ 0.02827154 -0.39966932 -0.4262851 ... 0.13...,adaptation,1000212
1,797,799,2112,2,20,allegories,allegories,[-0.7937539 -0.0359518 -0.28845835 ... 0.14...,allegory,1000027
2,798,800,2113,33,1577,allegory,allegory,[-7.8196716e-01 2.9064909e-01 -2.5628287e-01 ...,allegory,1000027
3,1460,1462,3586,25,2005,autism,autism,[-0.51119936 -0.59016585 0.05876559 ... -0.12...,paranormal,1000257
4,1467,1469,3596,265,3130,auto-biography,autobiography,[-0.02404164 -0.08033854 -0.32767 ... 0.20...,autobiography,1000149
...,...,...,...,...,...,...,...,...,...,...
107,14675,14677,31141,121,1509,translations,translations,[-0.36979428 -0.16403413 -0.32276735 ... 0.14...,translation,1000034
108,14932,14934,31667,11,268,utopia,utopia,[-0.3878106 -0.21069324 -0.4752291 ... 0.23...,utopia,1000108
109,15155,15157,32130,1044,66056,war,war,[ 0.5518989 0.50877196 -0.49524078 ... 0.10...,war,1000262
110,15156,15158,32131,2,5,war-,war,[ 0.5518989 0.50877196 -0.49524078 ... 0.10...,war,1000262


In [12]:
df_genres["genre_uri"] = df_genres["genre_name"].apply(lambda x: genres[x])
df_genres.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,tag_id,book_count,count,tag_name,clean_tag,stacked_embed,genre_name,genre_id,genre_uri
0,573,575,1584,17,62,adaptation,adaptation,[ 0.02827154 -0.39966932 -0.4262851 ... 0.13...,adaptation,1000212,http://sparql.cwrc.ca/ontologies/genre#adaptation
1,797,799,2112,2,20,allegories,allegories,[-0.7937539 -0.0359518 -0.28845835 ... 0.14...,allegory,1000027,http://sparql.cwrc.ca/ontologies/genre#allegory
2,798,800,2113,33,1577,allegory,allegory,[-7.8196716e-01 2.9064909e-01 -2.5628287e-01 ...,allegory,1000027,http://sparql.cwrc.ca/ontologies/genre#allegory
3,1460,1462,3586,25,2005,autism,autism,[-0.51119936 -0.59016585 0.05876559 ... -0.12...,paranormal,1000257,http://sparql.cwrc.ca/ontologies/genre#paranormal
4,1467,1469,3596,265,3130,auto-biography,autobiography,[-0.02404164 -0.08033854 -0.32767 ... 0.20...,autobiography,1000149,http://sparql.cwrc.ca/ontologies/genre#autobio...


In [20]:
#Hashing
#df_genres["genre_hash"] = df_genres["genre_uri"].apply(lambda x: hash(x))

In [16]:
#Coverage 
len(df_genres) / len(df_tags)

0.0069947539345490885

In [14]:
df_genres.to_csv("data/tags_to_genres.csv")

In [23]:
# tag_to_genre_edges = list(zip(list(df_genres.tag_id.values), list(df_genres.genre_uri.values)))
# tag_to_genre_edges

In [17]:
tags_genre_ids_edges = list(zip(list(df_genres.tag_id.values), list(df_genres.genre_id.values)))
tags_genre_ids_edges 

[(1584, 1000212),
 (2112, 1000027),
 (2113, 1000027),
 (3586, 1000257),
 (3596, 1000149),
 (3611, 1000149),
 (4514, 1000179),
 (4516, 1000179),
 (4605, 1000196),
 (5951, 1000272),
 (7725, 1000145),
 (7866, 1000282),
 (8114, 1000252),
 (8116, 1000252),
 (8130, 1000252),
 (8147, 1000252),
 (8149, 1000252),
 (8150, 1000252),
 (8151, 1000252),
 (8560, 1000251),
 (8573, 1000248),
 (9336, 1000066),
 (9385, 1000243),
 (9389, 1000243),
 (9432, 1000096),
 (9472, 1000235),
 (9482, 1000170),
 (9593, 1000013),
 (9669, 1000220),
 (9697, 1000052),
 (9886, 1000019),
 (9969, 1000019),
 (10059, 1000181),
 (10064, 1000181),
 (10080, 1000181),
 (10081, 1000181),
 (10082, 1000181),
 (10083, 1000181),
 (10084, 1000181),
 (10711, 1000281),
 (10729, 1000093),
 (10753, 1000141),
 (10825, 1000280),
 (10835, 1000280),
 (10886, 1000068),
 (11061, 1000118),
 (11106, 1000203),
 (11148, 1000001),
 (11152, 1000001),
 (11159, 1000001),
 (11162, 1000001),
 (11305, 1000199),
 (11673, 1000215),
 (11743, 1000028),
 (1283

In [29]:
def save_edges(edgelist, outfile, weights=False):
    """Write the edges in a file in a suitable for Node2vec form. """
    with open(outfile, "w+") as f:
        for edge in edgelist:
            f.write(str(edge[0]))
            f.write(" ")
            f.write(str(edge[1]))
            if weights:
                    f.write(" ")
                    f.write(str(edge[2]))
            f.write("\n")                

In [19]:
# #Save edges
save_edges(tags_genre_ids_edges, "edges/tag_to_genre_ids_new.edgelist")

In [88]:
# #Save edges
# with open("edges/tag_to_genre_edges.pickle", "wb") as f: 
#     pickle.dump(tag_to_genre_edges, f)

##  Add egdes for the classes in the ontology

In [20]:
#Open genre instanses file and extract all genres
with open("onto_files/all_edges.json") as f:
    data = json.load(f)

In [21]:
data['results']['bindings']

[{'genre1': {'type': 'uri',
   'value': 'http://sparql.cwrc.ca/ontologies/genre#realist'},
  'genre2': {'type': 'uri',
   'value': 'http://sparql.cwrc.ca/ontologies/genre#FictionalGenre'}},
 {'genre1': {'type': 'uri',
   'value': 'http://sparql.cwrc.ca/ontologies/genre#fairytale'},
  'genre2': {'type': 'uri',
   'value': 'http://sparql.cwrc.ca/ontologies/genre#FictionalGenre'}},
 {'genre1': {'type': 'uri',
   'value': 'http://sparql.cwrc.ca/ontologies/genre#guidebook'},
  'genre2': {'type': 'uri',
   'value': 'http://sparql.cwrc.ca/ontologies/genre#ThematicGenre'}},
 {'genre1': {'type': 'uri',
   'value': 'http://sparql.cwrc.ca/ontologies/genre#treatise'},
  'genre2': {'type': 'uri',
   'value': 'http://sparql.cwrc.ca/ontologies/genre#ScholarlyGenre'}},
 {'genre1': {'type': 'uri',
   'value': 'http://sparql.cwrc.ca/ontologies/genre#novella'},
  'genre2': {'type': 'uri',
   'value': 'http://sparql.cwrc.ca/ontologies/genre#NovelisticGenre'}},
 {'genre1': {'type': 'uri',
   'value': 'http

In [22]:
len(data['results']['bindings'])

449

In [23]:
edges = set()
for res in data['results']['bindings']: 
    edges.add((res['genre1']['value'], res['genre2']['value']))

In [24]:
edges

{('http://sparql.cwrc.ca/ontologies/genre#abridgement',
  'http://sparql.cwrc.ca/ontologies/genre#IntertextualGenre'),
 ('http://sparql.cwrc.ca/ontologies/genre#acrostic',
  'http://sparql.cwrc.ca/ontologies/genre#PoeticGenre'),
 ('http://sparql.cwrc.ca/ontologies/genre#adaptation',
  'http://sparql.cwrc.ca/ontologies/genre#IntertextualGenre'),
 ('http://sparql.cwrc.ca/ontologies/genre#adventureWriting',
  'http://sparql.cwrc.ca/ontologies/genre#FictionalGenre'),
 ('http://sparql.cwrc.ca/ontologies/genre#advertisement',
  'http://sparql.cwrc.ca/ontologies/genre#AdvertisementForm'),
 ('http://sparql.cwrc.ca/ontologies/genre#afterpiece',
  'http://sparql.cwrc.ca/ontologies/genre#DramaticGenre'),
 ('http://sparql.cwrc.ca/ontologies/genre#afterword',
  'http://sparql.cwrc.ca/ontologies/genre#EmbeddedWork'),
 ('http://sparql.cwrc.ca/ontologies/genre#agitprop',
  'http://sparql.cwrc.ca/ontologies/genre#DramaticGenre'),
 ('http://sparql.cwrc.ca/ontologies/genre#agitprop',
  'http://sparql.cwr

In [25]:
#All is-a relations on the ontology: 
print("All is-a relations on the ontology: ", len(edges))

All is-a relations on the ontology:  449


In [26]:
genre_classes = set() 
for res in data['results']['bindings']: 
    genre_classes.add(res['genre2']['value'])
print("Number of genre classes: ", len(genre_classes))

Number of genre classes:  57


In [27]:
#Create genre classes IDs 
genre_classes_ids = dict()
j = GENRE_CLASSES_ID_BEGIN
for genre_class in genre_classes:
    genre_classes_ids[genre_class] = j
    j += 1
    
print(j)
print(GENRE_CLASSES_ID_BEGIN)

1100057
1100000


In [28]:
genre_classes_ids

{'http://sparql.cwrc.ca/ontologies/genre#AlphabetForm': 1100000,
 'http://sparql.cwrc.ca/ontologies/genre#TextualMedium': 1100001,
 'http://sparql.cwrc.ca/ontologies/genre#CatalogueForm': 1100002,
 'http://sparql.cwrc.ca/ontologies/genre#LiteraryGenre': 1100003,
 'http://sparql.cwrc.ca/ontologies/genre#IllustratedForm': 1100004,
 'http://sparql.cwrc.ca/ontologies/genre#JournalisticGenre': 1100005,
 'http://sparql.cwrc.ca/ontologies/genre#SatiricalGenre': 1100006,
 'http://sparql.cwrc.ca/ontologies/genre#ScholarlyGenre': 1100007,
 'http://sparql.cwrc.ca/ontologies/genre#FeministGenre': 1100008,
 'http://sparql.cwrc.ca/ontologies/genre#AdvertisementForm': 1100009,
 'http://sparql.cwrc.ca/ontologies/genre#DigitalMedium': 1100010,
 'http://sparql.cwrc.ca/ontologies/genre#StandaloneWork': 1100011,
 'http://sparql.cwrc.ca/ontologies/genre#LifeWritingGenre': 1100012,
 'http://sparql.cwrc.ca/ontologies/genre#VisualArtMedium': 1100013,
 'http://sparql.cwrc.ca/ontologies/genre#ReferenceWorkGenre

In [29]:
#Create edges dict 
edges_dict = dict() 
for pair in list(edges): 
    if pair[0] not in edges_dict.keys(): 
        edges_dict[pair[0]] = [pair[1]]
    else: 
        edges_dict[pair[0]].append(pair[1])

In [30]:
edges_dict

{'http://sparql.cwrc.ca/ontologies/genre#exhibitionCatalogue': ['http://sparql.cwrc.ca/ontologies/genre#CatalogueForm'],
 'http://sparql.cwrc.ca/ontologies/genre#serialVolume': ['http://sparql.cwrc.ca/ontologies/genre#SerialForm'],
 'http://sparql.cwrc.ca/ontologies/genre#comedy': ['http://sparql.cwrc.ca/ontologies/genre#ComedicGenre'],
 'http://sparql.cwrc.ca/ontologies/genre#clerihew': ['http://sparql.cwrc.ca/ontologies/genre#PoeticGenre'],
 'http://sparql.cwrc.ca/ontologies/genre#fable': ['http://sparql.cwrc.ca/ontologies/genre#NarrativeGenre'],
 'http://sparql.cwrc.ca/ontologies/genre#prayer': ['http://sparql.cwrc.ca/ontologies/genre#ReligiousGenre'],
 'http://sparql.cwrc.ca/ontologies/genre#legendFolktale': ['http://sparql.cwrc.ca/ontologies/genre#FictionalGenre'],
 'http://sparql.cwrc.ca/ontologies/genre#epistle': ['http://sparql.cwrc.ca/ontologies/genre#EpistolaryGenre'],
 'http://sparql.cwrc.ca/ontologies/genre#philosophy': ['http://sparql.cwrc.ca/ontologies/genre#Philosophical

In [31]:
genre_is_a_edges = list()
for genre in list(df_genres.genre_uri.values): 
    classes = edges_dict[genre]
    for cl in classes: 
        genre_is_a_edges.append((genre, cl))
genre_is_a_edges

[('http://sparql.cwrc.ca/ontologies/genre#adaptation',
  'http://sparql.cwrc.ca/ontologies/genre#IntertextualGenre'),
 ('http://sparql.cwrc.ca/ontologies/genre#allegory',
  'http://sparql.cwrc.ca/ontologies/genre#FictionalGenre'),
 ('http://sparql.cwrc.ca/ontologies/genre#allegory',
  'http://sparql.cwrc.ca/ontologies/genre#FictionalGenre'),
 ('http://sparql.cwrc.ca/ontologies/genre#paranormal',
  'http://sparql.cwrc.ca/ontologies/genre#ThematicGenre'),
 ('http://sparql.cwrc.ca/ontologies/genre#autobiography',
  'http://sparql.cwrc.ca/ontologies/genre#AutobiographicalGenre'),
 ('http://sparql.cwrc.ca/ontologies/genre#autobiography',
  'http://sparql.cwrc.ca/ontologies/genre#AutobiographicalGenre'),
 ('http://sparql.cwrc.ca/ontologies/genre#bildungsroman',
  'http://sparql.cwrc.ca/ontologies/genre#NovelisticGenre'),
 ('http://sparql.cwrc.ca/ontologies/genre#bildungsroman',
  'http://sparql.cwrc.ca/ontologies/genre#NovelisticGenre'),
 ('http://sparql.cwrc.ca/ontologies/genre#biography',


In [32]:
genre_is_a_edges =list(set(genre_is_a_edges))
print("All new edges: ", len(genre_is_a_edges))

All new edges:  100


In [33]:
#Save edges
with open("edges/genre_is_a_edges_new.pickle", "wb") as f: 
    pickle.dump(genre_is_a_edges, f)

In [34]:
len(edges_dict)

377

In [35]:
#Add the is-a reations to the tags in the folksonomy
df_genres.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,tag_id,book_count,count,tag_name,clean_tag,stacked_embed,genre_name,genre_id,genre_uri
0,573,575,1584,17,62,adaptation,adaptation,[ 0.02827154 -0.39966932 -0.4262851 ... 0.13...,adaptation,1000212,http://sparql.cwrc.ca/ontologies/genre#adaptation
1,797,799,2112,2,20,allegories,allegories,[-0.7937539 -0.0359518 -0.28845835 ... 0.14...,allegory,1000027,http://sparql.cwrc.ca/ontologies/genre#allegory
2,798,800,2113,33,1577,allegory,allegory,[-7.8196716e-01 2.9064909e-01 -2.5628287e-01 ...,allegory,1000027,http://sparql.cwrc.ca/ontologies/genre#allegory
3,1460,1462,3586,25,2005,autism,autism,[-0.51119936 -0.59016585 0.05876559 ... -0.12...,paranormal,1000257,http://sparql.cwrc.ca/ontologies/genre#paranormal
4,1467,1469,3596,265,3130,auto-biography,autobiography,[-0.02404164 -0.08033854 -0.32767 ... 0.20...,autobiography,1000149,http://sparql.cwrc.ca/ontologies/genre#autobio...


In [36]:
def add_class(genre): 
    genre_uri = genres[genre]
    links = edges_dict[genre_uri]
    
    return links

In [38]:
df_genres["class"] = df_genres["genre_name"].apply(add_class)
df_genres

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,tag_id,book_count,count,tag_name,clean_tag,stacked_embed,genre_name,genre_id,genre_uri,class
0,573,575,1584,17,62,adaptation,adaptation,[ 0.02827154 -0.39966932 -0.4262851 ... 0.13...,adaptation,1000212,http://sparql.cwrc.ca/ontologies/genre#adaptation,[http://sparql.cwrc.ca/ontologies/genre#Intert...
1,797,799,2112,2,20,allegories,allegories,[-0.7937539 -0.0359518 -0.28845835 ... 0.14...,allegory,1000027,http://sparql.cwrc.ca/ontologies/genre#allegory,[http://sparql.cwrc.ca/ontologies/genre#Fictio...
2,798,800,2113,33,1577,allegory,allegory,[-7.8196716e-01 2.9064909e-01 -2.5628287e-01 ...,allegory,1000027,http://sparql.cwrc.ca/ontologies/genre#allegory,[http://sparql.cwrc.ca/ontologies/genre#Fictio...
3,1460,1462,3586,25,2005,autism,autism,[-0.51119936 -0.59016585 0.05876559 ... -0.12...,paranormal,1000257,http://sparql.cwrc.ca/ontologies/genre#paranormal,[http://sparql.cwrc.ca/ontologies/genre#Themat...
4,1467,1469,3596,265,3130,auto-biography,autobiography,[-0.02404164 -0.08033854 -0.32767 ... 0.20...,autobiography,1000149,http://sparql.cwrc.ca/ontologies/genre#autobio...,[http://sparql.cwrc.ca/ontologies/genre#Autobi...
...,...,...,...,...,...,...,...,...,...,...,...,...
107,14675,14677,31141,121,1509,translations,translations,[-0.36979428 -0.16403413 -0.32276735 ... 0.14...,translation,1000034,http://sparql.cwrc.ca/ontologies/genre#transla...,[http://sparql.cwrc.ca/ontologies/genre#Intert...
108,14932,14934,31667,11,268,utopia,utopia,[-0.3878106 -0.21069324 -0.4752291 ... 0.23...,utopia,1000108,http://sparql.cwrc.ca/ontologies/genre#utopia,[http://sparql.cwrc.ca/ontologies/genre#Fictio...
109,15155,15157,32130,1044,66056,war,war,[ 0.5518989 0.50877196 -0.49524078 ... 0.10...,war,1000262,http://sparql.cwrc.ca/ontologies/genre#war,[http://sparql.cwrc.ca/ontologies/genre#Politi...
110,15156,15158,32131,2,5,war-,war,[ 0.5518989 0.50877196 -0.49524078 ... 0.10...,war,1000262,http://sparql.cwrc.ca/ontologies/genre#war,[http://sparql.cwrc.ca/ontologies/genre#Politi...


In [39]:
def explode(df, lst_cols, fill_value='', preserve_index=False):
    # make sure `lst_cols` is list-alike
    if (lst_cols is not None
        and len(lst_cols) > 0
        and not isinstance(lst_cols, (list, tuple, np.ndarray, pd.Series))):
        lst_cols = [lst_cols]
    # all columns except `lst_cols`
    idx_cols = df.columns.difference(lst_cols)
    # calculate lengths of lists
    lens = df[lst_cols[0]].str.len()
    # preserve original index values    
    idx = np.repeat(df.index.values, lens)
    # create "exploded" DF
    res = (pd.DataFrame({
                col:np.repeat(df[col].values, lens)
                for col in idx_cols},
                index=idx)
             .assign(**{col:np.concatenate(df.loc[lens>0, col].values)
                            for col in lst_cols}))
    # append those rows that have empty lists
    if (lens == 0).any():
        # at least one list in cells is empty
        res = (res.append(df.loc[lens==0, idx_cols], sort=False)
                  .fillna(fill_value))
    # revert the original index order
    res = res.sort_index()
    # reset index if requested
    if not preserve_index:        
        res = res.reset_index(drop=True)
    return res
df_links = explode(df_genres, ["class"])

In [41]:
df_links

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,book_count,clean_tag,count,genre_id,genre_name,genre_uri,stacked_embed,tag_id,tag_name,class
0,573,575,17,adaptation,62,1000212,adaptation,http://sparql.cwrc.ca/ontologies/genre#adaptation,[ 0.02827154 -0.39966932 -0.4262851 ... 0.13...,1584,adaptation,http://sparql.cwrc.ca/ontologies/genre#Interte...
1,797,799,2,allegories,20,1000027,allegory,http://sparql.cwrc.ca/ontologies/genre#allegory,[-0.7937539 -0.0359518 -0.28845835 ... 0.14...,2112,allegories,http://sparql.cwrc.ca/ontologies/genre#Fiction...
2,798,800,33,allegory,1577,1000027,allegory,http://sparql.cwrc.ca/ontologies/genre#allegory,[-7.8196716e-01 2.9064909e-01 -2.5628287e-01 ...,2113,allegory,http://sparql.cwrc.ca/ontologies/genre#Fiction...
3,1460,1462,25,autism,2005,1000257,paranormal,http://sparql.cwrc.ca/ontologies/genre#paranormal,[-0.51119936 -0.59016585 0.05876559 ... -0.12...,3586,autism,http://sparql.cwrc.ca/ontologies/genre#Themati...
4,1467,1469,265,autobiography,3130,1000149,autobiography,http://sparql.cwrc.ca/ontologies/genre#autobio...,[-0.02404164 -0.08033854 -0.32767 ... 0.20...,3596,auto-biography,http://sparql.cwrc.ca/ontologies/genre#Autobio...
...,...,...,...,...,...,...,...,...,...,...,...,...
126,15155,15157,1044,war,66056,1000262,war,http://sparql.cwrc.ca/ontologies/genre#war,[ 0.5518989 0.50877196 -0.49524078 ... 0.10...,32130,war,http://sparql.cwrc.ca/ontologies/genre#Politic...
127,15155,15157,1044,war,66056,1000262,war,http://sparql.cwrc.ca/ontologies/genre#war,[ 0.5518989 0.50877196 -0.49524078 ... 0.10...,32130,war,http://sparql.cwrc.ca/ontologies/genre#Themati...
128,15156,15158,2,war,5,1000262,war,http://sparql.cwrc.ca/ontologies/genre#war,[ 0.5518989 0.50877196 -0.49524078 ... 0.10...,32131,war-,http://sparql.cwrc.ca/ontologies/genre#Politic...
129,15156,15158,2,war,5,1000262,war,http://sparql.cwrc.ca/ontologies/genre#war,[ 0.5518989 0.50877196 -0.49524078 ... 0.10...,32131,war-,http://sparql.cwrc.ca/ontologies/genre#Themati...


In [42]:
df_links["class_id"] = df_links["class"].apply(lambda x: genre_classes_ids[x])

In [43]:
df_links.head()

Unnamed: 0.2,Unnamed: 0,Unnamed: 0.1,book_count,clean_tag,count,genre_id,genre_name,genre_uri,stacked_embed,tag_id,tag_name,class,class_id
0,573,575,17,adaptation,62,1000212,adaptation,http://sparql.cwrc.ca/ontologies/genre#adaptation,[ 0.02827154 -0.39966932 -0.4262851 ... 0.13...,1584,adaptation,http://sparql.cwrc.ca/ontologies/genre#Interte...,1100052
1,797,799,2,allegories,20,1000027,allegory,http://sparql.cwrc.ca/ontologies/genre#allegory,[-0.7937539 -0.0359518 -0.28845835 ... 0.14...,2112,allegories,http://sparql.cwrc.ca/ontologies/genre#Fiction...,1100019
2,798,800,33,allegory,1577,1000027,allegory,http://sparql.cwrc.ca/ontologies/genre#allegory,[-7.8196716e-01 2.9064909e-01 -2.5628287e-01 ...,2113,allegory,http://sparql.cwrc.ca/ontologies/genre#Fiction...,1100019
3,1460,1462,25,autism,2005,1000257,paranormal,http://sparql.cwrc.ca/ontologies/genre#paranormal,[-0.51119936 -0.59016585 0.05876559 ... -0.12...,3586,autism,http://sparql.cwrc.ca/ontologies/genre#Themati...,1100047
4,1467,1469,265,autobiography,3130,1000149,autobiography,http://sparql.cwrc.ca/ontologies/genre#autobio...,[-0.02404164 -0.08033854 -0.32767 ... 0.20...,3596,auto-biography,http://sparql.cwrc.ca/ontologies/genre#Autobio...,1100032


In [44]:
# Create genre to class edges 
genre_to_class_edges = list(zip(list(df_links.genre_id.values), list(df_links.class_id.values)))
print("Is-a links: ", len(genre_to_class_edges))
# Remove duplicates 
genre_to_class_edges = list(set(genre_to_class_edges))
print("Is-a links unique: ", len(genre_to_class_edges))

Is-a links:  131
Is-a links unique:  100


In [45]:
# #Save edges
save_edges(genre_to_class_edges, "edges/genre_to_class_ids_new.edgelist")

## Concatenate new edges with previous ones 

In [8]:
#load book to tag edges without the weights 
tag_to_genre_edges = list()
with open("edges/tag_to_genre_ids_new.edgelist", "r") as f: 
    for line in f: 
        unweighted_edge = line.split()
        edge = (int(unweighted_edge[0]), int(unweighted_edge[1])) 
        tag_to_genre_edges.append(edge)
print(len(tag_to_genre_edges)

112


In [9]:
#load tag to class edges without the weights 
genre_to_class_edges = list()
with open("edges/genre_to_class_ids_new.edgelist", "r") as f: 
    for line in f: 
        unweighted_edge = line.split()
        edge = (int(unweighted_edge[0]), int(unweighted_edge[1])) 
        genre_to_class_edges.append(edge)

In [10]:
len(genre_to_class_edges)

100

In [15]:
new_edges = genre_to_class_edges + tag_to_genre_edges
print("New edges after merging with ontology: ", len(new_edges))

New edges after merging with ontology:  212


In [18]:
new_weighted_edges = [(edge[0], edge[1], HARD_WEIGHT) for edge in new_edges]

In [19]:
new_weighted_edges[0:10]

[(1000212, 1100052, 1000000),
 (1000079, 1100037, 1000000),
 (1000222, 1100007, 1000000),
 (1000090, 1100047, 1000000),
 (1000215, 1100016, 1000000),
 (1000030, 1100047, 1000000),
 (1000084, 1100031, 1000000),
 (1000249, 1100019, 1000000),
 (1000054, 1100047, 1000000),
 (1000179, 1100025, 1000000)]

In [23]:
#load book to tag edges without the weights 
book_to_tag_edges = list()
with open("edges/book_tags_full.edgelist", "r") as f: 
    for line in f: 
        weighted_edge = line.split(",")
        edge = (int(weighted_edge[0]), int(weighted_edge[1]), int(weighted_edge[2])) 
        book_to_tag_edges.append(edge)

In [24]:
book_to_tag_edges[0:10]

[(1, 30574, 167697),
 (1, 11305, 37174),
 (1, 11557, 34173),
 (1, 8717, 12986),
 (1, 33114, 12716),
 (1, 11743, 9954),
 (1, 14017, 7169),
 (1, 5207, 6221),
 (1, 22743, 4974),
 (1, 32989, 4364)]

In [25]:
print("Book to tag edges: ", len(book_to_tag_edges))

Book to tag edges:  999912


In [26]:
print("Percent of growth of edges: ", len(new_edges) / len(book_to_tag_edges))

Percent of growth of edges:  0.00021201865764187248


In [27]:
concat_edges = book_to_tag_edges + new_weighted_edges

In [54]:
concat_edges[-100:]

[(8114, 1000252, 1000000),
 (8116, 1000252, 1000000),
 (8130, 1000252, 1000000),
 (8147, 1000252, 1000000),
 (8149, 1000252, 1000000),
 (8150, 1000252, 1000000),
 (8151, 1000252, 1000000),
 (8560, 1000251, 1000000),
 (8573, 1000248, 1000000),
 (9336, 1000066, 1000000),
 (9385, 1000243, 1000000),
 (9389, 1000243, 1000000),
 (9432, 1000096, 1000000),
 (9472, 1000235, 1000000),
 (9482, 1000170, 1000000),
 (9593, 1000013, 1000000),
 (9669, 1000220, 1000000),
 (9697, 1000052, 1000000),
 (9886, 1000019, 1000000),
 (9969, 1000019, 1000000),
 (10059, 1000181, 1000000),
 (10064, 1000181, 1000000),
 (10080, 1000181, 1000000),
 (10081, 1000181, 1000000),
 (10082, 1000181, 1000000),
 (10083, 1000181, 1000000),
 (10084, 1000181, 1000000),
 (10711, 1000281, 1000000),
 (10729, 1000093, 1000000),
 (10753, 1000141, 1000000),
 (10825, 1000280, 1000000),
 (10835, 1000280, 1000000),
 (10886, 1000068, 1000000),
 (11061, 1000118, 1000000),
 (11106, 1000203, 1000000),
 (11148, 1000001, 1000000),
 (11152, 100

In [30]:
save_edges(concat_edges, "edges/book_tag_class_w_full.edgelist", weights=True)