## Populate an RDF database

This notebook reports the main steps to download CSV files, process them, and create an RDF dataset from them according to an ontology.

To measure execution time in Jupyter notebooks: <code>pip install ipython-autotime</code>

In [1]:
# required libraries
import pandas as pd
import os
from pathlib import Path
from rdflib import Graph, Literal, RDF, URIRef, Namespace, RDFS
from rdflib.namespace import XSD, SKOS
# libraries for matching strings 
from fuzzywuzzy import fuzz, process



In [20]:
# Namespace of the MELODY ontology, used below to build the genre URIs
MEL = Namespace("http://www.dei.unipd.it/~gdb/ontology/melody#")
print(MEL)

# Base folder: two levels above the current working directory, kept as a
# plain string because later cells concatenate sub-paths onto it
path = str(
    Path(os.path.abspath(os.getcwd()))
    .parent
    .parent
    .absolute()
)
print(path)

# Folder in which the serialized Turtle file is saved
savePath = f'{path}/PopulateRDFdb/PopulateGenres/'

http://www.dei.unipd.it/~gdb/ontology/melody#
c:\Users\Manuel\Documents\università\magistrale\primo-semestre\graph-databases\MELODY


In [21]:
import unicodedata

def normalize_uri(name):
    """Turn a human-readable name into a compact ASCII token for use in a URI.

    Steps, in order:
      1. Decompose accented characters (NFKD) and drop everything that has no
         ASCII representation, so "é" becomes "e".
      2. Delete spaces, commas, apostrophes and plus signs.
      3. Replace every ampersand with the letter "n".

    A name that is reduced to a lone dash ("-") is mapped to "NotAvailable".

    Args:
        name: The text to normalize (a ``str``).

    Returns:
        The normalized string.
    """
    ascii_name = (
        unicodedata.normalize('NFKD', name)
        .encode('ASCII', 'ignore')
        .decode('ASCII')
    )
    # Single pass: '&' -> 'n'; the characters in the third argument are deleted.
    cleaned = ascii_name.translate(str.maketrans('&', 'n', " ,'+"))
    return 'NotAvailable' if cleaned == '-' else cleaned

## Genres

In [22]:
# Load the genre classification CSV in memory.
# 'Macro Genre' becomes the index, so each row is addressed by its macro genre.
genresURL = path + '/csv/classified_genres.csv'
genres = pd.read_csv(genresURL, sep=',', index_col='Macro Genre')
# DataFrame.info() prints its summary itself and returns None; wrapping it in
# print() only appended a stray "None" line to the cell output.
genres.info()

<class 'pandas.core.frame.DataFrame'>
Index: 32 entries, Ambient to Unclassified
Data columns (total 1 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Sub Genres  32 non-null     object
dtypes: object(1)
memory usage: 512.0+ bytes
None


In [23]:
# Create the graph that collects the triples
g = Graph()

# Bind each namespace to a short prefix for more readable serialized output
for _prefix, _namespace in (("mel", MEL), ("xsd", XSD), ("skos", SKOS), ("rdfs", RDFS)):
  g.bind(_prefix, _namespace)

In [24]:
%%time 
#measure execution time

# Class assigned to every genre resource, and the SKOS concept scheme that
# groups the macro genres. Indexing a Namespace already yields a URIRef, so no
# extra URIRef(...) wrapping is needed.
Genre = MEL['Genre']
GenreSchema = MEL['GenreSchema']
g.add((GenreSchema, RDF.type, SKOS.ConceptScheme))

# Iterate over the genres dataframe: the index holds the macro genre, the
# 'Sub Genres' column holds its sub genres as a single ', '-separated string
for macroGenre, row in genres.iterrows():
  # Macro genre: a SKOS concept that belongs to the genre scheme
  macroLabel = macroGenre.strip().title()
  MainGenre = MEL[normalize_uri(macroLabel)]
  g.add((MainGenre, RDF.type, SKOS.Concept))
  g.add((MainGenre, SKOS.inScheme, GenreSchema))
  g.add((MainGenre, RDF.type, Genre))
  g.add((MainGenre, RDFS.label, Literal(macroLabel, lang='en')))

  for subGenre in row['Sub Genres'].split(', '):
    subLabel = subGenre.strip().title()
    if not subLabel:
      # An empty token (e.g. left by a trailing separator) would normalize to
      # '' and declare the bare namespace URI a genre with an empty label
      continue
    # normalize_uri already removes spaces, so the label is passed as is
    SubGenre = MEL[normalize_uri(subLabel)]
    # Define the sub genre as a SKOS concept...
    g.add((SubGenre, RDF.type, SKOS.Concept))
    # ...and place it under its macro genre in the hierarchy.
    # NOTE(review): sub genres are linked only through skos:broader and get no
    # skos:inScheme triple of their own - confirm this is intended.
    g.add((SubGenre, SKOS.broader, MainGenre))
    g.add((SubGenre, RDF.type, Genre))
    g.add((SubGenre, RDFS.label, Literal(subLabel, lang='en')))

CPU times: total: 15.6 ms
Wall time: 43.8 ms


In [25]:
%%time
# Serialize the graph in the Turtle format and write it to genres.ttl in the
# saving folder
print("--- saving serialization ---")
with open(savePath + 'genres.ttl', mode='w', encoding='utf-8') as ttlFile:
  turtleText = g.serialize(format='turtle')
  ttlFile.write(turtleText)


--- saving serialization ---
CPU times: total: 62.5 ms
Wall time: 75.9 ms
