In [1]:
# Imports

import os
from pathlib import Path
import pandas as pd
import json

# Load the required libraries
from rdflib import Graph, Literal, RDF, URIRef, Namespace

# rdflib knows about some namespaces, like FOAF
from rdflib.namespace import FOAF, XSD

In [2]:
# Resolve every path from the notebook's working directory.
# The paths stay plain strings because the cells below combine them with os.path.join.
absPath = str(Path.cwd())
datasetsPath = os.path.join(absPath, "datasets")
rdfPath = os.path.join(absPath, "rdf")

# Make sure the input (datasets) and output (rdf) directories exist.
# exist_ok=True replaces the separate "exists?" check, so a directory that appears
# between the check and the creation can no longer raise FileExistsError.
os.makedirs(datasetsPath, exist_ok=True)
os.makedirs(rdfPath, exist_ok=True)

# Input CSV files, all located in the datasets directory
spotifyChartsPath = os.path.join(datasetsPath, "reducedSpotifyCharts.csv")
genresPath = os.path.join(datasetsPath, "genres.csv")
marketsPath = os.path.join(datasetsPath, "markets.csv")
tracksPath = os.path.join(datasetsPath, "tracks.csv")
albumsPath = os.path.join(datasetsPath, "albums.csv")
artistsPath = os.path.join(datasetsPath, "artists.csv")
peoplePath = os.path.join(datasetsPath, "people.csv")

# Output Turtle files, all located in the rdf directory
genresTTLPath = os.path.join(rdfPath, "genres.ttl")
marketsTTLPath = os.path.join(rdfPath, "markets.ttl")
tracksTTLPath = os.path.join(rdfPath, "tracks.ttl")
albumsTTLPath = os.path.join(rdfPath, "albums.ttl")
artistsTTLPath = os.path.join(rdfPath, "artists.ttl")
peopleTTLPath = os.path.join(rdfPath, "people.ttl")
appearanceTTLPath = os.path.join(rdfPath, "appearance.ttl")


In [3]:
# Build the namespaces for the vocabularies that rdflib does not provide out of the box

# Countries vocabulary; the cells below use it for the objects of the isAvailableIn triples
CNS = Namespace("http://eulersharp.sourceforge.net/2003/03swap/countries#")

# Spotify ontology; classes, properties and resource nodes created in this notebook live under it
SO = Namespace("https://www.dei.unipd.it/~martinelli/spotify/spotifyOntology#")

In [4]:
def createGraph():
    """Return a new, empty rdflib Graph with the notebook's prefixes bound.

    The "foaf", "xsd", "countries" and "so" prefixes are registered on the
    graph so that its serialized output is more readable.
    """
    graph = Graph()

    # Prefix -> namespace pairs registered on every graph
    prefixBindings = (
        ("foaf", FOAF),
        ("xsd", XSD),
        ("countries", CNS),
        ("so", SO),
    )
    for prefix, namespace in prefixBindings:
        graph.bind(prefix, namespace)

    return graph

# Genres

In [5]:
# Start from an empty graph (with the shared prefixes bound) for the genre triples
g = createGraph()

In [6]:
# Load the genres CSV into a DataFrame indexed by its "genre" column
genres = pd.read_csv(genresPath, sep=",", index_col="genre")

In [7]:
def createGenreID(genre):
    """Build an identifier from a genre name.

    Letters and digits are kept unchanged; every other character
    (space, punctuation, ...) is replaced by "-", one dash per character.
    """
    return "".join(char if char.isalnum() else "-" for char in genre)


In [8]:
# Iterate over the genre names, i.e. the index of the genres DataFrame.
# Only the name is needed, so no per-row Series is built with iterrows().
for genre in genres.index:
    # Create genre ID from name
    genreID = createGenreID(genre)

    # Create the node to add to the Graph
    Genre = URIRef(SO[genreID])

    # Declare the node as an instance of the Genre class
    g.add((Genre, RDF.type, SO.Genre))

    # Add the name of the genre (the original name, not the generated ID)
    g.add((Genre, SO["name"], Literal(genre, datatype=XSD.string)))


In [9]:
# Serialize the genres graph as Turtle and write it to genresTTLPath
print("[💾] SAVING")
with open(genresTTLPath, mode="w", encoding="utf-8") as turtleFile:
    turtleText = g.serialize(format="turtle")
    turtleFile.write(turtleText)

[💾] SAVING


# Artists

In [10]:
# Start from an empty graph (with the shared prefixes bound) for the artist triples
g = createGraph()

In [11]:
# Load the artists CSV into a DataFrame indexed by its "id" column
artists = pd.read_csv(artistsPath, sep=",", index_col="id")

In [12]:
# Iterate over the artists DataFrame (the index holds the artist id)
for artistID, row in artists.iterrows():
    # Create the node to add to the Graph
    Artist = URIRef(SO[artistID])

    # Declare the node as an instance of the Artist class
    g.add((Artist, RDF.type, SO.Artist))

    # Add the name of the artist
    g.add((Artist, SO["name"], Literal(row["name"], datatype=XSD.string)))

    # Add the popularity of the artist
    g.add((Artist, SO["popularity"], Literal(row["popularity"], datatype=XSD.int)))

    # Split the comma-separated genre list; a missing value means "no genres".
    # A dedicated name is used so that the `genres` DataFrame loaded for the
    # Genres section is not overwritten (the genres cells stay re-runnable).
    artistGenres = row["genres"].split(",") if not pd.isnull(row["genres"]) else []

    for genre in artistGenres:
        # Create the RDF node, using the same ID scheme as the Genres section
        Genre = URIRef(SO[createGenreID(genre)])

        # Add the edge connecting the Artist and the Genre
        g.add((Artist, SO["hasGenre"], Genre))


In [13]:
# Serialize the artists graph as Turtle and write it to artistsTTLPath
print("[💾] SAVING")
with open(artistsTTLPath, mode="w", encoding="utf-8") as turtleFile:
    turtleText = g.serialize(format="turtle")
    turtleFile.write(turtleText)

[💾] SAVING


# Albums

In [14]:
# Start from an empty graph (with the shared prefixes bound) for the album triples
g = createGraph()

In [15]:
# Load the albums CSV into a DataFrame indexed by its "id" column
albums = pd.read_csv(albumsPath, sep=",", index_col="id")

In [16]:
# Suffix that completes a partial release date, keyed by the declared precision.
# Any other precision value leaves the date untouched.
datePadding = {"year": "-01-01", "month": "-01"}

# Walk the albums DataFrame one row at a time (the index holds the album id)
for albumID, row in albums.iterrows():
    # Node representing this album
    Album = URIRef(SO[albumID])

    # Class membership, title and number of tracks
    g.add((Album, RDF.type, SO.Album))
    g.add((Album, SO["name"], Literal(row["title"], datatype=XSD.string)))
    g.add((Album, SO["totalTracks"], Literal(row["total_tracks"], datatype=XSD.int)))

    # Release date, padded according to its precision before being typed as xsd:date
    releaseDate = row["release_date"]
    padding = datePadding.get(row["release_date_precision"])
    if padding is not None:
        releaseDate += padding
    g.add((Album, SO["releaseDate"], Literal(releaseDate, datatype=XSD.date)))

    # Album type, referenced as a node of the Spotify ontology
    albumType = URIRef(SO[row["album_type"]])
    g.add((Album, SO["isTypeOf"], albumType))

    # Comma-separated country codes; a missing value means no availability edges
    availableCountries = row["available_countries"]
    countries = [] if pd.isnull(availableCountries) else availableCountries.split(",")

    for country in countries:
        # Country nodes are addressed by the lower-cased code in the countries namespace
        Country = URIRef(CNS[country.lower()])

        # Edge connecting the Album and the Country
        g.add((Album, SO["isAvailableIn"], Country))
     


In [17]:
# Serialize the albums graph as Turtle and write it to albumsTTLPath
print("[💾] SAVING")
with open(albumsTTLPath, mode="w", encoding="utf-8") as turtleFile:
    turtleText = g.serialize(format="turtle")
    turtleFile.write(turtleText)

[💾] SAVING


# Tracks

In [18]:
# Start from an empty graph (with the shared prefixes bound) for the track triples
g = createGraph()

In [19]:
# Load the tracks CSV into a DataFrame indexed by its "id" column
tracks = pd.read_csv(tracksPath, sep=",", index_col="id")

In [20]:
# Technical characteristics of a track. Each entry is (name, datatype): the name is
# both the CSV column that is read and the ontology property that is written.
trackProperties = [
    ("duration", XSD.int),
    ("popularity", XSD.int),
    ("explicit", XSD.boolean),
    ("key", XSD.int),
    ("tempo", XSD.float),
    ("mode", XSD.int),
    ("time_signature", XSD.int),
    ("acousticness", XSD.float),
    ("danceability", XSD.float),
    ("energy", XSD.float),
    ("loudness", XSD.float),
    ("liveness", XSD.float),
    ("valence", XSD.float),
    ("speechiness", XSD.float),
    ("instrumentalness", XSD.float),
]

# Iterate over the tracks DataFrame (the index holds the track id)
for trackID, row in tracks.iterrows():
    # Create the node to add to the Graph
    Track = URIRef(SO[trackID])

    # Declare the node as an instance of the Track class
    g.add((Track, RDF.type, SO.Track))

    # Add the name of the track
    g.add((Track, SO["name"], Literal(row["title"], datatype=XSD.string)))

    # Add all the technical characteristics
    for propertyName, datatype in trackProperties:
        g.add((Track, SO[propertyName], Literal(row[propertyName], datatype=datatype)))

    # Load countries as array; a missing value means no availability edges
    countries = row["available_countries"].split(",") if not pd.isnull(row["available_countries"]) else []

    for country in countries:
        # Create the RDF node
        Country = URIRef(CNS[country.lower()])

        # Add the edge connecting the Track and the Country
        g.add((Track, SO["isAvailableIn"], Country))

    # Split the comma-separated artist ids.
    # A dedicated name is used so that the `artists` DataFrame loaded for the
    # Artists section is not overwritten (the artists cells stay re-runnable).
    trackArtists = row["artists"].split(",")

    for artist in trackArtists:
        # Create the RDF node
        Artist = URIRef(SO[artist])

        # Add the edge connecting the Artist and the Track.
        # NOTE(review): the predicate spelling "partecipateIn" is kept as-is;
        # presumably it matches the ontology — verify before renaming.
        g.add((Artist, SO["partecipateIn"], Track))

    # Retrieve albumID
    albumID = row["album"]

    # Create the RDF node
    Album = URIRef(SO[albumID])

    # Add the edge connecting the Track and the Album
    g.add((Track, SO["isPartOf"], Album))


In [21]:
# Serialize the tracks graph as Turtle and write it to tracksTTLPath
print("[💾] SAVING")
with open(tracksTTLPath, mode="w", encoding="utf-8") as turtleFile:
    turtleText = g.serialize(format="turtle")
    turtleFile.write(turtleText)

[💾] SAVING


# Charts

In [None]:
# Start from an empty graph (with the shared prefixes bound) for the chart triples
g = createGraph()

In [None]:
# Read the Spotify charts CSV into a DataFrame (default integer index)
charts = pd.read_csv(spotifyChartsPath, sep=",")

In [None]:
# Iterate over the charts DataFrame

# Design notes
# ------------
# Unlike the other CSV files, which carry a well-defined id, here an id has to be
# built for each specific chart by combining its country and its date, so that the
# resulting ids are unique.
#
# The chart name should probably be derived in the same way, while the number of
# tracks and the chart type can be fixed values, since these are the only data we have.
#
# The open problem is that the same file must provide both the ids used to build the
# chart nodes and the data needed to link a specific track to a position in a chart.

# TODO: draft of the chart export, kept disabled until the track/position linking
# described above is designed. It assumes the charts CSV has "country" and "date"
# columns — confirm against the file before enabling.
#
# for _, row in charts.iterrows():
#     # Retrieve info about the chart
#     country = row["country"]
#     date = row["date"]
#
#     # Create a unique ID
#     chartID = "TOP100_" + country + "_" + date
#
#     # Create the node to add to the Graph
#     Chart = URIRef(SO[chartID])
#
#     # Add triples using store's add() method.
#     g.add((Chart, RDF.type, SO.Chart))
#
#     # Add the name of the Chart
#     g.add((Chart, SO["name"], Literal("TOP 100 " + country, datatype=XSD.string)))
#
#     # Add the date of the chart
#     g.add((Chart, SO["date"], Literal(date, datatype=XSD.date)))
#
#     # Add the number of tracks
#     g.add((Chart, SO["totalTracks"], Literal("100", datatype=XSD.int)))
#
#     # Add chart type
#     chartType = URIRef(SO["Top"])
#     g.add((Chart, SO["isTypeOf"], chartType))