In [1]:
# Imports

import os
import time
import datetime
from pathlib import Path
import pandas as pd
import json
import spotifyCredentials
import requests
import base64

# Load the required libraries
from rdflib import Graph, Literal, RDF, URIRef, Namespace

# rdflib knows about some namespaces, like FOAF
from rdflib.namespace import FOAF, XSD


In [2]:
# Resolve the working directory once; every dataset path below hangs off it.
# (Path.cwd() is already absolute, no extra abspath()/absolute() needed.)
absPath = str(Path.cwd())
datasetsPath = os.path.join(absPath, "datasets")

# Setup datasets paths
spotifyChartsPath = os.path.join(datasetsPath, "reducedSpotifyCharts.csv")
genresPath = os.path.join(datasetsPath, "genres.csv")
marketsPath = os.path.join(datasetsPath, "markets.csv")
tracksPath = os.path.join(datasetsPath, "tracks.csv")
albumsPath = os.path.join(datasetsPath, "albums.csv")
artistsPath = os.path.join(datasetsPath, "artists.csv")
peoplePath = os.path.join(datasetsPath, "people.csv")

# saving folder (the serialized RDF files are written here)
savePath = os.path.join(datasetsPath, "rdf")

# Create the dataset directory AND the RDF output directory if they do not
# exist yet. makedirs() builds the missing parents too, and exist_ok=True makes
# the cell safe to re-run. Without the "rdf" folder the serialization cell
# fails with FileNotFoundError when it opens its output file.
os.makedirs(datasetsPath, exist_ok=True)
os.makedirs(savePath, exist_ok=True)


In [3]:
# Namespaces of the ontologies that rdflib does not provide out of the box.
# Each Namespace is the base URI that term names get appended to.

# Spotify ontology
SO = Namespace(
    "https://www.dei.unipd.it/~martinelli/spotify/spotifyOntology#"
)

# Countries ontology
CNS = Namespace(
    "http://eulersharp.sourceforge.net/2003/03swap/countries#"
)

# Albums

In [4]:
# Start from an empty graph
g = Graph()

# Register a short prefix for every namespace so that the serialized
# output uses readable prefixed names instead of full URIs
for prefix, namespace in (
    ("foaf", FOAF),
    ("xsd", XSD),
    ("countries", CNS),
    ("so", SO),
):
    g.bind(prefix, namespace)

# Read the albums CSV into a dataframe indexed by its 'id' column
albums = pd.read_csv(albumsPath, sep=',', index_col='id')

In [5]:
%%time 
#measure execution time

# Iterate over the albums dataframe and describe every album in the graph.

# Loop-invariant terms: build each URIRef once instead of once per album.
albumClass = SO.Album
nameProp = SO['name']
totalTracksProp = SO['totalTracks']
releaseDateProp = SO['releaseDate']
isTypeOfProp = SO['isTypeOf']
isAvailableInProp = SO['isAvailableIn']

# The same country codes recur across albums, so reuse one node per code
# instead of rebuilding the URIRef for every (album, country) pair.
countryNodes = {}

# Suffix that completes a partial release date into a full xsd:date value.
datePadding = {'year': '-01-01', 'month': '-01'}

for index, row in albums.iterrows():
    # Create the node to add to the Graph:
    # the node has the Spotify namespace + the album ID as URI
    Album = SO[str(index)]

    # Add triples using store's add() method.
    g.add((Album, RDF.type, albumClass))

    g.add((Album, nameProp, Literal(row['title'], datatype=XSD.string)))
    g.add((Album, totalTracksProp, Literal(row['total_tracks'], datatype=XSD.int)))

    # Manage release date taking into account release precision.
    # str() guards against pandas having parsed year-only dates as numbers
    # (str + int would raise), and a missing date is skipped instead of being
    # stored as the literal "nan".
    date = row['release_date']
    if pd.notna(date):
        date = str(date) + datePadding.get(row['release_date_precision'], '')
        g.add((Album, releaseDateProp, Literal(date, datatype=XSD.date)))

    # Manage album type: the type is a node in the Spotify namespace
    g.add((Album, isTypeOfProp, SO[row['album_type']]))

    # Manage available markets. The column presumably holds a list written
    # with single quotes; they are swapped for double quotes so that it
    # parses as JSON. A missing (non-string) value is treated as "no markets".
    markets = row["available_countries"]
    countries = json.loads(markets.replace("'", '"')) if isinstance(markets, str) else []

    for c in countries:
        # fetch (or create once) the RDF node of the country
        code = str(c).lower()
        Country = countryNodes.get(code)
        if Country is None:
            Country = countryNodes[code] = CNS[code]

        # add the edge connecting the Album and the Country
        g.add((Album, isAvailableInProp, Country))
     

Wall time: 52.9 s


In [6]:
%%time
# Serialize the whole graph in the Turtle format and store it as
# albums.ttl inside the RDF saving folder
print("--- saving serialization ---")
albumsTurtlePath = os.path.join(savePath, "albums.ttl")
with open(albumsTurtlePath, mode='w', encoding="utf-8") as turtleFile:
    turtleFile.write(g.serialize(format='turtle'))

--- saving serialization ---
Wall time: 1min 51s
