In [None]:
# Imports

import os
from pathlib import Path
import pandas as pd
import datetime

# Load the required libraries
from rdflib import Graph, Literal, RDF, URIRef, Namespace

# rdflib knows about some namespaces, like FOAF
from rdflib.namespace import FOAF, XSD

In [None]:
# Resolve the working directory once; every dataset and RDF path is built from it
absPath = str(Path(os.path.abspath(os.getcwd())).absolute())
datasetsPath = os.path.join(absPath, "datasets")
rdfPath = os.path.join(absPath, "rdf")

# Make sure both directories exist.
# exist_ok=True replaces the "check, then mkdir" pattern, which can fail if the
# directory appears between the check and the creation.
os.makedirs(datasetsPath, exist_ok=True)
os.makedirs(rdfPath, exist_ok=True)

# Input datasets (CSV)
spotifyChartsPath = os.path.join(datasetsPath, "reducedSpotifyCharts.csv")
genresPath = os.path.join(datasetsPath, "genres.csv")
marketsPath = os.path.join(datasetsPath, "markets.csv")
tracksPath = os.path.join(datasetsPath, "tracks.csv")
albumsPath = os.path.join(datasetsPath, "albums.csv")
artistsPath = os.path.join(datasetsPath, "artists.csv")
peoplePath = os.path.join(datasetsPath, "people.csv")

# Country lookup tables (CSV)
countriesPath = os.path.join(datasetsPath, "countries2.csv")
altCountriesPath = os.path.join(datasetsPath, "altCountries.csv")

# Output files (Turtle), one per section of the notebook
genresTTLPath = os.path.join(rdfPath, "genres.ttl")
marketsTTLPath = os.path.join(rdfPath, "markets.ttl")
tracksTTLPath = os.path.join(rdfPath, "tracks.ttl")
albumsTTLPath = os.path.join(rdfPath, "albums.ttl")
artistsTTLPath = os.path.join(rdfPath, "artists.ttl")
peopleTTLPath = os.path.join(rdfPath, "people.ttl")
chartsTTLPath = os.path.join(rdfPath, "charts.ttl")
appearanceTTLPath = os.path.join(rdfPath, "appearance.ttl")


In [None]:
# Declare the ontology namespaces that rdflib does not ship with

# Country ontology: country nodes are addressed as CNS[<lower-cased country code>]
CNS = Namespace("http://eulersharp.sourceforge.net/2003/03swap/countries#")

# Spotify ontology: classes, predicates and every entity node created in this notebook
SO = Namespace("https://www.dei.unipd.it/~martinelli/spotify/spotifyOntology#")

# Utilities

In [None]:
def createGraph():
    """Build an empty rdflib Graph with the notebook's prefixes already bound.

    Returns:
        Graph: a new graph with the ``foaf``, ``xsd``, ``countries`` and ``so``
        prefixes bound to their namespaces.
    """
    graph = Graph()

    # Register the prefixes so the serialized output is more readable
    prefixes = (("foaf", FOAF), ("xsd", XSD), ("countries", CNS), ("so", SO))
    for prefix, namespace in prefixes:
        graph.bind(prefix, namespace)

    return graph

In [None]:
def loadCountries():
    """Read the two country lookup tables from disk.

    Returns:
        tuple: ``(countries, altCountries)`` DataFrames. The columns of
        ``altCountries`` are renamed to ``AlternativeName`` and ``Name``.
    """
    countryTable = pd.read_csv(countriesPath, sep=",")

    # The alternative-names file gets explicit column names, whatever its header says
    aliasTable = pd.read_csv(altCountriesPath, sep=",")
    aliasTable.columns = ["AlternativeName", "Name"]

    return countryTable, aliasTable


def getCountryCode(countryName, countries, altCountries):
    """Resolve a country name to its code.

    Lookup order:
      1. exact match on ``countries["Name"]``;
      2. literal substring match on ``countries["Name"]``;
      3. exact, then literal substring, match on
         ``altCountries["AlternativeName"]``, whose ``Name`` is then looked up
         exactly in ``countries``.

    Trying the exact match first prevents a name that is a substring of another
    one (e.g. "Niger" / "Nigeria") from resolving to the wrong row. Substring
    matches are literal (no regex), so names containing characters such as
    parentheses are handled, and missing names in the tables are ignored.

    Args:
        countryName (str): name of the country to resolve.
        countries (pd.DataFrame): table with ``Name`` and ``Code`` columns.
        altCountries (pd.DataFrame): table with ``AlternativeName`` and ``Name``
            columns.

    Returns:
        tuple: ``(countryCode, countryName)``. ``countryName`` is the input name
        when it matched ``countries`` directly, otherwise the name taken from
        ``altCountries``.

    Raises:
        IndexError: if the name cannot be resolved through either table.
    """
    names = countries["Name"]

    # 1. Exact match, 2. literal substring match on the primary table
    matchedCountries = countries[names == countryName]
    if matchedCountries.empty:
        matchedCountries = countries[
            names.str.contains(countryName, regex=False, na=False)]

    if matchedCountries.empty:
        # 3. Look if an alternative name was used
        alternativeNames = altCountries["AlternativeName"]
        alternativeMatchedCountries = altCountries[alternativeNames == countryName]
        if alternativeMatchedCountries.empty:
            alternativeMatchedCountries = altCountries[
                alternativeNames.str.contains(countryName, regex=False, na=False)]
        if alternativeMatchedCountries.empty:
            raise IndexError("Unknown country name: {!r}".format(countryName))

        countryName = alternativeMatchedCountries["Name"].iloc[0]
        matchedCountries = countries[names == countryName]
        if matchedCountries.empty:
            raise IndexError(
                "Alternative name resolves to {!r}, which has no country code".format(countryName))

    countryCode = matchedCountries["Code"].iloc[0]

    return countryCode, countryName


# Genres

In [None]:
# Start the Genres section from a fresh graph with the notebook's prefixes bound
g = createGraph()

In [None]:
# Load the genres CSV in memory, indexed by its "genre" column
genres = pd.read_csv(genresPath, sep=",", index_col="genre")

In [None]:
def createGenreID(genre):
    """Turn a genre name into an identifier usable as a URI local name.

    Every character that is not alphanumeric is replaced with "-";
    alphanumeric characters are kept unchanged.

    Args:
        genre (str): the genre name.

    Returns:
        str: the sanitized identifier, with the same length as ``genre``.
    """
    return "".join(char if char.isalnum() else "-" for char in genre)


In [None]:
# Emit one so:Genre node for every entry of the genres index
for genreName in genres.index:
    # The node URI is derived from the sanitized genre name
    genreNode = URIRef(SO[createGenreID(genreName)])

    # Type the node as a Genre
    g.add((genreNode, RDF.type, SO.Genre))

    # Keep the original (unsanitized) name as a literal
    g.add((genreNode, SO["name"], Literal(genreName, datatype=XSD.string)))


In [None]:
# Serialize the genres graph as Turtle and write it to its output file
print("[💾] SAVING")
with open(genresTTLPath, "w", encoding="utf-8") as turtleFile:
    turtleFile.write(g.serialize(format="turtle"))

# Artists

In [None]:
# Start the Artists section from a fresh graph with the notebook's prefixes bound
g = createGraph()

In [None]:
# Load the artists CSV in memory, indexed by its "id" column
artists = pd.read_csv(artistsPath, sep=",", index_col="id")

In [None]:
# Iterate over the artists DataFrame (indexed by artist id)
for artistID, row in artists.iterrows():
    # Create the node to add to the Graph
    Artist = URIRef(SO[artistID])

    # Type the node as an Artist
    g.add((Artist, RDF.type, SO.Artist))

    # Add the name of the artist
    g.add((Artist, SO["name"], Literal(row["name"], datatype=XSD.string)))

    # Add the popularity of the artist
    g.add((Artist, SO["popularity"], Literal(row["popularity"], datatype=XSD.int)))

    # Genres are stored as a comma-separated string; a missing value means no genres.
    # A dedicated name is used on purpose: assigning to `genres` would overwrite
    # the genres DataFrame and break a re-run of the Genres section.
    artistGenres = [] if pd.isnull(row["genres"]) else row["genres"].split(",")

    for genre in artistGenres:
        # The genre node URI is built exactly like in the Genres section
        Genre = URIRef(SO[createGenreID(genre)])

        # Add the edge connecting the Artist and the Genre
        g.add((Artist, SO["hasGenre"], Genre))


In [None]:
# Serialize the artists graph as Turtle and write it to its output file
print("[💾] SAVING")
with open(artistsTTLPath, "w", encoding="utf-8") as turtleFile:
    turtleFile.write(g.serialize(format="turtle"))

# Albums

In [None]:
# Start the Albums section from a fresh graph with the notebook's prefixes bound
g = createGraph()

In [None]:
# Load the albums CSV in memory, indexed by its "id" column
albums = pd.read_csv(albumsPath, sep=",", index_col="id")

In [None]:
# Iterate over the albums DataFrame (indexed by album id)
for albumID, row in albums.iterrows():
    # Create the node to add to the Graph
    Album = URIRef(SO[albumID])

    # Type the node as an Album
    g.add((Album, RDF.type, SO.Album))

    # Add the name of the album
    g.add((Album, SO["name"], Literal(row["title"], datatype=XSD.string)))

    # Add the total tracks of the album
    g.add((Album, SO["totalTracks"], Literal(row["total_tracks"], datatype=XSD.int)))

    # Pad partial release dates (year-only or year-month) to a full YYYY-MM-DD date,
    # since the value is stored as an xsd:date
    releaseDate = row["release_date"]
    releasePrecision = row["release_date_precision"]
    if releasePrecision == "year":
        releaseDate += "-01-01"
    elif releasePrecision == "month":
        releaseDate += "-01"

    # Add the release date of the album
    g.add((Album, SO["releaseDate"], Literal(releaseDate, datatype=XSD.date)))

    # Add album type
    albumType = URIRef(SO[row["album_type"]])
    g.add((Album, SO["isTypeOf"], albumType))

    # Comma-separated lists; a missing value means an empty list.
    # Dedicated names are used on purpose: assigning to `countries` / `artists`
    # would overwrite the DataFrames other sections rely on.
    albumCountries = [] if pd.isnull(row["available_countries"]) else row["available_countries"].split(",")
    albumArtists = [] if pd.isnull(row["artists"]) else row["artists"].split(",")

    for country in albumCountries:
        # Country nodes are addressed by lower-cased country code
        Country = URIRef(CNS[country.lower()])

        # Add the edge connecting the Album and the Country
        g.add((Album, SO["isAvailableIn"], Country))

    for artistID in albumArtists:
        # Create the RDF node
        Artist = URIRef(SO[artistID])

        # Add the edge connecting the Artist and the Album
        g.add((Artist, SO["partecipateIn"], Album))


In [None]:
# Serialize the albums graph as Turtle and write it to its output file
print("[💾] SAVING")
with open(albumsTTLPath, "w", encoding="utf-8") as turtleFile:
    turtleFile.write(g.serialize(format="turtle"))

# Tracks

In [None]:
# Start the Tracks section from a fresh graph with the notebook's prefixes bound
g = createGraph()

In [None]:
# Load the tracks CSV in memory, indexed by its "id" column
tracks = pd.read_csv(tracksPath, sep=",", index_col="id")

In [None]:
# Technical characteristics of a track: the CSV column name is also the local
# name of the predicate, and the value is the XSD datatype of the literal
trackFeatureTypes = {
    "duration": XSD.int,
    "popularity": XSD.int,
    "explicit": XSD.boolean,
    "key": XSD.int,
    "tempo": XSD.float,
    "mode": XSD.int,
    "time_signature": XSD.int,
    "acousticness": XSD.float,
    "danceability": XSD.float,
    "energy": XSD.float,
    "loudness": XSD.float,
    "liveness": XSD.float,
    "valence": XSD.float,
    "speechiness": XSD.float,
    "instrumentalness": XSD.float,
}

# Iterate over the tracks DataFrame (indexed by track id)
for trackID, row in tracks.iterrows():
    # Create the node to add to the Graph
    Track = URIRef(SO[trackID])

    # Type the node as a Track
    g.add((Track, RDF.type, SO.Track))

    # Add the name of the track
    g.add((Track, SO["name"], Literal(row["title"], datatype=XSD.string)))

    # Add all the technical characteristics
    for feature, datatype in trackFeatureTypes.items():
        g.add((Track, SO[feature], Literal(row[feature], datatype=datatype)))

    # Comma-separated lists; a missing value means an empty list.
    # Dedicated names are used on purpose: assigning to `countries` / `artists`
    # would overwrite the DataFrames other sections rely on.
    trackCountries = [] if pd.isnull(row["available_countries"]) else row["available_countries"].split(",")
    trackArtists = [] if pd.isnull(row["artists"]) else row["artists"].split(",")

    for country in trackCountries:
        # Country nodes are addressed by lower-cased country code
        Country = URIRef(CNS[country.lower()])

        # Add the edge connecting the Track and the Country
        g.add((Track, SO["isAvailableIn"], Country))

    for artistID in trackArtists:
        # Create the RDF node
        Artist = URIRef(SO[artistID])

        # Add the edge connecting the Artist and the Track
        g.add((Artist, SO["partecipateIn"], Track))

    # Retrieve the album the track belongs to
    albumID = row["album"]
    Album = URIRef(SO[albumID])

    # Add the edge connecting the Track and the Album
    g.add((Track, SO["isPartOf"], Album))


In [None]:
# Serialize the tracks graph as Turtle and write it to its output file
print("[💾] SAVING")
with open(tracksTTLPath, "w", encoding="utf-8") as turtleFile:
    turtleFile.write(g.serialize(format="turtle"))

# Charts

In [None]:
# Start the Charts section from a fresh graph with the notebook's prefixes bound
g = createGraph()

# Load the country lookup tables used to resolve country names to country codes
countries, altCountries = loadCountries()

In [None]:
# Read the raw chart rows from the Spotify charts CSV
charts = pd.read_csv(spotifyChartsPath, sep=",")

# One row per (country, date) pair, i.e. per chart, with its number of rows
chartsDF = (
    charts
    .groupby(["country", "date"])
    .size()
    .reset_index(name="total_tracks")
)

# Discard the "Global" entries from both frames
chartsDF = chartsDF[chartsDF["country"] != "Global"]
charts = charts[charts["country"] != "Global"]

# Number of remaining raw rows, used for the progress report of the Appearance section
totalRows = len(charts)


In [None]:
# Every chart handled here is a "TOP 100"; the value is part of the chart ID and name
topNumType = 100

# Country name -> country code. Each country appears once per date, so the
# lookup is done a single time per country instead of once per chart.
countryCodeCache = {}

# Iterate over the charts, one row per (country, date)
for _, row in chartsDF.iterrows():
    # Retrieve country and date
    countryName = row["country"]

    # Reformat date from DD/MM/YYYY to the ISO format required by xsd:date
    date = datetime.datetime.strptime(row["date"], "%d/%m/%Y").strftime("%Y-%m-%d")

    # Get the country code
    if countryName not in countryCodeCache:
        countryCodeCache[countryName] = getCountryCode(countryName, countries, altCountries)[0]
    countryCode = countryCodeCache[countryName]

    # Create a unique ID
    chartID = "top-{}-{}-{}".format(topNumType, countryCode, date)

    # Create the node to add to the Graph
    Chart = URIRef(SO[chartID])

    # Type the node as a Chart
    g.add((Chart, RDF.type, SO.Chart))

    # Add the name of the Chart
    chartName = "TOP {} {}".format(topNumType, countryName)
    g.add((Chart, SO["name"], Literal(chartName, datatype=XSD.string)))

    # Add the date of the chart
    g.add((Chart, SO["date"], Literal(date, datatype=XSD.date)))

    # Add the number of tracks
    g.add((Chart, SO["totalTracks"], Literal(row['total_tracks'], datatype=XSD.int)))

    # Add the edge connecting the Chart and the Country
    Country = URIRef(CNS[countryCode.lower()])
    g.add((Chart, SO["isReferredTo"], Country))

    # Add chart type
    chartType = URIRef(SO["top"])
    g.add((Chart, SO["isTypeOf"], chartType))


In [None]:
# Serialize the charts graph as Turtle and write it to its output file
print("[💾] SAVING")
with open(chartsTTLPath, "w", encoding="utf-8") as turtleFile:
    turtleFile.write(g.serialize(format="turtle"))

# Appearance

In [None]:
# Start the Appearance section from a fresh graph with the notebook's prefixes bound
g = createGraph()

# Load the country lookup tables used to resolve country names to country codes
countries, altCountries = loadCountries()

In [None]:
# Iterate over the raw chart rows: each row is one appearance of a track in a chart
realIndex = 0

# (country name, raw date) -> Chart node. Many rows share the same chart, so the
# country lookup and the date parsing are done once per chart instead of once per row.
chartNodeCache = {}

for index, row in charts.iterrows():

    # Create a unique ID from the row label
    appearanceID = "appearance-{}".format(index)

    # Create the node to add to the Graph
    Appearance = URIRef(SO[appearanceID])

    # Type the node as an Appearance
    g.add((Appearance, RDF.type, SO.Appearance))

    # Add the position of the track. The position belongs to the Appearance node:
    # attaching it to `Chart` here would use the chart of the previous row,
    # because this row's chart is only resolved further down.
    g.add((Appearance, SO["position"], Literal(row['position'], datatype=XSD.int)))

    # Get the track id from the uri
    trackID = row['uri'].removeprefix("https://open.spotify.com/track/")

    # Add the edge connecting the Track to the Appearance
    Track = URIRef(SO[trackID])
    g.add((Track, SO["appearsIn"], Appearance))

    # Resolve the chart this row belongs to (same ID scheme as the Charts section)
    chartKey = (row["country"], row["date"])
    if chartKey not in chartNodeCache:
        countryName, rawDate = chartKey

        # Reformat date from DD/MM/YYYY to ISO format
        date = datetime.datetime.strptime(rawDate, "%d/%m/%Y").strftime("%Y-%m-%d")

        # Get the country code
        countryCode, _ = getCountryCode(countryName, countries, altCountries)

        chartID = "top-100-{}-{}".format(countryCode, date)
        chartNodeCache[chartKey] = URIRef(SO[chartID])
    Chart = chartNodeCache[chartKey]

    # Add the edge connecting the Appearance to the Chart
    g.add((Appearance, SO["isPositionedIn"], Chart))

    realIndex += 1

    # Periodic progress report
    if realIndex % 10000 == 0:
        print("💾 [STATUS INFO] {row}/{totalRows} ({percentage:.2f}%)\n".format(
            row=realIndex, totalRows=totalRows, percentage=((realIndex * 100) / totalRows)))

# Final progress report; `totalRows or 1` avoids a division by zero when there are no rows
print("💾 [STATUS INFO] {row}/{totalRows} ({percentage:.2f}%)\n".format(
    row=realIndex, totalRows=totalRows, percentage=((realIndex * 100) / (totalRows or 1))))


In [None]:
# Serialize the appearance graph as Turtle and write it to its output file
print("[💾] SAVING")
with open(appearanceTTLPath, "w", encoding="utf-8") as turtleFile:
    turtleFile.write(g.serialize(format="turtle"))

# People

In [None]:
# Start the People section from a fresh graph with the notebook's prefixes bound
g = createGraph()

In [None]:
# Load the people CSV in memory, indexed by its "id" column
# (the id is not unique per row: the rows are grouped by it further down)
people = pd.read_csv(peoplePath, sep=",", index_col="id")

In [None]:
def autoCompleteDate(dateStr):
    """Pad a partial date string to a full ``YYYY-MM-DD`` date.

    A 4-character value (year) gets "-01-01" appended, a 7-character value
    (year-month) gets "-01" appended; any other length is returned unchanged.

    Args:
        dateStr (str | None): the date string to complete.

    Returns:
        str | None: the completed date string, or ``None`` if ``None`` was given.
    """
    if dateStr is None:
        return dateStr

    # Suffix to append, chosen by the length of the partial date
    suffix = {4: "-01-01", 7: "-01"}.get(len(dateStr))

    return dateStr if suffix is None else dateStr + suffix

In [None]:
# Group the rows by person id: a person has one row per artist they belong to
groupedPeople = people.groupby("id")

# Iterate over each group; the running number is used to build the node ID
for personNumber, (_, peopleGroup) in enumerate(groupedPeople):

    # Personal data is read from the first row of the group
    firstRow = peopleGroup.iloc[0]

    # Missing name / surname are stored as empty strings
    name = "" if pd.isnull(firstRow["name"]) else firstRow["name"]
    surname = "" if pd.isnull(firstRow["surname"]) else firstRow["surname"]

    # Missing dates / nationality stay None so that no triple is emitted for them
    birthDate = None if pd.isnull(firstRow["birthdate"]) else autoCompleteDate(firstRow["birthdate"])
    deathDate = None if pd.isnull(firstRow["deathdate"]) else autoCompleteDate(firstRow["deathdate"])
    nationality = None if pd.isnull(firstRow["nationality"]) else firstRow["nationality"]

    # Get the list of artists in which the person appears.
    # A dedicated name is used on purpose: assigning to `artists` would
    # overwrite the artists DataFrame.
    memberOfArtists = peopleGroup["artist"].tolist()

    # Create a unique ID
    peopleID = "people-{}".format(personNumber)

    # Create the node to add to the Graph
    People = URIRef(SO[peopleID])

    # Type the node as People
    g.add((People, RDF.type, SO.People))

    # Add name and surname
    g.add((People, SO["name"], Literal(name, datatype=XSD.string)))
    g.add((People, SO["surname"], Literal(surname, datatype=XSD.string)))

    # Add the dates only when known: an empty string is not a valid xsd:date
    if birthDate:
        g.add((People, SO["birthdate"], Literal(birthDate, datatype=XSD.date)))
    if deathDate:
        g.add((People, SO["deathdate"], Literal(deathDate, datatype=XSD.date)))

    # Add nationality only when known: otherwise the edge would point to the
    # bare countries namespace instead of a country
    if nationality:
        Country = URIRef(CNS[nationality.lower()])
        # Add the edge connecting People and the Country
        g.add((People, SO["hasNationality"], Country))

    # Manage the edges connecting people and artists
    for artistID in memberOfArtists:
        # Create the RDF node
        Artist = URIRef(SO[artistID])

        # Add the edge connecting People and the Artist
        g.add((People, SO["isMemberOf"], Artist))


In [None]:
# Serialize the people graph as Turtle and write it to its output file
print("[💾] SAVING")
with open(peopleTTLPath, "w", encoding="utf-8") as turtleFile:
    turtleFile.write(g.serialize(format="turtle"))