In [1]:
# Imports

import os
from pathlib import Path
import pandas as pd
import datetime

# Load the required libraries
from rdflib import Graph, Literal, RDF, URIRef, Namespace

# rdflib knows about some namespaces, like FOAF
from rdflib.namespace import FOAF, XSD

In [2]:
# Resolve the working directory and the two folders the notebook relies on
absPath = str(Path.cwd().absolute())
datasetsPath, rdfPath = (
    os.path.join(absPath, folderName) for folderName in ("datasets", "rdf")
)

# Make sure the dataset and RDF folders exist before anything is read or written
for folder in (datasetsPath, rdfPath):
    if not os.path.exists(folder):
        os.mkdir(folder)

# Input CSV files (all located in the datasets folder)
(
    spotifyChartsPath,
    genresPath,
    marketsPath,
    tracksPath,
    albumsPath,
    artistsPath,
    peoplePath,
) = (
    os.path.join(datasetsPath, fileName)
    for fileName in (
        "reducedSpotifyCharts.csv",
        "genres.csv",
        "markets.csv",
        "tracks.csv",
        "albums.csv",
        "artists.csv",
        "people.csv",
    )
)

# Country lookup tables (main list and alternative names)
countriesPath, altCountriesPath = (
    os.path.join(datasetsPath, fileName)
    for fileName in ("countries2.csv", "altCountries.csv")
)

# Output Turtle files (all located in the RDF folder)
(
    genresTTLPath,
    marketsTTLPath,
    tracksTTLPath,
    albumsTTLPath,
    artistsTTLPath,
    peopleTTLPath,
    chartsTTLPath,
    appearanceTTLPath,
) = (
    os.path.join(rdfPath, fileName)
    for fileName in (
        "genres.ttl",
        "markets.ttl",
        "tracks.ttl",
        "albums.ttl",
        "artists.ttl",
        "people.ttl",
        "charts.ttl",
        "appearance.ttl",
    )
)


In [3]:
# Namespaces for the ontologies that rdflib does not ship with

# Spotify Ontology (classes and properties used for every node of this notebook)
SO = Namespace("https://www.dei.unipd.it/~martinelli/spotify/spotifyOntology#")

# Country Ontology (country nodes are addressed by their lowercase code)
CNS = Namespace("http://eulersharp.sourceforge.net/2003/03swap/countries#")

# Utilities

In [4]:
def createGraph():
    """Return an empty rdflib Graph with the notebook's prefixes already bound.

    The prefixes (foaf, xsd, countries, so) only affect how the graph is
    serialized: they make the resulting Turtle output more readable.
    """
    graph = Graph()

    prefixes = (
        ("foaf", FOAF),
        ("xsd", XSD),
        ("countries", CNS),
        ("so", SO),
    )
    for prefix, namespace in prefixes:
        graph.bind(prefix, namespace)

    return graph

In [5]:
def loadCountries():
    """Load the country lookup tables from disk.

    Returns:
        A tuple ``(countries, altCountries)`` of DataFrames. ``countries`` is
        read from ``countriesPath`` as-is; ``altCountries`` is read from
        ``altCountriesPath`` and its columns are renamed to
        ``AlternativeName`` and ``Name``.

    Raises:
        ValueError: if the alternative-names file does not have exactly two
            columns (raised by the column rename).
    """
    # pandas' default NA markers include the string "NA", which is also the
    # ISO 3166 alpha-2 code of Namibia. Only treat empty cells as missing so
    # that such a code survives as a string.
    # NOTE(review): assumes the "Code" column holds ISO alpha-2 codes - confirm.
    readOptions = {"sep": ",", "keep_default_na": False, "na_values": [""]}

    countries = pd.read_csv(countriesPath, **readOptions)
    altCountries = pd.read_csv(altCountriesPath, **readOptions)
    altCountries.columns = ['AlternativeName', 'Name']

    return countries, altCountries


def getCountryCode(countryName, countries, altCountries):
    """Resolve a country name to its code.

    The name is looked up in ``countries['Name']`` first; if nothing matches,
    it is looked up in ``altCountries['AlternativeName']`` and the canonical
    name found there is resolved against ``countries``.

    Each lookup prefers an exact match and only falls back to a literal
    substring match when no exact match exists. This prevents a name such as
    "Niger" from resolving to "Nigeria", and names containing regex
    metacharacters (e.g. parentheses) are matched literally.

    Args:
        countryName: name of the country to resolve.
        countries: DataFrame with at least the columns ``Name`` and ``Code``.
        altCountries: DataFrame with the columns ``AlternativeName`` and ``Name``.

    Returns:
        A tuple ``(countryCode, countryName)``. ``countryName`` is the name
        passed in when it matched ``countries`` directly, otherwise the
        canonical name found through ``altCountries``.

    Raises:
        IndexError: if the name cannot be resolved at all.
    """
    def _lookup(frame, column):
        # Exact match first; literal (non-regex) substring match as fallback.
        names = frame[column]
        exact = frame[names == countryName]
        if not exact.empty:
            return exact
        # na=False keeps missing cells out of the boolean mask
        return frame[names.str.contains(countryName, regex=False, na=False)]

    matchedCountries = _lookup(countries, 'Name')

    if matchedCountries.empty:
        # Look if an alternative name was used
        alternativeMatchedCountries = _lookup(altCountries, 'AlternativeName')
        if alternativeMatchedCountries.empty:
            raise IndexError(
                "No country or alternative name matches {!r}".format(countryName))

        countryName = alternativeMatchedCountries['Name'].iloc[0]
        matchedCountries = countries[countries['Name'] == countryName]
        if matchedCountries.empty:
            raise IndexError(
                "Alternative name resolves to unknown country {!r}".format(countryName))

    countryCode = matchedCountries['Code'].iloc[0]

    return countryCode, countryName


# Genres

In [6]:
# Create a fresh graph (with the namespace prefixes bound) for the genre triples
g = createGraph()

In [7]:
# Load the genres CSV into memory, indexed by the "genre" column
genres = pd.read_csv(genresPath, sep=",", index_col="genre")

In [8]:
def createGenreID(genre):
    """Build the identifier of a genre from its name.

    Every character that is not alphanumeric (spaces, punctuation, ...) is
    replaced with "-"; alphanumeric characters are kept unchanged.
    """
    return "".join(char if char.isalnum() else "-" for char in genre)


In [9]:
# Iterate over the genres DataFrame (only its index, the genre name, is needed)
for genre in genres.index:
    # The node is identified by the sanitized genre name
    Genre = SO[createGenreID(genre)]

    # Declare the node as a Genre and attach its original name
    g.add((Genre, RDF.type, SO.Genre))
    g.add((Genre, SO["name"], Literal(genre, datatype=XSD.string)))


In [10]:
# Write the genre graph to disk, serialized as Turtle
print("[💾] SAVING")
with open(genresTTLPath, mode="w", encoding="utf-8") as turtleFile:
    turtleFile.write(g.serialize(format="turtle"))

[💾] SAVING


# Artists

In [11]:
# Create a fresh graph (with the namespace prefixes bound) for the artist triples
g = createGraph()

In [12]:
# Load the artists CSV into memory, indexed by the "id" column
artists = pd.read_csv(artistsPath, sep=",", index_col="id")

In [13]:
# Iterate over the artists DataFrame
for artistID, row in artists.iterrows():
    # Create the node to add to the Graph
    Artist = SO[artistID]

    # Declare the node as an Artist
    g.add((Artist, RDF.type, SO.Artist))

    # Add the name of the artist
    g.add((Artist, SO["name"], Literal(row["name"], datatype=XSD.string)))

    # Add the popularity of the artist
    g.add((Artist, SO["popularity"], Literal(row["popularity"], datatype=XSD.int)))

    # Genres are stored as a comma-separated string (possibly missing).
    # A dedicated name is used so that the module-level `genres` DataFrame is
    # not overwritten, which would break a re-run of the genre cells.
    # NOTE(review): names are not stripped; if the CSV separates genres with
    # ", " the IDs will start with "-" - confirm against the data.
    artistGenres = [] if pd.isnull(row["genres"]) else row["genres"].split(",")

    for genre in artistGenres:
        # Add the edge connecting the Artist and the Genre
        g.add((Artist, SO["hasGenre"], SO[createGenreID(genre)]))


In [14]:
# Write the artist graph to disk, serialized as Turtle
print("[💾] SAVING")
with open(artistsTTLPath, mode="w", encoding="utf-8") as turtleFile:
    turtleFile.write(g.serialize(format="turtle"))

[💾] SAVING


# Albums

In [15]:
# Create a fresh graph (with the namespace prefixes bound) for the album triples
g = createGraph()

In [16]:
# Load the albums CSV into memory, indexed by the "id" column
albums = pd.read_csv(albumsPath, sep=",", index_col="id")

In [17]:
# Iterate over the albums DataFrame
for albumID, row in albums.iterrows():
    # Create the node to add to the Graph
    Album = SO[albumID]

    # Declare the node as an Album
    g.add((Album, RDF.type, SO.Album))

    # Add the name of the album
    g.add((Album, SO["name"], Literal(row["title"], datatype=XSD.string)))

    # Add the total tracks of the album
    g.add((Album, SO["totalTracks"], Literal(row["total_tracks"], datatype=XSD.int)))

    # Pad partial release dates (year or year-month precision) so that the
    # literal always has the full YYYY-MM-DD form
    releaseDate = row["release_date"]
    releasePrecision = row["release_date_precision"]
    if releasePrecision == "year":
        releaseDate += "-01-01"
    elif releasePrecision == "month":
        releaseDate += "-01"

    # Add the release date of the album
    g.add((Album, SO["releaseDate"], Literal(releaseDate, datatype=XSD.date)))

    # Add album type
    g.add((Album, SO["isTypeOf"], SO[row["album_type"]]))

    # Country codes are stored as a comma-separated string (possibly missing).
    # Dedicated names are used below so that the module-level `countries` and
    # `artists` DataFrames are not overwritten by plain lists.
    availableCountries = (
        [] if pd.isnull(row["available_countries"])
        else row["available_countries"].split(",")
    )

    for countryCode in availableCountries:
        # Add the edge connecting the Album and the Country
        g.add((Album, SO["isAvailableIn"], CNS[countryCode.lower()]))

    # Artist IDs are stored as a comma-separated string; a missing value is
    # treated as "no artists", consistently with the countries above
    albumArtistIDs = [] if pd.isnull(row["artists"]) else row["artists"].split(",")

    for artistID in albumArtistIDs:
        # Add the edge connecting the Artist and the Album
        g.add((SO[artistID], SO["partecipateIn"], Album))


In [18]:
# Write the album graph to disk, serialized as Turtle
print("[💾] SAVING")
with open(albumsTTLPath, mode="w", encoding="utf-8") as turtleFile:
    turtleFile.write(g.serialize(format="turtle"))

[💾] SAVING


# Tracks

In [19]:
# Create a fresh graph (with the namespace prefixes bound) for the track triples
g = createGraph()

In [20]:
# Load the tracks CSV into memory, indexed by the "id" column
tracks = pd.read_csv(tracksPath, sep=",", index_col="id")

In [21]:
# Technical characteristics of a track: each entry is both the CSV column
# and the name of the ontology property, paired with the literal datatype
trackFeatures = (
    ("duration", XSD.int),
    ("popularity", XSD.int),
    ("explicit", XSD.boolean),
    ("key", XSD.int),
    ("tempo", XSD.float),
    ("mode", XSD.int),
    ("time_signature", XSD.int),
    ("acousticness", XSD.float),
    ("danceability", XSD.float),
    ("energy", XSD.float),
    ("loudness", XSD.float),
    ("liveness", XSD.float),
    ("valence", XSD.float),
    ("speechiness", XSD.float),
    ("instrumentalness", XSD.float),
)

# Iterate over the tracks DataFrame
for trackID, row in tracks.iterrows():
    # Create the node to add to the Graph
    Track = SO[trackID]

    # Declare the node as a Track
    g.add((Track, RDF.type, SO.Track))

    # Add the name of the track
    g.add((Track, SO["name"], Literal(row["title"], datatype=XSD.string)))

    # Add all the technical characteristics
    for feature, datatype in trackFeatures:
        g.add((Track, SO[feature], Literal(row[feature], datatype=datatype)))

    # Country codes are stored as a comma-separated string (possibly missing).
    # Dedicated names are used below so that the module-level `countries` and
    # `artists` DataFrames are not overwritten by plain lists.
    availableCountries = (
        [] if pd.isnull(row["available_countries"])
        else row["available_countries"].split(",")
    )

    for countryCode in availableCountries:
        # Add the edge connecting the Track and the Country
        g.add((Track, SO["isAvailableIn"], CNS[countryCode.lower()]))

    # Artist IDs are stored as a comma-separated string; a missing value is
    # treated as "no artists", consistently with the countries above
    trackArtistIDs = [] if pd.isnull(row["artists"]) else row["artists"].split(",")

    for artistID in trackArtistIDs:
        # Add the edge connecting the Artist and the Track
        g.add((SO[artistID], SO["partecipateIn"], Track))

    # Add the edge connecting the Track and the Album it belongs to
    albumID = row["album"]
    Album = SO[albumID]
    g.add((Track, SO["isPartOf"], Album))


In [22]:
# Write the track graph to disk, serialized as Turtle
print("[💾] SAVING")
with open(tracksTTLPath, mode="w", encoding="utf-8") as turtleFile:
    turtleFile.write(g.serialize(format="turtle"))

[💾] SAVING


# Charts

In [23]:
# Create a fresh graph (with the namespace prefixes bound) for the chart triples
g = createGraph()

# Load the country and alternative-name lookup tables used by getCountryCode
countries, altCountries = loadCountries()

In [24]:
# Read the Spotify charts CSV (one row per track entry of a chart)
charts = pd.read_csv(spotifyChartsPath, sep=",")

# A chart is identified by its (country, date) pair: count the entries of each
chartsDF = (
    charts
    .groupby(['country', 'date'])
    .size()
    .reset_index(name='total_tracks')
)

# Discard the "Global" charts from both frames (original index labels are kept)
chartsDF = chartsDF[chartsDF['country'] != 'Global']
charts = charts[charts['country'] != 'Global']

# Number of chart entries left, used for progress reporting
totalRows = len(charts)

In [25]:
# Every chart handled here is a "TOP 100" chart
topNumType = 100

# Country name -> country code, so each country is resolved only once
countryCodes = {}

# Iterate over the aggregated charts (one row per country/date pair)
for index, row in chartsDF.iterrows():

    # Retrieve country and date
    countryName = row["country"]
    chartDate = datetime.datetime.strptime(row["date"], '%d/%m/%Y')

    # Get the country code
    countryCode = countryCodes.get(countryName)
    if countryCode is None:
        countryCode, _ = getCountryCode(countryName, countries, altCountries)
        countryCodes[countryName] = countryCode

    # Create a unique ID. The dd-mm-yy date format is part of the identifier
    # and must match the one used when linking appearances to charts.
    chartID = "top-{}-{}-{}".format(
        topNumType, countryCode, chartDate.strftime('%d-%m-%y'))

    # Create the node to add to the Graph
    Chart = SO[chartID]

    # Declare the node as a Chart
    g.add((Chart, RDF.type, SO.Chart))

    # Add the name of the Chart
    chartName = "TOP {} {}".format(topNumType, countryName)
    g.add((Chart, SO["name"], Literal(chartName, datatype=XSD.string)))

    # Add the date of the chart. xsd:date requires the YYYY-MM-DD lexical
    # form, so the literal cannot reuse the dd-mm-yy string of the identifier.
    g.add((Chart, SO["date"],
           Literal(chartDate.strftime('%Y-%m-%d'), datatype=XSD.date)))

    # Add the number of tracks
    g.add((Chart, SO["totalTracks"], Literal(row['total_tracks'], datatype=XSD.int)))

    # Add the edge connecting the Chart and the Country it refers to
    Country = CNS[countryCode.lower()]
    g.add((Chart, SO["isReferredTo"], Country))

    # Add chart type
    g.add((Chart, SO["isTypeOf"], SO["top"]))


In [26]:
# Write the chart graph to disk, serialized as Turtle
print("[💾] SAVING")
with open(chartsTTLPath, mode="w", encoding="utf-8") as turtleFile:
    turtleFile.write(g.serialize(format="turtle"))

[💾] SAVING


# Appearance

In [27]:
# Create a fresh graph (with the namespace prefixes bound) for the appearance triples
g = createGraph()

# Load the country and alternative-name lookup tables used by getCountryCode
countries, altCountries = loadCountries()

In [28]:
# Country name -> country code, so each country is resolved only once
# instead of once per chart entry
countryCodes = {}

# Iterate over every chart entry (one row per appearance of a track in a chart)
for processedRows, (index, row) in enumerate(charts.iterrows()):

    # Create a unique ID from the row label
    appearanceID = "appearance-{}".format(index)

    # Create the node to add to the Graph
    Appearance = SO[appearanceID]

    # Declare the node as an Appearance
    g.add((Appearance, RDF.type, SO.Appearance))

    # Add the position of the track. It is a property of this appearance,
    # not of the chart node left over from the previous iteration.
    g.add((Appearance, SO["position"], Literal(row['position'], datatype=XSD.int)))

    # Get the track id from the uri
    trackID = row['uri'].removeprefix("https://open.spotify.com/track/")

    # Add the edge connecting the Track to the Appearance
    Track = SO[trackID]
    g.add((Track, SO["appearsIn"], Appearance))

    # Retrieve country and date
    countryName = row["country"]
    date = datetime.datetime.strptime(row["date"], '%d/%m/%Y').strftime('%d-%m-%y')

    # Get the country code
    countryCode = countryCodes.get(countryName)
    if countryCode is None:
        countryCode, _ = getCountryCode(countryName, countries, altCountries)
        countryCodes[countryName] = countryCode

    # Rebuild the ID of the chart this entry belongs to (same format used
    # when the chart nodes are created)
    chartID = "top-100-{}-{}".format(countryCode, date)
    Chart = SO[chartID]

    # Add the edge connecting the Appearance to its Chart
    g.add((Appearance, SO["isPositionedIn"], Chart))

    # Progress is based on the number of rows processed: the index labels
    # have gaps after the "Global" rows were dropped, so they cannot be
    # compared with totalRows.
    if processedRows % 10000 == 0:
        print("💾 [STATUS INFO] {row}/{totalRows} ({percentage:.2f}%)\n".format(
            row=processedRows, totalRows=totalRows,
            percentage=((processedRows * 100) / totalRows)))


💾 [STATUS INFO] 10000/682084 (1.47%)

💾 [STATUS INFO] 20000/682084 (2.93%)

💾 [STATUS INFO] 30000/682084 (4.40%)

💾 [STATUS INFO] 40000/682084 (5.86%)

💾 [STATUS INFO] 50000/682084 (7.33%)

💾 [STATUS INFO] 60000/682084 (8.80%)

💾 [STATUS INFO] 70000/682084 (10.26%)

💾 [STATUS INFO] 80000/682084 (11.73%)

💾 [STATUS INFO] 90000/682084 (13.19%)

💾 [STATUS INFO] 100000/682084 (14.66%)

💾 [STATUS INFO] 110000/682084 (16.13%)

💾 [STATUS INFO] 120000/682084 (17.59%)

💾 [STATUS INFO] 130000/682084 (19.06%)

💾 [STATUS INFO] 140000/682084 (20.53%)

💾 [STATUS INFO] 150000/682084 (21.99%)

💾 [STATUS INFO] 160000/682084 (23.46%)

💾 [STATUS INFO] 170000/682084 (24.92%)

💾 [STATUS INFO] 180000/682084 (26.39%)

💾 [STATUS INFO] 190000/682084 (27.86%)

💾 [STATUS INFO] 200000/682084 (29.32%)

💾 [STATUS INFO] 210000/682084 (30.79%)

💾 [STATUS INFO] 220000/682084 (32.25%)

💾 [STATUS INFO] 230000/682084 (33.72%)

💾 [STATUS INFO] 240000/682084 (35.19%)

💾 [STATUS INFO] 250000/682084 (36.65%)

💾 [STATUS INFO]

In [29]:
# Write the appearance graph to disk, serialized as Turtle
print("[💾] SAVING")
with open(appearanceTTLPath, mode="w", encoding="utf-8") as turtleFile:
    turtleFile.write(g.serialize(format="turtle"))

[💾] SAVING


## People

In [30]:
# Create a fresh graph (with the namespace prefixes bound) for the people triples
g = createGraph()


In [31]:
# Load the people CSV into memory, indexed by the "id" column
people = pd.read_csv(peoplePath, sep=",", index_col="id")

In [34]:
# Each (name, surname) pair becomes a single person node; the rows of a group
# differ by the artist the person is a member of.
# NOTE(review): with pandas' default groupby behaviour, rows whose name or
# surname is missing are presumably dropped, and distinct people sharing both
# name and surname are merged into one node - confirm this is intended.
groupedPeople = people.groupby(["name", "surname"])


def _padPartialDate(value):
    """Pad a YYYY or YYYY-MM string to the full YYYY-MM-DD form."""
    if len(value) == 4:
        return value + "-01-01"
    if len(value) == 7:
        return value + "-01"
    return value


# Iterate over each group; the running counter makes the node ID unique
for counter, (_, peopleGroup) in enumerate(groupedPeople):
    # Personal data is taken from the first row of the group
    personRow = peopleGroup.iloc[0]

    # Create the node to add to the Graph
    People = SO["People-{}".format(counter)]

    # Declare the node as a People
    g.add((People, RDF.type, SO.People))

    # Add name and surname
    for field in ("name", "surname"):
        if not pd.isnull(personRow[field]):
            g.add((People, SO[field], Literal(personRow[field], datatype=XSD.string)))

    # Add birth and death dates only when known: a missing value must not
    # end up in the graph as the literal "None"
    for field in ("birthdate", "deathdate"):
        if not pd.isnull(personRow[field]):
            g.add((People, SO[field],
                   Literal(_padPartialDate(personRow[field]), datatype=XSD.date)))

    # Add the nationality only when known: otherwise the person would be
    # linked to the country node left over from a previous iteration
    if not pd.isnull(personRow["nationality"]):
        Country = CNS[personRow["nationality"].lower()]
        g.add((People, SO["hasNationality"], Country))

    # Add the edges connecting the person to every artist of the group
    for artistID in peopleGroup["artist"].dropna():
        g.add((People, SO["isMemberOf"], SO[artistID]))


In [35]:
# Write the people graph to disk, serialized as Turtle
print("[💾] SAVING")
with open(peopleTTLPath, mode="w", encoding="utf-8") as turtleFile:
    turtleFile.write(g.serialize(format="turtle"))

[💾] SAVING
