# Spotify Graph Queries

We perform eleven queries on our Spotify database relying on ***SPARQLWrapper***.

## Setup

We import all the necessary libraries and set the endpoint URL needed by SPARQLWrapper.

In [None]:
# Imports
import os
import pandas as pd
from pandas import json_normalize
import numpy as np
import matplotlib.pyplot as plt

from SPARQLWrapper import SPARQLWrapper, JSON
from IPython.display import display

In [None]:
# Namespace of the Spotify ontology (executeQuery strips it from result values)
SPOTIFY_ONTOLOGY = "https://www.dei.unipd.it/~martinelli/spotify/spotifyOntology#"

# Name of the repository to query on the local endpoint
REPOSITORY_NAME = "eu"
SPARQL_ENDPOINT = "http://localhost:7200/repositories/{}".format(REPOSITORY_NAME)

# Client shared by every query of the notebook
sparql = SPARQLWrapper(SPARQL_ENDPOINT)


## Query utilities
We define a function to perform a SPARQL query

In [None]:
# Execute a SPARQL query and return a DataFrame
def executeQuery(query, maxRows=None):
    """Run a SPARQL query on the shared `sparql` client.

    The `so`, `countries` and `xsd` prefix declarations are prepended to
    `query` before it is sent.

    Parameters
    ----------
    query : str
        A SELECT or ASK query, without prefix declarations.
    maxRows : int or None
        Value stored in the pandas "display.max_rows" option
        ("display.max_columns" is always set to None).

    Returns
    -------
    tuple
        (DataFrame, "select") for a SELECT query, with one column per
        projected variable and the known namespaces stripped from the values;
        (bool, "ask") for an ASK query;
        (None, None) when the request fails or the response is not recognised.
    """
    countriesNamespace = "http://eulersharp.sourceforge.net/2003/03swap/countries#"
    valueSuffix = ".value"

    def stripKnownPrefixes(value):
        # Unbound variables show up as NaN: keep them as missing values
        # instead of turning them into the string "nan"
        if not isinstance(value, str):
            return value
        return value.replace(SPOTIFY_ONTOLOGY, "").replace(countriesNamespace, "")

    # Set the number of rows and columns to display
    pd.set_option("display.max_rows", maxRows, "display.max_columns", None)

    # Set the query
    sparql.setQuery("""
            prefix so: <https://www.dei.unipd.it/~martinelli/spotify/spotifyOntology#>
            prefix countries: <http://eulersharp.sourceforge.net/2003/03swap/countries#>
            prefix xsd: <http://www.w3.org/2001/XMLSchema#>

            {query}
        """.format(query=query))

    # Set the result format
    sparql.setReturnFormat(JSON)

    # Execute the query; report the failure instead of hiding it, and let
    # KeyboardInterrupt / SystemExit propagate
    try:
        results = sparql.query().convert()
    except Exception as error:
        print("Query failed ({}): {}".format(type(error).__name__, error))
        return None, None

    if not isinstance(results, dict):
        return None, None

    # If it is an ask
    if "boolean" in results:
        return results["boolean"], "ask"

    # If it is a select
    if "results" in results:
        # Create a DataFrame from the returned JSON
        bindingsDF = json_normalize(results["results"]["bindings"])

        # Keep only the "<variable>.value" columns: this drops the ".type",
        # ".datatype" and ".xml:lang" metadata columns
        valueColumns = [col for col in bindingsDF.columns
                        if col.endswith(valueSuffix)]

        # Strip the ".value" suffix from the column names
        resultDF = bindingsDF[valueColumns].rename(
            columns=lambda col: col[:-len(valueSuffix)])

        # Expose every projected variable, in SELECT order, even when the
        # result is empty or a variable is never bound
        variables = results.get("head", {}).get("vars")
        if variables:
            resultDF = resultDF.reindex(columns=variables)

        # Remove the known namespaces from uris (to get only the ID)
        resultDF = resultDF.apply(lambda column: column.map(stripKnownPrefixes))

        return resultDF, "select"

    return None, None


We define a function to print the result of a query

In [None]:
def printResult(result, resultType):
    """Print the outcome of a query in a readable form.

    Parameters
    ----------
    result : DataFrame, bool or None
        Value returned by the query (None means there is nothing to show).
    resultType : str or None
        "select" to display `result` as a table, "ask" to print Yes/No.
    """
    # Nothing to show: stop here so that a missing result is never rendered
    # as a table or reported as a "No" answer
    if result is None:
        print("🛑 [[NO RESULTS]]\n")
        return

    if resultType == "select":
        print("🗃️ [[RESULT]]\n")
        display(result)
    elif resultType == "ask":
        print("🎱 [[ANSWER]]\n")
        print("✔️ Yes" if result else "❌ No")


## Plot utilities

In [None]:
# Title, axis labels and tick labels: font sizes and paddings
TITLE_DIM = 30
TITLE_PAD = 30
XY_LABEL_DIM = 21
XY_LABEL_PAD = 23
XY_TICKS_DIM = 18

# Appearance of grid, filled areas, bars and legend
GRID_ALPHA = 0.2
FILL_ALPHA = 0.05
BAR_WIDTH = 0.95
LEGEND_LOC = "upper left"

# Figure creation
FIG_SIZE = (18, 8)
FIG_BG_COLOR = "w"

# Figure export
FIG_FORMATS = ["png", "pdf"]
FIG_DPI = 500
FIG_BBOX = "tight"
FIG_PAD = 0.3


In [None]:
def plotResults(title=None, x=None, yArr=None,
                yLabel=None, xLabel=None, xTicks=None, ticksRotation=0, legend=None,
                bar=False, showGrid=False, gridAxis="both", fillPlot=False,
                figSize=FIG_SIZE, saveTitle=None):
    """Draw one or more series on a single figure, optionally saving it.

    Parameters
    ----------
    title : str or None
        Figure title.
    x : sequence or None
        X positions shared by every series (None means no positions).
    yArr : sequence of sequences or None
        One sequence of values per series; values are converted with float().
    yLabel, xLabel : str or None
        Axis labels.
    xTicks : sequence or None
        Labels placed at the `x` positions.
    ticksRotation : number
        Rotation applied to the x tick labels.
    legend : sequence of str or None
        Labels of the series, in the same order as `yArr`; series without a
        matching entry get no label.
    bar : bool
        Draw grouped bars instead of lines.
    showGrid : bool
        Show the grid on the axis selected by `gridAxis`.
    gridAxis : str
        Axis argument passed to plt.grid.
    fillPlot : bool
        Fill the area under each line (ignored for bars).
    figSize : tuple
        Figure size passed to plt.figure.
    saveTitle : str or None
        When given, the figure is saved as plots/<format>/<saveTitle>.<format>
        for each format in FIG_FORMATS.
    """
    # None replaces the former mutable list defaults
    x = [] if x is None else x
    yArr = [] if yArr is None else yArr

    plt.figure(figsize=figSize, facecolor=FIG_BG_COLOR)

    if title is not None:
        plt.title(title, fontsize=TITLE_DIM, pad=TITLE_PAD)

    numPlots = len(yArr)
    for i, y in enumerate(yArr):
        # A series beyond the end of the legend simply has no label
        hasLabel = legend is not None and i < len(legend)
        label = legend[i] if hasLabel else None

        yNum = [float(yVal) for yVal in y]

        if bar:
            # Split the BAR_WIDTH slot of each x position among the series
            barWidth = BAR_WIDTH / numPlots
            xOffset = -BAR_WIDTH / 2 + barWidth / 2 + barWidth * i
            # np.asarray lets plain lists be shifted by the offset too
            plt.bar(np.asarray(x) + xOffset, yNum, width=barWidth,
                    label=label, align="center")
        else:
            plt.plot(x, yNum, label=label)

            if fillPlot:
                plt.fill_between(x, yNum, alpha=FILL_ALPHA)

    if xLabel is not None:
        plt.xlabel(xLabel, labelpad=XY_LABEL_PAD, fontsize=XY_LABEL_DIM)

    if yLabel is not None:
        plt.ylabel(yLabel, labelpad=XY_LABEL_PAD, fontsize=XY_LABEL_DIM)

    if xTicks is not None:
        plt.xticks(x, xTicks, fontsize=XY_TICKS_DIM, rotation=ticksRotation)

    plt.yticks(fontsize=XY_TICKS_DIM)

    if showGrid:
        plt.grid(alpha=GRID_ALPHA, axis=gridAxis)

    if legend is not None:
        plt.legend(loc=LEGEND_LOC)

    if saveTitle is not None:
        baseDir = "plots"

        for figFormat in FIG_FORMATS:
            # makedirs creates "plots" and the format folder in one step and
            # does not fail when they already exist
            saveDir = os.path.join(baseDir, figFormat)
            os.makedirs(saveDir, exist_ok=True)

            plt.savefig(os.path.join(saveDir, saveTitle + "." + figFormat),
                        facecolor=FIG_BG_COLOR,
                        dpi=FIG_DPI,
                        bbox_inches=FIG_BBOX,
                        pad_inches=FIG_PAD)

    plt.show()


## Queries

### Query 1
On average, how many artists of a given nationality appear in the top 100 of their own country?

In [None]:
# Inner query: distinct artists per (chart date, country) having a member whose
# nationality is the country the chart refers to.
# Outer query: average of those counts for each country.
query1 = """
	select ?country (xsd:integer(AVG(?numArtists)) AS ?avgNumArtists) where {
		{
			select ?date ?country (count(distinct ?artist) AS ?numArtists) where {
				?person 		so:isMemberOf ?artist ;
								so:hasNationality ?country.
				?artist 		so:partecipateIn ?track ;
								so:name ?name.
				?track 			so:appearsIn ?appeareance .
				?appeareance 	so:isPositionedIn ?chart .
				?chart 			so:isReferredTo ?country ;
								so:date ?date .   
			}
			group by ?date ?country
		}
	}
	group by ?country
"""

result, resultType = executeQuery(query1)
printResult(result, resultType)

# One bar per country
plotResults(
    title="Query 1",
    saveTitle="query1",
    bar=True,
    x=np.arange(len(result)),
    xTicks=result["country"],
    xLabel="Country",
    yArr=[result["avgNumArtists"]],
    yLabel="Average number of artists",
    showGrid=True,
    gridAxis="y"
)


### Query 2
Show the 5 most popular young (under 30 years old) Italian artists who released music in 2018.

In [None]:
# Italian artists born from 1991 onwards with a track on an album released in
# 2018. The result is sorted by popularity and cut to the first 5 rows, as the
# question asks for the 5 most popular artists.
result, resultType = executeQuery("""
select distinct ?artistName (avg(?artistAge) as ?artistAvgAge) ?artistPopularity where {
    ?person so:isMemberOf ?artist ;
            so:hasNationality countries:it ;
            so:birthDate ?artistBirthdate .
    ?artist so:partecipateIn ?track ;
            so:name ?artistName ;
    		so:popularity ?artistPopularity .
    ?track so:isPartOf ?album ;
           so:name ?trackName .
	?album so:releaseDate ?releaseDate ;
        	so:name ?albumName .
    filter(?releaseDate >= "2018-01-01"^^xsd:date && ?releaseDate < "2019-01-01"^^xsd:date) .
    filter(?artistBirthdate >= "1991-01-01"^^xsd:date ) .
    bind((year(now()) - year(?artistBirthdate)) as ?artistAge)
} group by ?artistName ?artistPopularity
order by desc(?artistPopularity)
limit 5
""")

printResult(result, resultType)

plotResults(
    figSize=(30, 8),
    title="Query 2",
    x=np.arange(len(result)),
    yArr=[result["artistPopularity"]],
    yLabel="Artist popularity",
    xLabel="Artist",
    xTicks=result["artistName"],
    ticksRotation=90,
    showGrid=True,
    gridAxis="y",
    bar=True,
    saveTitle="query2"
)


### Query 3
Show the 30 genres with the most danceable songs among the songs released in 2019.

In [None]:
# Average danceability per genre over tracks of albums released in 2019,
# restricted to genres with more than 100 tracks. The groups are sorted by
# average danceability before "limit 30" is applied, so the 30 rows kept are
# the most danceable genres rather than an arbitrary subset.
result, resultType = executeQuery("""
    select ?genre (avg(?danceability) as ?avgDanceability) (count(?track) as ?numTrack) where {
        ?artist so:partecipateIn ?track ;
                so:name ?artistName ;
                so:hasGenre ?genre .
        ?track 	so:isPartOf ?album ;
                so:name ?trackName ;
                so:danceability ?danceability .
        ?album 	so:releaseDate ?date ;
                so:name ?albumName .
        filter(?date >= "2019-01-01"^^xsd:date && ?date < "2020-01-01"^^xsd:date)
    }
    group by ?genre
    having (?numTrack > 100)
    order by desc(?avgDanceability)
    limit 30
""")

printResult(result, resultType)

plotResults(
    figSize=(30, 8),
    title="Query 3.1",
    x=np.arange(len(result)),
    yArr=[result["avgDanceability"]],
    xLabel="Genre",
    yLabel="Average Danceability",
    xTicks=result["genre"],
    ticksRotation=90,
    showGrid=True,
    gridAxis="y",
    bar=True,
    saveTitle="query3.1"
)

plotResults(
    figSize=(20, 8),
    title="Query 3.2",
    x=np.arange(len(result)),
    yArr=[result["numTrack"]],
    xLabel="Genre",
    yLabel="Number of tracks",
    xTicks=result["genre"],
    ticksRotation=90,
    showGrid=True,
    gridAxis="y",
    bar=True,
    saveTitle="query3.2"
)


### Query 4
In 2019, were there more distinct singles in the top 100 of Argentina than in the top 100 of Italy?

In [None]:
# Counts the distinct tracks belonging to singles that appear in the charts of
# Argentina and of Italy, then answers whether Argentina has more.
# Both date filters select charts dated in 2019, the year the question is about.
result, resultType = executeQuery("""
ask where{
    {
        select (count(distinct ?track) as ?numSinglesAG) where {
            ?track so:isPartOf ?album ;
             	   so:name ?trackName ;
    			   so:appearsIn ?appearance .
        	?album so:isTypeOf so:single;
                	so:name ?albumName .	
        	?appearance so:isPositionedIn ?chart .
        	?chart so:isReferredTo countries:ar ;
                   so:date ?date .
    		filter(?date >= "2019-01-01"^^xsd:date && ?date < "2020-01-01"^^xsd:date)
		}
    } .
    {
        select (count(distinct ?track) as ?numSinglesIT) where {
            ?track so:isPartOf ?album ;
             	   so:name ?trackName ;
    			   so:appearsIn ?appearance .
        	?album so:isTypeOf so:single;
                	so:name ?albumName .	
        	?appearance so:isPositionedIn ?chart .
        	?chart so:isReferredTo countries:it ;
                   so:date ?date .
    		filter((?date >= "2019-01-01"^^xsd:date && ?date < "2020-01-01"^^xsd:date))
		}
    } .
    
    filter(?numSinglesAG > ?numSinglesIT)
}
""")

printResult(result, resultType)


### Query 5
How many times did artists born from 2000 onwards appear in the top 100 of Italy in 2020?

In [None]:
# Number of chart appearances, per track and artist member, for people born
# from 2000-01-01 onwards in the charts referred to Italy.
# - No death date is required any more: the mandatory so:deathDate pattern
#   only matched people that have one recorded.
# - The chart date is bounded on both sides so that exactly the year 2020 is
#   selected (1st of January included).
result, resultType = executeQuery("""
    select distinct ?trackName ?artistName ?completeName ?birthDate (count(?chartDate) as ?numAppearances) where { 
        ?track a so:Track ;
            so:name ?trackName ;
            so:appearsIn ?appearance .
        ?artist a so:Artist ;
                so:partecipateIn ?track ;
                so:name ?artistName .
        ?artistPerson so:isMemberOf ?artist ;
                    so:name ?realName ;
                    so:surname ?realSurname ;
                    so:birthDate ?birthDate .
        ?appearance a so:Appearance ;
                    so:isPositionedIn ?chart.
        ?chart a so:Chart;
            so:isReferredTo countries:it ;
            so:date ?chartDate ;
            so:name ?chartName .

        bind(concat(?realName, " ", ?realSurname) AS ?completeName) .
        filter(?birthDate >= "2000-01-01"^^xsd:date) .
        filter(?chartDate >= "2020-01-01"^^xsd:date && ?chartDate < "2021-01-01"^^xsd:date) .
    } group by ?trackName ?artistName ?completeName ?birthDate
""")

printResult(result, resultType)


### Query 6
Show the average of several track features (energy, danceability, valence) across the months of the year in Italy.

In [None]:
# Monthly averages of energy, danceability and valence of the tracks appearing
# in the charts referred to Italy. The country is fixed in the pattern, so it
# is not projected or grouped (the former ?country variable was never bound).
result, resultType = executeQuery("""
       select ?month
              (avg(?energy) as ?avgEnergy)
              (avg(?danceability) as ?avgDanceability)
              (avg(?valence) as ?avgValence)
       where {
              ?track a so:Track ;
                     so:appearsIn ?appearance ;
                     so:danceability ?danceability ;
                     so:energy ?energy ;
                     so:valence ?valence .
              ?appearance so:isPositionedIn ?chart ;
                            so:position ?position .
              ?chart so:date ?date ;
                     so:isReferredTo countries:it .

              bind(month(?date) as ?month)
       } group by ?month
       order by asc(?month) 
""")

printResult(result, resultType)

# Tick labels are derived from the months actually returned, so the plot also
# works when some months have no chart
monthNames = ["Jan", "Feb", "Mar", "Apr", "May", "Jun",
              "Jul", "Aug", "Sep", "Oct", "Nov", "Dec"]

plotResults(
    figSize=(20, 8),
    title="Query 6",
    x=np.arange(len(result)),
    yArr=[result["avgEnergy"], result["avgDanceability"], result["avgValence"]],
    xLabel="Month",
    xTicks=[monthNames[int(month) - 1] for month in result["month"]],
    legend=["Energy", "Danceability", "Valence"],
    showGrid=True,
    fillPlot=True,
    saveTitle="query6"
)


### Query 7
How many albums (at least 2) contain the same track?

In [None]:
# Inner query: for every (track, album) pair, the names of the participating
# artists joined in one string.
# Outer query: groups by "track name (artists)" and keeps the groups found in
# more than one album, sorted by number of albums.
query7 = """
    select ?trackNameWithArtists (group_concat(distinct ?albumName; separator=", ") as ?albums) (count(distinct ?album) as ?albumCount) where {
        {
            select ?track ?trackName ?albumType ?album ?albumName (group_concat(?artistName; separator=", ") as ?artists) where {
                ?track a so:Track ;
                    so:name ?trackName ;
                    so:isPartOf ?album .
                ?album so:isTypeOf ?albumType ;
                    so:name ?albumName .
                ?artist so:partecipateIn ?track ;
                        so:name ?artistName .
            } group by ?track ?trackName ?albumType ?album ?albumName
        } .

        bind(concat(?trackName, " (", ?artists, ")") as ?trackNameWithArtists) .
    } group by ?trackNameWithArtists
    having (count(distinct ?album) > 1)
    order by desc(?albumCount)
"""

result, resultType = executeQuery(query7)
printResult(result, resultType)

### Query 8
Show how many explicit tracks appear in the top 100 of each country.

In [None]:
# Number of distinct explicit tracks appearing in the charts of each country
result, resultType = executeQuery("""
    select ?country (count(distinct ?track) as ?numExplicitTracks) where {
        ?track so:appearsIn ?appearance ;
            so:explicit ?explicit .
        ?appearance so:isPositionedIn ?chart ;
                    so:position ?position .
        ?chart so:isReferredTo ?country .
        
        filter(?explicit = "true"^^xsd:boolean) .
    } group by ?country
    order by desc(?country)
""")

printResult(result, resultType)

# Title and file name refer to this query (they were labelled "Query 11")
plotResults(
    title="Query 8",
    x=np.arange(len(result)),
    yArr=[result["numExplicitTracks"]],
    yLabel="Number of explicit tracks",
    xLabel="Country",
    xTicks=result["country"],
    showGrid=True,
    gridAxis="y",
    bar=True,
    saveTitle="query8"
)

### Query 9
How many songs from Japan or South Korea reach the Top 20 in the US, Canada or Mexico?

In [None]:
# Distinct tracks, per chart country (us, mx, ca), by artists having a member
# with nationality jp or kr and positioned at 20 or better.
query9 = """
        select ?chartCountry (count(distinct ?track) as ?numTracks) where {
                ?person so:isMemberOf ?artist ;
                        so:hasNationality ?originCountry .
                ?artist so:partecipateIn ?track .
                ?track so:appearsIn ?appeareance .
                ?appeareance so:isPositionedIn ?chart ;
                                so:position ?position .
                ?chart so:isReferredTo ?chartCountry .

                filter(?originCountry = countries:jp || ?originCountry = countries:kr) .
                filter(?chartCountry = countries:us || ?chartCountry = countries:mx || ?chartCountry = countries:ca) .
                filter(?position <= 20) .
        } group by ?chartCountry
        order by desc(?numTracks)
"""

result, resultType = executeQuery(query9)
printResult(result, resultType)


### Query 10
Show the 30 albums with the highest number of songs present in the Top 30 at the same time. Also show how many tracks each album contains.

In [None]:
# Subquery: the 30 (chart, album) pairs with the most distinct tracks at
# position 30 or better. The outer query adds the chart and album details and
# is sorted as well: the ordering of a subquery is not guaranteed to carry
# over to the final result.
result, resultType = executeQuery("""
    select ?chartName ?chartDate ?albumName ?numTracks ?totalTracks where {
        ?chart so:name ?chartName ;
            so:date ?chartDate .
        ?album so:name ?albumName ;
            so:totalTracks ?totalTracks .
        {
            select ?chart ?album (count(distinct ?track) as ?numTracks) where {
                ?track so:isPartOf ?album ;
                    so:appearsIn ?appereance .
                ?appereance so:isPositionedIn ?chart ;
                            so:position ?position .
                filter(?position <= 30)
            } group by ?chart ?album
            order by desc(?numTracks)
            limit 30
        }
    }
    order by desc(?numTracks)
""")

printResult(result, resultType)


### Query 11
The most listened artist, and their nationality, for each country

In [None]:
# First subquery: highest per-artist count found in the charts of each country.
# Second subquery: count of every artist in every country.
# The final filter keeps the artists reaching the maximum OF THEIR OWN country:
# without the country equality an artist matched whenever the count equalled
# the maximum of any country.
result, resultType = executeQuery("""
    select ?country ?artistName (group_concat(distinct ?nationality; separator=", ") as ?nationalities) where {
        ?artist so:name ?artistName .
        ?person so:isMemberOf ?artist ;
                so:hasNationality ?nationality .
        {
            select ?country_ (max(?numTracks) as ?maxNumTracks) where {
                {
                    select ?country_ ?artist (count(?track) as ?numTracks) where {
                        ?person so:isMemberOf ?artist ;
                                so:hasNationality ?nationality .
                        ?artist so:partecipateIn ?track ;
                                so:name ?artistName .
                        ?track so:appearsIn ?appearance .
                        ?appearance so:isPositionedIn ?chart .
                        ?chart so:isReferredTo ?country_ .
                    } group by ?country_ ?artist
                }
            } group by ?country_
        } .
        {
            select ?country ?artist (count(?track) as ?numTracks) where {
                ?person so:isMemberOf ?artist ;
                        so:hasNationality ?nationality .
                ?artist so:partecipateIn ?track ;
                        so:name ?artistName .
                ?track so:appearsIn ?appearance .
                ?appearance so:isPositionedIn ?chart .
                ?chart so:isReferredTo ?country .
            } group by ?country ?artist
        } .
        
        filter(?country = ?country_ && ?numTracks = ?maxNumTracks)
    } group by ?country ?artistName
    order by asc(?country)
""")

printResult(result, resultType)
