## Spotify Data Ingestion

In this notebook we process CSV files and create a property graph.

In [8]:
# required libraries
import pandas as pd
import os
from pathlib import Path

In [9]:
# Resolve the current working directory; every path below is built from it
absPath = str(Path.cwd())
datasetsPath = os.path.join(absPath, "datasets")
rdfPath = os.path.join(absPath, "rdf")

# Create the dataset and RDF directories if they do not exist yet.
# exist_ok=True makes the call idempotent (safe to re-run the cell) without a
# separate exists() check, and raises if the path is taken by a regular file.
os.makedirs(datasetsPath, exist_ok=True)
os.makedirs(rdfPath, exist_ok=True)

# Setup datasets paths
spotifyChartsPath = os.path.join(datasetsPath, "reducedSpotifyCharts.csv")
genresPath = os.path.join(datasetsPath, "genres.csv")
marketsPath = os.path.join(datasetsPath, "markets.csv")
tracksPath = os.path.join(datasetsPath, "tracks.csv")
albumsPath = os.path.join(datasetsPath, "albums.csv")
artistsPath = os.path.join(datasetsPath, "artists.csv")
peoplePath = os.path.join(datasetsPath, "people.csv")

# Countries
countriesPath = os.path.join(datasetsPath, "countries.csv")
altCountriesPath = os.path.join(datasetsPath, "altCountries.csv")

### Connection to Neo4j

In [10]:
# Neo4J params class
class Neo4jParams:
    """Plain container for the Neo4j connection settings used in this notebook.

    Attributes:
        user: user name, stored as given.
        psw: password value, stored as given.
        dbname: database name, stored as given.
        dbpsw: database password, taken from the ``db_psw`` argument.
        uri: connection URI, stored as given.
    """

    def __init__(self, user, psw, dbname, db_psw, uri):
        self.user = user
        self.psw = psw
        self.dbname = dbname
        # Read the constructor argument, not a same-named global: the previous
        # `dbpsw` reference only resolved when a global `dbpsw` happened to exist.
        self.dbpsw = db_psw
        self.uri = uri

In [11]:
# DB parameters.
# Each value can be overridden through an environment variable so that real
# credentials do not have to be written into the notebook; the defaults are the
# local development values.
user = os.environ.get("NEO4J_USER", "neo4j")
psw = os.environ.get("NEO4J_PSW", "neo4j")
dbname = os.environ.get("NEO4J_DBNAME", "SpotifyDB")
dbpsw = os.environ.get("NEO4J_DBPSW", "SpotifyDB")
uri = os.environ.get("NEO4J_URI", "bolt://localhost:7687")

# Bundle the settings so later cells read them from a single object
params = Neo4jParams(user, psw, dbname, dbpsw, uri)

In [12]:
from neo4j import GraphDatabase

# test class

class Driver:
    """Small wrapper around a Neo4j driver, used to check that the DB is reachable.

    Can be used as a context manager so the underlying driver is closed even
    when the body raises::

        with Driver(uri, user, password) as greeter:
            greeter.print_greeting("hello, world")
    """

    def __init__(self, uri, user, password):
        """Open a driver on ``uri`` authenticating with ``(user, password)``."""
        self.driver = GraphDatabase.driver(uri, auth=(user, password))

    def __enter__(self):
        """Return this wrapper so it can be bound with ``with ... as``."""
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        """Close the driver on exit; exceptions from the body are not suppressed."""
        self.close()
        return False

    def close(self):
        """Close the underlying driver."""
        self.driver.close()

    def print_greeting(self, message):
        """Create a greeting node carrying ``message`` and print the returned text."""
        with self.driver.session() as session:
            # NOTE(review): newer neo4j drivers (5.x) deprecate write_transaction
            # in favour of execute_write — confirm the installed version before switching.
            greeting = session.write_transaction(self._create_and_return_greeting, message)
            print(greeting)

    @staticmethod
    def _create_and_return_greeting(tx, message):
        """Run the greeting query on ``tx`` and return the first value of its single record."""
        result = tx.run("CREATE (a:Greeting) "
                        "SET a.message = $message "
                        "RETURN a.message + ', from node ' + id(a)", message=message)
        return result.single()[0]


if __name__ == "__main__":
    # Connectivity smoke test: write one greeting node and print the reply.
    # The connection settings come from `params` (same ones the ingestion cell
    # uses) instead of repeating the URI, user and password as literals.
    greeter = Driver(params.uri, params.user, params.dbpsw)
    try:
        greeter.print_greeting("hello, world")
    finally:
        # Release the driver even if the query fails
        greeter.close()

hello, world, from node 6


### Data ingestion

In [13]:
# Read the tracks CSV into a DataFrame indexed by its "id" column, then print
# a summary of the resulting columns
tracks = pd.read_csv(tracksPath, index_col="id")  # "," is read_csv's default separator
tracks.info()

<class 'pandas.core.frame.DataFrame'>
Index: 26801 entries, 5aAx2yezTd8zXrkmtKl66Z to 32BVeAkiV5rKmAlAUkjf9O
Data columns (total 21 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   Unnamed: 0           26801 non-null  int64  
 1   uri                  26801 non-null  object 
 2   title                26801 non-null  object 
 3   duration             26801 non-null  int64  
 4   popularity           26801 non-null  int64  
 5   explicit             26801 non-null  bool   
 6   key                  26801 non-null  int64  
 7   tempo                26801 non-null  float64
 8   mode                 26801 non-null  int64  
 9   time_signature       26801 non-null  int64  
 10  acousticness         26801 non-null  float64
 11  danceability         26801 non-null  float64
 12  energy               26801 non-null  float64
 13  loudness             26801 non-null  float64
 14  liveness             26801 non-null  float64
 15  val

In [14]:
# Connect to the DB: open a Neo4j driver on params.uri, authenticating with
# params.user and params.dbpsw (params.psw is not used here)
driver = GraphDatabase.driver(params.uri, auth=(params.user, params.dbpsw))
# Create a session on that driver; the loop below sends its queries through it
# NOTE(review): no close() for the driver or the session is visible in this part of the cell — confirm they are released later
session = driver.session()

for trackID, row in tracks.iterrows():

    # Retrieve the title
    title = row["title"]

    session.run(""" CREATE(newNode:Track {title: $title}) 
                    RETURN newNode""",
    title = title
    )


    # Retrieve all the technical characteristics
    duration = int(row["duration"]) 
    popularity = int(row["popularity"]) 
    explicit = row["explicit"] 
    key = int(row["key"]) 
    tempo = row["tempo"] 
    mode = int(row["mode"]) 
    time_signature = int(row["time_signature"]) 
    acousticness = row["acousticness"] 
    danceability = row["danceability"] 
    energy = row["energy"] 
    loudness = row["loudness"] 
    liveness = row["liveness"] 
    valence = row["valence"] 
    speechiness = row["speechiness"] 
    instrumentalness = row["instrumentalness"] 