In [None]:
# required libraries
import pandas as pd
import os
from pathlib import Path
import datetime

In [None]:
# All datasets live in a "datasets" folder inside the current working directory
workingDir = Path(os.path.abspath(os.getcwd())).absolute()
absPath = str(workingDir)
datasetsPath = os.path.join(absPath, "datasets")

# Make sure the folder is there before any file is read from or written to it
if not os.path.exists(datasetsPath):
    os.mkdir(datasetsPath)


def _datasetFile(fileName):
    """Return the path of ``fileName`` inside the datasets directory."""
    return os.path.join(datasetsPath, fileName)


# Raw inputs paired with the cleaned files this notebook produces
spotifyChartsPath = _datasetFile("reducedSpotifyCharts.csv")
spotifyChartsCleanPath = _datasetFile("reducedSpotifyChartsClean.csv")
genresPath = _datasetFile("genres.csv")
genresCleanPath = _datasetFile("genresClean.csv")
artistsPath = _datasetFile("artists.csv")
artistsCleanPath = _datasetFile("artistsClean.csv")

# Country lookup tables (canonical names and alternative spellings)
countriesPath = _datasetFile("countries.csv")
altCountriesPath = _datasetFile("altCountries.csv")


## Genres

In [None]:
# Read the raw genres table, keyed by the genre name
genresDF = pd.read_csv(genresPath, index_col="genre")

In [None]:
def createGenreID(genre):
    """Build an identifier from a genre name.

    Alphanumeric characters are kept as they are; every other character
    (spaces, punctuation, symbols) is replaced with "-".
    """
    return "".join(symbol if symbol.isalnum() else "-" for symbol in genre)

In [None]:
# Derive one ID per genre from the genre names held in the index
genreIDs = [createGenreID(genreName) for genreName in genresDF.index]

# Attach the IDs, then discard the first column of the frame
genresDF["id"] = genreIDs
genresDF = genresDF.drop(columns=genresDF.columns[0])

# Move the genre name back into a regular column and index the frame by ID
genresDF = genresDF.reset_index().set_index(["id"])

In [None]:
# Write the genres table (indexed by "id") to the cleaned genres CSV
genresDF.to_csv(genresCleanPath)

## Artists

In [None]:
# Read the raw artists table, keyed by the artist ID
artistsDF = pd.read_csv(artistsPath, index_col="id")


In [None]:
# Turn each artist's comma-separated genre names into comma-separated genre IDs.
# A missing "genres" value becomes an empty string.
# NOTE(review): entries are not stripped before conversion, so a space after a
# comma would turn into a leading "-" in the ID; confirm the CSV has none.
artistsGenresIDs = [
    ""
    if pd.isnull(rawGenres)
    else ",".join(createGenreID(genreName) for genreName in rawGenres.split(","))
    for rawGenres in artistsDF["genres"]
]

# Overwrite the genre names with their IDs
artistsDF["genres"] = artistsGenresIDs

# Discard the first column of the frame
artistsDF = artistsDF.drop(columns=artistsDF.columns[0])


In [None]:
# Write the artists table (genres now stored as genre IDs) to the cleaned artists CSV
artistsDF.to_csv(artistsCleanPath)


## Charts

In [None]:
def loadCountries():
    """Read the two country lookup tables from the datasets directory.

    Returns:
        A ``(countries, altCountries)`` tuple of DataFrames. The columns of
        the second table are renamed to ``AlternativeName`` and ``Name``.
    """
    canonicalTable = pd.read_csv(countriesPath)

    aliasTable = pd.read_csv(altCountriesPath)
    aliasTable.columns = ["AlternativeName", "Name"]

    return canonicalTable, aliasTable


def getCountryCode(countryName, countries, altCountries):
    """Resolve a country name to its code.

    The name is looked up in ``countries`` first. If nothing matches, it is
    looked up among the alternative names in ``altCountries`` and the
    canonical name found there is used instead.

    Each lookup prefers an exact match and only then falls back to a literal
    substring match. This keeps "Niger" from resolving to "Nigeria" and lets
    names containing regex metacharacters, such as "Korea (South)", match.

    Args:
        countryName: Name of the country as it appears in the data.
        countries: DataFrame with ``Name`` and ``Code`` columns.
        altCountries: DataFrame with ``AlternativeName`` and ``Name`` columns.

    Returns:
        A ``(countryCode, countryName)`` tuple. ``countryName`` is the input
        name when it matched ``countries`` directly, otherwise the canonical
        name taken from ``altCountries``.

    Raises:
        IndexError: If the name cannot be resolved through either table.
    """
    def _lookup(frame, column, name):
        # Exact match wins, so a short name never resolves to a longer one.
        exactRows = frame[frame[column] == name]
        if not exactRows.empty:
            return exactRows
        # Literal (non-regex) substring fallback; missing cells never match.
        return frame[frame[column].str.contains(name, regex=False, na=False)]

    matchedCountries = _lookup(countries, "Name", countryName)

    if matchedCountries.empty:
        # Look if an alternative name was used
        alternativeMatchedCountries = _lookup(
            altCountries, "AlternativeName", countryName)
        if alternativeMatchedCountries.empty:
            raise IndexError(
                "Unknown country name: {!r}".format(countryName))

        countryName = alternativeMatchedCountries["Name"].iloc[0]

        matchedCountries = countries[countries["Name"] == countryName]
        if matchedCountries.empty:
            raise IndexError(
                "Alternative name maps to {!r}, which is not in the countries table".format(
                    countryName))

    countryCode = matchedCountries["Code"].iloc[0]

    return countryCode, countryName

In [None]:
# Country lookup tables used later to resolve country codes
countries, altCountries = loadCountries()

# Read the raw charts table
chartsDF = pd.read_csv(spotifyChartsPath)

# Drop the "Global" chart rows, then discard the first column of the frame
globalRows = chartsDF.index[chartsDF["country"] == "Global"]
chartsDF = chartsDF.drop(index=globalRows).drop(columns=chartsDF.columns[0])


In [None]:
# Values shared by every row: the hard-coded chart size and the URL prefix
# that is stripped from the track links
topNumType = 100
trackURLPrefix = "https://open.spotify.com/track/"

# Resolve each distinct country once. getCountryCode scans the lookup tables on
# every call, so calling it per chart row repeats identical work for every row
# that shares a country.
countryCodeByName = {
    countryName: getCountryCode(countryName, countries, altCountries)[0]
    for countryName in chartsDF["country"].unique()
}
countryIDs = [countryCodeByName[countryName]
              for countryName in chartsDF["country"]]

# Dates are parsed as day/month/year and rewritten as year-month-day
chartDates = [
    datetime.datetime.strptime(rawDate, "%d/%m/%Y").strftime("%Y-%m-%d")
    for rawDate in chartsDF["date"]
]

# The track ID is the track link with the URL prefix removed
trackIDs = [uri.removeprefix(trackURLPrefix) for uri in chartsDF["uri"]]

# The chart ID is built from the chart size, country code and date only, so
# all rows belonging to the same chart share the same ID
chartIDs = [
    "top-{}-{}-{}".format(topNumType, countryCode, chartDate)
    for countryCode, chartDate in zip(countryIDs, chartDates)
]

# The chart name uses the country name exactly as it appears in the data
chartNames = ["TOP {} {}".format(topNumType, countryName)
              for countryName in chartsDF["country"]]
chartTypes = ["top"] * len(chartsDF)

# Add the new columns (overwriting "date" in place) and index the frame by chart ID
chartsDF = chartsDF.assign(
    id=chartIDs,
    country_code=countryIDs,
    name=chartNames,
    type=chartTypes,
    date=chartDates,
    trackID=trackIDs,
).set_index("id")


In [None]:
# Write the charts table (indexed by the chart "id") to the cleaned charts CSV
chartsDF.to_csv(spotifyChartsCleanPath)
