In [None]:
# Imports

import os
import time
from pathlib import Path
import pandas as pd
import json
import spotifyCredentials
import requests
import base64

# Setup

In [None]:
# Absolute path of the working directory; every dataset path below is derived from it
absPath = os.path.abspath(os.getcwd())
datasetsPath = absPath + "/datasets"

# Create the dataset directory when it is missing.
# exist_ok=True keeps the cell re-runnable and avoids the check-then-create race of
# os.path.exists() followed by os.mkdir(); if the path exists but is not a directory
# this now fails here (FileExistsError) instead of later, when the first CSV is written.
os.makedirs(datasetsPath, exist_ok=True)

# Locations of the CSV datasets
genresPath = datasetsPath + "/genres.csv"
tracksPath = datasetsPath + "/tracks.csv"
albumsPath = datasetsPath + "/albums.csv"
artistsPath = datasetsPath + "/artists.csv"
peoplePath = datasetsPath + "/people.csv"

# Spotify API Functions

In [None]:
# Credentials used to obtain an access token for the Spotify API.
# They are read from the local spotifyCredentials module so they never appear in the notebook itself.
SPOTIFY_CLIENT_ID, SPOTIFY_SECRET_ID, SPOTIFY_REFRESH_TOKEN = (
    spotifyCredentials.SPOTIFY_CLIENT_ID,
    spotifyCredentials.SPOTIFY_SECRET_ID,
    spotifyCredentials.SPOTIFY_REFRESH_TOKEN,
)

In [None]:
# Function to get the Spotify API Access Token
def getAccessToken(timeout=30):
    """Request a Spotify API access token with the stored refresh token.

    Posts a ``refresh_token`` grant to the Spotify accounts endpoint,
    authenticating with an HTTP Basic header built from
    ``SPOTIFY_CLIENT_ID`` and ``SPOTIFY_SECRET_ID``.

    Args:
        timeout: Seconds passed to ``requests.post`` as its ``timeout``.
            Without a timeout a stalled connection would block the
            notebook indefinitely.

    Returns:
        The ``access_token`` value of the JSON response when the endpoint
        answers with HTTP 200, otherwise ``None``. Callers that build a
        ``"Bearer " + token`` header must be prepared for ``None``.
    """
    # Spotify Basic Authorization Code: base64("<client id>:<client secret>")
    authBasic = base64.b64encode(
        "{}:{}".format(SPOTIFY_CLIENT_ID, SPOTIFY_SECRET_ID).encode()
    ).decode()

    # Exchange the refresh token for an access token
    accessTokenRequest = requests.post(
        "https://accounts.spotify.com/api/token",
        data={
            "grant_type": "refresh_token",
            "refresh_token": SPOTIFY_REFRESH_TOKEN
        },
        headers={
            "Authorization": "Basic " + authBasic
        },
        timeout=timeout
    )

    # Any status other than 200 means no token was issued
    if accessTokenRequest.status_code != 200:
        return None

    return accessTokenRequest.json()["access_token"]

accessToken = getAccessToken()

# Notebook outputs are saved and shared with the file, so never echo the full bearer token:
# show only a short preview (or None when no token could be obtained).
accessTokenPreview = None if accessToken is None else accessToken[:4] + "..."
print("ü™ô [INFO] Access Token is: {}".format(accessTokenPreview))


In [None]:
# Function to get all available Genres from Spotify API
def getAllGenres(fromError=False, timeout=30):
    """Fetch the available genre seeds from the Spotify API.

    Uses the module-level ``accessToken`` (which must be a string) as bearer
    token. A first 401 refreshes the token through ``getAccessToken`` and a
    first 429 waits before retrying; in both cases the request is retried
    exactly once.

    Args:
        fromError: ``True`` when this call is already the retry of a failed
            request, so a further 401/429 raises instead of retrying again.
        timeout: Seconds passed to ``requests.get`` as its ``timeout`` so a
            stalled connection cannot block the notebook indefinitely.

    Returns:
        The decoded JSON body of the HTTP 200 response.

    Raises:
        Exception: Carrying the response text, for any unsuccessful status
            that is not retried (or when the retry fails as well).
    """
    global accessToken

    availableGenresRequest = requests.get(
        "https://api.spotify.com/v1/recommendations/available-genre-seeds",
        headers={
            "Authorization": "Bearer " + accessToken
        },
        timeout=timeout
    )
    statusCode = availableGenresRequest.status_code

    if statusCode == 200:
        return availableGenresRequest.json()

    if statusCode == 401 and fromError is False:  # The access token has expired
        accessToken = getAccessToken()
        return getAllGenres(fromError=True, timeout=timeout)

    if statusCode == 429 and fromError is False:  # The api rate limit has been reached
        # Wait for as long as the API asks through Retry-After (in seconds);
        # fall back to 30 seconds when the header is missing or not a number.
        try:
            waitSeconds = float(availableGenresRequest.headers.get("Retry-After", 30))
        except (TypeError, ValueError):
            waitSeconds = 30
        time.sleep(max(waitSeconds, 0))
        return getAllGenres(fromError=True, timeout=timeout)

    raise Exception(availableGenresRequest.text)


In [None]:
# Function to get Track info from Spotify API
def getTrackInfo(trackID, fromError=False, timeout=30):
    """Fetch the track object of ``trackID`` from the Spotify API.

    Uses the module-level ``accessToken`` (which must be a string) as bearer
    token. A first 401 refreshes the token through ``getAccessToken`` and a
    first 429 waits before retrying; in both cases the request is retried
    exactly once.

    Args:
        trackID: Identifier interpolated into the ``/v1/tracks/{id}`` URL.
        fromError: ``True`` when this call is already the retry of a failed
            request, so a further 401/429 raises instead of retrying again.
        timeout: Seconds passed to ``requests.get`` as its ``timeout`` so a
            stalled connection cannot block the notebook indefinitely.

    Returns:
        The decoded JSON body of the HTTP 200 response.

    Raises:
        Exception: Carrying the response text, for any unsuccessful status
            that is not retried (or when the retry fails as well).
    """
    global accessToken

    trackInfoRequest = requests.get(
        "https://api.spotify.com/v1/tracks/{id}".format(id=trackID),
        headers={
            "Authorization": "Bearer " + accessToken
        },
        timeout=timeout
    )
    statusCode = trackInfoRequest.status_code

    if statusCode == 200:
        return trackInfoRequest.json()

    if statusCode == 401 and fromError is False:  # The access token has expired
        accessToken = getAccessToken()
        return getTrackInfo(trackID, fromError=True, timeout=timeout)

    if statusCode == 429 and fromError is False:  # The api rate limit has been reached
        # Wait for as long as the API asks through Retry-After (in seconds);
        # fall back to 30 seconds when the header is missing or not a number.
        try:
            waitSeconds = float(trackInfoRequest.headers.get("Retry-After", 30))
        except (TypeError, ValueError):
            waitSeconds = 30
        time.sleep(max(waitSeconds, 0))
        return getTrackInfo(trackID, fromError=True, timeout=timeout)

    raise Exception(trackInfoRequest.text)


In [None]:
# Function to get Track audio features from Spotify API
def getAudioFeatures(trackID, fromError=False, timeout=30):
    """Fetch the audio features of ``trackID`` from the Spotify API.

    Uses the module-level ``accessToken`` (which must be a string) as bearer
    token. A first 401 refreshes the token through ``getAccessToken`` and a
    first 429 waits before retrying; in both cases the request is retried
    exactly once.

    Args:
        trackID: Identifier interpolated into the
            ``/v1/audio-features/{id}`` URL.
        fromError: ``True`` when this call is already the retry of a failed
            request, so a further 401/429 raises instead of retrying again.
        timeout: Seconds passed to ``requests.get`` as its ``timeout`` so a
            stalled connection cannot block the notebook indefinitely.

    Returns:
        The decoded JSON body of the HTTP 200 response.

    Raises:
        Exception: Carrying the response text, for any unsuccessful status
            that is not retried (or when the retry fails as well).
    """
    global accessToken

    # NOTE: the URL literal used to start with a stray TAB character
    audioFeaturesRequest = requests.get(
        "https://api.spotify.com/v1/audio-features/{id}".format(id=trackID),
        headers={
            "Authorization": "Bearer " + accessToken
        },
        timeout=timeout
    )
    statusCode = audioFeaturesRequest.status_code

    if statusCode == 200:
        return audioFeaturesRequest.json()

    if statusCode == 401 and fromError is False:  # The access token has expired
        accessToken = getAccessToken()
        return getAudioFeatures(trackID, fromError=True, timeout=timeout)

    if statusCode == 429 and fromError is False:  # The api rate limit has been reached
        # Wait for as long as the API asks through Retry-After (in seconds);
        # fall back to 30 seconds when the header is missing or not a number.
        try:
            waitSeconds = float(audioFeaturesRequest.headers.get("Retry-After", 30))
        except (TypeError, ValueError):
            waitSeconds = 30
        time.sleep(max(waitSeconds, 0))
        return getAudioFeatures(trackID, fromError=True, timeout=timeout)

    raise Exception(audioFeaturesRequest.text)


# Parse Data

In [None]:
# Placeholders for the datasets; each one is replaced by a DataFrame once its data is loaded
genresDF = tracksDF = albumDF = artistsDF = peopleDF = None

# Column layout of the genres dataset
genresCols = ["genre"]

# Column layout of the tracks dataset, grouped by where each value comes from
_trackInfoCols = ["id", "uri", "title", "duration", "popularity", "explicit"]
_audioFeatureCols = [
    "key", "tempo", "mode", "time_signature",
    "acousticness", "danceability", "energy", "loudness",
    "liveness", "valence", "speechiness", "instrumentalness",
]
_trackRelationCols = ["artists", "album", "available_countries"]
tracksCols = [*_trackInfoCols, *_audioFeatureCols, *_trackRelationCols]

# Column layouts still to be defined (three independent lists)
albumCols, artistsCols, peopleCols = [], [], []


## Get genres

In [None]:
# Ask the Spotify API for every available genre seed
availableGenres = getAllGenres()
genreNames = availableGenres["genres"]

# One row per genre, in a single "genre" column
genresDF = pd.DataFrame(data=genreNames, columns=["genre"])

# Summary of the genres DataFrame
genresDF.info()

# Persist the genres dataset
genresDF.to_csv(path_or_buf=genresPath)


## Get tracks

In [None]:
# Read the Spotify Charts export from the datasets directory and
# discard every row that has at least one missing value
spotifyChartsPath = os.path.join(datasetsPath, "spotifyCharts.csv")
trackCharts = pd.read_csv(spotifyChartsPath, sep=",").dropna()

# Summary of the cleaned charts
trackCharts.info()

In [None]:
# Keep a single "uri" column holding each track link once
uniqueUris = trackCharts["uri"].drop_duplicates()
spotifyTrackLinks = uniqueUris.to_frame()

# Summary of the unique track uris
spotifyTrackLinks.info()


In [None]:
def _buildTrackRecord(trackInfo, trackAudioFeatures):
    """Flatten a track payload and its audio features into one row keyed by the tracksCols names."""
    return {
        "id": trackInfo["id"],
        "uri": trackInfo["external_urls"]["spotify"],
        "title": trackInfo["name"],
        "duration": trackInfo["duration_ms"],
        "popularity": trackInfo["popularity"],
        "explicit": trackInfo["explicit"],
        "key": trackAudioFeatures["key"],
        "tempo": trackAudioFeatures["tempo"],
        "mode": trackAudioFeatures["mode"],
        "time_signature": trackAudioFeatures["time_signature"],
        "acousticness": trackAudioFeatures["acousticness"],
        "danceability": trackAudioFeatures["danceability"],
        "energy": trackAudioFeatures["energy"],
        "loudness": trackAudioFeatures["loudness"],
        "liveness": trackAudioFeatures["liveness"],
        "valence": trackAudioFeatures["valence"],
        "speechiness": trackAudioFeatures["speechiness"],
        "instrumentalness": trackAudioFeatures["instrumentalness"],
        # Only the artists ids are kept
        "artists": [artist["id"] for artist in trackInfo["artists"]],
        "album": trackInfo["album"]["id"],
        "available_countries": trackInfo["available_markets"]
    }


def _buildTracksFrame(trackRecords):
    """Build the tracks DataFrame, indexed by track id, from the rows collected so far."""
    return pd.DataFrame(trackRecords, columns=tracksCols).set_index("id")


# Rows are collected in a plain list and turned into a DataFrame only when needed.
# The previous approach (pd.concat(..., ignore_index=True) on a frame already indexed
# by "id", followed by set_index("id")) threw the index away on every iteration, so
# every track except the newest one ended up with a NaN id; it was also quadratic.
trackRecords = []
tracksDF = _buildTracksFrame(trackRecords)

# Iterate over the spotify uris; index counts the tracks retrieved successfully
index = 0
for spotifyUri in spotifyTrackLinks["uri"]:
    # Get the track id from the uri
    trackID = spotifyUri.removeprefix("https://open.spotify.com/track/")

    # Try to get track info and audio features; a failing track is reported and skipped
    try:
        trackInfo = getTrackInfo(trackID)
        trackAudioFeatures = getAudioFeatures(trackID)
    except Exception as e:
        print("\t‚õî [ERROR] Cannot retrieve data with track {id}\n\tüó®Ô∏è [RESPONSE] {resp}\n".format(id=trackID, resp=e))
        continue

    # Add the track info to the collected rows
    trackRecords.append(_buildTrackRecord(trackInfo, trackAudioFeatures))

    # Rebuild the DataFrame only on the iterations that report or save it
    isStatsStep = index % 5000 == 0
    isSaveStep = index % 100 == 0
    if isStatsStep or isSaveStep:
        tracksDF = _buildTracksFrame(trackRecords)

    # Print stats every 5000 tracks
    if isStatsStep:
        print("üéµ [STATUS INFO #{row}]\n".format(row=index))
        # info() prints by itself and returns None, so it must not be wrapped in print()
        tracksDF.info()

    # Save DataFrame to file every 100 tracks
    if isSaveStep:
        print("\n\tüíæ [INFO] Dataset saved\n\n")
        tracksDF.to_csv(tracksPath)

    # Increment real index
    index += 1

# Final DataFrame with every retrieved track
tracksDF = _buildTracksFrame(trackRecords)

# Print info about tracks DataFrame
tracksDF.info()

# Save tracks dataset to file
tracksDF.to_csv(tracksPath)
