In [1]:
# Imports

import os
from pathlib import Path
import pandas as pd
import spotifyCredentials
import requests
import base64

In [4]:
# Set paths

# Absolute working directory, and the "datasets" folder directly beneath it
absPath = str(Path(os.getcwd()).absolute())
datasetsPath = "{}/datasets".format(absPath)

# Spotify API Functions

In [31]:
# Spotify credentials to get access token to Spotify API
# The values are read from the local `spotifyCredentials` module rather than
# being written in the notebook. The client ID and secret ID are combined into
# the HTTP Basic authorization header, and the refresh token is what gets
# exchanged for an access token.
# NOTE(review): presumably `spotifyCredentials` is kept out of version control — confirm.

SPOTIFY_CLIENT_ID = spotifyCredentials.SPOTIFY_CLIENT_ID
SPOTIFY_SECRET_ID = spotifyCredentials.SPOTIFY_SECRET_ID
SPOTIFY_REFRESH_TOKEN = spotifyCredentials.SPOTIFY_REFRESH_TOKEN

In [32]:
# Function to get the Spotify API Access Token
def getAccessToken(timeout=10):
    """Exchange the stored refresh token for a Spotify API access token.

    Posts a ``refresh_token`` grant to the Spotify accounts service,
    authenticated with the client ID and secret ID through an HTTP Basic
    authorization header.

    Args:
        timeout: Seconds to wait for the accounts service. ``requests`` has
            no default timeout, so without one the call can block forever.

    Returns:
        The access token string, or ``None`` when the service does not
        answer with HTTP 200 or the answer carries no ``access_token``.

    Raises:
        requests.exceptions.RequestException: on a network failure or when
            the timeout expires.
    """
    # Spotify Basic Authorization Code: base64 of "<client id>:<secret id>"
    credentials = "{}:{}".format(SPOTIFY_CLIENT_ID, SPOTIFY_SECRET_ID)
    authBasic = base64.b64encode(credentials.encode()).decode()

    # Request to get Access Token from Client ID and Secret ID
    accessTokenRequest = requests.post(
        "https://accounts.spotify.com/api/token",
        data={
            "grant_type": "refresh_token",
            "refresh_token": SPOTIFY_REFRESH_TOKEN,
        },
        headers={"Authorization": "Basic " + authBasic},
        timeout=timeout,
    )

    if accessTokenRequest.status_code != 200:
        return None

    # .get() keeps the "None on failure" contract if the key is missing
    return accessTokenRequest.json().get("access_token")

accessToken = getAccessToken()
# Do not echo the token itself: cell outputs are saved with the notebook, so a
# printed credential ends up wherever the file is shared or committed.
print("Auth Token is {}".format("<redacted>" if accessToken else None))

Auth Token is <redacted>


In [41]:
# Function to get all available Genres from Spotify API
def getAllGenres(fromError=False, timeout=10):
    """Fetch the available genre seeds from the Spotify API.

    Uses the module-level ``accessToken``. On an HTTP 401 the token is
    refreshed through ``getAccessToken()`` and the request is retried once.

    Args:
        fromError: Internal retry marker; ``True`` when this call is already
            the retry after a 401, so no further retry is attempted.
        timeout: Seconds to wait for the API. ``requests`` has no default
            timeout, so without one the call can block forever.

    Returns:
        The decoded JSON body on HTTP 200, otherwise ``None`` (also when no
        access token could be obtained).
    """
    global accessToken

    # "Bearer " + None would raise TypeError, so make sure a token exists
    if accessToken is None:
        accessToken = getAccessToken()
        if accessToken is None:
            return None

    availableGenresRequest = requests.get(
        "https://api.spotify.com/v1/recommendations/available-genre-seeds",
        headers={"Authorization": "Bearer " + accessToken},
        timeout=timeout,
    )

    if availableGenresRequest.status_code == 200:
        return availableGenresRequest.json()

    if availableGenresRequest.status_code == 401 and fromError is False:
        # Token rejected: refresh it and retry exactly once
        accessToken = getAccessToken()
        if accessToken is None:
            return None
        return getAllGenres(fromError=True, timeout=timeout)

    return None


In [37]:
# Function to get Track info from Spotify API
def getTrackInfo(trackID, fromError=False, timeout=10):
    """Fetch a track object from the Spotify API.

    Uses the module-level ``accessToken``. On an HTTP 401 the token is
    refreshed through ``getAccessToken()`` and the request is retried once.

    Args:
        trackID: Spotify track ID, inserted into the ``/v1/tracks/{id}`` URL.
        fromError: Internal retry marker; ``True`` when this call is already
            the retry after a 401, so no further retry is attempted.
        timeout: Seconds to wait for the API. ``requests`` has no default
            timeout, so without one the call can block forever.

    Returns:
        The decoded JSON body on HTTP 200, otherwise ``None`` (also when no
        access token could be obtained).
    """
    global accessToken

    # "Bearer " + None would raise TypeError, so make sure a token exists
    if accessToken is None:
        accessToken = getAccessToken()
        if accessToken is None:
            return None

    trackInfoRequest = requests.get(
        "https://api.spotify.com/v1/tracks/{id}".format(id=trackID),
        headers={"Authorization": "Bearer " + accessToken},
        timeout=timeout,
    )

    if trackInfoRequest.status_code == 200:
        return trackInfoRequest.json()

    if trackInfoRequest.status_code == 401 and fromError is False:
        # Token rejected: refresh it and retry exactly once
        accessToken = getAccessToken()
        if accessToken is None:
            return None
        return getTrackInfo(trackID, fromError=True, timeout=timeout)

    return None

In [39]:
# Function to get a Track's Audio Features from Spotify API
def getAudioFeatures(trackID, fromError=False, timeout=10):
    """Fetch the audio-features object of a track from the Spotify API.

    Uses the module-level ``accessToken``. On an HTTP 401 the token is
    refreshed through ``getAccessToken()`` and the request is retried once.

    Args:
        trackID: Spotify track ID, inserted into the
            ``/v1/audio-features/{id}`` URL.
        fromError: Internal retry marker; ``True`` when this call is already
            the retry after a 401, so no further retry is attempted.
        timeout: Seconds to wait for the API. ``requests`` has no default
            timeout, so without one the call can block forever.

    Returns:
        The decoded JSON body on HTTP 200, otherwise ``None`` (also when no
        access token could be obtained).
    """
    global accessToken

    # "Bearer " + None would raise TypeError, so make sure a token exists
    if accessToken is None:
        accessToken = getAccessToken()
        if accessToken is None:
            return None

    # The URL must start at the scheme: no leading whitespace in the literal
    audioFeaturesRequest = requests.get(
        "https://api.spotify.com/v1/audio-features/{id}".format(id=trackID),
        headers={"Authorization": "Bearer " + accessToken},
        timeout=timeout,
    )

    if audioFeaturesRequest.status_code == 200:
        return audioFeaturesRequest.json()

    if audioFeaturesRequest.status_code == 401 and fromError is False:
        # Token rejected: refresh it and retry exactly once
        accessToken = getAccessToken()
        if accessToken is None:
            return None
        return getAudioFeatures(trackID, fromError=True, timeout=timeout)

    return None


# Parse Data

In [15]:
# Load Spotify Charts
# (comma is already the default separator of read_csv)
trackCharts = pd.read_csv(Path(datasetsPath) / "spotifyCharts.csv")

trackCharts.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9807001 entries, 0 to 9807000
Data columns (total 8 columns):
 #   Column      Dtype  
---  ------      -----  
 0   Unnamed: 0  int64  
 1   country     object 
 2   date        object 
 3   position    float64
 4   uri         object 
 5   track       object 
 6   title       object 
 7   artist      object 
dtypes: float64(1), int64(1), object(6)
memory usage: 598.6+ MB


# Get data

In [16]:
# Drop every chart row that has a missing value in any column
trackCharts = trackCharts.dropna()

# Keep one row per distinct track link (the first occurrence is kept)
spotifyTrackLinks = trackCharts[["uri"]].drop_duplicates()

spotifyTrackLinks.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 65459 entries, 0 to 9806983
Data columns (total 1 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   uri     65459 non-null  object
dtypes: object(1)
memory usage: 1022.8+ KB


In [97]:
# Setup DataFrames

# Empty frames that the cells below are meant to fill
genresDF = pd.DataFrame(columns=["genre"])
tracksDF = pd.DataFrame(
    columns=[
        # Track metadata
        "id", "uri", "title", "duration", "popularity", "explicit",
        # Audio features
        "key", "tempo", "mode", "time_signature",
        "acousticness", "danceability", "energy", "loudness",
        "liveness", "valence", "speechiness", "instrumentalness",
        # Links to other entities
        "artists", "album", "available_countries",
    ]
)
# No columns declared yet for these three
albumDF, artistsDF, peopleDF = (pd.DataFrame() for _ in range(3))


## Get genres

In [89]:
# Get all genres
availableGenres = getAllGenres()

# getAllGenres() returns None on any API failure; stop here rather than
# writing an empty or broken file.
if availableGenres is None:
    raise RuntimeError("Could not fetch the available genres from the Spotify API")

# Add genres to DataFrame
# Build the frame in one go. DataFrame.append returned a new frame (so its
# result was being discarded) and no longer exists in pandas 2.x.
genresDF = pd.DataFrame({"genre": availableGenres["genres"]})

# Save genres to file
genresDF.to_csv(datasetsPath + "/genres.csv")

{'genres': ['acoustic', 'afrobeat', 'alt-rock', 'alternative', 'ambient', 'anime', 'black-metal', 'bluegrass', 'blues', 'bossanova', 'brazil', 'breakbeat', 'british', 'cantopop', 'chicago-house', 'children', 'chill', 'classical', 'club', 'comedy', 'country', 'dance', 'dancehall', 'death-metal', 'deep-house', 'detroit-techno', 'disco', 'disney', 'drum-and-bass', 'dub', 'dubstep', 'edm', 'electro', 'electronic', 'emo', 'folk', 'forro', 'french', 'funk', 'garage', 'german', 'gospel', 'goth', 'grindcore', 'groove', 'grunge', 'guitar', 'happy', 'hard-rock', 'hardcore', 'hardstyle', 'heavy-metal', 'hip-hop', 'holidays', 'honky-tonk', 'house', 'idm', 'indian', 'indie', 'indie-pop', 'industrial', 'iranian', 'j-dance', 'j-idol', 'j-pop', 'j-rock', 'jazz', 'k-pop', 'kids', 'latin', 'latino', 'malay', 'mandopop', 'metal', 'metal-misc', 'metalcore', 'minimal-techno', 'movies', 'mpb', 'new-age', 'new-release', 'opera', 'pagode', 'party', 'philippines-opm', 'piano', 'pop', 'pop-film', 'post-dubstep'

## Get tracks

In [98]:
# Build the tracks table row for the first track link and save it

# First track link; its ID is what remains after stripping the URL prefix
spotifyUri = spotifyTrackLinks["uri"].iloc[0]
trackID = spotifyUri.removeprefix("https://open.spotify.com/track/")

trackInfo = getTrackInfo(trackID)
trackAudioFeatures = getAudioFeatures(trackID)

# Both helpers return None on any API failure; fail with a clear message
# instead of a "NoneType is not subscriptable" error further down.
if trackInfo is None or trackAudioFeatures is None:
    raise RuntimeError("Could not fetch Spotify data for track {}".format(trackID))

artistsIDs = [artist["id"] for artist in trackInfo["artists"]]

# Keys copied unchanged from the audio-features response
audioFeatureKeys = [
    "key", "tempo", "mode", "time_signature", "acousticness", "danceability",
    "energy", "loudness", "liveness", "valence", "speechiness", "instrumentalness",
]

trackObject = {
    "id": trackInfo["id"],
    "uri": trackInfo["external_urls"]["spotify"],
    "title": trackInfo["name"],
    "duration": trackInfo["duration_ms"],
    "popularity": trackInfo["popularity"],
    "explicit": trackInfo["explicit"],
    **{key: trackAudioFeatures[key] for key in audioFeatureKeys},
    "artists": artistsIDs,
    "album": trackInfo["album"]["id"],
    "available_countries": trackInfo["available_markets"],
}

# Build the row from the dict so values are matched to columns by name, and
# keep the result of the concatenation. DataFrame.append returned a new frame
# (its result was being discarded) and no longer exists in pandas 2.x.
trackRow = pd.DataFrame([trackObject], columns=tracksDF.columns)
tracksDF = pd.concat([tracksDF, trackRow], ignore_index=True)

tracksDF.to_csv(datasetsPath + "/tracks.csv")
