# Spotipy API Fetch

#### Import Relevant Libraries

In [22]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

import pandas as pd
import numpy as np
import time

import os
import ast

# Directory holding the CSV files read and written by this notebook; built from the
# current working directory (one level up, then "Data").
DATA_DIRECTORY_PATH = os.getcwd() + "/../Data"

#### Use the client ID and the client ID secret to authenticate with the Spotify API (Spotipy).

In [4]:
# Note: the credentials are read from environment variables so that they never
# have to be written into (or removed from) the notebook.
# Set SPOTIPY_CLIENT_ID and SPOTIPY_CLIENT_SECRET before starting the kernel;
# a missing variable raises a KeyError right here instead of failing later.
CLIENT_ID = os.environ["SPOTIPY_CLIENT_ID"]
CLIENT_ID_SECRET = os.environ["SPOTIPY_CLIENT_SECRET"]

# Client-credentials flow: app-level authentication, no user login involved
client_credentials_manager = SpotifyClientCredentials(client_id=CLIENT_ID,
                                                      client_secret=CLIENT_ID_SECRET)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

## Get most popular artists from Spotify

The first step in generating the dataframe is retrieving artist IDs from the Spotify API. The sample of artists chosen for this analysis consists of the 1,000 most popular artists on Spotify, as provided in this [list](https://chartmasters.org/most-streamed-artists-ever-on-spotify/). After loading the top 1,000 artists (including their corresponding features) into a dataframe, the list of artist names is passed to the **get_artist_metadata** function. This function uses the Spotify API's [search endpoint](https://developer.spotify.com/documentation/web-api/reference/search/search/) to gather information based on the provided artist names and returns a dataframe with the artists' names and metadata, which is then merged with the top 1,000 artists dataframe.

In [5]:
# Load the top-artists list; the first CSV column becomes the dataframe index
top_artists = pd.read_csv(DATA_DIRECTORY_PATH +"/Top_1000_Artists.csv",sep=",",index_col=0)
top_artists.head()

Unnamed: 0,Artist Name,Streams,Tracks,1b+,100m+,10m+,1m+,Last Update
1,Drake,34422345387,223,3,90,208,219,11.06.20
2,Ed Sheeran,26548511656,204,5,49,140,174,11.06.20
3,Post Malone,21650426333,64,8,47,57,60,11.06.20
4,Eminem,20367427868,265,1,46,197,239,11.06.20
5,Ariana Grande,20209530076,181,2,46,107,157,11.06.20


In [6]:
# NOTE(review): this module-level frame is not referenced by any of the visible cells;
# get_artist_metadata builds its own local dataframe of the same name. The column "hred"
# looks like a typo for "href" and the "Name" column is absent — confirm, then fix or remove.
df_artist_metadata = pd.DataFrame(columns=["Follower_Count","Genres","hred","Artist_ID","Popularity"])

def get_artist_metadata(artists):
    """
    Returns a dataframe with metadata for artists on Spotify.

    For every name the Spotify search endpoint is queried through the
    module-level client ``sp``. The first returned artist whose name equals
    the input exactly (case-sensitive) is kept. Names without an exact match
    produce no row; they are reported on stdout so the gap is visible before
    the result is merged with other data.

        Parameters:
            artists (list): List of full names of artists on Spotify

        Returns:
            df_artist_metadata (Dataframe): Pandas dataframe containing artist metadata
                (columns: Follower_Count, Genres, href, Artist_ID, Name, Popularity)
    """

    print("Starting to retrieve metadata...\n\n")

    # Initialize list to store metadata
    meta_list = list()

    # Names for which the search returned no exact match
    not_found = list()

    total = len(artists)

    # Iterate over all artists from input
    for ctr, artist in enumerate(artists, start=1):
        print(f"\nRetrieving metadata for {artist} ({ctr} of {total}).")

        # Run search query for artist in Spotify API; the name is wrapped in one
        # balanced pair of double quotes (the previous query had a stray third quote)
        obj = sp.search(q=f'"{artist}"', type="artist", limit=20)

        # Iterate over all returned artists
        for element in obj["artists"]["items"]:

            # Check whether returned name and input name are exactly equal
            if element["name"] == artist:

                # Retrieve metadata from returned json and assign them to variables
                name = element["name"]
                follower_count = element["followers"]["total"]
                genres = element["genres"]
                href = element["href"]
                artist_id = element["id"]
                popularity = element["popularity"]

                # Add list of retrieved metadata to metadata list
                meta_list.append([follower_count,
                                  genres,
                                  href,
                                  artist_id,
                                  name,
                                  popularity])

                print(f"Data retrieved for {name}.")
                break
        else:
            # Loop ended without break: none of the results matched exactly
            not_found.append(artist)
            print(f"No exact match found for {artist}.")

    # Convert metadata list to dataframe
    df_artist_metadata = pd.DataFrame(meta_list, columns=["Follower_Count",
                                                          "Genres",
                                                          "href",
                                                          "Artist_ID",
                                                          "Name",
                                                          "Popularity"])

    if not_found:
        print(f"\n{len(not_found)} artist(s) without an exact match: {not_found}")

    print("\n... Function successfully executed.")

    # Return dataframe
    return df_artist_metadata
        

#### Call function to retrieve artist metadata

In [5]:
# One search request is issued per artist name, so this cell takes a while for the full list
artist_metadata = get_artist_metadata(list(top_artists["Artist Name"]))

Starting to retrieve metadata...



Retrieving metadata for Drake (1 of 1000).
Data retrieved for Drake.

Retrieving metadata for Ed Sheeran (2 of 1000).
Data retrieved for Ed Sheeran.

Retrieving metadata for Post Malone (3 of 1000).
Data retrieved for Post Malone.

Retrieving metadata for Eminem (4 of 1000).
Data retrieved for Eminem.

Retrieving metadata for Ariana Grande (5 of 1000).
Data retrieved for Ariana Grande.

Retrieving metadata for Justin Bieber (6 of 1000).
Data retrieved for Justin Bieber.

Retrieving metadata for The Weeknd (7 of 1000).
Data retrieved for The Weeknd.

Retrieving metadata for J Balvin (8 of 1000).
Data retrieved for J Balvin.

Retrieving metadata for Bad Bunny (9 of 1000).
Data retrieved for Bad Bunny.

Retrieving metadata for Kanye West (10 of 1000).
Data retrieved for Kanye West.

Retrieving metadata for Khalid (11 of 1000).
Data retrieved for Khalid.

Retrieving metadata for Ozuna (12 of 1000).
Data retrieved for Ozuna.

Retrieving metadata for Coldp

Data retrieved for Bob Marley & The Wailers.

Retrieving metadata for Luis Fonsi (107 of 1000).
Data retrieved for Luis Fonsi.

Retrieving metadata for Green Day (108 of 1000).
Data retrieved for Green Day.

Retrieving metadata for Romeo Santos (109 of 1000).
Data retrieved for Romeo Santos.

Retrieving metadata for Harry Styles (110 of 1000).
Data retrieved for Harry Styles.

Retrieving metadata for KAROL G (111 of 1000).
Data retrieved for KAROL G.

Retrieving metadata for Lil Baby (112 of 1000).
Data retrieved for Lil Baby.

Retrieving metadata for Frank Ocean (113 of 1000).
Data retrieved for Frank Ocean.

Retrieving metadata for Anne-Marie (114 of 1000).
Data retrieved for Anne-Marie.

Retrieving metadata for Metro Boomin (115 of 1000).
Data retrieved for Metro Boomin.

Retrieving metadata for Sech (116 of 1000).
Data retrieved for Sech.

Retrieving metadata for ZAYN (117 of 1000).
Data retrieved for ZAYN.

Retrieving metadata for Trippie Redd (118 of 1000).
Data retrieved for Tri

Data retrieved for Paulo Londra.

Retrieving metadata for Luke Bryan (207 of 1000).
Data retrieved for Luke Bryan.

Retrieving metadata for Rauw Alejandro (208 of 1000).
Data retrieved for Rauw Alejandro.

Retrieving metadata for Oasis (209 of 1000).
Data retrieved for Oasis.

Retrieving metadata for Myke Towers (210 of 1000).
Data retrieved for Myke Towers.

Retrieving metadata for Florence + The Machine (211 of 1000).
Data retrieved for Florence + The Machine.

Retrieving metadata for Florida Georgia Line (212 of 1000).
Data retrieved for Florida Georgia Line.

Retrieving metadata for Alesso (213 of 1000).
Data retrieved for Alesso.

Retrieving metadata for Ne-Yo (214 of 1000).
Data retrieved for Ne-Yo.

Retrieving metadata for Paramore (215 of 1000).
Data retrieved for Paramore.

Retrieving metadata for Jason Mraz (216 of 1000).
Data retrieved for Jason Mraz.

Retrieving metadata for Chance the Rapper (217 of 1000).
Data retrieved for Chance the Rapper.

Retrieving metadata for Slip

Data retrieved for Prince Royce.

Retrieving metadata for Doja Cat (308 of 1000).
Data retrieved for Doja Cat.

Retrieving metadata for CNCO (309 of 1000).
Data retrieved for CNCO.

Retrieving metadata for Mark Ronson (310 of 1000).
Data retrieved for Mark Ronson.

Retrieving metadata for Arcangel (311 of 1000).
Data retrieved for Arcangel.

Retrieving metadata for Yandel (312 of 1000).
Data retrieved for Yandel.

Retrieving metadata for Tones And I (313 of 1000).
Data retrieved for Tones And I.

Retrieving metadata for James Blunt (314 of 1000).
Data retrieved for James Blunt.

Retrieving metadata for Armin van Buuren (315 of 1000).
Data retrieved for Armin van Buuren.

Retrieving metadata for Gusttavo Lima (316 of 1000).
Data retrieved for Gusttavo Lima.

Retrieving metadata for Hans Zimmer (317 of 1000).
Data retrieved for Hans Zimmer.

Retrieving metadata for Akon (318 of 1000).
Data retrieved for Akon.

Retrieving metadata for Rudimental (319 of 1000).
Data retrieved for Rudimenta

Data retrieved for Aventura.

Retrieving metadata for Papa Roach (408 of 1000).
Data retrieved for Papa Roach.

Retrieving metadata for Ski Mask The Slump God (409 of 1000).
Data retrieved for Ski Mask The Slump God.

Retrieving metadata for Years & Years (410 of 1000).
Data retrieved for Years & Years.

Retrieving metadata for Bonez MC (411 of 1000).
Data retrieved for Bonez MC.

Retrieving metadata for TWICE (412 of 1000).
Data retrieved for TWICE.

Retrieving metadata for The Police (413 of 1000).
Data retrieved for The Police.

Retrieving metadata for Juan Magán (414 of 1000).
Data retrieved for Juan Magán.

Retrieving metadata for Rick Ross (415 of 1000).
Data retrieved for Rick Ross.

Retrieving metadata for Norah Jones (416 of 1000).
Data retrieved for Norah Jones.

Retrieving metadata for Soda Stereo (417 of 1000).
Data retrieved for Soda Stereo.

Retrieving metadata for Maren Morris (418 of 1000).
Data retrieved for Maren Morris.

Retrieving metadata for EXO (419 of 1000).
Dat

Data retrieved for Los Ángeles Azules.

Retrieving metadata for Two Door Cinema Club (510 of 1000).
Data retrieved for Two Door Cinema Club.

Retrieving metadata for George Michael (511 of 1000).
Data retrieved for George Michael.

Retrieving metadata for SAINt JHN (512 of 1000).
Data retrieved for SAINt JHN.

Retrieving metadata for Vampire Weekend (513 of 1000).
Data retrieved for Vampire Weekend.

Retrieving metadata for Sabrina Carpenter (514 of 1000).
Data retrieved for Sabrina Carpenter.

Retrieving metadata for All Time Low (515 of 1000).
Data retrieved for All Time Low.

Retrieving metadata for Earth, Wind & Fire (516 of 1000).
Data retrieved for Earth, Wind & Fire.

Retrieving metadata for Stormzy (517 of 1000).
Data retrieved for Stormzy.

Retrieving metadata for Brytiago (518 of 1000).
Data retrieved for Brytiago.

Retrieving metadata for Limp Bizkit (519 of 1000).
Data retrieved for Limp Bizkit.

Retrieving metadata for Cali Y El Dandee (520 of 1000).
Data retrieved for Cal

Data retrieved for CHVRCHES.

Retrieving metadata for Arizona Zervas (609 of 1000).
Data retrieved for Arizona Zervas.

Retrieving metadata for fun. (610 of 1000).
Data retrieved for fun..

Retrieving metadata for Dierks Bentley (611 of 1000).
Data retrieved for Dierks Bentley.

Retrieving metadata for John Newman (612 of 1000).
Data retrieved for John Newman.

Retrieving metadata for Jarabe De Palo (613 of 1000).
Data retrieved for Jarabe De Palo.

Retrieving metadata for Yellow Claw (614 of 1000).
Data retrieved for Yellow Claw.

Retrieving metadata for Joey Bada$$ (615 of 1000).
Data retrieved for Joey Bada$$.

Retrieving metadata for Glass Animals (616 of 1000).
Data retrieved for Glass Animals.

Retrieving metadata for DJ Luian (617 of 1000).
Data retrieved for DJ Luian.

Retrieving metadata for Ciara (618 of 1000).
Data retrieved for Ciara.

Retrieving metadata for Sfera Ebbasta (619 of 1000).
Data retrieved for Sfera Ebbasta.

Retrieving metadata for James TW (620 of 1000).
Data

Data retrieved for Alice In Chains.

Retrieving metadata for Sufjan Stevens (710 of 1000).
Data retrieved for Sufjan Stevens.

Retrieving metadata for Kris Kross Amsterdam (711 of 1000).
Data retrieved for Kris Kross Amsterdam.

Retrieving metadata for Showtek (712 of 1000).
Data retrieved for Showtek.

Retrieving metadata for Marco Antonio Solís (713 of 1000).
Data retrieved for Marco Antonio Solís.

Retrieving metadata for Seether (714 of 1000).
Data retrieved for Seether.

Retrieving metadata for MNEK (715 of 1000).
Data retrieved for MNEK.

Retrieving metadata for Los Tigres Del Norte (716 of 1000).
Data retrieved for Los Tigres Del Norte.

Retrieving metadata for Charlie Brown Jr. (717 of 1000).
Data retrieved for Charlie Brown Jr..

Retrieving metadata for Cash Cash (718 of 1000).
Data retrieved for Cash Cash.

Retrieving metadata for Luciano (719 of 1000).
Data retrieved for Luciano.

Retrieving metadata for Fergie (720 of 1000).
Data retrieved for Fergie.

Retrieving metadata f

Data retrieved for Yung Gravy.

Retrieving metadata for KSHMR (813 of 1000).
Data retrieved for KSHMR.

Retrieving metadata for Willie Nelson (814 of 1000).
Data retrieved for Willie Nelson.

Retrieving metadata for Dido (815 of 1000).
Data retrieved for Dido.

Retrieving metadata for Nat King Cole (816 of 1000).
Data retrieved for Nat King Cole.

Retrieving metadata for Brett Young (817 of 1000).
Data retrieved for Brett Young.

Retrieving metadata for MadeinTYO (818 of 1000).
Data retrieved for MadeinTYO.

Retrieving metadata for Mary J. Blige (819 of 1000).
Data retrieved for Mary J. Blige.

Retrieving metadata for La Adictiva Banda San José de Mesillas (820 of 1000).
Data retrieved for La Adictiva Banda San José de Mesillas.

Retrieving metadata for Banda Los Recoditos (821 of 1000).
Data retrieved for Banda Los Recoditos.

Retrieving metadata for Shelley FKA DRAM (822 of 1000).
Data retrieved for Shelley FKA DRAM.

Retrieving metadata for Cyndi Lauper (823 of 1000).
Data retrieved

Data retrieved for Wu-Tang Clan.

Retrieving metadata for Bushido (911 of 1000).
Data retrieved for Bushido.

Retrieving metadata for Lucas Lucco (912 of 1000).
Data retrieved for Lucas Lucco.

Retrieving metadata for Ultimo (913 of 1000).
Data retrieved for Ultimo.

Retrieving metadata for Gemitaiz (914 of 1000).
Data retrieved for Gemitaiz.

Retrieving metadata for KC Rebell (915 of 1000).
Data retrieved for KC Rebell.

Retrieving metadata for Deftones (916 of 1000).
Data retrieved for Deftones.

Retrieving metadata for Pixies (917 of 1000).
Data retrieved for Pixies.

Retrieving metadata for DVBBS (918 of 1000).
Data retrieved for DVBBS.

Retrieving metadata for beabadoobee (919 of 1000).
Data retrieved for beabadoobee.

Retrieving metadata for A$AP Mob (920 of 1000).
Data retrieved for A$AP Mob.

Retrieving metadata for James Blake (921 of 1000).
Data retrieved for James Blake.

Retrieving metadata for Tom Petty and the Heartbreakers (922 of 1000).
Data retrieved for Tom Petty and 

In [6]:
print(artist_metadata.shape)
artist_metadata.head()

(1000, 6)


Unnamed: 0,Follower_Count,Genres,href,Artist_ID,Name,Popularity
0,50509972,"[canadian hip hop, canadian pop, hip hop, pop ...",https://api.spotify.com/v1/artists/3TVXtAsR1In...,3TVXtAsR1Inumwj472S9r4,Drake,100
1,71571877,"[pop, uk pop]",https://api.spotify.com/v1/artists/6eUKZXaKkcv...,6eUKZXaKkcviH0Ku9w2n3V,Ed Sheeran,93
2,29076628,"[dfw rap, melodic rap, rap]",https://api.spotify.com/v1/artists/246dkjvS1zL...,246dkjvS1zLTtiykXe5h60,Post Malone,95
3,38966309,"[detroit hip hop, hip hop, rap]",https://api.spotify.com/v1/artists/7dGJo4pcD2V...,7dGJo4pcD2V6oG8kP0tJRR,Eminem,94
4,52367445,"[dance pop, pop, post-teen pop]",https://api.spotify.com/v1/artists/66CXWjxzNUs...,66CXWjxzNUsdJxJ2JdwvnR,Ariana Grande,98


#### Merge retrieved metadata to artists list

In [12]:
# Inner join on the artist name; the duplicated name column coming from the
# metadata frame is dropped afterwards
top_artists_with_metadata = (
    top_artists
    .merge(artist_metadata, left_on="Artist Name", right_on="Name")
    .drop(columns="Name")
)
top_artists_with_metadata.head()

Unnamed: 0,Artist Name,Streams,Tracks,1b+,100m+,10m+,1m+,Last Update,Follower_Count,Genres,href,Artist_ID,Popularity
0,Drake,34422345387,223,3,90,208,219,11.06.20,50509972,"[canadian hip hop, canadian pop, hip hop, pop ...",https://api.spotify.com/v1/artists/3TVXtAsR1In...,3TVXtAsR1Inumwj472S9r4,100
1,Ed Sheeran,26548511656,204,5,49,140,174,11.06.20,71571877,"[pop, uk pop]",https://api.spotify.com/v1/artists/6eUKZXaKkcv...,6eUKZXaKkcviH0Ku9w2n3V,93
2,Post Malone,21650426333,64,8,47,57,60,11.06.20,29076628,"[dfw rap, melodic rap, rap]",https://api.spotify.com/v1/artists/246dkjvS1zL...,246dkjvS1zLTtiykXe5h60,95
3,Eminem,20367427868,265,1,46,197,239,11.06.20,38966309,"[detroit hip hop, hip hop, rap]",https://api.spotify.com/v1/artists/7dGJo4pcD2V...,7dGJo4pcD2V6oG8kP0tJRR,94
4,Ariana Grande,20209530076,181,2,46,107,157,11.06.20,52367445,"[dance pop, pop, post-teen pop]",https://api.spotify.com/v1/artists/66CXWjxzNUs...,66CXWjxzNUsdJxJ2JdwvnR,98


In [9]:
top_artists_with_metadata.shape

(1000, 13)

#### Export dataframe to csv

In [11]:
# The dataframe index is written as the first CSV column (pandas default)
top_artists_with_metadata.to_csv(DATA_DIRECTORY_PATH + "/artists_with_metadata.csv")

## Get albums of top artists
After the artists dataframe is created, the next step is to find the albums that each artist has published. This is achieved using the [get-an-artists-albums endpoint](https://developer.spotify.com/documentation/web-api/reference/artists/get-artists-albums/), which is used in the **get_albums** function. Using this endpoint, up to 50 albums can be retrieved per request. To account for artists who have published more than 50 albums, the **album_cursor** function is used to go through all available pages of an artist's albums, ensuring that all relevant albums are retrieved. Relevant information is then fetched from the returned album objects and stored in a dataframe.

In [8]:
# Reload the exported artists file so this section can run without repeating the API calls above.
# NOTE(review): after the CSV round trip the list-valued "Genres" column presumably comes
# back as plain strings — confirm before relying on it as a list.
top_artists_with_metadata = pd.read_csv(DATA_DIRECTORY_PATH + "/artists_with_metadata.csv", index_col=0)

In [50]:
def album_cursor(artist_id):
    """
    Collects every album object Spotify lists for one artist.

    The first page (at most 50 albums of type "album") is requested through
    the module-level client ``sp``; further pages are followed for as long as
    the current page reports a ``next`` link.

        Parameters:
            artist_id (string): Spotify ID of the artist.

        Returns:
            albums (list): List of all album objects of the artist.
    """
    collected = []

    # Start with the first page of results
    page = sp.artist_albums(artist_id, album_type="album", limit=50)

    while True:
        # Keep the album objects of the current page
        collected.extend(page['items'])

        # Stop as soon as there is no further page
        if not page['next']:
            break

        # Otherwise move on to the next page
        page = sp.next(page)

    return collected

In [51]:
def get_albums(artist_ids):
    """
    Builds a dataframe with one row per album of the given artists.

    For each artist ID all albums are fetched with ``album_cursor`` and a
    fixed set of fields is copied from every album object. A progress line is
    printed after every tenth artist.

        Parameters:
            artist_ids (list): List of Spotify IDs of the artists

        Returns:
            df_albums (Dataframe): Pandas dataframe containing albums and metadata
    """

    print("Starting to retrieve albums...\n\n")

    # Fields copied from each album object, in the order of the album columns below
    album_fields = ("id",
                    "href",
                    "name",
                    "release_date",
                    "release_date_precision",
                    "total_tracks",
                    "uri")

    column_names = ["Artist ID",
                    "Album ID",
                    "href",
                    "Album Name",
                    "Release Date",
                    "Release Date Precision",
                    "Total Tracks",
                    "URI"]

    rows = []

    for position, artist_id in enumerate(artist_ids, start=1):

        # One row per album: the artist ID followed by the selected album fields
        for album in album_cursor(artist_id):
            rows.append([artist_id] + [album[field] for field in album_fields])

        # Progress report after every tenth artist
        if position % 10 == 0:
            print(f"{position} of {len(artist_ids)} artists' albums retrieved.\n")

    # Turn the collected rows into a dataframe
    df_albums = pd.DataFrame(rows, columns=column_names)

    print("\n... Function successfully executed.")

    return df_albums

#### Call function to retrieve albums and their metadata

In [65]:
# Fetch the albums of every artist; at least one API request is made per artist ID
df_albums = get_albums(list(top_artists_with_metadata["Artist_ID"]))

Starting to retrieve albums...


10 of 1000 artists' albums retrieved.

20 of 1000 artists' albums retrieved.

30 of 1000 artists' albums retrieved.

40 of 1000 artists' albums retrieved.

50 of 1000 artists' albums retrieved.

60 of 1000 artists' albums retrieved.

70 of 1000 artists' albums retrieved.

80 of 1000 artists' albums retrieved.

90 of 1000 artists' albums retrieved.

100 of 1000 artists' albums retrieved.

110 of 1000 artists' albums retrieved.

120 of 1000 artists' albums retrieved.

130 of 1000 artists' albums retrieved.

140 of 1000 artists' albums retrieved.

150 of 1000 artists' albums retrieved.

160 of 1000 artists' albums retrieved.

170 of 1000 artists' albums retrieved.

180 of 1000 artists' albums retrieved.

190 of 1000 artists' albums retrieved.

200 of 1000 artists' albums retrieved.

210 of 1000 artists' albums retrieved.

220 of 1000 artists' albums retrieved.

230 of 1000 artists' albums retrieved.

240 of 1000 artists' albums retrieved.

250 of 1000 arti

In [66]:
df_albums.head()

Unnamed: 0,Artist ID,Album ID,href,Album Name,Release Date,Release Date Precision,Total Tracks,URI
0,3TVXtAsR1Inumwj472S9r4,6OQ9gBfg5EXeNAEwGSs6jK,https://api.spotify.com/v1/albums/6OQ9gBfg5EXe...,Dark Lane Demo Tapes,2020-05-01,day,14,spotify:album:6OQ9gBfg5EXeNAEwGSs6jK
1,3TVXtAsR1Inumwj472S9r4,45c1tgTktunRMmfh3WVh8U,https://api.spotify.com/v1/albums/45c1tgTktunR...,Dark Lane Demo Tapes,2020-05-01,day,14,spotify:album:45c1tgTktunRMmfh3WVh8U
2,3TVXtAsR1Inumwj472S9r4,7dqpveMVcWgbzqYrOdkFTD,https://api.spotify.com/v1/albums/7dqpveMVcWgb...,Care Package,2019-08-02,day,17,spotify:album:7dqpveMVcWgbzqYrOdkFTD
3,3TVXtAsR1Inumwj472S9r4,6CY70qRxPutN3VKfYhNREa,https://api.spotify.com/v1/albums/6CY70qRxPutN...,Care Package,2019-08-02,day,17,spotify:album:6CY70qRxPutN3VKfYhNREa
4,3TVXtAsR1Inumwj472S9r4,2podUJIFG8hLfFz7Kqe8yJ,https://api.spotify.com/v1/albums/2podUJIFG8hL...,So Far Gone,2019-02-14,day,18,spotify:album:2podUJIFG8hLfFz7Kqe8yJ


#### Merge albums dataframe with artists dataframe to get artist name

In [67]:
# Attach the artist name via an inner join on the artist ID, then drop the
# redundant ID column that comes from the artists frame
df_albums = (
    df_albums
    .merge(top_artists_with_metadata[["Artist_ID", "Artist Name"]],
           left_on="Artist ID",
           right_on="Artist_ID")
    .drop(columns="Artist_ID")
)
df_albums.head()

Unnamed: 0,Artist ID,Album ID,href,Album Name,Release Date,Release Date Precision,Total Tracks,URI,Artist Name
0,3TVXtAsR1Inumwj472S9r4,6OQ9gBfg5EXeNAEwGSs6jK,https://api.spotify.com/v1/albums/6OQ9gBfg5EXe...,Dark Lane Demo Tapes,2020-05-01,day,14,spotify:album:6OQ9gBfg5EXeNAEwGSs6jK,Drake
1,3TVXtAsR1Inumwj472S9r4,45c1tgTktunRMmfh3WVh8U,https://api.spotify.com/v1/albums/45c1tgTktunR...,Dark Lane Demo Tapes,2020-05-01,day,14,spotify:album:45c1tgTktunRMmfh3WVh8U,Drake
2,3TVXtAsR1Inumwj472S9r4,7dqpveMVcWgbzqYrOdkFTD,https://api.spotify.com/v1/albums/7dqpveMVcWgb...,Care Package,2019-08-02,day,17,spotify:album:7dqpveMVcWgbzqYrOdkFTD,Drake
3,3TVXtAsR1Inumwj472S9r4,6CY70qRxPutN3VKfYhNREa,https://api.spotify.com/v1/albums/6CY70qRxPutN...,Care Package,2019-08-02,day,17,spotify:album:6CY70qRxPutN3VKfYhNREa,Drake
4,3TVXtAsR1Inumwj472S9r4,2podUJIFG8hLfFz7Kqe8yJ,https://api.spotify.com/v1/albums/2podUJIFG8hL...,So Far Gone,2019-02-14,day,18,spotify:album:2podUJIFG8hLfFz7Kqe8yJ,Drake


In [68]:
df_albums.shape

(22806, 9)

In [69]:
len(df_albums["Album ID"].unique())

22663

#### Drop duplicate album IDs

In [70]:
# Keep only the first row per album ID (pandas default keep="first"); the index is not reset
df_albums = df_albums.drop_duplicates(subset=["Album ID"])

In [71]:
df_albums.shape

(22663, 9)

In [72]:
df_albums.head()

Unnamed: 0,Artist ID,Album ID,href,Album Name,Release Date,Release Date Precision,Total Tracks,URI,Artist Name
0,3TVXtAsR1Inumwj472S9r4,6OQ9gBfg5EXeNAEwGSs6jK,https://api.spotify.com/v1/albums/6OQ9gBfg5EXe...,Dark Lane Demo Tapes,2020-05-01,day,14,spotify:album:6OQ9gBfg5EXeNAEwGSs6jK,Drake
1,3TVXtAsR1Inumwj472S9r4,45c1tgTktunRMmfh3WVh8U,https://api.spotify.com/v1/albums/45c1tgTktunR...,Dark Lane Demo Tapes,2020-05-01,day,14,spotify:album:45c1tgTktunRMmfh3WVh8U,Drake
2,3TVXtAsR1Inumwj472S9r4,7dqpveMVcWgbzqYrOdkFTD,https://api.spotify.com/v1/albums/7dqpveMVcWgb...,Care Package,2019-08-02,day,17,spotify:album:7dqpveMVcWgbzqYrOdkFTD,Drake
3,3TVXtAsR1Inumwj472S9r4,6CY70qRxPutN3VKfYhNREa,https://api.spotify.com/v1/albums/6CY70qRxPutN...,Care Package,2019-08-02,day,17,spotify:album:6CY70qRxPutN3VKfYhNREa,Drake
4,3TVXtAsR1Inumwj472S9r4,2podUJIFG8hLfFz7Kqe8yJ,https://api.spotify.com/v1/albums/2podUJIFG8hL...,So Far Gone,2019-02-14,day,18,spotify:album:2podUJIFG8hLfFz7Kqe8yJ,Drake


#### Store albums dataframe as csv

In [73]:
# The dataframe index is written as the first CSV column (pandas default)
df_albums.to_csv(DATA_DIRECTORY_PATH + "/album_data.csv")

## Get tracks from albums
Now that the albums are available, the respective tracks need to be identified. Whenever information about more than one album is to be retrieved, Spotify advises using the [albums endpoint](https://developer.spotify.com/documentation/web-api/reference/albums/get-album/), which is called in the **get_album_metadata** function. In fact, this function has two purposes. First, it captures the track IDs of all albums that are fed into it. Second, it retrieves some album metadata that is not available from the endpoint used to get the albums for the artist IDs. As the albums endpoint only accepts a maximum of 20 album IDs at a time, the data is fed in batches. After the function finishes executing, it returns both a list of all the tracks and a dataframe with the additional album metadata, which is, in turn, merged with the albums dataframe from above.

In [77]:
# Reload the exported albums file so this section can run without repeating the API calls above
df_albums = pd.read_csv(DATA_DIRECTORY_PATH + "/album_data.csv", index_col=0)

In [78]:
def track_cursor(tracks):
    """
    Walks through a paged tracks object and gathers the IDs of all its tracks.

    Further pages are fetched through the module-level client ``sp`` for as
    long as the current page reports a ``next`` link.

        Parameters:
            tracks (dict): Dictionary containing information about tracks from an album

        Returns:
            track_ids (list): List of all Spotify track IDs of album.
    """
    collected_ids = []
    page = tracks

    while True:
        # Take the IDs of every track on the current page
        for entry in page["items"]:
            collected_ids.append(entry["id"])

        # No further page: all IDs have been gathered
        if not page["next"]:
            break

        # Otherwise continue with the following page
        page = sp.next(page)

    return collected_ids

In [79]:
def get_album_metadata(album_ids):
    """
    Takes a list of album IDs as input and returns album metadata and track list.

    The IDs are sent to the albums endpoint (through the module-level client
    ``sp``) in batches of at most 20. Only non-empty batches are requested, so
    an empty input or an ID count that is a multiple of 20 no longer results
    in a request without IDs. Entries returned as ``None`` are skipped and
    counted.

        Parameters:
            album_ids (list): List of Spotify album IDs

        Returns:
            df_albums (dataframe): Pandas dataframe with metadata for albums
                (columns: Album ID, Copyrights, Popularity)
            track_list (list): List of Spotify track IDs
    """
    print("Starting to retrieve album metadata...\n\n")

    # Initialize list for album metadata
    album_info = list()

    # Initialize list for track IDs
    track_list = list()

    # Maximum number of IDs sent to the albums endpoint per request
    batch_size = 20

    # Number of returned entries that carried no album object
    skipped = 0

    id_count = len(album_ids)

    # Step through the IDs batch by batch; range() yields no empty trailing batch
    for start in range(0, id_count, batch_size):
        current_batch = album_ids[start:start + batch_size]

        # Retrieve album data for IDs of current batch
        albums = sp.albums(current_batch)

        # Iterate over returned albums and assign info to variables
        for album in albums["albums"]:

            # Spotify documents null entries for IDs it cannot resolve; skip those
            if album is None:
                skipped += 1
                continue

            album_id = album["id"]
            copyrights = [entry["text"] for entry in album["copyrights"]]
            popularity = album["popularity"]

            # Add list of retrieved metadata to metadata list
            album_info.append([album_id,
                               copyrights,
                               popularity])

            # Add new tracks to track list by retrieving them through the track_cursor function
            track_list.extend(track_cursor(album["tracks"]))

        # Progress report; `start` equals the number of IDs handled before this batch
        if start % 100 == 0:
            print(f"{start} of {id_count} artists' albums retrieved.\n")

    # Convert metadata list to dataframe
    df_albums = pd.DataFrame(album_info, columns=["Album ID",
                                                  "Copyrights",
                                                  "Popularity"])

    if skipped:
        print(f"{skipped} album ID(s) could not be resolved and were skipped.")

    print("\n... Function successfully executed.")

    # Return album dataframe and track list
    return df_albums, track_list

#### Call function to retrieve album metadata and track list

In [80]:
# Returns the additional album metadata and the IDs of all tracks found on the albums
album_info, track_list = get_album_metadata(list(df_albums["Album ID"]))

Starting to retrieve album metadata...


0 of 22663 artists' albums retrieved.

100 of 22663 artists' albums retrieved.

200 of 22663 artists' albums retrieved.

300 of 22663 artists' albums retrieved.

400 of 22663 artists' albums retrieved.

500 of 22663 artists' albums retrieved.

600 of 22663 artists' albums retrieved.

700 of 22663 artists' albums retrieved.

800 of 22663 artists' albums retrieved.

900 of 22663 artists' albums retrieved.

1000 of 22663 artists' albums retrieved.

1100 of 22663 artists' albums retrieved.

1200 of 22663 artists' albums retrieved.

1300 of 22663 artists' albums retrieved.

1400 of 22663 artists' albums retrieved.

1500 of 22663 artists' albums retrieved.

1600 of 22663 artists' albums retrieved.

1700 of 22663 artists' albums retrieved.

1800 of 22663 artists' albums retrieved.

1900 of 22663 artists' albums retrieved.

2000 of 22663 artists' albums retrieved.

2100 of 22663 artists' albums retrieved.

2200 of 22663 artists' albums retrieved.

2300 

19300 of 22663 artists' albums retrieved.

19400 of 22663 artists' albums retrieved.

19500 of 22663 artists' albums retrieved.

19600 of 22663 artists' albums retrieved.

19700 of 22663 artists' albums retrieved.

19800 of 22663 artists' albums retrieved.

19900 of 22663 artists' albums retrieved.

20000 of 22663 artists' albums retrieved.

20100 of 22663 artists' albums retrieved.

20200 of 22663 artists' albums retrieved.

20300 of 22663 artists' albums retrieved.

20400 of 22663 artists' albums retrieved.

20500 of 22663 artists' albums retrieved.

20600 of 22663 artists' albums retrieved.

20700 of 22663 artists' albums retrieved.

20800 of 22663 artists' albums retrieved.

20900 of 22663 artists' albums retrieved.

21000 of 22663 artists' albums retrieved.

21100 of 22663 artists' albums retrieved.

21200 of 22663 artists' albums retrieved.

21300 of 22663 artists' albums retrieved.

21400 of 22663 artists' albums retrieved.

21500 of 22663 artists' albums retrieved.

21600 of 22

In [81]:
album_info

Unnamed: 0,Album ID,Copyrights,Popularity
0,6OQ9gBfg5EXeNAEwGSs6jK,"[© 2020 OVO, under exclusive license to Republ...",86
1,45c1tgTktunRMmfh3WVh8U,"[© 2020 OVO, under exclusive license to Republ...",58
2,7dqpveMVcWgbzqYrOdkFTD,"[© 2019 OVO, ℗ 2019 OVO]",73
3,6CY70qRxPutN3VKfYhNREa,"[© 2019 OVO, ℗ 2019 OVO]",43
4,2podUJIFG8hLfFz7Kqe8yJ,"[© 2019 Young Money/Cash Money Records, ℗ 2019...",42
...,...,...,...
22658,4uQ5kFmXQdCxz3WvM4UUzy,"[2019 Partisan Records, 2019 Partisan Records]",70
22659,5bP82ZIls6rzhpf5Qu6AzC,"[© 2017 Partisan Records, ℗ 2017 Partisan Reco...",69
22660,1bUuI29ZOu2QsU9kzM9Cvw,[2017 Pod / Inertia under exclusive licence fr...,53
22661,2mxFsS5yylSTHNivV53HoA,"[2017 Partisan Records, 2017 Partisan Records]",75


#### Merge album metadata with remaining information on albums

In [82]:
# Inner join of the albums frame with the additional metadata on the shared album ID
df_albums_with_metadata = df_albums.merge(album_info, on="Album ID")

#### Store final album dataset

In [84]:
# The dataframe index is written as the first CSV column (pandas default)
df_albums_with_metadata.to_csv(DATA_DIRECTORY_PATH + "/albums_with_metadata.csv")

## Get general information about tracks
The list of track IDs allows us to gather more information about the tracks using the [get-several-tracks endpoint](https://developer.spotify.com/documentation/web-api/reference/tracks/get-several-tracks/). Just like the [albums endpoint](https://developer.spotify.com/documentation/web-api/reference/albums/get-album/), this endpoint allows for the retrieval of several items at once, here with a maximum of 50 tracks per request. For that reason, the **get_track_metadata** function also feeds the data to the API in batches before creating a dataframe out of the returned data.

In [85]:
def get_track_metadata(track_ids):
    """
    Takes list of track IDs as input and returns dataframe with track metadata.
    
    The IDs are sent to the API in batches of at most 50. Entries that come back
    as None instead of a track object are reported and skipped, so the returned
    dataframe can contain fewer rows than there are IDs.
    
        Parameters:
            track_ids (list): List of Spotify track IDs
            
        Returns:
            df_tracks (dataframe): Pandas dataframe with metadata on tracks
    """
    
    # The tracks request only accepts 50 IDs at a time
    batch_size = 50
    
    # Initialize list to store track metadata
    track_info = list()
    
    id_count = len(track_ids)
    
    # Walk over the IDs in steps of one batch; an empty input sends no request at all
    for batch_number, start in enumerate(range(0, id_count, batch_size), start=1):
        
        # Slicing stops at the end of the list, so the last batch may be shorter
        current_batch = track_ids[start:start + batch_size]
        
        # Retrieve information on tracks from current batch
        tracks = sp.tracks(current_batch)
        
        # Iterate over tracks and collect relevant metadata
        for track in tracks["tracks"]:
            
            # An unresolved ID yields None in place of a track object; indexing it
            # would raise a TypeError and abort the whole run, so skip it instead
            if track is None:
                print("Track without metadata detected!\n")
                continue
            
            album = track["album"]
            artists = track["artists"]
            
            # Store metadata in list (same order as the dataframe columns below)
            track_info.append([track["name"],
                               [artist["name"] for artist in artists],
                               album["name"],
                               track["duration_ms"],
                               track["explicit"],
                               track["popularity"],
                               track["id"],
                               [artist["id"] for artist in artists],
                               album["id"],
                               track["href"],
                               track["uri"]])
        
        # Number of IDs requested so far, counted in full batches
        ctr = batch_number * batch_size
        
        if ctr % 10000 == 0:
            print(f"{ctr} of {id_count} tracks retrieved.\n")
    
    # Convert metadata list to dataframe
    df_tracks = pd.DataFrame(track_info, columns=["Track Name",
                                                  "Artist Names",
                                                  "Album Name",
                                                  "Duration in ms",
                                                  "Explicit",
                                                  "Popularity",
                                                  "Track ID",
                                                  "Artist IDs",
                                                  "Album ID",
                                                  "Href",
                                                  "URI"])
    
    print("\n... Function successfully executed.")
    
    # Return dataframe
    return df_tracks

#### Call function to retrieve track metadata

In [86]:
# Fetch the metadata of every track in track_list (progress is printed by the function)
df_tracks = get_track_metadata(track_ids=track_list)

10000 of 445578 tracks retrieved.

20000 of 445578 tracks retrieved.

30000 of 445578 tracks retrieved.

40000 of 445578 tracks retrieved.

50000 of 445578 tracks retrieved.

60000 of 445578 tracks retrieved.

70000 of 445578 tracks retrieved.

80000 of 445578 tracks retrieved.

90000 of 445578 tracks retrieved.

100000 of 445578 tracks retrieved.

110000 of 445578 tracks retrieved.

120000 of 445578 tracks retrieved.

130000 of 445578 tracks retrieved.

140000 of 445578 tracks retrieved.

150000 of 445578 tracks retrieved.

160000 of 445578 tracks retrieved.

170000 of 445578 tracks retrieved.

180000 of 445578 tracks retrieved.

190000 of 445578 tracks retrieved.

200000 of 445578 tracks retrieved.

210000 of 445578 tracks retrieved.

220000 of 445578 tracks retrieved.

230000 of 445578 tracks retrieved.

240000 of 445578 tracks retrieved.

250000 of 445578 tracks retrieved.

260000 of 445578 tracks retrieved.

270000 of 445578 tracks retrieved.

280000 of 445578 tracks retrieved.

2

In [87]:
# Show the track metadata dataframe as the cell output
df_tracks

Unnamed: 0,Track Name,Artist Names,Album Name,Duration in ms,Explicit,Popularity,Track ID,Artist IDs,Album ID,Href,URI
0,Deep Pockets,[Drake],Dark Lane Demo Tapes,222928,True,63,3IvMYBE7A3c7to1aEcfFJk,[3TVXtAsR1Inumwj472S9r4],6OQ9gBfg5EXeNAEwGSs6jK,https://api.spotify.com/v1/tracks/3IvMYBE7A3c7...,spotify:track:3IvMYBE7A3c7to1aEcfFJk
1,When To Say When,[Drake],Dark Lane Demo Tapes,223124,True,65,5TCBWmEBrin7etRa4Lswr1,[3TVXtAsR1Inumwj472S9r4],6OQ9gBfg5EXeNAEwGSs6jK,https://api.spotify.com/v1/tracks/5TCBWmEBrin7...,spotify:track:5TCBWmEBrin7etRa4Lswr1
2,Chicago Freestyle (feat. Giveon),"[Drake, Giveon]",Dark Lane Demo Tapes,220487,True,84,4wVOKKEHUJxHCFFNUWDn0B,"[3TVXtAsR1Inumwj472S9r4, 4fxd5Ee7UefO4CUXgwJ7IP]",6OQ9gBfg5EXeNAEwGSs6jK,https://api.spotify.com/v1/tracks/4wVOKKEHUJxH...,spotify:track:4wVOKKEHUJxHCFFNUWDn0B
3,Not You Too (feat. Chris Brown),"[Drake, Chris Brown]",Dark Lane Demo Tapes,269680,True,68,3Q4gttWQ6hxqWOa3tHoTNi,"[3TVXtAsR1Inumwj472S9r4, 7bXgB6jMjp9ATFy66eO08Z]",6OQ9gBfg5EXeNAEwGSs6jK,https://api.spotify.com/v1/tracks/3Q4gttWQ6hxq...,spotify:track:3Q4gttWQ6hxqWOa3tHoTNi
4,Toosie Slide,[Drake],Dark Lane Demo Tapes,247058,True,80,466cKvZn1j45IpxDdYZqdA,[3TVXtAsR1Inumwj472S9r4],6OQ9gBfg5EXeNAEwGSs6jK,https://api.spotify.com/v1/tracks/466cKvZn1j45...,spotify:track:466cKvZn1j45IpxDdYZqdA
...,...,...,...,...,...,...,...,...,...,...,...
445573,Sweet,[Cigarettes After Sex],Cigarettes After Sex,291640,False,42,2KhrPRV0V1FS2l4eQMJUWt,[1QAJqy2dA3ihHBFIHRphZj],5chTLnnxlxQVAgEv6YbEBe,https://api.spotify.com/v1/tracks/2KhrPRV0V1FS...,spotify:track:2KhrPRV0V1FS2l4eQMJUWt
445574,Opera House,[Cigarettes After Sex],Cigarettes After Sex,364680,False,33,2Ddfm2NQwTRsf7YVlt258S,[1QAJqy2dA3ihHBFIHRphZj],5chTLnnxlxQVAgEv6YbEBe,https://api.spotify.com/v1/tracks/2Ddfm2NQwTRs...,spotify:track:2Ddfm2NQwTRsf7YVlt258S
445575,Truly,[Cigarettes After Sex],Cigarettes After Sex,243640,False,31,6i2XzqnaxvpNVTUw8EjPMU,[1QAJqy2dA3ihHBFIHRphZj],5chTLnnxlxQVAgEv6YbEBe,https://api.spotify.com/v1/tracks/6i2XzqnaxvpN...,spotify:track:6i2XzqnaxvpNVTUw8EjPMU
445576,John Wayne,[Cigarettes After Sex],Cigarettes After Sex,258013,False,37,4xbN7giFhdGbSQTPyG178F,[1QAJqy2dA3ihHBFIHRphZj],5chTLnnxlxQVAgEv6YbEBe,https://api.spotify.com/v1/tracks/4xbN7giFhdGb...,spotify:track:4xbN7giFhdGbSQTPyG178F


## Get audio features of tracks

Aside from the general track metadata, the Spotify API also provides audio features for a track, which give an idea of what the track sounds like. They can be requested with the [audio-features endpoint](https://developer.spotify.com/documentation/web-api/reference/tracks/get-audio-features/), which, in this case, is called by the **get_audio_features** function. It works largely the same way as the **get_track_metadata** function, except that it sends the track IDs in batches of up to 100. The other notable difference is that not every song has audio features: reading the features of such a song raises a TypeError, which the function catches and reports before moving on to the next track.

In [88]:
def get_audio_features(track_ids):
    """
    Requests the audio features of the given tracks and returns them as dataframe.
    
    The IDs are sent in batches of at most 100. A track whose features cannot be
    read (the lookup raises a TypeError) is reported and left out of the result.
    
        Parameters:
            track_ids (list): List of Spotify track IDs
            
        Returns:
            df_audio_features (dataframe): Pandas dataframe with audio features of tracks
    """
    
    # Dataframe column label paired with the key it is read from, in column order
    feature_fields = (("Track ID", "id"),
                      ("Acousticness", "acousticness"),
                      ("Danceability", "danceability"),
                      ("Energy", "energy"),
                      ("Instrumentalness", "instrumentalness"),
                      ("Liveness", "liveness"),
                      ("Loudness", "loudness"),
                      ("Speechiness", "speechiness"),
                      ("Valence", "valence"),
                      ("Tempo", "tempo"),
                      ("Time Signature", "time_signature"),
                      ("Key", "key"),
                      ("Mode", "mode"))
    
    # Function only accepts 100 tracks at a time, therefore feed data in batches
    batch_size = 100
    
    id_count = len(track_ids)
    
    # One list of feature values per track
    rows = []
    
    # Stepping by the batch size never produces an empty batch
    for batch_number, start in enumerate(range(0, id_count, batch_size), start=1):
        
        # Slicing stops at the end of the list, so the last batch may be shorter
        batch = track_ids[start:start + batch_size]
        
        # Retrieve audio features of tracks from current batch and iterate over them
        for features in sp.audio_features(batch):
            
            # Reading the values only works if audio features are present
            try:
                rows.append([features[key] for _, key in feature_fields])
            
            # Handle type errors by printing out error message and resuming with next track
            except TypeError:
                print("Song without audio features detected!\n")
        
        # Number of IDs requested so far, counted in full batches
        retrieved = batch_number * batch_size
        
        if retrieved % 10000 == 0:
            print(f"Metadata {retrieved} of {id_count} tracks retrieved.\n")
    
    # Convert list of audio features to dataframe
    df_audio_features = pd.DataFrame(rows, columns=[label for label, _ in feature_fields])
    
    print("\n... Function successfully executed.")
    
    # Return audio features dataframe
    return df_audio_features

#### Call function to get audio features of each track

In [89]:
# Fetch the audio features of every track in track_list (progress is printed by the function)
df_audio_features = get_audio_features(track_ids=track_list)

Metadata 10000 of 445578 tracks retrieved.

Metadata 20000 of 445578 tracks retrieved.

Metadata 30000 of 445578 tracks retrieved.

Metadata 40000 of 445578 tracks retrieved.

Metadata 50000 of 445578 tracks retrieved.

Metadata 60000 of 445578 tracks retrieved.

Metadata 70000 of 445578 tracks retrieved.

Metadata 80000 of 445578 tracks retrieved.

Song without audio features detected!

Metadata 90000 of 445578 tracks retrieved.

Metadata 100000 of 445578 tracks retrieved.

Metadata 110000 of 445578 tracks retrieved.

Metadata 120000 of 445578 tracks retrieved.

Metadata 130000 of 445578 tracks retrieved.

Metadata 140000 of 445578 tracks retrieved.

Metadata 150000 of 445578 tracks retrieved.

Metadata 160000 of 445578 tracks retrieved.

Metadata 170000 of 445578 tracks retrieved.

Metadata 180000 of 445578 tracks retrieved.

Metadata 190000 of 445578 tracks retrieved.

Metadata 200000 of 445578 tracks retrieved.

Song without audio features detected!

Metadata 210000 of 445578 track

In [90]:
# Show the audio features dataframe as the cell output
df_audio_features

Unnamed: 0,Track ID,Acousticness,Danceability,Energy,Instrumentalness,Liveness,Loudness,Speechiness,Valence,Tempo,Time Signature,Key,Mode
0,3IvMYBE7A3c7to1aEcfFJk,0.482,0.473,0.824,0.000000,0.6050,-3.680,0.1630,0.3740,77.888,4,7,0
1,5TCBWmEBrin7etRa4Lswr1,0.252,0.410,0.820,0.000000,0.5380,-6.808,0.5330,0.5260,170.718,4,1,1
2,4wVOKKEHUJxHCFFNUWDn0B,0.629,0.735,0.449,0.000000,0.1130,-7.507,0.3470,0.0397,122.947,4,10,1
3,3Q4gttWQ6hxqWOa3tHoTNi,0.342,0.458,0.452,0.000019,0.0703,-9.299,0.0470,0.3160,86.318,4,9,0
4,466cKvZn1j45IpxDdYZqdA,0.289,0.830,0.490,0.000003,0.1130,-8.820,0.2090,0.8450,81.604,4,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...
445517,2KhrPRV0V1FS2l4eQMJUWt,0.362,0.448,0.511,0.823000,0.1290,-9.084,0.0274,0.1120,96.321,4,6,1
445518,2Ddfm2NQwTRsf7YVlt258S,0.734,0.319,0.293,0.850000,0.0971,-14.145,0.0378,0.0689,115.481,4,5,1
445519,6i2XzqnaxvpNVTUw8EjPMU,0.148,0.440,0.387,0.559000,0.0844,-9.792,0.0279,0.2670,96.556,4,4,1
445520,4xbN7giFhdGbSQTPyG178F,0.350,0.429,0.457,0.859000,0.0990,-8.375,0.0294,0.1830,124.033,4,3,1


#### Merge audio features with other track information

In [91]:
# Left join keeps every row of df_tracks; tracks without a match in
# df_audio_features get missing values in the audio feature columns
df_tracks_with_audio_features = df_tracks.merge(df_audio_features,
                                                on="Track ID",
                                                how="left")
df_tracks_with_audio_features.head()

Unnamed: 0,Track Name,Artist Names,Album Name,Duration in ms,Explicit,Popularity,Track ID,Artist IDs,Album ID,Href,...,Energy,Instrumentalness,Liveness,Loudness,Speechiness,Valence,Tempo,Time Signature,Key,Mode
0,Deep Pockets,[Drake],Dark Lane Demo Tapes,222928,True,63,3IvMYBE7A3c7to1aEcfFJk,[3TVXtAsR1Inumwj472S9r4],6OQ9gBfg5EXeNAEwGSs6jK,https://api.spotify.com/v1/tracks/3IvMYBE7A3c7...,...,0.824,0.0,0.605,-3.68,0.163,0.374,77.888,4.0,7.0,0.0
1,When To Say When,[Drake],Dark Lane Demo Tapes,223124,True,65,5TCBWmEBrin7etRa4Lswr1,[3TVXtAsR1Inumwj472S9r4],6OQ9gBfg5EXeNAEwGSs6jK,https://api.spotify.com/v1/tracks/5TCBWmEBrin7...,...,0.82,0.0,0.538,-6.808,0.533,0.526,170.718,4.0,1.0,1.0
2,Chicago Freestyle (feat. Giveon),"[Drake, Giveon]",Dark Lane Demo Tapes,220487,True,84,4wVOKKEHUJxHCFFNUWDn0B,"[3TVXtAsR1Inumwj472S9r4, 4fxd5Ee7UefO4CUXgwJ7IP]",6OQ9gBfg5EXeNAEwGSs6jK,https://api.spotify.com/v1/tracks/4wVOKKEHUJxH...,...,0.449,0.0,0.113,-7.507,0.347,0.0397,122.947,4.0,10.0,1.0
3,Not You Too (feat. Chris Brown),"[Drake, Chris Brown]",Dark Lane Demo Tapes,269680,True,68,3Q4gttWQ6hxqWOa3tHoTNi,"[3TVXtAsR1Inumwj472S9r4, 7bXgB6jMjp9ATFy66eO08Z]",6OQ9gBfg5EXeNAEwGSs6jK,https://api.spotify.com/v1/tracks/3Q4gttWQ6hxq...,...,0.452,1.9e-05,0.0703,-9.299,0.047,0.316,86.318,4.0,9.0,0.0
4,Toosie Slide,[Drake],Dark Lane Demo Tapes,247058,True,80,466cKvZn1j45IpxDdYZqdA,[3TVXtAsR1Inumwj472S9r4],6OQ9gBfg5EXeNAEwGSs6jK,https://api.spotify.com/v1/tracks/466cKvZn1j45...,...,0.49,3e-06,0.113,-8.82,0.209,0.845,81.604,4.0,1.0,0.0


#### Ensure that row count remained the same

In [92]:
# A left merge never drops rows of df_tracks, but duplicate "Track ID" values on the
# right side would add rows; fail loudly instead of relying on a visual check
assert len(df_tracks_with_audio_features) == len(df_tracks), \
    "Row count changed while merging audio features into the track data"

# Still display the shape (rows, columns) as the cell output
df_tracks_with_audio_features.shape

(445578, 23)

In [105]:
# Add the release date of each track's album (plus the album's "Artist ID" column)
# with an inner join on the shared "Album ID" column
df_tracks_complete = df_tracks_with_audio_features.merge(
    df_albums[["Album ID", "Artist ID", "Release Date"]],
    on="Album ID",
)

In [114]:
# Look up the genres via the "Artist ID" column (inner join against "Artist_ID"),
# then drop both join-key columns from the result
df_tracks_complete1 = (
    df_tracks_complete
    .merge(top_artists_with_metadata[["Artist_ID", "Genres"]],
           left_on="Artist ID",
           right_on="Artist_ID")
    .drop(columns=["Artist ID", "Artist_ID"])
)

In [116]:
# List the columns of the final track dataframe to confirm both artist-ID join keys are gone
df_tracks_complete1.columns

Index(['Track Name', 'Artist Names', 'Album Name', 'Duration in ms',
       'Explicit', 'Popularity', 'Track ID', 'Artist IDs', 'Album ID', 'Href',
       'URI', 'Acousticness', 'Danceability', 'Energy', 'Instrumentalness',
       'Liveness', 'Loudness', 'Speechiness', 'Valence', 'Tempo',
       'Time Signature', 'Key', 'Mode', 'Release Date', 'Genres'],
      dtype='object')

#### Store final track dataset

In [117]:
# Write the final track dataset to the data directory as CSV
df_tracks_complete1.to_csv(f"{DATA_DIRECTORY_PATH}/df_tracks.csv")