## Extract saved songs from Spotify using the Spotipy API

In [3]:
# load api key and code (make sure not to make publically visible)

from dotenv import load_dotenv
import os

load_dotenv()

api_key = os.getenv("API_KEY")
api_secret = os.getenv("API_SECRET")

# print("API_KEY: ", api_key)
# print("API_SECRET: ", api_svf cecret)

In [4]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import pandas as pd

#replace with Spotify credentials before pushing to github
cid = api_key
secret = api_secret

client_credentials_manager = SpotifyClientCredentials(client_id=cid, client_secret=secret)

sp = spotipy.Spotify(client_credentials_manager = client_credentials_manager)


In [5]:
def call_playlist(creator, playlist_id):
    
    #step1

    playlist_features_list = ["artist","album","track_name", "track_id",
                              "danceability","energy","key","loudness","mode", 
                              "speechiness","instrumentalness","liveness","valence",
                              "tempo", "duration_ms","time_signature"]
    
    playlist_df = pd.DataFrame(columns = playlist_features_list)
    
    #step2
    
    playlist = sp.user_playlist_tracks(creator, playlist_id)["items"]
    for track in playlist:
        # Create empty dict
        playlist_features = {}
        # Get metadata
        playlist_features["artist"] = track["track"]["album"]["artists"][0]["name"]
        playlist_features["album"] = track["track"]["album"]["name"]
        playlist_features["track_name"] = track["track"]["name"]
        playlist_features["track_id"] = track["track"]["id"]
        
        # Get audio features
        audio_features = sp.audio_features(playlist_features["track_id"])[0]
        for feature in playlist_features_list[4:]:
            playlist_features[feature] = audio_features[feature]
        
        # Concat the dfs
        track_df = pd.DataFrame(playlist_features, index = [0])
        playlist_df = pd.concat([playlist_df, track_df], ignore_index = True)
        playlist_df.to_csv('spotify_discover_weekly_2023_Aug10.csv', index=False)
    #Step 3
        
    return playlist_df

In [6]:
call_playlist('Jeremy Osir', '37i9dQZEVXcLIlAlyCSA6s')

Unnamed: 0,artist,album,track_name,track_id,danceability,energy,key,loudness,mode,speechiness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,Merlyn Wood,ONE OF US,ONE OF US,5DXTW2txslEyAtsJby1LAz,0.781,0.574,10,-6.729,0,0.338,8.4e-05,0.251,0.656,160.966,223603,4
1,Meechy Darko,Gothic Luxury,BLK Magîc,0tOeHPjaB22chZyHd4NmFP,0.496,0.567,7,-7.118,1,0.045,2.1e-05,0.137,0.363,129.098,322022,4
2,KayCyy,TW2052,THE SUN (Prod. Gesaffelstein),106yiMHU6uhZXvFyAcKKuA,0.608,0.764,8,-5.773,1,0.0389,1.1e-05,0.0846,0.188,133.01,188827,4
3,Kenny Mason,6,DRACULA,5Wmwjvvj82mo8RgE6pYbHH,0.806,0.545,8,-7.856,0,0.226,0.0,0.152,0.51,139.984,178293,4
4,Pi’erre Bourne,Good Movie,DJ In The Car,7w4yrrz2HMkqjhDrLjA4yy,0.76,0.635,0,-11.379,0,0.0274,0.00213,0.208,0.933,132.039,189152,4
5,Smino,Luv 4 Rent,No L's,12CNybio0Bxean3F3uYugU,0.786,0.65,1,-6.472,1,0.31,0.0,0.127,0.492,130.993,175019,4
6,Smooky MarGielaa,Mamacita (feat. A$AP Rocky),Mamacita (feat. A$AP Rocky),1SlHZ51oGKV56qtPVFyJlR,0.656,0.628,1,-6.603,0,0.228,0.0,0.112,0.31,81.311,185749,4
7,Robb Bank$,FALCON OF THE MILLENNIUM - FALCONIA (DELUXE),OUTSIDE,3MMDqI7L2fM1NWZYs6FY53,0.792,0.553,3,-8.138,0,0.193,0.0,0.202,0.673,140.086,165785,4
8,grouptherapy.,"i was mature for my age, but i was still a child",Peak,7Ivms2W5VboIiOGJF7trey,0.572,0.512,9,-7.614,1,0.143,0.0,0.323,0.235,151.023,203609,4
9,Kris Yute,Breeze,Breeze,1Dt6sLHeMKlUY9BYbnsSmK,0.771,0.558,10,-8.004,1,0.129,0.0,0.115,0.516,90.032,160000,4


In [8]:
#Load saved tracks into pandas dataframe
tracks2017 = pd.read_csv("./top_2017_tracks_1650917514.csv")
tracks2018 = pd.read_csv("./top_2018_tracks_1650917336.csv")
tracks2019 = pd.read_csv("./top_2019_tracks_1650917226.csv")
tracks2020 = pd.read_csv("./top_2020_tracks_1650917163.csv")
tracks2021 = pd.read_csv("./top_tracks_2021.csv")
topUSAtracks = pd.read_csv("./top_USA_tracks_1651603346.csv")
discover_10Aug2023 = pd.read_csv("./spotify_discover_weekly_2023_Aug10.csv")

In [9]:
merged_toptracks = pd.concat([tracks2017, tracks2018, tracks2019, tracks2020, tracks2021], axis= 0)
merged_toptracks.info(5)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 500 entries, 0 to 99
Data columns (total 15 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   name              500 non-null    object 
 1   id                500 non-null    object 
 2   popularity        500 non-null    int64  
 3   duration          500 non-null    int64  
 4   danceability      500 non-null    float64
 5   energy            500 non-null    float64
 6   key               500 non-null    int64  
 7   loudness          500 non-null    float64
 8   mode              500 non-null    int64  
 9   speechiness       500 non-null    float64
 10  acousticness      500 non-null    float64
 11  instrumentalness  500 non-null    float64
 12  liveness          500 non-null    float64
 13  valence           500 non-null    float64
 14  tempo             500 non-null    float64
dtypes: float64(9), int64(4), object(2)
memory usage: 62.5+ KB


In [23]:
# topUSA tracks df
topUSA_df = topUSAtracks

In [11]:
# Pull track ids, which will be used to pull additional song attributes from spotipy
track_ids = merged_toptracks["id"]
print(len(track_ids))
track_ids = list(track_ids)

500


In [12]:
# Repeat above, for topUSA tracks
track_ids = topUSAtracks["id"]
print(len(track_ids))
track_ids = list(track_ids)

50


In [13]:
# Define a formula to pull additional track attributes — track name, album, artist, and release date
def getTrackFeatures(id):
  meta = sp.track(id)

  # meta
  #name = meta['name']
  #album = meta['album']['name']
  #artist = meta['album']['artists'][0]['name']
  release_date = meta['album']['release_date']


  track = [release_date]
  return track

In [14]:
import time

In [15]:
# pull track info and save into a dataframe

tracks = []
for i in range(len(track_ids)):
  time.sleep(.5)
  track = getTrackFeatures(track_ids[i])
  tracks.append(track)

# create dataset
toptracks_attributes = pd.DataFrame(tracks, columns = ['release_date'])
toptracks_attributes.to_csv("top_track_attributes_Aug2023.csv", sep = ',')

In [16]:
toptracks_attributes['index_col'] = toptracks_attributes.index # add index column; will be used when merging track info with the attributes

In [17]:
toptracks_attributes.index_col

0      0
1      1
2      2
3      3
4      4
5      5
6      6
7      7
8      8
9      9
10    10
11    11
12    12
13    13
14    14
15    15
16    16
17    17
18    18
19    19
20    20
21    21
22    22
23    23
24    24
25    25
26    26
27    27
28    28
29    29
30    30
31    31
32    32
33    33
34    34
35    35
36    36
37    37
38    38
39    39
40    40
41    41
42    42
43    43
44    44
45    45
46    46
47    47
48    48
49    49
Name: index_col, dtype: int64

In [18]:
toptracks_attributes.info(5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 2 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   release_date  50 non-null     object
 1   index_col     50 non-null     int64 
dtypes: int64(1), object(1)
memory usage: 928.0+ bytes


In [19]:
merged_toptracks

Unnamed: 0,name,id,popularity,duration,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo
0,Close But Not Quite,255TTKJjoyiLYixY0MDbID,0,209201,0.803,0.604,5,-9.559,1,0.1260,0.5930,0.001980,0.1890,0.558,145.994
1,I'll Remember,0FgRpZf36QtANUUiOcod0I,7,142657,0.414,0.223,1,-10.169,1,0.0410,0.9900,0.002380,0.1180,0.197,63.950
2,oh baby,53PkA8aXiwH4ppa0V0iO7o,59,349693,0.580,0.622,5,-12.005,1,0.0352,0.0192,0.653000,0.6480,0.781,169.442
3,Sober Thoughts - Lido's Forgotten Sunday Service,1VkAGG7MomLsvhbIxPEzfP,6,177268,0.634,0.700,10,-8.068,1,0.2660,0.0045,0.000000,0.1300,0.592,94.956
4,Bad Decisions,184c1MtPKQENzNJd7fIh1t,0,297786,0.630,0.709,5,-8.150,0,0.0351,0.0491,0.008130,0.1280,0.407,105.992
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,Proud of You (feat. Lil Uzi Vert & Yung Kayo),16eZmQoeajKH8G2nPe2chk,58,211733,0.848,0.451,5,-7.539,0,0.2280,0.0208,0.000000,0.1940,0.223,135.941
96,Available,1b6tPXXCV2fSNtR3SKWUQA,58,195333,0.702,0.608,3,-6.843,1,0.0359,0.0736,0.000000,0.1290,0.597,75.012
97,willow,0lx2cLdOt3piJbcaXIV74f,77,214706,0.392,0.574,7,-9.195,1,0.1700,0.8330,0.001790,0.1450,0.529,81.112
98,Stoned at the Nail Salon,51dhTHnxNi215JiLXdyurb,13,266432,0.466,0.113,2,-13.325,1,0.0414,0.9470,0.000075,0.0986,0.104,125.369


In [20]:
# add index to original merged top tracks dataframe
merged_toptracks.index = range(0, (len(merged_toptracks)))
merged_toptracks['index_col'] = merged_toptracks.index
merged_toptracks.index_col

0        0
1        1
2        2
3        3
4        4
      ... 
495    495
496    496
497    497
498    498
499    499
Name: index_col, Length: 500, dtype: int64

In [24]:
# add index to original top USA tracks dataframe
topUSA_df.index = range(0, (len(topUSA_df)))
topUSA_df['index_col'] = topUSA_df.index
topUSA_df.index_col

0      0
1      1
2      2
3      3
4      4
5      5
6      6
7      7
8      8
9      9
10    10
11    11
12    12
13    13
14    14
15    15
16    16
17    17
18    18
19    19
20    20
21    21
22    22
23    23
24    24
25    25
26    26
27    27
28    28
29    29
30    30
31    31
32    32
33    33
34    34
35    35
36    36
37    37
38    38
39    39
40    40
41    41
42    42
43    43
44    44
45    45
46    46
47    47
48    48
49    49
Name: index_col, dtype: int64

In [25]:
# merge track + attribute dfs for analysis
top_tracks_df = pd.merge(merged_toptracks, toptracks_attributes, on = ['index_col'])


In [26]:
topUSA_df = pd.merge(topUSA_df, toptracks_attributes, on = ['index_col'])

In [27]:
top_tracks_df.tail()

Unnamed: 0,name,id,popularity,duration,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,index_col,release_date
45,Me vs. Me,1RaJMyCatBXw5hGCpVzTp4,46,247619,0.799,0.778,1,-7.852,0,0.105,0.693,0.0101,0.118,0.806,125.996,45,2021-09-10
46,Kukere,5FisFa9JwDjpv5Jn505X8e,0,225436,0.752,0.843,1,-4.414,1,0.0576,0.0164,0.00338,0.049,0.965,130.071,46,2020-01-10
47,"Shadow Man (feat. Phoelix, Smino & Saba)",1h2LHhmyAXi8tPNLi806JA,57,311000,0.611,0.454,0,-7.436,0,0.201,0.469,2e-06,0.189,0.444,93.004,47,2021-03-05
48,April Love,5FSzjSHdkCFWA35RHbP79n,29,231685,0.644,0.263,8,-10.555,1,0.0781,0.345,0.000267,0.0898,0.242,69.015,48,2022-04-29
49,Supermodel,5wTVNpi5WDByxBgKgUE6MU,70,181120,0.613,0.442,6,-8.874,1,0.288,0.651,0.0,0.26,0.252,119.737,49,2021-09-10


In [28]:
#top_tracks_df = top_tracks_df.drop(columns=['name_y'])
top_tracks_df.columns

Index(['name', 'id', 'popularity', 'duration', 'danceability', 'energy', 'key',
       'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness',
       'liveness', 'valence', 'tempo', 'index_col', 'release_date'],
      dtype='object')

In [47]:
# top_tracks_df.rename(columns= {"name_x" : "title"}, inplace= True)

In [48]:
# top_tracks_df.columns

Index(['title', 'id', 'popularity', 'duration', 'danceability', 'energy',
       'key', 'loudness', 'mode', 'speechiness', 'acousticness',
       'instrumentalness', 'liveness', 'valence', 'tempo', 'index_col',
       'album', 'artist', 'release_date'],
      dtype='object')

In [29]:
#topUSA_df.drop(columns=['name_y'])
#topUSA_df.rename(columns= {"name_x" : "title"}, inplace= True)
topUSA_df.columns

Index(['name', 'id', 'popularity', 'duration', 'danceability', 'energy', 'key',
       'loudness', 'mode', 'speechiness', 'acousticness', 'instrumentalness',
       'liveness', 'valence', 'tempo', 'index_col', 'release_date'],
      dtype='object')

In [30]:
# Create a "release year" column based on release date
top_tracks_df['release_year'] = top_tracks_df['release_date'].str[0:4]
top_tracks_df['release_year'] = pd.to_numeric(top_tracks_df['release_year'])
top_tracks_df['release_year'].describe
# print(saved_tracks_df)

<bound method NDFrame.describe of 0     2022
1     2022
2     2022
3     2022
4     2020
5     2022
6     2022
7     2022
8     2022
9     2021
10    2014
11    2021
12    2022
13    2021
14    2021
15    2015
16    2021
17    2021
18    2007
19    2022
20    2022
21    2022
22    2022
23    2022
24    2017
25    2021
26    2022
27    2021
28    2014
29    2022
30    2013
31    2022
32    2022
33    2022
34    2022
35    2021
36    2022
37    2017
38    2021
39    2022
40    2021
41    2022
42    2021
43    2021
44    2021
45    2021
46    2020
47    2021
48    2022
49    2021
Name: release_year, dtype: int64>

In [33]:
# Create a "release year" column based on release date
topUSA_df['release_year'] = topUSA_df['release_date'].str[0:4]
topUSA_df['release_year'] = pd.to_numeric(topUSA_df['release_year'])
topUSA_df['release_year'].describe

<bound method NDFrame.describe of 0     2022
1     2022
2     2022
3     2022
4     2020
5     2022
6     2022
7     2022
8     2022
9     2021
10    2014
11    2021
12    2022
13    2021
14    2021
15    2015
16    2021
17    2021
18    2007
19    2022
20    2022
21    2022
22    2022
23    2022
24    2017
25    2021
26    2022
27    2021
28    2014
29    2022
30    2013
31    2022
32    2022
33    2022
34    2022
35    2021
36    2022
37    2017
38    2021
39    2022
40    2021
41    2022
42    2021
43    2021
44    2021
45    2021
46    2020
47    2021
48    2022
49    2021
Name: release_year, dtype: int64>

In [36]:
top_tracks_df.release_year.value_counts()

2022    24
2021    17
2020     2
2017     2
2014     2
2015     1
2013     1
2007     1
Name: release_year, dtype: int64

In [35]:
topUSA_df.release_year.value_counts()

2022    24
2021    17
2020     2
2017     2
2014     2
2015     1
2013     1
2007     1
Name: release_year, dtype: int64