## Extract saved songs from Spotify using the Spotipy API

In [2]:
import spotipy
from spotipy.oauth2 import SpotifyOAuth
from pandas import DataFrame
# from creds import client_id, client_secret
from time import time

redirect_uri = 'https://example.com/callback'

#replace with Spotify credentials
client_id = 'XXXXXXXX'
client_secret = 'XXXXXXXX'

FEATURE_KEYS = ['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 
                'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo']
OFFSET=0
SAVED_TRACKS_LIMIT=50
FEATURE_LIMIT = 100

sp = spotipy.Spotify(auth_manager=SpotifyOAuth(client_id=client_id,
                                               client_secret=client_secret,
                                               redirect_uri=redirect_uri,
                                               scope="user-library-read"))
liked_tracks=list()

#replace with Spotify username, and playlist ID
user = 'XXXXXXXXX'
playlist_id = "XXXXXXXX"


while(True):
    paged_tracks = sp.user_playlist_tracks(user, playlist_id, limit=100)
    liked_tracks.extend([{'name':el['track']['name'], 
                          'id':el['track']['id'], 
                          #'artist_name': el['track']['artist_name'],
                          'popularity': el['track']['popularity'],
                          'duration': el['track']['duration_ms']} for el in paged_tracks['items']])
    print(f'Fetched {len(liked_tracks)} tracks')
    OFFSET+=SAVED_TRACKS_LIMIT
    if paged_tracks['next'] is None:
        break

def get_windowed_track_ids(liked_tracks, limit):
    for i in range(0, len(liked_tracks), limit): 
        track_window = liked_tracks[i:i + limit]
        yield track_window, [t['id'] for t in track_window]

track_feature_list = list()
print('')

for track_window, track_window_ids in get_windowed_track_ids(liked_tracks, FEATURE_LIMIT):
    track_features = sp.audio_features(tracks=track_window_ids)
    for index, _track in enumerate(track_window):
        _track.update({k:v for k,v in track_features[index].items() if k in FEATURE_KEYS})
        track_feature_list.append(_track)
    print(f'Fetched features for {len(track_feature_list)} tracks')

df=DataFrame.from_dict(track_feature_list)
top_songs = f'top_USA_tracks_{int(time())}.csv'
df.to_csv(top_songs, index=False)
print('')
print(f'Saved features to {top_songs}')

Fetched 50 tracks

Fetched features for 50 tracks

Saved features to top_USA_tracks_1651603346.csv


In [4]:
import pandas as pd

In [5]:
#Load saved tracks into pandas dataframe
tracks2017 = pd.read_csv("./top_2017_tracks_1650917514.csv")
tracks2018 = pd.read_csv("./top_2018_tracks_1650917336.csv")
tracks2019 = pd.read_csv("./top_2019_tracks_1650917226.csv")
tracks2020 = pd.read_csv("./top_2020_tracks_1650917163.csv")
tracks2021 = pd.read_csv("./top_tracks_2021.csv")
topUSAtracks = pd.read_csv("./top_USA_tracks_1651603346.csv")

In [20]:
merged_toptracks = pd.concat([tracks2017, tracks2018, tracks2019, tracks2020, tracks2021], axis= 0)
merged_toptracks.info(5)

<class 'pandas.core.frame.DataFrame'>
Int64Index: 500 entries, 0 to 99
Data columns (total 15 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   name              500 non-null    object 
 1   id                500 non-null    object 
 2   popularity        500 non-null    int64  
 3   duration          500 non-null    int64  
 4   danceability      500 non-null    float64
 5   energy            500 non-null    float64
 6   key               500 non-null    int64  
 7   loudness          500 non-null    float64
 8   mode              500 non-null    int64  
 9   speechiness       500 non-null    float64
 10  acousticness      500 non-null    float64
 11  instrumentalness  500 non-null    float64
 12  liveness          500 non-null    float64
 13  valence           500 non-null    float64
 14  tempo             500 non-null    float64
dtypes: float64(9), int64(4), object(2)
memory usage: 62.5+ KB


In [14]:
# topUSA tracks df
topUSA_df = df

In [21]:
# Pull track ids, which will be used to pull additional song attributes from spotipy
track_ids = merged_toptracks["id"]
print(len(track_ids))
track_ids = list(track_ids)

500


In [6]:
# Repeat above, for topUSA tracks
track_ids = topUSAtracks["id"]
print(len(track_ids))
track_ids = list(track_ids)

50


In [7]:
# Define a formula to pull additional track attributes — track name, album, artist, and release date
def getTrackFeatures(id):
  meta = sp.track(id)

  # meta
  name = meta['name']
  album = meta['album']['name']
  artist = meta['album']['artists'][0]['name']
  release_date = meta['album']['release_date']


  track = [name, album, artist, release_date]
  return track

In [8]:
import time

In [9]:
# pull track info and save into a dataframe

tracks = []
for i in range(len(track_ids)):
  time.sleep(.5)
  track = getTrackFeatures(track_ids[i])
  tracks.append(track)

# create dataset
toptracks_attributes = pd.DataFrame(tracks, columns = ['name', 'album', 'artist', 'release_date'])
toptracks_attributes.to_csv("top_track_attributes.csv", sep = ',')

In [10]:
toptracks_attributes['index_col'] = toptracks_attributes.index # add index column; will be used when merging track info with the attributes

In [11]:
toptracks_attributes.index_col

0      0
1      1
2      2
3      3
4      4
5      5
6      6
7      7
8      8
9      9
10    10
11    11
12    12
13    13
14    14
15    15
16    16
17    17
18    18
19    19
20    20
21    21
22    22
23    23
24    24
25    25
26    26
27    27
28    28
29    29
30    30
31    31
32    32
33    33
34    34
35    35
36    36
37    37
38    38
39    39
40    40
41    41
42    42
43    43
44    44
45    45
46    46
47    47
48    48
49    49
Name: index_col, dtype: int64

In [12]:
toptracks_attributes.info(5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50 entries, 0 to 49
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   name          50 non-null     object
 1   album         50 non-null     object
 2   artist        50 non-null     object
 3   release_date  50 non-null     object
 4   index_col     50 non-null     int64 
dtypes: int64(1), object(4)
memory usage: 2.1+ KB


In [30]:
merged_toptracks

Unnamed: 0,name,id,popularity,duration,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,index_col
0,Close But Not Quite,255TTKJjoyiLYixY0MDbID,0,209201,0.803,0.604,5,-9.559,1,0.1260,0.5930,0.001980,0.1890,0.558,145.994,0
1,I'll Remember,0FgRpZf36QtANUUiOcod0I,7,142657,0.414,0.223,1,-10.169,1,0.0410,0.9900,0.002380,0.1180,0.197,63.950,1
2,oh baby,53PkA8aXiwH4ppa0V0iO7o,59,349693,0.580,0.622,5,-12.005,1,0.0352,0.0192,0.653000,0.6480,0.781,169.442,2
3,Sober Thoughts - Lido's Forgotten Sunday Service,1VkAGG7MomLsvhbIxPEzfP,6,177268,0.634,0.700,10,-8.068,1,0.2660,0.0045,0.000000,0.1300,0.592,94.956,3
4,Bad Decisions,184c1MtPKQENzNJd7fIh1t,0,297786,0.630,0.709,5,-8.150,0,0.0351,0.0491,0.008130,0.1280,0.407,105.992,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,Proud of You (feat. Lil Uzi Vert & Yung Kayo),16eZmQoeajKH8G2nPe2chk,58,211733,0.848,0.451,5,-7.539,0,0.2280,0.0208,0.000000,0.1940,0.223,135.941,95
96,Available,1b6tPXXCV2fSNtR3SKWUQA,58,195333,0.702,0.608,3,-6.843,1,0.0359,0.0736,0.000000,0.1290,0.597,75.012,96
97,willow,0lx2cLdOt3piJbcaXIV74f,77,214706,0.392,0.574,7,-9.195,1,0.1700,0.8330,0.001790,0.1450,0.529,81.112,97
98,Stoned at the Nail Salon,51dhTHnxNi215JiLXdyurb,13,266432,0.466,0.113,2,-13.325,1,0.0414,0.9470,0.000075,0.0986,0.104,125.369,98


In [32]:
# add index to original merged top tracks dataframe
merged_toptracks.index = range(0, (len(merged_toptracks)))
merged_toptracks['index_col'] = merged_toptracks.index
merged_toptracks.index_col

0        0
1        1
2        2
3        3
4        4
      ... 
495    495
496    496
497    497
498    498
499    499
Name: index_col, Length: 500, dtype: int64

In [15]:
# add index to original top USA tracks dataframe
topUSA_df.index = range(0, (len(topUSA_df)))
topUSA_df['index_col'] = topUSA_df.index
topUSA_df.index_col

0      0
1      1
2      2
3      3
4      4
5      5
6      6
7      7
8      8
9      9
10    10
11    11
12    12
13    13
14    14
15    15
16    16
17    17
18    18
19    19
20    20
21    21
22    22
23    23
24    24
25    25
26    26
27    27
28    28
29    29
30    30
31    31
32    32
33    33
34    34
35    35
36    36
37    37
38    38
39    39
40    40
41    41
42    42
43    43
44    44
45    45
46    46
47    47
48    48
49    49
Name: index_col, dtype: int64

In [44]:
# merge track + attribute dfs for analysis
top_tracks_df = pd.merge(merged_toptracks, toptracks_attributes, on = ['index_col'])


In [16]:
topUSA_df = pd.merge(topUSA_df, toptracks_attributes, on = ['index_col'])

In [45]:
top_tracks_df.tail()

Unnamed: 0,name_x,id,popularity,duration,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,index_col,name_y,album,artist,release_date
495,Proud of You (feat. Lil Uzi Vert & Yung Kayo),16eZmQoeajKH8G2nPe2chk,58,211733,0.848,0.451,5,-7.539,0,0.228,0.0208,0.0,0.194,0.223,135.941,495,Proud of You (feat. Lil Uzi Vert & Yung Kayo),Slime Language 2,Young Stoner Life,2021-04-16
496,Available,1b6tPXXCV2fSNtR3SKWUQA,58,195333,0.702,0.608,3,-6.843,1,0.0359,0.0736,0.0,0.129,0.597,75.012,496,Available,Changes,Justin Bieber,2020-02-14
497,willow,0lx2cLdOt3piJbcaXIV74f,77,214706,0.392,0.574,7,-9.195,1,0.17,0.833,0.00179,0.145,0.529,81.112,497,willow,evermore,Taylor Swift,2020-12-11
498,Stoned at the Nail Salon,51dhTHnxNi215JiLXdyurb,13,266432,0.466,0.113,2,-13.325,1,0.0414,0.947,7.5e-05,0.0986,0.104,125.369,498,Stoned at the Nail Salon,Solar Power,Lorde,2021-08-20
499,9 5 . s o u t h,5R691ipUYRDYW6ehapjoj6,67,196946,0.713,0.793,2,-5.277,1,0.397,0.271,0.0,0.727,0.203,71.724,499,9 5 . s o u t h,The Off-Season,J. Cole,2021-05-14


In [46]:
top_tracks_df = top_tracks_df.drop(columns=['name_y'])
top_tracks_df.columns

Index(['name_x', 'id', 'popularity', 'duration', 'danceability', 'energy',
       'key', 'loudness', 'mode', 'speechiness', 'acousticness',
       'instrumentalness', 'liveness', 'valence', 'tempo', 'index_col',
       'album', 'artist', 'release_date'],
      dtype='object')

In [47]:
top_tracks_df.rename(columns= {"name_x" : "title"}, inplace= True)

In [48]:
top_tracks_df.columns

Index(['title', 'id', 'popularity', 'duration', 'danceability', 'energy',
       'key', 'loudness', 'mode', 'speechiness', 'acousticness',
       'instrumentalness', 'liveness', 'valence', 'tempo', 'index_col',
       'album', 'artist', 'release_date'],
      dtype='object')

In [19]:
topUSA_df.drop(columns=['name_y'])
topUSA_df.rename(columns= {"name_x" : "title"}, inplace= True)
topUSA_df.columns

Index(['title', 'id', 'popularity', 'duration', 'danceability', 'energy',
       'key', 'loudness', 'mode', 'speechiness', 'acousticness',
       'instrumentalness', 'liveness', 'valence', 'tempo', 'index_col',
       'name_y', 'album', 'artist', 'release_date'],
      dtype='object')

In [49]:
# Create a "release year" column based on release date
top_tracks_df['release_year'] = top_tracks_df['release_date'].str[0:4]
top_tracks_df['release_year'] = pd.to_numeric(top_tracks_df['release_year'])
top_tracks_df['release_year'].describe
# print(saved_tracks_df)

<bound method NDFrame.describe of 0      2017
1      2017
2      2017
3      2016
4      2016
       ... 
495    2021
496    2020
497    2020
498    2021
499    2021
Name: release_year, Length: 500, dtype: int64>

In [52]:
top_tracks_df.release_year.value_counts()


2018    103
2019    100
2020     98
2017     80
2021     57
2016     28
2015     11
2011      5
2014      5
2013      3
2012      2
2007      2
1997      2
2006      1
2005      1
2004      1
1992      1
Name: release_year, dtype: int64

In [53]:
top_tracks_df.columns

Index(['title', 'id', 'popularity', 'duration', 'danceability', 'energy',
       'key', 'loudness', 'mode', 'speechiness', 'acousticness',
       'instrumentalness', 'liveness', 'valence', 'tempo', 'index_col',
       'album', 'artist', 'release_date', 'release_year'],
      dtype='object')

In [54]:
top_tracks_df = top_tracks_df.drop(columns=['release_date'])

In [20]:
topUSA_df = topUSA_df.drop(columns=['release_date'])

In [55]:
# Save as csv, will use this file for EDA and cluster analysis
top_tracks_df.to_csv("top_tracks_final.csv", sep = ',', index= False)

In [21]:
topUSA_df.to_csv("top_USA_final.csv", sep = ',', index= False)