## Extract songs that Jeremy loves from Spotify using the Spotipy API

In [1]:
# load api key and code (make sure not to make publically visible)

from dotenv import load_dotenv
import os

load_dotenv()

api_key = os.getenv("API_KEY")
api_secret = os.getenv("API_SECRET")


In [29]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import pandas as pd
from time import time
from pandas import DataFrame

#replace with Spotify credentials before pushing to github
cid = api_key
secret = api_secret

client_credentials_manager = SpotifyClientCredentials(client_id=cid, client_secret=secret)

sp = spotipy.Spotify(client_credentials_manager = client_credentials_manager)


In [37]:
# Alternative to pull 4500+ songs
from spotipy.oauth2 import SpotifyOAuth

redirect_uri = 'https://example.com/callback'

FEATURE_KEYS = ['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 
                'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo']
OFFSET=0
SAVED_TRACKS_LIMIT=50
FEATURE_LIMIT = 100

sp = spotipy.Spotify(auth_manager=SpotifyOAuth(client_id=cid,
                                               client_secret=secret,
                                               redirect_uri=redirect_uri,
                                               scope="user-library-read"))

liked_tracks=list()
print('')

while(True):
    paged_tracks = sp.current_user_saved_tracks(offset=OFFSET, limit=SAVED_TRACKS_LIMIT)
    liked_tracks.extend([{'name':el['track']['name'], 
                          'id':el['track']['id'], 
                          #'artist_name': el['track']['artist_name'],
                          'popularity': el['track']['popularity'],
                          'duration': el['track']['duration_ms']} for el in paged_tracks['items']])
    print(f'Fetched {len(liked_tracks)} tracks')
    OFFSET+=SAVED_TRACKS_LIMIT
    if paged_tracks['next'] is None:
        break

def get_windowed_track_ids(liked_tracks, limit):
    for i in range(0, len(liked_tracks), limit): 
        track_window = liked_tracks[i:i + limit]
        yield track_window, [t['id'] for t in track_window]

track_feature_list = list()
print('')

for track_window, track_window_ids in get_windowed_track_ids(liked_tracks, FEATURE_LIMIT):
    track_features = sp.audio_features(tracks=track_window_ids)
    for index, _track in enumerate(track_window):
        _track.update({k:v for k,v in track_features[index].items() if k in FEATURE_KEYS})
        track_feature_list.append(_track)
    print(f'Fetched features for {len(track_feature_list)} tracks')

songs_j_loves_df =DataFrame.from_dict(track_feature_list)
loved_songs = 'songs_j_loves.csv'
songs_j_loves_df.to_csv(loved_songs, index=False)
print('')
print(f'Saved features to {loved_songs}')




KeyError: 'artist'

In [42]:
songs_j_loves_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4565 entries, 0 to 4564
Data columns (total 15 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   name              4565 non-null   object 
 1   id                4565 non-null   object 
 2   popularity        4565 non-null   int64  
 3   duration          4565 non-null   int64  
 4   danceability      4565 non-null   float64
 5   energy            4565 non-null   float64
 6   key               4565 non-null   int64  
 7   loudness          4565 non-null   float64
 8   mode              4565 non-null   int64  
 9   speechiness       4565 non-null   float64
 10  acousticness      4565 non-null   float64
 11  instrumentalness  4565 non-null   float64
 12  liveness          4565 non-null   float64
 13  valence           4565 non-null   float64
 14  tempo             4565 non-null   float64
dtypes: float64(9), int64(4), object(2)
memory usage: 535.1+ KB


In [7]:
# This code chunk only pulls 100 tracks at a time - workshop later
# def call_playlist(creator, playlist_id):
    
#     #step1

#     playlist_features_list = ["artist","album","track_name", "track_id",
#                               "danceability","energy","key","loudness","mode", 
#                               "speechiness","instrumentalness","liveness","valence",
#                               "tempo", "duration_ms","time_signature"]
    
#     playlist_df = pd.DataFrame(columns = playlist_features_list)
    
#     #step2
    
#     playlist = sp.user_playlist_tracks(creator, playlist_id)["items"]
    
#     for track in playlist:
#         # Create empty dict
#         playlist_features = {}
#         # Get metadata
#         playlist_features["artist"] = track["track"]["album"]["artists"][0]["name"]
#         playlist_features["album"] = track["track"]["album"]["name"]
#         playlist_features["track_name"] = track["track"]["name"]
#         playlist_features["track_id"] = track["track"]["id"]
        
#         # Get audio features
#         audio_features = sp.audio_features(playlist_features["track_id"])[0]
#         for feature in playlist_features_list[4:]:
#             playlist_features[feature] = audio_features[feature]
        
#         # Concat the dfs
#         track_df = pd.DataFrame(playlist_features, index = [0])
#         playlist_df = pd.concat([playlist_df, track_df], ignore_index = True)
#         playlist_df.to_csv('songs_j_loves.csv', index=False)
#     #Step 3
        
#     return playlist_df

In [8]:
# call_playlist('Jeremy Osir', '6b9w5j7viQQbRZL0vmSTxv')

#https://open.spotify.com/playlist/6b9w5j7viQQbRZL0vmSTxv?si=b617155b8d824890

Unnamed: 0,artist,album,track_name,track_id,danceability,energy,key,loudness,mode,speechiness,instrumentalness,liveness,valence,tempo,duration_ms,time_signature
0,Bring Me The Horizon,sTraNgeRs,sTraNgeRs,5fpq1wF8xa5tSSlcKHdmGQ,0.346,0.939,5,-3.061,1,0.1820,0,0.4330,0.526,183.147,195748,3
1,ThxSoMch,HOW YOU ALWAYS LOOK SO GOOD?,HOW YOU ALWAYS LOOK SO GOOD?,7hnC8qa5dgwXOgbVt3E7Ze,0.653,0.631,11,-8.876,0,0.0758,0.00326,0.0953,0.303,87.642,128596,4
2,ThxSoMch,Waste My Mind,Waste My Mind,6NP8HBlNKQzsq8K9O5dK0f,0.421,0.645,0,-9.056,1,0.0320,0.326,0.3690,0.106,144.838,162207,4
3,EKKSTACY,misery,i want to sleep for 1000 years,7mkzJI9BqAO3twUNuv4Nen,0.599,0.696,2,-5.970,1,0.0235,0.692,0.0848,0.327,99.990,135667,1
4,Gunna,fukumean,fukumean (sped up version),4YjUP1tr06laO1GSXaQqsJ,0.737,0.751,4,-6.606,0,0.1630,0,0.1120,0.404,162.380,100020,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,Partiboi69,Ride Hard,Bloodstream,41EvLMzjL6drfwvJ7qaVhR,0.635,0.880,11,-8.419,0,0.0716,0.74,0.5850,0.766,140.040,268000,4
96,Flyleaf,Flyleaf,I'm So Sick,0bqb8cyz8MfgE5IYklNYWD,0.431,0.959,10,-4.257,1,0.0855,6.44e-06,0.1190,0.268,153.926,176160,4
97,Hoobastank,Hoobastank (20th Anniversary Edition),Crawling In The Dark,1PJRDeZSoZk7gtisdTYfLi,0.588,0.927,6,-4.776,0,0.1130,3.83e-05,0.1950,0.652,93.571,175227,4
98,Deftones,Around the Fur,My Own Summer (Shove It),1158ckiB5S4cpsdYHDB9IF,0.443,0.781,1,-4.277,1,0.0377,0.205,0.1660,0.372,71.841,214760,4


In [44]:
# Pull track ids, which will be used to pull additional song attributes from spotipy
track_ids = songs_j_loves_df["id"]
print(len(track_ids))
track_ids = list(track_ids)

4565


In [45]:
# Define a formula to pull additional track attributes — track name, album, artist, and release date
def getTrackFeatures(id):
  meta = sp.track(id)

  # meta
  name = meta['name']
  album = meta['album']['name']
  artist = meta['album']['artists'][0]['name']
  release_date = meta['album']['release_date']


  track = [name, album, artist, release_date]
  return track

In [46]:
import time

In [49]:
# pull track info and save into a dataframe

tracks = []
for i in range(len(track_ids)):
  #time.sleep(.5)
  track = getTrackFeatures(track_ids[i])
  tracks.append(track)

# create dataset
loved_tracks_attributes = pd.DataFrame(tracks, columns = ['name', 'album', 'artist','release_date'])
loved_tracks_attributes.to_csv("loved_tracks_attributes_Aug2023.csv", sep = ',')

In [50]:
loved_tracks_attributes['index_col'] = loved_tracks_attributes.index # add index column; will be used when merging track info with the attributes

In [51]:
loved_tracks_attributes.index_col

0          0
1          1
2          2
3          3
4          4
        ... 
4560    4560
4561    4561
4562    4562
4563    4563
4564    4564
Name: index_col, Length: 4565, dtype: int64

In [52]:
loved_tracks_attributes.info(5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4565 entries, 0 to 4564
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   name          4565 non-null   object
 1   album         4565 non-null   object
 2   artist        4565 non-null   object
 3   release_date  4565 non-null   object
 4   index_col     4565 non-null   int64 
dtypes: int64(1), object(4)
memory usage: 178.4+ KB


In [69]:
# add index to original loved tracks file for merging
songs_j_loves_df['index_col'] = songs_j_loves_df.index

In [70]:
# merge loved songs and attributes
loved_tracks_df = pd.merge(songs_j_loves_df, loved_tracks_attributes, on = ['index_col'])

In [71]:
#top_tracks_df = top_tracks_df.drop(columns=['name_y'])
loved_tracks_df.columns

Index(['name_x', 'id', 'popularity', 'duration', 'danceability', 'energy',
       'key', 'loudness', 'mode', 'speechiness', 'acousticness',
       'instrumentalness', 'liveness', 'valence', 'tempo', 'index_col',
       'name_y', 'album', 'artist', 'release_date'],
      dtype='object')

In [73]:
loved_tracks_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4565 entries, 0 to 4564
Data columns (total 20 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   name_x            4565 non-null   object 
 1   id                4565 non-null   object 
 2   popularity        4565 non-null   int64  
 3   duration          4565 non-null   int64  
 4   danceability      4565 non-null   float64
 5   energy            4565 non-null   float64
 6   key               4565 non-null   int64  
 7   loudness          4565 non-null   float64
 8   mode              4565 non-null   int64  
 9   speechiness       4565 non-null   float64
 10  acousticness      4565 non-null   float64
 11  instrumentalness  4565 non-null   float64
 12  liveness          4565 non-null   float64
 13  valence           4565 non-null   float64
 14  tempo             4565 non-null   float64
 15  index_col         4565 non-null   int64  
 16  name_y            4565 non-null   object 


In [75]:
loved_tracks_df = loved_tracks_df.drop(columns=['name_y', 'index_col'])
loved_tracks_df.columns

Index(['name_x', 'id', 'popularity', 'duration', 'danceability', 'energy',
       'key', 'loudness', 'mode', 'speechiness', 'acousticness',
       'instrumentalness', 'liveness', 'valence', 'tempo', 'album', 'artist',
       'release_date'],
      dtype='object')

In [76]:
loved_tracks_df.rename(columns= {"name_x" : "name"}, inplace= True)

In [77]:
loved_tracks_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4565 entries, 0 to 4564
Data columns (total 18 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   name              4565 non-null   object 
 1   id                4565 non-null   object 
 2   popularity        4565 non-null   int64  
 3   duration          4565 non-null   int64  
 4   danceability      4565 non-null   float64
 5   energy            4565 non-null   float64
 6   key               4565 non-null   int64  
 7   loudness          4565 non-null   float64
 8   mode              4565 non-null   int64  
 9   speechiness       4565 non-null   float64
 10  acousticness      4565 non-null   float64
 11  instrumentalness  4565 non-null   float64
 12  liveness          4565 non-null   float64
 13  valence           4565 non-null   float64
 14  tempo             4565 non-null   float64
 15  album             4565 non-null   object 
 16  artist            4565 non-null   object 


In [78]:
# Create a "release year" column based on release date
loved_tracks_df['release_year'] = loved_tracks_df['release_date'].str[0:4]
loved_tracks_df['release_year'] = pd.to_numeric(loved_tracks_df['release_year'])
loved_tracks_df['release_year'].describe
# print(saved_tracks_df)

<bound method NDFrame.describe of 0       2022
1       2022
2       2023
3       2022
4       2023
        ... 
4560    2015
4561    2015
4562    2015
4563    2015
4564    2015
Name: release_year, Length: 4565, dtype: int64>

In [79]:
loved_tracks_df.release_year.value_counts()

2018    909
2017    773
2016    771
2019    450
2015    341
2020    233
2022    210
2021    174
2006     87
2012     63
2014     59
2013     50
2000     41
2023     38
1967     35
2009     33
1990     33
2011     30
2004     30
2010     29
1997     27
1998     25
1994     21
1993     16
2008     13
2007     10
2002      8
2005      7
2001      7
2003      7
1999      6
1996      5
1995      4
0         4
1989      2
1970      2
1992      2
1957      1
1981      1
1985      1
1982      1
1979      1
1987      1
1988      1
1976      1
1972      1
1991      1
Name: release_year, dtype: int64

In [80]:
loved_tracks_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4565 entries, 0 to 4564
Data columns (total 19 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   name              4565 non-null   object 
 1   id                4565 non-null   object 
 2   popularity        4565 non-null   int64  
 3   duration          4565 non-null   int64  
 4   danceability      4565 non-null   float64
 5   energy            4565 non-null   float64
 6   key               4565 non-null   int64  
 7   loudness          4565 non-null   float64
 8   mode              4565 non-null   int64  
 9   speechiness       4565 non-null   float64
 10  acousticness      4565 non-null   float64
 11  instrumentalness  4565 non-null   float64
 12  liveness          4565 non-null   float64
 13  valence           4565 non-null   float64
 14  tempo             4565 non-null   float64
 15  album             4565 non-null   object 
 16  artist            4565 non-null   object 


In [81]:
# Save as csv, will use this file for EDA and cluster analysis
loved_tracks_df.to_csv("songs_j_loves_final.csv", sep = ',', index= False)