## Extract saved songs from Spotify using the Spotipy API

In [8]:
# Instructions: https://www.linkedin.com/pulse/extracting-your-fav-playlist-info-spotifys-api-samantha-jones/

import spotipy
from spotipy.oauth2 import SpotifyOAuth
from pandas import DataFrame
# from creds import client_id, client_secret
from time import time

redirect_uri = 'https://example.com/callback'

#replace with Spotify credentials
client_id = 'XXXXXX'
client_secret = 'XXXXXX'

FEATURE_KEYS = ['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 
                'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo']
OFFSET=0
SAVED_TRACKS_LIMIT=50
FEATURE_LIMIT = 100

sp = spotipy.Spotify(auth_manager=SpotifyOAuth(client_id=client_id,
                                               client_secret=client_secret,
                                               redirect_uri=redirect_uri,
                                               scope="user-library-read"))

liked_tracks=list()
print('')

while(True):
    paged_tracks = sp.current_user_saved_tracks(offset=OFFSET, limit=SAVED_TRACKS_LIMIT)
    liked_tracks.extend([{'name':el['track']['name'], 
                          'id':el['track']['id'], 
                          #'artist_name': el['track']['artist_name'],
                          'popularity': el['track']['popularity'],
                          'duration': el['track']['duration_ms']} for el in paged_tracks['items']])
    print(f'Fetched {len(liked_tracks)} tracks')
    OFFSET+=SAVED_TRACKS_LIMIT
    if paged_tracks['next'] is None:
        break

def get_windowed_track_ids(liked_tracks, limit):
    for i in range(0, len(liked_tracks), limit): 
        track_window = liked_tracks[i:i + limit]
        yield track_window, [t['id'] for t in track_window]

track_feature_list = list()
print('')

for track_window, track_window_ids in get_windowed_track_ids(liked_tracks, FEATURE_LIMIT):
    track_features = sp.audio_features(tracks=track_window_ids)
    for index, _track in enumerate(track_window):
        _track.update({k:v for k,v in track_features[index].items() if k in FEATURE_KEYS})
        track_feature_list.append(_track)
    print(f'Fetched features for {len(track_feature_list)} tracks')

df=DataFrame.from_dict(track_feature_list)
saved_songs = f'liked_tracks_{int(time())}.csv'
df.to_csv(saved_songs, index=False)
print('')
print(f'Saved features to {saved_songs}')


Fetched 50 tracks
Fetched 100 tracks
Fetched 150 tracks
Fetched 200 tracks
Fetched 250 tracks
Fetched 300 tracks
Fetched 350 tracks
Fetched 400 tracks
Fetched 450 tracks
Fetched 500 tracks
Fetched 550 tracks
Fetched 600 tracks
Fetched 650 tracks
Fetched 700 tracks
Fetched 750 tracks
Fetched 800 tracks
Fetched 850 tracks
Fetched 900 tracks
Fetched 950 tracks
Fetched 1000 tracks
Fetched 1050 tracks
Fetched 1100 tracks
Fetched 1150 tracks
Fetched 1200 tracks
Fetched 1250 tracks
Fetched 1300 tracks
Fetched 1350 tracks
Fetched 1400 tracks
Fetched 1450 tracks
Fetched 1500 tracks
Fetched 1550 tracks
Fetched 1600 tracks
Fetched 1650 tracks
Fetched 1700 tracks
Fetched 1750 tracks
Fetched 1800 tracks
Fetched 1850 tracks
Fetched 1900 tracks
Fetched 1950 tracks
Fetched 2000 tracks
Fetched 2050 tracks
Fetched 2100 tracks
Fetched 2150 tracks
Fetched 2200 tracks
Fetched 2250 tracks
Fetched 2300 tracks
Fetched 2350 tracks
Fetched 2400 tracks
Fetched 2450 tracks
Fetched 2500 tracks
Fetched 2550 tracks

In [3]:
import pandas as pd

In [9]:
#Load saved tracks into pandas dataframe
tracks = pd.read_csv("./liked_tracks_1650725844.csv")

In [10]:
# Pull track ids, which will be used to pull additional song attributes from spotipy
track_ids = tracks["id"]
print(len(track_ids))
track_ids = list(track_ids)

4077


In [16]:
# Define a formula to pull additional track attributes — track name, album, artist, and release date
def getTrackFeatures(id):
  meta = sp.track(id)

  # meta
  name = meta['name']
  album = meta['album']['name']
  artist = meta['album']['artists'][0]['name']
  release_date = meta['album']['release_date']


  track = [name, album, artist, release_date]
  return track

In [19]:
import time

In [20]:
# pull track info and save into a dataframe

tracks = []
for i in range(len(track_ids)):
  time.sleep(.5)
  track = getTrackFeatures(track_ids[i])
  tracks.append(track)

# create dataset
df = pd.DataFrame(tracks, columns = ['name', 'album', 'artist', 'release_date'])
df.to_csv("spotify.csv", sep = ',')

In [35]:
df['index_col'] = df.index # add index column; will be used when merging track info with the saved tracks

In [36]:
df.head(5)

Unnamed: 0,name,album,artist,release_date,tmp,index_col
0,Enemy (with JID) - from the series Arcane Leag...,Mercury - Act 1,Imagine Dragons,2021-09-03,1,0
1,Millions,Rap?,Tierra Whack,2021-12-02,1,1
2,Peloton,USEE4YOURSELF,IDK,2021-06-04,1,2
3,drug dealer (feat. Lil Wayne),mainstream sellout,Machine Gun Kelly,2022-03-25,1,3
4,Glow of Love - Satin Jackets Remix,Glow of Love,Paul Harris,2016-05-16,1,4


In [52]:
track_info = df # save track attributes to df

# add index to original saved tracks file
track_df = pd.read_csv("./liked_tracks_1650725844.csv")
track_df['index_col'] = track_df.index

# merge dfs for analysis
saved_tracks_df = pd.merge(track_df, track_info, on = ['index_col'])

In [74]:
saved_tracks_df.tail()

Unnamed: 0,name_x,id,popularity,duration,danceability,energy,key,loudness,mode,speechiness,...,instrumentalness,liveness,valence,tempo,index_col,name_y,album,artist,release_date,tmp
4072,Company,6ImxYXeLDQPIv4qo7bMhSk,60,252640,0.651,0.55,7,-8.215,1,0.0496,...,0.0,0.165,0.119,107.567,4072,Company,If You're Reading This It's Too Late,Drake,2015-02-12,1
4073,You & The 6,2YaDRtIlQiZ5WDDB2YuEOC,55,264906,0.545,0.476,7,-7.444,0,0.284,...,0.0,0.13,0.223,154.362,4073,You & The 6,If You're Reading This It's Too Late,Drake,2015-02-12,1
4074,Jungle,7JXZq0JgG2zTrSOAgY8VMC,72,320400,0.697,0.235,7,-8.651,1,0.0534,...,0.000144,0.107,0.378,99.947,4074,Jungle,If You're Reading This It's Too Late,Drake,2015-02-12,1
4075,6PM In New York,5mZJwWdxAOR4xUvSGZvvMU,54,283306,0.554,0.85,5,-4.155,1,0.251,...,0.0,0.155,0.383,128.429,4075,6PM In New York,If You're Reading This It's Too Late,Drake,2015-02-12,1
4076,"Flex (Ooh, Ooh, Ooh)",2LLFl56eIFGFlnrplUC26f,50,176674,0.728,0.62,2,-7.533,1,0.0434,...,0.0,0.342,0.885,81.998,4076,"Flex (Ooh, Ooh, Ooh)",Summer Sampler,Rich Homie Quan,2015-06-02,1


In [75]:
saved_tracks_df = saved_tracks_df.drop(columns=['tmp', 'name_y'])
saved_tracks_df.columns

Index(['name_x', 'id', 'popularity', 'duration', 'danceability', 'energy',
       'key', 'loudness', 'mode', 'speechiness', 'acousticness',
       'instrumentalness', 'liveness', 'valence', 'tempo', 'index_col',
       'album', 'artist', 'release_date'],
      dtype='object')

In [76]:
saved_tracks_df.rename(columns= {"name_x" : "title"}, inplace= True)

In [77]:
saved_tracks_df.columns

Index(['title', 'id', 'popularity', 'duration', 'danceability', 'energy',
       'key', 'loudness', 'mode', 'speechiness', 'acousticness',
       'instrumentalness', 'liveness', 'valence', 'tempo', 'index_col',
       'album', 'artist', 'release_date'],
      dtype='object')

In [78]:
# Create a "release year" column based on release date
saved_tracks_df['release_year'] = saved_tracks_df['release_date'].str[0:4]
saved_tracks_df['release_year'] = pd.to_numeric(saved_tracks_df['release_year'])
saved_tracks_df['release_year'].describe
# print(saved_tracks_df)

<bound method NDFrame.describe of 0       2021
1       2021
2       2021
3       2022
4       2016
        ... 
4072    2015
4073    2015
4074    2015
4075    2015
4076    2015
Name: release_year, Length: 4077, dtype: int64>

In [80]:
saved_tracks_df = saved_tracks_df.drop(columns=['release_date'])

In [83]:
# Save as csv, will use this file for EDA and cluster analysis
saved_tracks_df.to_csv("saved_tracks_final.csv", sep = ',', index= False)