## Extract songs that Brittain loves from Spotify using the Spotipy API

In [5]:
# load api key and code (make sure not to make publically visible)

from dotenv import load_dotenv
import os

load_dotenv()

api_key = os.getenv("API_KEY_B")
api_secret = os.getenv("API_SECRET_B")


In [6]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import pandas as pd
from time import time
from pandas import DataFrame

#replace with Spotify credentials before pushing to github
cid = api_key
secret = api_secret

client_credentials_manager = SpotifyClientCredentials(client_id=cid, client_secret=secret)

sp = spotipy.Spotify(client_credentials_manager = client_credentials_manager)

In [7]:
# Use spotipy to pull britt's loved songs
from spotipy.oauth2 import SpotifyOAuth

redirect_uri = 'https://example.com/callback'

FEATURE_KEYS = ['danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness', 
                'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo']
OFFSET=0
SAVED_TRACKS_LIMIT=50
FEATURE_LIMIT = 100

sp = spotipy.Spotify(auth_manager=SpotifyOAuth(client_id=cid,
                                               client_secret=secret,
                                               redirect_uri=redirect_uri,
                                               scope="user-library-read"))

liked_tracks=list()
print('')

while(True):
    paged_tracks = sp.current_user_saved_tracks(offset=OFFSET, limit=SAVED_TRACKS_LIMIT)
    liked_tracks.extend([{'name':el['track']['name'], 
                          'id':el['track']['id'], 
                          'popularity': el['track']['popularity'],
                          'duration': el['track']['duration_ms']} for el in paged_tracks['items']])
    print(f'Fetched {len(liked_tracks)} tracks')
    OFFSET+=SAVED_TRACKS_LIMIT
    if paged_tracks['next'] is None:
        break

def get_windowed_track_ids(liked_tracks, limit):
    for i in range(0, len(liked_tracks), limit): 
        track_window = liked_tracks[i:i + limit]
        yield track_window, [t['id'] for t in track_window]

track_feature_list = list()
print('')

for track_window, track_window_ids in get_windowed_track_ids(liked_tracks, FEATURE_LIMIT):
    track_features = sp.audio_features(tracks=track_window_ids)
    for index, _track in enumerate(track_window):
        _track.update({k:v for k,v in track_features[index].items() if k in FEATURE_KEYS})
        track_feature_list.append(_track)
    print(f'Fetched features for {len(track_feature_list)} tracks')

songs_b_loves_df =DataFrame.from_dict(track_feature_list)
loved_songs = 'songs_b_loves.csv'
songs_b_loves_df.to_csv(loved_songs, index=False)
print('')
print(f'Saved features to {loved_songs}')

Couldn't read cache at: .cache





Enter the URL you were redirected to:  https://example.com/callback?code=AQApnv4p4LBKJuxgyxIwbh72i6zzQNAPnDDQxROvnBbHFj485Bhn8Jy8o4ccL17fQMHTImBbV6-rhMLQ6SwlmYSlac_7sOcTakWN-__rIgwfoxpJlJjx1MktjjIv-cd26WqWplxS_9UvQd6hRcKiBjtkn3v7tldHM5xJPtKPNk1aiPkEpo1-YbIaNwxWYptD5_9H


Fetched 50 tracks
Fetched 100 tracks
Fetched 150 tracks
Fetched 200 tracks
Fetched 250 tracks
Fetched 300 tracks
Fetched 350 tracks
Fetched 400 tracks
Fetched 450 tracks
Fetched 500 tracks
Fetched 550 tracks
Fetched 600 tracks
Fetched 650 tracks
Fetched 700 tracks
Fetched 750 tracks
Fetched 800 tracks
Fetched 850 tracks
Fetched 900 tracks
Fetched 950 tracks
Fetched 1000 tracks
Fetched 1050 tracks
Fetched 1100 tracks
Fetched 1150 tracks
Fetched 1200 tracks
Fetched 1250 tracks
Fetched 1300 tracks
Fetched 1350 tracks
Fetched 1400 tracks
Fetched 1450 tracks
Fetched 1500 tracks
Fetched 1550 tracks
Fetched 1600 tracks
Fetched 1650 tracks
Fetched 1700 tracks
Fetched 1750 tracks
Fetched 1800 tracks
Fetched 1850 tracks
Fetched 1900 tracks
Fetched 1950 tracks
Fetched 2000 tracks
Fetched 2050 tracks
Fetched 2100 tracks
Fetched 2150 tracks
Fetched 2200 tracks
Fetched 2250 tracks
Fetched 2300 tracks
Fetched 2350 tracks
Fetched 2400 tracks
Fetched 2450 tracks
Fetched 2500 tracks
Fetched 2550 tracks


In [8]:
songs_b_loves_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3445 entries, 0 to 3444
Data columns (total 15 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   name              3445 non-null   object 
 1   id                3445 non-null   object 
 2   popularity        3445 non-null   int64  
 3   duration          3445 non-null   int64  
 4   danceability      3445 non-null   float64
 5   energy            3445 non-null   float64
 6   key               3445 non-null   int64  
 7   loudness          3445 non-null   float64
 8   mode              3445 non-null   int64  
 9   speechiness       3445 non-null   float64
 10  acousticness      3445 non-null   float64
 11  instrumentalness  3445 non-null   float64
 12  liveness          3445 non-null   float64
 13  valence           3445 non-null   float64
 14  tempo             3445 non-null   float64
dtypes: float64(9), int64(4), object(2)
memory usage: 403.8+ KB


In [9]:
# Pull track ids, which will be used to pull additional song attributes from spotipy
track_ids = songs_b_loves_df["id"]
print(len(track_ids))
track_ids = list(track_ids)

3445


In [10]:
# Define a formula to pull additional track attributes — track name, album, artist, and release date
def getTrackFeatures(id):
  meta = sp.track(id)

  # meta
  name = meta['name']
  album = meta['album']['name']
  artist = meta['album']['artists'][0]['name']
  release_date = meta['album']['release_date']


  track = [name, album, artist, release_date]
  return track

In [11]:
# pull track info and save into a dataframe

tracks = []
for i in range(len(track_ids)):
  #time.sleep(.5)
  track = getTrackFeatures(track_ids[i])
  tracks.append(track)

# create dataset
loved_tracks_attributes = pd.DataFrame(tracks, columns = ['name', 'album', 'artist','release_date'])
loved_tracks_attributes.to_csv("loved_tracks_attributes_Aug2023_britt.csv", sep = ',')

In [12]:
loved_tracks_attributes['index_col'] = loved_tracks_attributes.index # add index column; will be used when merging track info with the attributes

In [13]:
loved_tracks_attributes.index_col

0          0
1          1
2          2
3          3
4          4
        ... 
3440    3440
3441    3441
3442    3442
3443    3443
3444    3444
Name: index_col, Length: 3445, dtype: int64

In [14]:
loved_tracks_attributes.info(5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3445 entries, 0 to 3444
Data columns (total 5 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   name          3445 non-null   object
 1   album         3445 non-null   object
 2   artist        3445 non-null   object
 3   release_date  3445 non-null   object
 4   index_col     3445 non-null   int64 
dtypes: int64(1), object(4)
memory usage: 134.7+ KB


In [16]:
# add index to original loved tracks file for merging
songs_b_loves_df['index_col'] = songs_b_loves_df.index

In [17]:
# merge loved songs and attributes
loved_tracks_df = pd.merge(songs_b_loves_df, loved_tracks_attributes, on = ['index_col'])

In [18]:
#top_tracks_df = top_tracks_df.drop(columns=['name_y'])
loved_tracks_df.columns

Index(['name_x', 'id', 'popularity', 'duration', 'danceability', 'energy',
       'key', 'loudness', 'mode', 'speechiness', 'acousticness',
       'instrumentalness', 'liveness', 'valence', 'tempo', 'index_col',
       'name_y', 'album', 'artist', 'release_date'],
      dtype='object')

In [19]:
loved_tracks_df = loved_tracks_df.drop(columns=['name_y'])
loved_tracks_df.columns

Index(['name_x', 'id', 'popularity', 'duration', 'danceability', 'energy',
       'key', 'loudness', 'mode', 'speechiness', 'acousticness',
       'instrumentalness', 'liveness', 'valence', 'tempo', 'index_col',
       'album', 'artist', 'release_date'],
      dtype='object')

In [20]:
loved_tracks_df.rename(columns= {"name_x" : "track_name"}, inplace= True)

In [21]:
# Create a "release year" column based on release date
loved_tracks_df['release_year'] = loved_tracks_df['release_date'].str[0:4]
loved_tracks_df['release_year'] = pd.to_numeric(loved_tracks_df['release_year'])
loved_tracks_df['release_year'].describe
# print(saved_tracks_df)

<bound method NDFrame.describe of 0       2023
1       2023
2       2023
3       2023
4       2023
        ... 
3440    2015
3441    2015
3442    2015
3443    2015
3444    2015
Name: release_year, Length: 3445, dtype: int64>

In [22]:
loved_tracks_df.release_year.value_counts()

2016    837
2015    753
2017    314
2019    258
2018    248
2020    171
2014    112
2022    109
2021    103
2010     94
2012     88
2013     51
2000     38
2005     38
2023     32
1975     22
2011     21
2007     15
1966     14
1974     13
1977     13
2009     12
1989     11
2004     10
2006      7
2003      5
2008      5
1979      5
1994      4
1969      4
1997      3
1995      3
1987      3
1971      3
1992      3
1985      2
1970      2
1988      2
1960      2
2002      2
1973      2
1999      1
1998      1
1990      1
1980      1
1976      1
1983      1
1972      1
1967      1
1968      1
2001      1
1996      1
Name: release_year, dtype: int64

In [23]:
# Save as csv, will use this file for EDA and cluster analysis
loved_tracks_df.to_csv("songs_b_loves_final.csv", sep = ',', index= False)