In [1]:
import requests
import pandas as pd
import math
import time

#function used to time how long a function takes
def timer(func,*args):
    start = time.perf_counter()
    output = func(*args)
    print(f"{func.__name__} took {round(time.perf_counter()-start,4)} seconds")
    return output

In [95]:
#api definitions
user_id = "97t83gd83c75t69ogtnzjjray"

playlists = {
    'my_playlist' : "6fTpLZHz57ymQCXvwEbwHX",
    'top_songs_us' : "37i9dQZEVXbLp5XoPON0wI",
    'top_songs_global' : "37i9dQZEVXbNG2KDcFcKOF",
    'viral_us' : "37i9dQZEVXbKuaTI1Z1Afx",
    'viral_global' : "37i9dQZEVXbLiRSasKsNU9",
    'billboard_100' : '6UeSakyzhiEt4NB3UAd6NQ'
}

token = "BQAbPO-1aqhNcGsjZ8C904hwz0ulWDlutoY6dSPBV9nsA8LaE7JajrlFAFgUqGjxeHa1us4lelfPGVCW0lw6z_nRUSl7-vMEoxSOKzk9393feMU1oP8P3NyU339q_kDEzsu0A2XhSb5fsLju8w2XMHP6kluQ-St7zyU63fRywFUyVsPtwaf4U2BOR8q2yfI9siI"

headers = {
    'Accept': 'application/json',
    'Content-Type': 'application/json',
    'Authorization': 'Bearer '+token,
}

base_url = 'https://api.spotify.com/v1'

In [3]:
#gets relevant data from json track object
def set_track_data(track):
    track_data = {
        'id':track['track']['id'],
        'date_added':track['added_at'],
        'name':track['track']['name'],
        'artist_ids':[a['id'] for a in track['track']['artists']],
        'album_id':track['track']['album']['id'],
        'duration':track['track']['duration_ms'],
        'explicit':track['track']['explicit'],
        'popularity':track['track']['popularity'],
        'release_date':track['track']['album']['release_date']
    }
    
    return track_data

In [4]:
#gets relevant audio data from json track object
def set_audio_data(track):
    audio_data = {
        'id':track['id'],
        'danceability':track['danceability'],
        'energy':track['energy'],
        'key':track['key'],
        'loudness':track['loudness'],
        'mode':track['mode'],
        'speechiness':track['speechiness'],
        'acousticness':track['acousticness'],
        'instrumentalness':track['instrumentalness'],
        'liveness':track['liveness'],
        'valence':track['valence'],
        'tempo':track['tempo'],
        'time_signature':track['time_signature']
    }
    
    return audio_data

In [5]:
#gets relevant data from json artist object
def set_artist_data(artist):
    artist_data = {
        'id':artist['id'],
        'name':artist['name'],
        'popularity':artist['popularity'],
        'genres':artist['genres'],
        'followers':artist['followers']['total']
    }
    
    return artist_data

In [6]:
#gets relevant data from spotify api given a playlist id
def read_playlist(playlist_id):
    columns = ['id','date_added','name','artist_ids','album_id','duration','explicit','popularity','release_date']
    tracks_info = pd.DataFrame(columns=columns)
    for i in range(50):
        params = {'offset':100*i,'limit':100}
        tracks = requests.get(f"{base_url}/playlists/{playlist_id}/tracks",headers=headers,params=params).json()
        if not tracks['items']:
            break
        for track in tracks['items']:    
            track_data = set_track_data(track)
            tracks_info = tracks_info.append(track_data,ignore_index=True)
    return tracks_info

In [7]:
#given a list of track ids, will request spotify api for specified audio data
def get_audio_features(track_ids):
    columns = ['id','danceability','energy','key','loudness','mode','speechiness',
               'acousticness','instrumentalness','liveness','valence','tempo','time_signature']
    audio_info = pd.DataFrame(columns=columns)
    for i in range(math.ceil(len(track_ids)/50)):
        track_id_slice = track_ids[50*i:50*i+50]  
        tracks = requests.get(f"{base_url}/audio-features",headers=headers,params={'ids':','.join(track_id_slice)}).json()['audio_features']
        for track in tracks:
            audio_data = set_audio_data(track)
            audio_info = audio_info.append(audio_data,ignore_index=True)
    return audio_info

In [8]:
#requests data for given list of artist ids
def get_artists(artist_ids):
    columns = ['id','name','popularity','genres','followers']
    artists_info = pd.DataFrame(columns=columns)
    for i in range(math.ceil(len(artist_ids)/50)):
        artist_id_slice = artist_ids[50*i:50*i+50]
        artists = requests.get(f"{base_url}/artists",headers=headers,params={'ids':','.join(artist_id_slice)}).json()['artists']
        for artist in artists:
            artist_data = set_artist_data(artist)
            artists_info = artists_info.append(artist_data,ignore_index=True)
    return artists_info

In [96]:
#will read in data from a playlist and add track and artist data to the given dfs, which are the main track and artist dfs
#combines all above functions into one
def load_playlist(playlist_id,tracks_df,artists_df,artist_tracks_df,artist_genres_df):
    tracks_info = read_playlist(playlist_id)
    audio_info = get_audio_features(tracks_info['id'].unique())
    
    playlist_tracks = tracks_info[['id','date_added']].copy()
    tracks_data = tracks_info.merge(audio_info,on='id')
    
    artists_data = get_artists(tracks_data['artist_ids'].explode().unique())
    
    artist_list = artist_tracks.merge(all_artists[['id','name']],left_on='artist_id',right_on='id')
    playlist_tracks['artist_list'] = artist_list.groupby('track_id')['name'].apply(lambda x: ', '.join(x))

    artist_tracks_data = tracks_data[['id','artist_ids']].explode('artist_ids').rename(columns={'id':'track_id','artist_ids':'artist_id'})
    artist_tracks_data['artist_pop'] = artist_tracks_data['artist_id'].map(dict(zip(artists_data['id'],artists_data['popularity'])))
    artist_genres_data = artists_data[['id','genres']].explode('genres').rename(columns={'id':'artist_id','genres':'genre'})
    
    artist_list_map = artist_tracks_data.merge(artists_data[['id','name']],left_on='artist_id',right_on='id').groupby('track_id')['name'].apply(lambda x: ', '.join(x))
    tracks_data['artist_list'] = tracks_data['id'].map(artist_list_map)
    
    tracks_df = tracks_df.append(tracks_data)
    tracks_df = tracks_df.drop_duplicates(subset=['id'])
    
    artists_df = artists_df.append(artists_data)
    artists_df = artists_df.drop_duplicates(subset=['id'])
    
    artist_tracks_df = artist_tracks_df.append(artist_tracks_data)
    artist_tracks_df = artist_tracks_df.drop_duplicates(subset=['track_id'])
    
    artsit_genres_df = artist_genres_df.append(artist_genres_data)
    artist_genres_df = artist_genres_df.drop_duplicates(subset=['artist_id'])
    
    return playlist_tracks, tracks_df, artists_df, artist_tracks_df, artist_genres_df

In [None]:
track_features = [
    'id','name','artist_ids','album_id','duration','explicit','popularity','release_date','danceability','energy',
    'key','loudness','mode','speechiness','acousticness','instrumentalness','liveness','valence','tempo','time_signature'
]
artist_features = ['id','name','popularity','genres','followers']

all_tracks = pd.DataFrame(columns=track_features)
all_artists = pd.DataFrame(columns=artist_features)
artist_tracks = pd.DataFrame(columns=['track_id','artist_id'])
artist_genres = pd.DataFrame(columns=['artist_id','genre'])

playlist_dfs = {}
for playlist in playlists:
    playlist_dfs[playlist], all_tracks, all_artists, artist_tracks, artist_genres = timer(load_playlist,playlists[playlist],all_tracks,all_artists,artist_tracks,artist_genres)

all_tracks = all_tracks.drop(['artist_ids','date_added'],axis=1)
all_artists = all_artists.drop('genres',axis=1)

Unnamed: 0,id,name,album_id,duration,explicit,popularity,release_date,danceability,energy,key,loudness,mode,speechiness,acousticness,instrumentalness,liveness,valence,tempo,time_signature,artist_list
0,0RbW8kWozrVSIGb4V13o0o,Hurricane,340MjPcVdiQRnMigrPybZA,243187,False,90,2021-08-29,0.587,0.561,0,-6.569,1,0.1460,0.0466,0,0.1090,0.351,80.009,4,Kanye West
1,5PjdY0CKGZdEuoNab3yDmX,STAY (with Justin Bieber),4bZJWQhHKJckFLJuYdvyX2,141805,True,92,2021-07-23,0.591,0.764,1,-5.484,1,0.0483,0.0383,0,0.1030,0.478,169.928,4,"The Kid LAROI, Justin Bieber"
2,42ZrMq4BKsWDrrbMnMRYaK,Jail,340MjPcVdiQRnMigrPybZA,297000,False,88,2021-08-29,0.458,0.571,4,-6.739,1,0.0949,0.5350,0.000305,0.0846,0.586,110.013,4,Kanye West
3,2gbMPBrBVj3CuNTLp2dHYs,Off The Grid,340MjPcVdiQRnMigrPybZA,339249,False,88,2021-08-29,0.546,0.640,8,-5.582,1,0.3700,0.0438,0,0.1370,0.630,137.728,4,Kanye West
4,27NovPIUIRrOZoCHxABJwK,INDUSTRY BABY (feat. Jack Harlow),622NFw5Yk0OReMJ2XWcXUh,212000,True,98,2021-07-23,0.736,0.704,3,-7.409,0,0.0615,0.0203,0,0.0501,0.894,149.995,4,"Lil Nas X, Jack Harlow"
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
95,4QpWhd7l2NrmGmfw4qGlpa,Next Girl,6H1IElgVZIcO8HCsj3kAnk,164677,False,70,2020-09-04,0.583,0.867,11,-4.170,1,0.0637,0.1430,0,0.0863,0.962,160.096,4,Carly Pearce
96,7KrlNS2VqiNNJ82RMQdxBn,I Like Dat,6tFOjuF74joX0RHbimAZp3,242085,True,69,2021-05-14,0.565,0.491,1,-6.902,1,0.2210,0.0049,0,0.1370,0.337,175.694,4,"T-Pain, Kehlani"
97,0vzSAyinyA619ZbrOaAi04,Steal My Love,7L8IHgiomfEpxOm61vgPTm,162120,False,72,2021-08-13,0.590,0.633,2,-5.452,1,0.0328,0.6190,0,0.1570,0.751,179.856,4,Dan + Shay
98,21AJQhGZpujjZQXByZAXpr,REPEAT IT (with Gunna),0hTvuLmKKMWoBWgPPrpUKF,190344,True,81,2021-08-06,0.681,0.687,9,-7.612,1,0.3320,0.0555,0,0.0620,0.588,108.641,3,"Lil Tecca, Gunna"


In [81]:
with pd.ExcelWriter('song_data_new.xlsx') as writer:
    all_tracks.to_excel(writer,sheet_name = 'tracks',index=False)
    all_artists.to_excel(writer,sheet_name = 'artists',index=False)
    artist_tracks.to_excel(writer,sheet_name = 'artist_tracks',index=False)
    artist_genres.dropna().to_excel(writer,sheet_name = 'artist_genres',index=False)
    for playlist in playlists:
        playlist_dfs[playlist].to_excel(writer,sheet_name = playlist,index=False)

In [84]:
artist_tracks

Unnamed: 0,track_id,artist_id
0,0RbW8kWozrVSIGb4V13o0o,5K4W6rqBFWDnAN6FQUkS6x
1,5PjdY0CKGZdEuoNab3yDmX,2tIP7SsRs7vjIcLrU85W8J
2,42ZrMq4BKsWDrrbMnMRYaK,5K4W6rqBFWDnAN6FQUkS6x
3,2gbMPBrBVj3CuNTLp2dHYs,5K4W6rqBFWDnAN6FQUkS6x
4,27NovPIUIRrOZoCHxABJwK,7jVv8c5Fj3E9VhNjxT4snq
...,...,...
95,4QpWhd7l2NrmGmfw4qGlpa,4sIl4BTo9l9KqEi0Y3RE72
96,7KrlNS2VqiNNJ82RMQdxBn,3aQeKQSyrW4qWr35idm0cy
97,0vzSAyinyA619ZbrOaAi04,7z5WFjZAIYejWy0NI5lv4T
98,21AJQhGZpujjZQXByZAXpr,4Ga1P7PMIsmqEZqhYZQgDo
