In [2]:
import math
import os
import time

import pandas as pd
import requests

def timer(func, *args, **kwargs):
    """Call ``func`` and print how long the call took.

    Parameters
    ----------
    func : callable
        The callable to time.
    *args, **kwargs
        Forwarded unchanged to ``func``.

    Returns
    -------
    Whatever ``func(*args, **kwargs)`` returns.
    """
    start = time.perf_counter()
    output = func(*args, **kwargs)
    elapsed = time.perf_counter() - start
    # Not every callable has __name__ (functools.partial, callable instances);
    # fall back to repr so the timing report never masks a successful call.
    label = getattr(func, '__name__', None) or repr(func)
    print(f"{label} took {round(elapsed,4)} seconds")
    return output

In [3]:
#api definitions
user_id = "97t83gd83c75t69ogtnzjjray"

# Friendly name -> Spotify playlist id for every playlist that gets loaded.
playlists = {
    'my_playlist' : "6fTpLZHz57ymQCXvwEbwHX",
    'top_songs_us' : "37i9dQZEVXbLp5XoPON0wI",
    'top_songs_global' : "37i9dQZEVXbNG2KDcFcKOF",
    'viral_us' : "37i9dQZEVXbKuaTI1Z1Afx",
    'viral_global' : "37i9dQZEVXbLiRSasKsNU9",
    'billboard_100' : '6UeSakyzhiEt4NB3UAd6NQ'
}

# The bearer token is a credential and must not be stored in the notebook.
# Export SPOTIFY_TOKEN before starting the kernel; when it is unset the token
# is empty and every API request will be rejected by the server.
token = os.environ.get("SPOTIFY_TOKEN", "")

# Headers sent with every request to the API.
headers = {
    'Accept': 'application/json',
    'Content-Type': 'application/json',
    'Authorization': 'Bearer '+token,
}

base_url = 'https://api.spotify.com/v1'

In [4]:
def set_track_data(track):
    """Flatten one playlist item into a plain record.

    ``track`` is a playlist item dict: it carries ``added_at`` at the top
    level and the track object itself under the ``'track'`` key. The returned
    dict holds the track id, the date it was added, its name, the ids of all
    of its artists, the album id, duration (ms), explicit flag, popularity
    and the album's release date.
    """
    song = track['track']

    record = {}
    record['id'] = song['id']
    record['date_added'] = track['added_at']
    record['name'] = song['name']
    record['artist_ids'] = [artist['id'] for artist in song['artists']]

    album = song['album']
    record['album_id'] = album['id']
    record['duration'] = song['duration_ms']
    record['explicit'] = song['explicit']
    record['popularity'] = song['popularity']
    record['release_date'] = album['release_date']
    return record

In [5]:
def set_audio_data(track):
    """Pick the audio-feature fields of interest out of ``track``.

    ``track`` is one audio-features dict; the result contains only the keys
    listed below, in that order, with their values copied unchanged.
    """
    wanted = (
        'id', 'danceability', 'energy', 'key', 'loudness', 'mode',
        'speechiness', 'acousticness', 'instrumentalness', 'liveness',
        'valence', 'tempo', 'time_signature',
    )
    return {field: track[field] for field in wanted}

In [6]:
def set_artist_data(artist):
    """Reduce an artist dict to id, name, popularity, genres and follower total.

    The follower count is read from the nested ``artist['followers']['total']``.
    """
    record = {field: artist[field] for field in ('id', 'name', 'popularity', 'genres')}
    record['followers'] = artist['followers']['total']
    return record

In [7]:
def read_playlist(playlist_id):
    """Fetch every track of a playlist and return one row per playlist item.

    Pages of 100 items are requested at increasing offsets (at most 50 pages),
    stopping at the first page that comes back with no items.

    Parameters
    ----------
    playlist_id : str
        Id of the playlist to read.

    Returns
    -------
    pandas.DataFrame
        Columns: id, date_added, name, artist_ids, album_id, duration,
        explicit, popularity, release_date. Empty (but with these columns)
        when the playlist has no items.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status (e.g. a missing or expired
        token), instead of failing later on a missing ``'items'`` key.
    """
    columns = ['id','date_added','name','artist_ids','album_id','duration','explicit','popularity','release_date']
    page_size = 100
    max_pages = 50

    # Collect plain dicts and build the frame once at the end: appending to a
    # DataFrame row by row copies the whole frame on every iteration.
    records = []
    for page in range(max_pages):
        params = {'offset':page_size*page,'limit':page_size}
        response = requests.get(f"{base_url}/playlists/{playlist_id}/tracks",headers=headers,params=params)
        response.raise_for_status()
        items = response.json()['items']
        if not items:
            break
        records.extend(set_track_data(item) for item in items)
    return pd.DataFrame(records,columns=columns)

In [8]:
def get_audio_features(track_ids):
    """Request audio features for the given track ids, 50 ids per request.

    Parameters
    ----------
    track_ids : sequence of str
        Track ids; must support ``len`` and slicing (a list or a NumPy array).

    Returns
    -------
    pandas.DataFrame
        One row per track for which the API returned features, with columns
        id, danceability, energy, key, loudness, mode, speechiness,
        acousticness, instrumentalness, liveness, valence, tempo and
        time_signature.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status.
    """
    columns = ['id','danceability','energy','key','loudness','mode','speechiness',
               'acousticness','instrumentalness','liveness','valence','tempo','time_signature']
    batch_size = 50

    # Gather records first and build the frame once; growing a DataFrame row by
    # row is quadratic.
    records = []
    for start in range(0, len(track_ids), batch_size):
        batch = track_ids[start:start+batch_size]
        response = requests.get(f"{base_url}/audio-features",headers=headers,params={'ids':','.join(batch)})
        response.raise_for_status()
        for track in response.json()['audio_features']:
            # The response list can hold null placeholders for ids the API has
            # no features for; skip them rather than crash on None['id'].
            if track is None:
                continue
            records.append(set_audio_data(track))
    return pd.DataFrame(records,columns=columns)

In [9]:
def get_artists(artist_ids):
    """Request artist details for the given artist ids, 50 ids per request.

    Parameters
    ----------
    artist_ids : sequence of str
        Artist ids; must support ``len`` and slicing (a list or a NumPy array).

    Returns
    -------
    pandas.DataFrame
        One row per artist returned by the API, with columns id, name,
        popularity, genres (a list per artist) and followers.

    Raises
    ------
    requests.HTTPError
        If the API answers with an error status.
    """
    columns = ['id','name','popularity','genres','followers']
    batch_size = 50

    # Gather records first and build the frame once; growing a DataFrame row by
    # row is quadratic.
    records = []
    for start in range(0, len(artist_ids), batch_size):
        batch = artist_ids[start:start+batch_size]
        response = requests.get(f"{base_url}/artists",headers=headers,params={'ids':','.join(batch)})
        response.raise_for_status()
        for artist in response.json()['artists']:
            # Skip null placeholders (ids the API could not resolve) instead
            # of crashing on None['id'].
            if artist is None:
                continue
            records.append(set_artist_data(artist))
    return pd.DataFrame(records,columns=columns)

In [10]:
def load_playlist(playlist_id,tracks_df,artists_df,artist_tracks_df,artist_genres_df):
    """Load one playlist and fold its tracks and artists into the running frames.

    Combines ``read_playlist``, ``get_audio_features`` and ``get_artists``:
    the playlist's tracks are joined with their audio features, the artists of
    those tracks are fetched, and the results are concatenated onto the frames
    passed in (which are not modified; new frames are returned).

    Parameters
    ----------
    playlist_id : str
        Id of the playlist to load.
    tracks_df : pandas.DataFrame
        Tracks accumulated so far; de-duplicated on ``id`` after the merge.
    artists_df : pandas.DataFrame
        Artists accumulated so far; de-duplicated on ``id`` after the merge.
    artist_tracks_df : pandas.DataFrame
        Track/artist link rows accumulated so far.
    artist_genres_df : pandas.DataFrame
        Artist/genre rows accumulated so far.

    Returns
    -------
    tuple
        ``(playlist_tracks, tracks_df, artists_df, artist_tracks_df,
        artist_genres_df)`` where ``playlist_tracks`` holds the ``id`` and
        ``date_added`` of every item in this playlist and the other four are
        the updated accumulators.
    """
    tracks_info = read_playlist(playlist_id)
    audio_info = get_audio_features(tracks_info['id'].unique())

    playlist_tracks = tracks_info[['id','date_added']].copy()
    # Inner join: only tracks that have audio features are kept.
    tracks_data = tracks_info.merge(audio_info,on='id')

    artists_data = get_artists(tracks_data['artist_ids'].explode().unique())

    # One row per (track, artist) pair, annotated with the artist's popularity.
    artist_tracks_data = (
        tracks_data[['id','artist_ids']]
        .explode('artist_ids')
        .rename(columns={'id':'track_id','artist_ids':'artist_id'})
    )
    popularity_by_artist = dict(zip(artists_data['id'],artists_data['popularity']))
    artist_tracks_data['artist_pop'] = artist_tracks_data['artist_id'].map(popularity_by_artist)

    # One row per (artist, genre) pair.
    artist_genres_data = (
        artists_data[['id','genres']]
        .explode('genres')
        .rename(columns={'id':'artist_id','genres':'genre'})
    )

    # Comma-separated artist names per track, used as a display column.
    artist_list_map = (
        artist_tracks_data
        .merge(artists_data[['id','name']],left_on='artist_id',right_on='id')
        .groupby('track_id')['name']
        .apply(lambda names: ', '.join(names))
    )
    tracks_data['artist_list'] = tracks_data['id'].map(artist_list_map)

    # pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
    tracks_df = pd.concat([tracks_df,tracks_data]).drop_duplicates(subset=['id'])
    artists_df = pd.concat([artists_df,artists_data]).drop_duplicates(subset=['id'])
    artist_tracks_df = pd.concat([artist_tracks_df,artist_tracks_data]).drop_duplicates()
    artist_genres_df = pd.concat([artist_genres_df,artist_genres_data]).drop_duplicates()

    return playlist_tracks, tracks_df, artists_df, artist_tracks_df, artist_genres_df

In [11]:
# Column layouts of the (initially empty) accumulator frames.
track_features = [
    'id', 'name', 'artist_ids', 'album_id', 'duration', 'explicit', 'popularity', 'release_date',
    'danceability', 'energy', 'key', 'loudness', 'mode', 'speechiness',
    'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', 'time_signature',
]
artist_features = ['id', 'name', 'popularity', 'genres', 'followers']

all_tracks = pd.DataFrame(columns=track_features)
all_artists = pd.DataFrame(columns=artist_features)
artist_tracks = pd.DataFrame(columns=['track_id', 'artist_id'])
artist_genres = pd.DataFrame(columns=['artist_id', 'genre'])

# Load each playlist in turn, threading the accumulators through load_playlist
# and keeping the per-playlist (id, date_added) frame under the playlist's name.
playlist_dfs = {}
for playlist in playlists:
    (
        playlist_dfs[playlist],
        all_tracks,
        all_artists,
        artist_tracks,
        artist_genres,
    ) = timer(
        load_playlist,
        playlists[playlist],
        all_tracks,
        all_artists,
        artist_tracks,
        artist_genres,
    )

# Drop the columns that are already represented in the link tables /
# per-playlist frames.
all_tracks = all_tracks.drop(columns=['artist_ids', 'date_added'])
all_artists = all_artists.drop(columns='genres')

load_playlist took 44.8736 seconds
load_playlist took 1.9115 seconds
load_playlist took 1.5929 seconds
load_playlist took 2.0165 seconds
load_playlist took 2.1191 seconds
load_playlist took 3.4193 seconds


In [12]:
# Number of genre rows per artist, fewest first.
(
    artist_genres
    .groupby('artist_id')
    .count()
    .sort_values(by='genre')
)

Unnamed: 0_level_0,genre
artist_id,Unnamed: 1_level_1
00dwwnz3V4kRfu3UFYpJLz,0
51VoiQGnr78y5uvwH4t2vD,0
4xM4nZLhGE0pMKbXA08s6A,0
4xDPSieFNUt9iHOsx3pxWt,0
4jBAtDSn9XkkE6PtrSgBDE,0
...,...
5WId4o5jdGVhptNU0uqKxu,13
2Jv5eshHtLycR6R8KQCdc4,14
5r5Va4lVQ1zjEfbJSrmCsS,15
58XGUNsRNu3cVOIOYk5chx,15


In [13]:
# Write every table to its own sheet of a single workbook: the four shared
# tables first, then one sheet per playlist.
with pd.ExcelWriter('song_data2.xlsx') as writer:
    sheets = [
        ('tracks', all_tracks),
        ('artists', all_artists),
        ('artist_tracks', artist_tracks),
        ('artist_genres', artist_genres.dropna()),
    ]
    sheets.extend((playlist, playlist_dfs[playlist]) for playlist in playlists)

    for sheet_name, frame in sheets:
        frame.to_excel(writer, sheet_name=sheet_name, index=False)

In [204]:
# Show the row(s) of all_artists whose id matches this artist id.
all_artists.loc[all_artists['id'] == '38zTZcuN7nFvVJ6auhc6V3']

Unnamed: 0,id,name,popularity,followers
620,38zTZcuN7nFvVJ6auhc6V3,Deerhunter,51,331506
