In [1]:
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
#!pip install spotipy

In [3]:
import sys
import spotipy
import spotipy.util as util

# Authorize against the Spotify Web API and smoke-test the token by listing
# the current user's saved tracks.
import os
# SECURITY NOTE(review): the client id and secret below are hardcoded in the
# notebook, so anyone who can read this file can read them. They should be
# rotated and supplied from outside the notebook (environment variables set by
# the launcher, or a secrets store) rather than committed here.
os.environ["SPOTIPY_CLIENT_ID"] = '3171b375071e4b4cac38e071125941d6'
os.environ["SPOTIPY_CLIENT_SECRET"] = '888d44a82ea5402fa3222339fc6e916a'
os.environ["SPOTIPY_REDIRECT_URI"] = 'http://localhost/'

# OAuth scope requested for the user token.
scope = 'user-library-read'

# The username is taken from the first command-line argument.
# NOTE(review): when run under a notebook kernel, sys.argv presumably holds the
# kernel launcher's own arguments rather than a user-supplied name - confirm
# that the value picked up here is really the intended username.
if len(sys.argv) > 1:
    username = sys.argv[1]
else:
    print("Usage: %s username" % (sys.argv[0],))
    sys.exit()

token = util.prompt_for_user_token(username, scope)

# Print some tracks
if token:
    sp = spotipy.Spotify(auth=token)
    results = sp.current_user_saved_tracks()
    for item in results['items']:
        track = item['track']
        print(track['name'] + ' - ' + track['artists'][0]['name'])

    # Tracing is configured here, inside the token branch, because `sp` only
    # exists when a token was obtained. Doing this unconditionally raised a
    # NameError right after the "Can't get token" message.
    sp.trace = False # turn off tracing
    sp.trace_out = False # turn off trace out
else:
    print("Can't get token for", username)

#Test some stuff using T Swift
urn = 'spotify:artist:06HL4z0CvFAxyc27GXpf02'

#artist = sp.artist(urn)
#print(artist)


#user = sp.user('laylaokane')
#print(user)


Nobody Else Will Be There - The National
Day I Die - The National
Walk It Back - The National
The System Only Dreams in Total Darkness - The National
Born to Beg - The National
Turtleneck - The National
Empire Line - The National
I'll Still Destroy You - The National
Guilty Party - The National
Carin at the Liquor Store - The National
Dark Side of the Gym - The National
Sleep Well Beast - The National
Frim Fram Sauce - Diana Krall
Want You Back - HAIM
Bad Blood - Taylor Swift
The Modern Leper - Frightened Rabbit
Bluebird - The Loreleis
Celebrate - Ingrid Michaelson
Hell No - Ingrid Michaelson
Time to Pretend - MGMT


In [4]:
# Fetch one page (at most 50 entries, starting at offset 0) of playlist
# categories, with no country or locale filter.
categories = sp.categories(
    country=None,
    locale=None,
    limit=50,
    offset=0,
)


In [5]:
# Pull the 'id' field out of every category entry in the fetched page.
temp = categories['categories']

cat_ids = [entry['id'] for entry in temp['items']]
print(cat_ids)

['toplists', 'holidays', 'chill', 'pop', 'mood', 'hiphop', 'edm_dance', 'party', 'rock', 'workout', 'focus', 'decades', 'dinner', 'sleep', 'indie_alt', 'rnb', 'popculture', 'metal', 'soul', 'romance', 'jazz', 'classical', 'latin', 'country', 'folk_americana', 'blues', 'travel', 'kids', 'reggae', 'gaming', 'punk', 'funk', 'comedy']


In [6]:
# For every category id, fetch one page (at most 50) of that category's playlists.
playlists = {
    cat_id: sp.category_playlists(category_id=cat_id, country=None, limit=50, offset=0)
    for cat_id in cat_ids
}

# Reduce each category's response to just the playlist ids; these ids are what
# the later cells use to look up song information.
playlist_ids_by_cat = {
    category: [entry['id'] for entry in listing['playlists']['items']]
    for category, listing in playlists.items()
}


retrying ...1secs


In [7]:
'''
Useful track-getting functions!
'''

def spotify_id_to_isrc(spotify_ids):
    '''
    Look up the ISRC of each given Spotify track id.

    Sends all ids to `sp.tracks` in a single call and returns the
    `external_ids.isrc` value of every returned track, in response order.
    '''
    response = sp.tracks(spotify_ids)
    isrcs = []
    for track in response['tracks']:
        isrcs.append(track['external_ids']['isrc'])
    return isrcs

def isrc_to_spotify_id(isrcs):
    '''
    Map each ISRC to a Spotify track id by searching for it.

    One search request is issued per ISRC, so this is slow for long lists.
    Only the first search hit is used: several Spotify ids can share one
    ISRC, so the mapping is not guaranteed to be one-to-one.
    Raises IndexError if a search returns no items.
    '''
    return [
        sp.search('isrc:' + isrc)['tracks']['items'][0]['id']
        for isrc in isrcs
    ]

def get_popularity_and_markets(spotify_ids):
    '''
    Fetch popularity and market availability for a list of Spotify track ids.

    Returns a DataFrame with one row per track and three columns:
        popularity      - the track's 'popularity' value
        num_avail_mkts  - length of the track's 'available_markets' list
        max_popularity  - the highest popularity among all given tracks,
                          repeated on every row
    An empty input yields an empty DataFrame with those same columns.
    '''
    columns = ['popularity', 'num_avail_mkts', 'max_popularity']
    popularity = []
    num_avail_mkts = []
    # Ids are requested in batches of 42. The earlier comment here spoke of
    # "50 at a time"; 42 is the value the code has always used and it stays
    # below that figure.
    chunk_size = 42
    for start in range(0, len(spotify_ids), chunk_size):
        chunk = spotify_ids[start:start + chunk_size]
        tracks = sp.tracks(chunk)['tracks']
        # extend() appends in place instead of rebuilding the lists each batch.
        popularity.extend(x['popularity'] for x in tracks)
        num_avail_mkts.extend(len(x['available_markets']) for x in tracks)

    if not popularity:
        # max() of an empty list raises ValueError, so return an empty, typed
        # frame to give callers the usual columns to aggregate over.
        return pd.DataFrame(columns=columns, dtype=float)

    return pd.DataFrame({
        'popularity': popularity,
        'num_avail_mkts': num_avail_mkts,
        # Scalar is broadcast to every row by the DataFrame constructor.
        'max_popularity': max(popularity),
    })

def get_followers(playlist_id, user = 'spotify'):
    '''
    Return the total follower count of a playlist.

    Requests only the 'followers' field of the playlist owned by `user`
    (default 'spotify') and returns its 'total' value.
    '''
    response = sp.user_playlist(user, playlist_id=playlist_id, fields=['followers'])
    followers = response['followers']
    return followers['total']

def get_featured_playlists(country = 'US', time = '2017-11-24T18:00:00'):
    '''
    Return the ids of the playlists featured for a country at a given time.

    Defaults: country 'US', timestamp '2017-11-24T18:00:00'. One page of at
    most 50 playlists, starting at offset 0, is requested.
    NOTE(review): an earlier comment described the default as "11/24/2017 at
    8PM", but the literal reads 18:00 - confirm which time was intended.
    '''
    response = sp.featured_playlists(country=country, timestamp=time, limit=50, offset=0)
    featured_ids = []
    for entry in response['playlists']['items']:
        featured_ids.append(entry['id'])
    return featured_ids

def get_track_ids(playlist_id = '37i9dQZF1DX3FNkD0kDpDV'):
    '''
    Return the Spotify track ids of every song in a playlist.

    The playlist is always looked up under the user 'spotify'. Tracks are
    fetched 100 at a time; another page is requested for as long as every
    page so far came back full.
    '''
    page_size = 100
    page = sp.user_playlist_tracks(user='spotify', playlist_id=playlist_id, limit=page_size)
    track_ids = [entry['track']['id'] for entry in page['items']]
    pages_fetched = 1
    # A total that equals pages_fetched * page_size means the last page was
    # full, so there may be more tracks to fetch.
    while len(track_ids) == pages_fetched * page_size:
        page = sp.user_playlist_tracks(
            user='spotify',
            playlist_id=playlist_id,
            limit=page_size,
            offset=pages_fetched * page_size,
        )
        track_ids.extend(entry['track']['id'] for entry in page['items'])
        pages_fetched += 1
    return track_ids

def get_track_audio_features(spotify_ids = None):
    '''
    Given a list of Spotify track ids, return a DataFrame of audio features.

    Parameters:
        spotify_ids - list of track ids. When omitted, the ids returned by
                      get_track_ids() for its default playlist are used; they
                      are looked up at call time, not when this function is
                      defined.

    Returns a DataFrame with one row per track for which the API returned a
    feature dict (non-dict entries such as None are skipped), without the
    'analysis_url', 'track_href', 'uri' and 'type' columns.
    '''
    if spotify_ids is None:
        # Resolved here rather than in the signature: a call in the default
        # argument would run once at definition time and freeze its result.
        spotify_ids = get_track_ids()

    # Request features in batches of 42 ids.
    chunk_size = 42
    records = []
    for start in range(0, len(spotify_ids), chunk_size):
        chunk = spotify_ids[start:start + chunk_size]
        features = sp.audio_features(chunk)
        # Accumulate rows from every batch. Merging per-batch column dicts
        # with dict.update() kept only the final batch, because each batch
        # reuses the same 0..n row labels.
        records.extend(x for x in features if isinstance(x, dict))

    df = pd.DataFrame(records)
    # errors='ignore' keeps an empty result (no columns at all) from raising.
    return df.drop(columns=['analysis_url', 'track_href', 'uri', 'type'], errors='ignore')


In [8]:
def get_playlist_data(playlist_ids):
    '''
    Given a list of Spotify playlist IDs (or a single ID), returns a dataframe
    containing a row for each input playlist with columns for the following data:
    1) *average* audio characteristics for the songs in that playlist
        (the mean of every numeric column returned by get_track_audio_features)
    2) average popularity of songs in the playlist
    3) popularity of most popular song in playlist (might be an anchor song to the playlist)
    4) average # of markets the songs in the playlist are available in
    5) global playlist info
        - number of followers the playlist has (response variable?)
        - number of tracks in playlist
        - whether or not the playlist is in the list returned by
          get_featured_playlists() with its default arguments

    Playlists whose track lookup raises spotipy.client.SpotifyException are
    reported with a warning and left out of the result.
    '''
    rez = {}
    # force list
    if not isinstance(playlist_ids, list):
        playlist_ids = [playlist_ids]

    # Fetched once up front; a set makes the per-playlist membership test cheap.
    featured_playlists = set(get_featured_playlists())
    for playlist_id in playlist_ids:
        print('Getting info for: ' + playlist_id)
        try:
            track_ids = get_track_ids(playlist_id)
        except spotipy.client.SpotifyException:
            print('WARNING: Playlist does not exist. Skipping.')
            continue
        # get average audio characteristics
        # numeric_only=True: the feature frame also carries non-numeric columns
        # (e.g. the track id), which cannot be averaged and make a plain
        # .mean() raise on current pandas.
        audio_chars = get_track_audio_features(track_ids).mean(numeric_only=True).to_dict()
        # get popularity and markets
        pop_and_mkts = get_popularity_and_markets(track_ids).mean(numeric_only=True).to_dict()
        tmp = {
            # get # followers
            'num_followers': get_followers(playlist_id),
            'num_tracks': len(track_ids),
            'featured': 1 if playlist_id in featured_playlists else 0,
        }
        tmp.update(audio_chars)
        tmp.update(pop_and_mkts)
        rez[playlist_id] = tmp
    # rez maps playlist id -> column dict; transpose so playlists become rows.
    return pd.DataFrame(rez).T

# Build one dict keyed by playlist id, holding each playlist's summary row
# plus the category it was listed under. A playlist id seen again in a later
# category replaces its earlier entry.
data = {}
for cat in playlist_ids_by_cat:
    playlists = playlist_ids_by_cat[cat]
    print('Starting Category: ' + cat)
    # Tag every row of this category's summary frame with the category name.
    playlist_data = get_playlist_data(playlists).assign(category=cat)
    # Transposing first makes to_dict() key the outer dict by playlist id.
    data.update(playlist_data.T.to_dict())


Starting Category: rock
Getting info for: 37i9dQZF1DWXRqgorJj26U
Getting info for: 37i9dQZF1DXa6YOhGMjjgx
Getting info for: 37i9dQZF1DWY6vTWIdZ54A
Getting info for: 37i9dQZF1DX8FwnYE6PRvL
Getting info for: 37i9dQZF1DWZn9s1LNKPiM
Getting info for: 37i9dQZF1DX153gOfbCM2i
Getting info for: 37i9dQZF1DX3YMp9n8fkNx
Getting info for: 37i9dQZF1DWWGFQLoP9qlv
Getting info for: 37i9dQZF1DX82Zzp6AKx64
Getting info for: 37i9dQZF1DWSlJG7YPUBHF
Getting info for: 37i9dQZF1DX9wa6XirBPv8
Getting info for: 37i9dQZF1DX11ghcIxjcjE
Getting info for: 37i9dQZF1DWXs1L3AC0Xio
Getting info for: 37i9dQZF1DX0xLQsW8b5Zx
Getting info for: 37i9dQZF1DXd0ZFXhY0CRF
Getting info for: 37i9dQZF1DXdpVGstUksUC
Getting info for: 37i9dQZF1DWSDoVybeQisg
Getting info for: 37i9dQZF1DXbDjX0hus3Iu
Getting info for: 37i9dQZF1DX7k3T9O9bscd
Getting info for: 37i9dQZF1DWZkHEX2YHpDV
Getting info for: 37i9dQZF1DX3YlUroplxjF
Getting info for: 37i9dQZF1DX2S9rTKTX6JP
Getting info for: 37i9dQZF1DX6j6TEp70WNZ
Getting info for: 37i9dQZF1DX6ujZ

Getting info for: 37i9dQZF1DWWOaP4H0w5b0
Getting info for: 37i9dQZF1DXcfZ6moR6J0G
Getting info for: 37i9dQZF1DXdpVGstUksUC
Getting info for: 37i9dQZF1DX4jCqmsDQR1i
Getting info for: 37i9dQZF1DX9qNs32fujYe
Getting info for: 37i9dQZF1DWSfKFFAPxhCR
Getting info for: 37i9dQZF1DWXDJDWnzE39E
Getting info for: 37i9dQZF1DX5wgKYQVRARv
Getting info for: 37i9dQZF1DX1cJWWyylDuw
Getting info for: 37i9dQZF1DWZdFtcHGe8ED
Getting info for: 37i9dQZF1DWWBZ8hj4jFBr
Getting info for: 37i9dQZF1DX6GRSnGELn7L
Getting info for: 37i9dQZF1DX1kydukZhLms
Getting info for: 37i9dQZF1DX37bXS7EGI3f
Getting info for: 37i9dQZF1DWY3PJWG3ogmJ
Getting info for: 37i9dQZF1DXbl9rMxGEmRC
Getting info for: 37i9dQZF1DX2lfnpRKY6V3
Getting info for: 37i9dQZF1DWUk47CLxI4Uo
Getting info for: 37i9dQZF1DX3USLhm5QxgA
Getting info for: 37i9dQZF1DX1GZ9l4hvKSJ
Getting info for: 37i9dQZF1DWWzcHhJ7FCIq
Getting info for: 37i9dQZF1DWT9SRKhOEUYj
Getting info for: 37i9dQZF1DX4Woqxy7tpda
Getting info for: 37i9dQZF1DX7qU83RVkXfs
Getting info for

Getting info for: 37i9dQZF1DXbcPC6Vvqudd
Getting info for: 37i9dQZF1DX4sWSpwq3LiO
Getting info for: 37i9dQZF1DWYcDQ1hSjOpY
Getting info for: 37i9dQZF1DWUZ5bk6qqDSy
Getting info for: 37i9dQZF1DX0jgyAiPl8Af
Getting info for: 37i9dQZF1DX4aYNO8X5RpR
Getting info for: 37i9dQZF1DXa1rZf8gLhyz
Getting info for: 37i9dQZF1DX0x36cwEyOTG
Getting info for: 37i9dQZF1DWUKPeBypcpcP
Getting info for: 37i9dQZF1DX9if5QDLdzCa
Getting info for: 37i9dQZF1DWSiZVO2J6WeI
Getting info for: 37i9dQZF1DWSUFOo47GEsI
Getting info for: 37i9dQZF1DX1n9whBbBKoL
Getting info for: 37i9dQZF1DXbADqT0j1Cxt
Getting info for: 37i9dQZF1DWXtXDrhlJ17q
Getting info for: 37i9dQZF1DX5FuBDzVtEFX
Getting info for: 37i9dQZF1DX9NmDLwNQnXE
Getting info for: 37i9dQZF1DWWSads6V2oIk
Getting info for: 37i9dQZF1DWXzR2GKEiHgT
Getting info for: 37i9dQZF1DXdbkmlag2h7b
Getting info for: 37i9dQZF1DX0DxcHtn4Hwo
Getting info for: 37i9dQZF1DWTRnup1IgL4a
Getting info for: 37i9dQZF1DXaw68inx4UiN
Getting info for: 37i9dQZF1DWVLoqOzktoRS
Getting info for

ConnectionError: ('Connection aborted.', TimeoutError(10060, 'A connection attempt failed because the connected party did not properly respond after a period of time, or established connection failed because connected host has failed to respond', None, 10060, None))

In [None]:
import pickle
# Persist the collected playlist data. The with-block guarantees the file is
# flushed and closed once the dump finishes (or fails), where a bare open()
# passed straight to pickle.dump left the handle open.
with open("playlist_data.p", "wb") as fh:
    pickle.dump(data, fh)