In [1]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime
import requests
import bs4

import spotipy
from spotipy.oauth2 import SpotifyClientCredentials

CLIENT_ID = '7922a756b15b4992b6ceb45936211e51'
CLIENT_SECRET = 'ba9f61e8601046af92002cb044d59f56'

client_credentials_manager = SpotifyClientCredentials(CLIENT_ID,CLIENT_SECRET)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager)

In [7]:
playlists = sp.user_playlists('spotify')

Genre popularity
=====

Scrape genre popularity list from <a href="http://everynoise.com/everynoise1d.cgi?vector=popularity&scope=all"> Every Noise at Once</a>.

In [3]:
url = 'http://everynoise.com/everynoise1d.cgi?scope=deeper&vector=popularity'
source = requests.get(url).content
soup = bs4.BeautifulSoup(source,'lxml')

genrePopularity = dict()
rows = soup.table.findAll('tr')
count = len(rows)
for rank,tr in enumerate(rows):
    genrePopularity[tr.findAll('td')[2].next.next] = round(100*(1 - rank/count),2)

In [4]:
def PlaylistData(playlist,features):
    '''Return dict of playlist data.
    
    Input:
    playlist -- Spotify playlist dict
    features -- list of desired audio features
    
    Output:
    results -- dict with features as keys
    '''
    
    today = pd.to_datetime(datetime.date.today())
    
    trackScores = []
    artistIds = []
    ages = []
    trackIds = []
    
    #Collect data for each track
    for track in playlist['tracks']['items']:
        if track['track']:
            trackScores.append(track['track']['popularity'])
            artistIds.append([artist['id'] for artist in track['track']['artists']])
            ages.append((today - pd.to_datetime(track['track']['album']['release_date'])).days)
            trackIds.append(track['track']['id'])
    
    #For paged playlists (max tracks per request = 100)
    while playlist['tracks']['next']:
        playlist['tracks'] = sp.next(playlist['tracks'])
        for track in playlist['tracks']['items']:
            if track['track']:
                trackScores.append(track['track']['popularity'])
                artistIds.append([artist['id'] for artist in track['track']['artists']])
                ages.append((today - pd.to_datetime(track['track']['album']['release_date'])).days)
                trackIds.append(track['track']['id'])
            
    #Reset playlist to first page
    while playlist['tracks']['previous']:
        playlist['tracks'] = sp.previous(playlist['tracks'])
    
    #Flatten artists to request artist details (max artists per request = 50)
    artists = []
    flatIds = [artistId for sublist in artistIds for artistId in sublist]
    for j in range(int(np.ceil(len(flatIds)/50))):
        artists.extend(sp.artists(flatIds[j*50:(j+1)*50])['artists'])
    
    #Determine max artist and genre popularity for each track (artistIds grouped by track)
    artistScores = []
    genreScores = []
    artCount = 0
    for group in artistIds:
        artistScores.append(np.max([art['popularity'] for art in artists[artCount:artCount+len(group)]]))
        
        #Ignore unranked genres (just christmas genres?) and tracks with no genres listed
        genres = set([gen for art in artists[artCount:artCount+len(group)] for gen in art['genres'] if gen in genrePopularity.keys()])
        if len(genres)>0:
            genreScores.append(np.max([genrePopularity[gen] for gen in genres]))
        
        artCount += len(group)
    
    #Request audio features for each track (max tracks per request = 50)
    trackFeaturesAll = []
    for j in range(int(np.ceil(len(trackIds)/50))):
        trackFeaturesAll.extend(sp.audio_features(trackIds[j*50:(j+1)*50]))
    
    #Record desired features
    trackFeatures = []
    for track in trackFeaturesAll:
        if track:
            trackFeatures.append([track[feat] for feat in features])
    trackFeatures = np.array(trackFeatures)
    
    #Compute statistics
    results = dict()
    
    results['track_popularity'] = round(np.mean(trackScores),2)
    results['artist_popularity'] = round(np.mean(artistScores),2)
    results['genre_popularity'] = round(np.mean(genreScores),2)
    results['age'] = round(np.mean(ages),2)
    for j,feat in enumerate(features):
        results[feat] = round(np.mean(trackFeatures[:,j]),4)
    
#     results['track_popularity'] = [round(np.mean(trackScores),2), round(np.std(trackScores),2)]
#     results['artist_popularity'] = [round(np.mean(artistScores),2), round(np.std(artistScores),2)]
#     results['genre_popularity'] = [round(np.mean(genreScores),2), round(np.std(genreScores),2)]
#     results['age'] = [round(np.mean(ages),2), round(np.std(ages),2)]
#     for j,feat in enumerate(features):
#         results[feat] = [round(np.mean(trackFeatures[:,j]),4),round(np.std(trackFeatures[:,j]),4)]
    
    return results

In [8]:
features = ['danceability', 'energy', 'key', 'loudness', 'mode', 'acousticness']
features.extend(['instrumentalness', 'valence', 'tempo', 'duration_ms', 'time_signature'])

info = ['track_popularity','artist_popularity', 'genre_popularity', 'age']

columns = ['name','id','followers','length'] + info + features
df = {col: [] for col in columns}

for pl in playlists['items']:
    if pl['tracks']['total'] > 0:
        pl = sp.user_playlist('spotify',playlist_id=pl['id'])

        df['name'].append(pl['name'])
        df['id'].append(pl['id'])
        df['length'].append(pl['tracks']['total'])
        df['followers'].append(pl['followers']['total'])

        data = PlaylistData(pl, features)
        for col in (info + features):
            df[col].append(data[col])

its = 1
while ((playlists['next'] != None) & (its < 12)):
    playlists = sp.next(playlists)
    its +=1
    for pl in playlists['items']:
        if pl['tracks']['total'] > 0:
            pl = sp.user_playlist('spotify',playlist_id=pl['id'])

            df['name'].append(pl['name'])
            df['id'].append(pl['id'])
            df['length'].append(pl['tracks']['total'])
            df['followers'].append(pl['followers']['total'])

            data = PlaylistData(pl, features)
            for col in (info + features):
                df[col].append(data[col])

df = pd.DataFrame(df)
df.head()

TypeError: unsupported operand type(s) for -: 'Timestamp' and 'NoneType'

In [37]:
today = pd.to_datetime(datetime.date.today())
pl['tracks']['items'][12]['track']

{'album': {'album_type': None,
  'artists': [],
  'available_markets': [],
  'external_urls': {},
  'href': None,
  'id': None,
  'images': [],
  'name': 'Gossip In The Grain',
  'release_date': None,
  'release_date_precision': None,
  'type': 'album',
  'uri': None},
 'artists': [{'external_urls': {},
   'href': None,
   'id': None,
   'name': 'Ray LaMontagne',
   'type': 'artist',
   'uri': None}],
 'available_markets': [],
 'disc_number': 0,
 'duration_ms': 354000,
 'explicit': False,
 'external_ids': {},
 'external_urls': {},
 'href': None,
 'id': None,
 'name': 'I Still Care For You',
 'popularity': 0,
 'preview_url': None,
 'track_number': 0,
 'type': 'track',
 'uri': 'spotify:local:Ray+LaMontagne:Gossip+In+The+Grain:I+Still+Care+For+You:354'}

In [25]:
df.to_csv('spotifyData.csv', encoding='utf-8', index=False)

AttributeError: 'dict' object has no attribute 'to_csv'

In [None]:
df1 = df[info].copy().applymap(lambda x: x[0])
df1['followers'] = np.log10(df.followers.copy())
sns.pairplot(df1,hue = 'followers')
plt.show()

In [None]:
plt.scatter(np.log10(df1.genre_popularity),df1.followers)