In [6]:
import pandas as pd
import numpy as np 
from tqdm import tqdm
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials #To access authorised Spotify data

In [7]:
# Re-used code from: https://medium.com/@RareLoot/extracting-spotify-data-on-your-favourite-artist-via-python-d58bc92a4330

In [8]:
# Spotify developer page: https://developer.spotify.com/dashboard/applications/7085a21ce4124b3e89db61d750b133a7
client_id = '7085a21ce4124b3e89db61d750b133a7'
client_secret = '2b02da51f99f4470a1c2ef91f28a0957'

In [9]:
# !pip install spotipy

In [10]:
sp.user

NameError: name 'sp' is not defined

In [20]:
client_credentials_manager = SpotifyClientCredentials(client_id=client_id, client_secret=client_secret)
sp = spotipy.Spotify(client_credentials_manager=client_credentials_manager) #spotify object to access API #chosen artist

In [17]:
def get_artist_uri(artist_name):
    result = sp.search(artist_name) #search query
    for i in result['tracks']['items']:
        if (i['artists'][0]['name'] == artist_name):
            return i['artists'][0]['uri']
    print("Could not find uri for {}".format(artist_name))
    return None

def get_album_info_for_artist(name):
    #Extract Artist's uri
    artist_uri = get_artist_uri(name)
    #Pull all of the artist's albums
    sp_albums = sp.artist_albums(artist_uri, album_type='album')
    #Store artist's albums' names' and uris in separate lists
    album_names = []
    album_uris = []
    for i in range(len(sp_albums['items'])):
        album_names.append(sp_albums['items'][i]['name'])
        album_uris.append(sp_albums['items'][i]['uri'])

    return album_uris, album_names

def get_genres_from_artist(artist_name):
    artist_uri = get_artist_uri(artist_name)
    return sp.artist(artist_uri)['genres']

def get_album_tracks(album_uri, album_name):
    #Create keys-values of empty lists inside nested dictionary for album
    album_dict = dict()
    album_dict['album'] = [] #create empty list
    album_dict['track_number'] = []
    album_dict['id'] = []
    album_dict['song_name'] = []
    album_dict['song_uri'] = []
    tracks = sp.album_tracks(album_uri) #pull data on album tracks
    for n in range(len(tracks['items'])): #for each song track
        album_dict['album'].append(album_name) #append album name tracked via album_count
        album_dict['track_number'].append(tracks['items'][n]['track_number'])
        album_dict['id'].append(tracks['items'][n]['id'])
        album_dict['song_name'].append(tracks['items'][n]['name'])
        album_dict['song_uri'].append(tracks['items'][n]['uri'])
    return album_dict

def get_song_metadata_from_album(album_uris, album_names):
    spotify_albums = dict()
    for uri, name in zip(album_uris, album_names): 
        spotify_albums[uri] = get_album_tracks(uri, name)

    track_data = pd.DataFrame()
    for k in spotify_albums.keys():
        track_data = track_data.append(pd.DataFrame(spotify_albums[k]))
    return track_data

In [11]:
def build_artist_genre_data(artist_list):
    artist_genres = pd.DataFrame()
    for artist in artist_list: 
        artist_genres.append(pd.DataFrame({'artist': artist, 'genres': get_genres_from_artist(artist)}))
    return artist_genres

### Example: Get Song metadata for one artist

In [157]:
artist_name = 'Kanye West'
album_uris, album_names = get_album_info_for_artist(artist_name)
kanye_songs = get_song_metadata_from_album(album_uris, album_names)

### Build music (genre + song metadata) for group of artists

In [72]:
artists = ['Kanye West', 'Chance the Rapper', 'Frank Ocean', 'Tame Impala', 'Tash Sultana', 'John Mayer', 
           'Bon Iver', 'Future', 'Drake', 'The Beatles', 'Snoh Aalegra', 'Goldlink', 
           'Anderson .Paak', 'Kendrick Lamar', 'Mura Masa', 'Alt-j']

In [12]:
def build_artist_song_dataset(artist_list):
    song_metadata = pd.DataFrame()
    artist_genres = pd.DataFrame()
    for artist in artist_list:
        try:
            album_uris, album_names = get_album_info_for_artist(artist)
            curr_song_metadata = get_song_metadata_from_album(album_uris, album_names).assign(artist_name = lambda x: artist)
            song_metadata = song_metadata.append(curr_song_metadata)
            artist_genres = artist_genres.append(get_genres_from_artist(artist))
            print("Data for {} albums and genres has been collected".format(artist))
        except Exception: 
            print("Failed to collect data for {} albums and genres".format(artist))
    return song_metadata, artist_genres

In [13]:
def audio_features(uris):
    # Add new key-values to store audio features
    acousticness = []
    danceability = []
    energy = []
    instrumentalness = []
    liveness = []
    loudness = []
    speechiness = []
    tempo = []
    valence = []
    popularity = []
    # Create a track counter
    for track in tqdm(uris):
        # Pull audio features per track
        features = sp.audio_features(track)
        
        # Append to relevant key-value
        acousticness.append(features[0]['acousticness'])
        danceability.append(features[0]['danceability'])
        energy.append(features[0]['energy'])
        instrumentalness.append(features[0]['instrumentalness'])
        liveness.append(features[0]['liveness'])
        loudness.append(features[0]['loudness'])
        speechiness.append(features[0]['speechiness'])
        tempo.append(features[0]['tempo'])
        valence.append(features[0]['valence'])
        # Popularity is stored elsewhere
        pop = sp.track(track)
        popularity.append(pop['popularity'])
    
    return pd.DataFrame({'uri': uris, 
            'danceability': danceability, 
            'energy': energy, 
            'accousticness': acousticness,
            'instrumentalness': instrumentalness,
            'liveness': liveness, 
            'loudness': loudness, 
            'speechiness': speechiness, 
            'tempo': tempo, 
            'valence': valence, 
            'popularity': popularity})

In [74]:
song_metadata, artist_genres = build_artist_song_dataset(artists)

Data for Kanye West albums and genres has been collected
Data for Chance the Rapper albums and genres has been collected
Data for Frank Ocean albums and genres has been collected
Data for Tame Impala albums and genres has been collected
Data for Tash Sultana albums and genres has been collected
Data for John Mayer albums and genres has been collected
Data for Bon Iver albums and genres has been collected
Data for Future albums and genres has been collected
Data for Drake albums and genres has been collected
Data for The Beatles albums and genres has been collected
Data for Snoh Aalegra albums and genres has been collected
Could not find uri for Goldlink
Failed to collect data for Goldlink albums and genres
Data for Anderson .Paak albums and genres has been collected
Data for Kendrick Lamar albums and genres has been collected
Data for Mura Masa albums and genres has been collected
Could not find uri for Alt-j
Failed to collect data for Alt-j albums and genres


In [152]:
audio_data = audio_features(song_metadata.song_uri.values)

  0%|          | 3/2529 [00:02<40:38,  1.04it/s]  


KeyboardInterrupt: 

In [79]:
audio_data.to_csv("audio_feature_data.csv", index=False)

In [80]:
song_metadata.to_csv("spotify_song_metadata.csv", index=False)

In [81]:
artist_genres.to_csv("spotify_artist_genres.csv", index=False)

### Exploration

In [21]:
song_metadata, artist_genres = build_artist_song_dataset(['alt-J'])

Data for alt-J albums and genres has been collected


In [22]:
audio_data = audio_features(song_metadata.song_uri.values)

100%|██████████| 271/271 [01:20<00:00,  3.36it/s]


In [23]:
audio_data.to_csv("alt_j_audiodata.csv",index=False)

In [24]:
song_metadata.to_csv("spotify_song_metadata_altj.csv", index=False)

In [25]:
artist_genres.to_csv("spotify_artist_genres_altj.csv", index=False)

In [190]:
audio_data = pd.read_csv("audio_feature_data.csv")

In [191]:
audio_data

Unnamed: 0,uri,danceability,energy,accousticness,instrumentalness,liveness,loudness,speechiness,tempo,valence,popularity
0,spotify:track:1WnqWQcWcuQbVzgE7ecfCY,0.588,0.793,0.466000,0.000647,0.9130,-6.230,0.0794,76.997,0.694,72
1,spotify:track:39JRmdKFka1Oe09FoOCPI4,0.482,0.618,0.647000,0.000859,0.3200,-5.673,0.1970,90.076,0.146,76
2,spotify:track:2QpGZOhTCHHiKmpSO9FW4h,0.529,0.900,0.000248,0.000012,0.0426,-3.503,0.4180,180.291,0.434,85
3,spotify:track:3JWiDGQX2eTlFvKj3Yssj3,0.799,0.291,0.873000,0.000641,0.1390,-7.353,0.0318,97.984,0.406,77
4,spotify:track:2SasoXZyv82yYgHiVOvxQn,0.410,0.886,0.067100,0.000000,0.4610,-4.413,0.1550,72.577,0.393,76
...,...,...,...,...,...,...,...,...,...,...,...
2524,spotify:track:7dQ847NQosTZ9gztaaBtn7,0.785,0.947,0.000610,0.922000,0.0499,-7.708,0.0731,140.021,0.580,34
2525,spotify:track:0XCZSl12v5sqeeyPixMbnv,0.820,0.829,0.280000,0.552000,0.1030,-8.322,0.0651,140.029,0.334,33
2526,spotify:track:3LlN0koSj1GH4W6sER5OCa,0.763,0.522,0.599000,0.002760,0.1100,-8.790,0.1840,135.084,0.322,34
2527,spotify:track:75g6AGXwNMoB1QVN1gUZnx,0.745,0.407,0.822000,0.544000,0.1700,-14.071,0.0448,109.248,0.149,35


In [192]:
from sklearn.cluster import KMeans

In [193]:
import random

In [194]:
from sklearn.model_selection import train_test_split

In [195]:
train_tracks, test_tracks = train_test_split(audio_data.uri)

In [196]:
audio_data = audio_data.assign(is_train = lambda x: x.uri.isin(train_tracks))

In [197]:
km = KMeans(n_clusters=14)

In [198]:
features = ['danceability', 'energy', 'accousticness', 'instrumentalness',
            'liveness', 'loudness', 'speechiness', 'tempo', 'valence']

In [199]:
train_data = audio_data[audio_data.is_train]
test_data = audio_data[audio_data.is_train==False]

In [200]:
km.fit(train_data[features])

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=14, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)

In [201]:
predictions = km.predict(test_data[features])

In [202]:
audio_data['cluster'] = km.predict(audio_data[features])

In [203]:
song_metadata.columns

Index(['album', 'track_number', 'id', 'song_name', 'song_uri', 'artist_name'], dtype='object')

In [204]:
audio_data.columns

Index(['uri', 'danceability', 'energy', 'accousticness', 'instrumentalness',
       'liveness', 'loudness', 'speechiness', 'tempo', 'valence', 'popularity',
       'is_train', 'cluster'],
      dtype='object')

In [205]:
audio_data = audio_data.merge(song_metadata, left_on='uri', right_on='song_uri')

In [206]:
len(audio_data.album.unique())

101

In [208]:
audio_data.groupby('cluster').nunique()[['artist_name']]

Unnamed: 0_level_0,artist_name
cluster,Unnamed: 1_level_1
0,14
1,14
2,14
3,12
4,14
5,2
6,14
7,13
8,13
9,13


In [210]:
from sklearn.mixture import GaussianMixture

In [215]:
gm = GaussianMixture(n_components=14)

In [217]:
gm.fit(audio_data[features])

GaussianMixture(covariance_type='full', init_params='kmeans', max_iter=100,
                means_init=None, n_components=14, n_init=1,
                precisions_init=None, random_state=None, reg_covar=1e-06,
                tol=0.001, verbose=0, verbose_interval=10, warm_start=False,
                weights_init=None)

In [218]:
audio_data['cluster'] = gm.predict(audio_data[features])

In [219]:
audio_data.groupby('cluster').nunique()[['artist_name']]

Unnamed: 0_level_0,artist_name
cluster,Unnamed: 1_level_1
0,10
1,13
2,12
3,5
4,10
5,14
6,13
7,13
8,2
9,12


Unnamed: 0,uri,danceability,energy,accousticness,instrumentalness,liveness,loudness,speechiness,tempo,valence,popularity,is_train,cluster,album,track_number,id,song_name,song_uri,artist_name
58,spotify:track:58kikhzxjR5yaA1t8LFLqJ,0.585,0.19,0.975,0.0,0.105,-6.898,0.0938,174.362,0.714,0,True,3,The Life Of Pablo,14,58kikhzxjR5yaA1t8LFLqJ,Frank's Track,spotify:track:58kikhzxjR5yaA1t8LFLqJ,Kanye West
224,spotify:track:4ulYASOuCUwa2hQRsE2OVh,0.603,0.401,0.542,0.225,0.103,-10.155,0.0306,172.633,0.347,7,False,3,808s & Heartbreak,1,4ulYASOuCUwa2hQRsE2OVh,Say You Will,spotify:track:4ulYASOuCUwa2hQRsE2OVh,Kanye West
236,spotify:track:5e5MKPtJuFb8NaeSEyV5gD,0.603,0.401,0.542,0.225,0.103,-10.155,0.0306,172.633,0.347,56,True,3,808s & Heartbreak,1,5e5MKPtJuFb8NaeSEyV5gD,Say You Will,spotify:track:5e5MKPtJuFb8NaeSEyV5gD,Kanye West
1146,spotify:track:6hHKFReuQR9VQp39ev43wY,0.397,0.0906,0.988,0.856,0.344,-21.655,0.054,185.167,0.136,50,False,3,So Far Gone,16,6hHKFReuQR9VQp39ev43wY,Outro (feat. Chilly Gonzales),spotify:track:6hHKFReuQR9VQp39ev43wY,Drake
1164,spotify:track:5hKi8LHc2GF5v9TSsvgZaf,0.393,0.0878,0.989,0.856,0.316,-21.628,0.053,185.562,0.122,13,False,3,So Far Gone,16,5hKi8LHc2GF5v9TSsvgZaf,Outro (feat. Chilly Gonzales),spotify:track:5hKi8LHc2GF5v9TSsvgZaf,Drake
1460,spotify:track:6lSxM9BKcEZBSDKl2VODsF,0.536,0.36,0.0823,0.167,0.0996,-10.973,0.0408,164.891,0.147,59,True,3,Abbey Road (Super Deluxe Edition),1,6lSxM9BKcEZBSDKl2VODsF,Come Together - 2019 Mix,spotify:track:6lSxM9BKcEZBSDKl2VODsF,The Beatles
1468,spotify:track:4kXXWWWIu0avePZXX7cBiW,0.33,0.442,0.317,0.0762,0.149,-12.216,0.035,173.751,0.326,53,True,3,Abbey Road (Super Deluxe Edition),9,4kXXWWWIu0avePZXX7cBiW,You Never Give Me Your Money - 2019 Mix,spotify:track:4kXXWWWIu0avePZXX7cBiW,The Beatles
1732,spotify:track:2EqlS6tkEnglzr7tkKAAYD,0.533,0.376,0.0302,0.248,0.0926,-11.913,0.0393,165.007,0.187,79,True,3,Abbey Road (Remastered),1,2EqlS6tkEnglzr7tkKAAYD,Come Together - Remastered 2009,spotify:track:2EqlS6tkEnglzr7tkKAAYD,The Beatles
1740,spotify:track:1jOLTO379yIu9aMnCkpMQl,0.335,0.416,0.345,0.136,0.116,-11.051,0.0348,174.464,0.223,59,False,3,Abbey Road (Remastered),9,1jOLTO379yIu9aMnCkpMQl,You Never Give Me Your Money - Remastered 2009,spotify:track:1jOLTO379yIu9aMnCkpMQl,The Beatles
1971,spotify:track:2VqKx3HH8gaZPabNWYvksy,0.705,0.433,0.388,0.134,0.0841,-9.706,0.204,168.006,0.602,71,True,3,"- Ugh, those feels again",2,2VqKx3HH8gaZPabNWYvksy,I Want You Around,spotify:track:2VqKx3HH8gaZPabNWYvksy,Snoh Aalegra


In [229]:
gm = GaussianMixture(n_components=11)

In [230]:
gm.fit(audio_data[audio_data.artist_name=='Kanye West'][features])

GaussianMixture(covariance_type='full', init_params='kmeans', max_iter=100,
                means_init=None, n_components=11, n_init=1,
                precisions_init=None, random_state=None, reg_covar=1e-06,
                tol=0.001, verbose=0, verbose_interval=10, warm_start=False,
                weights_init=None)

In [231]:
kanye_songs = audio_data[audio_data.artist_name=='Kanye West']

In [232]:
kanye_songs['cluster'] = gm.predict(kanye_songs[features])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


In [233]:
kanye_songs.groupby('cluster').nunique()['album']

cluster
0      7
1      9
2      8
3     11
4      7
5      7
6      8
7      5
8      7
9      1
10     7
Name: album, dtype: int64