In [1]:
import os
from math import floor

import numpy as np
import pandas as pd

import spotipy
from spotipy.oauth2 import SpotifyOAuth

from PIL import Image
from requests import get

from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler

In [2]:
# Spotify app credentials are read from the environment so that they never live in the notebook.
# Export SPOTIPY_CLIENT_ID and SPOTIPY_CLIENT_SECRET before starting the kernel; a missing
# variable raises KeyError here instead of failing later inside the OAuth flow.
# NOTE(review): credentials that were previously committed in this cell should be rotated.
cid = os.environ['SPOTIPY_CLIENT_ID']
secret = os.environ['SPOTIPY_CLIENT_SECRET']

# the redirect URI can be overridden per environment; the default matches the local JupyterLab workspace
uri = os.environ.get('SPOTIPY_REDIRECT_URI', 'http://localhost:9999/lab/workspaces/auto-c')

# file where the OAuth token is cached between runs
cache = '.spotipyoauthcache'

# permissions requested from the user
scope = 'user-read-recently-played user-library-read user-top-read'

auth = SpotifyOAuth(client_id = cid,
                    client_secret = secret,
                    redirect_uri = uri,
                    cache_path = cache,
                    scope = scope)

sp = spotipy.Spotify(oauth_manager = auth)

# profile of the authenticated user; the display name and profile link are kept for later use
user = sp.current_user()
user_name = user['display_name']
user_profile = user['external_urls']['spotify']

In [3]:
# Rebuild the OAuth manager and the API client from the settings defined in the previous cell.
auth = SpotifyOAuth(
    client_id = cid,
    client_secret = secret,
    redirect_uri = uri,
    cache_path = cache,
    scope = scope,
)

sp = spotipy.Spotify(oauth_manager = auth)

In [4]:
# Display the raw profile payload returned by sp.current_user().
# The output contains account identifiers and a profile image URL; clear it before sharing the notebook.
user

{'display_name': 'Aziz Maredia',
 'external_urls': {'spotify': 'https://open.spotify.com/user/1282060363'},
 'followers': {'href': None, 'total': 36},
 'href': 'https://api.spotify.com/v1/users/1282060363',
 'id': '1282060363',
 'images': [{'height': None,
   'url': 'https://scontent-ort2-2.xx.fbcdn.net/v/t1.0-1/p320x320/79640598_10158654118142079_1999166169432457216_n.jpg?_nc_cat=104&ccb=2&_nc_sid=0c64ff&_nc_ohc=da1WIZbNg3AAX9xFEHC&_nc_ht=scontent-ort2-2.xx&tp=6&oh=3b473272b20ff25e6efe7ad950ea1fef&oe=6033D3A9',
   'width': None}],
 'type': 'user',
 'uri': 'spotify:user:1282060363'}

In [75]:
def get_current_saved(sp):
    """Collect the tracks saved in the current user's library.

    Pages through ``sp.current_user_saved_tracks`` until an empty page is
    returned or the 10,000-track cap is reached. All rows are gathered first
    and the DataFrame is built once, instead of concatenating a new frame for
    every page.

    Parameters
    ----------
    sp : client object exposing ``current_user_saved_tracks(limit=..., offset=...)``
        (a ``spotipy.Spotify`` instance in this notebook).

    Returns
    -------
    pandas.DataFrame
        One row per saved track with the columns ``track``, ``artist``,
        ``track_id``, ``artist_id`` and ``release_date`` (only the first listed
        artist of each track is kept). The frame is empty, with the same
        columns, when the library has no saved tracks.
    """
    page_size = 50      # largest page size accepted by the saved-tracks endpoint
    max_tracks = 10000  # upper bound on the number of saved tracks that are fetched
    columns = ['track', 'artist', 'track_id', 'artist_id', 'release_date']

    rows = []

    # offset is the index of the first track of the page being requested
    for offset in range(0, max_tracks, page_size):

        results = sp.current_user_saved_tracks(limit = page_size, offset = offset)
        items = results['items']

        # an empty page means every saved track has already been collected
        if not items:
            break

        for item in items:
            track = item['track']
            lead_artist = track['artists'][0]
            rows.append({'track': track['name'],
                         'artist': lead_artist['name'],
                         'track_id': track['id'],
                         'artist_id': lead_artist['id'],
                         'release_date': track['album']['release_date']})

    return pd.DataFrame(rows, columns = columns)

In [76]:
def get_audio_features(all_tracks_df, sp):
    """Fetch audio features for every track id in ``all_tracks_df``.

    The ids are sent to ``sp.audio_features`` in batches of at most 100, and
    only non-empty batches are requested. Entries that come back as ``None``
    are skipped, and missing track ids are dropped before any request is made.

    Parameters
    ----------
    all_tracks_df : pandas.DataFrame
        Must contain a ``track_id`` column.
    sp : client object exposing ``audio_features(tracks=...)``
        (a ``spotipy.Spotify`` instance in this notebook).

    Returns
    -------
    pandas.DataFrame
        One row per track for which features were returned, with the columns
        listed in ``keep_features`` (the track id is in the ``id`` column).
        Empty, with the same columns, when there is nothing to fetch.
    """
    keep_features = ['danceability', 'energy', 'key', 'loudness', 'mode',
                     'speechiness', 'acousticness', 'instrumentalness', 'liveness',
                     'valence', 'tempo', 'id', 'duration_ms', 'time_signature']

    # api can only pull audio features for 100 tracks max
    batch_size = 100

    track_ids = all_tracks_df['track_id'].dropna().tolist()

    rows = []

    for start in range(0, len(track_ids), batch_size):

        results = sp.audio_features(tracks = track_ids[start:start + batch_size])

        # keep only the wanted features; a None entry means no features were returned for that id
        for features in (results or []):
            if features is not None:
                rows.append({name: features[name] for name in keep_features})

    return pd.DataFrame(rows, columns = keep_features)

In [89]:
def get_genres(all_tracks_df, sp):
    """Fetch the genre list of every distinct artist in ``all_tracks_df``.

    Distinct artist ids are taken in first-seen order, so the result is
    deterministic, and sent to ``sp.artists`` in batches of at most 50. Only
    non-empty batches are requested. ``None`` entries in a response are
    skipped, and missing artist ids are dropped before any request is made.

    Parameters
    ----------
    all_tracks_df : pandas.DataFrame
        Must contain an ``artist_id`` column.
    sp : client object exposing ``artists(artists=...)`` that returns a dict
        with an ``'artists'`` list (a ``spotipy.Spotify`` instance in this notebook).

    Returns
    -------
    pandas.DataFrame
        Columns ``id`` (artist id) and ``genres`` (the list returned for that
        artist), one row per artist. Empty, with the same columns, when there
        are no artists to look up.
    """
    keep_features = ['id', 'genres'] # 'popularity',

    # artists are requested in batches of 50
    batch_size = 50

    artists = all_tracks_df['artist_id'].dropna().unique().tolist()

    rows = []

    for start in range(0, len(artists), batch_size):

        results = sp.artists(artists = artists[start:start + batch_size])

        # a None entry means nothing was returned for that artist id
        for artist in results['artists']:
            if artist is not None:
                rows.append({name: artist[name] for name in keep_features})

    return pd.DataFrame(rows, columns = keep_features)

In [90]:
def get_user_tracks_w_audio_features(sp):
    """Build the modelling table for the user's saved tracks.

    Saved tracks are inner-joined with their audio features on ``track_id``
    and left-joined with the artist genres on ``artist_id``. Every row is
    tagged with ``data_type == 'user_library'``.

    Parameters
    ----------
    sp : client passed through to ``get_current_saved``, ``get_audio_features``
        and ``get_genres``.

    Returns
    -------
    pandas.DataFrame
        The columns listed in ``output_columns``, in that order.
    """
    output_columns = ['track', 'artist', 'track_id', 'data_type',
                      'danceability', 'energy', 'loudness', 'speechiness', 'acousticness', 'instrumentalness',
                      'liveness', 'valence', 'tempo', 'mode', 'key', 'time_signature', 'release_date', 'genres'] # 'popularity_artist', 'popularity_song'

    saved_tracks = get_current_saved(sp)

    # tracks without audio features are dropped by the inner join
    track_features = get_audio_features(saved_tracks, sp).rename(columns = {'id': 'track_id'})
    tracks_with_audio = saved_tracks.merge(track_features, on = 'track_id', how = 'inner')

    # tracks whose artist has no genre row are kept by the left join
    artist_genres = get_genres(saved_tracks, sp).rename(columns = {'id': 'artist_id'})
    enriched = tracks_with_audio.merge(artist_genres, on = 'artist_id', how = 'left')

    enriched['data_type'] = 'user_library'

    return enriched[output_columns]

In [100]:
def _format_album_duration(total_ms):
    """Render a duration in milliseconds as 'H hr M min' (one hour or longer) or 'M min S sec'."""
    # whole-second integer arithmetic avoids the float rounding that could drop a second
    minutes, seconds = divmod(int(total_ms) // 1000, 60)
    if minutes >= 60:
        hours, minutes = divmod(minutes, 60)
        return f'{hours} hr {minutes} min'
    return f'{minutes} min {seconds} sec'


def get_album_tracks(album_to_search, sp):
    """Look up an album by name and build its track tables.

    The first album returned by ``sp.search`` is used. Its tracks are joined
    with their audio features (inner join on ``track_id``) and with the artist
    genres (left join on ``artist_id``) through ``get_audio_features`` and
    ``get_genres``.

    Parameters
    ----------
    album_to_search : str
        Search text for the album.
    sp : client object exposing ``search`` and ``album``, also passed on to
        ``get_audio_features`` and ``get_genres``.

    Returns
    -------
    tuple
        ``(df, album_final, album_dict)`` where

        * ``df`` holds the modelling columns, with ``data_type == 'album'``;
        * ``album_final`` holds ``track``, ``track_id``, ``duration_ms``,
          ``spotify_link`` and ``preview`` for the same rows;
        * ``album_dict`` holds album-level display values (name, cover image
          URL, artist, release year, track count, links, formatted duration).

    Raises
    ------
    IndexError
        If the search returns no album.
    """
    results = sp.search(q = album_to_search, limit = 1, offset = 0, type = 'album', market = None)
    matches = results['albums']['items']

    if not matches:
        raise IndexError(f'No album found for search term: {album_to_search!r}')

    # NOTE(review): only the tracks embedded in the album payload are used; presumably very long
    # albums are paginated by the API and would be truncated here - confirm
    album = sp.album(matches[0]['id'])
    album_items = album['tracks']['items']

    # only the first listed artist of each track is kept
    df = pd.DataFrame({'track': [t['name'] for t in album_items],
                       'track_id': [t['id'] for t in album_items],
                       'artist': [t['artists'][0]['name'] for t in album_items],
                       'artist_id': [t['artists'][0]['id'] for t in album_items],
                       'spotify_link': [t['external_urls']['spotify'] for t in album_items],
                       'preview': [t['preview_url'] for t in album_items]})
    df['release_date'] = album['release_date']

    # the second image is preferred; fall back to the first, or None, when fewer images are returned
    images = album['images']
    if len(images) > 1:
        cover_url = images[1]['url']
    elif images:
        cover_url = images[0]['url']
    else:
        cover_url = None

    album_dict = {}
    album_dict['album_name'] = album['name'] # album title
    album_dict['album_image'] = cover_url # album cover
    album_dict['artist_name'] = album['artists'][0]['name'] # artist name
    album_dict['release_year'] = album['release_date'][:4] # release year
    album_dict['total_tracks'] = album['total_tracks'] # total tracks
    album_dict['album_link'] = album['external_urls']['spotify']
    album_dict['artist_link'] = album['artists'][0]['external_urls']['spotify']
    album_dict['album_duration'] = _format_album_duration(sum(t['duration_ms'] for t in album_items))

    df_audio = get_audio_features(df, sp)
    df = pd.merge(df, df_audio.rename(columns = {'id':'track_id'}), on = 'track_id', how = 'inner')

    df_genres = get_genres(df, sp)
    df = pd.merge(df, df_genres.rename(columns = {'id':'artist_id'}), on = 'artist_id', how = 'left')

    df['data_type'] = 'album'

    # per-track display columns; duration_ms comes from the audio features merge
    album_final = df[['track', 'track_id', 'duration_ms', 'spotify_link', 'preview']]

    df = df[['track', 'track_id', 'artist', 'data_type',
             'danceability', 'energy', 'loudness', 'speechiness', 'acousticness', 'instrumentalness',
             'liveness', 'valence', 'tempo', 'mode', 'key', 'time_signature', 'release_date', 'genres']] # 'popularity_artist', 'popularity_song'

    return df, album_final, album_dict

In [101]:
def clean_genres(df):
    """Replace the raw ``genres`` lists with 0/1 indicator columns.

    The genre strings of each row are joined into one text, and every keyword
    in ``genres`` that occurs in it as a substring becomes a label. A language
    group becomes a label when any of its keywords occurs, and is added at most
    once per row so that the indicators stay binary. Rows with no match, or
    with a missing ``genres`` value, get the single label ``'other'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``genres`` column holding lists of genre strings.
        Missing values (``None`` / NaN) are treated as "no genres".
        The input frame is not modified.

    Returns
    -------
    pandas.DataFrame
        ``df`` without ``genres``, plus one integer 0/1 column per label that
        occurs in the data.
    """
    # NOTE(review): 'rythm', 'dupstep' and 'singer songwriter' look like misspellings of the
    # genre names they are meant to match - confirm before changing, since it alters the features
    genres = ['folk', 'gothic', 'emo', 'metal', 'rock', 'punk', 'alternative', 'grunge', 'pop', 'hip hop',
              'country', 'bluegrass', 'swing', 'blues', 'jazz', 'gospel', 'soul', 'piano', 'rythm', 'reggae',
              'rap', 'r&b', 'edm', 'dupstep', 'techno', 'house', 'trance', 'electro', 'dance', 'disco',
              'classical', 'singer songwriter', 'musical', 'african', 'hawaiian', 'jam band', 'psychedelic']

    languages = {'asian' : ['chinese', 'japanese', 'korean', 'korean pop', 'taiwanese', 'vietnamese', 'malaysian', 'indonesian', 'thai', 'tibetan'],
                 'baltic_slavic' : ['croatian', 'czech', 'latvian', 'polish', 'serbian', 'russian', 'lithuanian', 'ukrainian', 'slovenian', 'bulgarian'],
                 'celtic' : ['irish', 'scottish', 'celtic'],
                 'english' : ['australian', 'uk', 'british', 'canadian'],
                 'germanic' : ['german', 'norwegian', 'swedish', 'dutch', 'icelandic', 'austrian', 'danish', 'belgian'],
                 'indian_pakistani' : ['indian', 'pakistani', 'punjabi', 'hindustani'],
                 'middle_eastern' : ['israeli', 'kurdish', 'hebrew', 'arab', 'turkish'],
                 'romance' : ['spanish','french', 'italian', 'romanian', 'latin', 'portuguese'],
                 'south_american' : ['brazilian', 'venezuelan', 'argentine', 'peruvian', 'chilean'],
                 'uralic' : ['finnish', 'estonian', 'hungarian']}

    genres_list = []

    for raw in df['genres']:

        # a missing value (None / NaN) is treated as an empty genre list
        if raw is None or isinstance(raw, float):
            joined = ''
        else:
            joined = ' '.join(raw)

        labels = [g for g in genres if g in joined]

        # one label per language group, however many of its keywords match
        labels.extend(group for group, keywords in languages.items()
                      if any(keyword in joined for keyword in keywords))

        if not labels:
            labels.append('other')

        genres_list.append(labels)

    # one row per (track, label), turned into indicators and collapsed back to one row per track
    exploded = pd.Series(genres_list, index = df.index, dtype = object).explode()
    dummies = pd.get_dummies(exploded, dtype = int).groupby(level = 0).sum()

    return pd.concat([df.drop(columns = ['genres']), dummies], axis = 1)

In [102]:
def _release_decade(year):
    """Map a release year to its decade label; a missing year maps to 'Date not available'."""
    if pd.isna(year):
        return 'Date not available'
    if year < 1950:
        return 'Pre_50s'
    if year < 1960:
        return '50s'
    if year < 1970:
        return '60s'
    if year < 1980:
        return '70s'
    if year < 1990:
        return '80s'
    if year < 2000:
        return '90s'
    if year < 2010:
        return '2000s'
    return 'Post_2010'


def clean_data(df):
    """Scale the numeric features and one-hot encode the categorical ones.

    * ``loudness`` is divided by 60 and negated.
    * ``tempo`` is rescaled with ``MinMaxScaler`` fitted on the frame itself.
    * ``release_date`` is reduced to its year (first four characters) and
      bucketed into a decade label. Values that cannot be parsed as a year,
      including missing ones, get the label ``'Date not available'``.
    * ``key``, ``release_date`` and ``time_signature`` become dummy columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``loudness``, ``tempo``, ``release_date``, ``key`` and
        ``time_signature``. The input frame is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the transformed and encoded columns.
    """
    # work on a copy so that re-running the cell does not rescale already scaled data
    df = df.copy()

    # bringing the following variables down to a 0-1 scale
    # df['popularity_song'] = df['popularity_song'] / 100 # currently on 0-100 scale
    # df['popularity_artist'] = df['popularity_artist'] / 100 # currently on 0-100 scale
    df['loudness'] = (df['loudness'] / 60) * -1 # currently on -60-0 scale

    # tempo is not on a scale so using a minmax scaler to bring it down to a 0-1 scale
    mms = MinMaxScaler()
    df[['tempo']] = mms.fit_transform(df[['tempo']])

    # only the year is needed: take the first four characters and coerce anything unparseable to NaN,
    # then turn the year into a categorical decade label
    years = pd.to_numeric(df['release_date'].astype(str).str[:4], errors = 'coerce')
    df['release_date'] = [_release_decade(year) for year in years]

    # converting categorical columns into dummy variables
    df = pd.get_dummies(df, columns = ['key', 'release_date', 'time_signature'])

    return df

In [103]:
def find_optimal_components(array, threshold = 0.025, default = 8):
    """Pick a number of PCA components from a cumulative explained-variance curve.

    Walks the curve and stops at the first position whose following value
    adds less than ``threshold``. The index of that position is returned, but
    never less than 1, so the result is always usable as ``n_components``.

    NOTE(review): the index (not index + 1) is returned, as in the original
    implementation - confirm that this is the intended count.

    Parameters
    ----------
    array : sequence of float
        Cumulative explained-variance ratios, one per component.
    threshold : float, default 0.025
        Smallest gain that still counts as worthwhile.
    default : int, default 8
        Returned when no gain drops below ``threshold``, or when ``array``
        has fewer than two values.

    Returns
    -------
    int
    """
    for idx, (current, following) in enumerate(zip(array[:-1], array[1:])):
        if following - current < threshold:
            # zero components would leave nothing to cluster on
            return max(idx, 1)

    return default

In [104]:
def pca(df):
    """Project the combined feature table onto its principal components.

    A first PCA is fitted only to read the cumulative explained-variance
    curve. ``find_optimal_components`` turns that curve into the number of
    components used by the second PCA. The component frame is given ``df``'s
    own index so that it lines up with the identifying columns whatever index
    ``df`` carries.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``track``, ``artist``, ``track_id`` and ``data_type``;
        every other column is treated as a feature.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(user_tracks_pca, album_tracks_pca)``: the rows with ``data_type``
        equal to ``'user_library'`` and ``'album'`` respectively. Each has the
        columns ``track``, ``artist``, ``data_type`` followed by the components
        (integer column labels), and a fresh 0..n-1 index.
    """
    X = df.drop(columns = ['track', 'artist', 'track_id', 'data_type'])

    # first pass: full PCA, used only for the explained-variance curve
    cumsum_array = np.cumsum(PCA().fit(X).explained_variance_ratio_)

    reducer = PCA(n_components = find_optimal_components(cumsum_array), random_state = 42)

    # reuse df's index so the concat below pairs each row with its own components
    components = pd.DataFrame(reducer.fit_transform(X), index = df.index)
    combined_pca = pd.concat([df[['track', 'artist', 'data_type']], components], axis = 1)

    user_tracks_pca = combined_pca.loc[combined_pca['data_type'] == 'user_library', :].reset_index(drop = True)
    album_tracks_pca = combined_pca.loc[combined_pca['data_type'] == 'album', :].reset_index(drop = True)

    return user_tracks_pca, album_tracks_pca

In [105]:
def cluster_user_tracks(df):
    """Fit KMeans for a range of cluster counts and return the best model.

    Cluster counts from 3 to 16 are tried, capped at ``len(df) - 1`` because
    the silhouette score is only defined for at most ``n_samples - 1``
    clusters. The model with the highest silhouette score is returned; on a
    tie the smaller cluster count wins. The winning model is kept from the
    search rather than fitted a second time.

    Parameters
    ----------
    df : pandas.DataFrame or array-like
        Numeric feature matrix, one row per track.

    Returns
    -------
    sklearn.cluster.KMeans
        The fitted model with the best silhouette score.

    Raises
    ------
    ValueError
        If ``df`` has fewer than 4 rows, so that not even 3 clusters can be scored.
    """
    X = df

    min_k = 3
    max_k = min(16, len(X) - 1)

    if max_k < min_k:
        raise ValueError(f'Need at least {min_k + 1} tracks to cluster, got {len(X)}')

    best_model = None
    best_score = None

    for k in range(min_k, max_k + 1):

        km = KMeans(n_clusters = k, random_state = 42)
        km.fit(X)
        ss = silhouette_score(X, km.labels_)

        # strict comparison keeps the first (smallest) k when scores tie
        if best_score is None or ss > best_score:
            best_model = km
            best_score = ss

    return best_model

In [106]:
def sort_album(user_tracks_pca, album_tracks_pca, album_final_model):
    """Rank the album's tracks by similarity to the user's listening clusters.

    The user's tracks are clustered with ``cluster_user_tracks``. Each album
    track is compared to every cluster centre by cosine similarity. The
    cluster with the highest mean similarity across the album is selected, and
    the album tracks are sorted by their similarity to that cluster, highest
    first.

    Parameters
    ----------
    user_tracks_pca : pandas.DataFrame
        Numeric component matrix of the user's tracks.
    album_tracks_pca : pandas.DataFrame
        Numeric component matrix of the album's tracks, in the same row order
        as ``album_final_model``.
    album_final_model : pandas.DataFrame
        Display columns for the album tracks, one row per track.

    Returns
    -------
    pandas.DataFrame
        ``album_final_model`` plus one score column, sorted by that score in
        descending order. The score column is labelled with the integer index
        of the selected cluster.
    """
    model = cluster_user_tracks(user_tracks_pca)

    # rows: album tracks, columns: cluster centres
    similarity_df = pd.DataFrame(cosine_similarity(np.array(album_tracks_pca), model.cluster_centers_))

    # the cluster the album resembles most on average
    best_cluster = similarity_df.mean().sort_values(ascending = False).index[0]

    song_scores = similarity_df[[best_cluster]]
    # scores are in album row order; give them the album frame's index so the concat pairs rows by position
    song_scores.index = album_final_model.index

    sorted_album = pd.concat([album_final_model, song_scores], axis = 1).sort_values(by = best_cluster, ascending = False)

    return sorted_album

In [107]:
def model(album, sp):
    """Rank an album's tracks against the current user's saved library.

    Steps: pull the user's saved tracks and the album's tracks, clean them
    together so both share the same feature columns, reduce them with PCA,
    then sort the album tracks with ``sort_album``.

    Parameters
    ----------
    album : str
        Search text for the album to rank.
    sp : client passed to ``get_user_tracks_w_audio_features`` and ``get_album_tracks``.

    Returns
    -------
    pandas.DataFrame
        The value returned by ``sort_album``.
    """
    user_tracks = get_user_tracks_w_audio_features(sp)

    # search for the album the caller asked for; the album info dict is not needed here
    album_tracks, album_final_model, _ = get_album_tracks(album, sp)

    # user and album tracks are cleaned together so they end up with identical columns
    combined_tracks = pd.concat([user_tracks, album_tracks], axis = 0, ignore_index = True)
    combined_tracks = clean_genres(combined_tracks)
    combined_tracks = clean_data(combined_tracks)

    user_tracks_pca, album_tracks_pca = pca(combined_tracks)

    # keep only the numeric components for clustering and similarity
    id_columns = ['track', 'artist', 'data_type']
    user_tracks_pca = user_tracks_pca.drop(columns = id_columns)
    album_tracks_pca = album_tracks_pca.drop(columns = id_columns)

    sorted_album = sort_album(user_tracks_pca, album_tracks_pca, album_final_model)

    return sorted_album

In [108]:
%%time
# Run the full pipeline for one album and keep the ranked tracks in z;
# the %%time cell magic reports how long the run takes.
z = model('Man on the Moon III: The Chosen', sp)

CPU times: user 7.28 s, sys: 1.14 s, total: 8.42 s
Wall time: 18.4 s


In [115]:
# Display the ranked tracks; reset_index() moves the existing index labels into an 'index' column.
z.reset_index()

Unnamed: 0,index,track,track_id,duration_ms,spotify_link,preview,5
0,11,Sept. 16,3Uw2se3aQU1UFrpRBvBnB4,249040,https://open.spotify.com/track/3Uw2se3aQU1UFrp...,https://p.scdn.co/mp3-preview/62578afedb81632e...,0.990583
1,8,"Solo Dolo, Pt. III",27oVCAziETRbNuo5A8LNpg,242267,https://open.spotify.com/track/27oVCAziETRbNuo...,https://p.scdn.co/mp3-preview/c461629a7f1acf31...,0.98722
2,4,Dive,7Hc3YL8oDiAzbiAW32KXrw,148707,https://open.spotify.com/track/7Hc3YL8oDiAzbiA...,https://p.scdn.co/mp3-preview/89e0c1d79d7d17f4...,0.984029
3,2,Another Day,6myUpr3GDR80Dg3zqNTmmG,199787,https://open.spotify.com/track/6myUpr3GDR80Dg3...,https://p.scdn.co/mp3-preview/0270d8150fcbd742...,0.983613
4,15,Rockstar Knights (with Trippie Redd),4J9SI7do4KOEsCexqEbjmR,231467,https://open.spotify.com/track/4J9SI7do4KOEsCe...,https://p.scdn.co/mp3-preview/d0610339176bd8c2...,0.965621
5,1,Tequila Shots,30KctD1WsHKTIYczXjip5a,193293,https://open.spotify.com/track/30KctD1WsHKTIYc...,https://p.scdn.co/mp3-preview/eea2485b714dffab...,0.965246
6,3,She Knows This,1xzUQMiCoY5pdego0pHMeV,216560,https://open.spotify.com/track/1xzUQMiCoY5pdeg...,https://p.scdn.co/mp3-preview/29dc43127abc4fde...,0.964137
7,9,Sad People,4nuAslShoN77tq12fzwjUq,176027,https://open.spotify.com/track/4nuAslShoN77tq1...,https://p.scdn.co/mp3-preview/a2e8a2ffade8b892...,0.956021
8,10,Elsie's Baby Boy (flashback),6jiwr6xTHqjdun5d3cEwXV,219213,https://open.spotify.com/track/6jiwr6xTHqjdun5...,https://p.scdn.co/mp3-preview/62530a94d72e2613...,0.950617
9,12,The Void,2yg7MXp8nSPaf61HVkhEr3,325413,https://open.spotify.com/track/2yg7MXp8nSPaf61...,https://p.scdn.co/mp3-preview/698c2b64f5d5d4b0...,0.937683
