In [16]:
import os
from math import floor

import numpy as np
import pandas as pd
import spotipy
from PIL import Image
from requests import get
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.metrics import silhouette_score
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
from spotipy.oauth2 import SpotifyOAuth

In [17]:
# Spotify app credentials are read from the environment so they never live in the notebook.
# NOTE(review): the client id and secret that used to be written in this cell were exposed
# in plain text and should be rotated in the Spotify developer dashboard.
cid = os.environ['SPOTIPY_CLIENT_ID']
secret = os.environ['SPOTIPY_CLIENT_SECRET']
# The redirect URI is not secret; keep the previous value as the default.
uri = os.environ.get('SPOTIPY_REDIRECT_URI', 'http://localhost:9999/lab/workspaces/auto-c')
cache = '.spotipyoauthcache'
scope = 'user-read-recently-played user-library-read user-top-read'

# OAuth manager that caches the token on disk at `cache`
auth = SpotifyOAuth(client_id = cid,
                    client_secret = secret,
                    redirect_uri = uri,
                    cache_path = cache,
                    scope = scope)

# Authenticated client used by every helper below
sp = spotipy.Spotify(oauth_manager = auth)

# Profile of the authenticated user
user = sp.current_user()
user_name = user['display_name']
user_profile = user['external_urls']['spotify']

In [18]:
# Rebuild the OAuth manager and the client from the settings defined in the previous cell.
# NOTE(review): this repeats the construction done in the previous cell.
auth = SpotifyOAuth(
    client_id=cid,
    client_secret=secret,
    redirect_uri=uri,
    cache_path=cache,
    scope=scope,
)
sp = spotipy.Spotify(oauth_manager=auth)

In [19]:
# Display the profile dictionary stored in `user`.
# NOTE(review): the saved output below exposes profile details (name, id, image URL);
# consider clearing it before sharing the notebook.
user

{'display_name': 'Aziz Maredia',
 'external_urls': {'spotify': 'https://open.spotify.com/user/1282060363'},
 'followers': {'href': None, 'total': 36},
 'href': 'https://api.spotify.com/v1/users/1282060363',
 'id': '1282060363',
 'images': [{'height': None,
   'url': 'https://scontent-ort2-1.xx.fbcdn.net/v/t1.0-1/p320x320/79640598_10158654118142079_1999166169432457216_n.jpg?_nc_cat=104&ccb=2&_nc_sid=0c64ff&_nc_ohc=ZQsONbuiFA0AX9ABoSl&_nc_ht=scontent-ort2-1.xx&tp=6&oh=be86373831fe05fbe2d6b7b4488f3d49&oe=602FDF29',
   'width': None}],
 'type': 'user',
 'uri': 'spotify:user:1282060363'}

In [20]:
def get_current_saved(var = None):
    """Fetch every track saved in the current user's Spotify library.

    Pages through ``sp.current_user_saved_tracks`` 50 tracks at a time and
    stops as soon as the API reports that there is no further page, so no
    extra request is spent on detecting the end. There is no hard cap on the
    number of tracks collected.

    Parameters
    ----------
    var : object, optional
        Unused; kept so existing calls that pass an argument keep working.

    Returns
    -------
    pandas.DataFrame
        One row per saved track with the columns ``track_id``, ``artist_id``
        (first listed artist), ``release_date`` (of the album) and
        ``popularity_song``. An empty library yields an empty frame that
        still has these four columns.
    """
    
    columns = ['track_id', 'artist_id', 'release_date', 'popularity_song']
    page_size = 50  # largest page size the saved-tracks endpoint accepts
    
    records = []
    offset = 0
    
    while True:
        results = sp.current_user_saved_tracks(limit = page_size, offset = offset)
        items = results['items']
        
        for item in items:
            track = item['track']
            records.append({'track_id': track['id'],
                            'artist_id': track['artists'][0]['id'],
                            'release_date': track['album']['release_date'],
                            'popularity_song': track['popularity']})
        
        # An empty page always ends the loop; a missing/empty 'next' link ends it
        # one request earlier when the API provides paging information.
        if not items:
            break
        if 'next' in results and not results['next']:
            break
        
        offset += len(items)
    
    # Build the frame once instead of concatenating page by page
    return pd.DataFrame(records, columns = columns)

In [21]:
def get_audio_features(all_tracks_df):
    """Fetch Spotify audio features for the tracks listed in a DataFrame.

    Track ids are sent to ``sp.audio_features`` in batches of at most 100.
    No request is made for an empty batch, so inputs whose length is zero or
    an exact multiple of 100 no longer trigger a call with no ids.

    Parameters
    ----------
    all_tracks_df : pandas.DataFrame
        Must contain a ``track_id`` column. Missing ids (``None``/NaN) are
        skipped because they cannot be looked up.

    Returns
    -------
    pandas.DataFrame
        One row per track the API returned features for, with a fresh
        0..n-1 index and the columns listed in ``keep_features`` (the track
        id is in the ``id`` column). Tracks for which the API returns no
        features are omitted. Empty input yields an empty frame with the
        same columns.
    """
        
    keep_features = ['danceability', 'energy', 'key', 'loudness', 'mode',
                     'speechiness', 'acousticness', 'instrumentalness', 'liveness',
                     'valence', 'tempo', 'id', 'duration_ms', 'time_signature']
    batch_size = 100  # maximum number of ids per audio-features request
    
    track_ids = all_tracks_df['track_id'].dropna().tolist()
    
    rows = []
    for start in range(0, len(track_ids), batch_size):
        results = sp.audio_features(tracks = track_ids[start:start + batch_size])
        
        # The response holds None in place of tracks without audio features
        for features in results or []:
            if features is not None:
                rows.append({name: features[name] for name in keep_features})
    
    # Build the frame once; `columns` keeps the schema stable for empty results
    return pd.DataFrame(rows, columns = keep_features)

In [22]:
def get_genres(all_tracks_df):
    """Fetch popularity and genres for every distinct artist in a DataFrame.

    Distinct artist ids are taken in first-seen order (so the result is
    deterministic) and sent to ``sp.artists`` in batches of at most 50. No
    request is made for an empty batch.

    Parameters
    ----------
    all_tracks_df : pandas.DataFrame
        Must contain an ``artist_id`` column. Missing ids (``None``/NaN) are
        skipped.

    Returns
    -------
    pandas.DataFrame
        One row per artist returned by the API with the columns ``id``,
        ``popularity_artist`` and ``genres`` and a fresh 0..n-1 index.
        Empty input yields an empty frame with the same columns.
    """
        
    keep_features = ['id', 'popularity', 'genres']
    batch_size = 50  # maximum number of ids per artists request
    
    # dict.fromkeys de-duplicates while preserving order, unlike set()
    artists = list(dict.fromkeys(all_tracks_df['artist_id'].dropna()))
    
    rows = []
    for start in range(0, len(artists), batch_size):
        results = sp.artists(artists = artists[start:start + batch_size])
        
        # Skip entries the API could not resolve (returned as None)
        for artist in results['artists']:
            if artist is not None:
                rows.append({name: artist[name] for name in keep_features})
    
    df = pd.DataFrame(rows, columns = keep_features)
    
    return df.rename(columns = {'popularity': 'popularity_artist'})

In [26]:
def get_user_tracks_w_audio_features(var = None):
    """Assemble the modelling table for the user's saved tracks.

    Combines the saved-track list with per-track audio features (inner join
    on ``track_id``) and per-artist popularity/genres (left join on
    ``artist_id``), tags every row with ``data_type == 'user_library'`` and
    returns the columns in a fixed order.

    Parameters
    ----------
    var : object, optional
        Unused.

    Returns
    -------
    pandas.DataFrame
        The combined table restricted to the modelling columns.
    """
    
    ordered_columns = ['data_type', 'duration_ms', 'popularity_artist', 'popularity_song',
                       'danceability', 'energy', 'loudness', 'mode', 'speechiness',
                       'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo',
                       'key', 'time_signature', 'release_date', 'genres']
    
    library = get_current_saved()
    
    # Tracks without audio features drop out here (inner join)
    features = get_audio_features(library).rename(columns = {'id':'track_id'})
    combined = library.merge(features, on = 'track_id', how = 'inner')
    
    # Artists without a match keep their row with missing artist columns (left join)
    artist_info = get_genres(library).rename(columns = {'id':'artist_id'})
    combined = combined.merge(artist_info, on = 'artist_id', how = 'left')
    
    combined['data_type'] = 'user_library'
    
    return combined[ordered_columns]

In [27]:
%%time
# Build the saved-tracks feature table; the cell is timed by the %%time magic above
user_tracks_df = get_user_tracks_w_audio_features()

CPU times: user 894 ms, sys: 78.2 ms, total: 972 ms
Wall time: 20.4 s


In [45]:
def _format_album_duration(total_ms):
    """Format a millisecond total as 'H hr M min' (one hour or more) or 'M min S sec'."""
    # Integer arithmetic avoids float rounding in the seconds part
    minutes, seconds = divmod(int(total_ms) // 1000, 60)
    if minutes >= 60:
        hours, minutes = divmod(minutes, 60)
        return f'{hours} hr {minutes} min'
    return f'{minutes} min {seconds} sec'


def get_album_tracks(album_to_search):
    """Look up an album on Spotify and build its track-level feature tables.

    Takes the first album hit for the search term, collects all of its tracks
    (following paging links for long albums), and enriches them with song
    popularity, audio features (inner join on ``track_id``) and artist
    popularity/genres (left join on ``artist_id``).

    Parameters
    ----------
    album_to_search : str
        Free-text query passed to the Spotify album search.

    Returns
    -------
    df1 : pandas.DataFrame
        Modelling columns for each track, with ``data_type == 'album'``.
    df2 : pandas.DataFrame
        Display columns ``track``, ``track_id``, ``spotify_link``, ``preview``.
    album_dict : dict
        Album summary with the keys ``album_name``, ``artist_name``,
        ``release_year``, ``total_tracks``, ``album_link``, ``artist_link``,
        ``album_image`` (300x300 image, or ``None`` when the album lists no
        images) and ``album_duration``.

    Raises
    ------
    IndexError
        If the search returns no album.
    """

    results = sp.search(album_to_search, limit = 1, offset = 0, type = 'album', market = None)
    matches = results['albums']['items']
    if not matches:
        raise IndexError(f'No album found for search term {album_to_search!r}')
    album_id = matches[0]['id']

    results2 = sp.album(album_id)

    # The album payload embeds only the first page of tracks; follow 'next' for the rest
    page = results2['tracks']
    track_items = list(page['items'])
    while page.get('next'):
        page = sp.next(page)
        track_items.extend(page['items'])

    df = pd.DataFrame({
        'track': [t['name'] for t in track_items],
        'artist': [t['artists'][0]['name'] for t in track_items],
        'artist_id': [t['artists'][0]['id'] for t in track_items],
        'track_id': [t['id'] for t in track_items],
        'spotify_link': [t['external_urls']['spotify'] for t in track_items],
        'preview': [t['preview_url'] for t in track_items],
    })
    df['release_date'] = results2['release_date']

    album_dict = {}

    album_dict['album_name'] = results2['name'] # album title
    album_dict['artist_name'] = results2['artists'][0]['name'] # artist name
    album_dict['release_year'] = results2['release_date'][:4] # release_year
    album_dict['total_tracks'] = results2['total_tracks'] # total tracks
    album_dict['album_link'] = results2['external_urls']['spotify']
    album_dict['artist_link'] = results2['artists'][0]['external_urls']['spotify']

    images = results2['images']
    if images:
        # Use the second listed image as before, falling back to the first
        # when the album lists only one
        image_url = images[min(1, len(images) - 1)]['url']
        album_image = Image.open(get(image_url, stream = True).raw)
        album_dict['album_image'] = album_image.resize((300, 300))
    else:
        album_dict['album_image'] = None

    album_dict['album_duration'] = _format_album_duration(sum(t['duration_ms'] for t in track_items))

    # The tracks endpoint accepts at most 50 ids per request
    track_ids = list(df['track_id'])
    popularity = []
    for start in range(0, len(track_ids), 50):
        results3 = sp.tracks(tracks = track_ids[start:start + 50])
        popularity.extend(t['popularity'] for t in results3['tracks'])
    df['popularity_song'] = popularity

    df_audio = get_audio_features(df)
    df = pd.merge(df, df_audio.rename(columns = {'id':'track_id'}), on = 'track_id', how = 'inner')

    df_genres = get_genres(df)
    df = pd.merge(df, df_genres.rename(columns = {'id':'artist_id'}), on = 'artist_id', how = 'left')

    df['data_type'] = 'album'

    df1 = df[['data_type', 'duration_ms', 'popularity_artist',
              'popularity_song', 'danceability', 'energy', 'loudness', 'speechiness', 'acousticness',
              'instrumentalness', 'liveness', 'valence', 'tempo', 'mode', 'key', 'time_signature', 'release_date', 'genres']]

    df2 = df[['track', 'track_id', 'spotify_link', 'preview']]

    return df1, df2, album_dict

In [46]:
%%time
# Fetch the album's modelling table, its display table and its summary dict;
# the cell is timed by the %%time magic above
album_df, album_sort_df, album_dict = get_album_tracks('Man on the Moon III: The Chosen')

CPU times: user 68.5 ms, sys: 7.71 ms, total: 76.2 ms
Wall time: 1.01 s


In [33]:
def _release_period(release_date):
    """Map a release date (string starting with the year, or a year) to a period label."""
    try:
        year = int(str(release_date)[:4])
    except (TypeError, ValueError):
        # Missing or malformed dates (None, NaN, '') end up here
        return 'Date not available'
    
    if year < 1950:
        return 'Pre_50s'
    if year >= 2010:
        return 'Post_2010s'
    if year >= 2000:
        return '2000s'
    # 1950-1999 -> '50s', '60s', ..., '90s'
    return f'{(year % 100) // 10 * 10}s'


def clean_data(df):
    """Scale and encode a track feature table for modelling.

    Works on a copy, so the caller's frame is left untouched and the cell can
    be re-run safely.

    Steps: popularity columns are divided by 100; ``loudness`` is divided by
    60 and sign-flipped; ``release_date`` is replaced by a period label
    ('Pre_50s', '50s' ... '90s', '2000s', 'Post_2010s', or
    'Date not available' for missing/unparseable dates); ``key``,
    ``release_date`` and ``time_signature`` are one-hot encoded; ``tempo``
    and ``duration_ms`` are min-max scaled over the rows of this frame.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain ``popularity_song``, ``popularity_artist``, ``loudness``,
        ``release_date``, ``key``, ``time_signature``, ``tempo`` and
        ``duration_ms``.

    Returns
    -------
    pandas.DataFrame
        A new, transformed frame.
    """
    
    cleaned = df.copy()
    
    cleaned['popularity_song'] = cleaned['popularity_song'] / 100
    cleaned['popularity_artist'] = cleaned['popularity_artist'] / 100
    cleaned['loudness'] = (cleaned['loudness'] / 60) * -1
    
    cleaned['release_date'] = [_release_period(date) for date in cleaned['release_date']]
    
    cleaned = pd.get_dummies(cleaned, columns = ['key', 'release_date', 'time_signature'])
    
    # The scaler cannot be fitted on zero rows; an empty frame has nothing to scale
    if len(cleaned) > 0:
        mms = MinMaxScaler()
        cleaned[['tempo', 'duration_ms']] = mms.fit_transform(cleaned[['tempo', 'duration_ms']])
    
    return cleaned

In [76]:
def cluster_user_tracks(df, k_min = 3, k_max = 16, random_state = 42):
    """Fit KMeans for a range of cluster counts and return the best model.

    Each candidate ``k`` is fitted once and scored with the silhouette
    score; the fitted model with the highest score is returned (the smallest
    ``k`` wins ties). The upper end of the range is capped at
    ``n_samples - 1`` so that small inputs no longer fail.

    Parameters
    ----------
    df : array-like of shape (n_samples, n_features)
        Feature matrix to cluster.
    k_min : int, default 3
        Smallest number of clusters to try (values below 2 are raised to 2).
    k_max : int, default 16
        Largest number of clusters to try.
    random_state : int, default 42
        Seed passed to every ``KMeans`` instance.

    Returns
    -------
    sklearn.cluster.KMeans
        The fitted model with the highest silhouette score.

    Raises
    ------
    ValueError
        If there are too few samples to try any ``k`` in the requested range.
    """
        
    X = df
    n_samples = len(X)
    
    # The silhouette score is only defined for 2 <= n_clusters <= n_samples - 1
    lower = max(k_min, 2)
    upper = min(k_max, n_samples - 1)
    
    if upper < lower:
        raise ValueError(f'Need at least {lower + 1} samples to cluster with k >= {lower}; got {n_samples}')
    
    best_model = None
    best_score = None

    for k in range(lower, upper + 1):
            
        km = KMeans(n_clusters = k, random_state = random_state)
        km.fit(X)
        ss = silhouette_score(X, km.labels_)

        # Strict '>' keeps the first (smallest) k on ties
        if best_score is None or ss > best_score:
            best_model = km
            best_score = ss
    
    return best_model
                

In [84]:
def sort_album(album, df, album_tracks_pca):
    """Rank an album's tracks by similarity to the user's best-matching cluster.

    The user's tracks are clustered with ``cluster_user_tracks``; each album
    track is compared to every cluster centroid with cosine similarity; the
    cluster with the highest mean similarity across the album is selected
    (the first one on ties) and the tracks are sorted by their similarity to
    it, best first.

    Parameters
    ----------
    album : pandas.DataFrame
        One row per album track, with a ``track`` column; rows must be in the
        same order as ``album_tracks_pca``.
    df : array-like
        Feature matrix of the user's tracks, passed to ``cluster_user_tracks``.
    album_tracks_pca : array-like
        Feature matrix of the album tracks, in the same feature space as
        ``df``.

    Returns
    -------
    pandas.DataFrame
        ``album`` indexed by ``track`` with an added ``optimal_cluster``
        column holding the similarity score, sorted in descending order.
    """
    
    model = cluster_user_tracks(df)
    centroids = model.cluster_centers_
    
    cluster_names = [f'Cluster {i}' for i in range(1, len(centroids) + 1)]
    
    similarity_df = pd.DataFrame(cosine_similarity(np.array(album_tracks_pca), centroids),
                                 index = album['track'],
                                 columns = cluster_names)
    
    # Cluster whose centroid is, on average, closest to the album's tracks
    best_cluster = similarity_df.mean().idxmax()
    song_scores = similarity_df[[best_cluster]].rename(columns = {best_cluster: 'optimal_cluster'})
    
    album = album.set_index('track')
    
    sorted_album = pd.concat([album, song_scores], axis = 1).sort_values(by = 'optimal_cluster', ascending = False)

    return sorted_album

In [79]:
# Load the saved album table and the album feature table from CSV.
# NOTE(review): presumably album_tracks_pca.csv holds the PCA-transformed album features — confirm
album = pd.read_csv('.././datasets/modeling/album_sort_model.csv')
album_tracks_pca = pd.read_csv('.././datasets/modeling/album_tracks_pca.csv')

In [82]:
# Keep only the numeric feature columns. errors = 'ignore' makes the cell safe to re-run:
# on a second run the columns are already gone and a plain drop would raise KeyError.
album_tracks_pca = album_tracks_pca.drop(columns = ['track', 'data_type'], errors = 'ignore')

In [85]:
# Rank the album's tracks against the user's library clusters.
# NOTE(review): user_tracks_pca is not defined in any cell shown in this notebook —
# presumably it came from a cell that was removed or run elsewhere; confirm before Restart & Run All
sort_album(album, user_tracks_pca, album_tracks_pca)

Unnamed: 0_level_0,track_id,spotify_link,preview,optimal_cluster
track,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"Solo Dolo, Pt. III",27oVCAziETRbNuo5A8LNpg,https://open.spotify.com/track/27oVCAziETRbNuo...,https://p.scdn.co/mp3-preview/c461629a7f1acf31...,0.993172
Sept. 16,3Uw2se3aQU1UFrpRBvBnB4,https://open.spotify.com/track/3Uw2se3aQU1UFrp...,https://p.scdn.co/mp3-preview/62578afedb81632e...,0.991973
Beautiful Trip,4IIuCotvqijraSdnVLaFnM,https://open.spotify.com/track/4IIuCotvqijraSd...,https://p.scdn.co/mp3-preview/05edcc06fc085a58...,0.991546
Tequila Shots,30KctD1WsHKTIYczXjip5a,https://open.spotify.com/track/30KctD1WsHKTIYc...,https://p.scdn.co/mp3-preview/eea2485b714dffab...,0.989563
Dive,7Hc3YL8oDiAzbiAW32KXrw,https://open.spotify.com/track/7Hc3YL8oDiAzbiA...,https://p.scdn.co/mp3-preview/89e0c1d79d7d17f4...,0.988668
Sad People,4nuAslShoN77tq12fzwjUq,https://open.spotify.com/track/4nuAslShoN77tq1...,https://p.scdn.co/mp3-preview/a2e8a2ffade8b892...,0.98782
Another Day,6myUpr3GDR80Dg3zqNTmmG,https://open.spotify.com/track/6myUpr3GDR80Dg3...,https://p.scdn.co/mp3-preview/0270d8150fcbd742...,0.987305
She Knows This,1xzUQMiCoY5pdego0pHMeV,https://open.spotify.com/track/1xzUQMiCoY5pdeg...,https://p.scdn.co/mp3-preview/29dc43127abc4fde...,0.981572
Rockstar Knights (with Trippie Redd),4J9SI7do4KOEsCexqEbjmR,https://open.spotify.com/track/4J9SI7do4KOEsCe...,https://p.scdn.co/mp3-preview/d0610339176bd8c2...,0.980678
Elsie's Baby Boy (flashback),6jiwr6xTHqjdun5d3cEwXV,https://open.spotify.com/track/6jiwr6xTHqjdun5...,https://p.scdn.co/mp3-preview/62530a94d72e2613...,0.973779


In [86]:
def sort_album(album, df = None, album_tracks_pca = None):
    """Rank an album's tracks by similarity to the user's best-matching cluster.

    NOTE(review): this cell redefines ``sort_album`` and therefore shadows the
    three-argument definition in the earlier cell. The previous body here
    could not run (it called ``cluster_user_tracks()`` without its required
    argument and unpacked its single return value into two names). It now
    accepts both call styles and returns the same layout as the earlier
    definition; consider deleting one of the two cells.

    Parameters
    ----------
    album : pandas.DataFrame
        One row per album track, with a ``track`` column; rows must be in the
        same order as ``album_tracks_pca``.
    df : array-like, optional
        Feature matrix of the user's tracks, passed to
        ``cluster_user_tracks``. Defaults to the notebook-level
        ``user_tracks_pca``.
    album_tracks_pca : array-like, optional
        Feature matrix of the album tracks. Defaults to the notebook-level
        variable of the same name.

    Returns
    -------
    pandas.DataFrame
        ``album`` indexed by ``track`` with an added ``optimal_cluster``
        column holding the cosine similarity to the selected cluster, sorted
        in descending order.
    """
    
    if df is None:
        # NOTE(review): assumes the user feature matrix lives in user_tracks_pca,
        # as in the three-argument call made earlier in the notebook — confirm
        df = user_tracks_pca
    if album_tracks_pca is None:
        # The parameter shadows the notebook variable of the same name, so read it explicitly
        album_tracks_pca = globals()['album_tracks_pca']
    
    model = cluster_user_tracks(df)
    centroids = model.cluster_centers_
    
    cluster_names = [f'Cluster {i}' for i in range(1, len(centroids) + 1)]
    
    similarity_df = pd.DataFrame(cosine_similarity(np.array(album_tracks_pca), centroids),
                                 index = album['track'],
                                 columns = cluster_names)
    
    # Cluster whose centroid is, on average, closest to the album's tracks
    best_cluster = similarity_df.mean().idxmax()
    song_scores = similarity_df[[best_cluster]].rename(columns = {best_cluster: 'optimal_cluster'})
    
    album = album.set_index('track')
    
    sorted_album = pd.concat([album, song_scores], axis = 1).sort_values(by = 'optimal_cluster', ascending = False)

    return sorted_album