In [103]:
import ast
import itertools
import json
import os
import re
import sys
import warnings

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import spotipy
import spotipy.util as util
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
from spotipy.oauth2 import SpotifyClientCredentials
from spotipy.oauth2 import SpotifyOAuth

warnings.filterwarnings("ignore")

In [104]:
%matplotlib inline

In [105]:
#If you're not familiar with this, save it! Makes using jupyter notebook on laptops much easier
# Import from IPython.display: IPython.core.display is the deprecated location for display/HTML.
from IPython.display import display, HTML
display(HTML("<style>.container { width:90% !important; }</style>"))

In [106]:
#another useful command to make data exploration easier
# NOTE: if you are using a massive dataset, this could slow down your code. 
# Use fully-qualified option names: a bare "max_rows" is treated as a pattern and
# raises OptionError on pandas versions where it matches more than one option.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [107]:
# Raw string so the backslashes in the Windows path are never read as escape sequences
# ('\s' and '\d' are invalid escapes and trigger a warning in a normal string literal).
spotify_df = pd.read_csv(r'E:\spotify-recommendation-system-main (1)\spotify-recommendation-system-main\data.csv')

In [108]:
#spotify_df.head()

In [109]:
# Raw string so the backslashes in the Windows path are never read as escape sequences.
data_w_genre = pd.read_csv(r'E:\spotify-recommendation-system-main (1)\spotify-recommendation-system-main\data_w_genres.csv')
#data_w_genre.head()

In [110]:
#data_w_genre['genres'].values[0]

In [111]:
#To check if this is actually a list, let me index it and see what it returns
#data_w_genre['genres'].values[0][0]

In [112]:
# The genres column stores the text form of a Python list. Parse it as a literal instead of
# scanning for single quotes, so genres that contain an apostrophe (written with double quotes
# in the list text) are kept intact. Spaces become underscores so each genre is one word.
data_w_genre['genres_upd'] = data_w_genre['genres'].apply(
    lambda x: [genre.replace(' ', '_') for genre in ast.literal_eval(x)]
)

In [113]:
#data_w_genre['genres_upd'].values[0][0]

In [114]:
# First-pass parse of the stringified artist list: collect every substring enclosed in single quotes.
# Entries written with double quotes are not captured by this pattern and yield an empty list.
spotify_df['artists_upd_v1'] = spotify_df['artists'].apply(lambda x: re.findall(r"'([^']*)'", x))


In [115]:
#spotify_df['artists'].values[0]

In [116]:
#spotify_df['artists_upd_v1'].values[0][0]

In [117]:
# Preview the first rows whose single-quote parse came back empty (an empty list is falsy).
spotify_df[spotify_df['artists_upd_v1'].apply(lambda x: not x)].head(5)

Unnamed: 0,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,key,liveness,loudness,mode,name,popularity,release_date,speechiness,tempo,valence,year,artists_upd_v1
127,0.995,"[""Sam Manning's and His Cole Jazz Orchestra""]",0.664,173333,0.283,0,42WDMm9hX0xCFkkKpt6NOY,0.874,8,0.109,-18.301,0,Bungo,0,1930-01-01,0.0807,99.506,0.688,1930,[]
180,0.984,"[""Scarlet D'Carpio""]",0.4,142443,0.19,0,4Gcc2YB0AAlzPLQhosdyAw,0.9,0,0.182,-12.062,1,Chililin Uth'aja,0,1930,0.0492,81.29,0.402,1930,[]
1244,0.506,"[""Original Broadway Cast Of 'Flahooley""]",0.519,35227,0.475,0,1Qt9zpHUfVqMNr25EU9IFL,0.071,7,0.103,-9.553,0,Prologue,0,1951-01-01,0.107,105.639,0.615,1951,[]
1478,0.809,"[""Cal Tjader's Modern Mambo Quintet""]",0.795,238200,0.386,0,5VeW5QJDW906P5knRgJWzt,0.874,1,0.106,-14.984,1,Dearly Beloved,2,1954-09-11,0.057,119.8,0.807,1954,[]
1944,0.804,"[""Screamin' Jay Hawkins""]",0.574,142893,0.401,0,6MC85zBk1dQqnywRDdzy7h,2e-05,2,0.546,-11.185,1,I Love Paris,14,1958,0.0533,89.848,0.587,1958,[]


In [118]:
# Names captured between double quotes; kept as a column for inspection.
spotify_df['artists_upd_v2'] = spotify_df['artists'].apply(lambda x: re.findall('\"(.*?)\"',x))
# The artists column stores the text form of a Python list, so parse it as a literal.
# Picking between the single-quote and double-quote regex results row by row loses or mangles
# artists whenever one list mixes both quoting styles (e.g. one name containing an apostrophe
# next to names that do not).
spotify_df['artists_upd'] = spotify_df['artists'].apply(ast.literal_eval)

In [119]:
#need to create my own song identifier because there are duplicates of the same song with different ids.
#The identifier is the first parsed artist name concatenated directly with the track name.
spotify_df['artists_song'] = spotify_df.apply(lambda row: row['artists_upd'][0]+row['name'],axis = 1)

In [120]:
# Sort in place by song identifier and then release_date, both descending.
spotify_df.sort_values(['artists_song','release_date'], ascending = False, inplace = True)

In [121]:
#spotify_df[spotify_df['name']=='Adore You']

In [122]:
# Keep only the first row per song identifier, in place; after the descending sort above
# that is the row with the greatest release_date value for each identifier.
spotify_df.drop_duplicates('artists_song',inplace = True)

In [123]:
#spotify_df[spotify_df['name']=='Adore You']

In [124]:
# One row per (artist, track id) pair: each element of the artists_upd list gets its own row.
artists_exploded = spotify_df[['artists_upd','id']].explode('artists_upd')

In [125]:
# Attach the per-artist genre data by artist name; the left join keeps artists with no match.
artists_exploded_enriched = artists_exploded.merge(data_w_genre, how = 'left', left_on = 'artists_upd',right_on = 'artists')
# Drop the pairs for which the join found no genre list.
artists_exploded_enriched_nonnull = artists_exploded_enriched[~artists_exploded_enriched.genres_upd.isnull()]

In [126]:
#artists_exploded_enriched_nonnull[artists_exploded_enriched_nonnull['id'] =='6KuQTIu1KoTTkLXKrwlLPV']

In [127]:
# Per track id, gather the genre lists of all its matched artists into one list of lists.
artists_genres_consolidated = artists_exploded_enriched_nonnull.groupby('id')['genres_upd'].apply(list).reset_index()

In [128]:
# Flatten each list of lists and de-duplicate through a set; the resulting genre order is therefore arbitrary.
artists_genres_consolidated['consolidates_genre_lists'] = artists_genres_consolidated['genres_upd'].apply(lambda x: list(set(list(itertools.chain.from_iterable(x)))))

In [129]:
#artists_genres_consolidated.head()

In [130]:
# Bring the consolidated genre list onto each track by id; tracks without any matched artist get a null here.
spotify_df = spotify_df.merge(artists_genres_consolidated[['id','consolidates_genre_lists']], on = 'id',how = 'left')

In [131]:
#spotify_df.tail()

In [132]:
# Overwrite year with the text before the first '-' of release_date, so year is stored as a string.
spotify_df['year'] = spotify_df['release_date'].apply(lambda x: x.split('-')[0])

In [133]:
# Names of all columns whose dtype is float64; later passed to create_feature_set as the columns to scale.
float_cols = spotify_df.dtypes[spotify_df.dtypes == 'float64'].index.values

In [134]:
# NOTE(review): ohe_cols is not referenced anywhere else in the visible part of the notebook.
ohe_cols = 'popularity'

In [135]:
#spotify_df['popularity'].describe()

In [136]:
# create 5 point buckets for popularity: divide by 5 and truncate, so 0-4 -> 0, 5-9 -> 1, and so on
# (assumes popularity is non-negative — TODO confirm)
spotify_df['popularity_red'] = spotify_df['popularity'].apply(lambda x: int(x/5))

In [137]:
# tfidf can't handle nulls so fill any null values with an empty list
# (any value that is not already a list is replaced by [])
spotify_df['consolidates_genre_lists'] = spotify_df['consolidates_genre_lists'].apply(lambda d: d if isinstance(d, list) else [])

In [138]:
#spotify_df.head()

In [139]:
#simple function to create OHE features
#this gets passed later on
def ohe_prep(df, column, new_name): 
    """ 
    Build a one-hot encoded frame from a single column.

    Parameters: 
        df (pandas dataframe): Spotify Dataframe
        column (str): name of the column to encode
        new_name (str): prefix put in front of every encoded value, joined with "|"
        
    Returns: 
        pandas dataframe: one indicator column per distinct value of `column`,
        named "<new_name>|<value>", on a fresh 0..n-1 index
    """
    encoded = pd.get_dummies(df[column])
    encoded.columns = ["|".join((new_name, str(label))) for label in encoded.columns]
    return encoded.reset_index(drop = True)


In [140]:
#from IPython.display import Image
#Image("E:\spotify-recommendation-system-main (1)\spotify-recommendation-system-main/tfidf_4.png")

In [141]:
#function to build entire feature set
def create_feature_set(df, float_cols, year_weight=0.5, popularity_weight=0.15, float_weight=0.2):
    """ 
    Process spotify df to create a final set of features that will be used to generate recommendations

    The result is the column-wise concatenation of: TF-IDF genre features, min-max scaled float
    columns, one-hot popularity buckets and one-hot years, followed by the song id.

    Parameters: 
        df (pandas dataframe): Spotify Dataframe; must contain 'consolidates_genre_lists',
            'year', 'popularity_red', 'id' and every column named in float_cols
        float_cols (list(str)): List of float columns that will be scaled 
        year_weight (float): multiplier applied to the one-hot year features
        popularity_weight (float): multiplier applied to the one-hot popularity-bucket features
        float_weight (float): multiplier applied to the scaled float features
        
    Returns: 
        final: final set of features, one row per row of df, with the song 'id' as last column
    """
    
    #tfidf genre lists
    # NOTE(review): the vectorizer is used with its default tokenizer, which presumably splits
    # genre names on characters such as '-' — confirm that is the intended granularity.
    tfidf = TfidfVectorizer()
    tfidf_matrix =  tfidf.fit_transform(df['consolidates_genre_lists'].apply(lambda x: " ".join(x)))
    # get_feature_names() was removed from scikit-learn; use its replacement when available
    # and only fall back to the old name on versions that predate it.
    if hasattr(tfidf, 'get_feature_names_out'):
        genre_terms = tfidf.get_feature_names_out()
    else:
        genre_terms = tfidf.get_feature_names()
    genre_df = pd.DataFrame(tfidf_matrix.toarray())
    genre_df.columns = ['genre' + "|" + term for term in genre_terms]
    genre_df.reset_index(drop = True, inplace=True)

    #explicity_ohe = ohe_prep(df, 'explicit','exp')    
    year_ohe = ohe_prep(df, 'year','year') * year_weight
    popularity_ohe = ohe_prep(df, 'popularity_red','pop') * popularity_weight

    #scale float columns
    floats = df[float_cols].reset_index(drop = True)
    scaler = MinMaxScaler()
    floats_scaled = pd.DataFrame(scaler.fit_transform(floats), columns = floats.columns) * float_weight

    #concatenate all features (every piece was re-indexed 0..n-1 so rows line up)
    final = pd.concat([genre_df, floats_scaled, popularity_ohe, year_ohe], axis = 1)
     
    #add song id
    final['id']=df['id'].values
    
    return final

In [142]:
# Build the full feature matrix (one row per track, song id included) for the whole dataset.
complete_feature_set = create_feature_set(spotify_df, float_cols=float_cols)#.mean(axis = 0)

In [143]:
#complete_feature_set.head()

In [144]:
#client id and secret for my application
# Credentials must not live in the notebook: read them from the environment instead.
# Any secret that was previously committed in this cell should be rotated in the Spotify dashboard.
client_id = os.environ['SPOTIPY_CLIENT_ID']
client_secret = os.environ['SPOTIPY_CLIENT_SECRET']
redirect = os.environ.get('SPOTIPY_REDIRECT_URI', 'https://example.com/')

In [145]:
scope = 'user-library-read'

# Inside a Jupyter kernel sys.argv holds the kernel launcher's own arguments, not a user name,
# and sys.exit() only interrupts the cell. Parse the command line only when run as a script.
_in_jupyter_kernel = 'ipykernel' in sys.modules
if _in_jupyter_kernel:
    username = None
elif len(sys.argv) > 1:
    username = sys.argv[1]
else:
    print("Usage: %s username" % (sys.argv[0],))
    sys.exit()

In [146]:
# Client built from the application's id/secret only (no user login).
# Note that `sp` is rebound a few cells below to a client created from the user token.
auth_manager = SpotifyClientCredentials(client_id=client_id, client_secret=client_secret) 
sp = spotipy.Spotify(auth_manager=auth_manager)

In [147]:
# Pass every argument by keyword: the first positional parameter of prompt_for_user_token is the
# username, so passing `scope` positionally requested a token with no scope at all.
token = util.prompt_for_user_token(username=username, scope=scope, client_id=client_id, client_secret=client_secret, redirect_uri=redirect)

In [148]:
# Replace the earlier client with one authenticated by the user token; the functions below use this global `sp`.
sp = spotipy.Spotify(auth=token)

In [149]:
#dir(sp)

In [150]:
#gather playlist names and images. 
#images aren't going to be used until I start building a UI
#id_name maps playlist name -> playlist id (third ':'-separated field of the playlist uri);
#two playlists with the same name overwrite each other, and only the items of the single
#response returned by current_user_playlists() are read.
id_name = {}
#list_photo = {}
for i in sp.current_user_playlists()['items']:

    id_name[i['name']] = i['uri'].split(':')[2]
    #list_photo[i['uri'].split(':')[2]] = i['images'][0]['url']

In [151]:
id_name

{'Project': '5V3wpdsnMAEPGI2ratve4Q',
 'Newwww': '6mjewLAIIHOGz8QSAtXwbU',
 'Bhajan': '1hLlCI0Kt6uL5296wzSzna',
 'My playlist #8': '5GtsMv1xan4jbyo1cg7Unb',
 'Playlist 2': '7HP8r4gOUjtnF2A2YpEtkU',
 'Old': '19cgcx7ldtwct7veCE89nV',
 'Now Playing': '3CtkdEEHysapfjVHhJd1E2',
 'My playlist #3': '3nnp1Pb65rGM2uY8gBy6C6',
 'My playlist #2': '4KMXkdxNrVkDpndtzPsCvp',
 'Playing Now': '1hoFv09w9KGOt6mHiEl4B9'}

In [152]:
def create_necessary_outputs(playlist_name,id_dic, df):
    """ 
    Pull songs from a specific playlist.

    Uses the module-level spotipy client `sp`. Every page of the playlist is read, entries
    without track data are skipped, and the result is restricted to tracks present in `df`.

    Parameters: 
        playlist_name (str): name of the playlist you'd like to pull from the spotify API
        id_dic (dic): dictionary that maps playlist_name to playlist_id
        df (pandas dataframe): spotify dataframe; only its 'id' column is used
        
    Returns: 
        playlist: all songs in the playlist THAT ARE AVAILABLE IN THE KAGGLE DATASET, with
        columns artist, name, id, url, date_added, sorted by date_added (newest first).
        The frame is empty (but has those columns) when nothing matches.

    Raises:
        KeyError: if playlist_name is not a key of id_dic
    """
    columns = ['artist', 'name', 'id', 'url', 'date_added']

    # The playlist endpoint returns tracks one page at a time; follow 'next' until exhausted.
    page = sp.playlist(id_dic[playlist_name])['tracks']
    items = list(page['items'])
    while page.get('next'):
        page = sp.next(page)
        if not page:
            break
        items.extend(page['items'])

    # Collect plain records and build the frame once instead of growing it row by row.
    records = []
    for item in items:
        track = item.get('track')
        if not track:
            # entries without track data cannot be described or matched by id
            continue
        images = (track.get('album') or {}).get('images') or []
        if len(images) > 1:
            url = images[1]['url']   # second image, as before
        elif images:
            url = images[0]['url']   # fall back to the only image available
        else:
            url = None
        records.append({
            'artist': track['artists'][0]['name'],
            'name': track['name'],
            'id': track['id'],
            'url': url,
            'date_added': item.get('added_at'),
        })

    playlist = pd.DataFrame(records, columns = columns)
    playlist['date_added'] = pd.to_datetime(playlist['date_added'])  
    
    playlist = playlist[playlist['id'].isin(df['id'].values)].sort_values('date_added',ascending = False)
    
    return playlist

In [153]:
#id_name

In [154]:
# Pull three of the user's playlists (looked up by name in id_name), each restricted to tracks present in spotify_df.
playlist_EDM = create_necessary_outputs('Newwww', id_name,spotify_df)
playlist_chill = create_necessary_outputs('Project',id_name, spotify_df)
playlist_classical = create_necessary_outputs('Playing Now',id_name, spotify_df)

In [155]:
'''from skimage import io
import matplotlib.pyplot as plt

def visualize_songs(df):
    """ 
    Visualize cover art of the songs in the inputted dataframe

    Parameters: 
        df (pandas dataframe): Playlist Dataframe
    """
    
    temp = df['url'].values
    plt.figure(figsize=(15,int(0.625 * len(temp))))
    columns = 5
    
    for i, url in enumerate(temp):
        plt.subplot(len(temp) / columns + 1, columns, i + 1)

        image = io.imread(url)
        plt.imshow(image)
        plt.xticks(color = 'w', fontsize = 0.1)
        plt.yticks(color = 'w', fontsize = 0.1)
        plt.xlabel(df['name'].values[i], fontsize = 12)
        plt.tight_layout(h_pad=0.4, w_pad=0)
        plt.subplots_adjust(wspace=None, hspace=None)

    plt.show()
 '''

'from skimage import io\nimport matplotlib.pyplot as plt\n\ndef visualize_songs(df):\n    """ \n    Visualize cover art of the songs in the inputted dataframe\n\n    Parameters: \n        df (pandas dataframe): Playlist Dataframe\n    """\n    \n    temp = df[\'url\'].values\n    plt.figure(figsize=(15,int(0.625 * len(temp))))\n    columns = 5\n    \n    for i, url in enumerate(temp):\n        plt.subplot(len(temp) / columns + 1, columns, i + 1)\n\n        image = io.imread(url)\n        plt.imshow(image)\n        plt.xticks(color = \'w\', fontsize = 0.1)\n        plt.yticks(color = \'w\', fontsize = 0.1)\n        plt.xlabel(df[\'name\'].values[i], fontsize = 12)\n        plt.tight_layout(h_pad=0.4, w_pad=0)\n        plt.subplots_adjust(wspace=None, hspace=None)\n\n    plt.show()\n '

In [156]:
#playlist_EDM

In [157]:
#visualize_songs(playlist_EDM)

In [158]:
#from IPython.display import Image
#Image("E:\spotify-recommendation-system-main (1)\spotify-recommendation-system-main/summarization_2.png")

In [159]:
def generate_playlist_feature(complete_feature_set, playlist_df, weight_factor):
    """ 
    Summarize a user's playlist into a single vector

    Each playlist track's feature row is multiplied by weight_factor ** (-months), where months
    is the whole number of 30-day periods between the track's date_added and the most recent
    date_added in the playlist; the weighted rows are then summed column by column.

    Parameters: 
        complete_feature_set (pandas dataframe): Dataframe which includes all of the features for the spotify songs, plus an 'id' column
        playlist_df (pandas dataframe): playlist dataframe with 'id' and 'date_added' columns
        weight_factor (float): float value that represents the recency bias. The larger the recency bias, the most priority recent songs get. Value should be close to 1. 
        
    Returns: 
        playlist_feature_set_weighted_final (pandas series): single feature that summarizes the playlist, indexed by feature column name
        complete_feature_set_nonplaylist (pandas dataframe): rows of complete_feature_set whose id is not in the playlist

    Raises:
        ValueError: if none of the playlist's tracks appear in complete_feature_set
    """
    in_playlist = complete_feature_set['id'].isin(playlist_df['id'].values)
    complete_feature_set_nonplaylist = complete_feature_set[~in_playlist]

    playlist_feature_set = (
        complete_feature_set[in_playlist]
        .merge(playlist_df[['id','date_added']], on = 'id', how = 'inner')
        .sort_values('date_added', ascending=False)
    )
    if playlist_feature_set.empty:
        raise ValueError("none of the playlist's tracks are present in complete_feature_set")

    # Select the feature columns by name rather than by position from the end of the frame.
    feature_cols = [col for col in complete_feature_set.columns if col != 'id']

    # Rows are sorted newest first, so the first date is the reference point.
    most_recent_date = playlist_feature_set['date_added'].iloc[0]
    days_from_recent = (most_recent_date - playlist_feature_set['date_added']).dt.days
    months_from_recent = (days_from_recent // 30).astype(float)

    # Float exponent on purpose: an integer base raised to a negative integer power is rejected.
    weights = weight_factor ** (-months_from_recent)

    playlist_feature_set_weighted_final = playlist_feature_set[feature_cols].mul(weights, axis = 0)

    return playlist_feature_set_weighted_final.sum(axis = 0), complete_feature_set_nonplaylist

In [160]:
# Summarize two playlists into one vector each (recency weight factor 1.09) and keep the feature rows of all other tracks.
complete_feature_set_playlist_vector_EDM, complete_feature_set_nonplaylist_EDM = generate_playlist_feature(complete_feature_set, playlist_EDM, 1.09)
complete_feature_set_playlist_vector_chill, complete_feature_set_nonplaylist_chill = generate_playlist_feature(complete_feature_set, playlist_chill, 1.09)

In [161]:
#complete_feature_set_playlist_vector_EDM.shape

In [162]:
#from IPython.display import Image
#Image("E:\spotify-recommendation-system-main (1)\spotify-recommendation-system-main/cosine_sim_2.png")

In [163]:
def generate_playlist_recos(df, features, nonplaylist_features, top_n=40):
    """ 
    Rank the songs that are not in the playlist by cosine similarity to the playlist vector.

    Uses the module-level spotipy client `sp` to look up cover art for the returned songs.

    Parameters: 
        df (pandas dataframe): spotify dataframe
        features (pandas series): summarized playlist feature, in the same column order as
            nonplaylist_features without its 'id' column
        nonplaylist_features (pandas dataframe): feature set of songs that are not in the selected playlist
        top_n (int): number of recommendations to return (default 40)
        
    Returns: 
        non_playlist_df_top_40: the top_n rows of df with the highest similarity, with added
        'sim' (cosine similarity) and 'url' (cover image) columns
    """
    similarities = cosine_similarity(
        nonplaylist_features.drop('id', axis = 1).values,
        features.values.reshape(1, -1),
    )[:,0]

    # Attach each score through the song id instead of relying on df and nonplaylist_features
    # happening to list the same songs in the same row order.
    sim_by_id = pd.Series(similarities, index = nonplaylist_features['id'].values)
    sim_by_id = sim_by_id[~sim_by_id.index.duplicated()]

    # Work on a copy so the new column is not written onto a filtered view of df.
    non_playlist_df = df[df['id'].isin(sim_by_id.index)].copy()
    non_playlist_df['sim'] = non_playlist_df['id'].map(sim_by_id)

    non_playlist_df_top_40 = non_playlist_df.sort_values('sim',ascending = False).head(top_n).copy()
    # One API lookup per recommended song; the second album image is used.
    non_playlist_df_top_40['url'] = non_playlist_df_top_40['id'].apply(lambda x: sp.track(x)['album']['images'][1]['url'])
    
    return non_playlist_df_top_40

In [164]:
# Recommendations for the playlist stored in playlist_EDM, drawn from tracks that are not already in it.
edm_top40 = generate_playlist_recos(spotify_df, complete_feature_set_playlist_vector_EDM, complete_feature_set_nonplaylist_EDM)

In [165]:
#from IPython.display import Image
#Image("/Users/thakm004/Documents/Spotify/spotify_results.png")

In [166]:
edm_top40

Unnamed: 0,acousticness,artists,danceability,duration_ms,energy,explicit,id,instrumentalness,key,liveness,loudness,mode,name,popularity,release_date,speechiness,tempo,valence,year,artists_upd_v1,artists_upd_v2,artists_upd,artists_song,consolidates_genre_lists,popularity_red,sim,url
54086,0.00477,['One Direction'],0.681,174080,0.864,0,34aYkYrY3sXhEU9O4VQgtB,0.0,3,0.122,-3.276,1,Fireproof,68,2014-11-17,0.0301,132.929,0.921,2014,[One Direction],[],[One Direction],One DirectionFireproof,"[post-teen_pop, talent_show, boy_band, pop, da...",13,0.82149,https://i.scdn.co/image/ab67616d00001e02d304ba...
54021,0.0014,['One Direction'],0.565,229040,0.862,0,6HFywc5eQYRRYHYTatCb5Y,0.0,4,0.105,-3.715,1,Where Do Broken Hearts Go,69,2014-11-17,0.0326,122.994,0.647,2014,[One Direction],[],[One Direction],One DirectionWhere Do Broken Hearts Go,"[post-teen_pop, talent_show, boy_band, pop, da...",13,0.817406,https://i.scdn.co/image/ab67616d00001e02d304ba...
54084,0.0424,['One Direction'],0.508,201853,0.895,0,5UgsZiYk1lkEobuPHmRtWm,0.0,9,0.198,-3.745,1,Girl Almighty,67,2014-11-17,0.0414,168.941,0.495,2014,[One Direction],[],[One Direction],One DirectionGirl Almighty,"[post-teen_pop, talent_show, boy_band, pop, da...",13,0.816697,https://i.scdn.co/image/ab67616d00001e02d304ba...
54102,0.00287,['One Direction'],0.495,198653,0.883,0,4bqIFmnVAsndYWuXxAwQet,2.3e-05,6,0.0978,-4.722,1,Act My Age,65,2014-11-17,0.0402,135.899,0.642,2014,[One Direction],[],[One Direction],One DirectionAct My Age,"[post-teen_pop, talent_show, boy_band, pop, da...",13,0.816455,https://i.scdn.co/image/ab67616d00001e02d304ba...
54053,0.0278,['One Direction'],0.558,199720,0.916,0,4JaLkM90MJutDAl5jD9BZX,0.0,1,0.0296,-3.145,0,No Control,72,2014-11-17,0.0839,142.984,0.85,2014,[One Direction],[],[One Direction],One DirectionNo Control,"[post-teen_pop, talent_show, boy_band, pop, da...",14,0.816225,https://i.scdn.co/image/ab67616d00001e02d304ba...
54104,0.22,['One Direction'],0.681,248360,0.582,0,3JjnGLK8IxkNLvo8Lb3KOM,0.0,6,0.119,-7.76,0,18,79,2014-11-17,0.0313,124.038,0.221,2014,[One Direction],[],[One Direction],One Direction18,"[post-teen_pop, talent_show, boy_band, pop, da...",15,0.815722,https://i.scdn.co/image/ab67616d00001e02d304ba...
54034,0.0734,['One Direction'],0.678,214720,0.933,0,6AzCBeiDuUXGXjznBufswB,0.0,2,0.0863,-4.959,1,Stockholm Syndrome,69,2014-11-17,0.14,120.572,0.336,2014,[One Direction],[],[One Direction],One DirectionStockholm Syndrome,"[post-teen_pop, talent_show, boy_band, pop, da...",13,0.81557,https://i.scdn.co/image/ab67616d00001e02d304ba...
54094,0.00154,['One Direction'],0.59,266600,0.944,0,2skzofTV9Ys3eN70pghEHL,0.0,10,0.322,-4.323,1,Change Your Ticket,62,2014-11-17,0.0499,103.991,0.596,2014,[One Direction],[],[One Direction],One DirectionChange Your Ticket,"[post-teen_pop, talent_show, boy_band, pop, da...",12,0.814423,https://i.scdn.co/image/ab67616d00001e02d304ba...
54085,0.23,['One Direction'],0.515,210893,0.773,0,0L5gFNrzHyh4OhMwEz6DGS,1e-06,6,0.0938,-4.848,1,Fool's Gold,69,2014-11-17,0.0479,128.001,0.379,2014,[One Direction],[],[One Direction],One DirectionFool's Gold,"[post-teen_pop, talent_show, boy_band, pop, da...",13,0.812698,https://i.scdn.co/image/ab67616d00001e02d304ba...
54045,0.00229,['One Direction'],0.524,196053,0.831,0,7GtGeeChOx4NS77bqK8SUx,0.0,2,0.171,-4.174,1,Ready to Run,67,2014-11-17,0.036,115.067,0.299,2014,[One Direction],[],[One Direction],One DirectionReady to Run,"[post-teen_pop, talent_show, boy_band, pop, da...",13,0.811379,https://i.scdn.co/image/ab67616d00001e02d304ba...


In [167]:
#visualize_songs(edm_top40)

In [168]:
#chill_top40 = generate_playlist_recos(spotify_df, complete_feature_set_playlist_vector_chill, complete_feature_set_nonplaylist_chill)