Import Everything

In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer

## Get our data and preprocess it

In [2]:
master = pd.read_csv('processed1000.csv')
master.drop(columns=["Unnamed: 0"], inplace=True)
master.head()

Unnamed: 0,artist_name,track_uri,artist_uri,track_name,album_uri,duration_ms_x,album_name,name,danceability,energy,...,tempo,type,id,track_href,analysis_url,duration_ms_y,time_signature,artistpop,trackpop,genres
0,Missy Elliott,spotify:track:0UaMYEvWZi0ZqiDOoHU3YI,spotify:artist:2wIVse2owClT7go1WT98tk,Lose Control (feat. Ciara & Fat Man Scoop),spotify:album:6vV5UrXcfyQD1wu4Qo2I9K,226863,The Cookbook,Throwbacks,0.904,0.813,...,125.461,audio_features,0UaMYEvWZi0ZqiDOoHU3YI,https://api.spotify.com/v1/tracks/0UaMYEvWZi0Z...,https://api.spotify.com/v1/audio-analysis/0UaM...,226864,4,74,68,dance_pop hip_hop hip_pop pop_rap r&b rap urba...
1,Britney Spears,spotify:track:6I9VzXrHxO9rA9A5euc8Ak,spotify:artist:26dSoYclwsYLMAKD3tpOr4,Toxic,spotify:album:0z7pVBGOD7HCIB7S8eLkLI,198800,In The Zone,Throwbacks,0.774,0.838,...,143.04,audio_features,6I9VzXrHxO9rA9A5euc8Ak,https://api.spotify.com/v1/tracks/6I9VzXrHxO9r...,https://api.spotify.com/v1/audio-analysis/6I9V...,198800,4,81,82,dance_pop pop
2,Beyoncé,spotify:track:0WqIKmW4BTrj3eJFmnCKMv,spotify:artist:6vWDO969PvNqNYHIOW5v0m,Crazy In Love,spotify:album:25hVFAxTlDvXbx2X2QkUkE,235933,Dangerously In Love (Alben für die Ewigkeit),Throwbacks,0.664,0.758,...,99.259,audio_features,0WqIKmW4BTrj3eJFmnCKMv,https://api.spotify.com/v1/tracks/0WqIKmW4BTrj...,https://api.spotify.com/v1/audio-analysis/0WqI...,235933,4,87,20,dance_pop pop r&b
3,Justin Timberlake,spotify:track:1AWQoqb9bSvzTjaLralEkT,spotify:artist:31TPClRtHm23RisEBtV3X7,Rock Your Body,spotify:album:6QPkyl04rXwTGlGlcYaRoW,267266,Justified,Throwbacks,0.892,0.714,...,100.972,audio_features,1AWQoqb9bSvzTjaLralEkT,https://api.spotify.com/v1/tracks/1AWQoqb9bSvz...,https://api.spotify.com/v1/audio-analysis/1AWQ...,267267,4,81,77,dance_pop pop
4,Shaggy,spotify:track:1lzr43nnXAijIGYnCT8M8H,spotify:artist:5EvFsr3kj42KNv97ZEnqij,It Wasn't Me,spotify:album:6NmFmPX56pcLBOFMhIiKvF,227600,Hot Shot,Throwbacks,0.853,0.606,...,94.759,audio_features,1lzr43nnXAijIGYnCT8M8H,https://api.spotify.com/v1/tracks/1lzr43nnXAij...,https://api.spotify.com/v1/audio-analysis/1lzr...,227600,4,74,0,dance_pop pop_rap reggae_fusion


In [4]:
def preprocessing(df):
   # dropping duplicate songs (check for artist too, the name of a song might be shared among different artists)
   df['artist+song'] = master.apply(lambda row: row['artist_name']+row['track_name'],axis = 1)
   df = df.drop_duplicates('artist+song')

   # getting only the important columns
   df = df[['artist_name','id','track_name','danceability', 'energy', 'key', 'loudness', 'mode',
      'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', "artistpop", "genres", "trackpop"]]

   # splitting the genres into a list so it becomes easier for TF-IDF later
   df['genre_list'] = df['genres'].apply(lambda x: x.split(' '))
   return df

In [5]:
songs = preprocessing(master)

## Feature Engineering

#### TF-IDF

Term Frequency - Inverse Document Frequency is a technique used in NLP to generate features based on words that appear in each document (songs - more particularly, the genre list in our case), taking into account how common those words are. 

TF-IDF is superior to using something like Bag of Words because it more heavily weights unique words, recongizing that the rarer a word is, the more likely it is that the emphasis of a particular sentence (or song) was put on to that word.

In [6]:
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(songs['genre_list'].apply(lambda x: " ".join(x)))
genre_df = pd.DataFrame(tfidf_matrix.toarray())
genre_df.columns = ['genre' + '|' + i for i in tfidf.get_feature_names_out()]
genre_df.drop(columns='genre|unknown')
genre_df.reset_index(drop = True, inplace=True)

In [7]:
songs['artistpop'].describe()

count    970.000000
mean      62.664948
std       17.449680
min        0.000000
25%       53.000000
50%       65.000000
75%       74.000000
max       98.000000
Name: artistpop, dtype: float64

#### Normalization

We want to normalize all of the features that aren't scaled between 0 and 1, such as our popularity and audio features columns. Normalization ensures that our recommendation system works properly.

In [8]:
pop = songs[["artistpop", 'trackpop']]#.reset_index(drop = True)
scaler = MinMaxScaler()
pop_scaled = pd.DataFrame(scaler.fit_transform(pop), columns = pop.columns)
pop_scaled.head()

Unnamed: 0,artistpop,trackpop
0,0.755102,0.809524
1,0.826531,0.97619
2,0.887755,0.238095
3,0.826531,0.916667
4,0.755102,0.0


In [61]:
def engineering(df):
    # tfidf
    tfidf = TfidfVectorizer()
    tfidf_matrix = tfidf.fit_transform(df['genre_list'].apply(lambda x: " ".join(x)))
    genre_df = pd.DataFrame(tfidf_matrix.toarray())
    genre_df.columns = ['genre' + '|' + i for i in tfidf.get_feature_names_out()]
    genre_df.drop(columns='genre|unknown')
    genre_df.reset_index(drop = True, inplace=True)

    # print(genre_df.shape)#['id'].isna().sum())

    # normalize variables
    tobenormed = ['artistpop', 'trackpop', 'loudness', 'danceability', 'energy', 'loudness', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo']
    pop = songs[tobenormed]#.reset_index(drop = True)
    scaler = MinMaxScaler()
    pop_scaled = pd.DataFrame(scaler.fit_transform(pop), columns = pop.columns)
    # print(pop_scaled.shape)

    final = pd.concat([genre_df, pop_scaled], axis=1)
    # print(df['id'].isna().sum())
    
    final['id'] = df['id'].copy()
    # print(final['id'].isna().sum())

    return final#, pd.concat([df['id'], final['id']], axis=1)


In [66]:
final = engineering(songs)
# final['id'].isna().sum()
# print(songs.shape, final.shape)
# wtf.to_csv('what.csv')
final.dropna(inplace=True)

In [67]:
final.to_csv('final.csv')

## Recommending New Playlists

In [68]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import cred

In [69]:
# Authenticating (without user credentials)
cid = cred.client_ID
secret = cred.client_SECRET

client_credentials_manager = SpotifyClientCredentials(client_id=cid, client_secret=secret)
sp = spotipy.Spotify(client_credentials_manager = client_credentials_manager)

In [71]:
def retrieve_uris(playlist_link):
    playlist_URI = playlist_link.split('/')[-1].split('?')[0] # extracts the URI from the link
    
    track_uris = []
    for track in sp.playlist_tracks(playlist_URI)['items']:
        track_uris.append(track['track']['uri'].split(':')[-1])

    return track_uris

In [72]:
a = ['a','b','c']
b = ['a','d']
[item in b for item in a]

[True, False, False]

In [73]:
def generate_playlist_vector(spotify_feature_df, playlist_uris):
    feature_set_playlist = spotify_feature_df[spotify_feature_df['id'].isin(playlist_uris)]
    feature_set_notplaylist = spotify_feature_df[~spotify_feature_df['id'].isin(playlist_uris)]
    final_feature_set_playlist = feature_set_playlist.drop(columns='id')
    
    return final_feature_set_playlist, feature_set_notplaylist

In [74]:
playlist_link = "https://open.spotify.com/playlist/37i9dQZF1E8B5tVAgJDi7C?si=b408aec431ce4e88"
uris = retrieve_uris(playlist_link)
play, notplay = generate_playlist_vector(final, uris)
notplay

Unnamed: 0,genre|_hip_hop,genre|abstract_hip_hop,genre|acoustic_pop,genre|adult_standards,genre|aesthetic_rap,genre|afrofuturism,genre|alabama_indie,genre|album_rock,genre|albuquerque_indie,genre|alt_z,...,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,id
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.770719,0.847831,0.900132,0.168210,0.025167,0.025227,0.232321,0.945969,0.594755,6I9VzXrHxO9rA9A5euc8Ak
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.647690,0.765577,0.807740,0.346054,0.002397,0.000000,0.038862,0.709715,0.333138,0WqIKmW4BTrj3eJFmnCKMv
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.902695,0.720337,0.826018,0.218229,0.203228,0.000236,0.030686,0.832609,0.343374,1AWQoqb9bSvzTjaLralEkT
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.859076,0.609295,0.876523,0.089107,0.056715,0.000000,0.307709,0.659922,0.306248,1lzr43nnXAijIGYnCT8M8H
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.890393,0.796422,0.873996,0.268247,0.021426,0.000000,0.015396,0.594237,0.367426,0XUfyU2QviPAs6bxSpXYG4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
965,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.375909,0.798478,0.820064,0.056688,0.002063,0.000312,0.132512,0.262634,0.708195,2xlgLLcGDnvaQXyyjGhRdG
966,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.512359,0.673041,0.779597,0.007410,0.000722,0.000080,0.317265,0.420489,0.295791,36PQh1G6h7n9VWB799fXpI
967,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.787496,0.556858,0.831383,0.032975,0.026280,0.000136,0.203653,0.722428,0.297105,5VarZFY00sO54HErJ3TPCI
968,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.693547,0.788197,0.818679,0.106150,0.461067,0.001534,0.097473,0.451213,0.588707,0gbBzIqrECJOEPvQJIBFs5


In [76]:
final['id'].isin(['0UaMYEvWZi0ZqiDOoHU3YI']).sum()
# final.loc[final['id'].isna()]

1

In [77]:
def generate_recommendations(playlist_vector, notplay_df, number):
    temp = notplay_df.copy()

    X = notplay_df.drop(columns='id').values
    Y = playlist_vector.values#.reshape(1,-1)
    print(X.shape, Y.shape)
    
    temp['similarity'] = cosine_similarity(X, Y)[:,0]
    finalrecs = temp.sort_values('similarity', ascending=False).head(number)

    return finalrecs

In [78]:
play.values.shape, notplay.shape

((3, 527), (873, 528))

In [79]:
generate_recommendations(play, notplay, 5)

(873, 527) (3, 527)


Unnamed: 0,genre|_hip_hop,genre|abstract_hip_hop,genre|acoustic_pop,genre|adult_standards,genre|aesthetic_rap,genre|afrofuturism,genre|alabama_indie,genre|album_rock,genre|albuquerque_indie,genre|alt_z,...,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,id,similarity
946,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.898211,0.906639,0.050204,0.057928,3e-06,0.107029,0.985168,0.541046,0c5QKViNmTWyXJODcruNsr,0.909892
944,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.834464,0.848622,0.042979,0.051154,1e-06,0.093226,0.973514,0.571157,6VVd4kRfzBsZqFbvEAjloh,0.906906
14,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.453013,0.869392,0.011486,0.355909,0.0,0.044914,0.749974,0.337506,4Y45aqo9QMa57rDsAJv40A,0.904629
19,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.69052,0.875035,0.06206,0.005481,0.001584,0.032385,0.690645,0.271817,4omisSlTk6Dsq2iQD7MA07,0.895939
15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.737816,0.847099,0.205261,0.191094,0.0,0.112338,0.836847,0.523424,1HwpWwa6bnqqRhK8agG4RS,0.894122


In [80]:
sp.track('2PpruBYCo4H7WOBJ7Q2EwM')['artists']

[{'external_urls': {'spotify': 'https://open.spotify.com/artist/1G9G7WwrXka3Z1r7aIDjI7'},
  'href': 'https://api.spotify.com/v1/artists/1G9G7WwrXka3Z1r7aIDjI7',
  'id': '1G9G7WwrXka3Z1r7aIDjI7',
  'name': 'Outkast',
  'type': 'artist',
  'uri': 'spotify:artist:1G9G7WwrXka3Z1r7aIDjI7'}]

In [81]:
f = generate_recommendations(play, notplay, 10)
for num, uri in enumerate(f['id'].values):

    arts = [artist['name'] for artist in sp.track(uri)['artists']]
    print(f'{num+1}. ', end='')
    print(*arts, sep=', ', end=' - ')

    print(sp.track(uri)['name'])

# arts = [i['name'] for i in sp.track(f['id'].values[0])['artists']]
# print(arts)

(873, 527) (3, 527)
1. The Twilight Sad - There's A Girl In The Corner
2. Night Riots - All For You
3. Jesse McCartney - Right Where You Want Me - Radio Edit Version
4. Avril Lavigne - Sk8er Boi
5. Jesse McCartney - Beautiful Soul
6. Cupid - Cupid Shuffle
7. Michael Kiwanuka - I'll Get Along
8. Pitbull, Ne-Yo, AFROJACK, Nayer - Give Me Everything (feat. Ne-Yo, Afrojack & Nayer)
9. Britney Spears - Toxic
10. Lil Mama - Lip Gloss
