Import Everything

In [1]:
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_extraction.text import TfidfVectorizer

## Get our data and preprocess it

In [4]:
master = pd.read_csv('processed100.csv')
master.drop(columns=["Unnamed: 0"], inplace=True)
master.head()

Unnamed: 0,artist_name,track_uri,artist_uri,track_name,album_uri,duration_ms_x,album_name,name,danceability,energy,...,tempo,type,id,track_href,analysis_url,duration_ms_y,time_signature,artistpop,trackpop,genres
0,Missy Elliott,spotify:track:0UaMYEvWZi0ZqiDOoHU3YI,spotify:artist:2wIVse2owClT7go1WT98tk,Lose Control (feat. Ciara & Fat Man Scoop),spotify:album:6vV5UrXcfyQD1wu4Qo2I9K,226863,The Cookbook,Throwbacks,0.904,0.813,...,125.461,audio_features,0UaMYEvWZi0ZqiDOoHU3YI,https://api.spotify.com/v1/tracks/0UaMYEvWZi0Z...,https://api.spotify.com/v1/audio-analysis/0UaM...,226864,4,69,66,dance_pop hip_hop hip_pop pop_rap r&b rap urba...
1,Britney Spears,spotify:track:6I9VzXrHxO9rA9A5euc8Ak,spotify:artist:26dSoYclwsYLMAKD3tpOr4,Toxic,spotify:album:0z7pVBGOD7HCIB7S8eLkLI,198800,In The Zone,Throwbacks,0.774,0.838,...,143.04,audio_features,6I9VzXrHxO9rA9A5euc8Ak,https://api.spotify.com/v1/tracks/6I9VzXrHxO9r...,https://api.spotify.com/v1/audio-analysis/6I9V...,198800,4,81,81,dance_pop pop post-teen_pop
2,Beyoncé,spotify:track:0WqIKmW4BTrj3eJFmnCKMv,spotify:artist:6vWDO969PvNqNYHIOW5v0m,Crazy In Love,spotify:album:25hVFAxTlDvXbx2X2QkUkE,235933,Dangerously In Love (Alben für die Ewigkeit),Throwbacks,0.664,0.758,...,99.259,audio_features,0WqIKmW4BTrj3eJFmnCKMv,https://api.spotify.com/v1/tracks/0WqIKmW4BTrj...,https://api.spotify.com/v1/audio-analysis/0WqI...,235933,4,86,23,dance_pop pop r&b
3,Justin Timberlake,spotify:track:1AWQoqb9bSvzTjaLralEkT,spotify:artist:31TPClRtHm23RisEBtV3X7,Rock Your Body,spotify:album:6QPkyl04rXwTGlGlcYaRoW,267266,Justified,Throwbacks,0.892,0.714,...,100.972,audio_features,1AWQoqb9bSvzTjaLralEkT,https://api.spotify.com/v1/tracks/1AWQoqb9bSvz...,https://api.spotify.com/v1/audio-analysis/1AWQ...,267267,4,79,77,dance_pop pop
4,Shaggy,spotify:track:1lzr43nnXAijIGYnCT8M8H,spotify:artist:5EvFsr3kj42KNv97ZEnqij,It Wasn't Me,spotify:album:6NmFmPX56pcLBOFMhIiKvF,227600,Hot Shot,Throwbacks,0.853,0.606,...,94.759,audio_features,1lzr43nnXAijIGYnCT8M8H,https://api.spotify.com/v1/tracks/1lzr43nnXAij...,https://api.spotify.com/v1/audio-analysis/1lzr...,227600,4,71,0,dance_pop pop_rap reggae_fusion


In [5]:
def preprocessing(df):
   # dropping duplicate songs (check for artist too, the name of a song might be shared among different artists)
   df['artist+song'] = master.apply(lambda row: row['artist_name']+row['track_name'],axis = 1)
   df = df.drop_duplicates('artist+song')

   # getting only the important columns
   df = df[['artist_name','id','track_name','danceability', 'energy', 'key', 'loudness', 'mode',
      'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo', "artistpop", "genres", "trackpop"]]

   # splitting the genres into a list so it becomes easier for TF-IDF later
   df['genre_list'] = df['genres'].apply(lambda x: x.split(' '))
   return df

In [6]:
songs = preprocessing(master)

## Feature Engineering

#### TF-IDF

Term Frequency - Inverse Document Frequency is a technique used in NLP to generate features based on words that appear in each document (songs - more particularly, the genre list in our case), taking into account how common those words are. 

TF-IDF is superior to using something like Bag of Words because it more heavily weights unique words, recongizing that the rarer a word is, the more likely it is that the emphasis of a particular sentence (or song) was put on to that word.

In [7]:
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(songs['genre_list'].apply(lambda x: " ".join(x)))
genre_df = pd.DataFrame(tfidf_matrix.toarray())
genre_df.columns = ['genre' + '|' + i for i in tfidf.get_feature_names_out()]
genre_df.drop(columns='genre|unknown')
genre_df.reset_index(drop = True, inplace=True)

In [8]:
songs['artistpop'].describe()

count    99.000000
mean     66.363636
std      14.394907
min      29.000000
25%      60.000000
50%      66.000000
75%      77.000000
max      91.000000
Name: artistpop, dtype: float64

#### Normalization

We want to normalize all of the features that aren't scaled between 0 and 1, such as our popularity and audio features columns. Normalization ensures that our recommendation system works properly.

In [9]:
pop = songs[["artistpop", 'trackpop']]#.reset_index(drop = True)
scaler = MinMaxScaler()
pop_scaled = pd.DataFrame(scaler.fit_transform(pop), columns = pop.columns)
pop_scaled.head()

Unnamed: 0,artistpop,trackpop
0,0.645161,0.795181
1,0.83871,0.975904
2,0.919355,0.277108
3,0.806452,0.927711
4,0.677419,0.0


In [10]:
def engineering(df):
    # tfidf
    tfidf = TfidfVectorizer()
    tfidf_matrix = tfidf.fit_transform(df['genre_list'].apply(lambda x: " ".join(x)))
    genre_df = pd.DataFrame(tfidf_matrix.toarray())
    genre_df.columns = ['genre' + '|' + i for i in tfidf.get_feature_names_out()]
    genre_df.drop(columns='genre|unknown')
    genre_df.reset_index(drop = True, inplace=True)

    # normalize variables
    tobenormed = ['artistpop', 'trackpop', 'loudness', 'danceability', 'energy', 'loudness', 'speechiness', 'acousticness', 'instrumentalness', 'liveness', 'valence', 'tempo']
    pop = songs[tobenormed]#.reset_index(drop = True)
    scaler = MinMaxScaler()
    pop_scaled = pd.DataFrame(scaler.fit_transform(pop), columns = pop.columns)

    final = pd.concat([genre_df, pop_scaled], axis=1)
    final['id'] = df['id']

    return final


In [11]:
final = engineering(songs)
final

Unnamed: 0,genre|album_rock,genre|alternative_rock,genre|art_rock,genre|atl_hip_hop,genre|blues_rock,genre|boy_band,genre|british_invasion,genre|british_soul,genre|british_soundtrack,genre|broadway,...,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,id
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.960145,0.788586,0.549692,0.200841,0.034816,0.007033,0.029200,0.813702,0.433860,0UaMYEvWZi0ZqiDOoHU3YI
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.807760,0.821012,0.786185,0.186120,0.027873,0.025227,0.309551,0.950721,0.550401,6I9VzXrHxO9rA9A5euc8Ak
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.678818,0.717250,0.588379,0.388013,0.002654,0.000000,0.047468,0.682692,0.260152,0WqIKmW4BTrj3eJFmnCKMv
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.946079,0.660182,0.627511,0.242902,0.225076,0.000236,0.036392,0.822115,0.271508,1AWQoqb9bSvzTjaLralEkT
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.900363,0.520104,0.735641,0.096320,0.062812,0.000000,0.411680,0.626202,0.230318,1lzr43nnXAijIGYnCT8M8H
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.532294,0.831388,0.839695,0.020820,0.007044,0.000002,0.148446,0.393029,0.311418,4erhEGuOGQgjv3p1bccnpn
95,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.755011,0.898833,1.000000,0.028601,0.352737,0.000000,0.115362,0.461538,0.443910,4hRA2rCPaCOpoEIq5qXaBz
96,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.614348,0.053178,0.450085,0.025447,0.877938,0.000000,0.080121,0.286058,0.422669,1enx9LPZrXxaVVBxas5rRm
97,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.794866,0.587549,0.658045,0.255521,0.028881,0.000000,0.093067,0.307692,0.132724,0WNGsQ1oAuHzNTk8jivBKW


In [12]:
final.to_csv('final.csv')

## Recommending New Playlists

In [13]:
import spotipy
from spotipy.oauth2 import SpotifyClientCredentials
import cred

In [14]:
# Authenticating (without user credentials)
cid = cred.client_ID
secret = cred.client_SECRET

client_credentials_manager = SpotifyClientCredentials(client_id=cid, client_secret=secret)
sp = spotipy.Spotify(client_credentials_manager = client_credentials_manager)

In [15]:
def retrieve_uris(playlist_link):
    playlist_URI = playlist_link.split('/')[-1].split('?')[0] # extracts the URI from the link
    
    track_uris = []
    for track in sp.playlist_tracks(playlist_URI)['items']:
        track_uris.append(track['track']['uri'].split(':')[-1])

    return track_uris

In [16]:
a = ['a','b','c']
b = ['a','d']
[item in b for item in a]

[True, False, False]

In [17]:
def generate_playlist_vector(spotify_feature_df, playlist_uris):
    feature_set_playlist = spotify_feature_df[spotify_feature_df['id'].isin(playlist_uris)]
    feature_set_notplaylist = spotify_feature_df[~spotify_feature_df['id'].isin(playlist_uris)]
    final_feature_set_playlist = feature_set_playlist.drop(columns='id')
    
    return final_feature_set_playlist, feature_set_notplaylist

In [28]:
playlist_link = "https://open.spotify.com/playlist/37i9dQZF1E8B5tVAgJDi7C?si=b408aec431ce4e88"
uris = retrieve_uris(playlist_link)
play, notplay = generate_playlist_vector(final, uris)
notplay

Unnamed: 0,genre|album_rock,genre|alternative_rock,genre|art_rock,genre|atl_hip_hop,genre|blues_rock,genre|boy_band,genre|british_invasion,genre|british_soul,genre|british_soundtrack,genre|broadway,...,danceability,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,id
1,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.807760,0.821012,0.786185,0.186120,0.027873,0.025227,0.309551,0.950721,0.550401,6I9VzXrHxO9rA9A5euc8Ak
2,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.678818,0.717250,0.588379,0.388013,0.002654,0.000000,0.047468,0.682692,0.260152,0WqIKmW4BTrj3eJFmnCKMv
3,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.946079,0.660182,0.627511,0.242902,0.225076,0.000236,0.036392,0.822115,0.271508,1AWQoqb9bSvzTjaLralEkT
4,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.900363,0.520104,0.735641,0.096320,0.062812,0.000000,0.411680,0.626202,0.230318,1lzr43nnXAijIGYnCT8M8H
5,0.0,0.0,0.0,0.484257,0.0,0.0,0.0,0.0,0.0,0.0,...,0.933185,0.756161,0.730230,0.299685,0.023730,0.000000,0.015679,0.551683,0.298192,0XUfyU2QviPAs6bxSpXYG4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
94,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.532294,0.831388,0.839695,0.020820,0.007044,0.000002,0.148446,0.393029,0.311418,4erhEGuOGQgjv3p1bccnpn
95,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.755011,0.898833,1.000000,0.028601,0.352737,0.000000,0.115362,0.461538,0.443910,4hRA2rCPaCOpoEIq5qXaBz
96,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.614348,0.053178,0.450085,0.025447,0.877938,0.000000,0.080121,0.286058,0.422669,1enx9LPZrXxaVVBxas5rRm
97,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,...,0.794866,0.587549,0.658045,0.255521,0.028881,0.000000,0.093067,0.307692,0.132724,0WNGsQ1oAuHzNTk8jivBKW


In [19]:
final['id'].isin(['0UaMYEvWZi0ZqiDOoHU3YI']).sum()

1

In [20]:
def generate_recommendations(playlist_vector, notplay_df, number):
    temp = notplay_df.copy()

    X = notplay_df.drop(columns='id').values
    Y = playlist_vector.values#.reshape(1,-1)
    print(X.shape, Y.shape)
    
    temp['similarity'] = cosine_similarity(X, Y)[:,0]
    finalrecs = temp.sort_values('similarity', ascending=False).head(number)

    return finalrecs

In [26]:
play.values.shape, notplay.shape

((4, 110), (95, 111))

In [60]:
generate_recommendations(play, notplay, 5)

(97, 110) (2, 110)


Unnamed: 0,genre|album_rock,genre|alternative_rock,genre|art_rock,genre|atl_hip_hop,genre|blues_rock,genre|boy_band,genre|british_invasion,genre|british_soul,genre|british_soundtrack,genre|broadway,...,energy,loudness,speechiness,acousticness,instrumentalness,liveness,valence,tempo,id,similarity
9,0.0,0.0,0.0,0.359279,0.0,0.0,0.0,0.0,0.0,0.0,...,0.997406,0.908693,0.086015,0.115332,0.000537,0.211738,1.0,0.12933,2PpruBYCo4H7WOBJ7Q2EwM,0.865327
8,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.613489,0.815015,0.160883,0.305704,0.0,0.175777,0.722356,0.517048,7H6ev70Weq6DdpZyyTmUXk,0.852042
14,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.322957,0.720374,0.008202,0.39417,0.0,0.055667,0.728365,0.264998,7k6IzwMGpxnRghE7YosnXT,0.848657
10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.992218,0.624324,0.052787,0.063708,6.2e-05,0.182969,0.883413,0.360053,2gam98EZKrF9XuOkU13ApN,0.84159
15,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.682231,0.672645,0.228181,0.211638,0.0,0.147008,0.826923,0.471264,1Bv0Yl01xBDZD4OQP93fyl,0.83542


In [66]:
sp.track('2PpruBYCo4H7WOBJ7Q2EwM')['artists']

[{'external_urls': {'spotify': 'https://open.spotify.com/artist/1G9G7WwrXka3Z1r7aIDjI7'},
  'href': 'https://api.spotify.com/v1/artists/1G9G7WwrXka3Z1r7aIDjI7',
  'id': '1G9G7WwrXka3Z1r7aIDjI7',
  'name': 'Outkast',
  'type': 'artist',
  'uri': 'spotify:artist:1G9G7WwrXka3Z1r7aIDjI7'}]

In [67]:
f = generate_recommendations(play, notplay, 10)
for num, uri in enumerate(f['id'].values):

    arts = [artist['name'] for artist in sp.track(uri)['artists']]
    print(f'{num+1}. ', end='')
    print(*arts, sep=', ', end=' - ')

    print(sp.track(uri)['name'])

# arts = [i['name'] for i in sp.track(f['id'].values[0])['artists']]
# print(arts)

(97, 110) (2, 110)
1. Outkast - Hey Ya!
2. Destiny's Child - Say My Name
3. Cassie - Me & U
4. Nelly Furtado, Timbaland - Promiscuous
5. Omarion - Ice Box
6. Justin Timberlake - Rock Your Body
7. Britney Spears - Toxic
8. Ne-Yo - Miss Independent
9. Jason Derulo - Whatcha Say
10. Lil Mama - Lip Gloss
