<a href="https://colab.research.google.com/github/mserkantan/spotify-playlist-recommendation/blob/main/bbm406_project.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Setup & Importing Libraries

In [344]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
from matplotlib import style
import seaborn as sns

import os
from tqdm import tqdm 
import json

# configure
%matplotlib inline  
style.use('fivethirtyeight')
sns.set(style='whitegrid',color_codes=True)

# sklearn libraries
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!cp "/content/drive/MyDrive/data.zip" .
!unzip -q data.zip
!rm data.zip

### Data Preprocessing

In [9]:
def get_playlists_df(number_of_files):
    
    start = 0
    end = 1000
    list_of_df = []
    
    for i in range(number_of_files):
        
        path = 'data/mpd.slice.' + str(start) + "-" + str(end-1) + '.json'
        json_file = json.load(open(path, 'r'))
        
        playlists_df = pd.DataFrame.from_dict(json_file['playlists'], orient='columns')
        list_of_df.append(playlists_df)
        
        start = end
        end = end + 1000
    
    concat_playlists_df = pd.concat(list_of_df).reset_index(drop=True)
        
        
    return concat_playlists_df

In [10]:
def get_all_songs_df(playlists_df):
    all_songs_array = []
    for index, row in playlists_df.iterrows():
        for track in row['tracks']:
            all_songs_array.append([track['track_uri'], 
                                      track['track_name'], 
                                      track['artist_uri'], 
                                      track['artist_name'],  
                                      track['album_uri'], 
                                      track['album_name'],
                                      row['pid']])

    all_songs_df = pd.DataFrame(all_songs_array, columns=['track_uri', 
                                                          'track_name', 
                                                          'artist_uri', 
                                                          'artist_name', 
                                                          'album_uri', 
                                                          'album_name', 
                                                          'pid'])
    return all_songs_df

In [11]:
# get playlists
playlists_df = get_playlists_df(number_of_files=10)
playlists_df

Unnamed: 0,name,collaborative,pid,modified_at,num_tracks,num_albums,num_followers,tracks,num_edits,duration_ms,num_artists,description
0,Throwbacks,false,0,1493424000,52,47,1,"[{'pos': 0, 'artist_name': 'Missy Elliott', 't...",6,11532414,37,
1,Awesome Playlist,false,1,1506556800,39,23,1,"[{'pos': 0, 'artist_name': 'Survivor', 'track_...",5,11656470,21,
2,korean,false,2,1505692800,64,51,1,"[{'pos': 0, 'artist_name': 'Hoody', 'track_uri...",18,14039958,31,
3,mat,false,3,1501027200,126,107,1,"[{'pos': 0, 'artist_name': 'Camille Saint-Saën...",4,28926058,86,
4,90s,false,4,1401667200,17,16,2,"[{'pos': 0, 'artist_name': 'The Smashing Pumpk...",7,4335282,16,
...,...,...,...,...,...,...,...,...,...,...,...,...
9995,rap,false,9995,1491782400,34,26,1,"[{'pos': 0, 'artist_name': 'Lecrae', 'track_ur...",16,8530582,15,
9996,Blues,false,9996,1482364800,57,48,1,"[{'pos': 0, 'artist_name': 'Robert Johnson', '...",15,13010049,30,
9997,game songs,false,9997,1508371200,27,24,4,"[{'pos': 0, 'artist_name': 'NateWantsToBattle'...",23,5104068,7,
9998,country,false,9998,1466208000,12,12,1,"[{'pos': 0, 'artist_name': 'Little Big Town', ...",3,2459585,12,


In [12]:
# apply the threshold to playlists 
threshold_num_tracks = 50
playlists_df = playlists_df[playlists_df.num_tracks > threshold_num_tracks]
playlists_df

Unnamed: 0,name,collaborative,pid,modified_at,num_tracks,num_albums,num_followers,tracks,num_edits,duration_ms,num_artists,description
0,Throwbacks,false,0,1493424000,52,47,1,"[{'pos': 0, 'artist_name': 'Missy Elliott', 't...",6,11532414,37,
2,korean,false,2,1505692800,64,51,1,"[{'pos': 0, 'artist_name': 'Hoody', 'track_uri...",18,14039958,31,
3,mat,false,3,1501027200,126,107,1,"[{'pos': 0, 'artist_name': 'Camille Saint-Saën...",4,28926058,86,
5,Wedding,false,5,1430956800,80,71,1,"[{'pos': 0, 'artist_name': 'Cali Swag District...",3,19156557,56,
7,2017,false,7,1509321600,53,52,1,"[{'pos': 0, 'artist_name': 'Fink', 'track_uri'...",38,12674796,48,
...,...,...,...,...,...,...,...,...,...,...,...,...
9992,Likes,false,9992,1509062400,74,69,3,"[{'pos': 0, 'artist_name': 'High Highs', 'trac...",67,17521523,65,
9993,October,false,9993,1478563200,58,56,2,"[{'pos': 0, 'artist_name': 'Two Door Cinema Cl...",10,12661986,55,
9994,alone,false,9994,1387065600,56,37,1,"[{'pos': 0, 'artist_name': 'Britt Nicole', 'tr...",11,13389556,21,
9996,Blues,false,9996,1482364800,57,48,1,"[{'pos': 0, 'artist_name': 'Robert Johnson', '...",15,13010049,30,


In [13]:
# get all songs
all_songs_df = get_all_songs_df(playlists_df)
all_songs_df

Unnamed: 0,track_uri,track_name,artist_uri,artist_name,album_uri,album_name,pid
0,spotify:track:0UaMYEvWZi0ZqiDOoHU3YI,Lose Control (feat. Ciara & Fat Man Scoop),spotify:artist:2wIVse2owClT7go1WT98tk,Missy Elliott,spotify:album:6vV5UrXcfyQD1wu4Qo2I9K,The Cookbook,0
1,spotify:track:6I9VzXrHxO9rA9A5euc8Ak,Toxic,spotify:artist:26dSoYclwsYLMAKD3tpOr4,Britney Spears,spotify:album:0z7pVBGOD7HCIB7S8eLkLI,In The Zone,0
2,spotify:track:0WqIKmW4BTrj3eJFmnCKMv,Crazy In Love,spotify:artist:6vWDO969PvNqNYHIOW5v0m,Beyoncé,spotify:album:25hVFAxTlDvXbx2X2QkUkE,Dangerously In Love (Alben für die Ewigkeit),0
3,spotify:track:1AWQoqb9bSvzTjaLralEkT,Rock Your Body,spotify:artist:31TPClRtHm23RisEBtV3X7,Justin Timberlake,spotify:album:6QPkyl04rXwTGlGlcYaRoW,Justified,0
4,spotify:track:1lzr43nnXAijIGYnCT8M8H,It Wasn't Me,spotify:artist:5EvFsr3kj42KNv97ZEnqij,Shaggy,spotify:album:6NmFmPX56pcLBOFMhIiKvF,Hot Shot,0
...,...,...,...,...,...,...,...
526060,spotify:track:6ZOPiKQeibCn7fP8dncucL,Blue Ain't Your Color,spotify:artist:0u2FHSq3ln94y5Q57xazwf,Keith Urban,spotify:album:1r7ABqzNXQnUPAH3ZjrHMn,Ripcord,9999
526061,spotify:track:7pxhKtuTwofDIdgHx2DcVK,Seein' Red,spotify:artist:1dID9zgn0OV0Y8ud7Mh2tS,Dustin Lynch,spotify:album:23cuZhPWDfX1uKD4qwuv7t,Current Mood,9999
526062,spotify:track:7mldq42yDuxiUNn08nvzHO,Body Like A Back Road,spotify:artist:2kucQ9jQwuD8jWdtR9Ef38,Sam Hunt,spotify:album:2N7kidh1wA9EoLdf16QWrz,Body Like A Back Road,9999
526063,spotify:track:23TxRN09aR1RB0G0tFoT0b,Better Man,spotify:artist:3CygdxquGHurS7f9LjNLkv,Little Big Town,spotify:album:2aQOzEjLzPkffXDwREXdAh,The Breaker,9999


#### Preparing training data

In [14]:
# get df that we will use in training // artist_uri and album_uri can be added
training_df = all_songs_df[['pid', 'track_uri']]
training_df

Unnamed: 0,pid,track_uri
0,0,spotify:track:0UaMYEvWZi0ZqiDOoHU3YI
1,0,spotify:track:6I9VzXrHxO9rA9A5euc8Ak
2,0,spotify:track:0WqIKmW4BTrj3eJFmnCKMv
3,0,spotify:track:1AWQoqb9bSvzTjaLralEkT
4,0,spotify:track:1lzr43nnXAijIGYnCT8M8H
...,...,...
526060,9999,spotify:track:6ZOPiKQeibCn7fP8dncucL
526061,9999,spotify:track:7pxhKtuTwofDIdgHx2DcVK
526062,9999,spotify:track:7mldq42yDuxiUNn08nvzHO
526063,9999,spotify:track:23TxRN09aR1RB0G0tFoT0b


In [15]:
training_df['interaction'] = 1
training_df

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,pid,track_uri,interaction
0,0,spotify:track:0UaMYEvWZi0ZqiDOoHU3YI,1
1,0,spotify:track:6I9VzXrHxO9rA9A5euc8Ak,1
2,0,spotify:track:0WqIKmW4BTrj3eJFmnCKMv,1
3,0,spotify:track:1AWQoqb9bSvzTjaLralEkT,1
4,0,spotify:track:1lzr43nnXAijIGYnCT8M8H,1
...,...,...,...
526060,9999,spotify:track:6ZOPiKQeibCn7fP8dncucL,1
526061,9999,spotify:track:7pxhKtuTwofDIdgHx2DcVK,1
526062,9999,spotify:track:7mldq42yDuxiUNn08nvzHO,1
526063,9999,spotify:track:23TxRN09aR1RB0G0tFoT0b,1


In [16]:
# unique songs in all songs
all_unique_songs = training_df['track_uri'].unique()

In [17]:
print("Although we have {} tracks in all playlists we include, there are {} unique tracks.".format(training_df.shape[0], len(all_unique_songs)))

Although we have 526065 tracks in all playlists we include, there are 146399 unique tracks.


In [18]:
def get_negative_samples(training_df, all_unique_songs, number_of_neg_sample):

  """
  number_of_neg_sample : number of negative samples will be added for each playlist,
                         or assign 'same' to add negative samples as much as number of positive samples for each playlist.
  """

  all_neg_samples_list = []
  all_pids = training_df['pid'].unique()

  for playlist_id in tqdm(all_pids, position=0, leave=True):

    # tracks in corresponding playlist
    tracks_in_playlist = training_df[training_df.pid == playlist_id].track_uri.values

    # take the difference between all unique songs and songs in the playlist to get possible neg samples 
    possible_neg_samples =  np.array(list( set(all_unique_songs) - set(tracks_in_playlist) ))

    # get indices of n neg random samples
    random_neg_sample_indices = np.random.randint(0, len(possible_neg_samples), size=(number_of_neg_sample,))

    # get n neg random samples
    neg_samples_for_a_playlist = possible_neg_samples[random_neg_sample_indices]

    for a_track in neg_samples_for_a_playlist:
      all_neg_samples_list.append([playlist_id, a_track])

  all_neg_samples_df = pd.DataFrame(data = all_neg_samples_list, columns=['pid', 'track_uri'])
  all_neg_samples_df['interaction'] = 0

  return all_neg_samples_df

In [19]:
neg_samples_df = get_negative_samples(training_df, all_unique_songs, number_of_neg_sample=20)

100%|██████████| 4907/4907 [06:26<00:00, 12.69it/s]


In [20]:
training_df = pd.concat([training_df, neg_samples_df])

In [21]:
training_df.sort_values(['pid', 'interaction'], ascending=[True, False], inplace=True)

In [22]:
training_df.reset_index(drop=True, inplace=True)

In [23]:
training_df

Unnamed: 0,pid,track_uri,interaction
0,0,spotify:track:0UaMYEvWZi0ZqiDOoHU3YI,1
1,0,spotify:track:6I9VzXrHxO9rA9A5euc8Ak,1
2,0,spotify:track:0WqIKmW4BTrj3eJFmnCKMv,1
3,0,spotify:track:1AWQoqb9bSvzTjaLralEkT,1
4,0,spotify:track:1lzr43nnXAijIGYnCT8M8H,1
...,...,...,...
624200,9999,spotify:track:1iQJ8eNglGldMTZnN6DGvq,0
624201,9999,spotify:track:5OzlfXT5ZKAjuBfvXqaZQm,0
624202,9999,spotify:track:0WSv8FOuqHmi659cunJGIn,0
624203,9999,spotify:track:5QSKjxDRVRpd6BeHMv2ZM0,0


In [24]:
le = LabelEncoder()
le.fit(training_df.track_uri)
training_df['track_id'] = le.transform(training_df.track_uri)

In [25]:
le_pid = LabelEncoder()
le_pid.fit(training_df.pid)
training_df['playlist_id'] = le_pid.transform(training_df.pid)

In [26]:
training_df

Unnamed: 0,pid,track_uri,interaction,track_id,playlist_id
0,0,spotify:track:0UaMYEvWZi0ZqiDOoHU3YI,1,9427,0
1,0,spotify:track:6I9VzXrHxO9rA9A5euc8Ak,1,118356,0
2,0,spotify:track:0WqIKmW4BTrj3eJFmnCKMv,1,10109,0
3,0,spotify:track:1AWQoqb9bSvzTjaLralEkT,1,22175,0
4,0,spotify:track:1lzr43nnXAijIGYnCT8M8H,1,33472,0
...,...,...,...,...,...
624200,9999,spotify:track:1iQJ8eNglGldMTZnN6DGvq,0,32402,4906
624201,9999,spotify:track:5OzlfXT5ZKAjuBfvXqaZQm,0,101691,4906
624202,9999,spotify:track:0WSv8FOuqHmi659cunJGIn,0,9982,4906
624203,9999,spotify:track:5QSKjxDRVRpd6BeHMv2ZM0,0,102117,4906


#### Preparing test data

In [27]:
def get_test_samples(training_df, number_of_test_sample):

  """
  number_of_neg_sample : number of test samples will be selected for each playlist

  """

  all_test_samples_indices = []
  all_pids = training_df['pid'].unique()

  for playlist_id in tqdm(all_pids, position=0, leave=True):

    # indices of tracks in corresponding playlist
    track_indices = training_df[(training_df.pid == playlist_id) & (training_df.interaction == 1)].index.values

    # randomly select n track
    random_indices = np.random.randint(0, len(track_indices), size=(number_of_test_sample,))
    test_samples_ind_for_a_playlist = track_indices[random_indices]

    for test_sample_ind in test_samples_ind_for_a_playlist:
      all_test_samples_indices.append(test_sample_ind)

  return all_test_samples_indices

In [28]:
test_sample_indices = get_test_samples(training_df, number_of_test_sample=1)

100%|██████████| 4907/4907 [00:13<00:00, 374.43it/s]


In [30]:
test_df = training_df.iloc[test_sample_indices,:]
test_df.reset_index(drop=True, inplace=True)

In [32]:
training_df = training_df.drop(test_sample_indices)
training_df.reset_index(drop=True, inplace=True)

In [38]:
training_df.to_csv('training_df.csv', index=False) 
test_df.to_csv('test_df.csv', index=False) 

---

Use to load back training and test dataframes

In [4]:
training_df = pd.read_csv('training_df.csv')
test_df = pd.read_csv('test_df.csv')

In [41]:
training_df

Unnamed: 0,pid,track_uri,interaction,track_id,playlist_id
0,0,spotify:track:0UaMYEvWZi0ZqiDOoHU3YI,1,9427,0
1,0,spotify:track:6I9VzXrHxO9rA9A5euc8Ak,1,118356,0
2,0,spotify:track:0WqIKmW4BTrj3eJFmnCKMv,1,10109,0
3,0,spotify:track:1AWQoqb9bSvzTjaLralEkT,1,22175,0
4,0,spotify:track:1lzr43nnXAijIGYnCT8M8H,1,33472,0
...,...,...,...,...,...
619293,9999,spotify:track:1iQJ8eNglGldMTZnN6DGvq,0,32402,4906
619294,9999,spotify:track:5OzlfXT5ZKAjuBfvXqaZQm,0,101691,4906
619295,9999,spotify:track:0WSv8FOuqHmi659cunJGIn,0,9982,4906
619296,9999,spotify:track:5QSKjxDRVRpd6BeHMv2ZM0,0,102117,4906


In [42]:
test_df

Unnamed: 0,pid,track_uri,interaction,track_id,playlist_id
0,0,spotify:track:6d8A5sAx9TfdeseDvfWNHd,1,124641,0
1,2,spotify:track:6ToAD7ajJidQTDn72OncDG,1,121855,1
2,3,spotify:track:1Kzxd1kkjaGX4JZz2CYsXB,1,25395,2
3,5,spotify:track:6e8Ou0wiqAzIpWb2eSxll8,1,124937,3
4,7,spotify:track:0UE0RhnRaEYsiYgXpyLoZc,1,9327,4
...,...,...,...,...,...
4902,9992,spotify:track:6QgjcU0zLnzq5OrUoSZ3OK,1,120920,4902
4903,9993,spotify:track:5JBdJ82bsTGX4XHwDeJDHm,1,99919,4903
4904,9994,spotify:track:1U6riPmEZzICc3NVAlxihh,1,28069,4904
4905,9996,spotify:track:5APxd9Oameqe2EF2h5lDHI,1,97237,4905


---

### Model

In [77]:
import keras
from keras import backend as K
from keras.models import Sequential, Model, load_model, save_model
from keras.layers.core import Dense, Lambda, Activation
from keras.layers import Embedding, Input, Dense, merge, Reshape, Flatten, Multiply
from keras.optimizers import Adam
from keras.regularizers import l2

In [78]:
num_playlists = training_df.playlist_id.unique().max() + 1
num_tracks = training_df.track_id.unique().max() + 1

emb_vec_size_playlists = 16
emb_vec_size_tracks = 16

In [79]:
playlist_input = Input(shape=(1,), dtype='int32', name = 'playlist_input')
track_input = Input(shape=(1,), dtype='int32', name = 'track_input')


embedding_playlist = Embedding(input_dim = num_playlists, 
                               output_dim = emb_vec_size_playlists, 
                               name = 'playlist_embedding',
                               input_length=1)


embedding_track = Embedding(input_dim = num_tracks, 
                            output_dim = emb_vec_size_tracks, 
                            name = 'track_embedding',
                            input_length=1) 


playlist_emb_vec = Flatten()(embedding_playlist(playlist_input))
track_emb_vec = Flatten()(embedding_track(track_input))


# Element-wise product of playlist and track embeddings 
predict_vector = Multiply()([playlist_emb_vec, track_emb_vec])

prediction = Dense(1, activation='sigmoid', name = 'prediction')(predict_vector)

model = Model(inputs=[playlist_input, track_input], outputs=prediction)

In [80]:
model.summary()

Model: "model_4"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
playlist_input (InputLayer)     [(None, 1)]          0                                            
__________________________________________________________________________________________________
track_input (InputLayer)        [(None, 1)]          0                                            
__________________________________________________________________________________________________
playlist_embedding (Embedding)  (None, 1, 16)        78512       playlist_input[0][0]             
__________________________________________________________________________________________________
track_embedding (Embedding)     (None, 1, 16)        2342384     track_input[0][0]                
____________________________________________________________________________________________

In [81]:
model.compile(optimizer=Adam(lr=1e-3), loss='binary_crossentropy', metrics=['accuracy'])

In [82]:
hist = model.fit([training_df.playlist_id.values, training_df.track_id.values],
                  training_df.interaction.values,
                  validation_split=0.15, 
                  batch_size=256, 
                  epochs=3, 
                  shuffle=True)

Epoch 1/3
Epoch 2/3
Epoch 3/3


### Evaluation

In [345]:
test_df

Unnamed: 0,pid,track_uri,interaction,track_id,playlist_id
0,0,spotify:track:6d8A5sAx9TfdeseDvfWNHd,1,124641,0
1,2,spotify:track:6ToAD7ajJidQTDn72OncDG,1,121855,1
2,3,spotify:track:1Kzxd1kkjaGX4JZz2CYsXB,1,25395,2
3,5,spotify:track:6e8Ou0wiqAzIpWb2eSxll8,1,124937,3
4,7,spotify:track:0UE0RhnRaEYsiYgXpyLoZc,1,9327,4
...,...,...,...,...,...
4902,9992,spotify:track:6QgjcU0zLnzq5OrUoSZ3OK,1,120920,4902
4903,9993,spotify:track:5JBdJ82bsTGX4XHwDeJDHm,1,99919,4903
4904,9994,spotify:track:1U6riPmEZzICc3NVAlxihh,1,28069,4904
4905,9996,spotify:track:5APxd9Oameqe2EF2h5lDHI,1,97237,4905


In [138]:
p_ids = test_df.playlist_id.values
t_ids = test_df.track_id.values
test_len = len(p_ids)

In [160]:
test_scores = []
for i in tqdm(range(test_len)):
  score = model.predict([np.reshape([p_ids[i]], (-1,1)), np.reshape([t_ids[i]], (-1,1))])
  test_scores.append(score[0][0])

100%|██████████| 4907/4907 [02:44<00:00, 29.76it/s]


**Evaluation Metric 1** - The avg. probability of suggesting the randomly selected song from the playlist before training to the user as a recommendation (the random song was the playlist is hidden in training part) :

In [324]:
print("{}%".format(round(np.array(test_scores).mean()*100,2)))

82.18%


---

In [216]:
def get_negative_samples_test(training_df, all_unique_songs, number_of_neg_sample):

  """
  number_of_neg_sample : number of negative samples will be added for each playlist,
                         or assign 'same' to add negative samples as much as number of positive samples for each playlist.
  """

  all_neg_samples_list = []
  all_pids = training_df['playlist_id'].unique()

  for p_id in tqdm(all_pids, position=0, leave=True):

    # tracks in corresponding playlist
    tracks_in_playlist = training_df[training_df.playlist_id == p_id].track_id.values

    # take the difference between all unique songs and songs in the playlist to get possible neg samples 
    possible_neg_samples =  np.array(list( set(all_unique_songs) - set(tracks_in_playlist) ))

    # get indices of n neg random samples
    random_neg_sample_indices = np.random.randint(0, len(possible_neg_samples), size=(number_of_neg_sample,))

    # get n neg random samples
    neg_samples_for_a_playlist = possible_neg_samples[random_neg_sample_indices]

    for a_track in neg_samples_for_a_playlist:
      all_neg_samples_list.append([p_id, a_track])

  all_neg_samples_df = pd.DataFrame(data = all_neg_samples_list, columns=['playlist_id', 'track_id'])
  all_neg_samples_df['interaction'] = 0

  return all_neg_samples_df

In [201]:
train_test_df = pd.concat([training_df, test_df])
train_test_df = train_test_df[train_test_df.interaction == 1]
train_test_df.sort_values('pid', inplace=True)
train_test_df.reset_index(drop=True, inplace=True)

In [207]:
train_test_df

Unnamed: 0,pid,track_uri,interaction,track_id,playlist_id
0,0,spotify:track:0UaMYEvWZi0ZqiDOoHU3YI,1,9427,0
1,0,spotify:track:34ceTg8ChN5HjrqiIYCn9Q,1,57885,0
2,0,spotify:track:5Q0Nhxo0l2bP3pNjpGJwV1,1,101985,0
3,0,spotify:track:6GIrIt2M39wEGwjCQjGChX,1,117795,0
4,0,spotify:track:4E5P1XyAFtrjpiIxkydly4,1,79573,0
...,...,...,...,...,...
526060,9999,spotify:track:5S5rw0WLVCAux5B5bWCehK,1,102630,4906
526061,9999,spotify:track:5y2pNuAvWcPKiORZ12e44S,1,112220,4906
526062,9999,spotify:track:2D2tn77f5ZrADmf8h4zR3t,1,41571,4906
526063,9999,spotify:track:0spUlzosnRQfp22j4wtHa6,1,16702,4906


In [217]:
all_unique_track_ids = train_test_df.track_id.unique()
neg_samples_for_test = get_negative_samples_test(train_test_df, all_unique_track_ids, number_of_neg_sample=99)

100%|██████████| 4907/4907 [03:46<00:00, 21.66it/s]


In [224]:
test_df_2 = pd.concat([neg_samples_for_test, test_df[['playlist_id', 'track_id', 'interaction']]])
test_df_2.sort_values(['playlist_id', 'interaction'], ascending=[True, False], inplace=True)
test_df_2.reset_index(drop=True, inplace=True)

In [227]:
test_df_2

Unnamed: 0,playlist_id,track_id,interaction
0,0,124641,1
1,0,104064,0
2,0,106544,0
3,0,13030,0
4,0,108178,0
...,...,...,...
490695,4906,108344,0
490696,4906,123451,0
490697,4906,121440,0
490698,4906,91521,0


In [280]:
all_scores = []

unique_pid_list = test_df_2.playlist_id.unique()
for pid in tqdm(range(len(unique_pid_list)),position=0, leave=True):

  pid_scores = []
  track_ids = test_df_2[test_df_2.playlist_id == pid].track_id.values
  
  for tid in range(len(track_ids)):

      score = model.predict([np.reshape([pid], (-1,1)), np.reshape([track_ids[tid]], (-1,1))])
      pid_scores.append(score[0][0])
    
  all_scores.append(pid_scores)

100%|██████████| 4907/4907 [4:21:15<00:00,  3.19s/it]


In [284]:
np.savetxt('all_scores.txt', all_scores)

In [285]:
#to load back
all_scores_loaded = np.loadtxt('all_scores.txt')

In [None]:
uniuqe_pid_list = test_df_2.playlist_id.unique()

test_click_ranks = []

for pid in tqdm(range(len(unique_pid_list)),position=0, leave=True):
  pid_df = test_df_2[test_df_2.playlist_id == pid]
  pid_df['pred_scores'] = all_scores_loaded[pid,:]
  pid_df.sort_values('pred_scores', ascending=False, inplace=True)
  pid_df.reset_index(drop=True, inplace=True)
  test_click_ranks.append(pid_df[pid_df.interaction==1].index.values[0])

In [310]:
test_click_ranks = np.array(test_click_ranks)

**Evaluation Metric 2** - Avg. number of click:

In [313]:
test_click_ranks.mean()

22.577542286529447

**Evaluation Metric 3** - Top-k (in 100 tracks):


We say that we have a **hit**, if the best **k** songs chosen among 100 songs as recommendation for a playlist includes the song that are randomly selected from the playlist before training. And **accuracy score** shows that we have how many 'hit' among all the playlist used in testing.

In [337]:
def print_top_k_acc(click_ranks, k):
  acc = np.sum(click_ranks < k) / len(click_ranks)
  acc = round(acc, 4)
  print("Top-{} accuracy: {}".format(k, acc))

In [343]:
print_top_k_acc(test_click_ranks,1)
print_top_k_acc(test_click_ranks,2)
print_top_k_acc(test_click_ranks,5)
print_top_k_acc(test_click_ranks,10)
print_top_k_acc(test_click_ranks,20)


Top-1 accuracy: 0.2653
Top-2 accuracy: 0.3668
Top-5 accuracy: 0.4981
Top-10 accuracy: 0.581
Top-20 accuracy: 0.6527
