<a href="https://colab.research.google.com/github/mserkantan/spotify-playlist-recommendation/blob/trials/train.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [16]:
import numpy as np
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'

import matplotlib.pyplot as plt
from matplotlib import style
import seaborn as sns

import os, sys, time
from tqdm import tqdm 
import json

# sklearn libraries
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

import keras
from keras import backend as K
from keras.models import Sequential, Model, load_model, save_model
from keras.layers.core import Dense, Lambda, Activation
from keras.layers import Embedding, Input, Dense, merge, Reshape, Flatten, Multiply
from keras.optimizers import Adam
from keras.regularizers import l2

# configure
# Render matplotlib figures inline in the notebook output.
%matplotlib inline  
# Plot styling: matplotlib style sheet, then seaborn's 'whitegrid' settings.
style.use('fivethirtyeight')
sns.set(style='whitegrid',color_codes=True)

# Colab-only: mount Google Drive so the project folder below is reachable.
from google.colab import drive
drive.mount('/content/drive')

# Put the project folder on sys.path so the local `utils` module can be imported.
# NOTE(review): the path is hard-coded to one Drive layout — adjust when running elsewhere.
sys.path.append('/content/drive/MyDrive/spotify-playlist-recommendation')
from utils import get_all_songs_df, get_negative_samples, get_negative_samples_test, get_playlists_df, get_test_samples, print_top_k_acc

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
# Load the playlists, keep only the long ones, and build the table of positive
# (playlist, track, artist) rows that the following cells train on.
playlists_df = get_playlists_df(number_of_files=1)

# apply the threshold to playlists: keep those with MORE than this many tracks
threshold_num_tracks = 50
playlists_df = playlists_df[playlists_df.num_tracks > threshold_num_tracks]

# get all songs of the remaining playlists
all_songs_df = get_all_songs_df(playlists_df)

# get df that we will use in training // album_uri can be added as a further feature
# .copy(): later cells add columns to training_df, which must not be a view of all_songs_df.
training_df = all_songs_df[['pid', 'track_uri', 'artist_uri']].copy()
print("Dataset size before filtering the NaNs: {}".format(len(training_df)))

# Drop rows without an artist; they cannot be encoded for the artist embedding.
training_df = training_df.dropna(subset=['artist_uri']).copy()
print("Dataset size after filtering the NaNs: {}".format(len(training_df)))

Dataset size before filtering the NaNs: 53898
Dataset size after filtering the NaNs: 53898
       pid  ...                             artist_uri
0        0  ...  spotify:artist:2wIVse2owClT7go1WT98tk
1        0  ...  spotify:artist:26dSoYclwsYLMAKD3tpOr4
2        0  ...  spotify:artist:6vWDO969PvNqNYHIOW5v0m
3        0  ...  spotify:artist:31TPClRtHm23RisEBtV3X7
4        0  ...  spotify:artist:5EvFsr3kj42KNv97ZEnqij
...    ...  ...                                    ...
53893  998  ...  spotify:artist:7x8nK0m0cP2ksQf0mjWdPS
53894  998  ...  spotify:artist:4xFUf1FHVy696Q1JQZMTRj
53895  998  ...  spotify:artist:7dOBabd5O4CvKrg4iriHTM
53896  998  ...  spotify:artist:0qSX3s5pJnAlSsgsCne8Cz
53897  998  ...  spotify:artist:0fiWOxhsBsQQvFDtxUQWo0

[53898 rows x 3 columns]


In [7]:
# Every row loaded so far is an observed (playlist, track) pair -> label 1.
training_df = training_df.assign(interaction=1)

# unique songs across all included playlists
all_unique_songs = training_df['track_uri'].unique()

print(
    "Although we have {} tracks in all playlists we include, there are {} unique tracks.".format(
        training_df.shape[0],
        len(all_unique_songs),
    )
)

# Negative rows (interaction == 0 in the displayed output) produced by the project helper.
neg_samples_df = get_negative_samples(training_df, all_unique_songs, number_of_neg_sample=20)

# Stack positives and negatives, order by playlist with positives first,
# and renumber the rows 0..n-1.
training_df = (
    pd.concat([training_df, neg_samples_df])
    .sort_values(['pid', 'interaction'], ascending=[True, False])
    .reset_index(drop=True)
)
training_df

  1%|          | 3/494 [00:00<00:18, 26.87it/s]

Although we have 53898 tracks in all playlists we include, there are 29197 unique tracks.


100%|██████████| 494/494 [00:18<00:00, 26.01it/s]


Unnamed: 0,pid,track_uri,artist_uri,interaction
0,0,spotify:track:0UaMYEvWZi0ZqiDOoHU3YI,spotify:artist:2wIVse2owClT7go1WT98tk,1
1,0,spotify:track:6I9VzXrHxO9rA9A5euc8Ak,spotify:artist:26dSoYclwsYLMAKD3tpOr4,1
2,0,spotify:track:0WqIKmW4BTrj3eJFmnCKMv,spotify:artist:6vWDO969PvNqNYHIOW5v0m,1
3,0,spotify:track:1AWQoqb9bSvzTjaLralEkT,spotify:artist:31TPClRtHm23RisEBtV3X7,1
4,0,spotify:track:1lzr43nnXAijIGYnCT8M8H,spotify:artist:5EvFsr3kj42KNv97ZEnqij,1
...,...,...,...,...
63773,998,spotify:track:1jPeNIZAEJr0IvCVN5meRh,spotify:artist:7H6dkUChT5EoOQtUVMg4cN,0
63774,998,spotify:track:1Zq4NhdZi7FDBDQt8YIkvn,spotify:artist:7H6dkUChT5EoOQtUVMg4cN,0
63775,998,spotify:track:3Np3DKcOQuraKONOVVMxGJ,spotify:artist:7H6dkUChT5EoOQtUVMg4cN,0
63776,998,spotify:track:44zQmgpr8TnYChiWHmGyYO,spotify:artist:7H6dkUChT5EoOQtUVMg4cN,0


In [8]:

# Encode track URIs, artist URIs and playlist ids as contiguous integer ids
# (0..n-1) so they can be used as indices into the embedding layers.
le = LabelEncoder()
training_df['track_id'] = le.fit_transform(training_df.track_uri)

# Encoder for artist_uri. The original name `le_album` was misleading; it is
# kept as an alias so any code that still refers to it keeps working.
le_artist = LabelEncoder()
training_df['artist_id'] = le_artist.fit_transform(training_df.artist_uri)
le_album = le_artist

le_pid = LabelEncoder()
training_df['playlist_id'] = le_pid.fit_transform(training_df.pid)

# Rows to hold out for testing, chosen by the project helper.
test_sample_indices = get_test_samples(training_df, number_of_test_sample=1)

# The indices are treated as row POSITIONS for both the selection and the
# removal. (Previously rows were selected with .iloc but removed by label,
# which only agrees while the index is an untouched 0..n-1 range.)
test_row_labels = training_df.index[test_sample_indices]
test_df = training_df.iloc[test_sample_indices, :].reset_index(drop=True)
training_df = training_df.drop(test_row_labels).reset_index(drop=True)

# Persist the split, then read it back so the in-memory frames are exactly
# what a later session would get when starting from the CSV files.
training_df.to_csv('training_df.csv', index=False)
test_df.to_csv('test_df.csv', index=False)

training_df = pd.read_csv('training_df.csv')
test_df = pd.read_csv('test_df.csv')

100%|██████████| 494/494 [00:00<00:00, 650.91it/s]


In [11]:


# Three-way embedding model: playlist, track and artist ids are each embedded,
# the embeddings are multiplied element-wise, and a single sigmoid unit turns
# the product into the probability that the track belongs to the playlist.

# Embedding tables must cover every id that will ever be fed to the model.
# The ids were assigned before the test rows were held out, so an id may occur
# only in test_df; sizing the tables from training_df alone could make such an
# id fall outside the table at prediction time.
num_playlists = int(pd.concat([training_df.playlist_id, test_df.playlist_id]).max()) + 1
num_tracks = int(pd.concat([training_df.track_id, test_df.track_id]).max()) + 1
num_artists = int(pd.concat([training_df.artist_id, test_df.artist_id]).max()) + 1

# All three embeddings must share one size because they are multiplied element-wise.
emb_vec_size_playlists = 16
emb_vec_size_tracks = 16
emb_vec_size_artists = 16

playlist_input = Input(shape=(1,), dtype='int32', name = 'playlist_input')
track_input = Input(shape=(1,), dtype='int32', name = 'track_input')
artist_input = Input(shape=(1,), dtype='int32', name = 'artist_input')


embedding_playlist = Embedding(input_dim = num_playlists,
                               output_dim = emb_vec_size_playlists,
                               name = 'playlist_embedding',
                               input_length=1)


embedding_track = Embedding(input_dim = num_tracks,
                            output_dim = emb_vec_size_tracks,
                            name = 'track_embedding',
                            input_length=1)

embedding_artist = Embedding(input_dim = num_artists,
                             output_dim = emb_vec_size_artists,
                             name = 'artist_embedding',
                             input_length=1)


# (batch, 1, emb) -> (batch, emb)
playlist_emb_vec = Flatten()(embedding_playlist(playlist_input))
track_emb_vec = Flatten()(embedding_track(track_input))
artist_emb_vec = Flatten()(embedding_artist(artist_input))


# Element-wise product of playlist, track and artist embeddings
predict_vector = Multiply()([playlist_emb_vec, track_emb_vec, artist_emb_vec])

prediction = Dense(1, activation='sigmoid', name = 'prediction')(predict_vector)

model = Model(inputs=[playlist_input, track_input, artist_input], outputs=prediction)

model.compile(optimizer=Adam(lr=1e-3), loss='binary_crossentropy', metrics=['accuracy'])

# Keras carves the validation split off the LAST rows of the arrays, before
# its own shuffling. training_df is ordered by playlist, so without the
# shuffle below the validation set would consist of whole playlists that
# never appear in training. Shuffle once, reproducibly, before fitting.
SHUFFLE_SEED = 42
shuffled_training_df = training_df.sample(frac=1.0, random_state=SHUFFLE_SEED)

hist = model.fit([shuffled_training_df.playlist_id.values,
                  shuffled_training_df.track_id.values,
                  shuffled_training_df.artist_id.values],
                 shuffled_training_df.interaction.values,
                 validation_split=0.15,
                 batch_size=256,
                 epochs=5,
                 shuffle=True)


Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


In [None]:

# Evaluation, in two parts:
#   1. mean predicted score of the held-out positive rows;
#   2. ranking of each playlist's held-out track among sampled negatives,
#      summarised as mean rank and top-k accuracy.

# Scoring is done in a few large batches instead of one predict() call per row.
PREDICT_BATCH_SIZE = 1024

# ---- 1. mean score of the held-out positives --------------------------------
p_ids = test_df.playlist_id.values
t_ids = test_df.track_id.values
a_ids = test_df.artist_id.values
test_len = len(p_ids)

test_scores = model.predict(
    [p_ids.reshape(-1, 1), t_ids.reshape(-1, 1), a_ids.reshape(-1, 1)],
    batch_size=PREDICT_BATCH_SIZE,
).ravel()

print("{}%".format(round(test_scores.mean()*100,2)))


# ---- 2. rank the held-out track against sampled negatives -------------------
# All positive rows (train + test), ordered by playlist; this is the input the
# project helper uses to draw test negatives.
train_test_df = pd.concat([training_df, test_df])
train_test_df = (
    train_test_df[train_test_df.interaction == 1]
    .sort_values('pid')
    .reset_index(drop=True)
)

all_unique_track_ids = train_test_df.track_id.unique()
neg_samples_for_test = get_negative_samples_test(train_test_df, all_unique_track_ids, number_of_neg_sample=99)

# Candidate list per playlist: the held-out positive first, then its negatives.
test_df_2 = (
    pd.concat([neg_samples_for_test, test_df[['playlist_id', 'track_id', 'artist_id', 'interaction']]])
    .sort_values(['playlist_id', 'interaction'], ascending=[True, False])
    .reset_index(drop=True)
)

# Score every candidate row in one batched call.
candidate_scores = model.predict(
    [test_df_2.playlist_id.values.reshape(-1, 1),
     test_df_2.track_id.values.reshape(-1, 1),
     test_df_2.artist_id.values.reshape(-1, 1)],
    batch_size=PREDICT_BATCH_SIZE,
).ravel()

# Row positions of each playlist's candidates, keyed by the real playlist_id
# (not by a 0..n-1 counter that merely happens to coincide with the ids).
unique_pid_list = test_df_2.playlist_id.unique()
rows_by_playlist = test_df_2.groupby('playlist_id', sort=False).indices

# One row of scores per playlist, in unique_pid_list order. Saving as a matrix
# requires every playlist to have the same number of candidates.
all_scores = [candidate_scores[rows_by_playlist[pid]] for pid in unique_pid_list]

np.savetxt('all_scores.txt', all_scores)

#to load back (ndmin=2 keeps the matrix 2-D even for a single playlist)
all_scores_loaded = np.loadtxt('all_scores.txt', ndmin=2)

test_click_ranks = []

for row, pid in tqdm(enumerate(unique_pid_list), total=len(unique_pid_list), position=0, leave=True):
  # .copy(): a column is added below, which must not write into test_df_2.
  pid_df = test_df_2.iloc[rows_by_playlist[pid]].copy()
  pid_df['pred_scores'] = all_scores_loaded[row, :]
  pid_df = pid_df.sort_values('pred_scores', ascending=False).reset_index(drop=True)
  # After the reset, the index of the positive row is its 0-based rank.
  test_click_ranks.append(pid_df[pid_df.interaction == 1].index.values[0])

test_click_ranks = np.array(test_click_ranks)

print("Mean rank of the held-out track: {:.2f}".format(test_click_ranks.mean()))


print_top_k_acc(test_click_ranks,1)
print_top_k_acc(test_click_ranks,2)
print_top_k_acc(test_click_ranks,5)
print_top_k_acc(test_click_ranks,10)
print_top_k_acc(test_click_ranks,20)