<a href="https://colab.research.google.com/github/mserkantan/spotify-playlist-recommendation/blob/trials/train.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
from matplotlib import style
import seaborn as sns

import os, sys
from tqdm import tqdm 
import json

# sklearn libraries
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

import keras
from keras import backend as K
from keras.models import Sequential, Model, load_model, save_model
from keras.layers.core import Dense, Lambda, Activation
from keras.layers import Embedding, Input, Dense, merge, Reshape, Flatten, Multiply
from keras.optimizers import Adam
from keras.regularizers import l2

# configure
# Render matplotlib figures inline in the notebook output.
%matplotlib inline  
style.use('fivethirtyeight')
# NOTE(review): seaborn's style is applied after the matplotlib style above;
# presumably it overrides parts of 'fivethirtyeight' -- confirm which look is intended.
sns.set(style='whitegrid',color_codes=True)

# Mount Google Drive (Colab-only) so the project folder below is reachable.
from google.colab import drive
drive.mount('/content/drive')

# Put the project folder on the import path so the local `utils` module resolves.
sys.path.append('/content/drive/MyDrive/spotify-playlist-recommendation')
from utils import get_all_songs_df, get_negative_samples, get_negative_samples_test, get_playlists_df, get_test_samples, print_top_k_acc

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Build the train / test interaction tables.
#
# Steps: load playlists -> keep the long ones -> flatten to (pid, track_uri)
# rows -> add sampled negatives -> label-encode ids -> hold out test rows ->
# persist both tables as CSV and reload them.
#
# Names produced for later cells: playlists_df, all_songs_df, all_unique_songs,
# neg_samples_df, le, le_pid, test_sample_indices, training_df, test_df.

playlists_df = get_playlists_df(number_of_files=1)

# apply the threshold to playlists (strictly more tracks than the threshold)
threshold_num_tracks = 50
playlists_df = playlists_df[playlists_df.num_tracks > threshold_num_tracks]

# get all songs
all_songs_df = get_all_songs_df(playlists_df)

# get df that we will use in training // artist_uri and album_uri can be added
# .copy() makes this an independent frame, so adding the `interaction` column
# below is not a chained assignment on a selection of all_songs_df
# (which raises SettingWithCopyWarning).
training_df = all_songs_df[['pid', 'track_uri']].copy()

# Every row taken from a playlist is an observed (positive) interaction.
training_df['interaction'] = 1

# unique songs in all songs
all_unique_songs = training_df['track_uri'].unique()

print("Although we have {} tracks in all playlists we include, there are {} unique tracks.".format(training_df.shape[0], len(all_unique_songs)))

# Negative rows come from the project helper; its sampling strategy is not
# visible here -- presumably `number_of_neg_sample` is per playlist (TODO confirm).
neg_samples_df = get_negative_samples(training_df, all_unique_songs, number_of_neg_sample=20)

# Positives first within each playlist, then a clean 0..n-1 index.
training_df = (
    pd.concat([training_df, neg_samples_df])
    .sort_values(['pid', 'interaction'], ascending=[True, False])
    .reset_index(drop=True)
)

# Integer ids for the embedding layers.
le = LabelEncoder()
training_df['track_id'] = le.fit_transform(training_df.track_uri)

le_pid = LabelEncoder()
training_df['playlist_id'] = le_pid.fit_transform(training_df.pid)

test_sample_indices = get_test_samples(training_df, number_of_test_sample=1)

# The index was reset above, so positions (iloc) and labels (drop) coincide
# and both statements refer to the same rows.
test_df = training_df.iloc[test_sample_indices, :].reset_index(drop=True)
training_df = training_df.drop(test_sample_indices).reset_index(drop=True)

# Persist the split, then continue from the on-disk copies.
training_df.to_csv('training_df.csv', index=False)
test_df.to_csv('test_df.csv', index=False)

training_df = pd.read_csv('training_df.csv')
test_df = pd.read_csv('test_df.csv')



In [None]:


# Build and train the playlist/track model: two embeddings, an element-wise
# product, and a single sigmoid unit on top.

# Embedding tables are indexed by the label-encoded ids, so they need one row
# per id that will ever be looked up. The held-out rows were removed from
# training_df, so the largest id may exist only in test_df; size the tables
# from both frames to keep every later lookup in range.
num_playlists = int(max(training_df.playlist_id.max(), test_df.playlist_id.max())) + 1
num_tracks = int(max(training_df.track_id.max(), test_df.track_id.max())) + 1

emb_vec_size_playlists = 16
emb_vec_size_tracks = 16

# Each sample is a single integer id per input.
playlist_input = Input(shape=(1,), dtype='int32', name = 'playlist_input')
track_input = Input(shape=(1,), dtype='int32', name = 'track_input')


embedding_playlist = Embedding(input_dim = num_playlists,
                               output_dim = emb_vec_size_playlists,
                               name = 'playlist_embedding',
                               input_length=1)


embedding_track = Embedding(input_dim = num_tracks,
                            output_dim = emb_vec_size_tracks,
                            name = 'track_embedding',
                            input_length=1)


# Flatten removes the length-1 sequence axis left by the embedding lookup.
playlist_emb_vec = Flatten()(embedding_playlist(playlist_input))
track_emb_vec = Flatten()(embedding_track(track_input))


# Element-wise product of playlist and track embeddings
# (requires both embedding sizes to be equal).
predict_vector = Multiply()([playlist_emb_vec, track_emb_vec])

prediction = Dense(1, activation='sigmoid', name = 'prediction')(predict_vector)

model = Model(inputs=[playlist_input, track_input], outputs=prediction)

# NOTE(review): newer Keras releases name this argument `learning_rate`;
# `lr` is kept to match the Keras version this notebook's imports target.
model.compile(optimizer=Adam(lr=1e-3), loss='binary_crossentropy', metrics=['accuracy'])

# 15% of the rows are set aside by Keras for validation.
hist = model.fit([training_df.playlist_id.values, training_df.track_id.values],
                  training_df.interaction.values,
                  validation_split=0.15,
                  batch_size=256,
                  epochs=3,
                  shuffle=True)


In [None]:

# Evaluate the trained model in two ways:
#   1. mean predicted score over the held-out rows;
#   2. for every playlist, the rank of its held-out row among sampled
#      negatives, followed by top-k accuracy.

# Rows per predict call; scoring in batches replaces one model.predict call per row.
predict_batch_size = 4096

# ---- 1) mean predicted score on the held-out rows -------------------------
p_ids = test_df.playlist_id.values
t_ids = test_df.track_id.values
test_len = len(p_ids)

# model.predict returns one column; [:, 0] flattens it to one score per row.
test_scores = model.predict([p_ids.reshape(-1, 1), t_ids.reshape(-1, 1)],
                            batch_size=predict_batch_size)[:, 0]

print("{}%".format(round(np.array(test_scores).mean()*100,2)))


# ---- 2) rank the held-out row against sampled negatives -------------------
# All positive interactions (train + test) are handed to the negative sampler.
train_test_df = pd.concat([training_df, test_df])
train_test_df = (
    train_test_df[train_test_df.interaction == 1]
    .sort_values('pid')
    .reset_index(drop=True)
)

all_unique_track_ids = train_test_df.track_id.unique()
neg_samples_for_test = get_negative_samples_test(train_test_df, all_unique_track_ids, number_of_neg_sample=99)

# Candidate list per playlist: held-out positive first, then its negatives.
test_df_2 = (
    pd.concat([neg_samples_for_test, test_df[['playlist_id', 'track_id', 'interaction']]])
    .sort_values(['playlist_id', 'interaction'], ascending=[True, False])
    .reset_index(drop=True)
)

# Score every candidate row in one batched pass.
candidate_scores = model.predict(
    [test_df_2.playlist_id.values.reshape(-1, 1),
     test_df_2.track_id.values.reshape(-1, 1)],
    batch_size=predict_batch_size)[:, 0]

unique_pid_list = test_df_2.playlist_id.unique()

# Positional row numbers of each playlist's candidates, keyed by the real
# playlist id (not by its position in unique_pid_list).
rows_by_pid = test_df_2.groupby('playlist_id', sort=False).indices

# One row of scores per playlist, in unique_pid_list order.
all_scores = [candidate_scores[rows_by_pid[pid]] for pid in unique_pid_list]

# savetxt needs a rectangular table, i.e. the same number of candidates
# for every playlist.
np.savetxt('all_scores.txt', all_scores)

#to load back
# ndmin=2 keeps the table 2-D even when there is only one playlist.
all_scores_loaded = np.loadtxt('all_scores.txt', ndmin=2)

test_click_ranks = []

for row, pid in enumerate(tqdm(unique_pid_list, position=0, leave=True)):
    # Row `row` of the score table belongs to playlist id `pid`.
    pid_df = (
        test_df_2.iloc[rows_by_pid[pid]]
        .assign(pred_scores=all_scores_loaded[row, :])
        .sort_values('pred_scores', ascending=False)
        .reset_index(drop=True)
    )
    # After the reset, the index of the positive row is its 0-based rank.
    test_click_ranks.append(pid_df[pid_df.interaction==1].index.values[0])

test_click_ranks = np.array(test_click_ranks)

# Printed explicitly: a bare expression in the middle of a cell shows nothing.
print("Mean rank of the held-out track: {:.2f}".format(test_click_ranks.mean()))


for k in (1, 2, 5, 10, 20):
    print_top_k_acc(test_click_ranks, k)