<a href="https://colab.research.google.com/github/mserkantan/spotify-playlist-recommendation/blob/trials/train.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
pd.options.mode.chained_assignment = None  # default='warn'
 
import matplotlib.pyplot as plt
from matplotlib import style
import seaborn as sns
 
import os, sys, time
from tqdm import tqdm 
import json
 
# sklearn libraries
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
 
import keras
from keras import backend as K
from keras.models import Sequential, Model, load_model, save_model
from keras.layers.core import Dense, Lambda, Activation
from keras.layers import Embedding, Input, Dense, merge, Reshape, Flatten, Multiply
from keras.optimizers import Adam
from keras.regularizers import l2
 
# configure
# Notebook-wide setup: plotting defaults, Google Drive access and project helpers.
# IPython magic: render matplotlib figures inline in the notebook output.
%matplotlib inline  
# Select the matplotlib 'fivethirtyeight' style sheet, then apply seaborn's
# 'whitegrid' settings on top (sns.set runs after style.use).
style.use('fivethirtyeight')
sns.set(style='whitegrid',color_codes=True)
 
# Mount Google Drive at /content/drive (Colab-only; requires the google.colab package).
from google.colab import drive
drive.mount('/content/drive')
 
# Put the project folder on the import path so that `utils` can be imported below.
# NOTE(review): presumably utils.py lives in this Drive folder -- confirm the path.
sys.path.append('/content/drive/MyDrive/spotify-playlist-recommendation')
from utils import get_all_songs_df, get_negative_samples, get_negative_samples_test, get_playlists_df, get_test_samples, print_top_k_acc

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [7]:
class NCFDriver:
  '''
  End-to-end driver for an embedding-based playlist/track recommender.

  Constructing an instance runs the whole experiment: the training/test frames
  are built (or read back from CSV), one embedding per input feature is created,
  the model is trained, and finally it is evaluated with a ranking test.

  NOTE(review): the extra (non playlist / non track) features are selected with
  `INPUT_VECTOR[2:]` and `ids[2:]`, so "pid" and "track_uri" are expected to be
  the first two entries of `input_vector` -- the constructor only checks that
  both are present, not where they are.
  '''

  # Raw input column -> short feature name. The id column of a feature is
  # "<feature>_id"; the feature name also keys the Keras inputs/embeddings.
  _FEATURE_NAMES = {
      "pid": "playlist",
      "track_uri": "track",
      "album_uri": "album",
      "artist_uri": "artist",
  }

  # Value passed as `number_of_neg_sample` to get_negative_samples_test when the
  # candidate set for the ranking evaluation is built.
  _TEST_NEG_COUNT = 99

  def __init__(self, params, load_dataset=False):
    '''
      Read the experiment parameters and run load -> embed -> train -> predict.

      @args: params (dict), experiment parameters; only "NO" is mandatory:
        NO, identifier of the run, used as prefix of every file written/read
        file_count, total chunk of playlists to be included 10 corresponds to 10 * 1000 = 10.000 (default 10)
        threshold, playlists are kept only when num_tracks is strictly greater than this (default 50)
        input_vector, columns to embed, e.g. pid, track_uri, album_uri, artist_uri
                      (default ["pid", "track_uri", "artist_uri"])
        neg_count, amount of negative samples added to fight data imbalance (default 20)
        test_count, how many samples are held out for evaluation (default 1)
        embed_out, embedding output size per input feature (default 16 for all)
        conc_method, how to fuse embeddings (default "element_wise"); stored only,
                     train() always uses an element-wise product
        layers (list), MLP layer sizes (default [1]); stored only, not used yet
        activation, activation of the prediction layer (default "sigmoid")
        optimizer, optimization algorithm (default "adam"); stored only, train() always uses Adam
        lr, learning rate (default 1e-3)
        loss_fcn, loss function (default "binary_crossentropy")
        val_split, train/validation split (default 0.15)
        batch_size, (default 256)
        epoch, number of epochs for training (default 3)
      @args: load_dataset (bool), when True read '<NO>_training_df.csv' and
             '<NO>_test_df.csv' instead of rebuilding them with load_data()

      @raises: AssertionError when "NO" is missing, when "pid" or "track_uri" is
               not part of input_vector, or when input_vector has an unknown column
    '''
    if "NO" not in params:
      raise AssertionError('The key "NO" is expected!')
    self.NO = params["NO"]

    self.FILE_COUNT   = params.get("file_count", 10)
    self.THRESHOLD    = params.get("threshold", 50)
    self.INPUT_VECTOR = params.get("input_vector", ["pid", "track_uri", "artist_uri"])
    self.NEG_COUNT    = params.get("neg_count", 20)
    self.TEST_COUNT   = params.get("test_count", 1)
    self.EMBED_OUT    = params.get("embed_out", [16 for _ in self.INPUT_VECTOR])
    self.CONC_METHOD  = params.get("conc_method", "element_wise")
    self.LAYERS       = params.get("layers", [1])
    self.ACTIVATION   = params.get("activation", "sigmoid")
    self.OPTIMIZER    = params.get("optimizer", "adam")
    self.LR           = params.get("lr", 1e-3)
    self.LOSS_FCN     = params.get("loss_fcn", "binary_crossentropy")
    self.VAL_SPLIT    = params.get("val_split", 0.15)
    self.BATCH_SIZE   = params.get("batch_size", 256)
    self.EPOCH        = params.get("epoch", 3)

    assert "track_uri" in self.INPUT_VECTOR, 'Expected "track_uri" in input_vector, given features: {}'.format(self.INPUT_VECTOR)

    assert "pid" in self.INPUT_VECTOR, 'Expected "pid" in input_vector, given features: {}'.format(self.INPUT_VECTOR)

    # Resolve feature names / id columns here rather than inside load_data(), so
    # they are also available when the frames are read back from CSV.
    self.names = []
    self.ids = []
    for column in self.INPUT_VECTOR:
      if column not in self._FEATURE_NAMES:
        raise AssertionError("An error Occured argument 'input_vector' has invalid column names, {}".format(column))
      feature = self._FEATURE_NAMES[column]
      self.names.append(feature)
      self.ids.append("{}_id".format(feature))

    self.encoded_labels = {name: LabelEncoder() for name in self.INPUT_VECTOR}

    if load_dataset:
      self.training_df = pd.read_csv('{}_training_df.csv'.format(self.NO))
      self.test_df = pd.read_csv('{}_test_df.csv'.format(self.NO))
      # predict() needs the negative candidates; load_data() is skipped on this
      # path, so they have to be built here.
      self._build_test_negatives()
    else:
      self.load_data()

    self.create_embeddings()
    self.train()
    self.predict()

  def load_data(self):
    '''
      Build the training and test frames from the raw playlists.

      Steps: read playlists, keep those with more than THRESHOLD tracks, mark
      every (playlist, track) row as interaction 1, append negative samples,
      label-encode every input column into its "<feature>_id" column, hold out
      the test rows, write both frames to '<NO>_training_df.csv' /
      '<NO>_test_df.csv', and build the negative candidates used by predict().
    '''
    self.playlists_df = get_playlists_df(number_of_files=self.FILE_COUNT)
    self.playlists_df = self.playlists_df[self.playlists_df.num_tracks > self.THRESHOLD]
    self.all_songs_df = get_all_songs_df(self.playlists_df)
    # Copy so that adding columns never writes into a view of all_songs_df.
    self.training_df = self.all_songs_df[self.INPUT_VECTOR].copy()
    self.training_df['interaction'] = 1
    self.all_unique_songs = self.training_df['track_uri'].unique()

    print("Although we have {} tracks in all playlists we include, there are {} unique tracks.\n".format(self.training_df.shape[0], len(self.all_unique_songs)))

    self.neg_samples_df = get_negative_samples(self.training_df, self.all_unique_songs, number_of_neg_sample=self.NEG_COUNT, inp=self.INPUT_VECTOR[2:])
    self.training_df = pd.concat([self.training_df, self.neg_samples_df])
    self.training_df = self.training_df.sort_values(['pid', 'interaction'], ascending=[True, False]).reset_index(drop=True)

    # Encode each raw column into a dense integer id column.
    for column, id_column in zip(self.INPUT_VECTOR, self.ids):
      self.training_df[id_column] = self.encoded_labels[column].fit_transform(self.training_df[column])

    # The index was reset above, so positions (iloc) and labels (drop) coincide.
    self.test_sample_indices = get_test_samples(self.training_df, number_of_test_sample=self.TEST_COUNT)
    self.test_df = self.training_df.iloc[self.test_sample_indices, :].reset_index(drop=True)
    self.training_df = self.training_df.drop(self.test_sample_indices).reset_index(drop=True)

    self.training_df.to_csv('{}_training_df.csv'.format(self.NO), index=False)
    self.test_df.to_csv('{}_test_df.csv'.format(self.NO), index=False)

    # Read the frames back so this path works on exactly the same data as a
    # later run with load_dataset=True.
    self.training_df = pd.read_csv('{}_training_df.csv'.format(self.NO))
    self.test_df = pd.read_csv('{}_test_df.csv'.format(self.NO))

    self._build_test_negatives()

  def _build_test_negatives(self):
    '''
      Collect all positive rows (train + test) and build the negative candidates
      that predict() ranks the held-out rows against.

      Sets `train_test_df` and `neg_samples_for_test`.
    '''
    positives = pd.concat([self.training_df, self.test_df])
    positives = positives[positives.interaction == 1]
    self.train_test_df = positives.sort_values('pid').reset_index(drop=True)

    all_unique_track_ids = self.train_test_df.track_id.unique()
    # TODO (from the original author): an embed list is going to be added here.
    self.neg_samples_for_test = get_negative_samples_test(self.train_test_df, all_unique_track_ids, number_of_neg_sample=self._TEST_NEG_COUNT, inp=self.ids[2:])

  def _vocab_sizes(self):
    '''
      Return {feature name: embedding input_dim}, i.e. largest id + 1.

      The maximum is taken over every frame that is fed to the model (training
      rows, held-out rows and, when already built, the negative test candidates)
      so that an id which only occurs outside the training rows still has a row
      in the embedding table.
    '''
    frames = [self.training_df, getattr(self, "test_df", None), getattr(self, "neg_samples_for_test", None)]
    sizes = {}
    for name, id_column in zip(self.names, self.ids):
      largest = max(frame[id_column].max()
                    for frame in frames
                    if frame is not None and len(frame) and id_column in frame.columns)
      sizes[name] = int(largest) + 1
    return sizes

  def create_embeddings(self):
    '''
      Create, per feature, a scalar integer Input, an Embedding layer and the
      flattened embedded vector. Results are stored in dicts keyed by feature name.
    '''
    self.number_of = self._vocab_sizes()

    self.emb_vec_size = {name: self.EMBED_OUT[position] for position, name in enumerate(self.names)}

    self.model_input = {name: Input(shape=(1,), dtype='int32', name = '{}_input'.format(name)) for name in self.names}

    self.embedding_of = {name: Embedding(input_dim = self.number_of[name],
                                  output_dim = self.emb_vec_size[name],
                                  name = '{}_embedding'.format(name),
                                  input_length=1) for name in self.names}

    self.embedded_vec_of = {name: Flatten()(self.embedding_of[name](self.model_input[name])) for name in self.names}

  def train(self):
    '''
      Assemble the model (element-wise product of all embeddings followed by a
      single Dense unit), compile it with Adam and fit it on training_df.
      The Keras History object is stored in `hist`.
    '''
    # Keras expects real lists, not dict views.
    self.predict_vector = Multiply()(list(self.embedded_vec_of.values())) #FIXME: Add more concatenation option!

    self.prediction = Dense(1, activation=self.ACTIVATION, name = 'prediction')(self.predict_vector) #FIXME: To be able to add more layers by arguments!

    self.model = Model(inputs=list(self.model_input.values()), outputs=self.prediction)

    self.model.compile(optimizer=Adam(lr=self.LR), loss=self.LOSS_FCN, metrics=['accuracy']) #FIXME: Change for the optimizer as well!

    train_inputs = [self.training_df[id_column].values for id_column in self.ids]
    self.hist = self.model.fit(train_inputs,
                      self.training_df.interaction.values,
                      validation_split=self.VAL_SPLIT,
                      batch_size=self.BATCH_SIZE,
                      epochs=self.EPOCH,
                      shuffle=True)

  @staticmethod
  def _first_hit_rank(scores, interactions):
    '''
      Return the 0-based rank of the best scored row with interaction == 1 when
      rows are ordered by descending score.

      Ties keep the incoming row order (stable sort), so the result is deterministic.

      @raises: IndexError when no row has interaction == 1
    '''
    order = np.argsort(-np.asarray(scores, dtype=float), kind='stable')
    hits = np.flatnonzero(np.asarray(interactions)[order] == 1)
    if hits.size == 0:
      raise IndexError("no row with interaction == 1 among the ranked candidates")
    return int(hits[0])

  def predict(self):
    '''
      Evaluate the trained model.

      1. Print the mean predicted score of the held-out rows as a percentage.
      2. Score every held-out row together with its negative candidates, write
         the per-playlist scores to '<NO>_all_scores.txt' and report the top-k
         accuracy of the held-out row for k in 1, 2, 5, 10, 20.
    '''
    # Score all rows in one batched call instead of one predict() call per row.
    test_inputs = [self.test_df[id_column].values for id_column in self.ids]
    test_scores = self.model.predict(test_inputs, batch_size=self.BATCH_SIZE).ravel()

    print("\n{}%\n".format(round(np.array(test_scores).mean()*100,2)))

    candidates = pd.concat([self.neg_samples_for_test, self.test_df[self.ids + ['interaction']]])
    candidates = candidates.sort_values(['playlist_id', 'interaction'], ascending=[True, False]).reset_index(drop=True)

    # Inputs follow self.ids, the same order the model was trained with.
    candidate_inputs = [candidates[id_column].values for id_column in self.ids]
    candidates['pred_scores'] = self.model.predict(candidate_inputs, batch_size=self.BATCH_SIZE).ravel()

    all_scores = []
    test_click_ranks = []
    # Group by the real playlist ids; they are not assumed to be 0..n-1.
    for _, playlist_rows in candidates.groupby('playlist_id', sort=True):
      playlist_scores = playlist_rows['pred_scores'].values
      all_scores.append(playlist_scores)
      test_click_ranks.append(self._first_hit_rank(playlist_scores, playlist_rows['interaction'].values))

    # One row per playlist; shorter rows (if any) are padded with NaN so the
    # file can always be written.
    width = max((len(row) for row in all_scores), default=0)
    score_matrix = np.full((len(all_scores), width), np.nan)
    for row_number, row in enumerate(all_scores):
      score_matrix[row_number, :len(row)] = row
    np.savetxt('{}_all_scores.txt'.format(self.NO), score_matrix)

    test_click_ranks = np.array(test_click_ranks)

    print_top_k_acc(test_click_ranks,1)
    print_top_k_acc(test_click_ranks,2)
    print_top_k_acc(test_click_ranks,5)
    print_top_k_acc(test_click_ranks,10)
    print_top_k_acc(test_click_ranks,20)
    

In [None]:
# Experiment 1: embed playlist, track and album; load_dataset=False makes the
# driver rebuild its frames through load_data() instead of reading the CSVs.
params = dict(NO=1, input_vector=["pid", "track_uri", "album_uri"])
nfc = NCFDriver(params=params, load_dataset=False)



  0%|          | 0/4907 [00:00<?, ?it/s]

Although we have 526065 tracks in all playlists we include, there are 146399 unique tracks.



  4%|▍         | 187/4907 [00:44<18:42,  4.21it/s]

In [None]:
# Experiment 2: embed playlist, track and artist; load_dataset=False makes the
# driver rebuild its frames through load_data() instead of reading the CSVs.
params = dict(NO=2, input_vector=["pid", "track_uri", "artist_uri"])
nfc = NCFDriver(params=params, load_dataset=False)