In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix
from sklearn.utils import shuffle

subset100 = pd.read_csv("../raw_data/track_meta_100subset_new.csv")
subset100 = shuffle(subset100)
# subset100 = subset100.reset_index()

In [151]:
class RecommenderSystem():
    """Represents the scheme for implementing a recommender system."""
    def __init__(self, training_data, *params):
        """Initializes the recommender system. 
        
        Note that training data has to be provided when instantiating. 
        Optional parameters are passed to the underlying system.
        """
        raise NotImplementedError
        
    def train(self, *params):
        """Starts training. Passes optional training parameters to the system."""
        raise NotImplementedError
    
    def score(self, user_id, hotel_id):
        """Returns a single score for a user-hotel pair.
        
        If no prediction for the given pair can be made, an exception should be raised.
        """
        raise NotImplementedError

In [152]:
class ALSRecommenderSystem(RecommenderSystem):
    """Provides a biased ALS-based implementation of an implicit recommender system."""
    def __init__(self, training_data, biased, latent_dimension, log_dir=None, confidence=20):
        """Initializes the recommender system.
        
        Keyword arguments:
        training_data: Data to train on.
        biased: Whether to include user- and item-related biases in the model.
        latent_dimension: Dimension of the latent space.
        log_dir: Optional pointer to directory storing logging information.
        confidence: Confidence value that should be assigned to pairs where interaction 
                    was present. Since the data includes single interactions only, simply
                    assigining 1 for non-interactions and this value otherwise suffices.
                    Should be greater than 1.
        """
        self.biased = biased
        self.confidence = confidence
        self.latent_dimension = latent_dimension
        self.U = None
        self.V = None
        self.log_dir = log_dir
        self.C_users, self.P_users, self.C_items, self.P_items, self.mapping_users, self.mapping_hotels = self._build_matrices(training_data, confidence)
        self.user_dim, self.item_dim = self.P_users.shape
        
    def _build_matrices(self, activity, confidence):
        """Build the initial matrices."""
        distinct_users = len(set(activity['user']))
        distinct_hotels = len(set(activity['hotel']))
        C_users = np.ones(shape=(distinct_users, distinct_hotels))
        P_users = np.zeros(shape=(distinct_users, distinct_hotels))
        C_items = np.ones(shape=(distinct_hotels, distinct_users))
        P_items = np.zeros(shape=(distinct_hotels, distinct_users))

        mapping_users = {}
        mapping_hotels = {}
        user_ct = 0
        hotel_ct = 0

        for index, row in activity.iterrows():
            user, hotel = row
            if not user in mapping_users:
                mapping_users[user] = user_ct
                user_ct += 1
            if not hotel in mapping_hotels:
                mapping_hotels[hotel] = hotel_ct
                hotel_ct += 1
            user_index, hotel_index = mapping_users[user], mapping_hotels[hotel]
            C_users[user_index, hotel_index] = confidence
            P_users[user_index, hotel_index] = 1
            C_items[hotel_index, user_index] = confidence
            P_items[hotel_index, user_index] = 1
        return C_users, P_users, C_items, P_items, mapping_users, mapping_hotels
    
    def save(self, directory):
        """Saves current matrices to the given directory."""
        np.save(os.path.join(directory, 'U.npy'), self.U)
        np.save(os.path.join(directory, 'V.npy'), self.V)
        np.save(os.path.join(directory, 'training_data.npy'), self.training_data)
        np.save(os.path.join(directory, 'params.npy'), np.array([self.confidence]))
        if self.biased:
            np.save(os.path.join(directory, 'user_biases.npy'), self.user_biases)
            np.save(os.path.join(directory, 'item_biases.npy'), self.item_biases)
        
    def load(self, directory):
        """Loads matrices from the given directory."""
        self.U = np.load(os.path.join(directory, 'U.npy'))
        self.V = np.load(os.path.join(directory, 'V.npy'))
        self.training_data = np.load(os.path.join(directory, 'training_data.npy'))
        self.confidence = np.load(os.path.join(directory, 'params.npy')).flatten()
        if self.biased:
            self.user_biases = np.load(os.path.join(directory, 'user_biases.npy'))
            self.item_biases = np.load(os.path.join(directory, 'item_biases.npy'))
            
        self.C_users, self.P_users, self.C_items, self.P_items, self.mapping_users, self.mapping_hotels = self._build_matrices(self.training_data, self.confidence)
        self.user_dim, self.item_dim = self.P_users.shape
        
    def _single_step(self, lbd):
        """Executes a single optimization step using (biased) ALS, with lbd as regularization factor."""
        C_users, P_users, C_items, P_items, mapping_users, mapping_hotels = self.C_users, self.P_users, self.C_items, self.P_items, self.mapping_users, self.mapping_hotels
        biased = self.biased
        
        # Update U.
        if biased: # Expand matrices to account for biases.
            U_exp = np.hstack((self.user_biases.reshape(-1,1), self.U))
            V_exp = np.hstack((np.ones_like(self.item_biases).reshape(-1,1), self.V))
            kdim = self.latent_dimension + 1
        else: # We work with copies here to make it safer to abort within updates.
            U_exp = self.U.copy()
            V_exp = self.V.copy()
            kdim = self.latent_dimension
        Vt = np.dot(np.transpose(V_exp), V_exp)
        for user_index in tqdm(range(self.user_dim)):
            C = np.diag(C_users[user_index])
            d = np.dot(C, P_users[user_index] - (0 if not biased else self.item_biases))
            val = np.dot(np.linalg.inv(Vt + np.dot(np.dot(V_exp.T, C - np.eye(self.item_dim)), V_exp) + lbd*np.eye(kdim)), np.transpose(V_exp))
            U_exp[user_index] = np.dot(val, d)    
        if biased:
            self.user_biases = U_exp[:,0]
            self.U = U_exp[:,1:]
        else:
            self.U = U_exp
            
        # Update V.
        if biased:
            U_exp = np.hstack((np.ones_like(self.user_biases).reshape(-1,1), self.U))
            V_exp = np.hstack((self.item_biases.reshape(-1,1), self.V))
        else: # We work with copies here to make it safer to abort within updates.
            U_exp = self.U.copy()
            V_exp = self.V.copy()
        
        Ut = np.dot(np.transpose(U_exp), U_exp)
        for item_index in tqdm(range(self.item_dim)):
            C = np.diag(C_items[item_index])
            d = np.dot(C, P_items[item_index] - (0 if not biased else self.user_biases))
            val = np.dot(np.linalg.inv(Ut + np.dot(np.dot(U_exp.T, C-np.eye(self.user_dim)), U_exp) + lbd*np.eye(kdim)), np.transpose(U_exp))
            V_exp[item_index] = np.dot(val, d)
        if biased:
            self.item_biases = V_exp[:, 0]
            self.V = V_exp[:,1:]
        else:
            self.V = V_exp
    
    def compute_loss(self, lbd):
        """Computes loss value on the training data.
        
        Returns a tuple of total loss and prediction loss (excluding regularization loss).
        """
        C_users, P_users, C_items, P_items, mapping_users, mapping_hotels = self.C_users, self.P_users, self.C_items, self.P_items, self.mapping_users, self.mapping_hotels
        main_loss = 0
        # Main loss term.
        for user_index in range(self.user_dim):
            for item_index in range(self.item_dim):
                pred = np.dot(self.U[user_index].T, self.V[item_index])
                if self.biased:
                    pred += self.user_biases[user_index] + self.item_biases[item_index]
                loss = self.C_users[user_index, item_index] * (P_users[user_index, item_index]-pred)**2
                main_loss += loss

        # Regularization term.
        reg_loss = 0
        if lbd > 0:
            for user_index in range(self.user_dim):
                reg_loss += np.sum(self.U[user_index]**2) + (0 if not self.biased else self.user_biases[user_index]**2)
            for item_index in range(self.item_dim):
                reg_loss += np.sum(self.V[item_index]**2) + (0 if not self.biased else self.item_biases[item_index]**2)
            reg_loss *= lbd
        return main_loss + reg_loss, main_loss

    def train(self, lbd, iterations=20, verbose=True):
        """
        Trains the recommendation system.
        
        Keyword arguments:
        lbd: Regularization factor.
        iterations: Number of iterations to run ALS.
        verbose: Whether to plot and output training loss.
        """
        if self.U is None or self.V is None:
            self.U = np.random.normal(size=(self.user_dim, self.latent_dimension))
            self.V = np.random.normal(size=(self.item_dim, self.latent_dimension))
            self.user_biases = np.zeros(self.user_dim)
            self.item_biases = np.zeros(self.item_dim)
            self.history_losses = []
            self.history_main_losses = []
            self.history_avg_score = []
            self.history_avg_rank = []
        
        it = 0
        while(it < iterations):
            self._single_step(lbd) 
            loss, main_loss = self.compute_loss(lbd)
            self.history_losses.append(loss)
            self.history_main_losses.append(main_loss)
                
            if verbose:        
                clear_output(wait=True)
                print('LOSS:', loss, 'MAIN LOSS:', main_loss)
                
                plt.figure(figsize=(5,5))
                plt.title('training loss (lower is better)')
                plt.plot(range(len(self.history_losses)), self.history_losses)
                plt.plot(range(len(self.history_main_losses)), self.history_main_losses, color='orange')
                plt.plot(range(len(self.history_main_losses)), np.array(self.history_losses) - np.array(self.history_main_losses), color='green')
                plt.legend(['total loss', 'data loss', 'regularizing loss'])
                if self.log_dir is not None:
                    plt.savefig(os.path.join(self.log_dir, 'log.png'), bbox_inches='tight', format='png')
                plt.show()     
            it += 1

    def reset(self):
        """Resets the recommendation system's internal state."""
        self.U = None
        self.V = None
        
    def score(self, user_id, hotel_id):
        """Returns the scoring of hotel_id for user_id."""
        if self.U is None or self.V is None:
            raise ValueError('system has to be trained first')
        if user_id not in self.mapping_users:
            raise ValueError('user unknown')
        if hotel_id not in self.mapping_hotels:
            raise ValueError('hotel unknown')
            
        user_index = self.mapping_users[user_id]
        hotel_index = self.mapping_hotels[hotel_id]
        pred = np.dot(self.U[user_index], self.V[hotel_index])
        if self.biased: # Include applicable biases.
            pred += self.user_biases[user_index] + self.item_biases[hotel_index]
        return pred

In [153]:
train_cleaned.head()

Unnamed: 0,Track_Duration,acousticness,artist_popularity,danceability,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
0,189768,0.909,100,0.468,0.693,0.0,6,0.116,-7.059,0,0.553,84.762,4,0.308
1,226632,0.0306,68,0.69,0.502,0.0,0,0.145,-7.077,0,0.0638,86.185,4,0.329
2,176561,0.00346,81,0.723,0.809,0.00123,7,0.565,-3.081,0,0.0625,98.007,4,0.274
3,241688,0.634,73,0.566,0.664,0.0,4,0.116,-5.303,0,0.0464,128.945,4,0.437
4,448533,0.131,81,0.69,0.718,0.000729,4,0.615,-4.743,0,0.135,76.007,4,0.401


### Train-val-test split

In [4]:
subset100.head()

Unnamed: 0,Playlistid,Trackid,Artist_Name,Track_Name,Album_Name,Track_Duration,Artist_uri,Track_uri,Album_uri,acousticness,...,loudness,mode,speechiness,tempo,time_signature,valence,Playlist,Album,Track,Artist
1538,155598,27,Enrique Iglesias,Bailando - Spanish Version,SEX AND LOVE,243413,spotify:artist:7qG3b048QCHVRO5Pv1T5lw,spotify:track:32lm3769IRfcnrQV11LO4E,spotify:album:2kZkiVn1m00XcgaWlLb2LD,0.0426,...,-3.503,1,0.108,91.017,4,0.961,Lovey.,44,45,42
1428,153929,13,Fetty Wap,679 (feat. Remy Boyz),679 (feat. Remy Boyz),196693,spotify:artist:6PXS4YHDkKvl1wkIl4V8DL,spotify:track:5NQbUaeTEOGdD6hHcre0dZ,spotify:album:0TyDTzG2zt2tg0wONT9wSU,0.00256,...,-5.738,1,0.318,190.05,4,0.603,All songs,66,83,41
2807,241546,66,Daughtry,It's Not Over,Daughtry (Deluxe Edition),215026,spotify:artist:5P5FTygHyx2G57oszR3Wot,spotify:track:234RqTZmnDTnxWlVciXHLD,spotify:album:6ii8Ja21xLGkFH8CQ6bKjj,0.0503,...,-3.245,0,0.0618,145.927,4,0.294,good old songs,99,122,82
1103,97870,3,Alexandra Stan,Mr. Saxobeat - Radio Edit,Saxobeats,195280,spotify:artist:0BmLNz4nSLfoWYW1cYsElL,spotify:track:4zIaXqKCXZs02eB6jEe5Mf,spotify:album:11UM41JGr8BavupgcGCICa,0.0239,...,-4.162,0,0.0527,127.047,4,0.745,Gym,6,6,6
1130,107065,10,Digital Farm Animals,Millionaire - Alan Walker Remix,Millionaire,189346,spotify:artist:5fyDppLDl1juIu1BcUT5zh,spotify:track:5HzbxweVAtisCrpoOF11qt,spotify:album:05XuMZb14sDGgreCf3Wu6F,0.0186,...,-4.577,1,0.0545,96.044,4,0.424,feels,11,11,11


In [5]:
# Train-val-test split (20%)
train, test = train_test_split(subset100, test_size=0.2, random_state=42, stratify = subset100['Playlistid'])
train, val = train_test_split(train, test_size=0.2, random_state=42, stratify = train['Playlistid'])

In [6]:
# Drop features here
features_drop = ["Playlistid","Playlist","Album", "Track", "Artist", "Trackid", "Artist_Name", "Track_Name", "Album_Name", "Artist_uri", "Track_uri", "Album_uri", "artist_genres", "explicit"]
train_cleaned, val_cleaned, test_cleaned = train.drop(features_drop, axis =1), val.drop(features_drop, axis=1), test.drop(features_drop, axis=1)

In [7]:
train = train.reset_index(drop=True)
train_cleaned = train_cleaned.reset_index(drop=True)

In [9]:
train_cleaned.head()

Unnamed: 0,Track_Duration,acousticness,artist_popularity,danceability,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
0,189768,0.909,100,0.468,0.693,0.0,6,0.116,-7.059,0,0.553,84.762,4,0.308
1,226632,0.0306,68,0.69,0.502,0.0,0,0.145,-7.077,0,0.0638,86.185,4,0.329
2,176561,0.00346,81,0.723,0.809,0.00123,7,0.565,-3.081,0,0.0625,98.007,4,0.274
3,241688,0.634,73,0.566,0.664,0.0,4,0.116,-5.303,0,0.0464,128.945,4,0.437
4,448533,0.131,81,0.69,0.718,0.000729,4,0.615,-4.743,0,0.135,76.007,4,0.401


### Content-based Collaborative Filtering

### Create a cosine-similarity matrix

In [11]:
from sklearn.metrics.pairwise import cosine_similarity 
from sklearn.preprocessing import MinMaxScaler

# Standardize the data
scaler = MinMaxScaler()
scaler.fit(train_cleaned)
train_scaled = scaler.transform(train_cleaned)
test_scaled = scaler.transform(test_cleaned)

In [144]:
train_cleaned.head()

Unnamed: 0,Track_Duration,acousticness,artist_popularity,danceability,energy,instrumentalness,key,liveness,loudness,mode,speechiness,tempo,time_signature,valence
0,189768,0.909,100,0.468,0.693,0.0,6,0.116,-7.059,0,0.553,84.762,4,0.308
1,226632,0.0306,68,0.69,0.502,0.0,0,0.145,-7.077,0,0.0638,86.185,4,0.329
2,176561,0.00346,81,0.723,0.809,0.00123,7,0.565,-3.081,0,0.0625,98.007,4,0.274
3,241688,0.634,73,0.566,0.664,0.0,4,0.116,-5.303,0,0.0464,128.945,4,0.437
4,448533,0.131,81,0.69,0.718,0.000729,4,0.615,-4.743,0,0.135,76.007,4,0.401


In [146]:
train_scaled[:5]

array([[2.74086782e-01, 9.43924885e-01, 1.00000000e+00, 4.23728814e-01,
        6.97191399e-01, 0.00000000e+00, 5.45454545e-01, 9.93320405e-02,
        7.42469636e-01, 0.00000000e+00, 8.29554339e-01, 1.64930428e-01,
        7.50000000e-01, 2.93056430e-01],
       [3.39967117e-01, 3.17696885e-02, 6.80000000e-01, 6.74576271e-01,
        5.03109377e-01, 0.00000000e+00, 0.00000000e+00, 1.30575307e-01,
        7.41740891e-01, 0.00000000e+00, 6.45817045e-02, 1.74426908e-01,
        7.50000000e-01, 3.15082861e-01],
       [2.50484309e-01, 3.58675124e-03, 8.10000000e-01, 7.11864407e-01,
        8.15063204e-01, 1.30296610e-03, 6.36363636e-01, 5.83063995e-01,
        9.03522267e-01, 0.00000000e+00, 6.25488663e-02, 2.53321766e-01,
        7.50000000e-01, 2.57394588e-01],
       [3.66873972e-01, 6.58357172e-01, 7.30000000e-01, 5.34463277e-01,
        6.67723448e-01, 0.00000000e+00, 3.63636364e-01, 9.93320405e-02,
        8.13562753e-01, 0.00000000e+00, 3.73729476e-02, 4.59788448e-01,
        7.500

In [12]:
train_scaled_cos_matrix = cosine_similarity(train_scaled)
# np.dot(np.array(trial_scaled[0,]), np.array(trial_scaled[5,]))/ (np.linalg.norm(trial_scaled[0,]) * np.linalg.norm(trial_scaled[5,]))


In [21]:
train_scaled_cos_matrix.shape

(1970, 1970)

In [13]:
train_scaled_cos_matrix[:5]

array([[1.        , 0.77886315, 0.79322869, ..., 0.75797626, 0.89769807,
        0.86462959],
       [0.77886315, 1.        , 0.91136461, ..., 0.74585765, 0.8543402 ,
        0.93407908],
       [0.79322869, 0.91136461, 1.        , ..., 0.7696172 , 0.86108348,
        0.95182369],
       [0.89793126, 0.90315705, 0.88722029, ..., 0.85739621, 0.96830316,
        0.97063001],
       [0.81008439, 0.93095577, 0.95230958, ..., 0.75950365, 0.85586662,
        0.92814526]])

In [20]:
display(subset100.head())
subset100.shape

Unnamed: 0,Playlistid,Trackid,Artist_Name,Track_Name,Album_Name,Track_Duration,Artist_uri,Track_uri,Album_uri,acousticness,...,loudness,mode,speechiness,tempo,time_signature,valence,Playlist,Album,Track,Artist
1538,155598,27,Enrique Iglesias,Bailando - Spanish Version,SEX AND LOVE,243413,spotify:artist:7qG3b048QCHVRO5Pv1T5lw,spotify:track:32lm3769IRfcnrQV11LO4E,spotify:album:2kZkiVn1m00XcgaWlLb2LD,0.0426,...,-3.503,1,0.108,91.017,4,0.961,Lovey.,44,45,42
1428,153929,13,Fetty Wap,679 (feat. Remy Boyz),679 (feat. Remy Boyz),196693,spotify:artist:6PXS4YHDkKvl1wkIl4V8DL,spotify:track:5NQbUaeTEOGdD6hHcre0dZ,spotify:album:0TyDTzG2zt2tg0wONT9wSU,0.00256,...,-5.738,1,0.318,190.05,4,0.603,All songs,66,83,41
2807,241546,66,Daughtry,It's Not Over,Daughtry (Deluxe Edition),215026,spotify:artist:5P5FTygHyx2G57oszR3Wot,spotify:track:234RqTZmnDTnxWlVciXHLD,spotify:album:6ii8Ja21xLGkFH8CQ6bKjj,0.0503,...,-3.245,0,0.0618,145.927,4,0.294,good old songs,99,122,82
1103,97870,3,Alexandra Stan,Mr. Saxobeat - Radio Edit,Saxobeats,195280,spotify:artist:0BmLNz4nSLfoWYW1cYsElL,spotify:track:4zIaXqKCXZs02eB6jEe5Mf,spotify:album:11UM41JGr8BavupgcGCICa,0.0239,...,-4.162,0,0.0527,127.047,4,0.745,Gym,6,6,6
1130,107065,10,Digital Farm Animals,Millionaire - Alan Walker Remix,Millionaire,189346,spotify:artist:5fyDppLDl1juIu1BcUT5zh,spotify:track:5HzbxweVAtisCrpoOF11qt,spotify:album:05XuMZb14sDGgreCf3Wu6F,0.0186,...,-4.577,1,0.0545,96.044,4,0.424,feels,11,11,11


(3079, 28)

In [84]:
def cos_similar_songs_playlist(cos_matrix, orig_df, target_playlist_inx, cand_list_size):
    """
    Input:
    cos_matrix: cosine matrix of the tracks
    orig_df: original df with tracks as rows, but with playlistid and other features (e.g., train)
    target_playlist_inx: index of playlist in the training set
    cand_list_size: candidate list of songs to recommend size (= test-set size * 15)
    
    Output:
    k_song_to_recommend: the most similar tracks per track
    """
    target_playlistid = train.loc[target_playlist_inx, "Playlistid"]
    target_track_inx = np.where(train["Playlistid"] == target_playlistid)
    
    # remove rows in matrix with songs that live in the same playlist with the target song
    # candidate_cos_matrix = np.delete(cos_matrix, target_track_inx, 1) # remoe songs that co-exist in the same playlist
    candidate_cos_matrix = cos_matrix
    
    ## For each song in the playlist, find k similar songs
    cand_list = []
    # cand_list_size = k*15
    # taget_track_inx
    k = np.floor(cand_list_size/len(target_track_inx)) # round(cand_list_size/len(target_track_inx))
    k_rest = cand_list_size - k*len(target_track_inx)
    # e.g., for a candidate list size of 30, get 3 songs for each track first
    for inx, i in enumerate(target_track_inx):
        candidate_song_rec = candidate_cos_matrix[target_inx, ]
        candidate_song_rec_inx = np.argsort(candidate_song_rec)
        unique_candidate_song_sorted = train['Track_uri'][candidate_song_rec_inx][::-1].drop_duplicates()
        tracks_in_target_playlist = train.loc[train["Playlistid"] == target_playlistid, "Track_uri"]
        song_to_recommend = np.array(unique_candidate_song_sorted.loc[~unique_candidate_song_sorted.isin(tracks_in_target_playlist)])
        
        if (k_rest !=0 & inx <= k_rest): # 30-24 = 6; for the first 6 tracks recommend k + 1 songs
            k_song_to_recommend = song_to_recommend[:int(k+1)]
        else:
            k_song_to_recommend = song_to_recommend[:int(k)]
        cand_list.append(k_song_to_recommend)

    return cand_list

In [40]:
target_track_inx = np.where(train["Playlistid"] == target_playlistid)

In [30]:
cos_matrix = train_scaled_cos_matrix
orig_df = train
target_track_inx = 0 # target playlist inx
k = 5

trial_songs = cos_similar_songs_track(train_scaled_cos_matrix, train, 0, 5)

In [94]:
trial_songs

array(['spotify:track:7vRriwrloYVaoAe3a9wJHe',
       'spotify:track:22DKsoYFV5npPXmnPpXL7i',
       'spotify:track:00LfFm08VWeZwB0Zlm24AT',
       'spotify:track:6euh6chaRsi3Ywb1gA9LlO',
       'spotify:track:0QpYkajexWrB0P3TWvkHlm'], dtype=object)

In [33]:
unique_playlistid = train['Playlistid'].drop_duplicates()

In [34]:
unique_playlistid

0        61388
1        51590
2       193450
3       198885
4        73524
5       223534
6       231023
7         2259
8       175237
9       100221
10      256979
12      206080
13      208692
14      151474
15        2535
17       67941
18      114695
20      210308
21       68119
22      221044
23       38828
25       59745
27      117841
28      107065
29      190503
30      211406
31      190574
32      118724
33      229646
35       20043
         ...  
150     196206
154     242823
156     216556
161     123429
162     219212
163     159326
186     120613
187      81542
191     248553
213     252197
224      11119
226     107941
250      10254
252      97870
255      46047
257      77069
300     268318
308     208779
327      90187
335      48330
341     196255
345     182533
379     232421
381     238096
410      62657
434      41347
625     155081
664     161016
946     217189
1130     79469
Name: Playlistid, Length: 100, dtype: int64

## Making Predictions

In [119]:
def nholdout(playlist_id, df):
    '''Pass in a playlist id to get number of songs held out in val/test set'''
    
    return len(df[df.Playlistid == playlist_id].Track_uri)

In [126]:
### Prediction Example
pi = 430 # target playlist index
kpreds = cos_similar_songs_playlist(train_scaled_cos_matrix, train, pi, nholdout(pi, val)*15)[0]

In [148]:
kpreds

array(['spotify:track:4pc01CAhGKx15PY23uPjHp',
       'spotify:track:7vRriwrloYVaoAe3a9wJHe',
       'spotify:track:4jtyUzZm9WLc2AdaJ1dso7',
       'spotify:track:22DKsoYFV5npPXmnPpXL7i',
       'spotify:track:00LfFm08VWeZwB0Zlm24AT',
       'spotify:track:6euh6chaRsi3Ywb1gA9LlO',
       'spotify:track:0QpYkajexWrB0P3TWvkHlm',
       'spotify:track:2durxb17bXcmQJHSt8JAdO',
       'spotify:track:4FHu9bOzBjZurx89CMa42L',
       'spotify:track:1k1Bqnv2R0uJXQN4u6LKYt',
       'spotify:track:7cNz65PfCatRXoX7QtqM2A',
       'spotify:track:0osPUefhvYxoB2eZw6prBt',
       'spotify:track:3lKqtKFnN1Xi4W69YDN9PB',
       'spotify:track:3pndPhlQWjuSoXhcIIdBjv',
       'spotify:track:5dLz5CRzW8RcNEptn0NtOi',
       'spotify:track:03L2AoiRbWhvt7BDMx1jUB',
       'spotify:track:5tz69p7tJuGPeMGwNTxYuV',
       'spotify:track:31Q9ZTF9x81BDonlObCbvP',
       'spotify:track:3ZLyt2ndLFBh148XRYjYYZ',
       'spotify:track:2G9lekfCh83S0lt2yfffBz',
       'spotify:track:68EMU2RD1ECNeOeJ5qAXCV',
       'spoti

In [128]:
val_set = val[val.Playlistid == pi]
val_set = val_set['Track_uri'] # ground truth

## Metrics

In [129]:
def r_precision(prediction, val_set):
# prediction should be a list of predictions
# val_set should be pandas Series of ground truths
    score = np.sum(val_set.isin(prediction))/val_set.shape[0]
    return score

In [130]:
kpreds

array(['spotify:track:4pc01CAhGKx15PY23uPjHp',
       'spotify:track:7vRriwrloYVaoAe3a9wJHe',
       'spotify:track:4jtyUzZm9WLc2AdaJ1dso7',
       'spotify:track:22DKsoYFV5npPXmnPpXL7i',
       'spotify:track:00LfFm08VWeZwB0Zlm24AT',
       'spotify:track:6euh6chaRsi3Ywb1gA9LlO',
       'spotify:track:0QpYkajexWrB0P3TWvkHlm',
       'spotify:track:2durxb17bXcmQJHSt8JAdO',
       'spotify:track:4FHu9bOzBjZurx89CMa42L',
       'spotify:track:1k1Bqnv2R0uJXQN4u6LKYt',
       'spotify:track:7cNz65PfCatRXoX7QtqM2A',
       'spotify:track:0osPUefhvYxoB2eZw6prBt',
       'spotify:track:3lKqtKFnN1Xi4W69YDN9PB',
       'spotify:track:3pndPhlQWjuSoXhcIIdBjv',
       'spotify:track:5dLz5CRzW8RcNEptn0NtOi',
       'spotify:track:03L2AoiRbWhvt7BDMx1jUB',
       'spotify:track:5tz69p7tJuGPeMGwNTxYuV',
       'spotify:track:31Q9ZTF9x81BDonlObCbvP',
       'spotify:track:3ZLyt2ndLFBh148XRYjYYZ',
       'spotify:track:2G9lekfCh83S0lt2yfffBz',
       'spotify:track:68EMU2RD1ECNeOeJ5qAXCV',
       'spoti

In [131]:
### Example Usage
r_precision(kpreds, val_set)

0.0

In [132]:
### NDCG Code Source: https://gist.github.com/bwhite/3726239
def dcg_at_k(r, k, method=0):
    r = np.asfarray(r)[:k]
    if r.size:
        if method == 0:
            return r[0] + np.sum(r[1:] / np.log2(np.arange(2, r.size + 1)))
        elif method == 1:
            return np.sum(r / np.log2(np.arange(2, r.size + 2)))
        else:
            raise ValueError('method must be 0 or 1.')
    return 0.


def ndcg_at_k(r, k, method=0):
    dcg_max = dcg_at_k(sorted(r, reverse=True), k, method)
    if not dcg_max:
        return 0.
    return dcg_at_k(r, k, method) / dcg_max

In [133]:
### Example Usage
# Generate binary relevance array
r = np.zeros(len(kpreds))
for i, p in enumerate(kpreds):
    if p in val_set:
        r[i] = 1

ndcg_at_k(r, len(r))

0.0

## Baseline Model Performance

In [112]:
unique_playlistid

0        61388
1        51590
2       193450
3       198885
4        73524
5       223534
6       231023
7         2259
8       175237
9       100221
10      256979
12      206080
13      208692
14      151474
15        2535
17       67941
18      114695
20      210308
21       68119
22      221044
23       38828
25       59745
27      117841
28      107065
29      190503
30      211406
31      190574
32      118724
33      229646
35       20043
         ...  
150     196206
154     242823
156     216556
161     123429
162     219212
163     159326
186     120613
187      81542
191     248553
213     252197
224      11119
226     107941
250      10254
252      97870
255      46047
257      77069
300     268318
308     208779
327      90187
335      48330
341     196255
345     182533
379     232421
381     238096
410      62657
434      41347
625     155081
664     161016
946     217189
1130     79469
Name: Playlistid, Length: 100, dtype: int64

In [141]:
rps = []
ndcgs = []
for pid in unique_playlistid: # loop through each playlist
    print(pid)
    ps = cos_similar_songs_playlist(train_scaled_cos_matrix, train, pi, nholdout(pi, val)*50)[0]# predictions
    vs = val[val.Playlistid == pid].Track_uri # ground truth
    rps.append(r_precision(ps, vs)) # append individual r-precision score
    
    # NDCG
    r = np.zeros(len(ps))
    for i, p in enumerate(ps):
        if np.any(vs.isin([p])):
            r[i] = 1
    ndcgs.append(ndcg_at_k(r, len(r)))
    

61388
51590
193450
198885
73524
223534
231023
2259
175237
100221
256979
206080
208692
151474
2535
67941
114695
210308
68119
221044
38828
59745
117841
107065
190503
211406
190574
118724
229646
20043
241349
622
430
92360
33568
4575
22204
56681
241546
168982
164819
155598
11136
181874
116737
118342
195247
120354
6238
151748
89825
174875
167436
249966
266156
186672
271274
201186
72703
89955
237553
1990
193016
58121
153929
230183
204668
37634
42049
194212
196206
242823
216556
123429
219212
159326
120613
81542
248553
252197
11119
107941
10254
97870
46047
77069
268318
208779
90187
48330
196255
182533
232421
238096
62657
41347
155081
161016
217189
79469


In [142]:
avg_rp = np.mean(rps)
avg_ndcg = np.mean(ndcgs)
print('Avg. R-Precision: ', avg_rp)
print('Avg. NDCG: ', avg_ndcg)
print('Total Sum: ', np.mean([avg_rp, avg_ndcg]))

Avg. R-Precision:  0.04360068526244997
Avg. NDCG:  0.031060419956590445
Total Sum:  0.03733055260952021
