In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.neighbors import NearestNeighbors
from scipy.sparse import csr_matrix
import tqdm
from tqdm import tqdm
from IPython.display import clear_output
import matplotlib.pyplot as plt
%matplotlib inline
import os


In [2]:
import argparse
import codecs
import logging
import time
import tqdm

import numpy as np

from implicit.als import AlternatingLeastSquares
from implicit.approximate_als import (AnnoyAlternatingLeastSquares, FaissAlternatingLeastSquares,
                                      NMSLibAlternatingLeastSquares)
from implicit.bpr import BayesianPersonalizedRanking
from implicit.nearest_neighbours import (BM25Recommender, CosineRecommender,
                                         TFIDFRecommender, bm25_weight)
# from implicit.datasets.lastfm import get_lastfm


# maps command line model argument to class name
MODELS = {"als":  AlternatingLeastSquares,
          "nmslib_als": NMSLibAlternatingLeastSquares,
          "annoy_als": AnnoyAlternatingLeastSquares,
          "faiss_als": FaissAlternatingLeastSquares,
          "tfidf": TFIDFRecommender,
          "cosine": CosineRecommender,
          "bpr": BayesianPersonalizedRanking,
          "bm25": BM25Recommender}

def get_model(model_name):
    print("getting model %s" % model_name)
    model_class = MODELS.get(model_name)
    if not model_class:
        raise ValueError("Unknown Model '%s'" % model_name)

    # some default params
    if issubclass(model_class, AlternatingLeastSquares):
        params = {'factors': 16, 'dtype': np.float32}
    elif model_name == "bm25":
        params = {'K1': 100, 'B': 0.5}
    elif model_name == "bpr":
        params = {'factors': 63}
    else:
        params = {}

    return model_class(**params)

def calculate_recommendations(playlistids, trackids, user_item_csr, output_filename, model_name="als"):
    """ Generates artist recommendations for each user in the dataset """
    # train the model based off input params
#     artists, users, plays = get_lastfm()
    artists = trackids
    users = playlistids
    plays = user_item_csr

    # create a model from the input data
    model = get_model(model_name)

    # if we're training an ALS based model, weight input for last.fm
    # by bm25
    if issubclass(model.__class__, AlternatingLeastSquares):
        # lets weight these models by bm25weight.
#         logging.debug("weighting matrix by bm25_weight")
#         plays = bm25_weight(plays, K1=100, B=0.8)

        # also disable building approximate recommend index
        model.approximate_similar_items = False

    # this is actually disturbingly expensive:
    plays = plays.tocsr()

    logging.debug("training model %s", model_name)
    start = time.time()
    model.fit(plays)
    logging.debug("trained model '%s' in %0.2fs", model_name, time.time() - start)

    # generate recommendations for each user and write out to a file
    start = time.time()
    user_plays = plays.T.tocsr()
    with tqdm.tqdm(total=len(users)) as progress:
        with codecs.open(output_filename, "w", "utf8") as o:
            for userid, username in enumerate(users):
                for artistid, score in model.recommend(userid, user_plays, N=1000000):
                    o.write("%s\t%s\t%s\n" % (username, artists[artistid], score))
                progress.update(1)
    logging.debug("generated recommendations in %0.2fs",  time.time() - start)

In [3]:
subset10k = pd.read_csv("../raw_data/track_meta_milestone3.csv", index_col=["Unnamed: 0"])
# subset100 = pd.read_csv("../raw_data/track_meta_100subset_new.csv")

In [4]:
subset10k.shape, subset10k.Playlistid.nunique()

((926540, 28), 27016)

In [5]:
# Train-test split
train, test = train_test_split(subset10k, test_size=0.2, random_state=42, stratify = subset10k['Playlistid'])
train, val = train_test_split(train, test_size=0.2, random_state=42, stratify = train['Playlistid'])

In [6]:
playlistids = train['Playlistid'].unique()
trackids = train['Track_uri'].unique()
len(np.unique(playlistids)), len(np.unique(trackids))

(27016, 58365)

In [7]:
# Create Binary Sparse Matrix
co_mat = pd.crosstab(train.Playlistid, train.Track_uri)
co_mat = co_mat.clip(upper=1)
assert np.max(co_mat.describe().loc['max']) == 1

co_mat_sparse = csr_matrix(co_mat)

In [8]:
co_mat_sparse.shape

(27016, 58365)

In [9]:
calculate_recommendations(playlistids, trackids, co_mat_sparse, "../recommendation_10k.tsv")

  0%|          | 0/15 [00:00<?, ?it/s]

getting model als


100%|██████████| 15.0/15 [00:07<00:00,  2.36it/s]
100%|██████████| 27016/27016 [45:13<00:00,  8.55it/s] 


## Reading in results

In [7]:
rec_100 = pd.read_csv("../recommendation_10k.tsv", delimiter="\t", header = None)

In [8]:
rec_100.head()

Unnamed: 0,0,1,2
0,252291,spotify:track:19DOPRc6bW9OFtFHKA9dEE,3e-05
1,252291,spotify:track:4PvcavllGpo6lB8VkIgXIZ,2.9e-05
2,252291,spotify:track:7JOizhmt3HlBgQyJEa0AgK,2.8e-05
3,252291,spotify:track:62oGUBJQGPa3emMGMejBhm,2.8e-05
4,252291,spotify:track:1aTRTY9s4LqQHPkkRHPCzx,2.8e-05


In [12]:
rec_100[0].unique()

array([252291, 263760,    835, ...,    233, 258865, 172427])

In [13]:
len(np.unique(trackids))

58365

In [14]:
np.array(rec_100.loc[rec_100[0] == 198885, 1][:15])

array(['spotify:track:0udoMICxzaUbNUT8EVRq8B',
       'spotify:track:16qYlQ6koFxYVbiJbGHblz',
       'spotify:track:2hgzdQdnfWwtdpZbhZlV72',
       'spotify:track:4tUOIdAbmMCYnrxSVbyc9V',
       'spotify:track:2erNwv0Yti7uirpIU8wbvv',
       'spotify:track:1I93YiKgu2P5pwprWmziPI',
       'spotify:track:3jYRpwbctfqB77uU7T7K3U',
       'spotify:track:13mlT5Io2Aqr22UEPIt5sJ',
       'spotify:track:665Jxlgi1HamPKbW1vwzx4',
       'spotify:track:2CoGg71SqwNgriNkOLClae',
       'spotify:track:0MuqFuh4MerIiFsYWuEFaP',
       'spotify:track:46hu9JtTWN8DshperdNaUU',
       'spotify:track:5B03nSG8pLE48jh7kmasuL',
       'spotify:track:2va7KtTfrc0yKUBUH3yFTd',
       'spotify:track:55HOaYeJSCZSLPhaYkYf7h'], dtype=object)

In [15]:
# # recommend items for a user
# user_items = item_user_data.T.tocsr()
# recommendations = model.recommend(userid, user_items)

# # find related items
# related = model.similar_items(itemid)

NameError: name 'item_user_data' is not defined

## Making Predictions

In [9]:
def nholdout(playlist_id, df):
    '''Pass in a playlist id to get number of songs held out in val/test set'''
    return len(df[df.Playlistid == playlist_id].Track_uri)

## Metrics

In [10]:
def r_precision(prediction, val_set):
    # prediction should be a list of predictions
    # val_set should be pandas Series of ground truths
    score = np.sum(val_set.isin(prediction))/val_set.shape[0]
    return score

In [11]:
### NDCG Code Source: https://gist.github.com/bwhite/3726239
def dcg_at_k(r, k, method=0):
    r = np.asfarray(r)[:k]
    if r.size:
        if method == 0:
            return r[0] + np.sum(r[1:] / np.log2(np.arange(2, r.size + 1)))
        elif method == 1:
            return np.sum(r / np.log2(np.arange(2, r.size + 2)))
        else:
            raise ValueError('method must be 0 or 1.')
    return 0.


def ndcg_at_k(r, k, method=0):
    dcg_max = dcg_at_k(sorted(r, reverse=True), k, method)
    if not dcg_max:
        return 0.
    return dcg_at_k(r, k, method) / dcg_max

## Model Performance

In [20]:
rec_100.head()

Unnamed: 0,0,1,2
0,252291,spotify:track:19DOPRc6bW9OFtFHKA9dEE,3e-05
1,252291,spotify:track:4PvcavllGpo6lB8VkIgXIZ,2.9e-05
2,252291,spotify:track:7JOizhmt3HlBgQyJEa0AgK,2.8e-05
3,252291,spotify:track:62oGUBJQGPa3emMGMejBhm,2.8e-05
4,252291,spotify:track:1aTRTY9s4LqQHPkkRHPCzx,2.8e-05


In [22]:
rec_100[0].unique()

array([252291, 263760,    835, ...,    233, 258865, 172427])

In [24]:
rec_100[0].nunique()

27016

In [35]:
pid = 0
ps = pd.Series(np.array(rec_100.loc[rec_100[0] == pid, 1][:nholdout(pid, train)*15]))
vs = test[test.Playlistid == pid].Track_uri

In [36]:
r_precision(ps, vs)

0.1

In [21]:
np.co_mat.index

Int64Index([     0,      1,      2,      3,      4,      5,      6,      7,
                 8,      9,
            ...
            271274, 271276, 271286, 271288, 271289, 271304, 271305, 271312,
            271338, 271339],
           dtype='int64', name='Playlistid', length=27016)

In [12]:
from tqdm import tqdm

In [15]:
rec_100[0].unique()

array([252291, 263760,    835, ...,    233, 258865, 172427])

In [16]:
rps = []
ndcgs = []
for pid in tqdm(rec_100[0].unique()):
    ps = pd.Series(np.array(rec_100.loc[rec_100[0] == pid, 1][:nholdout(pid, train)*15]))
#     ps = als_similar_songs_playlist(rs, train, pid, nholdout(pid, train)*15)
    vs = test[test.Playlistid == pid].Track_uri # ground truth
    rps.append(r_precision(ps, vs))
    
    r = np.zeros(len(ps))
    for i, p in enumerate(ps):
        if np.any(vs.isin([p])):
            r[i] = 1
    ndcgs.append(ndcg_at_k(r, len(r)))

100%|██████████| 27016/27016 [8:05:47<00:00,  1.04it/s]  


In [17]:
len(rps), len(ndcgs)

(27016, 27016)

In [18]:
avg_rp = np.mean(rps)
avg_ndcg = np.mean(ndcgs)
print('Avg. R-Precision: ', avg_rp)
print('Avg. NDCG: ', avg_ndcg)
print('Total Sum: ', np.mean([avg_rp, avg_ndcg]))

Avg. R-Precision:  0.01321971153443536
Avg. NDCG:  0.01654988573561711
Total Sum:  0.014884798635026234
