In [1]:
import numpy as np
import pandas as pd
from scipy import sparse
from sklearn.decomposition import NMF
from sklearn.model_selection import train_test_split
from itertools import islice
import logging

In [2]:
# Root logger configuration: every record is prefixed with a timestamp
# (12-hour clock with AM/PM marker) and everything from DEBUG upwards is emitted.
logging.basicConfig(
    format='%(asctime)s %(message)s ',
    datefmt='%Y-%m-%d %I:%M:%S %p',
    level=logging.DEBUG,
)

In [3]:
class RandomRecommender:
    """Baseline recommender that returns the known tracks in random order."""

    def __init__(self, seed=None):
        """Create the recommender.

        Args:
            seed: optional seed for a private random generator, making the
                sequence of recommendations reproducible. When None (the
                default) NumPy's global random state is used.
        """
        # np.random (the module) and RandomState both expose shuffle(), so
        # recommend() can use either one interchangeably.
        self._rng = np.random if seed is None else np.random.RandomState(seed)

    def fit(self, scores):
        """Remember the distinct track ids found in ``scores.track``."""
        self.tracks = np.unique(scores.track)

    def recommend(self, user_id, count=None):
        """Return the known tracks in a freshly shuffled order.

        Args:
            user_id: accepted for interface compatibility; not used.
            count: maximum number of tracks to return; None returns all.

        Returns:
            A new array of track ids (the array stored by fit() is not modified).
        """
        # Shuffle a copy so the fitted track array stays intact between calls.
        result = self.tracks.copy()
        self._rng.shuffle(result)
        if count is not None:
            result = result[:count]
        return result

In [4]:
class PopularRecommender:
    """Recommends the same ranking to everyone: tracks ordered by total score."""

    def fit(self, scores):
        """Rank tracks by the sum of their ``score`` column, highest first."""
        totals = scores.groupby('track').score.sum().reset_index()
        ranked = totals.sort_values('score', ascending=False)
        self.popular = ranked['track'].values

    def recommend(self, user_id, count=None):
        """Return the fitted ranking, truncated to ``count`` entries if given.

        ``user_id`` is accepted for interface compatibility and is not used.
        """
        return self.popular if count is None else self.popular[:count]
    

In [5]:
class ColaborativeRecommender:
    """Collaborative-filtering recommender based on a non-negative matrix factorisation.

    fit() builds a sparse user x track score matrix and fits scikit-learn's NMF
    on it. Users seen during fit get all tracks ordered by their reconstructed
    score; any other user id is served by a PopularRecommender fallback.
    """

    def __init__(self, rank=10, random_state=None):
        """Create the recommender.

        Args:
            rank: number of NMF components.
            random_state: passed straight to NMF as ``random_state``; None
                (the default) leaves NMF's own default in place.
        """
        self.rank = rank
        self.random_state = random_state
        self.new_user_recommender = PopularRecommender()

    def fit(self, scores):
        """Fit the factorisation and the new-user fallback.

        Args:
            scores: frame with ``user``, ``track`` and ``score`` columns.
        """
        # The fallback is fitted on the caller's frame, before the ids are
        # turned into categoricals, so that it hands back the same kind of
        # array PopularRecommender produces when used on its own rather than
        # a pandas Categorical.
        self.new_user_recommender.fit(scores)

        encoded = scores.assign(
            user=scores.user.astype('category'),
            track=scores.track.astype('category'),
        )
        # Category codes serve as row (user) and column (track) indices.
        mat = sparse.coo_matrix((
            encoded.score.values.astype('float32'),
            (encoded.user.cat.codes.values,
             encoded.track.cat.codes.values))).tocsr()
        model = NMF(n_components=self.rank, random_state=self.random_state)
        model.fit(mat)

        self.mat = mat
        self.model = model
        # Column index -> track id, and user id -> row index.
        self.track_categories = encoded.track.cat.categories
        self.user_index = {
            user_id: row
            for row, user_id in enumerate(encoded.user.cat.categories)
        }

    def recommend(self, user_id, count=None):
        """Return track ids ordered by predicted score, best first.

        Args:
            user_id: user to recommend for. Ids not seen during fit() are
                delegated to the PopularRecommender fallback.
            count: maximum number of tracks to return; None returns all.

        Note:
            Tracks the user already has a score for are not filtered out.
        """
        if user_id not in self.user_index:
            return self.new_user_recommender.recommend(user_id, count)
        index = self.user_index[user_id]
        user_history = self.mat[index]
        # Project the user's row into the component space and back to get a
        # reconstructed score for every track.
        score_pred = self.model.inverse_transform(self.model.transform(user_history)).ravel()
        result = self.track_categories[np.argsort(score_pred)[::-1]]
        if count is not None:
            result = result[:count]
        return result
    

In [6]:
def get_rank(array, value):
    """Return the 1-based position of the first occurrence of ``value``.

    Args:
        array: sequence of items to search (NumPy array, pandas Index /
            Categorical, list, ...); it is converted with ``np.asarray``.
        value: item to look for.

    Returns:
        The 1-based index of the first match, or ``np.inf`` when ``value``
        does not occur, so that ``1 / rank`` evaluates to 0 for a miss.
    """
    # atleast_1d guards against comparisons that collapse to a single boolean.
    hits = np.nonzero(np.atleast_1d(np.asarray(array) == value))[0]
    if hits.size == 0:
        return np.inf
    return hits[0] + 1

In [7]:
def mean_reciprocal_rank(recommender, user_ids, track_ids):
    """Average of 1/rank over paired (user, track) observations.

    For every pair the recommender is asked for its full ranking for the
    user, and the rank of the paired track within that ranking is looked up
    with get_rank().

    Args:
        recommender: object exposing ``recommend(user_id)``.
        user_ids: iterable of user ids.
        track_ids: iterable of track ids, paired positionally with ``user_ids``.

    Returns:
        The mean of the reciprocal ranks as computed by ``np.mean``.
    """
    reciprocal_ranks = [
        1/get_rank(recommender.recommend(user_id), track_id)
        for user_id, track_id in zip(user_ids, track_ids)
    ]
    return np.mean(reciprocal_ranks)

In [8]:
# Progress marker logged ahead of the data-loading steps that follow.
logging.info('preparing data')

2020-02-26 12:52:19 AM preparing data 


In [9]:
# Download events: keep the (user, track) pair and give each row a weight of 1.
track_download = (
    pd.read_csv('beeptunes_v1/track_download.csv')[['USER_ID', 'TRACK_ID']]
    .assign(r=1)
)

In [10]:
# Like events: keep the (user, track) pair and give each row a weight of 5.
track_like = (
    pd.read_csv('beeptunes_v1/track_like.csv')[['USER_ID', 'TRACK_ID']]
    .assign(r=5)
)

In [11]:
# Purchase events. Rows without a TRACK_ID are dropped; the remaining rows
# get a weight of 10 and TRACK_ID is cast to an integer column.
album_track_purchase = pd.read_csv('beeptunes_v1/album_track_purchase.csv')
track_purchase = (
    album_track_purchase
    .loc[album_track_purchase.TRACK_ID.notna(), ['USER_ID', 'TRACK_ID']]
    .assign(r=10)
    .astype({'TRACK_ID': 'int'})
)

In [12]:
# Stack the three event tables, switch to the column names the recommenders
# read (user / track / score), and add up the weights per (user, track) pair.
total = (
    pd.concat([track_download, track_like, track_purchase])
    .rename(columns={'USER_ID': 'user', 'TRACK_ID': 'track', 'r': 'score'})
    .groupby(['user', 'track'])
    .score.sum()
    .reset_index()
)

In [13]:
# Hold out 5000 (user, track) rows for validation. The split is seeded so that
# re-running the notebook evaluates every recommender on the same held-out rows.
train, validation = train_test_split(total, test_size=5000, random_state=0)

In [14]:
# One instance of each recommender, all built with their default settings.
recommenders = [
    RandomRecommender(),
    PopularRecommender(),
    ColaborativeRecommender(),
]

In [15]:
# Fit every recommender on the training rows and log its mean reciprocal rank
# on the validation rows.
for recommender in recommenders:
    name = type(recommender).__name__
    logging.info('starting {}'.format(name))
    recommender.fit(train)
    logging.info('fitted')
    score = mean_reciprocal_rank(
        recommender,
        user_ids=validation.user,
        track_ids=validation.track,
    )
    logging.info('score (mean reciprocal rank): {:0.5f}'.format(score))

2020-02-26 12:53:17 AM starting RandomRecommender 
2020-02-26 12:53:18 AM fitted 
2020-02-26 12:53:27 AM score (mean reciprocal rank): 0.00016 
2020-02-26 12:53:27 AM starting PopularRecommender 
2020-02-26 12:53:28 AM fitted 
2020-02-26 12:53:29 AM score (mean reciprocal rank): 0.00676 
2020-02-26 12:53:29 AM starting ColaborativeRecommender 
2020-02-26 12:54:16 AM fitted 
2020-02-26 12:55:57 AM score (mean reciprocal rank): 0.01444 
