In [100]:
import numpy as np
import pandas as pd
import scipy.sparse as sparse
from sklearn.decomposition import NMF
from numpy import save
from pathlib import Path
from sklearn import preprocessing
from scipy.sparse import coo_matrix
import ml_metrics
import sklearn.metrics as metrics
import time
from collections import defaultdict

In [2]:
path = Path('../../../../')

In [3]:
track_like_df = pd.read_csv(path/'ml/EDA/outputs/track_like_df.csv')
track_like_df = track_like_df[['USER_ID', 'TRACK_ID']].assign(r=3)
track_like_df.columns = ['user', 'track', 'score']

In [4]:
track_download_df = pd.read_csv(path/'ml/EDA/outputs/track_download_df.csv')
track_download_df = track_download_df[['USER_ID', 'TRACK_ID']].assign(r=1)
track_download_df.columns = ['user', 'track', 'score']

In [5]:
track_purchase_df = pd.read_csv(path/'ml/EDA/outputs/track_purchase_df.csv')
track_purchase_df = track_purchase_df[['USER_ID','TRACK_ID']].assign(r=2)
track_purchase_df.columns = ['user','track','score']

In [6]:
user_info_df = pd.read_csv(path/'ml/EDA/outputs/user_info.csv')

In [7]:
user_info_df.sort_values('total_interactions',ascending=False)

Unnamed: 0,USER_ID,noTrack_purchases,noAlbum_purchases,noTrack_likes,noAlbum_likes,noArtist_likes,noTrack_downloads,noAlbum_downloads,total_interactions
10812,3606398,1007.0,1110.0,,7.0,20.0,14642.0,1561.0,18347.0
9623,3603424,2627.0,863.0,,,8.0,12415.0,1626.0,17539.0
16,3568922,5.0,4.0,24.0,3827.0,2147.0,9019.0,1550.0,16576.0
5780,3593600,243.0,980.0,,11.0,18.0,12388.0,1315.0,14955.0
179077,64359968,6747.0,,,1.0,2.0,6747.0,988.0,14485.0
...,...,...,...,...,...,...,...,...,...
277887,321749603,,,1.0,,,,,1.0
277914,321860997,,,1.0,,,,,1.0
277922,321886366,,,1.0,,,,,1.0
277924,321892524,,,1.0,,,,,1.0


In [8]:
print(track_like_df.shape)
print(track_download_df.shape)
print(track_purchase_df.shape)
print(user_info_df.shape)

(346043, 3)
(16266541, 3)
(3367956, 3)
(586785, 9)


In [9]:
total = pd.concat([track_download_df, track_like_df, track_purchase_df])
total = total.groupby(['user', 'track']).score.sum().reset_index()
total.shape

(16612906, 3)

In [10]:
min_interactions = 20
target_users = user_info_df.loc[user_info_df['total_interactions']>=min_interactions]['USER_ID'].values
target_users.shape

(201779,)

In [11]:
total = total.loc[total['user'].isin(target_users)]
total.shape

(14626903, 3)

In [12]:
total.describe()

Unnamed: 0,user,track,score
count,14626900.0,14626900.0,14626900.0
mean,163564300.0,174832900.0,1.422875
std,207713800.0,221262000.0,0.817791
min,3568884.0,2828445.0,1.0
25%,8039696.0,2867901.0,1.0
50%,38601130.0,24842430.0,1.0
75%,364555700.0,468725800.0,1.0
max,557769500.0,557766700.0,6.0


In [13]:
total['user'].nunique(),total['track'].nunique()

(201753, 95964)

In [14]:
def train_valid_split(rating_df, no_users, no_items_per_user):
    gb = rating_df.groupby('user')
    test_df = pd.concat([gb.get_group(group)[:no_items_per_user] for i,group in enumerate(gb.groups) if i < no_users])
    train_df = pd.concat([rating_df,test_df]).drop_duplicates(keep=False)
    test_df =  test_df.loc[(test_df['user'].isin(train_df['user']) & (test_df['track'].isin(train_df['track'])))]
    return train_df,test_df

In [15]:
train_df,test_df = train_valid_split(total, 1000, 10)
train_df.shape,test_df.shape

((14616958, 3), (9448, 3))

In [16]:
train_df['user'].nunique(),test_df['user'].nunique()

(201698, 945)

In [17]:
class RandomRecommender:
    def fit(self, scores):
        self.tracks = np.unique(scores.track)
        
    def recommend(self, user_id, count=None):
        result = self.tracks.copy()
        np.random.shuffle(result)
        if count is not None:
            result = result[:count]
        return result

In [23]:
class PopularRecommender:
    def fit(self, scores):
        self.popular = scores.groupby('track').score.sum().reset_index().sort_values('score', ascending=False)['track'].values
        
    def recommend(self, user_id, count=None):
        result = self.popular
        if count is not None:
            result = result[:count]
        return result
    
    def recommend_all(self,user_ids,count=10):
        recomms = dict()
        result = self.popular
        for user_id in user_ids:
            recomms[user_id]=result[:count]
        return  recomms

In [19]:
class CollaborativeRecommender:
    def __init__(self, rank = 10):
        self.rank = rank
        self.new_user_recommender = PopularRecommender()
        
    def fit(self, scores, tol=1e-8, max_iter=4000, random_state=None):
        scores = scores.assign(user=scores.user.astype('category'), track=scores.track.astype('category'))
        mat = sparse.coo_matrix((
            scores.score.values.astype('float32'),
            (scores.user.cat.codes.values,
             scores.track.cat.codes.values))).tocsr()
        model = NMF(n_components=self.rank,tol=tol,max_iter=max_iter,random_state=random_state)
        model.fit(mat)
        self.new_user_recommender.fit(scores)
        self.model = model
        self.user_categories = scores.user.cat.categories
        self.track_categories = scores.track.cat.categories
        self.mat = mat
        self.user_index = {Id:index for index, Id in enumerate(scores.user.cat.categories)}
        self.track_index = {Id:index for index, Id in enumerate(scores.track.cat.categories)}
        
    def recommend(self, user_id, count=5):
        if user_id not in self.user_index:
            return self.new_user_recommender.recommend(user_id, count)
        index = self.user_index[user_id]
        user_history = self.mat[index]
        score_pred = self.model.inverse_transform(self.model.transform(user_history)).squeeze()       
        
        top_tracks = sorted(zip(score_pred, self.track_categories), reverse=True)
        top_tracks = [x[1] for x in top_tracks]
        
        previously_downloaded = set(self.track_categories[self.mat[index].nonzero()[1]])
        top_tracks = [x for x in top_tracks if x not in previously_downloaded]
        return top_tracks[:count]
    
    def predict(self,test_df):
        preds = pd.DataFrame(columns=['user','track','actual','est'])
        preds[['user','track','actual']] = test_df[['user','track','score']].copy()
        for i,row in preds.iterrows():
            preds.loc[i,'est'] =  (self.model.inverse_transform(self.model.transform(self.mat[self.user_index[row['user']]]))).squeeze()[self.track_index[row['track']]]
        return preds

In [19]:
recommenders = [ColaborativeRecommender(),PopularRecommender(),RandomRecommender()]
test = test_df.copy().groupby('user')['track'].agg({'actual': (lambda x: list(set(x)))})
test.reset_index(inplace=True)
for recommender in recommenders:
    start_time = time.time()
    recommender.fit(train_df)
    print(recommender.__class__.__name__,'fit done. fit time:',time.time()-start_time,'Seconds')
    start_time = time.time()
    recs = []
    for user in test['user']:
        predictions = recommender.recommend(user, 5)
        recs.append(predictions)
        
    print(recommender.__class__.__name__,'prediction done. prediction time:',time.time()-start_time,'Seconds')
    test[recommender.__class__.__name__] = recs

test.head()

is deprecated and will be removed in a future version. Use                 named aggregation instead.

    >>> grouper.agg(name_1=func_1, name_2=func_2)

  


ColaborativeRecommender fit done. fit time: 88.83590722084045 Seconds
ColaborativeRecommender prediction done. prediction time: 153.4138810634613 Seconds
PopularRecommender fit done. fit time: 0.7122011184692383 Seconds
PopularRecommender prediction done. prediction time: 0.0008141994476318359 Seconds
RandomRecommender fit done. fit time: 0.8274800777435303 Seconds
RandomRecommender prediction done. prediction time: 1.8302509784698486 Seconds


Unnamed: 0,user,actual,ColaborativeRecommender,PopularRecommender,RandomRecommender
0,3568884,"[2828448.0, 2828457.0, 2828719.0, 2828721.0, 2...","[7377525.0, 219866931.0, 219866927.0, 21986695...","[6059894.0, 2867903.0, 436926390.0, 2852012.0,...","[2843372.0, 75737694.0, 2852985.0, 512371215.0..."
1,3568886,"[2828883.0, 2828468.0, 2828469.0, 2828470.0, 2...","[7377525.0, 219866931.0, 219866927.0, 21986695...","[6059894.0, 2867903.0, 436926390.0, 2852012.0,...","[2849415.0, 519234378.0, 526910560.0, 50319986..."
2,3568888,"[2842080.0, 2842081.0, 2829029.0, 2829798.0, 2...","[7377553.0, 12264925.0, 7377525.0, 219866931.0...","[6059894.0, 2867903.0, 436926390.0, 2852012.0,...","[487799530.0, 468637599.0, 482220152.0, 803346..."
3,3568890,"[2829891.0, 2829892.0, 2829101.0, 2829197.0, 2...","[7377553.0, 7377525.0, 219866931.0, 219866927....","[6059894.0, 2867903.0, 436926390.0, 2852012.0,...","[2839872.0, 2846794.0, 2857115.0, 2829855.0, 4..."
4,3568892,"[2828468.0, 2828469.0, 2828470.0, 2828471.0, 2...","[7377525.0, 219866931.0, 219866927.0, 21986695...","[6059894.0, 2867903.0, 436926390.0, 2852012.0,...","[77321424.0, 2832088.0, 2853713.0, 526283227.0..."


In [20]:
print('Collaborative map@k score:',ml_metrics.mapk(test['actual'],test['ColaborativeRecommender'],k=10))
print('Popular map@k score:',ml_metrics.mapk(test['actual'],test['PopularRecommender'],k=10))
print('Random map@k score:',ml_metrics.mapk(test['actual'],test['RandomRecommender'],k=10))

Collaborative makp score: 0.002738977072310406
Popular makp score: 0.00035978835978835974
Random makp score: 0.0


In [21]:
recommender = ColaborativeRecommender(10)
recommender.fit(train_df)

In [22]:
preds = recommender.predict(test_df=test_df)
preds.head()

Unnamed: 0,user,track,actual,est
0,3568884,2828448.0,11.986814,0.00149225
1,3568884,2828457.0,11.986814,0.00174326
2,3568884,2828719.0,3.22757,0.255247
3,3568884,2828721.0,3.22757,0.0815859
4,3568884,2828724.0,3.22757,0.0721323


In [23]:
recommender.model.reconstruction_err_

22756.9095884601

In [35]:
print(metrics.r2_score(preds['actual'],preds['est']))
print(metrics.mean_squared_error(preds['actual'],preds['est']))

-2.116506791273196
51.65366174769558


In [25]:
%%timeit
recommender.recommend(user_id=train_df.iloc[0]['user'],count=10)

167 ms ± 1.76 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)


## Using surprise library

In [16]:
import surprise
from surprise import Reader
from surprise import Dataset
from surprise.model_selection import cross_validate,PredefinedKFold
from collections import defaultdict

In [17]:
def precision_recall_at_k(predictions, k=10, threshold=3.5):
    '''Return precision and recall at k metrics for each user.'''
    c = 0
    # First map the predictions to each user.
    user_est_true = defaultdict(list)
    for uid, _, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))

    precisions = dict()
    recalls = dict()
    for uid, user_ratings in user_est_true.items():

        # Sort user ratings by estimated value
        user_ratings.sort(key=lambda x: x[0], reverse=True)

        # Number of relevant items
        n_rel = sum((true_r >= threshold) for (_, true_r) in user_ratings)
        
        if n_rel == 0:
            c += 1
            continue

        # Number of recommended items in top k
        n_rec_k = sum((est >= threshold) for (est, _) in user_ratings[:k])

        # Number of relevant and recommended items in top k
        n_rel_and_rec_k = sum(((true_r >= threshold) and (est >= threshold))
                              for (est, true_r) in user_ratings[:k])

        # Precision@K: Proportion of recommended items that are relevant
        precisions[uid] = n_rel_and_rec_k / n_rec_k if n_rec_k != 0 else 0

        # Recall@K: Proportion of relevant items that are recommended
        recalls[uid] = n_rel_and_rec_k / n_rel
        
#         print(n_rel,n_rec_k,n_rel_and_rec_k)
        
    print(c)

    return precisions, recalls


In [18]:
def get_top_n(predictions, n=10):
    '''Return the top-N recommendation for each user from a set of predictions.

    Args:
        predictions(list of Prediction objects): The list of predictions, as
            returned by the test method of an algorithm.
        n(int): The number of recommendation to output for each user. Default
            is 10.

    Returns:
    A dict where keys are user (raw) ids and values are lists of tuples:
        [(raw item id, rating estimation), ...] of size n.
    '''

    # First map the predictions to each user.
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))

    # Then sort the predictions for each user and retrieve the k highest ones.
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]

    return top_n


In [64]:
reader = Reader(rating_scale=(train_df['score'].min(),train_df['score'].max()))
train_ds = Dataset.load_from_df(train_df,reader=reader)

In [65]:
trainset = train_ds.build_full_trainset()

In [66]:
testset = [(row['user'],row['track'],row['score']) for i,row in test_df.iterrows()]

In [67]:
start_time = time.time()
algo = surprise.NMF(n_factors=30,n_epochs=100,biased=True,random_state=28,verbose=True)
algo.fit(trainset)
print('training time:',time.time()-start_time,'Seconds')

Processing epoch 0
Processing epoch 1
Processing epoch 2
Processing epoch 3
Processing epoch 4
Processing epoch 5
Processing epoch 6
Processing epoch 7
Processing epoch 8
Processing epoch 9
Processing epoch 10
Processing epoch 11
Processing epoch 12
Processing epoch 13
Processing epoch 14
Processing epoch 15
Processing epoch 16
Processing epoch 17
Processing epoch 18
Processing epoch 19
Processing epoch 20
Processing epoch 21
Processing epoch 22
Processing epoch 23
Processing epoch 24
Processing epoch 25
Processing epoch 26
Processing epoch 27
Processing epoch 28
Processing epoch 29
Processing epoch 30
Processing epoch 31
Processing epoch 32
Processing epoch 33
Processing epoch 34
Processing epoch 35
Processing epoch 36
Processing epoch 37
Processing epoch 38
Processing epoch 39
Processing epoch 40
Processing epoch 41
Processing epoch 42
Processing epoch 43
Processing epoch 44
Processing epoch 45
Processing epoch 46
Processing epoch 47
Processing epoch 48
Processing epoch 49
Processing

In [68]:
start_time = time.time()
predictions = algo.test(testset)
predictions_df = pd.DataFrame(predictions)
print('prediction time:',time.time()-start_time,'Seconds')

prediction time: 0.07785677909851074 Seconds


In [69]:
predictions_df

Unnamed: 0,uid,iid,r_ui,est,details
0,3568884.0,2828448.0,3.0,1.000000,{'was_impossible': False}
1,3568884.0,2828457.0,3.0,1.000000,{'was_impossible': False}
2,3568884.0,2828719.0,1.0,1.868288,{'was_impossible': False}
3,3568884.0,2828721.0,1.0,1.119160,{'was_impossible': False}
4,3568884.0,2828724.0,1.0,1.106811,{'was_impossible': False}
...,...,...,...,...,...
9443,3580324.0,2828795.0,3.0,2.575708,{'was_impossible': False}
9444,3580324.0,2828913.0,3.0,2.309610,{'was_impossible': False}
9445,3580324.0,2828976.0,3.0,2.262948,{'was_impossible': False}
9446,3580324.0,2829022.0,3.0,2.247217,{'was_impossible': False}


In [91]:
precisions, recalls = precision_recall_at_k(predictions, k=10, threshold=2)

print('map@k:',sum(prec for prec in precisions.values()) / len(precisions))
print('mar@k:',sum(rec for rec in recalls.values()) / len(recalls))
print('r2-score:',metrics.r2_score(predictions_df['r_ui'],predictions_df['est']))
print('MSE:',metrics.mean_squared_error(predictions_df['r_ui'],predictions_df['est']))

448
map@k: 0.4592164095685223
mar@k: 0.4110640030660151
r2-score: 0.21842873930101459
MSE: 0.6719727962509439


In [93]:
class Surp_CollaborativeRecommender:

    def __init__(self):
        self.top_n = defaultdict(list)
        return

    def fit(self, scores, n_factors=30, n_epochs=100, biased=True, random_state=None, verbose=False):
        self.reader = Reader(rating_scale=(scores['score'].min(), scores['score'].max()))
        self.scores = scores
        self.scores_ds = Dataset.load_from_df(self.scores, reader=self.reader)
        self.trainset = self.scores_ds.build_full_trainset()
        self.algo = surprise.NMF(n_factors=n_factors, n_epochs=n_epochs, biased=biased, random_state=random_state,
                                 verbose=verbose)
        self.algo.fit(trainset=self.trainset)

    def recommend(self, user_id, count=10):
        if len(self.top_n[user_id]) != 0:
            return np.array(self.top_n[user_id])[:,0]
        predictions = []
        for track_id in self.scores.track.unique():
            predictions.append(self.algo.predict(user_id,track_id))
        for uid, iid, true_r, est, _ in predictions:
            self.top_n[uid].append((iid, est))

        for uid, user_ratings in self.top_n.items():
            user_ratings.sort(key=lambda x: x[1], reverse=True)
            self.top_n[uid] = user_ratings[:count]
            
        return np.array(self.top_n[user_id])[:,0]

    def recommend_all(self,user_ids=None,count=10):
        recomms = dict()
        for user_id in user_ids:
            recomms[user_id]=self.recommend(user_id,count)

        return recomms

In [94]:
scr = Surp_CollaborativeRecommender()
scr.fit(total,n_epochs=1,verbose=True)

Processing epoch 0
