In [1]:
import pandas as pd
import numpy as np
from sklearn.neighbors import NearestNeighbors
from sklearn.metrics.pairwise import pairwise_distances
from scipy.sparse import csr_matrix
import time

In [2]:
user_artist_w = pd.read_csv("data/user_artists.dat", sep="\t")

In [5]:
matrix = user_artist_w.pivot(index ='artistID', columns='userID', values='weight').fillna(0)
matrix_sparse = csr_matrix(matrix)

matrix_sparse

<17632x1892 sparse matrix of type '<class 'numpy.float64'>'
	with 92834 stored elements in Compressed Sparse Row format>

In [7]:
matrix

userID,2,3,4,5,6,7,8,9,10,11,...,2090,2091,2092,2093,2094,2095,2096,2097,2099,2100
artistID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,408.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18741,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
18742,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
18743,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
18744,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [21]:
item_similarity = pairwise_distances(matrix_sparse, metric='cosine')
user_similarity = pairwise_distances(matrix_sparse.T, metric='cosine')

In [24]:
# checking
print(item_similarity.shape)
print(user_similarity.shape)

(17632, 17632)
(1892, 1892)


In [25]:
# Make prediction
def predict(matrix, similarity, type='user'):
    if type == 'user':
        mean_user_rating = matrix.mean(axis=1)
        ratings_diff = (matrix - mean_user_rating)
        pred = mean_user_rating + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
    elif type == 'item':
        pred = matrix.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])
    return pred

item_prediction = predict(matrix_sparse.T, item_similarity, type='item')
user_prediction = predict(matrix_sparse.T, user_similarity, type='user')

In [29]:
item_similarity[0]

array([0., 1., 1., ..., 1., 1., 1.])

In [28]:
print(item_prediction.shape)
print(user_prediction.shape)

(1892, 17632)
(1892, 17632)


In [35]:
from surprise import Dataset, SVD
from surprise.model_selection import GridSearchCV
# Use movielens-100K
data = Dataset.load_from_df(user_artist_w[['userID', 'artistID', 'weight']], Reader(rating_scale=(1, user_artist_w['weight'].max())))


param_grid = {"n_epochs": [5, 10], "lr_all": [0.002, 0.005], "reg_all": [0.4, 0.6]}
gs = GridSearchCV(SVD, param_grid, measures=["rmse", "mae"], cv=3)

gs.fit(data)

# best RMSE score
print(gs.best_score["rmse"])

# combination of parameters that gave the best RMSE score
print(gs.best_params["rmse"])

351951.8776210446
{'n_epochs': 5, 'lr_all': 0.005, 'reg_all': 0.4}
