In [13]:
import numpy as np
import pandas as pd
import scipy.sparse as sp
from scipy.sparse.linalg import svds
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.cross_validation import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt
import time
import matplotlib.pyplot as plt

import warnings; warnings.simplefilter('ignore')

In [4]:
music_rat_df = pd.read_json("data/Digital_Music_5.json", lines = True)
#meta_df = pd.read_json("meta_Digital_Music.json", lines = True)

In [5]:
music_rat_df = music_rat_df[['asin', 'overall', 'reviewText', 'reviewerID', 'reviewerName', 'summary']]

In [6]:
music_rat_df.rename(columns = {'asin':'artist_id', 'overall':'rating', 'reviewerID':'reviewer_id', 'reviewerName':'reviewer_name', 'reviewText':'review_text'}, inplace = True)

In [7]:
music_rat_df.head()

Unnamed: 0,artist_id,rating,review_text,reviewer_id,reviewer_name,summary
0,5555991584,5,"It's hard to believe ""Memory of Trees"" came ou...",A3EBHHCZO6V2A4,"Amaranth ""music fan""",Enya's last great album
1,5555991584,5,"A clasically-styled and introverted album, Mem...",AZPWAXJG9OJXV,bethtexas,Enya at her most elegant
2,5555991584,5,I never thought Enya would reach the sublime h...,A38IRL0X2T4DPF,bob turnley,The best so far
3,5555991584,5,This is the third review of an irish album I w...,A22IK3I6U76GX0,Calle,Ireland produces good music.
4,5555991584,4,"Enya, despite being a successful recording art...",A1AISPOIIHTHXX,"Cloud ""...""",4.5; music to dream to


In [8]:
us_rat_df = music_rat_df.pivot(index = 'reviewer_id', columns ='artist_id', values = 'rating').fillna(0)
us_rat_df.head()

artist_id,5555991584,B0000000ZW,B00000016T,B00000016W,B00000017R,B0000001P4,B0000002HZ,B0000002J9,B0000002JR,B0000002ME,...,B00II5VHBU,B00IOVH8AW,B00IXZ9QP4,B00J80ED9M,B00JJCQRDE,B00JJOG5D4,B00JRBLSR2,B00JTHVWO8,B00JYKU6BK,B00KILDVEI
reviewer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
A08161909WK3HU7UYTMW,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A1020L7BWW9RAX,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A10323WWTFPSGP,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A103KNDW8GN92L,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
A103W7ZPKGOCC9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [9]:
print ("Number of reviewers: "+ str(len(music_rat_df['reviewer_id'].unique())))
print ("Number of items rated: " + str(len(music_rat_df['artist_id'].unique())))

Number of reviewers: 5541
Number of items rated: 3568


In [25]:
n_unique_users = music_rat_df['reviewer_id'].nunique()
n_unique_artists = music_rat_df['artist_id'].nunique()

In [26]:
#Calculate sparsity level of dataset matrix

sparsity=round(1.0-len(music_rat_df)/float(n_unique_users*n_unique_artists),3)
print('The sparsity level of dataset is ' +  str(sparsity*100) + '%')

The sparsity level of dataset is 99.7%


# Train Test Split

In [27]:
train_data, test_data = train_test_split(us_rat_df, test_size = 0.25)

In [28]:
#Create two user-item matrices, one for training and another for testing

train_data_matrix = train_data.as_matrix()
test_data_matrix = test_data.as_matrix()

# Memory-Based Collaborative Filtering

- <h4> First calculate similarity between users </h4>
<p> For that purpose, we will use the following formula: 
<img class="aligncenter size-thumbnail img-responsive" src="https://latex.codecogs.com/gif.latex?s_u^{cos}(u_k,u_a)=\frac{u_k&space;\cdot&space;u_a&space;}{&space;\left&space;\|&space;u_k&space;\right&space;\|&space;\left&space;\|&space;u_a&space;\right&space;\|&space;}&space;=\frac{\sum&space;x_{k,m}x_{a,m}}{\sqrt{\sum&space;x_{k,m}^2\sum&space;x_{a,m}^2}}"/>

In [29]:
#First calculate similarity between users|
user_similarity = pairwise_distances(train_data_matrix, metric = 'cosine')

 - <h4> Then calculate similarity between artists </h4>
 <p> For that purpose, similar formula will be used:</p>
 
 <img class="aligncenter size-thumbnail img-responsive" src="https://latex.codecogs.com/gif.latex?s_u^{cos}(i_m,i_b)=\frac{i_m&space;\cdot&space;i_b&space;}{&space;\left&space;\|&space;i_m&space;\right&space;\|&space;\left&space;\|&space;i_b&space;\right&space;\|&space;}&space;=\frac{\sum&space;x_{a,m}x_{a,b}}{\sqrt{\sum&space;x_{a,m}^2\sum&space;x_{a,b}^2}}
"/>

In [31]:
#Next calculate similarity between artists
artist_similarity = pairwise_distances(train_data_matrix.T, metric = 'cosine')

In [35]:
def predict_ratings (ratings, similarity, type = 'user'):
    if type =='user':
        mean_user_rating = ratings.mean(axis = 1)
        #We use np.newaxis so that mean_user_rating has same format as ratings
        ratings_diff = (ratings - mean_user_rating[:, np.newaxis])
        prediction = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis=1)]).T
    elif type == 'artist':
        prediction = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis=1)])     
    return prediction

In [40]:
#Get predictions
user_prediction = predict_ratings(train_data_matrix, user_similarity, type ='user')
artist_prediction = predict_ratings(train_data_matrix, artist_similarity, type = 'artist')

- <h4> Evaluate model performance using RMSE : </h4>
<img src="https://latex.codecogs.com/gif.latex?RMSE&space;=\sqrt{\frac{1}{N}&space;\sum&space;(x_i&space;-\hat{x_i})^2}" title="RMSE =\sqrt{\frac{1}{N} \sum (x_i -\hat{x_i})^2}" />

In [41]:
def rmse(prediction, ground_truth):
    prediction = prediction[ground_truth.nonzero()].flatten() 
    ground_truth = ground_truth[ground_truth.nonzero()].flatten()
    return sqrt(mean_squared_error(prediction, ground_truth))

In [43]:
print('User-based CF RMSE: ' + str(rmse(user_prediction, test_data_matrix)))
print('Artist-based CF RMSE: ' + str(rmse(artist_prediction, test_data_matrix)))

User-based CF RMSE: 4.352838124246542
Artist-based CF RMSE: 4.370581815048999


In [48]:
#Return for the given user_id 10 most similar users
user_similarity

array([[0.        , 1.        , 0.91607004, ..., 1.        , 1.        ,
        1.        ],
       [1.        , 0.        , 1.        , ..., 1.        , 1.        ,
        1.        ],
       [0.91607004, 1.        , 0.        , ..., 1.        , 1.        ,
        1.        ],
       ...,
       [1.        , 1.        , 1.        , ..., 0.        , 1.        ,
        1.        ],
       [1.        , 1.        , 1.        , ..., 1.        , 0.        ,
        1.        ],
       [1.        , 1.        , 1.        , ..., 1.        , 1.        ,
        0.        ]])

# Model-Based Collaborative filtering