In [1]:
import numpy as np
import pandas as pd
import scipy.sparse as sp
from scipy.sparse.linalg import svds
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.cross_validation import train_test_split
from sklearn.metrics import mean_squared_error
from math import sqrt

import warnings; warnings.simplefilter('ignore')



In [2]:
music_data_detailed = pd.read_json("Digital_Music_5.json", lines = True)

In [3]:
music_data_detailed = music_data_detailed[['asin', 'overall', 'reviewText', 'reviewerID', 'reviewerName', 'summary']]

In [4]:
music_data_detailed.rename(columns = {'asin':'artist_id', 'overall':'rating', 'reviewerID':'reviewer_id', 'reviewerName':'reviewer_name', 'reviewText':'review_text'}, inplace = True)

In [108]:
music_data_detailed.head()

Unnamed: 0,artist_id,rating,review_text,reviewer_id,reviewer_name,summary
0,5555991584,5,"It's hard to believe ""Memory of Trees"" came ou...",A3EBHHCZO6V2A4,"Amaranth ""music fan""",Enya's last great album
1,5555991584,5,"A clasically-styled and introverted album, Mem...",AZPWAXJG9OJXV,bethtexas,Enya at her most elegant
2,5555991584,5,I never thought Enya would reach the sublime h...,A38IRL0X2T4DPF,bob turnley,The best so far
3,5555991584,5,This is the third review of an irish album I w...,A22IK3I6U76GX0,Calle,Ireland produces good music.
4,5555991584,4,"Enya, despite being a successful recording art...",A1AISPOIIHTHXX,"Cloud ""...""",4.5; music to dream to


In [6]:
print ("Number of reviewers: "+ str(len(music_data_detailed['reviewer_id'].unique())))
print ("Number of items rated: " + str(len(music_data_detailed['artist_id'].unique())))

Number of reviewers: 5541
Number of items rated: 3568


In [7]:
n_unique_users = len(music_data_detailed['reviewer_id'].unique())
n_unique_items = len(music_data_detailed['artist_id'].unique())

In [8]:
r_df = music_data_detailed.pivot(index = 'reviewer_id', columns ='artist_id', values = 'rating').fillna(0)

<h2> Train Test Split </h2>

In [9]:
users_dictionary = {}
items_dictionary = {}
ci = 0 
cu = 0

for i in range(0, music_data_detailed.shape[0]-1):
    if (items_dictionary.get(music_data_detailed.iloc[i].artist_id) == None):
        items_dictionary.update({ music_data_detailed.iloc[i].artist_id : ci})
        ci+=1
        
    if (users_dictionary.get(music_data_detailed.iloc[i].reviewer_id) == None):
        users_dictionary.update({music_data_detailed.iloc[i].reviewer_id : cu})
        cu+=1

In [10]:
print(len(items_dictionary))
print(len(users_dictionary))

item_val_array = np.fromiter(iter(items_dictionary.values()), dtype=int)
item_val_array = np.sort(item_val_array)

user_val_array = np.fromiter(iter(users_dictionary.values()), dtype = int)
user_val_array = np.sort(user_val_array)

3568
5541


In [11]:
item_val_array

array([   0,    1,    2, ..., 3565, 3566, 3567])

In [12]:
user_val_array

array([   0,    1,    2, ..., 5538, 5539, 5540])

In [13]:
train_data, test_data = train_test_split(music_data_detailed, test_size = 0.25)

<h2> Memory-Based Collaborative Filtering </h2>

In [14]:
#Create two user-item matrices, one for training and another for testing
train_data_matrix = np.zeros((n_unique_users, n_unique_items))
for line in train_data.itertuples():
    train_data_matrix[users_dictionary.get(line[4]), items_dictionary.get(line[1])] = line[2]
    
test_data_matrix = np.zeros((n_unique_users, n_unique_items))
for line in test_data.itertuples():
    test_data_matrix[users_dictionary.get(line[4]), items_dictionary.get(line[1])] = line[2]

In [15]:
user_similarity = pairwise_distances(train_data_matrix, metric = 'cosine')

In [19]:
item_similarity = pairwise_distances(train_data_matrix.T, metric = 'cosine')

In [20]:
def predict(ratings, similarity, type = 'user'):
    if type == 'user':
        mean_user_rating = ratings.mean(axis = 1)
        #You use np.newaxis so that mean_user_rating has same format as ratings
        ratings_diff = (ratings - mean_user_rating[:, np.newaxis])
        pred = mean_user_rating[:, np.newaxis] + similarity.dot(ratings_diff) / np.array([np.abs(similarity).sum(axis = 1)]).T
    elif type == 'item':
        pred = ratings.dot(similarity) / np.array([np.abs(similarity).sum(axis = 1)])
    return pred

In [21]:
item_prediction = predict(train_data_matrix, item_similarity, type = 'item')
user_prediction = predict(train_data_matrix, user_similarity, type = 'user')

In [22]:
def rmse(prediction, ground_truth):
    prediction = prediction[ground_truth.nonzero()].flatten() 
    ground_truth = ground_truth[ground_truth.nonzero()].flatten()
    return sqrt(mean_squared_error(prediction, ground_truth))

In [23]:
print('User-based CF RMSE: ' + str(rmse(user_prediction, test_data_matrix)))
print('Item-based CF RMSE: ' + str(rmse(item_prediction, test_data_matrix)))

User-based CF RMSE: 4.312930350250258
Item-based CF RMSE: 4.326188696903299


SyntaxError: invalid syntax (<ipython-input-5-39236e13ef1a>, line 1)

# Model-based Collaborative Filtering

In [132]:
sparsity=round(1.0-len(music_data_detailed)/float(n_unique_users*n_unique_items),3)
print('The sparsity level of dataset is ' +  str(sparsity*100) + '%')

The sparsity level of dataset is 99.7%


In [149]:
#get SVD components from train matrix. Choose k.
u, s, vt = svds(train_data_matrix, k = 50)
s_diag_matrix=np.diag(s)
X_pred = np.dot(np.dot(u, s_diag_matrix), vt)
print('User-based CF MSE: ' + str(rmse(X_pred, test_data_matrix)))

User-based CF MSE: 4.141157987186154


# SVD

In [166]:
r_df = music_data_detailed.pivot(index = 'reviewer_id', columns ='artist_id', values = 'rating').fillna(0)

r_df.tail()

artist_id,5555991584,B0000000ZW,B00000016T,B00000016W,B00000017R,B0000001P4,B0000002HZ,B0000002J9,B0000002JR,B0000002ME,...,B00II5VHBU,B00IOVH8AW,B00IXZ9QP4,B00J80ED9M,B00JJCQRDE,B00JJOG5D4,B00JRBLSR2,B00JTHVWO8,B00JYKU6BK,B00KILDVEI
reviewer_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
AZVH70JMJ2IHX,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AZVHFS4KQBQTK,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AZWCAUCNLGL4H,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AZY3MDFJJFI91,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
AZYZA8FDL48GD,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [156]:
r = r_df.as_matrix()
user_ratings_mean = np.mean(r, axis = 1)
r_demeaned = r - user_ratings_mean.reshape(-1, 1)

In [157]:
train_data, test_data = train_test_split(r_demeaned, test_size = 0.25)

In [159]:
from scipy.sparse.linalg import svds
U, sigma, Vt = svds(train_data, k = 50)

In [160]:
sigma = np.diag(sigma)
all_user_predicted_ratings = np.dot(np.dot(U, sigma), Vt)

In [161]:
print('User-based CF MSE: ' + str(rmse(all_user_predicted_ratings, test_data_matrix)))

IndexError: index 4155 is out of bounds for axis 0 with size 4155