# Collaborative filtering using SVD and matrix factorization

The scikit-surprise library is designed specifically for building and evaluating recommendation engines. More information can be found at http://surpriselib.com/


In [31]:
import pandas as pd
import numpy as np

%matplotlib inline
import matplotlib.pyplot as plt

from surprise import Dataset, Reader
from surprise import NMF, SVD, KNNWithMeans
from surprise import dump
from surprise.model_selection import train_test_split, cross_validate, GridSearchCV, KFold
from surprise import accuracy

## Read in data

In [6]:
# Raw user-book ratings (CSV) that the models below are trained on.
df_ratings = pd.read_csv('datasets/raw/ratings_raw.csv')

# Cleaned book metadata. NOTE: read_pickle can execute arbitrary code while
# unpickling, so only load pickle files that come from a trusted source.
df_books = pd.read_pickle('datasets/clean/books_clean.pkl')

## Train-Test-Validation Split

We'll use 80% of the data for training, 10% for testing, and set aside the last 10% for validation to be used when we build the final model.

In [7]:
# Peek at the first two rating rows to confirm the column layout.
df_ratings.head(n=2)

Unnamed: 0,user_id,book_id,rating
0,1,258,5
1,2,4081,4


In [26]:
# Randomly split the individual rating rows of 'df_ratings' (one row per
# user-book rating) into training, testing and validation subsets.
idx = list(df_ratings.index)
np.random.seed(42)       # fixed seed so the shuffle below is repeatable
np.random.shuffle(idx)   # shuffles the list of index labels in place

# 80% trainset, 10% testset, 10% validation (valset)
lower_threshold = int(0.8 * len(df_ratings))
upper_threshold = int(0.9 * len(df_ratings))
trainset = df_ratings.loc[idx[:lower_threshold]]
testset = df_ratings.loc[idx[lower_threshold:upper_threshold]]
valset = df_ratings.loc[idx[upper_threshold:]]

# Sanity check: the three subsets together must account for every rating row.
assert len(trainset) + len(testset) + len(valset) == len(df_ratings), \
    "train/test/validation subsets do not partition df_ratings"

print("# of users in total:" , df_ratings.user_id.nunique())
print("# of users in the training set: ", trainset.user_id.nunique())
print("# of books in total:" , df_ratings.book_id.nunique())
print("# of books in the training set: ", trainset.book_id.nunique())
# These are fractions of rating rows (not of books) that fall in each subset.
print("fraction of ratings in the training, testing and validation set: ",
      len(trainset)/len(df_ratings), len(testset)/len(df_ratings),
      len(valset)/len(df_ratings))

# of users in total: 53424
# of users in the training set:  53424
# of books in total: 10000
# of books in the training set:  10000
% of books in the training, testing and validation set:  0.7999999665354802 0.1000000167322599 0.1000000167322599


## Matrix factorization using scikit-surprise package


In [30]:
# Load the training and testing subsets into Surprise.

# Reader describing the rating scale, which runs from 1 to 5.
reader = Reader(rating_scale=(1, 5))

# Dataset.load_from_df reads its columns positionally as
# (user id, item id, rating), in exactly that order. Select them by name so
# that an extra or reordered column (e.g. a saved index column) cannot be
# misread as part of the ratings.
rating_cols = ['user_id', 'book_id', 'rating']

# Training data: wrap the frame, then build the Trainset object that is fed
# to an algorithm's .fit() method.
data_train = Dataset.load_from_df(trainset[rating_cols], reader)
training = data_train.build_full_trainset()

# Testing data: wrap the frame, then turn its raw ratings into the testset
# object that is fed to an algorithm's .test() method.
data_test = Dataset.load_from_df(testset[rating_cols], reader)
testing = data_test.construct_testset(data_test.raw_ratings)


## SVD without bias for matrix factorization

In [13]:
%%time
# simple SVD model
svd = SVD(n_factors=20, n_epochs = 30, biased=False) # initiate a SVD algorithm object
svd.fit(training) # training on the trainset
pred_svd = svd.test(testing) # predict ratings for the testset
accuracy.rmse(pred_svd) # compute RMSE score

# user and item matrix with latent features
mean = svd.trainset.global_mean # global mean rating of the trainset
user_latent, item_latent = svd.pu, svd.qi
print(user_latent.shape, item_latent.shape)

RMSE: 0.8252
(53424, 20) (10000, 20)
CPU times: user 4min 18s, sys: 2.13 s, total: 4min 20s
Wall time: 4min 25s


## SVD with bias for matrix factorization

In [33]:
%%time
# SVD model with bias
svd = SVD(n_factors=20, n_epochs = 30, biased=True) # initiate a SVD algorithm object
svd.fit(training) # training on the trainset
pred_svd = svd.test(testing) # predict ratings for the testset
accuracy.rmse(pred_svd) # compute RMSE score

# user and item matrix with latent features
mean = svd.trainset.global_mean # global mean rating of the trainset
user_latent, item_latent = svd.pu, svd.qi
print(user_latent.shape, item_latent.shape)

RMSE: 0.8291
(53424, 20) (10000, 20)
CPU times: user 4min 22s, sys: 7.84 s, total: 4min 30s
Wall time: 4min 36s


## NMF (non-negative matrix factorization) without bias term

In [32]:
%%time
# simple NMF model
nmf = NMF(n_factors=20, n_epochs = 30, biased=False) # initiate a NMF algorithm object
nmf.fit(training) # training on the trainset
pred_nmf = nmf.test(testing) # predict ratings for the testset
accuracy.rmse(pred_nmf) # compute RMSE score

# user and item matrix with latent features
mean = nmf.trainset.global_mean # global mean rating of the trainset
user_latent, item_latent = nmf.pu, nmf.qi
print(user_latent.shape, item_latent.shape)

RMSE: 0.9179
(53424, 20) (10000, 20)
CPU times: user 5min 9s, sys: 9.72 s, total: 5min 18s
Wall time: 5min 25s


## KNN

In [None]:
%%time
# simple NMF model
knn = KNNWithMean(k=40, min_k=1) # initiate a KNN algorithm object
knn.fit(training) # training on the trainset
pred_knn = knn.test(testing) # predict ratings for the testset
accuracy.rmse(pred_knn) # compute RMSE score

# user and item matrix with latent features
mean = knn.trainset.global_mean # global mean rating of the trainset
user_latent, item_latent = knn.pu, knn.qi
print(user_latent.shape, item_latent.shape)