# Latent Factor Analysis for Book Recommendation
using the Book Crossing Dataset

## Preliminary

In [125]:
import pandas as pd
import numpy as np

from surprise import SVDpp
from surprise import Dataset, Reader, SVD, evaluate
from surprise import accuracy
from surprise.model_selection import train_test_split
from surprise.model_selection import GridSearchCV
from surprise.model_selection import cross_validate

from sklearn.decomposition import FactorAnalysis

from scipy.sparse.linalg import svds

## Data Wrangling

In [79]:
# Ratings dump: ';'-separated text, read with ISO-8859-1 encoding.
raw_path = '../data/BX-Book-Ratings.csv'

# header=0 combined with names= replaces the file's own header row
# with the shorter column labels used throughout this notebook.
raw = pd.read_csv(
    raw_path,
    sep=';',
    header=0,
    names=['user', 'isbn', 'rating'],
    encoding='ISO-8859-1',
)
raw.shape

(1149780, 3)

In [81]:
# Book metadata: only the first three columns are read and relabelled
# as isbn / title / author, with isbn (column 0) used as the index.
bookfile_path = '../data/BX-Books.csv'

# error_bad_lines=False drops malformed rows instead of raising.
# NOTE(review): newer pandas releases replace this flag with
# on_bad_lines='skip' -- confirm against the installed pandas version.
df_book = pd.read_csv(
    bookfile_path,
    sep=';',
    header=0,
    error_bad_lines=False,
    usecols=[0, 1, 2],
    index_col=0,
    names=['isbn', 'title', 'author'],
    encoding='iso-8859-1',
)
df_book.head()

Unnamed: 0_level_0,title,author
isbn,Unnamed: 1_level_1,Unnamed: 2_level_1
195153448,Classical Mythology,Mark P. O. Morford
2005018,Clara Callan,Richard Bruce Wright
60973129,Decision in Normandy,Carlo D'Este
374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata
393045218,The Mummies of Urumchi,E. J. W. Barber


### Remove books without information

In [82]:
# Split the ratings by whether their ISBN appears in df_book's index.
# The mask is computed once and inverted with `~` (the boolean NOT operator)
# rather than unary minus, which is not a logical negation in general.
has_book_info = raw['isbn'].isin(df_book.index)
missing_books = raw[~has_book_info]
df_rating = raw[has_book_info]
print(f'Found {len(missing_books)} missing books: {len(missing_books) / len(raw):.2%}')

Found 118605 missing books: 10.32%


### Remove Books and Users with limited ratings
We set a **threshold** of at least 10 ratings here.  
Users or books that have fewer than 10 ratings are simply not worth the effort.

In [83]:
# How many ratings each book (by ISBN) and each user has.
books_ratings_count = df_rating['isbn'].value_counts()
users_ratings_count = df_rating['user'].value_counts()

# Peek at the most active users.
users_ratings_count.head()

11676     11144
198711     6456
153662     5814
98391      5779
35859      5646
Name: user, dtype: int64

In [103]:
# Minimum number of ratings a book or a user needs in order to be kept.
rating_threshold = 10

books_tokeep = books_ratings_count[books_ratings_count >= rating_threshold]
users_tokeep = users_ratings_count[users_ratings_count >= rating_threshold]

# Fraction of books / users that meet the threshold.
book_share = len(books_tokeep) / len(books_ratings_count)
user_share = len(users_tokeep) / len(users_ratings_count)
print(f'Only {book_share:.2%} books will be kept')
print(f'Only {user_share:.2%} users will be kept')

print(f'There are {len(users_tokeep):,} users and {len(books_tokeep):,} books.')

Only 6.47% books will be kept
Only 12.75% users will be kept
There are 11,746 users and 17,479 books.


In [95]:
# Keep ratings of retained books first, then of retained users.
keep_book = df_rating['isbn'].isin(books_tokeep.index)
df_rating_clean = df_rating[keep_book]

keep_user = df_rating_clean['user'].isin(users_tokeep.index)
df_rating_clean = df_rating_clean[keep_user]

print(f'We have {len(df_rating):,} ratings before applying threshold, now we have {len(df_rating_clean):,}')

We have 1,031,175 ratings before applying threshold, now we have 428,085


### Remove Zeros

In [96]:
# Rows whose rating is exactly 0.
# NOTE(review): presumably 0 marks an implicit / absent rating rather than a
# score (the Surprise reader below uses a 1-10 scale) -- confirm against the
# dataset documentation.
is_zero = df_rating_clean['rating'] == 0
zeros = df_rating_clean[is_zero]
print(f'Found {len(zeros)} records that is 0: {len(zeros) / len(df_rating_clean):.2%}')

Found 284422 records that is 0: 66.44%


In [97]:
# Drop the zero-valued ratings, keeping only non-zero scores.
nonzero_rating = df_rating_clean['rating'] != 0
df_rating_clean = df_rating_clean[nonzero_rating]
df_rating_clean.shape

(143663, 3)

### Create Sparse Matrix
Usage \#3 from [documentation](https://kite.com/python/docs/scipy.sparse.coo_matrix):  
`coo_matrix((data, (i, j)), [shape=(M, N)])`

In [99]:
from scipy.sparse import coo_matrix

# Matrix values: the ratings, cast to float.
data = df_rating_clean['rating'].astype(float)

# Row / column positions: categorical codes map every distinct user and
# every distinct ISBN to an integer index.
i = df_rating_clean['user'].astype('category').cat.codes
j = df_rating_clean['isbn'].astype('category').cat.codes

# coo_matrix((data, (row, col))) -- shape is inferred from the largest indices.
R = coo_matrix((data, (i, j)))

In [101]:
R.shape

(10807, 17091)

## Models

### Using SK-Learn Factor Analysis
**Unfortunately this doesn't support a Sparse Matrix**

In [108]:
# Three latent factors; the seed is fixed so the fit is repeatable.
transformer = FactorAnalysis(n_components=3, random_state=420)

# The sparse matrix is densified with toarray() before being handed over.
R_transformed = transformer.fit_transform(
    R.toarray()
)
R_transformed.shape

(10807, 3)

### Using Scipy's SVDs

In [133]:
# Dense user x book table; user/book pairs without a rating are filled with 0.
R_matrix = (
    df_rating_clean
    .pivot(index='user', columns='isbn', values='rating')
    .fillna(0)
)

n_users = len(df_rating_clean['user'].unique())
n_books = len(df_rating_clean['isbn'].unique())
print(f'There are {n_users} users and {n_books} books.')

There are 10807 users and 17091 books.


We need to normalize the matrix (subtract each user's mean rating from their row) before decomposition.

In [126]:
# DataFrame.as_matrix() is deprecated and removed in pandas 1.0;
# .values returns the same underlying ndarray and works on old and new pandas.
R_np = R_matrix.values

# Row-wise mean, one value per user.
# NOTE: R_matrix was built with fillna(0), so unrated books count as 0 here;
# this is the mean over ALL book columns, not only over the books a user rated.
mean_rating = np.mean(R_np, axis=1)

  """Entry point for launching an IPython kernel.


In [134]:
# Turn the per-user means into a column vector so that they broadcast across
# the book columns, then subtract them from every row.
user_means_col = mean_rating[:, np.newaxis]
R_normal = R_np - user_means_col
R_normal.shape

(10807, 17091)

In [135]:
# Truncated SVD of the centred matrix, keeping k=50 singular values/vectors.
U, sigma, Vt = svds(R_normal, k=50)

### Using Surprise's SVD

In [118]:
# Ratings are read on a 1-10 scale.
s_reader = Reader(rating_scale=(1, 10))

# Surprise expects the columns in this order: user, item, rating.
rating_columns = ['user', 'isbn', 'rating']
s_data = Dataset.load_from_df(df_rating_clean[rating_columns], s_reader)

# Partition the dataset into 10 folds for evaluation.
# NOTE(review): Dataset.split() / evaluate() look like the older Surprise API;
# newer releases use surprise.model_selection.cross_validate -- confirm
# against the installed version.
s_data.split(n_folds=10)

svd = SVD()

# Report the RMSE of SVD on every fold.
evaluate(svd, s_data, measures=['RMSE'])

Evaluating RMSE of algorithm SVD.





------------
Fold 1
RMSE: 1.5607
------------
Fold 2
RMSE: 1.5748
------------
Fold 3
RMSE: 1.5664
------------
Fold 4
RMSE: 1.5634
------------
Fold 5
RMSE: 1.5669
------------
Fold 6
RMSE: 1.5840
------------
Fold 7
RMSE: 1.5673
------------
Fold 8
RMSE: 1.5953
------------
Fold 9
RMSE: 1.5752
------------
Fold 10
RMSE: 1.5806
------------
------------
Mean RMSE: 1.5734
------------
------------


CaseInsensitiveDefaultDict(list,
                           {'rmse': [1.560701373728127,
                             1.5748004999455878,
                             1.5664122786319488,
                             1.5633606660840293,
                             1.5668599999133244,
                             1.5839884522036791,
                             1.5672826771535835,
                             1.5952544933233372,
                             1.5752110639703307,
                             1.5806040241882806]})

In [109]:
s_reader = Reader(rating_scale=(1, 10))
# Surprise expects the columns in this order: user, item, rating.
s_data = Dataset.load_from_df(df_rating_clean[['user', 'isbn', 'rating']], s_reader)

# Hold out 30% of the ratings for testing. The seed is fixed so the same
# ratings land in the test set on every run (the split was unseeded before).
trainset, testset = train_test_split(s_data, test_size=0.3, random_state=420)


In [112]:
# Train an SVD model with explicit hyper-parameters on the training split.
svd_params = dict(n_factors=3, n_epochs=100, lr_all=0.005, reg_all=0.1)
algo = SVD(**svd_params)
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x152bd0f60>

In [113]:
# Predict every held-out rating, then report the RMSE on the test split.
test_pred = algo.test(testset)

print("SVD : Test Set")
accuracy.rmse(
    test_pred,
    verbose=True,
)

SVD : Test Set
RMSE: 1.6507


1.6506542083687001

## Code that doesn't work

In [10]:
# ----- SVD ----- #

# Hyper-parameter grid: 4 * 3 * 4 * 3 = 144 combinations, each fitted on
# 3 folds (432 fits in total), so this cell is expensive to run.
param_grid = {
    'n_factors': [110, 120, 140, 160],
    'n_epochs': [90, 100, 110],
    'lr_all': [0.001, 0.003, 0.005, 0.008],
    'reg_all': [0.08, 0.1, 0.15],
}
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3)

# Fit on the Surprise Dataset (s_data). The global `data` is the float rating
# Series created for the sparse matrix, not a Dataset, so it cannot be used here.
gs.fit(s_data)

# Estimator configured with the best-RMSE parameter combination.
algo = gs.best_estimator['rmse']



KeyboardInterrupt: 

In [None]:
# Best cross-validated RMSE and the parameters that achieved it.
print(gs.best_score['rmse'])
print(gs.best_params['rmse'])

# Cross-validate the best estimator on the Surprise Dataset (s_data);
# the global `data` is a pandas Series of ratings, not a Dataset.
cross_validate(algo, s_data, measures=['RMSE', 'MAE'], cv=5, verbose=True)

# Refit with explicit parameters on the training split and score the hold-out set.
algo = SVD(n_factors=160, n_epochs=100, lr_all=0.005, reg_all=0.1)
algo.fit(trainset)
test_pred = algo.test(testset)
print("SVD : Test Set")
accuracy.rmse(test_pred, verbose=True)


In [None]:
# ----- SVD++ ----- #

# Hyper-parameter grid: 3 * 3 * 4 * 3 = 108 combinations, each fitted on 3 folds.
param_grid = {
    'n_factors': [20, 30, 40],
    'n_epochs': [20, 30, 40],
    'lr_all': [0.001, 0.003, 0.005, 0.008],
    'reg_all': [0.08, 0.1, 0.15],
}
gs = GridSearchCV(SVDpp, param_grid, measures=['rmse', 'mae'], cv=3)

# Fit on the Surprise Dataset (s_data). The global `data` is the float rating
# Series created for the sparse matrix, not a Dataset.
gs.fit(s_data)

# Estimator configured with the best-RMSE parameter combination.
algo = gs.best_estimator['rmse']
print(gs.best_score['rmse'])
print(gs.best_params['rmse'])
cross_validate(algo, s_data, measures=['RMSE', 'MAE'], cv=5, verbose=True)



In [None]:
# Fit SVD++ with the chosen parameters on the training split.
# The former `algo = SVDpp()` line that followed was removed: it replaced this
# configured model with a default one before fitting, discarding the parameters.
algo = SVDpp(n_factors=40, n_epochs=40, lr_all=0.008, reg_all=0.1)
algo.fit(trainset)

# Score the held-out ratings.
test_pred = algo.test(testset)
print("SVD++ : Test Set")
accuracy.rmse(test_pred, verbose=True)