# Latent Factor Analysis for Book Recommendation
using the [Book Crossing Datset](http://www2.informatik.uni-freiburg.de/~cziegler/BX/) and [**surprise**](http://surpriselib.com/)

## Preliminary

In [35]:
import pandas as pd
import numpy as np
import time

from surprise import SVDpp
from surprise import Dataset, Reader, SVD, evaluate
from surprise import accuracy
from surprise.model_selection import train_test_split
from surprise.model_selection import GridSearchCV
from surprise.model_selection import cross_validate

## Data Wrangling

In [43]:
raw_path = '../data/BX-Book-Ratings.csv'
raw = pd.read_csv(raw_path, 
                  sep=';',
                  header = 0,
                  names = ['user', 'isbn', 'rating'],
                  encoding = 'ISO-8859-1')
raw.head()

Unnamed: 0,user,isbn,rating
0,276725,034545104X,0
1,276726,0155061224,5
2,276727,0446520802,0
3,276729,052165615X,3
4,276729,0521795028,6


In [3]:
bookfile_path = '../data/BX-Books.csv'
df_book = pd.read_csv( bookfile_path,
                      sep = ';',
                      header = 0,
                      error_bad_lines = False,
                      usecols = [0,1,2],
                      index_col = 0,
                      names = ['isbn', 'title', 'author'],
                      encoding = 'iso-8859-1'
                     )
df_book.head()

Unnamed: 0_level_0,title,author
isbn,Unnamed: 1_level_1,Unnamed: 2_level_1
195153448,Classical Mythology,Mark P. O. Morford
2005018,Clara Callan,Richard Bruce Wright
60973129,Decision in Normandy,Carlo D'Este
374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata
393045218,The Mummies of Urumchi,E. J. W. Barber


#### Build util function GetUserRatedBooks() and GetBookDetails()

In [152]:
def GetUserRatedBooks( user_id, return_rating = False):
    udata = df_rating_clean[ df_rating_clean.user == user_id]
    if return_rating:
        return {udata.iloc[i].isbn : udata.iloc[i].rating for i in range(len(udata))}
    else:
        return list(udata['isbn'])

def GetBookDetails( isbn, field = 'title'):
    return str(df_book[df_book.index == isbn][field].values)

In [155]:
GetUserRatedBooks(276847, True)

{'3404148576': 8,
 '3423071516': 10,
 '3442413508': 10,
 '3442437717': 7,
 '3442444020': 8,
 '3442446414': 10,
 '3442448530': 7,
 '3551551677': 10,
 '3551551685': 10,
 '3551551693': 10,
 '3551551936': 10}

In [154]:
GetBookDetails('0195153448')

"['Classical Mythology']"

### Remove books without informations

In [60]:
missing_books = raw[-raw['isbn'].isin(df_book.index)]
df_rating = raw[raw['isbn'].isin(df_book.index)]
print( f'Found {len(missing_books)} missing books: {"{:.2%}".format(len(missing_books)/len(raw))}')

Found 118605 missing books: 10.32%


### Remove Books and Users with limited ratings
we are setting a **threshold** here of minimum 10 ratings  
users or books that has fewer than 10 ratings are simply not worth the effort

In [61]:
books_ratings_count = df_rating.isbn.value_counts()
users_ratings_count = df_rating.user.value_counts()

In [62]:
rating_threshold = 10
books_tokeep = books_ratings_count[books_ratings_count>= rating_threshold]
users_tokeep = users_ratings_count[users_ratings_count>= rating_threshold]
print(f'Only {"{:.2%}".format(len(books_tokeep)/ len(books_ratings_count))} books will be kept')
print(f'Only {"{:.2%}".format(len(users_tokeep)/ len(users_ratings_count))} users will be kept')
      
print(f'There are {"{:,}".format(len(users_tokeep))} users and {"{:,}".format(len(books_tokeep))} books.')

Only 6.47% books will be kept
Only 12.75% users will be kept
There are 11,746 users and 17,479 books.


In [63]:
df_rating_clean = df_rating[ df_rating.isbn.isin(books_tokeep.index)]
df_rating_clean = df_rating_clean[ df_rating_clean.user.isin(users_tokeep.index)]
print( f'We have {"{:,}".format(len(df_rating))} ratings before applying threshold, now we have {"{:,}".format(len(df_rating_clean))}')

We have 1,031,175 ratings before applying threshold, now we have 428,085


### Remove Zeros

In [65]:
zeros = df_rating_clean[df_rating_clean.rating == 0]
print( f'Found {len(zeros)} records that is 0: {"{:.2%}".format(len(zeros)/len(df_rating_clean))}')

Found 284422 records that is 0: 66.44%


In [66]:
df_rating_clean = df_rating_clean[df_rating_clean.rating != 0]
df_rating_clean.shape

(143663, 3)

### Sparseness of our Rating Matrix

In [71]:
count_users = len(df_rating_clean.user.unique())
count_items = len(df_rating_clean.isbn.unique())
count_ratings = len(df_rating_clean)
print(f'We have {count_users} users X {count_items} items and {count_ratings} ratings. Resulting in a rating matrix that has a density of {"{:.2%}".format(count_ratings/ (count_users * count_items))}')

We have 10807 users X 17091 items and 143663 ratings. Resulting in a rating matrix that has a density of 0.08%


## Models

#### Model Parameters

In [166]:
n_factors = 10
n_epochs = 30
lr = 0.002
random_state = 420
verbose = False

test_userid = 276847

#### Loading the dataset

In [69]:
s_reader = Reader( rating_scale= (1,10))
# data have to follow this structure:
#   user: item: rating
s_data = Dataset.load_from_df(df_rating_clean[['user', 'isbn', 'rating']], s_reader)

### Using Surprise's SVD

In [165]:
# Train-Test Split
trainset, testset = train_test_split(s_data, test_size=0.1)

# define the model
svd = SVD(n_factors, n_epochs, lr_all = lr, random_state= random_state, verbose= verbose)

# Fit + Test-Predict
s_time = time.time()
svd.fit(trainset)
e_time = time.time()
print(f'Model building took {"{:.2f}".format((e_time - s_time)/60)} minutes')

test_predict = svd.test(testset)
print("Model accuracy using SVD : Test Set")
accuracy.rmse(test_predict, verbose=True)

Model building took 0.09 minutes
Model accuracy using SVD : Test Set
RMSE: 1.6423


1.6422678447254193

In [143]:
def GetPredictions( trained_model, user_id, item_ids, org_df, verbose = False, top_n = None):
    if not org_df.user.isin([user_id]).any():
        print(f'{user_id} not in trained dataset. Cannot be predicted.')
        return None
    
    pred_df = org_df[ org_df.user == user_id]
    if verbose:
        s_time = time.time()
        print(f'user {user_id} had {len(pred_df)} ratings')

    compute_count = 1
    for item in item_ids:
        if not pred_df.isbn.isin( [item] ).any():
            ipred = trained_model.predict( uid= user_id, iid = item)
            ipred_data = {'user': user_id, 'isbn': item, 'rating': ipred.est}
            pred_df = pred_df.append( ipred_data, ignore_index = True)
            compute_count +=1
    
    if verbose:
        e_time = time.time()
        print(f'computed prediction for {compute_count} items.\n Took {e_time - s_time} seconds.')
        
    pred_df = pred_df.sort_values( by = ['rating'], ascending = False)
    
    if top_n:
        old_books = GetUserRatedBooks(user_id, return_rating= False)
        return pred_df[-pred_df.isbn.isin(old_books)][:top_n]
    else:
        return pred_df

#### Let's Make a Prediction

In [145]:
user_pred = GetPredictions( svd , test_userid, df_rating_clean.isbn.unique(), df_rating_clean, 
                          top_n = 10, verbose = True)

user 276847 had 11 ratings
computed prediction for 17081 items.
 Took 47.158957958221436 seconds.


In [161]:
old_books = GetUserRatedBooks(test_userid, return_rating= True)
print(f'User {test_userid} already read:\n---------------------')
for ibook in old_books:
    bname = GetBookDetails(ibook)
    print(f'{str(bname)}: {old_books[ibook]}')

print(f'\nWe recommend:\n---------------------')
for ibook in user_pred.isbn:
    bname = GetBookDetails(ibook)
    print(f'{str(bname)}')

User 276847 already read:
---------------------
['Nordermoor']: 8
['Der Kleine Hobbit']: 10
['Auf Ehre und Gewissen. Roman.']: 10
['Asche zu Asche.']: 7
['Denn sie betrÃ?Â¼gt man nicht.']: 8
['Mit dem KÃ?Â¼hlschrank durch Irland.']: 10
['Die HirnkÃ?Â¶nigin.']: 7
['Harry Potter und der Stein der Weisen']: 10
['Harry Potter und die Kammer des Schreckens']: 10
['Harry Potter und der Gefangene von Azkaban']: 10
['Harry Potter Und Der Feuerkelch']: 10

We recommend:
---------------------
['The Return of the King (The Lord of the Rings, Part 3)']
['To Kill a Mockingbird']
['Harry Potter and the Order of the Phoenix (Book 5)']
['Harry Potter and the Goblet of Fire (Book 4)']
["Harry Potter and the Sorcerer's Stone (Harry Potter (Paperback))"]
['Harry Potter and the Prisoner of Azkaban (Book 3)']
['The Fellowship of the Ring (The Lord of the Rings, Part 1)']
["Harry Potter and the Sorcerer's Stone (Book 1)"]
['Harry Potter and the Chamber of Secrets (Book 2)']
['Harry Potter and the Prisoner o

### Using Surprise SVD++

In [167]:
# define the model
svd_pp = SVDpp(n_factors, n_epochs, lr_all = lr, random_state= random_state, verbose= verbose)

# Fit + Test-Predict
s_time = time.time()
svd_pp.fit(trainset)
e_time = time.time()
print(f'Model building took {"{:.2f}".format((e_time - s_time)/60)} minutes')

test_predict_pp = svd_pp.test(testset)
print("Model accuracy using SVD : Test Set")
accuracy.rmse(test_predict_pp, verbose=True)

Model building took 5.25 minutes
Model accuracy using SVD : Test Set
RMSE: 1.5683


1.5682822819369824

### Make Prediction Again

In [170]:
user_pred_pp = GetPredictions( svd_pp , test_userid, df_rating_clean.isbn.unique(), df_rating_clean, 
                          top_n = 10, verbose = True)

user 276847 had 11 ratings
computed prediction for 17081 items.
 Took 48.51417803764343 seconds.


In [171]:
old_books = GetUserRatedBooks(test_userid, return_rating= True)
print(f'User {test_userid} already read:\n---------------------')
for ibook in old_books:
    bname = GetBookDetails(ibook)
    print(f'{str(bname)}: {old_books[ibook]}')

print(f'\nWe recommend:\n---------------------')
for ibook in user_pred_pp.isbn:
    bname = GetBookDetails(ibook)
    print(f'{str(bname)}')

User 276847 already read:
---------------------
['Nordermoor']: 8
['Der Kleine Hobbit']: 10
['Auf Ehre und Gewissen. Roman.']: 10
['Asche zu Asche.']: 7
['Denn sie betrÃ?Â¼gt man nicht.']: 8
['Mit dem KÃ?Â¼hlschrank durch Irland.']: 10
['Die HirnkÃ?Â¶nigin.']: 7
['Harry Potter und der Stein der Weisen']: 10
['Harry Potter und die Kammer des Schreckens']: 10
['Harry Potter und der Gefangene von Azkaban']: 10
['Harry Potter Und Der Feuerkelch']: 10

We recommend:
---------------------
['Harry Potter and the Goblet of Fire (Book 4)']
['Weirdos From Another Planet!']
['The Return of the King (The Lord of the Rings, Part 3)']
['Dune (Remembering Tomorrow)']
["The Time Traveler's Wife"]
['Harry Potter and the Chamber of Secrets Postcard Book']
['To Kill a Mockingbird']
["My Sister's Keeper : A Novel (Picoult, Jodi)"]
['The Little Prince']
["Charlotte's Web (Trophy Newbery)"]
