# Latent Factor Analysis for Book Recommendation
using the Book Crossing Datset

## Preliminary

In [1]:
import pandas as pd
import numpy as np
import time

from numpy.linalg import norm

## Data Wrangling

In [2]:
raw_path = '../data/BX-Book-Ratings.csv'
raw = pd.read_csv(raw_path, 
                  sep=';',
                  header = 0,
                  names = ['user', 'isbn', 'rating'],
                  encoding = 'ISO-8859-1')
raw.shape

(1149780, 3)

In [3]:
bookfile_path = '../data/BX-Books.csv'
df_book = pd.read_csv( bookfile_path,
                      sep = ';',
                      header = 0,
                      error_bad_lines = False,
                      usecols = [0,1,2],
                      index_col = 0,
                      names = ['isbn', 'title', 'author'],
                      encoding = 'iso-8859-1'
                     )
df_book.head()

Unnamed: 0_level_0,title,author
isbn,Unnamed: 1_level_1,Unnamed: 2_level_1
195153448,Classical Mythology,Mark P. O. Morford
2005018,Clara Callan,Richard Bruce Wright
60973129,Decision in Normandy,Carlo D'Este
374157065,Flu: The Story of the Great Influenza Pandemic...,Gina Bari Kolata
393045218,The Mummies of Urumchi,E. J. W. Barber


### Remove books without informations

In [4]:
missing_books = raw[-raw['isbn'].isin(df_book.index)]
df_rating = raw[raw['isbn'].isin(df_book.index)]
print( f'Found {len(missing_books)} missing books: {"{:.2%}".format(len(missing_books)/len(raw))}')

Found 118605 missing books: 10.32%


### Remove Books and Users with limited ratings
we are setting a **threshold** here of minimum 10 ratings  
users or books that has fewer than 10 ratings are simply not worth the effort

In [5]:
books_ratings_count = df_rating.isbn.value_counts()
users_ratings_count = df_rating.user.value_counts()

In [6]:
rating_threshold = 10
books_tokeep = books_ratings_count[books_ratings_count>= rating_threshold]
users_tokeep = users_ratings_count[users_ratings_count>= rating_threshold]
print(f'Only {"{:.2%}".format(len(books_tokeep)/ len(books_ratings_count))} books will be kept')
print(f'Only {"{:.2%}".format(len(users_tokeep)/ len(users_ratings_count))} users will be kept')
      
print(f'There are {"{:,}".format(len(users_tokeep))} users and {"{:,}".format(len(books_tokeep))} books.')

Only 6.47% books will be kept
Only 12.75% users will be kept
There are 11,746 users and 17,479 books.


In [7]:
df_rating_clean = df_rating[ df_rating.isbn.isin(books_tokeep.index)]
df_rating_clean = df_rating_clean[ df_rating_clean.user.isin(users_tokeep.index)]
print( f'We have {"{:,}".format(len(df_rating))} ratings before applying threshold, now we have {"{:,}".format(len(df_rating_clean))}')

We have 1,031,175 ratings before applying threshold, now we have 428,085


### Remove Zeros

In [8]:
zeros = df_rating_clean[df_rating_clean.rating == 0]
print( f'Found {len(zeros)} records that is 0: {"{:.2%}".format(len(zeros)/len(df_rating_clean))}')

Found 284422 records that is 0: 66.44%


In [9]:
df_rating_clean = df_rating_clean[df_rating_clean.rating != 0]
df_rating_clean.shape

(143663, 3)

### Sparseness of our rating Matrix

In [10]:
count_users = len(df_rating_clean.user.unique())
count_items = len(df_rating_clean.isbn.unique())
count_ratings = len(df_rating_clean)
print(f'We have {count_users} users X {count_items} items and {count_ratings} ratings. Resulting in a rating matrix that has a density of {"{:.2%}".format(count_ratings/ (count_users * count_items))}')

We have 10807 users X 17091 items and 143663 ratings. Resulting in a rating matrix that has a density of 0.08%


### Create Sparse Matrix
Usage \#3 from [documentation](https://kite.com/python/docs/scipy.sparse.coo_matrix):  
`coo_matrix((data, (i, j)), [shape=(M, N)])`

In [11]:
df_R_org = df_rating_clean.pivot( index = 'user', columns = 'isbn', values = 'rating').fillna(0)

from scipy.sparse import coo_matrix
# data = df_rating_clean.rating.astype(float)
# i = df_rating_clean.user.astype('category').cat.codes.copy()
# j = df_rating_clean.isbn.astype('category').cat.codes.copy()

# R = coo_matrix(( data, (i,j)))
R = coo_matrix( df_R_org.values)
print(f'rating matrix R is of type {type(R)} and shape {R.shape}')

rating matrix R is of type <class 'scipy.sparse.coo.coo_matrix'> and shape (10807, 17091)


### Latent Factor Analysis with Biases

Much of the observed variation in rating values is due to effects associated with either users or items, known as biases or intercepts, independent of any interactions.

$b_{ui}$ = μ + $b_i$ + $b_u$

For example, suppose that you want a first-order estimate for user Joe’s rating of the movie Titanic. Now, say that the average rating over all movies, μ, is 3.7 stars. Furthermore, Titanic is better than an average movie, so it tends to be rated 0.5 stars above the average.On the other hand, Joe is a critical user, who tends to rate 0.3 stars lower than the average. Thus, the estimate for Titanic’s rating by Joe would be 3.9 stars (3.7 + 0.5 - 0.3).

Biases extend the Equation $r_{ui}$= $q_{i}p_{u}$ as follows:

$r_{ui}$= μ+ $b_i$ + $b_u$ + $q_{i}p_{u}$

The system learns by minimizing the squared error function: 

$min_{p,q,b}  \sum_  rui_{k}- \mu -b_{u}-b_{i}-p.q+ \lambda  \big( \|p \|^2+ \|q \|^2+\|b_{u} \|^2 +\|b_{i} \|^2\big)  
 $

In [12]:
## pow is power function pow(x,n) = x^n
## e is error

def error(R,P,Q,b,b_u,b_i,lamda=0.02):
    ratings = R.data
    rows = R.row
    cols = R.col
    e = 0   
    for ui in range(len(ratings)):
        rui=ratings[ui]
        i = rows[ui]
        j = cols[ui]
        r_cap=np.dot(P[i,:],Q[:,j])+b+b_u[i]+b_i[j]
        length_terms= [norm(P[i,:]),norm(Q[:,j]),b_u[i],b_i[j]]
        if rui>0:
            e= e + (rui-r_cap)**2+\
                lamda*(sum([i**2 for i in length_terms]))
    return e

In [15]:
## R coo-matrix of rating with user-books
##  K no. of latent factors
## lr is learning rate
## lamda is regualarization parameter

def SGD(R, K, lamda=0.02,steps=10, lr=0.001):
    s_time = time.time()

    M,N = R.shape
    P = np.random.normal(scale=1./K,size=(M,K))
    Q = np.random.normal(scale=1./K,size=(K,N))
    
    # Initialize the biases
    # b_u the bias of users
    # b_i the bias of books
    b_u = np.zeros(M)
    b_i = np.zeros(N)
    b = np.mean(R.data)
    
    params = [R,P,Q,b,b_u,b_i,lamda]
    
    rmse = np.sqrt(error(*params)/len(R.data))
    print(f"Initial RMSE {'{:.4f}'.format(rmse)}")
    
    for step in range(steps):
        for ui in range(len(R.data)):
            rui=R.data[ui]
            u = R.row[ui]
            i = R.col[ui]
            if rui>0:
                eui=rui-np.dot(P[u,:],Q[:,i])-b-b_i[i]-b_u[u]
                # Update biases
                b_u[u] += 2*lr * (eui - lamda * b_u[u])
                b_i[i] += 2*lr * (eui - lamda * b_i[i])
            
                P[u,:]=P[u,:]+2*lr*(eui*Q[:,i]-lamda*P[u,:])  
                Q[:,i]=Q[:,i]+2*lr*(eui*P[u,:]-lamda*Q[:,i])
          
        rmse = np.sqrt(error(*params)/len(R.data))
          
        if step%5==0:
            print(f"RMSE {'{:.4f}'.format(rmse)}")
        elif rmse <0.5:
            break
    
    e_time = time.time()
    print(f"Final RMSE {'{:.4f}'.format(rmse)}")
    print(f'SGD took {"{:.2f}".format((e_time - s_time)/ 60)} minutes.')
          
    return P,Q,b_u,b_i,b

In [26]:
(P,Q,b_u,b_i,b)=SGD(R,K=10,lr=0.002,lamda=0.02, steps=30)

Initial RMSE 1.7922
RMSE 1.6935
RMSE 1.5442
RMSE 1.4774
RMSE 1.4232
RMSE 1.3739
RMSE 1.3278
Final RMSE 1.2892
SGD took 2.89 minutes.


### Creating the Prediction Matrix

In [16]:
sgd_outputs = {
    'P': P,
    'Q': Q,
    'b_u': b_u,
    'b_i': b_i,
    'b': b
              }
print(f'Shapes of SGD function output:')
for var in sgd_outputs:
    print(f'{var}: {sgd_outputs[var].shape}')

Shapes of SGD function output:
P: (10807, 10)
Q: (10, 17091)
b_u: (10807,)
b_i: (17091,)
b: ()


In [17]:
m , _ = P.shape
_ , n = Q.shape
R_hat =np.dot(P, Q )+ b_u.reshape(m,1) + (b_i.T).reshape(1,n) + b
print(f'R_hat is type {type(R_hat)} and shape {R_hat.shape}.')

R_hat is type <class 'numpy.ndarray'> and shape (10807, 17091).


#### Turn R_hat from a Numpy array to a DF for easier lookup

In [18]:
df_R_hat = pd.DataFrame(data = R_hat, index = df_R_org.index, columns= df_R_org.columns)

### Util functions

In [19]:
def GetUserRatedBooks( user_id, return_rating = False):
    udata = df_rating_clean[ df_rating_clean.user == user_id]
    if return_rating:
        return {udata.iloc[i].isbn : udata.iloc[i].rating for i in range(len(udata))}
    else:
        return list(udata['isbn'])

def GetBookDetails( isbn, field = 'title'):
    return str(df_book[df_book.index == isbn][field].values)

In [20]:
uid = 276847

In [21]:
def RatingMatrixLookup( user_id , item_id, R_predicted):
    return R_predicted[ R_predicted.index == user_id][item_id]

In [22]:
RatingMatrixLookup( uid, '3404148576', df_R_hat)

user
276847    8.207228
Name: 3404148576, dtype: float64

In [23]:
RatingMatrixLookup( uid, '3404148576', df_R_org)

user
276847    8.0
Name: 3404148576, dtype: float64

In [24]:
def GetPredictions( userid, df_R_hat, top_n, verbose = False):
    if not df_R_hat.index.isin([userid]).any():
        print(f'User {userid} does exist in rating matrix')
        return None
    
    pred_df = df_R_hat[ df_R_hat.index == userid].T.sort_values(by = userid, ascending = False)
    
    old_books = GetUserRatedBooks(userid, return_rating = True)
    
    top_books = pred_df[ ~pred_df.index.isin(old_books)][:top_n].index
    
    if verbose:
        print(f'User {userid} already read:\n---------------------')
        for book in old_books:
            bname = GetBookDetails(book)
            print(f'{str(bname)}: {old_books[book]}')
        print(f'\nWe recommend:\n---------------------')
        for book in top_books:
            bname = GetBookDetails(book)
            print(f'{str(bname)}')

In [27]:
GetPredictions( uid, df_R_hat, 20, verbose = True)

User 276847 already read:
---------------------
['Nordermoor']: 8
['Der Kleine Hobbit']: 10
['Auf Ehre und Gewissen. Roman.']: 10
['Asche zu Asche.']: 7
['Denn sie betrÃ?Â¼gt man nicht.']: 8
['Mit dem KÃ?Â¼hlschrank durch Irland.']: 10
['Die HirnkÃ?Â¶nigin.']: 7
['Harry Potter und der Stein der Weisen']: 10
['Harry Potter und die Kammer des Schreckens']: 10
['Harry Potter und der Gefangene von Azkaban']: 10
['Harry Potter Und Der Feuerkelch']: 10

We recommend:
---------------------
["My Sister's Keeper : A Novel (Picoult, Jodi)"]
['Dilbert: A Book of Postcards']
['Harry Potter and the Chamber of Secrets Postcard Book']
['84 Charing Cross Road']
["The Time Traveler's Wife"]
['Fox in Socks (I Can Read It All by Myself Beginner Books)']
['Calvin and Hobbes']
['The Two Towers (The Lord of the Rings, Part 2)']
['Weirdos From Another Planet!']
['The Return of the King (The Lord of the Rings, Part 3)']
['Lonesome Dove']
['Where the Sidewalk Ends : Poems and Drawings']
['A Tree Grows in Brook