# Latent Factor Analysis (LFA) using SGD
- Load data
- Preprocessing  
    - Drop 0
    - Convert to Sparse
- Define error function
- Use SGD to minimize the error function
- Prediction for users already in the database
- References

# Libraries

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import time

from numpy.linalg import norm 
from scipy.sparse import coo_matrix

# Load Data

In [None]:
def loadingData(dataFile,nrows=None):
    """Read the ratings CSV into a DataFrame.

    The input is parsed as a semicolon-separated, ISO-8859-1 encoded CSV.
    Its first line is treated as a header row and replaced by the column
    names ``user``, ``isbn`` and ``rating``.

    Parameters
    ----------
    dataFile : path or buffer handed straight to ``pandas.read_csv``.
    nrows : number of data rows to read; ``None`` reads the whole file.

    Returns
    -------
    pandas.DataFrame with columns ``user``, ``isbn`` and ``rating``.
    """
    read_options = {
        "sep": ";",
        "header": 0,  # the file's own header line is consumed, then renamed
        "names": ["user", "isbn", "rating"],
        "encoding": "iso-8859-1",
        "nrows": nrows,
    }
    return pd.read_csv(dataFile, **read_options)

# Preprocess


In [None]:
# does not work on the whole data set because the dense pivot table is too large
# R = df.pivot(index='user',columns='isbn',values='rating')

In [None]:
def covertToSparse(df):
    """Build a sparse user x item rating matrix from a ratings DataFrame.

    Users and ISBNs are mapped to consecutive integer codes (in sorted
    category order) because a sparse matrix is indexed by integer positions,
    not by labels. Row ``r`` of the result corresponds to the ``r``-th user
    category and column ``c`` to the ``c``-th ISBN category.

    Unlike the earlier version, the input DataFrame is NOT modified: the
    casts are done on temporaries. This avoids pandas' SettingWithCopy
    warning when ``df`` is a filtered slice. Categories that no longer occur
    in ``df`` (e.g. a column that was already categorical before rows were
    filtered out) are dropped, so the matrix has exactly one row per user and
    one column per book present in ``df``.

    Parameters
    ----------
    df : pandas.DataFrame with columns ``user``, ``isbn`` and ``rating``.

    Returns
    -------
    scipy.sparse.coo_matrix of floats with shape
    ``(number of distinct users, number of distinct ISBNs)``.
    """
    ratings = df['rating'].astype(float).to_numpy()

    # Work on copies so the caller's columns keep their original dtypes.
    users = df['user'].astype("category").cat.remove_unused_categories()
    isbns = df['isbn'].astype("category").cat.remove_unused_categories()

    # Category codes are the integer row/column positions.
    user_code = users.cat.codes.to_numpy()
    isbn_code = isbns.cat.codes.to_numpy()

    # Pass the shape explicitly instead of letting it be inferred from the
    # largest index.
    shape = (len(users.cat.categories), len(isbns.cat.categories))
    R = coo_matrix((ratings, (user_code, isbn_code)), shape=shape)

    return R

In [None]:
def filterBooks(df, book_threshold = 0, user_threshold = 0):
    """Keep only ratings of sufficiently rated books by sufficiently active users.

    Both counts are taken on the unfiltered input, so a row is kept when its
    book has at least ``book_threshold`` rows in ``df`` AND its user has at
    least ``user_threshold`` rows in ``df``. A short size report is printed.

    Parameters
    ----------
    df : pandas.DataFrame with at least the columns ``isbn`` and ``user``.
    book_threshold : minimum number of rows a book needs to be kept.
    user_threshold : minimum number of rows a user needs to be kept.

    Returns
    -------
    pandas.DataFrame containing the surviving rows of ``df`` (original index
    labels preserved).
    """
    books_ratings_count = df.isbn.value_counts() # number of ratings per book
    users_ratings_count = df.user.value_counts() # number of ratings per user

    # labels that meet their threshold
    books_tokeep = books_ratings_count[books_ratings_count >= book_threshold].index
    users_tokeep = users_ratings_count[users_ratings_count >= user_threshold].index

    # single boolean mask; equivalent to filtering by book and then by user
    keep = df.isbn.isin(books_tokeep) & df.user.isin(users_tokeep)
    df_clean = df[keep]

    # Both frames have the same columns, so the size ratio is the row ratio.
    # An empty input has nothing to remove; report 100% instead of dividing
    # by zero.
    pc = len(df_clean) / len(df) * 100 if len(df) else 100.0

    print(f"Book, User Threshold: {(book_threshold, user_threshold )}")
    print(f"INPUT SIZE: {df.shape}")
    print(f"OUTPUT SIZE: {df_clean.shape}")

    print(f"Data size reduced to: {pc:.2f}%")

    return df_clean

# Error Function

![error function](nb_img/error.png)

In [None]:
def cal_error(R,P,Q,lambda_=0.02):
    """Regularised squared error of the biased factor model over stored ratings.

    For every stored entry ``(u, i, rui)`` of ``R`` with ``rui > 0`` the
    prediction is

        rui_hat = P[u,:] @ Q[:,i] + mean + bu + bi

    where ``mean`` is the mean of all stored ratings in ``R``, ``bu`` is the
    mean of the latent row ``P[u,:]`` and ``bi`` is the mean of the latent
    column ``Q[:,i]``. The returned value is the sum over those entries of

        (rui - rui_hat)**2
        + lambda_ * (bu**2 + bi**2 + ||P[u,:]||**2 + ||Q[:,i]||**2)

    Parameters
    ----------
    R : scipy.sparse COO matrix (uses ``R.data``, ``R.row`` and ``R.col``).
    P : array indexed as ``P[u,:]``, one latent row per user.
    Q : array indexed as ``Q[:,i]``, one latent column per item.
    lambda_ : regularisation weight.

    Returns
    -------
    The accumulated error; ``0`` when ``R`` stores no positive rating.
    """
    ratings = R.data
    if len(ratings) == 0:
        # nothing to score; also avoids taking the mean of an empty array
        return 0

    # The global mean does not change inside the loop, so compute it once
    # instead of once per rating.
    mean = np.mean(ratings)

    error = 0
    for rui, u, i in zip(ratings, R.row, R.col):
        # explicitly stored zeros (and NaNs) are not treated as ratings
        if not rui > 0:
            continue

        p_u = P[u,:]
        q_i = Q[:,i]
        bu = np.mean(p_u)  # bias term derived from the user's latent row
        bi = np.mean(q_i)  # bias term derived from the item's latent column

        rui_hat = p_u @ q_i + mean + bu + bi
        penalty = bu**2 + bi**2 + norm(p_u,2)**2 + norm(q_i,2)**2
        error = error + (rui - rui_hat)**2 + lambda_ * penalty

    return error

# SGD Function
![sgd](nb_img/sgd.png)

In [None]:
def SGD_bias(R,K=5,lambda_=0.02,steps=10,gamma=0.001,rmse_target=1,
             initialize=True,P_hat=None,Q_hat=None,
             verbose=False):
    """Fit the biased latent-factor model ``R ~ P @ Q + bias`` by SGD.

    Each stored rating ``rui`` at ``(u, i)`` is predicted as
    ``P[u,:] @ Q[:,i] + mean + mean(P[u,:]) + mean(Q[:,i])`` where ``mean``
    is the mean of all stored ratings. After every pass over the ratings the
    value ``sqrt(cal_error(R, P, Q, lambda_) / number_of_ratings)`` is
    computed and reported as "RMSE". Training stops early once it drops
    below ``rmse_target``. Note that ``cal_error`` includes the
    regularisation penalty, so this figure is the root of the mean
    regularised objective rather than a pure prediction RMSE.

    Parameters
    ----------
    R : scipy.sparse COO matrix of ratings (``R.data``, ``R.row``, ``R.col``
        and ``R.shape`` are used).
    K : number of latent factors; only used for random initialisation.
    lambda_ : regularisation weight.
    steps : maximum number of passes over the stored ratings.
    gamma : learning rate.
    rmse_target : early-stopping threshold for the reported error.
    initialize : currently unused; kept so existing calls keep working.
    P_hat, Q_hat : optional warm-start factors. They must be given together
        and are copied, so the caller's arrays are not modified.
    verbose : when true, print the error after every pass.

    Returns
    -------
    (P, Q, rmse) : the fitted factor matrices and the last reported error.

    Raises
    ------
    ValueError : if only one of ``P_hat``/``Q_hat`` is given, or if ``R``
        stores no ratings.
    """
    # `is None` is required here: comparing a numpy array with `== None`
    # yields an element-wise array whose truth value is ambiguous.
    if (P_hat is None) != (Q_hat is None):
        raise ValueError("P_hat and Q_hat must be provided together")

    n_ratings = len(R.data)
    if n_ratings == 0:
        raise ValueError("R contains no stored ratings")

    if P_hat is None:
        # initialise matrix P and Q
        M,N = R.shape
        P = np.random.rand(M,K)
        Q = np.random.rand(K,N)
    else:
        # load pretrained weights, used for predicting new user;
        # float copies keep integer inputs trainable and the inputs untouched
        P = np.array(P_hat, dtype=float)
        Q = np.array(Q_hat, dtype=float)

    # mean score of all ratings: constant during training, so computed once
    mean = np.mean(R.data)

    #initial RMSE
    rmse = np.sqrt(cal_error(R,P,Q,lambda_)/n_ratings)
    print(f"STARTING RMSE: {rmse:.2f}")

    steps_done = 0  # stays 0 when steps == 0, so the final print is always valid
    for step in range(steps):

        for rui, u, i in zip(R.data, R.row, R.col):
            # Snapshot the user row: P[u,:] is a view, and the item update
            # below must use the values from before the user update.
            p_u = P[u,:].copy()
            q_i = Q[:,i]

            # bias terms derived from the current latent vectors
            bu = np.mean(p_u)
            bi = np.mean(q_i)

            rui_hat = p_u @ q_i + mean + bu + bi
            eui = rui - rui_hat

            # update P,Q matrix (both from the pre-update values)
            P[u,:] = p_u + gamma * (eui * q_i - lambda_ * p_u)
            Q[:,i] = q_i + gamma * (eui * p_u - lambda_ * q_i)

        steps_done = step + 1
        rmse = np.sqrt(cal_error(R,P,Q,lambda_)/n_ratings)

        if verbose:
            print(f"STEP NO: {steps_done} - CURRENT RMSE: {rmse:.2f}")

        if rmse < rmse_target:
            break

    print(f"STEP NO: {steps_done} - FINAL RMSE: {rmse:.2f}")

    return P,Q,rmse

# Production

In [None]:
# End-to-end preparation: load the CSV, filter sparse books/users, drop the
# 0 ratings and build the sparse rating matrix R.
starttime = time.time()

nrows = None
# nrows= 5000
# Forward slash instead of a backslash: "\B" is an invalid escape sequence in
# a normal string literal, and a backslash is a path separator only on
# Windows. The forward slash works on every platform.
dataFile='data/BX-Book-Ratings.csv'
df = loadingData(dataFile,nrows)

#filtering
# NOTE: the thresholds are applied before the 0 ratings are dropped, so rows
# with rating 0 still count towards a book's / user's number of ratings.
book_threshold,user_theshold = 10,10
df = filterBooks(df,book_threshold,user_theshold)

# block for dropping 0 values
print(f"DF size: {df.shape}")
df = df[df.rating!=0]
print(f"DF size after dropping 0: {df.shape}")

# convert df to sparse matrix
R = covertToSparse(df)
print(f"Rating matrix shape: {R.shape}")

# sanity check: R should have one row per unique user and one column per unique book
print(f"Number of unique users in df: {len(df.user.unique())}")
print(f"Number of unique books in df: {len(df.isbn.unique())}")
print(f"Number of rows and cols,i.e. unique users,books in R matrix: {R.shape}")

In [None]:
# SGD training run: hyper-parameters are collected in one dict so the same
# settings can be reused by later cells via SGD_bias(**params).
params = dict(
    R=R,
    K=5,
    lambda_=0.02,
    steps=500,
    gamma=0.01,
    verbose=True,
    rmse_target=1,
)

P, Q, rmse = SGD_bias(**params)

# wall-clock seconds since `starttime` was set in the data-preparation cell
duration = time.time() - starttime
print(f"Process time: {duration:.2f}")

In [None]:
# Raises SystemExit here; presumably a guard so that running the whole
# notebook stops before the scratch cells below — confirm before removing.
import sys
sys.exit()


# How to input a new user?

Layer 1: user demographics and bio;
use content-based filtering to generate recommendations.

Layer 2: link a social media account to obtain the social graph;
recommend using collaborative filtering.

Layer 3: rate some movies to cold-start the LFA process.

Old user: look up the updated R matrix.
New user: give 10 random movies to rate, update the P matrix, and recalculate the P and Q matrices by SGD.

In [None]:
# Hard-coded user ids kept for manual testing; not used by any other cell
# visible in this notebook.
test_users = [94100, 173415, 116122, 55490, 108950, 148898, 133706, 36299, 262070, 106849]

In [None]:
# P is created as (number of users, K) and Q as (K, number of items) in SGD_bias.
P.shape

In [None]:
Q.shape

In [None]:
# np.mat was removed in NumPy 2.0; np.asmatrix is the equivalent constructor.
# subok=True makes np.array pass the matrix subclass through instead of
# converting it to a plain ndarray.
A = np.array(np.asmatrix('1 2; 3 4'), subok=True)
A

In [None]:
# IPython help lookup (trailing "?"); only valid inside IPython/Jupyter.
np.indices?

In [None]:
#prediction block
# Dense product of the latent factors only: the mean/user/item bias terms
# that SGD_bias adds to each prediction are not included here.
R_hat = P@Q

R_hat.shape

In [None]:
max(R_hat[0,:])

In [None]:
min(R_hat[0,:])

In [None]:
# NOTE(review): vmax=0 pins the top of the colour scale at 0 — confirm this
# is intended for the values in R_hat.
sns.heatmap(R_hat,annot=False,cmap='plasma',vmax=0)

If an old user has already read 10 books, how do we prevent the system from recommending the same 10 books?

Save and load the trained weights (P, Q).
Save and display the rating matrix.
Load the rating matrix to predict user behaviour.

# References

Yehuda Koren, Robert Bell and Chris Volinsky (2009). Matrix Factorization Techniques for Recommender Systems - IEEE Journals & Magazine. [online] Ieeexplore.ieee.org. Available at: https://ieeexplore.ieee.org/document/5197422 [Accessed 10 Jan. 2019];
https://datajobs.com/data-science-repo/Recommender-Systems-[Netflix].pdf [Accessed 10 Jan. 2019].

# SS Misc code

In [None]:
# ndarray has no `conc` method. Append one all-zero row (a new user with no
# latent information yet) with np.concatenate; the row is 2-D and as wide as
# P so it stacks along axis 0.
np.concatenate((P, np.zeros((1, P.shape[1]))))

In [None]:
# The new user's latent row is written as a 2-D (1, 5) array: with an
# explicit axis, np.append requires both inputs to have the same number of
# dimensions.
new_user = np.array([[0,0,0,0,0]])
# np.append takes the array and the values as two separate arguments, not
# as one tuple.
P_prime = np.append(P, new_user, axis=0)

In [None]:
P

In [None]:
# Re-run training; no P_hat/Q_hat are passed, so SGD_bias starts from a new
# random initialisation.
P,Q,rmse = SGD_bias(**params)

In [None]:
# Add one extra row to P for a new user and warm-start SGD from (P_prime, Q).
# NOTE(review): R is unchanged here, so it presumably contains no ratings for
# the appended row and SGD would never update it — confirm.
new_user = np.array([[0,7,10,0,0]])
P_prime = np.concatenate((P,new_user))
P_prime.shape

P2,Q2, rmse = SGD_bias(**params,P_hat=P_prime,Q_hat=Q)

In [None]:
R.shape

In [None]:
P.shape

In [None]:
P2.shape

In [None]:
P_prime.shape

In [None]:
# Scratch check of how np.append and np.concatenate behave when adding a row.
A = np.array([[1,2,3],[4,5,6]])
A

In [None]:
B = np.array([[1,1,1]])
# Without an axis argument np.append flattens both inputs: the result is 1-D.
np.append(A,B)

In [None]:
# np.concatenate joins along axis 0 by default, so B becomes a third row.
np.concatenate((A,B))

In [None]:
P.shape

In [None]:
P_prime.shape

In [None]:
df.head()

In [None]:
# look up all DataFrame rows for one user (id 276726)

df[df.user == 276726]

In [None]:
# COO storage: R.data holds the stored values; R.row / R.col hold their
# row and column indices.
R.data

In [None]:
R.col

In [None]:
R_hat[0,:]

In [None]:
R_hat.shape

In [None]:
# Inline copy of the sparse conversion: cast the columns of df in place,
# then build the COO rating matrix from the category codes.
df['isbn'] = df.isbn.astype("category")
df['user'] = df.user.astype("category")
df['rating'] = df.rating.astype(float)

# a sparse matrix is indexed by integer positions, so the user / isbn labels
# are replaced by their category codes
user_code = df.user.cat.codes.copy()
isbn_code = df.isbn.cat.codes.copy()

R = coo_matrix((df.rating, (user_code, isbn_code)))
R.shape

In [None]:
# Show the size before and after removing rows whose rating equals 0.
print(df.shape)
df = df[df.rating!=0]
df.shape

In [None]:
# True if any cell of df is missing. DataFrame.isna works for every column
# dtype, whereas np.isnan raises TypeError on non-numeric columns such as
# the isbn strings.
df.isna().to_numpy().any()

In [None]:
# Reload only the first 1000 rows for quick experiments.
# Forward slash instead of a backslash: "\B" is an invalid escape sequence in
# a normal string literal, and a backslash is a path separator only on
# Windows.
dataFile='data/BX-Book-Ratings.csv'

df = pd.read_csv(dataFile,sep=";",
                 header=0,
                 names=["user","isbn","rating"],
                 encoding='iso-8859-1',
                 nrows=1000
                )
df.head()

In [None]:
# Evaluate cal_error on freshly drawn random factors with K=10.
# Note: this overwrites the notebook-level M, N, K, P and Q.
# R = np.array([[3,0,2],[4,1,9],[9,2,1]])
M,N = R.shape
K=10
P = np.random.rand(M,K)
Q = np.random.rand(K,N)
cal_error(R,P,Q,0.02)

In [None]:
import os
# Open the working directory in the file browser. os.startfile exists only on
# Windows; on other platforms just show the path instead of failing with
# AttributeError.
if hasattr(os, "startfile"):
    os.startfile(os.getcwd())
else:
    print(os.getcwd())

In [None]:
# # WRONG,NO REGULARIZATION TERMS

# def SGD(R,K=5,lambda_=0.02,steps=10,gamma=0.001,verbose=False,rmse_target=1):

#     # lambda_: regularization
#     # gamma :learning rate
    
#     # initialise matrix P and Q
#     M,N = R.shape
#     P = np.random.rand(M,K)
#     Q = np.random.rand(K,N)
    
#     #initial RMSE
#     rmse = np.sqrt(mean_squared_error(R.toarray(), P@Q))
#     print(f"STARTING RMSE: {rmse:.2f}")

#     for step in range(steps):
        
#         for ui in range(len(R.data)):
#             rui = R.data[ui] # serialize matrix
#             u = R.row[ui] # get user index (row)
#             i = R.col[ui] # get item index (col)

# #             # adding bias
# #             mean = np.mean(R.data) # mean score of all rating
# #             ui = np.mean(P[u,:]) # mean rating given by that user
# #             bi = np.mean(Q[:,i]) # mean rating give to that movie

# #             bui = mean + ui + bi
            
# #             rui_hat = P[u,:] @ Q[:,i] + mean + ui + bi
#             rui_hat = P[u,:] @ Q[:,i] # sum(row x col)
#             error = rui - rui_hat

#             # update P,Q matrix
#             P[u,:] = P[u,:] + gamma * (error * Q[:,i] - lambda_ * P[u,:])  
#             Q[:,i] = Q[:,i] + gamma * (error * P[u,:] - lambda_ * Q[:,i])  

            
#         rmse = np.sqrt(mean_squared_error(R.toarray(), P@Q))

#         if verbose:
#             print(f"STEP NO: {step+1} - CURRENT RMSE:{rmse:.2f}")
            
#         if rmse < rmse_target:
#             break
            
#     if verbose:
#         print(f"STEP NO: {step+1} - CURRENT RMSE:{rmse:.2f}")

#     return P,Q,rmse

In [None]:
# # THE ERROR FUNCTION IS WRONG
# # bias are not accounted for in SGD

# def SGD_bias_old(R,K=5,lambda_=0.02,steps=10,gamma=0.001,verbose=False,rmse_target=1):

#     # lambda_: regularization
#     # gamma :learning rate
    
#     # initialise matrix P and Q
#     M,N = R.shape
#     P = np.random.rand(M,K)
#     Q = np.random.rand(K,N)
    
#     #initial RMSE
#     rmse = np.sqrt(mean_squared_error(R.toarray(), P@Q))
#     print(f"STARTING RMSE: {rmse:.2f}")

#     for step in range(steps):
        
#         for ui in range(len(R.data)):
#             rui = R.data[ui] # serialize matrix
#             u = R.row[ui] # get user index (row)
#             i = R.col[ui] # get item index (col)

#             # adding bias
#             mean = np.mean(R.data) # mean score of all rating
#             ui = np.mean(P[u,:]) # mean rating given by that user
#             bi = np.mean(Q[:,i]) # mean rating give to that movie

#             bui = mean + ui + bi
            
#             rui_hat = P[u,:] @ Q[:,i] + mean + ui + bi
#             error = rui - rui_hat

#             # update P,Q matrix
#             P[u,:] = P[u,:] + gamma * (error * Q[:,i] - lambda_ * P[u,:])  
#             Q[:,i] = Q[:,i] + gamma * (error * P[u,:] - lambda_ * Q[:,i])  

            
#         rmse = np.sqrt(mean_squared_error(R.toarray(), P@Q))

#         if verbose:
#             print(f"STEP NO: {step+1} - CURRENT RMSE:{rmse:.2f}")
            
#         if rmse < rmse_target:
#             break
            
#     if verbose:
#         print(f"STEP NO: {step+1} - CURRENT RMSE:{rmse:.2f}")

#     return P,Q,rmse

In [None]:
# # element by element approach

# def SGD_old(R,K=5,lambda_=0.02,steps=10,gamma=0.001,verbose=False,rmse_target=1):
# # def SGD(R,K,lambda_,steps,gamma,verbose,rmse_target):    
#     # lambda_: regularization
#     # gamma :learning rate

#     # initialise matrix P and Q
#     M,N = R.shape
#     P = np.random.rand(M,K)
#     Q = np.random.rand(K,N)
    
#     #initial RMSE
#     rmse = np.sqrt(mean_squared_error(R.toarray(), P@Q))
#     print(f"STARTING RMSE: {rmse:.2f}")

#     for step in range(steps):
#         for ui in range(len(R.data)):
#             rui = R.data[ui] # serialize matrix
#             u = R.row[ui] # get user index (row)
#             i = R.col[ui] # get item index (col)

#             rui_hat = P[u,:] @ Q[:,i] # sum(row x col)
#             error = rui - rui_hat

#             # update P,Q matrix
#             P[u,:] = P[u,:] + gamma * (error * Q[:,i] - lambda_ * P[u,:])  
#             Q[:,i] = Q[:,i] + gamma * (error * P[u,:] - lambda_ * Q[:,i])  

            
#         rmse = np.sqrt(mean_squared_error(R.toarray(), P@Q))
#         if rmse < rmse_target:
#             break

#     if verbose == True:
#         print(f"FINAL RMSE: {rmse:.2f}")

#     return P,Q,rmse

# Inspection , EDA

In [None]:
# Histogram of all ratings in df, drawn as a red step outline without a KDE.
# Bin edges 0..11 give each integer rating from 0 to 10 its own bin. The
# previous range(10) ended at 9, which merged ratings 8 and 9 into one bin and
# left out ratings of 10 (assumes ratings are integers in 0-10 — confirm
# against the data).
# NOTE: sns.distplot is deprecated in newer seaborn releases in favour of
# sns.histplot.
sns.distplot(df.rating,
             bins=range(12),
             hist_kws={"histtype": "step",
                       "linewidth": 3,
                       "alpha": 1,
                       "color": "r"},
             kde=False,

            )
            

In [None]:
# Same histogram restricted to rows whose rating is not 0.
# Bin edges 0..11 give each integer rating up to 10 its own bin. The previous
# range(10) ended at 9, which merged ratings 8 and 9 into one bin and left out
# ratings of 10 (assumes ratings are integers in 0-10 — confirm against the
# data).
# NOTE: sns.distplot is deprecated in newer seaborn releases in favour of
# sns.histplot.
sns.distplot(df[df.rating!=0].rating,
             bins=range(12),
             hist_kws={"histtype": "step",
                       "linewidth": 3,
                       "alpha": 1,
                       "color": "r"},
             kde=False,
            )

In [None]:
# Heatmap of the boolean mask df==0, i.e. which cells of df equal 0.
sns.heatmap(df==0,cmap='plasma_r')

norm?

In [None]:
a= [1,2,3]
# square of the mean (not the mean of the squares)
np.mean(a)**2

In [None]:
# vector 2-norm (Euclidean length) of a
norm(a,2)

In [None]:
# NOTE(review): scalar input with ord=2 — presumably probing how norm treats
# a 0-d value; it may raise rather than return 42. Confirm.
norm(42,2)

In [None]:
df.head()

In [None]:
# per-user count of non-null values in each remaining column
df.groupby('user').count().head()

In [None]:
# per-user counts sorted by the rating count (largest first); only the isbn
# count column is kept
df_user = df.groupby('user').count().sort_values('rating',ascending=False).isbn 

In [None]:
df_user

In [None]:
# def filterBooks(df, rating_threshold = 10):
    
#     books_ratings_count = df.isbn.value_counts() # count number of review of each book
#     users_ratings_count = df.user.value_counts() # count number of review of each book

#     # filtering ,obtain index
#     books_tokeep = books_ratings_count[books_ratings_count >= rating_threshold]
#     users_tokeep = users_ratings_count[users_ratings_count >= rating_threshold]
    
#     # filtering
#     df_clean = df[df.isbn.isin(books_tokeep.index)]
#     df_clean = df_clean[df_clean.user.isin(users_tokeep.index)]
    
#     print(f"INPUT SIZE: {df.shape}")
#     print(f"OUTPUT SIZE: {df_clean.shape}")
    
#     def cal_size(df):
#         r,c = df.shape
#         size = r*c
#         return size
    
#     pc = cal_size(df_clean)/cal_size(df) * 100
    
#     print(f"Data size reduced to: {pc:.2f}%")

#     return df_clean

In [None]:
# Keep only books with at least 2 rows; user_threshold stays at its default 0.
df_test = filterBooks(df,2)
df_test.shape

In [None]:
# number of rows per ISBN, most frequent first (value_counts default order)
books_ratings_count = df.isbn.value_counts()
type(books_ratings_count)
books_ratings_count.head()

In [None]:
# IPython help lookup (trailing "?"); only valid inside IPython/Jupyter.
df.isin?