In [2]:
import numpy as np
import scipy.sparse as sps
import scipy as sp
from matplotlib import pyplot as plt



In [3]:
# Toy ratings matrix: one row per user, one column per item.
# A 0 marks an item the user has not rated; the cells below treat only the
# non-zero entries as observed ratings.
R = np.array(
    [
        [7, 6, 7, 4, 5, 4],
        [6, 7, 0, 4, 3, 4],
        [0, 3, 3, 1, 1, 0],
        [1, 2, 2, 3, 3, 4],
        [1, 0, 1, 2, 3, 3],
    ]
)
R

array([[7, 6, 7, 4, 5, 4],
       [6, 7, 0, 4, 3, 4],
       [0, 3, 3, 1, 1, 0],
       [1, 2, 2, 3, 3, 4],
       [1, 0, 1, 2, 3, 3]])

In [4]:
# Sparse view of the ratings: building the COO array from the dense matrix
# keeps only the non-zero entries, so unrated items are simply not stored.
Rs = sps.coo_array(R)

# Keep both compressed layouts around: CSC for per-item (column) access and
# CSR for per-user (row) access.
Rsc, Rsr = Rs.tocsc(), Rs.tocsr()

In [29]:
# Per-user / per-item statistics over the *stored* (rated) entries only, plus a
# user-mean-centred copy of the ratings for the similarity computations below.

# Dimensions are read from the matrix itself so they can never drift out of
# sync with R (rows are users, columns are items).
num_users, num_items = Rsc.shape

# Number of stored ratings in each item column / each user row.
nnz_row_per_col = np.diff(Rsc.indptr)
nnz_col_per_row = np.diff(Rsr.indptr)

# (user, item) coordinates of every non-zero rating.
nonzero_rows, nonzero_cols = Rsc.nonzero()

# Sums of ratings and of squared ratings along each axis, flattened to 1-D so
# the divisions below are element-wise regardless of how the sparse container
# shapes its reductions.
Rsc2 = Rsc.power(2)  # element-wise square
col_sum = np.asarray(Rsc.sum(axis=0)).ravel()
row_sum = np.asarray(Rsc.sum(axis=1)).ravel()
col_sq_sum = np.asarray(Rsc2.sum(axis=0)).ravel()
row_sq_sum = np.asarray(Rsc2.sum(axis=1)).ravel()

# A user or item without any rating has an undefined mean/std; that shows up
# as NaN here instead of a divide-by-zero warning.
with np.errstate(divide="ignore", invalid="ignore"):
    # mean of each column (item) and each row (user) over rated entries
    meanCol = col_sum / nnz_row_per_col
    meanRow = row_sum / nnz_col_per_row

    # std of each column and row via E[x^2] - E[x]^2; the difference is clipped
    # at 0 because rounding can push it slightly negative, which would make
    # sqrt return NaN for (near-)constant ratings.
    stdCol = np.sqrt(np.maximum(col_sq_sum / nnz_row_per_col - meanCol**2, 0))
    stdRow = np.sqrt(np.maximum(row_sq_sum / nnz_col_per_row - meanRow**2, 0))

# Mean-centred ratings: subtract each user's mean from that user's stored
# ratings. In CSC layout `indices` holds the row (user) index of every stored
# value, so it selects the matching user mean directly.
Rsc_mean = Rsc.copy()
Rsc_mean.data = Rsc_mean.data - meanRow[Rsc_mean.indices]
Rsc_mean_sq = Rsc_mean.power(2)

In [120]:
# User-based collaborative filtering
#
# Uses the names prepared by the earlier cells: `Rsc_mean` (ratings minus the
# user's mean), `Rsc_mean_sq` (its element-wise square), `meanRow` (per-user
# mean) and `nonzero_rows` / `nonzero_cols` (coordinates of the stored ratings).

# --- Pearson correlation between every pair of users -------------------------
# Numerator: sum over co-rated items of the product of centred ratings. Items
# rated by only one of the two users drop out because the other factor is an
# implicit zero.
num = (Rsc_mean @ Rsc_mean.T).toarray()  # num_users x num_users, dense anyway

# 0/1 pattern of the stored ratings. Built from the sparsity structure so that
# ratings equal to the user's mean (stored value 0 after centring) still count
# as rated.
rated_mask = Rsc_mean.copy()
rated_mask.data = np.ones_like(rated_mask.data)

# sq_common[j, k] = sum of user j's squared centred ratings over the items that
# both j and k rated. One sparse product replaces the per-pair Python loop and
# avoids getrow(), which recent SciPy releases no longer provide on sparse arrays.
sq_common = (Rsc_mean_sq @ rated_mask.T).toarray()
den = np.sqrt(sq_common * sq_common.T)

# Where the denominator is 0 (no co-rated items, or no deviation from the mean
# on them) the correlation is undefined; treat it as 0 similarity. Leaving it
# as NaN would let argsort rank that user as the *most* similar neighbour.
pcorr = np.zeros_like(num, dtype=float)
np.divide(num, den, out=pcorr, where=den > 0)

# Discount factor: shrink correlations supported by fewer than `beta` co-rated
# items by n_common / beta.
beta = -1  # set to -1 to ignore discount factoring
if beta > 0:
    n_common = (rated_mask @ rated_mask.T).toarray()
    pcorr *= np.minimum(n_common, beta) / beta

# --- Predict every missing rating from the k most similar users --------------
k_sim_users = 2
all_items = set(range(num_items))
for juser in range(num_users):

    # items that juser did not rate
    k_unrated_items = sorted(all_items - set(nonzero_cols[nonzero_rows == juser]))

    # all users ordered from least to most similar to juser (same for every item)
    ranked_users = np.argsort(pcorr[juser, :])

    rhat = []
    for k in k_unrated_items:
        # users that rated item k
        item_raters = set(nonzero_rows[nonzero_cols == k])

        # top k_sim_users most similar users among those who rated item k
        # (juser is excluded automatically: it has not rated k)
        kusers = [jk for jk in ranked_users if jk in item_raters][-k_sim_users:]

        # negatively correlated neighbours get weight 0
        pcorr_filt = np.clip(pcorr[juser, kusers], 0, None)
        weight = pcorr_filt.sum()

        if weight == 0:
            # no similar users - set rating to mean of current user
            rhat.append(float(meanRow[juser]))
        else:
            # juser's mean plus the similarity-weighted average of the
            # neighbours' centred ratings of item k
            item_centered = Rsc_mean[:, [k]].toarray().ravel()
            rhat.append(float(meanRow[juser] + np.dot(item_centered[kusers], pcorr_filt) / weight))

    print(juser, rhat)

0 []
1 [6.013729659957364]
2 [3.3463952993809016, 0.8584109681112311]
3 []
4 [1.5]


In [122]:
# Item-based collaborative filtering
#
# Uses the names prepared by the earlier cells: `Rsr` (raw ratings, CSR),
# `Rsc_mean` (ratings minus the user's mean), `Rsc_mean_sq` (its element-wise
# square), `meanRow` (per-user mean) and `nonzero_rows` / `nonzero_cols`
# (coordinates of the stored ratings).

# --- Adjusted cosine similarity between every pair of items ------------------
# Numerator: sum over users who rated both items of the product of their
# user-mean-centred ratings.
num = (Rsc_mean.T @ Rsc_mean).toarray()  # num_items x num_items, dense anyway

# 0/1 pattern of the stored ratings. Built from the sparsity structure so that
# ratings equal to the user's mean (stored value 0 after centring) still count
# as rated.
rated_mask = Rsc_mean.copy()
rated_mask.data = np.ones_like(rated_mask.data)

# sq_common[j, k] = sum of item j's squared centred ratings over the users that
# rated both j and k. One sparse product replaces the per-pair Python loop and
# avoids getcol(), which recent SciPy releases no longer provide on sparse arrays.
sq_common = (Rsc_mean_sq.T @ rated_mask).toarray()
den = np.sqrt(sq_common * sq_common.T)

# Where the denominator is 0 the similarity is undefined; treat it as 0.
# Leaving it as NaN would let argsort rank that item as the *most* similar one.
pcorr = np.zeros_like(num, dtype=float)
np.divide(num, den, out=pcorr, where=den > 0)

# Discount factor: shrink similarities supported by fewer than `beta` common
# raters by n_common / beta.
beta = -1  # set to -1 to ignore discount factoring
if beta > 0:
    n_common = (rated_mask.T @ rated_mask).toarray()
    pcorr *= np.minimum(n_common, beta) / beta

# --- Predict every missing rating from the k most similar rated items --------
k_sim_items = 2
all_items = set(range(num_items))
for juser in range(num_users):

    # items that juser did / did not rate
    rated_items = set(nonzero_cols[nonzero_rows == juser])
    k_unrated_items = sorted(all_items - rated_items)

    # juser's raw ratings as a dense row (0 where unrated)
    user_ratings = Rsr[[juser], :].toarray().ravel()

    rhat = []
    for k in k_unrated_items:

        # top k_sim_items most similar items to item k among those juser rated
        kitems = [jk for jk in np.argsort(pcorr[k, :]) if jk in rated_items][-k_sim_items:]

        # negatively correlated items get weight 0
        pcorr_filt = np.clip(pcorr[k, kitems], 0, None)
        weight = pcorr_filt.sum()

        if weight == 0:
            # no similar items - set rating to mean of current user
            rhat.append(meanRow[juser])
        else:
            # similarity-weighted average of juser's own ratings of the
            # neighbouring items
            rhat.append(np.dot(user_ratings[kitems], pcorr_filt) / weight)

    print(juser, np.round(rhat, 3))

0 []
1 [6.489]
2 [3. 1.]
3 []
4 [1.]
