In [1]:
import numpy as np
import glob, time
from astropy.table import Table
from astropy.io import ascii

In [355]:
def make_R(filename, col_user, col_item, col_rating, fraction = 0.8):
    start_time = time.time()
    '''create R matrix from a file that contains a table of user IDs, item IDs and ratings
    parameters:
        filename: str; the filename of the table
        col_user: str; the column name of user IDs
        col_item: str; the column name of item IDs
        col_rating: str; the column name of the ratings
        fraction: float; the fraction of data to be used for training
    returns:
        rTrain: 2D numpy array; training R matrix in with r_{i, j} is the rating that user i given to item j
        rTest: 2D numpy array; testing R matrix in with r_{i, j} is the rating that user i given to item j'''
    t = ascii.read(filename)
    nData = np.size(t) # total number of data points(ratings)
    nTrain = int(np.rint(fraction * nData)) # number of training data
    nUser = np.max(t[col_user]) # total number of users
    nitem = np.max(t[col_item]) # total number of items
    
    idxData = np.arange(nData) # an array of idx of data
    np.random.shuffle(idxData) # randomize idx
    idxTrain, idxTest = idxData[:nTrain], idxData[nTrain:]
    
    idxUser = t.index_column(col_user)
    idxItem = t.index_column(col_item)
    idxRating = t.index_column(col_rating)
    
    rTrain = np.zeros(shape = (nUser, nitem))
    rTest = np.zeros(shape = (nUser, nitem))
    for nrow in idxTrain:
        i = t[nrow][idxUser] - 1 # the userId
        j = t[nrow][idxItem] - 1 # the itemId
        rTrain[i][j] = t[nrow][idxRating] # the rating
    
    for nrow in idxTest:
        i = t[nrow][idxUser] - 1 # the userId
        j = t[nrow][idxItem] - 1 # the itemId
        rTest[i][j] = t[nrow][idxRating] # the rating
    
    print('the process took %.2f s' % (time.time() - start_time))
    return rTrain, rTest

R_train, R_test = make_R(filename = 'movie_ratings.csv', col_user = 'userId', col_item = 'movieId', col_rating = 'rating')

the process took 2.46 s


In [356]:
class recommendation_system():

    def __init__(self, R_train, R_test):
        '''R_train: 2D numpy array; R matrix in with r_{i, j} is the rating that user i given to item j'''
        self.R_train = R_train
        self.R_test = R_test
        self.nUser, self.nItem = R_train.shape
        self.result = None

    def matrix_factorization(self, k, alpha, _lambda, iterations):
        '''perform matrix factorization to predict empty entries in a matrix.
        parameters:
            k: int; dimensions of u and v vector
            alpha: float; learning rate
            _lambda: float; regularization parameter'''
        self.k = k
        self.alpha = alpha
        self.iterations = iterations
        self._lambda = _lambda
        
        def sgd():
            '''stochastic graident descent'''
            for i, j, r in self.samples:
                # Computer prediction and error
                if _lambda is not None:
                    prediction = self.b + self.b_u[i] + self.b_i[j] + self.U[i, :].dot(self.V[j, :].T)
                    e = (r - prediction)
                    # Update biases
                    self.b_u[i] += self.alpha * (e - self._lambda * self.b_u[i])
                    self.b_i[j] += self.alpha * (e - self._lambda * self.b_i[j])

                    # Update user and item latent feature matrices
                    self.U[i, :] += self.alpha * (e * self.V[j, :] - self._lambda * self.U[i,:])
                    self.V[j, :] += self.alpha * (e * self.U[i, :] - self._lambda * self.V[j,:])
                else:
                    prediction = self.U[i, :].dot(self.V[j, :].T)
                    e = (r - prediction)
                    # Update user and item latent feature matrices
                    self.U[i, :] += self.alpha * e * self.V[j, :]
                    self.V[j, :] += self.alpha * e * self.U[i, :]
                    
        start_time = time.time()
        # Initialize user and item latent feature matrice
        self.U = np.random.normal(scale = 1./self.k, size = (self.nUser, self.k))
        self.V = np.random.normal(scale = 1./self.k, size = (self.nItem, self.k))
        
        if _lambda is not None:
            # Initialize the biases
            self.b_u = np.zeros(self.nUser)
            self.b_i = np.zeros(self.nItem)
            self.b = np.mean(self.R_train[np.where(self.R_train != 0)])

        # Create a list of training samples
        self.samples = [
            (i, j, self.R_train[i, j])
            for i in range(self.nUser)
            for j in range(self.nItem)
            if self.R_train[i, j] > 0
        ]

        # Perform stochastic gradient descent for number of iterations
        for i in range(self.iterations):
            np.random.shuffle(self.samples)
            sgd()
#             mse_train, mse_test = self.mse()
            if (i + 1) % 10 == 0:
                print("Iteration: %d " % (i + 1))
#                 print("Iteration: %d ; train error = %.4f; test error = %.4f" % (i + 1, mse_train, mse_test))
                
        if _lambda is not None:
            self.result = self.b + self.b_u[:,np.newaxis] + self.b_i[np.newaxis:,] + self.U.dot(self.V.T)
        else:
            self.result = self.U.dot(self.V.T)
        
        mse_train, mse_test = self.mse()
        print('training is complete! it took %.2f s' % (time.time() - start_time))
        print("train error = %.4f; test error = %.4f" % (mse_train, mse_test))
        return self.result
    
    def mse(self):
        '''Compute the total mean square error for training and testing data'''
        xTrain, yTrain = self.R_train.nonzero()
        xTest, yTest = self.R_test.nonzero()
        trainError, testError = 0, 0
        for x, y in zip(xTrain, yTrain):
            trainError += pow(self.R_train[x, y] - self.result[x, y], 2)
            
        for x, y in zip(xTest, yTest):
            testError += pow(self.R_test[x, y] - self.result[x, y], 2)
            
        return np.sqrt(trainError), np.sqrt(testError)
        
    def correlation_similarity(self, k):
        start_time = time.time()
        self.result = self.R_train.copy()
#         R_normal = (R.T - np.nanmean(R, axis = 1)).T # normalizing with the average
        
        R_sparse = sparse.csr_matrix(self.R_train)
        S = cosine_similarity(R_sparse) # similarity 
        for row in np.arange(self.nUser): # for all users
            idxS = np.argsort(S[row])
#             for col in np.where(R[row] == 0)[0]: # for item(col) that doesn't have a rating
            for col in np.arange(self.nItem): # for all items
#                 print(row, col, R[row][col])
                idx_n = np.where(self.R_train[:, col] > 0)[0]# the idx of users who rated this item
#                 print(idx_n)
                if np.size(idx_n) < k:
                    self.result[row][col] = self.R_train[idx_n, col].mean()
                else:
                    idx_knn = idxS[np.isin(idxS, idx_n, assume_unique = True)][-k-1:-1] # idxS with ratings
#                     idx_knn = idx_n[np.argsort(S[row][idx_n])][-k:]# idx of k-nearest neighbors
#                 print(idx_knn)
                    self.result[row][col] = self.R_train[idx_knn, col].mean() # mean of the knn rating for this item
        print('training is complete! it took %.2f s' % (time.time() - start_time))
        return self.result

In [413]:
def correlation_similarity(R):
    start_time = time.time()
    result = R.copy()
    R_normal_nan = (R.T - np.nanmean(R, axis = 1)).T # normalizing with the average
    R_normal_zero = np.nan_to_num(R_normal_nan, copy = True)
    print(R)
    R_sparse = sparse.csr_matrix(R_normal_zero)
    S = cosine_similarity(R_sparse) # similarity 
    print(R_normal_zero)
    for row in np.arange(R.shape[0]): # for all users
        similarity = S[row]
        idxS = np.argwhere(similarity > 0.)
        R_S = R_normal_nan[idxS]
        result[row] = np.nanmean(R_S, axis = 0)
    print('training is complete! it took %.2f s' % (time.time() - start_time))
    return result

In [414]:
correlation_similarity(R_train)

[[4.  0.  0.  ... 0.  0.  0. ]
 [0.  0.  0.  ... 0.  0.  0. ]
 [0.  0.  0.  ... 0.  0.  0. ]
 ...
 [2.5 2.  2.  ... 0.  0.  0. ]
 [3.  0.  0.  ... 0.  0.  0. ]
 [5.  0.  0.  ... 0.  0.  0. ]]


MemoryError: 

In [357]:
mf = recommendation_system(R_train, R_test)
# mf.correlation_similarity(3)
mf.matrix_factorization(k = 300, alpha = 0.01, _lambda = 0.01, iterations = 20)

Iteration: 10 
Iteration: 20 
training is complete! it took 50.56 s
train error = 204.2289; test error = 121.3293


array([[4.7027215 , 4.33693953, 4.08562656, ..., 4.21682621, 4.21502434,
        4.33077482],
       [4.01905588, 3.63531399, 3.39480715, ..., 3.54744653, 3.54745645,
        3.66168143],
       [2.52581468, 2.19881993, 1.9320147 , ..., 2.09470072, 2.09120804,
        2.20589518],
       ...,
       [3.05999193, 2.69855982, 2.76905344, ..., 3.1301858 , 3.12137232,
        3.22730916],
       [3.67085348, 3.3082022 , 3.06056092, ..., 3.21727928, 3.21677988,
        3.33145243],
       [4.29142238, 3.58806829, 3.45318063, ..., 3.49679645, 3.5012661 ,
        3.63570256]])

In [358]:
mf.mse()

(204.22888307712753, 121.32933523305672)

In [91]:
mf.mse()

215.19221896310728

In [173]:
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse



In [359]:
tR = np.array([
    [5, 3, np.nan, 1, 1],
    [4, np.nan, np.nan, 1, 1],
    [1, 1, np.nan, 5, 4],
    [1, np.nan, np.nan, 4, 5],
    [np.nan, 1, 5, 4, 4],
])

In [361]:
np.nanmean(tR, axis = 1)

array([2.5       , 2.        , 2.75      , 3.33333333, 3.5       ])

In [362]:
(tR.T - np.nanmean(tR, axis = 1)).T

array([[ 2.5       ,  0.5       ,         nan, -1.5       , -1.5       ],
       [ 2.        ,         nan,         nan, -1.        , -1.        ],
       [-1.75      , -1.75      ,         nan,  2.25      ,  1.25      ],
       [-2.33333333,         nan,         nan,  0.66666667,  1.66666667],
       [        nan, -2.5       ,  1.5       ,  0.5       ,  0.5       ]])

In [368]:
h = np.array([0, 2, 4])
tR[h]

array([[ 5.,  3., nan,  1.,  1.],
       [ 1.,  1., nan,  5.,  4.],
       [nan,  1.,  5.,  4.,  4.]])

In [366]:
correlation_similarity(tR, 3)

AttributeError: 'numpy.ndarray' object has no attribute 'nUser'

In [175]:
tR_sparse = sparse.csr_matrix(tR)

In [239]:
h = np.array([0, 1, 2])
tR[4][h]

array([0, 1, 5])

In [177]:
similarities

array([[1.        , 0.86091606, 0.42289003, 0.36896403, 0.18257419],
       [0.86091606, 1.        , 0.42008403, 0.47058824, 0.14969624],
       [0.42289003, 0.42008403, 1.        , 0.98019606, 0.62360956],
       [0.36896403, 0.47058824, 0.98019606, 1.        , 0.59878495],
       [0.18257419, 0.14969624, 0.62360956, 0.59878495, 1.        ]])

In [165]:
from scipy import spatial

In [179]:
1 - spatial.distance.cosine(tR[0], tR[2])

0.4228900316110311

In [384]:
R_train

array([[4. , 0. , 0. , ..., 0. , 0. , 0. ],
       [0. , 0. , 0. , ..., 0. , 0. , 0. ],
       [0. , 0. , 0. , ..., 0. , 0. , 0. ],
       ...,
       [2.5, 2. , 2. , ..., 0. , 0. , 0. ],
       [3. , 0. , 0. , ..., 0. , 0. , 0. ],
       [5. , 0. , 0. , ..., 0. , 0. , 0. ]])

In [126]:
np.arange(100)

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
       51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67,
       68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84,
       85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99])

In [161]:
x = np.arange(100)
np.random.shuffle(x)
training, test = x[:80], x[80:]

In [162]:
training

array([74, 58, 27, 14, 17, 70, 82,  0, 87, 65, 85, 79,  9, 33, 36, 72, 29,
       96, 38, 52, 67, 53, 43, 21, 41, 11, 75, 62, 37, 30, 39, 25, 60, 97,
       69, 80, 47, 71,  1, 26,  3, 59, 94, 45, 90, 12, 89, 28, 81, 76, 31,
       83, 68, 98, 73, 55, 88, 66,  7, 78, 34, 16,  5, 95, 24, 42, 13, 19,
       50, 10, 99, 46,  8, 61, 18, 20, 22, 32, 44, 92])

In [303]:
def mock_R(nU, nI, f = 0.2, ft = 0.8):
    R = np.zeros(shape = (nU, nI))
    n = int(np.rint(f * nU * nI))
    idxU = np.random.choice(np.arange(nU), size = n)
    idxI = np.random.choice(np.arange(nI), size = n)
    for x, y in zip(idxU, idxI):
        R[x][y] = np.random.randint(low = 1, high = 10)
    return R

In [304]:
mR = mock_R(40, 50)
mR

array([[7., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 2., 0.],
       ...,
       [5., 0., 0., ..., 0., 4., 0.],
       [0., 0., 0., ..., 0., 0., 4.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [318]:
mmR = recommendation_system(mR, mR)

In [319]:
mmR.matrix_factorization(k = 300, alpha = 0.01, _lambda = 0.01, iterations = 20)

Iteration: 10 ; train error = 49.3843; test error = 49.3843
Iteration: 20 ; train error = 49.3843; test error = 49.3843
training is complete! it took 0.12 s


array([[4.9062603 , 4.90628366, 4.90625455, ..., 4.90661179, 4.90666117,
        4.90663051],
       [4.90656739, 4.90666464, 4.9065231 , ..., 4.9065938 , 4.90626056,
        4.90630948],
       [4.9065554 , 4.90644708, 4.90647314, ..., 4.90695252, 4.90680165,
        4.90671574],
       ...,
       [4.90657811, 4.90677054, 4.90696317, ..., 4.90693909, 4.90658596,
        4.90675202],
       [4.90629563, 4.90659874, 4.90636596, ..., 4.9065348 , 4.90658854,
        4.90662293],
       [4.90647931, 4.90657469, 4.90645965, ..., 4.90672111, 4.90664056,
        4.90670018]])