In [1]:
import numpy as np
import glob, time
from astropy.table import Table
from astropy.io import ascii

In [121]:
dir(Table)

['Column',
 'ColumnClass',
 'MaskedColumn',
 'Row',
 'TableColumns',
 'TableFormatter',
 '__array__',
 '__bytes__',
 '__class__',
 '__copy__',
 '__deepcopy__',
 '__delattr__',
 '__delitem__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setitem__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_add_as_mixin_column',
 '_base_repr_',
 '_check_names_dtype',
 '_convert_col_for_table',
 '_convert_string_dtype',
 '_init_from_cols',
 '_init_from_dict',
 '_init_from_list',
 '_init_from_list_of_dicts',
 '_init_from_ndarray',
 '_init_from_table',
 '_ipython_key_completions_',
 '_is_list_or_tuple_of_str',
 '_make_index_row_display_table',
 '_make_table_from_cols',
 '_mask',
 '_new_from

In [None]:
help(Table.)

In [144]:
def make_R(filename, col_user, col_item, col_rating, fraction = 0.8):
    '''create R matrix from a file that contains a table of user IDs, item IDs and ratings
    parameters:
        filename: str; the filename of the table
        col_user: str; the column name of user IDs
        col_item: str; the column name of item IDs
        col_rating: str; the column name of the ratings
        fraction: float; the fraction of data to be used for training
    returns:
        R: 2D numpy array; R matrix in with r_{i, j} is the rating that user i given to item j'''
    t = ascii.read(filename)
    nData = np.size(t) # total number of data points(ratings)
    nUser = np.max(t[col_user]) # total number of users
    nitem = np.max(t[col_item]) # total number of items
    
    idxData = np.arange(nData) # an array of idx of data randomized
    np.random.shuffle(idxData)
    
    nTrain = int(np.rint(fraction * nData)) # number of training data
    print(nTrain)
    idxTrain, idxTest = idxData[:nTrain], idxData[nTrain:]
    
    idxUser = t.index_column(col_user)
    idxItem = t.index_column(col_item)
    idxRating = t.index_column(col_rating)
    
    rTrain = np.zeros(shape = (nUser, nitem))
    rTest = np.zeros(shape = (nUser, nitem))
    for nrow in idxTrain:
        i = t[nrow][idxUser] - 1 # the userId
        j = t[nrow][idxItem] - 1 # the itemId
        rTrain[i][j] = t[nrow][idxRating] # the rating
    
    for nrow in idxTest:
        i = t[nrow][idxUser] - 1 # the userId
        j = t[nrow][idxItem] - 1 # the itemId
        rTest[i][j] = t[nrow][idxRating] # the rating
    
    return rTrain, rTest

R_train, R_test = make_R(filename = 'movie_ratings.csv', col_user = 'userId', col_item = 'movieId', col_rating = 'rating')

80669


In [158]:
class recommendation_system():

    def __init__(self, R_train, R_test, k, alpha, _lambda, iterations):
        """
        Perform matrix factorization to predict empty
        entries in a matrix.

        parameters:
            R: 2D numpy array; R matrix in with r_{i, j} is the rating that user i given to item j
            k: int; dimensions of u and v vector
            alpha (float) : learning rate
            _lambda (float)  : regularization parameter
        """

        self.R_train = R_train
        self.R_test = R_test
        self.num_users, self.num_items = R_train.shape
        self.k = k
        self.alpha = alpha
        self.iterations = iterations
        self._lambda = _lambda

    def train(self):
        start_time = time.time()
        # Initialize user and item latent feature matrice
        self.U = np.random.normal(scale = 1./self.k, size = (self.num_users, self.k))
        self.V = np.random.normal(scale = 1./self.k, size = (self.num_items, self.k))
        
        if self.regularization() == True:
            # Initialize the biases
            self.b_u = np.zeros(self.num_users)
            self.b_i = np.zeros(self.num_items)
            self.b = np.mean(self.R_train[np.where(self.R_train != 0)])

        # Create a list of training samples
        self.samples = [
            (i, j, self.R_train[i, j])
            for i in range(self.num_users)
            for j in range(self.num_items)
            if self.R_train[i, j] > 0
        ]

        # Perform stochastic gradient descent for number of iterations
        training_process = []
        for i in range(self.iterations):
            np.random.shuffle(self.samples)
            self.sgd()
            mse_train, mse_test = self.mse()
            training_process.append((i, mse_train, mse_test))
            if (i + 1) % 10 == 0:
                print("Iteration: %d ; train error = %.4f; test error = %.4f" % (i + 1, mse_train, mse_test))
                
        print('training is complete! it took %.2f s' % (time.time() - start_time))
        return training_process
    
    def regularization(self):
        '''If we want to regularize the result'''
        return False if self._lambda is None else True

    def mse(self):
        """Compute the total mean square error for training and testing data
        """
        xTrain, yTrain = self.R_train.nonzero()
        xTest, yTest = self.R_test.nonzero()
        predicted = self.full_matrix()
        trainError, testError = 0, 0
        for x, y in zip(xTrain, yTrain):
            trainError += pow(self.R_train[x, y] - predicted[x, y], 2)
            
        for x, y in zip(xTest, yTest):
            testError += pow(self.R_test[x, y] - predicted[x, y], 2)
            
        return np.sqrt(trainError), np.sqrt(testError)

    def sgd(self):
        """
        Perform stochastic graident descent
        """
        for i, j, r in self.samples:
            # Computer prediction and error
            prediction = self.get_rating(i, j)
            e = (r - prediction)
            
            if self.regularization() == True:
                # Update biases
                self.b_u[i] += self.alpha * (e - self._lambda * self.b_u[i])
                self.b_i[j] += self.alpha * (e - self._lambda * self.b_i[j])

                # Update user and item latent feature matrices
                self.U[i, :] += self.alpha * (e * self.V[j, :] - self._lambda * self.U[i,:])
                self.V[j, :] += self.alpha * (e * self.U[i, :] - self._lambda * self.V[j,:])
            else:
                # Update user and item latent feature matrices
                self.U[i, :] += self.alpha * e * self.V[j, :]
                self.V[j, :] += self.alpha * e * self.U[i, :]

    def get_rating(self, i, j):
        """
        Get the predicted rating of user i and item j
        """
        if self.regularization() == True:
            prediction = self.b + self.b_u[i] + self.b_i[j] + self.U[i, :].dot(self.V[j, :].T)
        else:
            prediction = self.U[i, :].dot(self.V[j, :].T)
        return prediction

    def full_matrix(self):
        """
        Computer the full matrix using the resultant biases, P and Q
        """
        if self.regularization() == True:
            return self.b + self.b_u[:,np.newaxis] + self.b_i[np.newaxis:,] + self.U.dot(self.V.T)
        else:
            return self.U.dot(self.V.T)

In [159]:
mf = recommendation_system(R_train, R_test, k = 300, alpha = 0.01, _lambda = 0.01, iterations = 20)

In [160]:
mf.train()

Iteration: 10 ; train error = 234.1607; test error = 123.6600
Iteration: 20 ; train error = 204.3388; test error = 121.2744
training is complete! it took 84.85 s


[(0, 255.59846965311002, 128.98801979928677),
 (1, 248.8114651800095, 126.71255506648048),
 (2, 244.9808560648217, 125.66284881311068),
 (3, 242.35774602561017, 124.94917838560346),
 (4, 240.36543795221752, 124.58855408294613),
 (5, 238.8007567864682, 124.24547634473196),
 (6, 237.4844051635022, 124.07441640995648),
 (7, 236.3549265639069, 124.00940503638874),
 (8, 235.12780381628582, 123.67416672524146),
 (9, 234.16073934712588, 123.66000614613448),
 (10, 233.04434204354425, 123.50254277746828),
 (11, 231.76815458468883, 123.31642624280254),
 (12, 230.281901586051, 123.26295709703764),
 (13, 228.36892894489145, 123.0820193302953),
 (14, 225.99939008326643, 122.84160629526978),
 (15, 222.92519121849998, 122.62066651409545),
 (16, 219.1409952083324, 122.29743869753929),
 (17, 214.82481657914974, 122.01367702519659),
 (18, 209.8210575306947, 121.568167213406),
 (19, 204.33878936348006, 121.27435720631227)]

In [90]:
mf.full_matrix()

array([[4.56763753, 4.20996919, 3.98995323, ..., 4.16250062, 4.16325625,
        4.27553269],
       [3.91985196, 3.55119324, 3.27110652, ..., 3.49681   , 3.49725763,
        3.60714968],
       [2.65044326, 2.4413029 , 2.03177587, ..., 2.35160182, 2.35253557,
        2.45289135],
       ...,
       [2.4322246 , 2.4558676 , 2.43950429, ..., 3.21243986, 3.20568998,
        3.31120859],
       [3.58993069, 3.23484471, 2.98374352, ..., 3.2111378 , 3.21128943,
        3.32319877],
       [4.45860103, 3.38899298, 3.40442967, ..., 3.52596812, 3.52582102,
        3.66390808]])

In [91]:
mf.mse()

215.19221896310728

In [93]:
mf.get_rating(45, 500)

4.067989332619494

In [None]:
def correlation_similarity():
    def similarity(A, B):
        dot_product = A.dot(B)

In [115]:
tR = np.array([
    [5, 3, 0, 1],
    [4, 0, 0, 1],
    [1, 1, 0, 5],
    [1, 0, 0, 4],
    [0, 1, 5, 4],
])

In [116]:
i,j = np.nonzero(tR)

ix = np.random.choice(len(i), int(np.floor(0.5 * len(i))), replace = True)

In [117]:
i, j

(array([0, 0, 0, 1, 1, 2, 2, 2, 3, 3, 4, 4, 4]),
 array([0, 1, 3, 0, 3, 0, 1, 3, 0, 3, 1, 2, 3]))

In [138]:
mf

<__main__.recommendation_system at 0x7f328bbc9588>

In [126]:
np.arange(100)

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
       51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67,
       68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84,
       85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99])

In [127]:
x = np.arange(100)
np.random.shuffle(x)
training, test = x[:80], x[80:]

In [128]:
training

array([ 9, 52, 51, 12, 67, 26, 60,  7, 83, 90, 79, 86, 24, 20, 21, 85, 65,
       73, 97, 63, 34, 71, 92, 28, 47, 33, 84, 69, 25,  0, 18, 54, 30, 74,
       91,  8, 57, 66, 19, 42,  4, 70, 46, 93, 75, 95, 43, 98, 77, 13, 55,
       61,  1, 44, 32,  3, 62, 72, 96, 80, 11, 35, 17,  6, 99, 49, 48, 78,
       15, 45, 94,  5, 81, 88, 76, 50, 14, 37, 59, 68])