In [2]:
import numpy as np
import pandas as pd
import datetime
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse
from sklearn.model_selection import train_test_split

# Matrix Factorization

In [9]:
class MF():
    def __init__(self, rating_matrix, K, lam = 0.01, X_init = None, b_init = None, W_init = None, d_init = None,\
                 lr = 0.5, max_iter = 1000, print_every = 100, user_based = True):
        self.rating_matrix = rating_matrix
        self.K = K # number of features
        self.lam = lam
        self.lr = lr
        self.max_iter = max_iter
        self.print_every = print_every
        self.user_based = user_based # Check to use User-User CF
        self.n_users = int(np.max(rating_matrix[:, 0])) + 1# Number of users, +1 because indices starts from 0
        self.n_items = int(np.max(rating_matrix[:, 1])) + 1# Number of items
        self.X = np.random.randn(K, self.n_items) if X_init is None else X_init
        self.b = np.random.randn(1, self.n_items) if b_init is None else b_init
        self.W = np.random.randn(K, self.n_users) if W_init is None else W_init
        self.d = np.random.randn(1, self.n_users) if d_init is None else d_init
        self.n_ratings = len(rating_matrix) # number of ratings
        self.normalized_rm = self.rating_matrix.copy() # Normalized Rating Matrix
        
    def normalize_matrix(self):
        if self.user_based:
            user_col = 0
            item_col = 1
            n_objects = self.n_users
        else:
            user_col = 1
            item_col = 0
            n_objects = self.n_items
        self.mu = np.zeros((n_objects))
        #objects = self.rating_matrix[:, user_col]
        for obj in range(n_objects):
            ids = np.where(self.rating_matrix[:, user_col] == obj)[0]
            item_ids = self.normalized_rm[ids.tolist(), item_col]
            ratings = self.normalized_rm[ids.tolist(), 2]
            self.mu[obj] = np.mean(ratings) if ids.size > 0 else 0
            self.normalized_rm[ids.tolist(), 2] = ratings - self.mu[obj]
        
    def get_items_rated_by_user(self, user_id):
        """
        Use to update_W and return (item_ids, ratings)
        """
        ids = np.where(self.normalized_rm[:, 0] == user_id)[0]
        item_ids, ratings = self.normalized_rm[ids.tolist(), 1], self.normalized_rm[ids.tolist(), 2]
        return item_ids.astype('int'), ratings
    
    def get_users_rate_items(self, item_id):
        """
        Use to update_X and return (user_ids, ratings)
        """
        ids = np.where(self.normalized_rm[:, 1] == item_id)[0]
        user_ids, ratings = self.normalized_rm[ids.tolist(), 0], self.normalized_rm[ids.tolist(), 2]
        return user_ids.astype('int'), ratings
    
    def loss(self):
        L = 0
        for user in range(self.n_users):
            item_ids, ratings = self.get_items_rated_by_user(user)
            X_hat = self.X[:, item_ids.tolist()]# X_hat = (19 item feature, #item user rated)
            b_hat = self.b[0, item_ids.tolist()]
            L+=0.5*np.sum((np.dot(X_hat.T, self.W[:, user]) + b_hat + self.d[0, user] - ratings)**2)
        L/=self.n_ratings
        L+=0.5*self.lam*(np.sum(self.X**2) + np.sum(self.W**2))
        return L
    
    def update_W(self):
        idx_users = np.random.permutation(self.n_users)
        for user in idx_users:
            item_ids, ratings = self.get_items_rated_by_user(user)
            X_hat = self.X[:, item_ids.tolist()]# X_hat = (19 item feature, #item user rated)
            b_hat = self.b[0, item_ids.tolist()]

            error = np.dot(X_hat.T, self.W[:, user]) + b_hat + self.d[0, user] - ratings
            grad_W_hat = np.dot(X_hat, error)/self.n_ratings + self.lam * self.W[:, user]
            grad_d = np.sum(error) / self.n_ratings

            self.W[:, user] -= self.lr * grad_W_hat
            self.d[0, user] -= self.lr * grad_d
    
    def update_X(self):
        idx_items = np.random.permutation(self.n_items)
        for item in idx_items:
            user_ids, ratings = self.get_users_rate_items(item)
            W_hat = self.W[:, user_ids.tolist()]
            d_hat = self.d[0, user_ids.tolist()]

            error = np.dot(W_hat.T, self.X[:, item]) + d_hat + self.b[0, item] - ratings
            grad_X_hat = np.dot(W_hat, error)/self.n_ratings + self.lam * self.X[:, item]
            grad_b = np.sum(error)/self.n_ratings

            self.X[:, item] -= self.lr * grad_X_hat
            self.b[0, item] -= self.lr * grad_b
            
    
    def fit(self):
        self.normalize_matrix()
        for it in range(self.max_iter):
            self.update_X()
            self.update_W()
            if (it + 1) % self.print_every == 0:
                rmse = self.evaluate_RMSE(self.rating_matrix)
                print('Iteration {}, loss = {:.4f}, RMSE = {:.4f}'.format(it + 1, self.loss(), rmse))
                
    def pred(self, user_id, item_id):
        if self.user_based:
            bias = self.mu[user_id]
        else:
            bias = self.mu[item_id]
        
        pred = np.dot(self.X[:, item_id].T, self.W[:, user_id]) + self.b[0, item_id] + self.d[0, user_id]  + bias
        return pred
    
    def evaluate_RMSE(self, rating_matrix):
        tot = cnt = 0
        for user in range(self.n_users):
            item_ids, ratings = self.get_items_rated_by_user(user)
            err = np.dot(self.X[:, item_ids.tolist()].T, self.W[:, user]) + self.b[0, item_ids.tolist()].T + self.d[0, user] - ratings
            tot += np.sum(err*err)
            cnt += len(ratings)
        return np.sqrt(tot/cnt)

# Applied in Sample Test ex.dat

In [3]:
ratings = pd.read_csv('ex.dat', sep=' ', names=['user_id', 'item_id', 'rating'], encoding='latin-1')
ratings.head()

Unnamed: 0,user_id,item_id,rating
0,0,0,5.0
1,0,1,4.0
2,0,3,2.0
3,0,4,2.0
4,1,0,5.0


In [4]:
rating_train = ratings.as_matrix()
rating_train.shape

(22, 3)

In [11]:
recommender = MF(rating_train, K = 2, max_iter = 1000)
recommender.fit()

# Applied in MovieLens 100K

In [12]:
rating_base = pd.read_csv('../Content-Based/ml-100k/ua.base', sep='\t', names=['user_id', 'item id', 'rating', 'timestamp'])
rating_test = pd.read_csv('../Content-Based/ml-100k/ua.test', sep='\t', names=['user_id', 'item id', 'rating', 'timestamp'])
rating_base.head()

Unnamed: 0,user_id,item id,rating,timestamp
0,1,1,5,874965758
1,1,2,3,876893171
2,1,3,4,878542960
3,1,4,3,876893119
4,1,5,3,889751712


In [13]:
rating_train = rating_base.as_matrix()
rating_test = rating_test.as_matrix()

In [14]:
rating_train[:, :2] -= 1
rating_test[:, :2] -= 1

In [15]:
recommender = MF(rating_train, K = 50, lam = .01, lr = 50, max_iter = 30, print_every=5, user_based = 1)
recommender.fit()

Iteration 5, loss = 1.2765, RMSE = 1.1820
Iteration 10, loss = 0.4970, RMSE = 0.9964
Iteration 15, loss = 0.4040, RMSE = 0.8989
Iteration 20, loss = 0.3525, RMSE = 0.8397
Iteration 25, loss = 0.3200, RMSE = 0.8000
Iteration 30, loss = 0.2977, RMSE = 0.7716


In [16]:
recommender = MF(rating_train, K = 50, lam = .01, lr = 50, max_iter = 30, print_every=5, user_based = 0)
recommender.fit()

Iteration 5, loss = 1.2455, RMSE = 1.1523
Iteration 10, loss = 0.4701, RMSE = 0.9691
Iteration 15, loss = 0.3802, RMSE = 0.8720
Iteration 20, loss = 0.3307, RMSE = 0.8132
Iteration 25, loss = 0.2998, RMSE = 0.7743
Iteration 30, loss = 0.2789, RMSE = 0.7469


# Applied in MovieLens 1M

In [30]:
ratings = pd.read_csv('ml-1m/ratings.dat', sep='::', names=['user_id', 'item id', 'rating', 'timestamp'])
ratings.head()

  if __name__ == '__main__':


Unnamed: 0,user_id,item id,rating,timestamp
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [31]:
rating_train, rating_test = train_test_split(ratings, test_size = 0.33, random_state = 0)
print('Train length:', rating_train.shape[0])
print('Test length:', rating_test.shape[0])

Train length: 670140
Test length: 330069


In [32]:
rating_train = rating_train.as_matrix()
rating_test = rating_test.as_matrix()

In [33]:
recommender = MF(rating_train, K = 2, lam = .1, lr = 2, max_iter = 10, print_every=2, user_based = 1)
recommender.fit()

Iteration 2, loss = 1.0839, RMSE = 0.7097
Iteration 4, loss = 0.9868, RMSE = 0.7097
Iteration 6, loss = 0.9079, RMSE = 0.7097
Iteration 8, loss = 0.8424, RMSE = 0.7097
Iteration 10, loss = 0.7871, RMSE = 0.7097


In [14]:
recommender.evaluate_RMSE(rating_test)

0.7098783205705074