In [3]:
import numpy as np
import random

# MF

In [4]:
class MatrixFactorization(object):

    def __init__(self, K=20, alpha=1e-6, beta = 0.0):
        self.K = K  
        self.alpha = alpha
        self.beta = beta


    def fit(self, X, n_user, n_item, n_iter = 100):
        self.R = X.copy()
        self.samples = X.copy()

        self.user_factors = np.random.rand(n_user, self.K)
        self.item_factors = np.random.rand(n_item, self.K)
                
        #stochastic gradient descent 
        self.loss = []
        for i in range(n_iter):
            self.sgd()
            mse = self.mse()
            self.loss.append((i, mse))  
    
    def sgd(self):
        np.random.shuffle(self.samples)
        for user, item, rating in self.samples:
            err = rating - self.predict_pair(user, item)  
            
            # update parameter
            self.user_factors[user] += self.alpha * (err * self.item_factors[item] - self.beta * self.user_factors[user])
            self.item_factors[item] += self.alpha * (err * self.user_factors[user] - self.beta * self.item_factors[item])            
    
    def mse(self):
        predicted = self.predict(self.R)
        error = np.hstack((self.R, np.array(predicted).reshape(-1, 1)))
        error = np.sqrt(pow((error[:, 2] - error[:, 3]), 2).mean())
        return error
    
    def predict_pair(self, user, item):
        return np.inner(self.user_factors[user], self.item_factors[item])
    
    def predict(self, X):
        rate = []
        for row in X:
            rate.append(self.predict_pair(row[0], row[1]))            
        return rate
    
    def get_full_matrix(self):
        return np.inner(self.user_factors, self.item_factors)

# Bias MF

In [5]:
class BiasMatrixFactorization(object):

    def __init__(self, K=20, alpha=1e-6, beta = 0.0):
        self.K = K  
        self.alpha = alpha
        self.beta = beta

            
    def fit(self, X, n_user, n_item, n_iter = 100):
        self.R = X.copy()
        self.samples = X.copy()

        self.user_factors = np.random.rand(n_user, self.K)
        self.item_factors = np.random.rand(n_item, self.K)
        
        self.bias_u = np.zeros(n_user)
        self.bias_i = np.zeros(n_item)
        self.bias = np.mean(X[:, 2])
        
        #stochastic gradient descent         
        self.loss = []
        for i in range(n_iter):
            self.sgd()
            mse = self.mse()
            self.loss.append((i, mse))
            
    def sgd(self):
        np.random.shuffle(self.samples)
        for user, item, rating in self.samples:
            err = rating - self.predict_pair(user, item)
            
            # update parameter
            self.bias_u[user] += self.alpha * (err - self.beta * self.bias_u[user])
            self.bias_i[item] += self.alpha * (err - self.beta * self.bias_i[item])
            
            self.user_factors[user] += self.alpha * (err * self.item_factors[item] - self.beta * self.user_factors[user])
            self.item_factors[item] += self.alpha * (err * self.user_factors[user] - self.beta * self.item_factors[item])            
    
    def mse(self):
        predicted = self.predict(self.R)
        error = np.hstack((self.R, np.array(predicted).reshape(-1, 1)))
        error = np.sqrt(pow((error[:, 2] - error[:, 3]), 2).mean())
        return error
    
    def predict_pair(self, user, item):
        return self.bias + self.bias_u[user] + self.bias_i[item] + np.inner(self.user_factors[user], self.item_factors[item])
    
    def predict(self, X):
        rate = []
        for row in X:
            rate.append(self.predict_pair(row[0], row[1]))            
        return rate
    
    def get_full_matrix(self):
        return self.bias + self.bias_u.reshape(-1, 1) + self.bias_i + np.inner(self.user_factors, self.item_factors)

# load data

In [6]:
import pandas as pd

def load_ml100k():
    samples = pd.read_csv('../ml-100k/u.data', sep = '\t', header=None)
    
    samples = samples.iloc[:, :3]
    samples.columns = ['user', 'item', 'rate']
    
    samples['user'] = samples['user'] - 1
    samples['item'] = samples['item'] - 1
    
    return samples

# main

In [7]:
df = np.array(load_ml100k())

n_user = np.unique(df[:, 0]).max() + 1
n_item = np.unique(df[:, 1]).max() + 1
n_rate = np.unique(df[:, 2]).max()

random.shuffle(df)
train_size = int(df.shape[0] * 0.8)
train_df = df[:train_size]
test_df = df[train_size:]

In [8]:
#Matrix Factorization
MF = MatrixFactorization(K = 20, alpha = 0.01, beta = 0.5)
MF.fit(train_df, n_user, n_item, n_iter = 10)

pre = MF.predict(test_df)
ret1 = np.hstack((test_df, np.array(pre).reshape(-1, 1)))
np.sqrt(pow((ret1[:, 2] - ret1[:, 3]), 2).mean())

1.0615200471939445

In [79]:
pred = pd.DataFrame(MF.get_full_matrix(), index=df.index, columns=df.columns)

In [68]:
col_names = ['user_id', 'item_id', 'rating', 'timestamp']
udata_df = pd.read_csv('../ml-100k/u.data', names=col_names, sep='\t')

df = udata_df.pivot_table(index='user_id', columns='item_id', values='rating')

In [80]:
fill_df = df.copy()
fill_df[df.isnull()] = pred[df.isnull()]

In [81]:
df.head()

item_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,...,,,,,,,,,,
2,4.0,,,,,,,,,2.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,3.0,,,,,,,,,...,,,,,,,,,,


In [82]:
fill_df.head()

item_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,...,3.50148,4.485066,4.900022,4.529244,4.083364,3.830128,3.610718,3.720068,4.114721,3.512091
2,4.0,2.779634,2.582645,3.145491,2.940252,3.371623,3.303167,3.256236,3.303304,2.0,...,3.21264,4.134098,4.530516,4.183231,3.772309,3.5165,3.304836,3.439337,3.799487,3.249839
3,2.612019,2.173595,2.015914,2.46037,2.300819,2.627473,2.5839,2.545853,2.582516,2.638213,...,2.48803,3.271268,3.54999,3.199903,2.886775,2.645256,2.536317,2.863869,2.939479,2.662368
4,3.923513,3.268515,3.037502,3.699895,3.448536,3.971102,3.885377,3.830359,3.885672,3.970765,...,3.727909,4.770907,5.317601,4.888683,4.442439,4.12219,3.850677,4.072624,4.533027,3.86311
5,4.0,3.0,2.285806,2.78076,2.595316,2.976943,2.920081,2.877449,2.919081,2.980278,...,2.827553,3.651512,4.001036,3.649873,3.321468,3.080423,2.949379,3.032882,3.348651,2.862934


In [83]:
pred.head()

item_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,3.623585,3.015586,2.805445,3.413931,3.187991,3.656931,3.584397,3.533743,3.583942,3.659268,...,3.50148,4.485066,4.900022,4.529244,4.083364,3.830128,3.610718,3.720068,4.114721,3.512091
2,3.340107,2.779634,2.582645,3.145491,2.940252,3.371623,3.303167,3.256236,3.303304,3.372559,...,3.21264,4.134098,4.530516,4.183231,3.772309,3.5165,3.304836,3.439337,3.799487,3.249839
3,2.612019,2.173595,2.015914,2.46037,2.300819,2.627473,2.5839,2.545853,2.582516,2.638213,...,2.48803,3.271268,3.54999,3.199903,2.886775,2.645256,2.536317,2.863869,2.939479,2.662368
4,3.923513,3.268515,3.037502,3.699895,3.448536,3.971102,3.885377,3.830359,3.885672,3.970765,...,3.727909,4.770907,5.317601,4.888683,4.442439,4.12219,3.850677,4.072624,4.533027,3.86311
5,2.950815,2.456467,2.285806,2.78076,2.595316,2.976943,2.920081,2.877449,2.919081,2.980278,...,2.827553,3.651512,4.001036,3.649873,3.321468,3.080423,2.949379,3.032882,3.348651,2.862934


In [84]:
from sklearn.metrics import mean_squared_error
np.sqrt(mean_squared_error(pred, fill_df))

0.2679597736944209

In [9]:
#Bias Matrix Factorization
BMF = BiasMatrixFactorization(K=20, alpha = 0.01, beta = 0.5)
BMF.fit(train_df, n_user, n_item, n_iter = 10)

pre2 = BMF.predict(test_df[:, :2])
ret2 =np.hstack((test_df, np.array(pre2).reshape(-1, 1)))
np.sqrt(pow((ret2[:, 2] - ret2[:, 3]), 2).mean())

KeyboardInterrupt: 