In [24]:
# Code tính singular decomposition.
import numpy as np
from numpy import linalg as LA

m, n = 2, 3
A = np.random.rand(m, n)

U, S, V = LA.svd(A)
s1 = np.array([S[0], 0, 0])
s2 = np.array([0, S[1], 0])
Sigma = np.array([s1, s2])

print("USV = %s"%np.dot(U, np.dot(Sigma, V)))
print("A matrix: %s \n"%A)

USV = [[0.77229737 0.33077246 0.54309846]
 [0.55533186 0.01796953 0.47545175]]
A matrix: [[0.77229737 0.33077246 0.54309846]
 [0.55533186 0.01796953 0.47545175]] 



In [52]:
# Apply SVD for recommendation system 
import pandas as pd
import numpy as np
from numpy import linalg as LA
from sklearn.metrics.pairwise import cosine_similarity
from scipy import sparse

class SVD_RS:
    def __init__(self, Y_data, K, user_based = 1):
        self.Y_data = Y_data
        self.K = K
        self.user_based = user_based
        # number of users and items. Remember to add 1 since id start from 0
        self.n_users = int(np.max(Y_data[:, 0])) + 1
        self.n_items = int(np.max(Y_data[:, 1])) + 1
        self.n_ratings = Y_data.shape[0]
        # normalize data
        self.Ybar_data = self.Y_data.copy().astype(np.float32)
    
    def normalize_Y(self):
        if self.user_based:
            user_col = 0
            item_col = 1
            n_objects = self.n_users
            
        else:
            user_col = 1
            item_col = 0
            n_objects = self.n_items
            
        users = self.Y_data[:, user_col]
        self.mu = np.zeros((n_objects, ))
        for n in np.arange(n_objects):
            ids = np.where(users == n)[0].astype(np.int32)
            item_ids = self.Y_data[ids, 1]
            ratings = self.Y_data[ids, 2].astype(np.float32)
            m = np.mean(ratings)
            if np.isnan(m):
                m = 0
            self.mu[n] = m
            self.Ybar_data[ids, 2] = ratings - self.mu[n]
            self.Ybar = sparse.coo_matrix((self.Ybar_data[:, 2],
                 (self.Ybar_data[:, 1], self.Ybar_data[:, 0])), (self.n_items, self.n_users))
            self.Ybar = self.Ybar.todense()
            
            
    def fit(self):
        self.normalize_Y()
        U, S, V = LA.svd(self.Ybar)
        Uk = U[:, :self.K]
        Sk = S[:self.K]
        Vkt = V[:self.K, :]
        self.res = Uk.dot(np.diag(Sk)).dot(Vkt)
        
    def pred(self, u, i):
        u = int(u)
        i = int(i)
        
        if self.user_based:
            bias = self.mu[u]
        else:
            bias = self.mu[i]
        pred = self.res[i, u] + bias
        if pred < 1:
            return 1
        if pred > 5:
            return 5
        return pred
    
    def pred_for_user(self, user_id):
        ids = np.where(self.Y_data_n[:, 0] == user_id)[0]
        items_rated_by_u = self.Y_data_n[ids, 1].tolist()
        
        y_pred = self.X.dot(self.W[:, user_id]) + self.mu[user_id]
        predicted_ratings = []
        for i in np.arange(self.n_items):
            if i not in items_rated_by_u:
                predicted_ratings.append((i, y_pred[i]))
        return predicted_ratings
    
    def evaluate_RMSE(self, rate_test):
        n_tests = rate_test.shape[0]
        SE = 0 # squared error
        for n in np.arange(n_tests):
            pred = self.pred(rate_test[n, 0], rate_test[n, 1])
            SE += (pred - rate_test[n, 2])**2 

        RMSE = np.sqrt(SE/n_tests)
        return RMSE

In [25]:
import pandas as pd
import numpy as np
columns = ['user_id', 'item_id', 'rating', 'timestamp']
movie_length = pd.read_csv('ml-1m/ratings.dat', header = 0, \
                           names = columns, sep = '::', engine = 'python')
movie_length = movie_length.sort_values(['user_id', 'item_id'])
movie_length.head()

Unnamed: 0,user_id,item_id,rating,timestamp
39,1,1,5,978824268
24,1,48,5,978824351
38,1,150,5,978301777
43,1,260,4,978300760
22,1,527,5,978824195


In [35]:
#declare split_rate for train/total ratings
split_rate = 2/3

def split_train_test(dataset):
    gb = dataset.groupby('user_id')
    ls = [gb.get_group(x) for x in gb.groups]
    items = [x for x in gb.groups]
    index_size = [{'i': i, 'index':gb.groups[i], 'size':len(gb.groups[i])} for i in items]
    index_train = pd.Int64Index([])
    index_test = pd.Int64Index([])
    for x in index_size:
        np.random.shuffle(x['index'].values)
        le = int(x['size']*split_rate)
        index_train = index_train.append(x['index'][:le])
        index_test = index_test.append(x['index'][le:])
    train = dataset.iloc[index_train].values
    test = dataset.iloc[index_test].values
    #minus id to 1 to index start from 0
    train[:, 0] -= 1
    train[:, 1] -= 1
    test[:, 0] -= 1
    test[:, 1] -= 1
    return train, test

train, test = split_train_test(movie_length)

In [53]:
rs = SVD_RS(train, K = 2, user_based = 0)
rs.fit()

  out=out, **kwargs)
  ret = ret.dtype.type(ret / rcount)


In [54]:
RMSE = rs.evaluate_RMSE(test)
print('Item-based MF, RMSE = %.3f'%RMSE)

Item-based MF, RMSE = 0.953
