In [1]:
import pandas as pd 
import numpy as np
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics.pairwise import cosine_similarity, euclidean_distances, manhattan_distances
from scipy.stats.stats import pearsonr
from scipy import sparse 
from itertools import product

  from scipy.stats.stats import pearsonr


In [10]:
class CF(object):
    """Neighbourhood-based collaborative filtering (user-user or item-item).

    Ratings are supplied as rows ``(user_id, item_id, rating, ...)`` with
    zero-based integer ids.  In item-item mode (``uuCF`` falsy) the first two
    columns are swapped once in the constructor (extra columns are dropped),
    so internally column 0 always holds the entity whose neighbours are
    searched ("users" below) and column 1 the entity being scored ("items").

    Attributes set by the constructor:
        uuCF, k, dist_func: the constructor arguments.
        Y_data: rating rows in the internal column order.
        n_users, n_items: largest id in column 0 / column 1, plus one.
        rating_sparse, rating_array: raw ratings as CSR / dense matrices of
            shape ``(n_items, n_users)``.

    Attributes set by ``fit`` / ``refresh`` / ``update``:
        mu: mean rating per internal "user" (0 for ids without ratings).
        Ybar_data: copy of ``Y_data`` whose column 2 is mean-centred.
        Ybar: mean-centred ratings, CSR matrix of shape ``(n_items, n_users)``.
        S: ``dist_func(Ybar.T, Ybar.T)``.
    """

    def __init__(self, Y_data, k=0, dist_func = cosine_similarity, uuCF = 1):
        """Store the ratings and build the raw rating matrices.

        Args:
            Y_data: 2-D array whose first three columns are user id, item id
                and rating.  Must contain at least one row (``np.max`` is
                taken over the id columns).
            k: neighbourhood size.  ``0`` means "use every neighbour", because
                the selection is the slice ``[-k:]``.
            dist_func: callable ``f(A, B)`` returning the pairwise score matrix
                between the rows of ``A`` and ``B``.
                NOTE(review): neighbours are chosen as the *largest* scores,
                so a true distance function would pick the farthest
                neighbours -- confirm before using one.
            uuCF: truthy for user-user CF, falsy for item-item CF.
        """
        self.uuCF = uuCF # user-user (1) or item-item (0) CF
        self.Y_data = Y_data if uuCF else Y_data[:, [1, 0, 2]]
        self.k = k
        self.dist_func = dist_func
        self.Ybar_data = None
        # number of users and items. Remember to add 1 since id starts from 0
        self.n_users = int(np.max(self.Y_data[:, 0])) + 1
        self.n_items = int(np.max(self.Y_data[:, 1])) + 1
        self._build_rating_matrix()

    def _build_rating_matrix(self):
        """(Re)build ``rating_sparse`` / ``rating_array`` from ``Y_data``."""
        # Index arrays are cast to integers explicitly instead of relying on
        # scipy to coerce float indices.
        users = self.Y_data[:, 0].astype(np.intp)
        items = self.Y_data[:, 1].astype(np.intp)
        values = self.Y_data[:, 2].astype(float)
        self.rating_sparse = sparse.coo_matrix(
            (values, (items, users)), shape=(self.n_items, self.n_users)).tocsr()
        self.rating_array = self.rating_sparse.toarray()

    def __update(self, new_data):
        """Append ``new_data`` to the stored ratings and grow the id ranges.

        The previous implementation stacked a second (n_items, n_users) block
        next to ``Ybar``, which doubled the user axis and never touched
        ``Y_data``; here the rows are simply merged into ``Y_data`` so that a
        following normalisation sees old and new ratings together.

        Raises:
            ValueError: if ``new_data`` is not 2-D with at least three
                columns, or (from ``np.concatenate``) if its column count
                differs from the stored ``Y_data``.
        """
        new_data = np.asarray(new_data)
        if new_data.ndim != 2 or new_data.shape[1] < 3:
            raise ValueError("new_data must be a 2-D array with at least 3 columns")
        if new_data.shape[0] == 0:
            return
        if not self.uuCF:
            # same column swap the constructor applies in item-item mode
            new_data = new_data[:, [1, 0, 2]]
        self.Y_data = np.concatenate((self.Y_data, new_data), axis=0)
        self.n_users = max(self.n_users, int(np.max(new_data[:, 0])) + 1)
        self.n_items = max(self.n_items, int(np.max(new_data[:, 1])) + 1)
        self._build_rating_matrix()

    def update(self,new_data):
        """Add rating rows and recompute means, ``Ybar`` and ``S``.

        Args:
            new_data: rows in the same ``(user, item, rating, ...)`` layout
                that was passed to the constructor.  Works whether or not
                ``fit`` has been called before.
        """
        self.__update(new_data)
        self.normalize_Y()
        self.similarity()

    def normalize_Y(self):
        """Mean-centre every rating by its "user" mean.

        Sets ``mu``, ``Ybar_data`` and ``Ybar``.  Ids that have no rating get
        a mean of 0.
        """
        users = self.Y_data[:, 0].astype(np.intp)
        items = self.Y_data[:, 1].astype(np.intp)
        ratings = self.Y_data[:, 2].astype(float)

        # Per-user sum and count in one pass instead of one scan per user.
        counts = np.bincount(users, minlength=self.n_users)
        sums = np.bincount(users, weights=ratings, minlength=self.n_users)
        self.mu = np.zeros((self.n_users,))
        has_ratings = counts > 0
        self.mu[has_ratings] = sums[has_ratings] / counts[has_ratings]

        centred = ratings - self.mu[users]

        # An integer Y_data cannot hold fractional values: writing the centred
        # ratings into a plain copy would silently truncate them.
        if np.issubdtype(self.Y_data.dtype, np.integer):
            self.Ybar_data = self.Y_data.astype(float)
        else:
            self.Ybar_data = self.Y_data.copy()
        self.Ybar_data[:, 2] = centred

        # Keep the matrix sparse: only observed ratings are stored.
        self.Ybar = sparse.coo_matrix(
            (centred, (items, users)), shape=(self.n_items, self.n_users)).tocsr()

    def similarity(self):
        """Compute ``S`` by applying ``dist_func`` to the columns of ``Ybar``."""
        self.S = self.dist_func(self.Ybar.T,self.Ybar.T)

    def cosine_similarity_n_space(self,m1, m2, batch_size=100):
        """Cosine similarity between rows of ``m1`` and ``m2``, in row batches.

        Args:
            m1, m2: matrices with the same number of columns.
            batch_size: positive int, number of ``m1`` rows per batch.

        Returns:
            Dense array of shape ``(m1.shape[0], m2.shape[0])``.
        """
        assert m1.shape[1] == m2.shape[1] and isinstance(batch_size, int) == True
        if batch_size <= 0:
            raise ValueError("batch_size must be positive")

        n_rows = m1.shape[0]
        ret = np.empty((n_rows, m2.shape[0]))
        for start in range(0, n_rows, batch_size):
            end = min(start + batch_size, n_rows)
            ret[start:end] = cosine_similarity(m1[start:end], m2)
        return ret

    def calc_sim(self, A):
        """Cosine similarity between the rows of the dense array ``A``.

        Rows with zero magnitude get a similarity of 0 with everything.
        """
        similarity = np.dot(A, A.T)
        # squared magnitude of each row sits on the diagonal
        square_mag = np.diag(similarity)
        # zero-magnitude rows yield inf here; they are reset to 0 just below
        with np.errstate(divide='ignore'):
            inv_square_mag = 1 / square_mag
        inv_square_mag[np.isinf(inv_square_mag)] = 0
        inv_mag = np.sqrt(inv_square_mag)
        # scale columns, then rows, by the inverse magnitudes
        cosine = similarity * inv_mag
        return cosine.T * inv_mag

    def refresh(self):
        """
        Normalize data and calculate similarity matrix again (after
        some few ratings added)
        """
        self.normalize_Y()
        self.similarity()
        print("fit_DONE")

    def fit(self):
        """Train the model; alias for ``refresh``."""
        self.refresh()

    def __pred(self, u, i, normalized = 1):
        """Predict the rating of internal "user" ``u`` for internal "item" ``i``.

        Args:
            u: id along column 0 of ``Y_data``.
            i: id along column 1 of ``Y_data``.
            normalized: truthy to return the mean-centred prediction, falsy to
                add ``mu[u]`` back.

        Returns:
            The similarity-weighted average of the neighbours' mean-centred
            ratings (0 when nobody rated ``i``), plus ``mu[u]`` if requested.
        """
        u, i = int(u), int(i)
        baseline = 0.0 if normalized else self.mu[u]

        # Step 1: everyone who rated i
        ids = np.where(self.Y_data[:, 1] == i)[0]
        users_rated_i = (self.Y_data[ids, 0]).astype(np.int32)
        if users_rated_i.size == 0:
            return baseline

        # Step 2: similarity between u and those raters
        sim = self.S[u, users_rated_i]
        # Step 3: keep the k largest (k == 0 keeps all of them)
        nearest = np.argsort(sim)[-self.k:]
        nearest_s = sim[nearest]
        # Step 4: their mean-centred ratings of i
        r = self.Ybar[i, users_rated_i[nearest]].toarray().ravel()
        # 1e-8 avoids dividing by zero when all weights vanish
        weighted = r.dot(nearest_s) / (np.abs(nearest_s).sum() + 1e-8)
        return weighted + baseline

    def pred(self, u, i, normalized = 1):
        """Predict the rating of real user ``u`` for real item ``i``.

        In item-item mode the two ids are swapped before delegating, matching
        the column swap done in the constructor.
        """
        if self.uuCF: return self.__pred(u, i, normalized)
        return self.__pred(i, u, normalized)

    def recommend(self, u):
        """
        Determine all items should be recommended for user u.
        The decision is made based on all i such that the normalized
        prediction is > 0, among the items not rated by u yet.
        ``u`` is an id along column 0 of ``Y_data``.
        """
        already_rated = set(self.list_item_rate_by_u(u))
        recommended_items = []
        for i in range(self.n_items):
            if i not in already_rated:
                rating = self.__pred(u, i)
                if rating > 0:
                    recommended_items.append(i)

        return recommended_items

    def list_rating_label_by_u(self, u):
        """Return the ratings given by ``u`` (column 0 id), in stored order."""
        ids = np.where(self.Y_data[:, 0] == u)[0]
        return self.Y_data[ids, 2].tolist()

    def list_item_rate_by_u(self, u):
        """Return the item ids rated by ``u`` (column 0 id), in stored order."""
        ids = np.where(self.Y_data[:, 0] == u)[0]
        return self.Y_data[ids, 1].tolist()

    def recommend2(self, u,k_nn=2):
        """Re-predict, on the original scale, the items ``u`` already rated.

        Args:
            u: id along column 0 of ``Y_data``.
            k_nn: accepted for backward compatibility only.  It used to be
                forwarded into the ``normalized`` slot of the predictor, so
                ``k_nn=0`` added the user mean twice; the neighbourhood size
                is always ``self.k``.

        Returns:
            Rounded predictions, one per distinct rated item, ordered by
            ascending item id.
        """
        rated_items = set(self.list_item_rate_by_u(u))
        rate_pred = []
        for i in range(self.n_items):
            if i in rated_items:
                rating = self.__pred(u, i, normalized=1) + self.mu[u]
                rate_pred.append(round(rating))
        return rate_pred

    def print_recommendation(self):
        """
        print all items which should be recommended for each user
        """
        print ('Recommendation: ')
        for u in range(self.n_users):
            recommended_items = self.recommend(u)
            if self.uuCF:
                print ('    Recommend item(s):', recommended_items, 'for user', u)
            else:
                print( '    Recommend item', u, 'for user(s) : ', recommended_items)

    def full_Y(self, normalized=1):
        """Fill every unrated cell of ``rating_array`` with a prediction.

        A cell counts as unrated when no row of ``Y_data`` refers to it, so a
        genuine rating of 0 is kept.  ``rating_array`` is modified in place.

        Args:
            normalized: forwarded to the predictor.  The default (1) writes
                mean-centred predictions next to raw ratings, as before; pass
                0 to get predictions on the same scale as the ratings.

        Returns:
            ``pd.DataFrame`` over ``rating_array`` (rows are column-1 ids,
            columns are column-0 ids of ``Y_data``).
        """
        users = self.Y_data[:, 0].astype(np.intp)
        items = self.Y_data[:, 1].astype(np.intp)
        observed = np.zeros(self.rating_array.shape, dtype=bool)
        observed[items, users] = True

        rows, cols = np.where(~observed)
        for item, user in zip(rows, cols):
            # rating_array is laid out in internal ids, so use the internal
            # predictor; going through pred() would swap them again in
            # item-item mode.
            self.rating_array[item, user] = self.__pred(user, item, normalized)
        return pd.DataFrame(self.rating_array)

In [3]:
from myfm.utils.benchmark_data import MovieLens100kDataManager

In [4]:
# Accessor object for the MovieLens-100k benchmark data (from myfm).
data_manager = MovieLens100kDataManager()

In [14]:
# Load all rating records; the displayed frame has user_id, movie_id,
# rating and timestamp columns.
rating = data_manager.load_rating_all()
rating

Unnamed: 0,user_id,movie_id,rating,timestamp
0,196,242,3,1997-12-04 15:55:49
1,186,302,3,1998-04-04 19:22:22
2,22,377,1,1997-11-07 07:18:36
3,244,51,2,1997-11-27 05:02:03
4,166,346,1,1998-02-02 05:33:16
...,...,...,...,...
99995,880,476,3,1997-11-22 05:10:44
99996,716,204,5,1997-11-17 19:39:03
99997,276,1090,1,1997-09-20 22:49:55
99998,13,225,2,1997-12-17 22:52:36


In [124]:
# Toy ratings file: space-separated triples read as user_id, item_id, rating.
r_cols = ['user_id', 'item_id', 'rating']
ratings = pd.read_csv(
    'E:\\recommendation-system\\CONTROL\\ex.dat',
    sep=' ',
    names=r_cols,
    encoding='latin-1',
)
Y_data = ratings.values

# Fit user-user CF with two neighbours and print the raw rating matrix.
rs = CF(Y_data, k=2, uuCF=1)
rs.fit()
print(pd.DataFrame(rs.rating_array))
# rs.print_recommendation()
# rs.Ybar.toarray()

fit_DONE
     0    1    2    3    4    5    6
0  5.0  5.0  2.0  0.0  1.0  0.0  0.0
1  4.0  0.0  0.0  0.0  0.0  2.0  0.0
2  0.0  4.0  1.0  0.0  0.0  1.0  1.0
3  2.0  2.0  3.0  4.0  4.0  0.0  4.0
4  2.0  0.0  4.0  0.0  0.0  0.0  5.0


In [125]:
# Fill the empty (zero) cells of the rating matrix with predicted values
# and display the completed matrix.
rs.full_Y()

          0         1         2         3         4         5         6
0  5.000000  5.000000  2.000000 -1.410684  1.000000  0.176938 -0.634336
1  4.000000  0.479426 -0.171047 -1.146659 -1.333333  2.000000  0.045201
2  0.905594  4.000000  1.000000 -1.835868 -1.780127  1.000000  1.000000
3  2.000000  2.000000  3.000000  4.000000  4.000000  0.590268  4.000000
4  2.000000 -2.068297  4.000000  1.567174  1.556025  1.590268  5.000000


In [15]:
# Hold out half of the ratings. A fixed seed makes the split (and every
# number computed from it) reproducible under Restart & Run All.
SPLIT_SEED = 42
rate_train, rate_test = train_test_split(rating, test_size=0.5, random_state=SPLIT_SEED)

X_train = rate_train.values
X_test = rate_test.values

# Shift the two id columns down by one: CF assumes ids start at 0.
X_train[:, :2] -= 1
X_test[:, :2] -= 1


In [13]:
# Inspect the training rows (ids already shifted to start at 0).
X_train

array([[297, 133, 5, Timestamp('1998-01-07 14:22:46')],
       [136, 410, 5, Timestamp('1997-12-06 18:38:10')],
       [585, 30, 4, Timestamp('1998-01-06 05:30:31')],
       ...,
       [612, 477, 5, Timestamp('1998-03-30 03:07:42')],
       [709, 509, 4, Timestamp('1997-12-14 01:51:23')],
       [503, 504, 4, Timestamp('1998-02-18 21:39:17')]], dtype=object)

In [31]:
# Fit user-user CF (two neighbours) on the complete rating table,
# after shifting both id columns to start at 0.
test = rating.values
test[:, :2] -= 1
rs = CF(test, k=2, uuCF=1)
rs.fit()

# Raw rating matrix held by the model.
pd.DataFrame(rs.rating_array)

# n_tests = X_test.shape[0]
# SE = 0 # squared error
# for n in range(n_tests):
#     pred = rs.pred(X_test[n, 0], X_test[n, 1], normalized = 0)
#     SE += (pred - X_test[n, 2])**2 

# RMSE = np.sqrt(SE/n_tests)
# print ('User-user CF, RMSE =', RMSE)

fit_DONE


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,933,934,935,936,937,938,939,940,941,942
0,5.0,4.0,0.0,0.0,4.0,4.0,0.0,0.0,0.0,4.0,...,2.0,3.0,4.0,0.0,4.0,0.0,0.0,5.0,0.0,0.0
1,3.0,0.0,0.0,0.0,3.0,0.0,0.0,0.0,0.0,0.0,...,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0
2,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,3.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,4.0,...,5.0,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0
4,3.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1677,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1678,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1679,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1680,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [32]:
# rs.full_Y()
# x,y = np.where(rs.rating_array == 0)
# print(x.shape)
# print(y.shape)
# test = pd.DataFrame(rs.rating_array)

# test.apply(lambda x: rs.pred() if x==0 else x)
# for i in range(1400000):
#     print(i)
# Pairwise similarity matrix computed during rs.fit().
pd.DataFrame(rs.S)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,933,934,935,936,937,938,939,940,941,942
0,1.000000,0.043411,0.011051,0.059303,0.134514,0.103373,0.110556,0.180891,0.012253,-0.000621,...,0.025835,-0.047952,0.087224,0.007718,0.074378,0.078714,0.067433,0.028790,-0.031270,0.032123
1,0.043411,1.000000,0.013658,-0.017016,0.035770,0.094503,0.089408,0.055640,0.027294,0.097846,...,0.012853,-0.028798,0.056659,0.197835,0.090009,0.032505,0.015053,-0.017344,0.012068,0.039173
2,0.011051,0.013658,1.000000,-0.059638,0.016037,-0.017158,0.016141,0.041177,-0.010093,0.023856,...,0.001615,0.000658,-0.006888,0.036157,-0.018513,-0.006240,-0.023907,0.034414,-0.009187,0.001489
3,0.059303,-0.017016,-0.059638,1.000000,0.007373,-0.053929,-0.025604,0.136046,0.016082,-0.013588,...,0.011895,0.002174,-0.028000,-0.025021,0.022882,-0.005960,0.279818,0.258594,0.064504,-0.019222
4,0.134514,0.035770,0.016037,0.007373,1.000000,0.038484,0.067874,0.140106,0.010195,0.014335,...,0.070014,-0.070821,0.024278,0.038672,0.093567,0.051782,0.029540,0.036234,0.043318,0.099324
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
938,0.078714,0.032505,-0.006240,-0.005960,0.051782,-0.047520,0.013584,0.025026,0.018475,0.014387,...,0.032608,-0.004944,0.064036,-0.035230,0.019929,1.000000,0.016835,-0.030376,-0.023190,0.004451
939,0.067433,0.015053,-0.023907,0.279818,0.029540,-0.012071,0.005844,0.078222,0.004491,0.044572,...,-0.029460,0.054646,-0.059929,-0.032935,0.022646,0.016835,1.000000,0.102008,-0.011483,0.065414
940,0.028790,-0.017344,0.034414,0.258594,0.036234,0.001559,0.001943,0.057949,0.040748,0.026179,...,-0.025764,-0.031663,0.057585,0.094183,-0.094838,-0.030376,0.102008,1.000000,-0.019055,0.000666
941,-0.031270,0.012068,-0.009187,0.064504,0.043318,0.036605,0.106252,0.030609,0.026259,0.062067,...,0.114179,-0.035153,-0.002613,0.031585,0.007572,-0.023190,-0.011483,-0.019055,1.000000,0.040354
