In [40]:
from matplotlib import pyplot as plt
%matplotlib inline
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

In [41]:
place = pd.read_csv('../Data/whyout_data/place.csv') # shape(4697,10), place idx에서 23개가 비어있음
product = pd.read_csv('../Data/whyout_data/product.csv') # shape(5834,11), product idx에서 538개가 비어있음
video = pd.read_csv('../Data/whyout_data/video.csv') # shape(3250, 9), video idx에서 315개가 비어있음

In [42]:
user_place = pd.read_csv('../Data/whyout_data/user_place.csv') # shape (31176,4697) 유저 x 장소 아이템 # 8756명이 장소를 평가하지 않음
user_product = pd.read_csv('../Data/whyout_data/user_product.csv') # shape(31176,5834) 유저 x 상품 아이템
user_video = pd.read_csv('../Data/whyout_data/user_video.csv') # shape (31176, 3250) 유저 x 영상 아이템

In [44]:
drop_user_place = pd.read_csv('../Data/whyout_data/drop_user_place.csv') # shape (22420,4697) 유저 x 장소 아이템 # 8756명이 장소를 평가하지 않음
drop_user_product = pd.read_csv('../Data/whyout_data/drop_user_product.csv') # shape(2996,5834) 유저 x 상품 아이템
drop_user_video = pd.read_csv('../Data/whyout_data/drop_user_video.csv') # shape (11067, 3250) 유저 x 영상 아이템

In [49]:
class SGD():
    def __init__(self, R, k, learning_rate, reg_param, epochs, verbose=False):
        """
        :param R: rating matrix
        :param k: latent parameter
        :param learning_rate: alpha on weight update
        :param reg_param: beta on weight update
        :param epochs: training epochs
        :param verbose: print status
        """
        self._R = R
        self._num_users, self._num_items = R.shape
        self._k = k
        self._learning_rate = learning_rate
        self._reg_param = reg_param
        self._epochs = epochs
        self._verbose = verbose
        self.cost_list = []


    def fit(self):
        """
        training Matrix Factorization : Update matrix latent weight and bias

        참고: self._b에 대한 설명
        - global bias: input R에서 평가가 매겨진 rating의 평균값을 global bias로 사용
        - 정규화 기능. 최종 rating에 음수가 들어가는 것 대신 latent feature에 음수가 포함되도록 해줌.

        :return: training_process
        """
        # init latent features
        self._U = np.random.normal(size=(self._num_users, self._k))
        self._V = np.random.normal(size=(self._num_items, self._k))

        # init biases
        self._b_U = np.zeros(self._num_users)
        self._b_V = np.zeros(self._num_items)
        self._b = np.mean(self._R[np.where(self._R != 0)])

        # train while epochs
        self._training_process = []
        for epoch in range(self._epochs):
            # rating이 존재하는 index를 기준으로 training
            xi, yi = self._R.nonzero()
            for i, j in zip(xi, yi):
                self.gradient_descent(i, j, self._R[i, j])
            cost = self.cost()
            self._training_process.append((epoch, cost))

            # print status
            if self._verbose == True and ((epoch + 1) % 10 == 0):
                self.cost_list.append(cost)
                print("Iteration: %d ; cost = %.4f" % (epoch + 1, cost))
        return self.cost_list


    def cost(self):
        """
        compute root mean square error
        :return: rmse cost
        """
        # xi, yi: R[xi, yi]는 nonzero인 value를 의미한다.
        xi, yi = self._R.nonzero()
        # predicted = self.get_complete_matrix()
        cost = 0
        #print(len(xi), len(yi))
        count = 0
        for x, y in zip(xi, yi):
            count += 1
            cost += pow(self._R[x, y] - self.get_prediction(x, y), 2)
            # if self._R[x,y]== 6:
            #     print(cost, self._R[x,y], self.get_prediction(x,y))
        return np.sqrt(cost/len(xi))


    def gradient(self, error, i, j):
        """
        gradient of latent feature for GD
        :param error: rating - prediction error
        :param i: user index
        :param j: item index
        :return: gradient of latent feature tuple
        """
        du = (error * self._V[j, :]) - (self._reg_param * self._U[i, :]) # user에 대해 gradient -> item에 대해 미분
        dv = (error * self._U[i, :]) - (self._reg_param * self._V[j, :])
        return du, dv


    def gradient_descent(self, i, j, rating):
        """
        graident descent function
        :param i: user index of matrix
        :param j: item index of matrix
        :param rating: rating of (i,j)
        """
        # get error
        prediction = self.get_prediction(i, j)
        error = rating - prediction

        # update biases
        self._b_U[i] += self._learning_rate * (error - self._reg_param * self._b_U[i])
        self._b_V[j] += self._learning_rate * (error - self._reg_param * self._b_V[j])

        # update latent feature
        du, dv = self.gradient(error, i, j)
        self._U[i, :] += self._learning_rate * du
        self._V[j, :] += self._learning_rate * dv


    def get_prediction(self, i, j):
        """
        get predicted rating: user_i, item_j
        :return: prediction of r_ij
        """
        return self._b + self._b_U[i] + self._b_V[j] + self._U[i, :].dot(self._V[j, :].T)


    def get_complete_matrix(self):
        """
        computer complete matrix UXV + U.bias + V.bias + global bias

        - UXV 행렬에 b_U[:, np.newaxis]를 더하는 것은 각 열마다 bias를 더해주는 것
        - b_V[np.newaxis:, ]를 더하는 것은 각 행마다 bias를 더해주는 것
        - b를 더하는 것은 각 element마다 bias를 더해주는 것

        - newaxis: 차원을 추가해줌. 1차원인 Latent들로 2차원의 R에 행/열 단위 연산을 해주기위해 차원을 추가하는 것.

        :return: complete matrix R^
        """
        return self._b + self._b_U[:, np.newaxis] + self._b_V[np.newaxis:, ] + self._U.dot(self._V.T)
    
    def print_results(self):
        print("User Latent U:")
        print(self._U)
        print("Item Latent V:")
        print(self._V.T)
        print("U x V:")
        print(self._U.dot(self._V.T))
        print("bias:")
        print(self._b)
        print("User Latent bias:")
        print(self._b_U)
        print("Item Latent bias:")
        print(self._b_V)
        print("Final R matrix:")
        print(self.get_complete_matrix())
        print("Final RMSE:")
        print(self._training_process[self._epochs-1][1])

In [50]:
if __name__ == "__main__":
    # rating matrix - User X Item : (사용자 수 X 아이템 수)
    # U, V is (사용자 수 X k), (k X 아이템 수) matrix
    R = np.array(drop_user_place)
    factorizer = SGD(R, k=50, learning_rate=0.01, reg_param=0.01, epochs=1000, verbose=True)
    cost_list = factorizer.fit()
    complete_matrix = factorizer.get_complete_matrix()

    R1 = np.array(drop_user_product)
    factorizer = SGD(R1, k=20, learning_rate=0.01, reg_param=0.01, epochs=1000, verbose=True)
    cost_list1 = factorizer.fit()
    complete_matrix1 = factorizer.get_complete_matrix()

    R2 = np.array(drop_user_video)
    factorizer = SGD(R2, k=40, learning_rate=0.01, reg_param=0.01, epochs=1000, verbose=True)
    cost_list2 = factorizer.fit()
    complete_matrix2 = factorizer.get_complete_matrix()

Iteration: 10 ; cost = 0.7635
Iteration: 20 ; cost = 0.7375
Iteration: 30 ; cost = 0.7178
Iteration: 40 ; cost = 0.6985
Iteration: 50 ; cost = 0.6782
Iteration: 60 ; cost = 0.6563
Iteration: 70 ; cost = 0.6328
Iteration: 80 ; cost = 0.6080
Iteration: 90 ; cost = 0.5823
Iteration: 100 ; cost = 0.5562
Iteration: 110 ; cost = 0.5301
Iteration: 120 ; cost = 0.5046
Iteration: 130 ; cost = 0.4799
Iteration: 140 ; cost = 0.4563
Iteration: 150 ; cost = 0.4340
Iteration: 160 ; cost = 0.4131
Iteration: 170 ; cost = 0.3937
Iteration: 180 ; cost = 0.3757
Iteration: 190 ; cost = 0.3592
Iteration: 200 ; cost = 0.3442
Iteration: 210 ; cost = 0.3306
Iteration: 220 ; cost = 0.3184
Iteration: 230 ; cost = 0.3075
Iteration: 240 ; cost = 0.2979
Iteration: 250 ; cost = 0.2893
Iteration: 260 ; cost = 0.2817
Iteration: 270 ; cost = 0.2750
Iteration: 280 ; cost = 0.2691
Iteration: 290 ; cost = 0.2638
Iteration: 300 ; cost = 0.2590
Iteration: 310 ; cost = 0.2547
Iteration: 320 ; cost = 0.2509
Iteration: 330 ; 

In [None]:
# 사용자의 평가 유무 확인
def find_zero_indices(df):
    return df.index[df.eq(0).all(axis=1)].tolist()

place_zero_indices = find_zero_indices(user_place)
product_zero_indices = find_zero_indices(user_product)
video_zero_indices = find_zero_indices(user_video)

#common_zero_indices = set(place_zero_indices) & set(product_zero_indices) & set(video_zero_indices)
#place_product_zero_indices = set(place_zero_indices) & set(product_zero_indices)
#place_video_zero_indices = set(place_zero_indices) & set(video_zero_indices)
#product_video_zero_indices = set(product_zero_indices) & set(video_zero_indices)

print('장소를 평가하지 않은 사용자 수:',len(place_zero_indices))
print('상품을 평가하지 않은 사용자 수:',len(product_zero_indices))
print('영상을 평가하지 않은 사용자 수:',len(video_zero_indices))
#print('모두 평가하지 않은 사용자 수:',len(common_zero_indices))
#print('장소 & 상품을 같이 평가하지 않은 사용자 수:',len(place_product_zero_indices))
#print('장소 & 영상을 같이 평가하지 않은 사용자 수:',len(place_video_zero_indices))
#print('상품 & 영상을 같이 평가하지 않은 사용자 수:',len(product_video_zero_indices))

In [None]:
df = pd.DataFrame(complete_matrix).astype(dtype='float16')
df.to_csv('drop_place_k50epochs1000.csv', index=False)

df1 = pd.DataFrame(complete_matrix1).astype(dtype='float16')
df1.to_csv('drop_product_k20epochs1000.csv', index=False)

df2 = pd.DataFrame(complete_matrix2).astype(dtype='float16')
df2.to_csv('drop_video_k40epochs1000.csv', index=False)

In [15]:
# sgd_place_df_full = pd.read_csv('../Data/whyout_data/sgd_result/full_data/place_epochs2000.csv', index_col=0)
# sgd_product_df_full = pd.read_csv('../Data/whyout_data/sgd_result/full_data/product_epochs2000.csv', index_col=0)
# sgd_video_df_full = pd.read_csv('../Data/whyout_data/sgd_result/full_data/video_epochs2000.csv', index_col=0)

In [14]:
sgd_place_df = pd.read_csv('../Data/whyout_data/sgd_result/del_data/place_k50epochs1000.csv', index_col=0)
sgd_product_df = pd.read_csv('../Data/whyout_data/sgd_result/del_data/product_k20epochs1000.csv', index_col=0)
sgd_video_df = pd.read_csv('../Data/whyout_data/sgd_result/del_data/video_k40epochs1000.csv', index_col=0)

In [16]:
def recommend_items(df_sgd_preds, user_id, item_df, ratings_df, num_recommendations):
    """
    :param df_sgd_preds: SGD 예측 평점 데이터프레임
    :param user_id: 사용자 ID
    :param df: 아이템 데이터프레임
    :param ratings_df: 사용자 평점 데이터프레임
    :param num_recommendations: 추천할 아이템 수
    :return: 사용자가 이미 평가한 아이템 인덱스와 추천 아이템 데이터프레임
    """
    # 원본 평점 데이터에서 user_id에 해당하는 행을 DataFrame으로 가져온다.
    user_data = ratings_df.loc[user_id]

    # 사용자가 이미 평가한 상품의 인덱스를 추출
    user_history_indices = [int(i) for i in user_data[user_data > 0].index.tolist()]
    print(len(user_history_indices),user_history_indices)

    # SVD를 통해 예측된 사용자의 상품 평점을 기반으로 상품 데이터를 정렬
    user_row_number = user_id - 1
    sorted_user_predictions = df_sgd_preds.iloc[user_row_number].sort_values(ascending=False)

    # 사용자가 아직 평가하지 않은 상품 목록을 추출
    recommendations = item_df.loc[~item_df.index.isin(user_history_indices)]

    # 예측된 상품 평점 데이터와 원본 상품 데이터를 합침
    predictions_df = pd.DataFrame(sorted_user_predictions).reset_index()
    predictions_df['index'] = predictions_df['index'].astype(int)
    predictions_df.set_index('index', inplace=True)

    # recommendations의 인덱스와 predictions_df의 인덱스를 기준으로 병합
    recommendations = recommendations.merge(predictions_df, left_index=True, right_index=True)
    
    # 컬럼 이름을 바꾸고 정렬해서 반환
    recommendations = recommendations.rename(columns={user_row_number: 'Predictions'}).sort_values('Predictions', ascending=False).iloc[:num_recommendations]
    print(f"user {user_id}에게 추천해줄 {num_recommendations}개 아이템 id : {recommendations['idx'].values}")
    return user_history_indices, recommendations


In [32]:
already_rated, predictions1 = recommend_items(sgd_video_df_full, 10, video, drop_user_place, 10)

830 [7, 9, 11, 12, 14, 15, 17, 19, 20, 22, 30, 31, 36, 38, 39, 40, 44, 45, 51, 53, 54, 55, 60, 63, 66, 67, 68, 69, 70, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 87, 90, 91, 92, 93, 97, 99, 104, 105, 106, 108, 109, 110, 112, 116, 117, 118, 119, 120, 122, 124, 125, 127, 128, 129, 131, 132, 133, 134, 138, 140, 141, 145, 147, 153, 154, 156, 159, 183, 184, 185, 186, 189, 190, 192, 193, 194, 195, 197, 198, 200, 202, 203, 204, 206, 207, 222, 233, 236, 241, 242, 243, 244, 245, 249, 250, 253, 255, 256, 257, 258, 260, 261, 262, 263, 264, 266, 267, 268, 272, 277, 278, 279, 280, 282, 283, 285, 288, 289, 290, 291, 299, 300, 309, 310, 312, 314, 315, 316, 317, 320, 325, 329, 334, 337, 338, 339, 340, 345, 346, 347, 355, 356, 358, 359, 364, 366, 368, 369, 375, 377, 381, 382, 383, 386, 387, 389, 390, 393, 394, 397, 398, 399, 405, 409, 413, 415, 416, 418, 419, 420, 421, 422, 423, 427, 428, 429, 432, 433, 434, 435, 436, 437, 442, 446, 449, 450, 451, 454, 456, 458, 460, 461, 462, 464, 465, 466, 467, 473,

In [33]:
predictions1

Unnamed: 0,idx,콘텐츠 카테고리,관련 제품여부,관련 장소여부,관련 장소의 공간유형 (위 장소 표 참고),좋아요 수,북마크 수,시청 수,댓글 수,Predictions
1325,1522,"[0, 0, 0, 0, 1, 0]",0,0,,0.0,0.0,0.0,0.0,11.88
3228,3552,"[0, 0, 1, 0, 0, 0]",0,0,,0.0,0.0,0.0,0.0,10.64
214,289,"[0, 0, 0, 0, 0, 1]",0,1,"[0,0,0,0,1,0,0,0,0]",0.0,0.0,0.021653,0.0,10.31
518,639,"[0, 0, 0, 1, 0, 0]",0,0,,0.0,0.0,0.020955,0.0,9.18
3245,3569,"[0, 0, 0, 1, 0, 0]",0,0,,0.0,0.0,0.0,0.0,9.18
3187,3511,"[0, 0, 0, 0, 0, 1]",0,0,,0.0,0.0,0.0,0.0,8.76
217,292,"[0, 0, 0, 0, 0, 1]",0,1,,0.0,0.0,0.022817,0.0,8.72
3186,3510,"[0, 0, 0, 0, 0, 1]",0,0,,0.0,0.0,0.0,0.0,7.875
3151,3475,"[0, 0, 0, 0, 0, 1]",0,1,"[0,0,0,0,0,0,0,1,0]",0.0,0.0,0.000233,0.0,7.72
784,929,"[0, 0, 0, 1, 0, 0]",0,0,,0.0,0.0,0.020955,0.0,7.72


In [31]:
already_rated, predictions = recommend_items(sgd_product_df, 3, product, index_del_user_product, 10)

60 [117, 164, 204, 212, 402, 578, 795, 911, 926, 1178, 1179, 1182, 1188, 1189, 1190, 1193, 1194, 1195, 1200, 1201, 1218, 1223, 1384, 1389, 1408, 1415, 1416, 1467, 1496, 1498, 1501, 1502, 1503, 1550, 1566, 1721, 2138, 2513, 2531, 2552, 2559, 2577, 3272, 3274, 3439, 3453, 3530, 3570, 3689, 3719, 3756, 3989, 4057, 4066, 4471, 4481, 4484, 4811, 5817, 5833]
user 3에게 추천해줄 10개 아이템 id : [3215  467 5239 6140 4945 5883 3084  894 1204 1276]


In [26]:
predictions

Unnamed: 0,idx,카테고리1,카테고리2,색상,기능성(합집합),휴대성,친환경,디자인,클릭 수,좋아요 수,북마크 수,Predictions
3076,3277,"[0,0,0,0,0,0,0,0,0,1,0]","[0,1,0]","[0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]","[0,0,1,0,0,0]","[0,0,1,0]","[0,1,0]","[0,0,1,0,0,0,1,1,0,0]",,0.0,0.0,23.73
386,550,"[0,1,0,0,0,0,0,0,0,0,0]","[1,0,0]","[0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0]","[1,0,1,0,0,0]","[0,0,1,0]","[0,1,0]","[0,0,0,0,0,0,1,1,0,0]",,0.0,0.0,23.28
3462,3762,"[0,1,0,0,0,0,0,0,0,0,0]","[1,0,0]","[0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]",,,,,,0.0,0.0,22.66
5610,6150,"[0,0,0,0,0,0,0,0,1,0,0]","[0,1,0]","[0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]","[1,0,1,1,0,0]","[0,0,1,0]","[0,0,0]","[0,0,1,0,0,0,0,1,0,0]",,0.0,0.0,22.6
3776,4082,"[0,0,0,0,0,0,1,0,0,0,0]","[1,0,0]","[0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0]","[0,1,1,0,0,0]","[1,1,0,0]","[0,0,0]","[0,0,0,0,0,0,0,0,0,0]",,0.0,0.0,22.06
5789,6329,"[0,0,0,0,0,0,0,0,0,1,0]","[0,1,0]",,"[0,0,0,0,0,0]","[0,0,0,0]","[0,0,0]","[0,0,0,0,0,0,0,0,0,0]",,0.0,0.0,21.94
4888,5428,"[0,0,0,0,0,0,0,0,0,1,0]","[0,1,0]","[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0]",,,,,,0.0,0.0,21.69
5667,6207,"[0,1,0,0,0,0,0,0,0,0,0]","[1,0,0]","[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0]","[0,0,0,0,0,0]","[0,0,0,0]","[0,0,0]","[0,0,0,0,0,0,0,0,0,0]",,0.0,0.0,21.08
355,519,"[0,0,0,0,1,0,0,0,0,0,0]","[1,0,0]","[0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0]","[1,0,0,0,1,0]","[0,1,0,0]","[0,1,0]","[1,1,0,0,0,1,0,0,0,0]",,0.0,0.0,21.02
863,1030,"[0,0,0,0,0,0,0,0,0,0,1]","[0,0,1]","[0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0]","[1,1,1,0,0,0]","[0,0,1,0]","[0,1,0]","[1,1,0,0,0,0,0,1,0,0]",,0.0,0.0,20.9
