In [1]:
from sklearn.decomposition import TruncatedSVD, randomized_svd
from scipy.sparse.linalg import svds

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

In [2]:
user = pd.read_csv('../Data/whyout_data/user.csv') # shape(31178,13), user idx에서 2354개가 비어있음
place = pd.read_csv('../Data/whyout_data/place.csv') # shape(4697,10), place idx에서 23개가 비어있음
product = pd.read_csv('../Data/whyout_data/product.csv') # shape(5834,11), product idx에서 538개가 비어있음
video = pd.read_csv('../Data/whyout_data/video.csv') # shape(3250, 9), video idx에서 315개가 비어있음
#user_place = pd.read_csv('../Data/whyout_data/user_place.csv') # shape (31176,4697) 유저 x 장소 아이템 # 8756명이 장소를 평가하지 않음
#user_product = pd.read_csv('../Data/whyout_data/user_product.csv') # shape(31176,5834) 유저 x 상품 아이템
#user_video = pd.read_csv('../Data/whyout_data/user_video.csv') # shape (31177, 3250) 유저 x 영상 아이템
#null_del_user_place = pd.read_csv('../Data/whyout_data/null_del_user_place.csv')
#null_del_user_product = pd.read_csv('../Data/whyout_data/null_del_user_product.csv')
#null_del_user_video = pd.read_csv('../Data/whyout_data/null_del_user_video.csv')
index_null_del_user_place = pd.read_csv('../Data/whyout_data/index_null_del_user_place.csv', index_col=0)
index_null_del_user_product = pd.read_csv('../Data/whyout_data/index_null_del_user_product.csv', index_col=0)
index_null_del_user_video = pd.read_csv('../Data/whyout_data/index_null_del_user_video.csv', index_col=0)
#normal_index_null_del_user_place = pd.read_csv('../Data/whyout_data/normal_index_null_del_user_place.csv', index_col=0)
#normal_index_null_del_user_product = pd.read_csv('../Data/whyout_data/normal_index_null_del_user_product.csv', index_col=0)
#ormal_index_null_del_user_video = pd.read_csv('../Data/whyout_data/normal_index_null_del_user_video.csv', index_col=0)

In [3]:
index_null_del_user_place = index_null_del_user_place.astype(int)
index_null_del_user_product = index_null_del_user_product.astype(int)
index_null_del_user_video = index_null_del_user_video.astype(int)

In [4]:
# 데이터프레임을 numpy matrix로 만듦
user_product_matrix = index_null_del_user_product.values

# 사용자의 평균 평점
user_ratings_mean = np.mean(user_product_matrix, axis=1)

# 사용자-상품에 대해 사용자 평균 평점을 뺀 것
matrix_user_mean = user_product_matrix - user_ratings_mean.reshape(-1,1)

In [5]:
pd.DataFrame(matrix_user_mean, columns = index_null_del_user_product.columns).head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,5825,5826,5827,5828,5829,5830,5831,5832,5833,5834
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,-0.000514,-0.000514,-0.000514,-0.000514,-0.000514,-0.000514,-0.000514,-0.000514,-0.000514,-0.000514,...,-0.000514,-0.000514,-0.000514,-0.000514,-0.000514,-0.000514,-0.000514,-0.000514,-0.000514,-0.000514
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
# U 행렬, sigma 행렬, V 전치 행렬을 반환.
# U, sigma, Vt = svds(matrix, k = 20)
U, sigma, Vt = randomized_svd(user_product_matrix, n_components=20)

# 특이값 벡터를 대각 행렬 형태로 변환해줌
sigma = np.diag(sigma)

In [7]:
# U, Sigma, Vt의 행렬곱을 수행하면, 다시 원본 행렬로 복원이 된다. 
# 거기에 + 사용자 평균 rating을 적용한다. 
svd_user_predicted_ratings = np.matmul(np.matmul(U,sigma), Vt) + user_ratings_mean.reshape(-1,1)

In [8]:
def compute_cos_similarity(v1, v2):
  norm1 = np.sqrt(np.sum(np.square(v1)))
  norm2 = np.sqrt(np.sum(np.square(v2)))
  dot = np.dot(v1, v2)
  return dot / (norm1 * norm2)

In [9]:
# 유저 latent vector와 코사인 유사도를 사용하여 추천
# 나와 비슷한 취향을 가진 다른 사용자의 아이템을 추천
my_id, my_vector = 0, U[0]
best_match, best_match_id, best_match_vector = -1, -1,  []

for user_id, user_vector in enumerate(U):
    if my_id != user_id:
        cos_similarity = compute_cos_similarity(my_vector, user_vector)
        if cos_similarity > best_match:
            best_match = cos_similarity
            best_match_id = user_id
            best_match_vector = user_vector

print('Best Match: {}, Best Match ID: {}'.format(best_match, best_match_id))

Best Match: 0.6195422230924141, Best Match ID: 25139


In [10]:
index_null_del_user_product[0:1].shape

(1, 5834)

In [11]:
recommend_list_cos_similarity = []
for i, (log1, log2) in enumerate(zip(index_null_del_user_product.iloc[my_id], index_null_del_user_product.iloc[best_match_id])):
    if log1 < 1 and log2 > 0:
        recommend_list_cos_similarity.append(i)
print(recommend_list_cos_similarity)

[3896]


In [22]:
# 아이템 latent vector와 코사인 유사도를 사용하여 아이템 추천
# 내가 본 아이템과 비슷한 아이템을 추천
my_id, my_vector = 3, Vt.T[3]
best_match, best_match_id, best_match_vector = -1, -1,  []

for user_id, user_vector in enumerate(Vt.T):
    if my_id != user_id:
        cos_similarity = compute_cos_similarity(my_vector, user_vector)
        if cos_similarity > best_match:
            best_match = cos_similarity
            best_match_id = user_id
            best_match_vector = user_vector

print('Best Match: {}, Best Match ID: {}'.format(best_match, best_match_id))

Best Match: 0.9996609017198038, Best Match ID: 670


In [23]:
recommend_list_cos_similarity = []
for i, user_vector in enumerate(index_null_del_user_product):
    if index_null_del_user_product.iloc[i][my_id] > 0.0000001:
        recommend_list_cos_similarity.append(i)
print(recommend_list_cos_similarity)

[]
