In [52]:
from sklearn.decomposition import TruncatedSVD, randomized_svd
from scipy.sparse.linalg import svds

import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings("ignore")

In [53]:
#user = pd.read_csv('./whyout_data/user.csv') # shape(31178,13), user idx에서 2354개가 비어있음
#place = pd.read_csv('./whyout_data/place.csv') # shape(4697,10), place idx에서 23개가 비어있음
product = pd.read_csv('./whyout_data/product.csv') # shape(5834,11), product idx에서 538개가 비어있음
#video = pd.read_csv('./whyout_data/video.csv') # shape(3250, 9), video idx에서 315개가 비어있음
#user_place = pd.read_csv('./whyout_data/user_place.csv', index_col=0) # shape (31176,4697) 유저 x 장소 아이템 # 8756명이 장소를 평가하지 않음
user_product = pd.read_csv('./whyout_data/user_product.csv', index_col=0) # shape(31176,5834) 유저 x 상품 아이템
#user_video = pd.read_csv('./whyout_data/user_video.csv', index_col=0) # shape (31177, 3250) 유저 x 영상 아이템

In [54]:
# 데이터프레임을 numpy matrix로 만듦
user_product_matrix = user_product.values

# 사용자의 평균 평점
user_ratings_mean = np.mean(user_product_matrix, axis=1)

# 사용자-상품에 대해 사용자 평균 평점을 뺀 것
matrix_user_mean = user_product_matrix - user_ratings_mean.reshape(-1,1)

In [55]:
pd.DataFrame(matrix_user_mean, columns = user_product.columns).head()

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,5825,5826,5827,5828,5829,5830,5831,5832,5833,5834
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,-0.000514,-0.000514,-0.000514,-0.000514,-0.000514,-0.000514,-0.000514,-0.000514,-0.000514,-0.000514,...,-0.000514,-0.000514,-0.000514,-0.000514,-0.000514,-0.000514,-0.000514,-0.000514,-0.000514,-0.000514


In [56]:
# U 행렬, sigma 행렬, V 전치 행렬을 반환.
# U, sigma, Vt = svds(matrix, k = 20)
U, sigma, Vt = randomized_svd(user_product_matrix, n_components=10)

# 특이값 벡터를 대각 행렬 형태로 변환해줌
sigma = np.diag(sigma)

In [57]:
# U, Sigma, Vt의 행렬곱을 수행하면, 다시 원본 행렬로 복원이 된다. 
# 거기에 + 사용자 평균 rating을 적용한다. 
svd_user_predicted_ratings = np.matmul(np.matmul(U,sigma), Vt) + user_ratings_mean.reshape(-1,1)

In [58]:
def compute_cos_similarity(v1, v2):
  norm1 = np.sqrt(np.sum(np.square(v1)))
  norm2 = np.sqrt(np.sum(np.square(v2)))
  dot = np.dot(v1, v2)
  return dot / (norm1 * norm2)

In [59]:
# 유저 latent vector와 코사인 유사도를 사용하여 추천
# 나와 비슷한 취향을 가진 다른 사용자의 아이템을 추천
my_id, my_vector = 0, U[0]
best_match, best_match_id, best_match_vector = -1, -1,  []

for user_id, user_vector in enumerate(U):
    if my_id != user_id:
        cos_similarity = compute_cos_similarity(my_vector, user_vector)
        if cos_similarity > best_match:
            best_match = cos_similarity
            best_match_id = user_id
            best_match_vector = user_vector

print('Best Match: {}, Best Match ID: {}'.format(best_match, best_match_id))

Best Match: 0.6218655043823833, Best Match ID: 27901


In [73]:
user_product[0:1].shape

(1, 5834)

In [68]:
recommend_list_cos_similarity = []
for i, log in enumerate((zip(user_product[my_id:my_id+1], user_product[best_match_id:best_match_id+1]))):
    log1, log2 = log
    print(log)
    if log1 < 1 and log2 > 0:
        recommend_list_cos_similarity.append(i)
print(recommend_list_cos_similarity)

('1', '1')


TypeError: '<' not supported between instances of 'str' and 'int'

In [None]:
# 아이템 latent vector와 코사인 유사도를 사용하여 아이템 추천
# 내가 본 아이템과 비슷한 아이템을 추천
my_id, my_vector = 0, Vt.T[0]
best_match, best_match_id, best_match_vector = -1, -1,  []

for user_id, user_vector in enumerate(Vt.T):
    if my_id != user_id:
        cos_similarity = compute_cos_similarity(my_vector, user_vector)
        if cos_similarity > best_match:
            best_match = cos_similarity
            best_match_id = user_id
            best_match_vector = user_vector

print('Best Match: {}, Best Match ID: {}'.format(best_match, best_match_id))

Best Match: 0.9616892029552246, Best Match ID: 1614


In [None]:
recommend_list_cos_similarity = []
for i, user_vector in enumerate(user_product):
    if user_product[i][my_id] > 0.9:
        recommend_list_cos_similarity.append(i)
print(recommend_list_cos_similarity)

KeyError: 0

In [None]:
df_svd_preds = pd.DataFrame(svd_user_predicted_ratings, columns = user_product.columns)

In [None]:
df_svd_preds

Unnamed: 0,1,2,3,4,5,6,7,8,9,10,...,5825,5826,5827,5828,5829,5830,5831,5832,5833,5834
0,-2.817532e-19,1.057056e-17,9.588158e-20,2.735803e-17,-2.265083e-17,-6.798762e-20,9.880101e-18,1.518284e-29,-2.902752e-30,-6.701316e-30,...,-3.322882e-19,0.0,5.448227e-21,0.0,3.598288e-22,3.598288e-22,0.0,8.639644e-22,2.095593e-24,1.650084e-17
1,-2.196724e-20,1.275621e-17,-2.438027e-20,6.269619e-17,-1.382589e-17,2.607912e-18,2.064097e-17,4.686453e-30,-8.822061e-31,-2.364395e-30,...,-1.690933e-19,0.0,2.564265e-21,0.0,2.620213e-22,2.620213e-22,0.0,-7.500928e-22,3.593051e-25,1.603402e-17
2,2.551442e-18,4.181152e-16,4.099385e-18,-1.489213e-16,1.537804e-18,1.367819e-17,-2.383149e-17,-3.907810e-30,5.324750e-31,1.537114e-30,...,-2.701614e-20,0.0,6.164533e-22,0.0,1.792336e-21,1.792336e-21,0.0,1.418185e-21,-2.076964e-25,-1.763151e-20
3,-2.087007e-19,-4.388832e-17,-7.969849e-19,2.462979e-17,8.449781e-20,-1.427656e-18,6.319015e-18,-9.015984e-30,1.616083e-30,3.182130e-30,...,4.682301e-20,0.0,-9.956656e-22,0.0,-3.456681e-22,-3.456681e-22,0.0,-1.795815e-21,-1.401035e-24,1.379105e-17
4,3.388617e-10,3.283232e-08,1.189960e-09,1.587172e-08,-8.435398e-11,2.444259e-09,5.542569e-09,4.432015e-21,-7.149538e-22,-1.817935e-21,...,2.586279e-10,0.0,1.224484e-12,0.0,4.686870e-13,4.686870e-13,0.0,7.488330e-12,3.257625e-15,9.583531e-09
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
31171,2.898090e-08,6.872930e-06,7.517836e-08,-5.074999e-08,-1.953015e-07,3.808046e-07,3.070960e-07,4.450830e-19,-7.723987e-20,-1.728055e-19,...,9.748553e-09,0.0,9.824102e-11,0.0,3.629058e-11,3.629058e-11,0.0,3.998055e-10,1.927175e-13,1.459710e-07
31172,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,...,0.000000e+00,0.0,0.000000e+00,0.0,0.000000e+00,0.000000e+00,0.0,0.000000e+00,0.000000e+00,0.000000e+00
31173,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,...,0.000000e+00,0.0,0.000000e+00,0.0,0.000000e+00,0.000000e+00,0.0,0.000000e+00,0.000000e+00,0.000000e+00
31174,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,0.000000e+00,...,0.000000e+00,0.0,0.000000e+00,0.0,0.000000e+00,0.000000e+00,0.0,0.000000e+00,0.000000e+00,0.000000e+00


In [None]:
def recommend_movies(df_svd_preds, user_id, ori_movies_df, ori_ratings_df, num_recommendations):
    
    # 원본 평점 데이터에서 user id에 해당하는 데이터를 뽑아낸다. (사용자가 이미 평가한 영화 평점을 가져옴)
    user_data = ori_ratings_df[ori_ratings_df.index == user_id]

    # 위에서 뽑은 user_data와 원본 영화 데이터를 합친다. (사용자의 평가 기록을 원본 데이터와 병합하여 사용자가 이미 시청한 영화 목록을 가져와 평점이 높은 순으로 정렬)
    user_history = user_data.merge(ori_movies_df, on = 'movieId').sort_values(['rating'], ascending=False)

    # 현재는 index로 적용이 되어있으므로 user_id - 1을 해야함.
    user_row_number = user_id - 1 
    
    # SVD를 통해 예측된 사용자의 영화 평점을 기반으로, 사용자 index에 따라 영화 평점이 높은 순으로 영화 데이터 정렬
    sorted_user_predictions = df_svd_preds.iloc[user_row_number].sort_values(ascending=False)
    
    # 원본 영화 데이터에서 사용자가 본 영화 데이터를 제외한 데이터를 추출 (isin : movieid가 있는지 여부, ~가 부정 연산자)
    recommendations = ori_movies_df[~ori_movies_df['movieId'].isin(user_history['movieId'])]

    # 사용자의 영화 평점이 높은 순으로 정렬된 데이터와 위 recommendations을 합친다. 
    recommendations = recommendations.merge(pd.DataFrame(sorted_user_predictions).reset_index(), on = 'movieId')
    
    # 컬럼 이름 바꾸고 정렬해서 return
    recommendations = recommendations.rename(columns = {user_row_number: 'Predictions'}).sort_values('Predictions', ascending = False).iloc[:num_recommendations, :]
                      
    return user_history, recommendations

In [None]:
already_rated, predictions = recommend_movies(df_svd_preds, 1, product, user_product, 50)

KeyError: 'movieId'

In [None]:
predictions