In [1]:
import pandas as pd
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity
from new_user_similarity import find_similar_index
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Metadata tables for users, places, products and videos.
# The trailing "(rows, cols)" notes are the shapes recorded by the notebook author.
user = pd.read_csv('../Data/whyout_data/user.csv') # (31177,3)
user_interest = pd.read_csv('user_interest.csv') # (31178,15); NOTE(review): read from the working directory, unlike the other tables - confirm the path
final_user_interest = pd.read_csv('../Data/whyout_data/final_user_interest.csv') # (31178,15)
place = pd.read_csv('../Data/whyout_data/place.csv') # shape(4697,10)
product = pd.read_csv('../Data/whyout_data/product.csv') # shape(5821,11)
video = pd.read_csv('../Data/whyout_data/video.csv') # shape(3250, 9)

In [3]:
# User behavior (interaction) data, one table per item type.
# Presumably one row per user and one column per item, judging by the recorded shapes - TODO confirm.
user_place = pd.read_csv('../Data/whyout_data/col_user_place.csv') # (31177,4697)
user_product = pd.read_csv('../Data/whyout_data/col_user_product.csv') # (31177,5821)
user_video = pd.read_csv('../Data/whyout_data/col_user_video.csv') # (31177, 3250)

In [4]:
# Behavior data per item type after removing users who have no behavior data for that item type.
drop_user_place = pd.read_csv('../Data/whyout_data/drop_user_place.csv') # (22420,4697) 
drop_user_product = pd.read_csv('../Data/whyout_data/drop_user_product.csv') # (2994,5821); NOTE(review): the matching idx table is annotated with 2294 rows - confirm which count is right
drop_user_video = pd.read_csv('../Data/whyout_data/drop_user_video.csv') # (11067, 3250)

In [5]:
# Index (idx) tables for the filtered behavior data, i.e. the data with no-behavior users removed.
# The functions below read the 'idx' column and the 4th column (position 3) of these tables.
drop_user_place_idx = pd.read_csv('../Data/whyout_data/drop_user_place_idx.csv') # (22420,4)
drop_user_product_idx = pd.read_csv('../Data/whyout_data/drop_user_product_idx.csv') # (2294,4); NOTE(review): drop_user_product is annotated with 2994 rows - confirm which count is right
drop_user_video_idx = pd.read_csv('../Data/whyout_data/drop_user_video_idx.csv') # (11067, 4)

In [6]:
# SGD results (U x V) obtained from the full user behavior data.
# The file names suggest k=50 and 1000 epochs - TODO confirm against the training run.
full_data_sgd_place_preds = pd.read_csv('../Data/whyout_data/sgd_result/full_data/user_place_k50epochs1000.csv')
full_data_sgd_product_preds = pd.read_csv('../Data/whyout_data/sgd_result/full_data/user_product_k50epochs1000.csv')
full_data_sgd_video_preds = pd.read_csv('../Data/whyout_data/sgd_result/full_data/user_video_k50epochs1000.csv')

In [7]:
# SGD results (U x V) obtained from the filtered data, i.e. with no-behavior users removed per item type.
# The file names suggest a different k per item type (40 / 20 / 30) - TODO confirm against the training run.
del_data_sgd_place_preds = pd.read_csv('../Data/whyout_data/sgd_result/del_data/drop_user_place_k40epochs1000.csv')
del_data_sgd_product_preds = pd.read_csv('../Data/whyout_data/sgd_result/del_data/drop_user_product_k20epochs1000.csv')
del_data_sgd_video_preds = pd.read_csv('../Data/whyout_data/sgd_result/del_data/drop_user_video_k30epochs1000.csv')

In [8]:
# Latent factors from the SGD runs on the filtered data (no-behavior users removed).
# The original note mentions both user_latent (U) and item_latent (V); only the user latent tables are loaded here.
place_user_latent = pd.read_csv('../Data/whyout_data/sgd_result/del_data/drop_user_place_user_latent_k40epochs1000.csv')
product_user_latent = pd.read_csv('../Data/whyout_data/sgd_result/del_data/drop_user_product_user_latent_k20epochs1000.csv')
video_user_latent = pd.read_csv('../Data/whyout_data/sgd_result/del_data/drop_user_video_user_latent_k30epochs1000.csv')

In [9]:
def recommend_step1(item, sgd_video_preds, user_id, item_df, ratings_df, ratings_df_idx, num_recommendations):
    """Return the top-N item ids for one user, ranked by predicted score.

    Only items the user has not interacted with (behavior value <= 0) are
    considered.

    Parameters
    ----------
    item : str
        Item type name (e.g. 'place'). Not used in the computation; kept so
        existing callers keep working.
    sgd_video_preds : pandas.DataFrame
        Predicted score matrix. Despite the name, callers pass the matrix of
        whichever item type is being recommended. The row is selected by
        label, the columns by position.
    user_id : int
        Value looked up in ``ratings_df_idx['idx']``.
    item_df : pandas.DataFrame
        Item metadata with an ``'idx'`` column; rows are selected by position.
    ratings_df : pandas.DataFrame
        Behavior matrix. Its column labels must be convertible to ``int``
        because they are used as column positions into the predictions.
    ratings_df_idx : pandas.DataFrame
        Mapping table. Column ``'idx'`` holds user ids and the column at
        position 3 holds the row label used for ``ratings_df`` and
        ``sgd_video_preds``.
    num_recommendations : int
        Maximum number of items to return.

    Returns
    -------
    list
        ``item_df['idx']`` values of the recommended items, best first.

    Raises
    ------
    ValueError
        If ``user_id`` does not appear in ``ratings_df_idx['idx']``.
    """
    matches = ratings_df_idx.loc[ratings_df_idx['idx'] == user_id]
    if matches.empty:
        raise ValueError(f"user {user_id} has no row in the index mapping table")
    # Scalar access instead of int(Series): converting a one-element Series
    # with int() is deprecated in recent pandas releases.
    row_label = int(matches.iloc[0, 3])

    # Row of the original behavior data that belongs to this user.
    user_data = ratings_df.loc[row_label]
    # Column positions of the items the user has not interacted with.
    unrated_positions = [int(label) for label in user_data[user_data <= 0].index.tolist()]

    # Predicted scores of this user, restricted to the unrated items.
    user_predictions = sgd_video_preds.loc[row_label]
    candidate_scores = user_predictions.iloc[unrated_positions]

    # Highest predicted score first, then keep the top N.
    ranked = candidate_scores.sort_values(ascending=False)
    top_labels = ranked.index.tolist()[:num_recommendations]

    # Column labels read from a CSV header are strings; .iloc needs integers,
    # so coerce them the same way as the unrated positions above.
    top_positions = [int(label) for label in top_labels]

    # Map item positions to item ids.
    return item_df.iloc[top_positions]['idx'].tolist()

def item_user_latent_cos(user_id, original_item, item_list, dict):
    """Find the most similar user who has behavior data for ``original_item``.

    The first item type in ``item_list`` for which ``user_id`` has behavior
    data is selected. Cosine similarity between that user's latent vector and
    every other user's latent vector of the same item type is computed, and
    candidates are visited from most to least similar until one is found in
    the index table of ``original_item``.

    Parameters
    ----------
    user_id : int
        User to find a stand-in for.
    original_item : str
        Item type the recommendation is actually wanted for.
    item_list : list of str
        Item types to search, in priority order, for existing behavior.
    dict : mapping
        Item type -> sequence in which position 3 is the index mapping table
        (column ``'idx'`` = user id, column at position 0 = id returned for a
        row, column at position 3 = row label) and position 4 is the user
        latent table indexed by those row labels. The parameter name shadows
        the builtin, so ``dict(...)`` cannot be called inside this function.

    Returns
    -------
    int
        Id of the most similar user that has behavior data for
        ``original_item``.

    Raises
    ------
    ValueError
        If the user has no behavior data in any item type of ``item_list``,
        or if no other user has behavior data for ``original_item``.
    """
    print('유사도 선택시 item_list:', item_list)

    # First item type, in list order, for which this user has behavior data.
    item = next(
        (name for name in item_list if user_id in dict[name][3]['idx'].values),
        None,
    )
    if item is None:
        # TODO: add a recommender for users with no behavior in any item type.
        print(f'user {user_id}는 모든 아이템에 대한 행동이 없음')
        raise ValueError(
            f"user {user_id} has no behavior data for any of {list(item_list)}"
        )
    print(f'user {user_id}는 {item}에 대한 행동이 존재함')

    print(f'2. {item} user latent에서 user {user_id}과 유사한 user 찾기')
    idx_table = dict[item][3]
    latent = dict[item][4]
    # Scalar access instead of the deprecated int(one-element Series).
    user_row = int(idx_table.loc[idx_table['idx'] == user_id].iloc[0, 3])

    # Only this user's similarity row is needed, so compare one row against
    # all rows instead of materialising the full N x N matrix.
    scores = cosine_similarity(latent.loc[[user_row]], latent)[0]
    user_similarities = pd.Series(scores, index=latent.index)
    # Exclude the user itself, then order candidates by similarity.
    ranked = user_similarities.drop(labels=[user_row]).sort_values(ascending=False)

    # Row label -> user id lookup built once instead of filtering the frame on
    # every iteration (dict comprehension because `dict` is shadowed here).
    row_to_user = {
        row: uid
        for row, uid in zip(idx_table.iloc[:, 3].tolist(), idx_table.iloc[:, 0].tolist())
    }
    original_users = set(dict[original_item][3]['idx'].tolist())

    # Walk candidates from most to least similar until one has behavior data
    # for the item type we actually want to recommend.
    for similar_row, similarity in ranked.items():
        new_user_id = int(row_to_user[similar_row])
        if new_user_id in original_users:
            print('3. update new_user_id:', new_user_id)
            print(f'4. user {user_id}과 가장 유사한 user : {new_user_id}, cos : {similarity}')
            return new_user_id
        print(f'{new_user_id}가 {original_item}에 대한 행동이 없음')

    raise ValueError(
        f"no user similar to {user_id} has behavior data for {original_item}"
    )


def recommendation_system(user_id, item, item_list, dict, num_recommendations):
    """Recommend ``item``-type items for a user and return them.

    If the user has behavior data for ``item``, recommendations come directly
    from that user's predictions. Otherwise the most similar user (found via
    another item type's latent factors) who does have behavior data for
    ``item`` is used as a stand-in.

    Parameters
    ----------
    user_id : int
        User to recommend for.
    item : str
        Item type to recommend; a key of ``dict``.
    item_list : list of str
        All item type names. The list is not modified.
    dict : mapping
        Item type -> sequence whose positions 0-3 are the prediction matrix,
        item metadata, behavior matrix and index mapping table passed to
        ``recommend_step1``. The whole mapping is forwarded to
        ``item_user_latent_cos``.
    num_recommendations : int
        Number of items to recommend.

    Returns
    -------
    list
        Recommended item ids as returned by ``recommend_step1``.
    """
    entry = dict[item]
    preds, item_df, ratings_df, ratings_df_idx = entry[0], entry[1], entry[2], entry[3]

    if user_id in ratings_df_idx['idx'].values:
        recomm_list = recommend_step1(item, preds, user_id, item_df, ratings_df, ratings_df_idx, num_recommendations)
        # Report the requested count, not a hard-coded 10.
        print(f"1. user {user_id}에게 추천해줄 {num_recommendations}개 {item} idx : {recomm_list}")
        return recomm_list

    print(f'1. user {user_id}는 {item}에 대한 행동내역이 없음')
    # Search the other item types for behavior. Build a new list rather than
    # calling item_list.remove(item), which would mutate the caller's list and
    # fail on a second call.
    other_items = [name for name in item_list if name != item]
    new_user_id = item_user_latent_cos(user_id, item, other_items, dict)
    print(f'5. user {new_user_id}에게 {item} recommend_step2 시작')
    recom_list2 = recommend_step1(item, preds, new_user_id, item_df, ratings_df, ratings_df_idx, num_recommendations)
    print(f"6. user {new_user_id}에게 추천해줄 {num_recommendations}개 {item} idx : {recom_list2}")
    return recom_list2

In [10]:
# Driver cell: recommend `item` for `user_id`. A user id that is not in the
# user table is first replaced by an existing user with a matching interest profile.
user_id = 37000
num_recommendations = 10
# Per item type: [SGD predictions, item metadata, behavior matrix,
# index mapping table, user latent factors].
# Named `item_tables` rather than `dict` so the builtin `dict` is not shadowed
# for the rest of the notebook.
item_tables = { 'place' : [del_data_sgd_place_preds, place, drop_user_place, drop_user_place_idx, place_user_latent],
                'video' : [del_data_sgd_video_preds, video, drop_user_video, drop_user_video_idx, video_user_latent],
                'product' : [del_data_sgd_product_preds, product, drop_user_product, drop_user_product_idx, product_user_latent]}
item = 'place'
item_list = list(item_tables.keys())

# Profile of the new user, passed to find_similar_index.
new_user_item = [1,0,0]
new_user_outdoor = [0,0,0,0,0,0,0,0,0,1]

if user_id not in user['idx'].values:
    exact_match_indices = find_similar_index(final_user_interest, new_user_item, new_user_outdoor)
    # A single match may come back as a bare integer; treat it as a one-element list.
    if isinstance(exact_match_indices, (int, np.integer)):
        exact_match_indices = [exact_match_indices]

    # First column of final_user_interest holds the id used as user_id below.
    # Scalar label lookup replaces the deprecated int(one-element Series).
    first_column = final_user_interest.iloc[:, 0]
    final_match_idx = [int(first_column.loc[i]) for i in exact_match_indices]
    if not final_match_idx:
        raise ValueError("find_similar_index returned no matching user")
    # The original cell used the second match of a multi-match result; keep
    # that choice, but fall back to the only match instead of raising IndexError.
    user_id = final_match_idx[min(1, len(final_match_idx) - 1)]

# Pass a copy of item_list so the call cannot alter the notebook-level list.
final = recommendation_system(user_id, item, list(item_list), item_tables, num_recommendations)

새로운 데이터와 정확히 일치하는 인덱스는 [117, 8724, 10893, 12581, 13681, 16568, 17538, 23082, 23386, 26893, 27744, 27985]입니다.
1. user 9636에게 추천해줄 10개 place idx : [668, 4672, 2292, 1879, 17, 4429, 1198, 301, 1502, 40]
