In [1]:
import pandas as pd
import numpy as np
from scipy.stats import mode
from sklearn.metrics import f1_score
from sklearn.metrics import jaccard_score
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
# Jaccard similarity between users, used by the user-based CF (UBCF) model
def calculate_jaccard_similarity(matrix):
    """Return the pairwise Jaccard similarity between the rows of ``matrix``.

    Each row is treated as a set: an entry greater than 0 means the column
    belongs to the row's set, anything else (0 or NaN) means it does not.
    The similarity of rows ``i`` and ``j`` is ``|i & j| / |i | j|``, which is
    the value sklearn's binary ``jaccard_score`` gives for 0/1 rows.

    Parameters
    ----------
    matrix : pandas.DataFrame or 2-D array-like, shape (n_rows, n_items)
        Binary membership matrix (one row per user).

    Returns
    -------
    numpy.ndarray, shape (n_rows, n_rows)
        Symmetric similarity matrix. A pair whose union is empty (neither row
        has a positive entry) gets 0.0.
    """
    liked = (np.asarray(matrix, dtype=float) > 0).astype(np.int64)

    # A single matrix product yields every pairwise intersection size at once,
    # replacing one library call per (i, j) pair in a Python loop.
    intersection = liked @ liked.T
    set_sizes = liked.sum(axis=1)
    union = set_sizes[:, None] + set_sizes[None, :] - intersection

    # Divide only where the union is non-empty; the remaining cells stay 0.0.
    similarities = np.zeros(intersection.shape, dtype=float)
    np.divide(intersection, union, out=similarities, where=union > 0)
    return similarities

In [3]:
# Load the likes ("hearts") data set.
# NOTE(review): the path is an absolute, machine-specific location; consider
# moving it to a configurable data directory.
likes = (
    pd.read_csv("C:/Users/김가연/Desktop/23-2학기/캡스톤2/data/hearts_data.csv", encoding='UTF-8-SIG')
    [["place_id", "user_id"]]
    # The file only contains hearted places, so every row is a positive (1).
    # assign() returns a new frame instead of writing into the column slice,
    # which avoids pandas' SettingWithCopyWarning.
    .assign(heart=1)
    # Keep one row per (place_id, user_id) pair to prevent errors downstream
    # (presumably the pivot on these two keys, which rejects duplicates).
    .drop_duplicates(subset=['place_id', 'user_id'], keep='first')
)
likes

Unnamed: 0,place_id,user_id,heart
0,205,1,1
1,211,1,1
2,213,1,1
3,227,1,1
4,234,1,1
...,...,...,...
859,1091,66,1
860,1117,66,1
861,1224,66,1
862,1180,66,1


In [4]:
# Hold out 25% of the like events for evaluation, stratified on user_id.
SEED = 42  # fixed so the split, and every score derived from it, is reproducible

x = likes.copy()
y = likes['user_id']

x_train, x_test, y_train, y_test = train_test_split(
    x, y, test_size=0.25, stratify=y, random_state=SEED
)

In [5]:
# F1 score helper
def f1(y_true, y_pred):
    """Return sklearn's ``f1_score`` of ``y_pred`` measured against ``y_true``."""
    f1_value = f1_score(y_true, y_pred)
    return f1_value

# Evaluate a model on the held-out test pairs
def score(model, threshold=0.5):
    """Return the F1 score of ``model`` over every (user, place) pair in ``x_test``.

    Parameters
    ----------
    model : callable
        Called as ``model(user_id, place_id)``; returns a predicted like score.
    threshold : float, default 0.5
        A prediction greater than or equal to this value counts as a like (1),
        anything below it as 0.

    Returns
    -------
    The result of ``f1(y_true, y_pred)``, where ``y_true`` is the ``heart``
    column of ``x_test``.
    """
    raw_predictions = np.array(
        [model(user, place) for user, place in zip(x_test['user_id'], x_test['place_id'])],
        dtype=float,
    )

    # Threshold rather than cast with astype(int): an integer cast truncates
    # toward zero, so a weighted average that lands on 0.9999999999999999
    # through floating-point rounding would be scored as "not liked".
    y_pred = (raw_predictions >= threshold).astype(int)

    # Ground truth.
    # NOTE(review): x_test is a slice of `likes`, whose heart column is 1 on
    # every row, so y_true holds no negatives and this F1 only measures how
    # many held-out likes are recovered, not false positives.
    y_true = np.array(x_test['heart'])
    return f1(y_true, y_pred)


# Build the full user x place matrix from the training split only
# (rows: user_id, columns: place_id, values: heart; unseen pairs are NaN).
likes_matrix = x_train.pivot(index='user_id', columns='place_id', values='heart')

# Zero-filled variant of the same matrix.
matrix_dummy = likes_matrix.fillna(0)
matrix_dummy

place_id,1,4,5,6,7,8,9,12,13,15,...,1863,1864,1865,1869,1874,1877,1891,1895,1896,1897
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


In [6]:
##### =============================== UBCF (Jaccard similarity) ======================
# Pairwise Jaccard similarity between users, labelled by user_id on both axes
user_similarity = pd.DataFrame(
    calculate_jaccard_similarity(matrix_dummy),
    index=likes_matrix.index,
    columns=likes_matrix.index,
)
user_similarity

user_id,1,4,6,7,8,9,10,11,12,13,...,56,57,58,59,61,62,63,64,65,66
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.055556,0.0,0.0,0.052632,0.047619,0.125,0.076923,0.5,0.3,...,0.0,0.0,0.235294,0.086957,0.111111,0.071429,0.0,0.058824,0.117647,0.111111
4,0.055556,1.0,0.0,0.0,0.055556,0.0,0.0625,0.0,0.0625,0.0,...,0.0625,0.0,0.0,0.043478,0.055556,0.076923,0.0,0.0,0.0,0.055556
6,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.2,0.111111,0.058824,...,0.038462,0.0,0.083333,0.0625,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.066667,0.0,...,0.0,0.0,0.055556,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.052632,0.055556,0.0,0.0,1.0,0.0,0.058824,0.076923,0.058824,0.0,...,0.0,0.052632,0.0,0.0,0.052632,0.0,0.0,0.0,0.0,0.052632
9,0.047619,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.037037,...,0.0,0.0,0.045455,0.0,0.0,0.0,0.0,0.052632,0.05,0.0
10,0.125,0.0625,0.0,0.0,0.058824,0.0,1.0,0.090909,0.142857,0.043478,...,0.0,0.0,0.0,0.095238,0.125,0.181818,0.0,0.066667,0.0625,0.125
11,0.076923,0.0,0.2,0.0,0.076923,0.0,0.090909,1.0,0.2,0.111111,...,0.035714,0.0,0.153846,0.055556,0.076923,0.125,0.0,0.090909,0.0,0.0
12,0.5,0.0625,0.111111,0.066667,0.058824,0.0,0.142857,0.2,1.0,0.2,...,0.03125,0.0,0.1875,0.095238,0.058824,0.083333,0.0,0.0,0.0,0.125
13,0.3,0.0,0.058824,0.0,0.0,0.037037,0.043478,0.111111,0.2,1.0,...,0.025,0.0,0.227273,0.148148,0.130435,0.0,0.0,0.142857,0.136364,0.083333


In [89]:
def CF_UBCF(user_id, place_id):
    """Predict ``user_id``'s like score for ``place_id`` with user-based CF.

    The score is the average of the recorded values in
    ``likes_matrix[place_id]`` (users with no value for the place are
    skipped), weighted by each of those users' ``user_similarity`` to
    ``user_id``.

    Returns
    -------
    float
        The weighted average, or 0.0 when the place is not a column of
        ``likes_matrix``, the user is not a column of ``user_similarity``,
        or the similarity weights sum to zero.
    """
    # Cold start: a place or user missing from the training matrices cannot be
    # scored, so it is excluded from recommendation (previously an unknown
    # user raised KeyError while an unknown place returned 0.0).
    if place_id not in likes_matrix or user_id not in user_similarity:
        return 0.0

    # Keep only the users that have a recorded value for this place, and the
    # matching similarity weights.
    place_likes = likes_matrix[place_id].dropna()
    sim_scores = user_similarity.loc[place_likes.index, user_id]

    weight_total = sim_scores.sum()
    if weight_total == 0.0:
        return 0.0

    # Similarity-weighted average.
    # NOTE(review): likes_matrix is pivoted from rows whose heart value is 1,
    # so every value that survives dropna() is 1 and this average comes out
    # as ~1.0 whenever any remaining user has non-zero similarity. Confirm
    # whether a graded score (e.g. averaging over the zero-filled matrix) was
    # intended.
    return np.dot(sim_scores, place_likes) / weight_total

    # Report the UBCF model's F1 score on the held-out test pairs
print("UBCF F1 score :", score(CF_UBCF))

UBCF F1 score : 0.7796610169491525


In [90]:
def UBCF_recommender(user, n_items, threshold=0.4):
    """Recommend up to ``n_items`` places for ``user`` using ``CF_UBCF``.

    Parameters
    ----------
    user : label in ``likes_matrix.index``
        The user to recommend for.
    n_items : int
        Maximum number of places to return.
    threshold : float, default 0.4
        Only places whose predicted score is strictly greater than this value
        are kept as candidates.

    Returns
    -------
    list
        place_ids of the best-scoring candidates, highest prediction first.
        The full candidate list is also printed.
    """
    # Places the user has already liked (looked up once).
    user_likes = likes_matrix.loc[user]
    liked_index = user_likes[user_likes > 0].index

    # Score every place the user has not liked yet and keep those above the
    # threshold.
    predictions = []
    for place_id in likes_matrix.columns:
        if place_id in liked_index:
            continue
        prediction = CF_UBCF(user, place_id)
        if prediction > threshold:
            predictions.append((place_id, prediction))

    # Highest predicted score first (stable sort keeps column order on ties).
    predictions.sort(key=lambda pair: pair[1], reverse=True)
    print("UBCF에서 추천하는 장소", predictions)

    # Return the top n_items place ids.
    return [place_id for place_id, _ in predictions[:n_items]]


In [91]:
###### ============= IBCF (cosine similarity)
# Transpose so that each row is a place and each column is a user
likes_matrix_t = likes_matrix.T
matrix_dummy_t = likes_matrix_t.fillna(0)

# Pairwise cosine similarity between places, labelled by place_id on both axes
item_similarity = pd.DataFrame(
    cosine_similarity(matrix_dummy_t, matrix_dummy_t),
    index=likes_matrix_t.index,
    columns=likes_matrix_t.index,
)
item_similarity

place_id,1,4,5,6,7,9,12,13,15,19,...,1862,1863,1864,1865,1869,1874,1877,1895,1896,1897
place_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.57735,0.000000,...,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,1.000000,0.707107,0.707107,0.0,0.0,0.0,0.0,0.00000,0.000000,...,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.707107,1.000000,0.500000,0.0,0.0,0.0,0.0,0.00000,0.707107,...,0.288675,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,0.0,0.707107,0.500000,1.000000,0.0,0.0,0.0,0.0,0.00000,0.000000,...,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.000000,0.000000,0.000000,1.0,0.0,0.0,0.0,0.00000,0.000000,...,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1874,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.00000,0.000000,...,0.000000,0.000000,0.000000,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1877,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.00000,0.000000,...,0.000000,0.000000,0.000000,0.0,0.0,0.0,1.0,0.0,0.0,0.0
1895,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.00000,0.000000,...,0.577350,0.707107,0.707107,0.5,0.5,0.0,0.0,1.0,0.0,0.5
1896,0.0,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.00000,0.000000,...,0.000000,0.000000,0.000000,0.0,0.0,0.0,0.0,0.0,1.0,0.0


In [92]:
def CF_IBCF(user_id, place_id):
    """Predict ``user_id``'s like score for ``place_id`` with item-based CF.

    The score is the average of the user's recorded values in
    ``likes_matrix_t[user_id]`` (places with no value are skipped), weighted
    by each of those places' ``item_similarity`` to ``place_id``.

    Returns
    -------
    float
        The weighted average, or 0.0 when the place is not a column of
        ``item_similarity``, the user is not a column of ``likes_matrix_t``,
        or the similarity weights sum to zero.
    """
    # Cold start: a place or user missing from the training matrices cannot be
    # scored (previously an unknown user raised KeyError while an unknown
    # place returned 0.0).
    if place_id not in item_similarity or user_id not in likes_matrix_t:
        return 0.0

    # Keep only the places this user has a recorded value for, and the
    # similarity of each of them to the target place.
    user_likes = likes_matrix_t[user_id].dropna()
    sim_scores = item_similarity.loc[user_likes.index, place_id]

    weight_total = sim_scores.sum()
    if weight_total == 0.0:
        return 0.0

    # Similarity-weighted average of the user's recorded values.
    # NOTE(review): likes_matrix_t is the transpose of a matrix pivoted from
    # rows whose heart value is 1, so every value that survives dropna() is 1
    # and this average comes out as ~1.0 whenever any remaining place has
    # non-zero similarity. Confirm whether a graded score was intended.
    return np.dot(sim_scores, user_likes) / weight_total

# Report the IBCF model's F1 score on the held-out test pairs
print("IBCF F1 score :", score(CF_IBCF))

IBCF F1 score : 0.7796610169491525


In [93]:
def IBCF_recommender(user, n_items, threshold=0.4):
    """Recommend up to ``n_items`` places for ``user`` using ``CF_IBCF``.

    Parameters
    ----------
    user : label in ``likes_matrix.index``
        The user to recommend for.
    n_items : int
        Maximum number of places to return.
    threshold : float, default 0.4
        Only places whose predicted score is strictly greater than this value
        are kept as candidates.

    Returns
    -------
    list
        place_ids of the best-scoring candidates, highest prediction first.
        The full candidate list is also printed.
    """
    # Places the user has already liked (looked up once).
    user_likes = likes_matrix.loc[user]
    liked_index = user_likes[user_likes > 0].index

    # Score every place the user has not liked yet and keep those above the
    # threshold.
    predictions = []
    for place_id in likes_matrix.columns:
        if place_id in liked_index:
            continue
        prediction = CF_IBCF(user, place_id)
        if prediction > threshold:
            predictions.append((place_id, prediction))

    # Highest predicted score first (stable sort keeps column order on ties).
    predictions.sort(key=lambda pair: pair[1], reverse=True)
    print("IBCF에서 추천하는 장소", predictions)

    # Return the top n_items place ids.
    return [place_id for place_id, _ in predictions[:n_items]]

In [106]:
user = 30
# Build the header from `user` so it cannot go stale when the id is changed.
print(f"==============유저 id {user}에게 추천할 장소==============")

UBCF_list = UBCF_recommender(user, n_items=50)
IBCF_list = IBCF_recommender(user, n_items=50)

# Keep the places both models recommend, in UBCF ranking order
# (a plain set intersection would discard the ranking).
IBCF_set = set(IBCF_list)
intersection = [place_id for place_id in UBCF_list if place_id in IBCF_set]

print("\n공통으로 추천하는 장소 갯수 :", len(intersection))
print("\n추천할 장소의 id :", intersection)


UBCF에서 추천하는 장소 [(230, 1.0000000000000002), (962, 1.0000000000000002), (1862, 1.0000000000000002), (4, 1.0), (5, 1.0), (6, 1.0), (12, 1.0), (13, 1.0), (15, 1.0), (19, 1.0), (25, 1.0), (35, 1.0), (38, 1.0), (55, 1.0), (105, 1.0), (125, 1.0), (133, 1.0), (182, 1.0), (205, 1.0), (209, 1.0), (210, 1.0), (213, 1.0), (214, 1.0), (215, 1.0), (216, 1.0), (227, 1.0), (234, 1.0), (238, 1.0), (240, 1.0), (241, 1.0), (251, 1.0), (259, 1.0), (262, 1.0), (276, 1.0), (280, 1.0), (307, 1.0), (324, 1.0), (325, 1.0), (332, 1.0), (343, 1.0), (351, 1.0), (368, 1.0), (375, 1.0), (376, 1.0), (381, 1.0), (414, 1.0), (476, 1.0), (478, 1.0), (631, 1.0), (632, 1.0), (634, 1.0), (635, 1.0), (637, 1.0), (639, 1.0), (645, 1.0), (653, 1.0), (657, 1.0), (658, 1.0), (662, 1.0), (668, 1.0), (671, 1.0), (672, 1.0), (685, 1.0), (687, 1.0), (692, 1.0), (707, 1.0), (713, 1.0), (714, 1.0), (718, 1.0), (734, 1.0), (743, 1.0), (744, 1.0), (754, 1.0), (755, 1.0), (763, 1.0), (814, 1.0), (830, 1.0), (868, 1.0), (869, 1.0), (876