In [33]:
import pandas as pd
import numpy as np
from scipy.stats import mode
from sklearn.metrics import f1_score
from sklearn.metrics import jaccard_score
from sklearn.model_selection import train_test_split

# 각 행 간의 Jaccard 유사도 계산
def calculate_jaccard_similarity(matrix):
    num_users = matrix.shape[0]
    similarities = np.zeros((num_users, num_users))

    for i in range(num_users):
        for j in range(i, num_users):
            similarity = jaccard_score(matrix.iloc[i], matrix.iloc[j])
            similarities[i, j] = similarity
            similarities[j, i] = similarity

    return similarities


# 정확도를 계산하는 함수 
def f1(y_true, y_pred):
    return f1_score(y_true, y_pred)

# 모델별 정확도를 계산하는 함수 
def score(model):
    id_pairs = zip(x_test['user_id'], x_test['place_id'])
    
    # 예측 값
    y_pred = np.array([model(user, place) for (user, place) in id_pairs]).astype(int)
    #print("y_pred", y_pred)
    
    # 실제 값
    y_true = np.array(x_test['heart'])
    #print("y_true", y_true)
    return f1(y_true, y_pred)



likes = pd.read_csv(f"C:/Users/김가연/Desktop/23-2학기/캡스톤2/data/hearts_data.csv", encoding='UTF-8-SIG')
likes = likes[["place_id", "user_id"]]
likes.loc[:, "heart"] = 1
likes = likes.drop_duplicates(subset=['place_id', 'user_id'], keep='first')


x = likes.copy()
y = likes['user_id']


x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, stratify=y)

#  train 데이터로 Full matrix 구하기 
likes_matrix = x_train.pivot(index='user_id', columns='place_id', values='heart')
likes_matrix


place_id,1,4,5,6,7,8,9,13,15,19,...,1862,1863,1865,1869,1874,1877,1891,1895,1896,1897
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
6,,,,,,,,,,,...,,,,,,,,,,
7,,,,,,,,,,,...,,,,,,,,,,
8,,,1.0,,,,,,,1.0,...,1.0,,,,,,,,,
9,,,,,,,,,,,...,,,,,,,,,,
10,,,,,,,,1.0,,,...,,,,,,,,,,
11,,,,,,,,,,,...,,,,,,,,,,
12,,,,,,,,,,,...,,,,,,,,,,
13,,,,,,,,,,,...,1.0,,,,,,,1.0,,


In [34]:

# train set 사용자들의 Cosine similarities 계산
matrix_dummy = likes_matrix.copy().fillna(0)


# 자카드 유사도
user_similarity = calculate_jaccard_similarity(matrix_dummy)
user_similarity = pd.DataFrame(user_similarity, index=likes_matrix.index, columns=likes_matrix.index)

def CF_UBCF(user_id, place_id):
    if place_id in likes_matrix:

        sim_scores = user_similarity[user_id].copy()   
        place_likes = likes_matrix[place_id].copy()     
        none_likes_idx = place_likes[place_likes.isnull()].index
        place_likes = place_likes.drop(none_likes_idx)
        sim_scores = sim_scores.drop(none_likes_idx)

        if sim_scores.sum() != 0.0:
            predicted_likes = np.dot(sim_scores, place_likes) / sim_scores.sum()
        else:
            predicted_likes = mode(place_likes, keepdims=True).mode[0]
            #predicted_likes = 0.0
        
    else:
        predicted_likes = 0.0 # 특정 장소에 대한 좋아요 없는 경우 예측 불가
    return predicted_likes

# 정확도 계산
print(score(CF_UBCF))

0.7796610169491525


In [35]:
##### (4) 주어진 사용자에 대해 추천받기 
# 전체 데이터로 full matrix와 자가드 유사도 구하기 
likes_matrix = likes.pivot_table(index='user_id', columns='place_id', values='heart')
matrix_dummy = likes_matrix.copy().fillna(0)
user_similarity = calculate_jaccard_similarity(matrix_dummy)
user_similarity = pd.DataFrame(user_similarity, index=likes_matrix.index, columns=likes_matrix.index)
user_similarity

user_id,1,4,6,7,8,9,10,11,12,13,...,56,57,58,59,61,62,63,64,65,66
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,0.04,0.0625,0.0,0.076923,0.071429,0.136364,0.266667,0.470588,0.4,...,0.021739,0.0,0.26087,0.096774,0.125,0.105263,0.0,0.136364,0.181818,0.12
4,0.04,1.0,0.0,0.0,0.083333,0.037037,0.045455,0.0625,0.095238,0.03125,...,0.046512,0.04,0.0,0.032258,0.041667,0.055556,0.0,0.095238,0.043478,0.04
6,0.0625,0.0,1.0,0.0,0.0,0.0,0.0,0.142857,0.076923,0.043478,...,0.028571,0.0,0.058824,0.045455,0.0,0.0,0.0,0.0,0.0,0.0
7,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.047619,0.032258,...,0.0,0.0,0.04,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0.076923,0.083333,0.0,0.0,1.0,0.0,0.041667,0.117647,0.086957,0.060606,...,0.021739,0.037037,0.0,0.0,0.038462,0.05,0.0,0.086957,0.0,0.037037
9,0.071429,0.037037,0.0,0.0,0.0,1.0,0.0,0.0,0.038462,0.057143,...,0.0,0.0,0.107143,0.0,0.0,0.0,0.0,0.08,0.076923,0.034483
10,0.136364,0.045455,0.0,0.0,0.041667,0.0,1.0,0.230769,0.157895,0.142857,...,0.0,0.041667,0.04,0.107143,0.142857,0.125,0.0,0.157895,0.15,0.136364
11,0.266667,0.0625,0.142857,0.0,0.117647,0.0,0.230769,1.0,0.333333,0.181818,...,0.027027,0.0,0.111111,0.041667,0.125,0.090909,0.0,0.142857,0.0625,0.117647
12,0.470588,0.095238,0.076923,0.047619,0.086957,0.038462,0.157895,0.333333,1.0,0.391304,...,0.023256,0.0,0.238095,0.068966,0.090909,0.2,0.0,0.222222,0.095238,0.136364
13,0.4,0.03125,0.043478,0.032258,0.060606,0.057143,0.142857,0.181818,0.391304,1.0,...,0.018868,0.029412,0.2,0.138889,0.133333,0.076923,0.0,0.142857,0.178571,0.129032


In [44]:
def CF_UBCF_recommender(user, n_items):
    # 현재 사용자가 찜한 장소
    liked_index = likes_matrix.loc[user][likes_matrix.loc[user] > 0].index
    
    # 현재 사용자와 유사한 사용자들의 찜한 장소의 평균 예상 찜 여부 계산
    predictions = []
    for place_id in likes_matrix.columns:
        if place_id not in liked_index:
            prediction = CF_UBCF(user, place_id)
            predictions.append((place_id, prediction))
    
    # 예상 찜 여부를 기준으로 내림차순 정렬
    predictions.sort(key=lambda x: x[1], reverse=True)
    print("predictions", predictions)
    # 상위 n_items개의 장소를 추천
    recommended_items = [place_id for place_id, _ in predictions[:n_items]]
    
    return recommended_items

# 예시: 사용자 52에게 UBCF 알고리즘을 사용하여 최대 10개의 장소를 추천
recommended_places_ubcf = CF_UBCF_recommender(user=52, n_items=10)
print(recommended_places_ubcf)


predictions [(210, 1.0000000000000002), (1514, 1.0000000000000002), (1722, 1.0000000000000002), (1821, 1.0000000000000002), (1861, 1.0000000000000002), (1862, 1.0000000000000002), (1, 1.0), (4, 1.0), (5, 1.0), (6, 1.0), (7, 1.0), (8, 1.0), (9, 1.0), (12, 1.0), (13, 1.0), (15, 1.0), (19, 1.0), (21, 1.0), (25, 1.0), (35, 1.0), (38, 1.0), (39, 1.0), (40, 1.0), (41, 1.0), (55, 1.0), (68, 1.0), (105, 1.0), (125, 1.0), (133, 1.0), (182, 1.0), (205, 1.0), (207, 1.0), (208, 1.0), (209, 1.0), (211, 1.0), (212, 1.0), (213, 1.0), (214, 1.0), (215, 1.0), (216, 1.0), (222, 1.0), (223, 1.0), (224, 1.0), (238, 1.0), (240, 1.0), (241, 1.0), (245, 1.0), (248, 1.0), (251, 1.0), (252, 1.0), (259, 1.0), (262, 1.0), (268, 1.0), (276, 1.0), (280, 1.0), (307, 1.0), (319, 1.0), (321, 1.0), (324, 1.0), (325, 1.0), (332, 1.0), (343, 1.0), (348, 1.0), (351, 1.0), (367, 1.0), (368, 1.0), (370, 1.0), (375, 1.0), (376, 1.0), (377, 1.0), (379, 1.0), (380, 1.0), (381, 1.0), (388, 1.0), (414, 1.0), (419, 1.0), (420, 1

In [41]:
def recommender(user, n_items):
    #현재 사용자의 모든 아이템에 대한 예상 평점 계산
    predictions = []
    liked_index = likes_matrix.loc[user][likes_matrix.loc[user] > 0].index    # 이미 찜하기 누른 장소 인덱스 가져옴
    #print("liked_index", liked_index)
    items = likes_matrix.loc[user].drop(liked_index) # 찜하기 누른 장소 제외하고 추천하기 위해
    #print(items)
    
    for item in items.index:
        predictions.append(CF_UBCF(user, item))                   # 예상 1, 0계산
    recommendations = pd.Series(data=predictions, index=items.index, dtype=float)
    
    #recommendations = recommendations.sort_values(ascending=False)[:n_items]    # 예상 0, 1중에 랜덤하게 고르기?
    recommendations = recommendations[recommendations==1.0]
    print(recommendations)
    return recommendations



# id가 52번인 사용자에게 추천할 장소
# 최대 장소 10개 가져오도록
recommender(user=52, n_items=10)

place_id
1       1.0
4       1.0
5       1.0
6       1.0
7       1.0
       ... 
1877    1.0
1891    1.0
1895    1.0
1896    1.0
1897    1.0
Length: 300, dtype: float64


place_id
1       1.0
4       1.0
5       1.0
6       1.0
7       1.0
       ... 
1877    1.0
1891    1.0
1895    1.0
1896    1.0
1897    1.0
Length: 300, dtype: float64