In [1]:
# Created or modified on May 2022
# Author: 임일
# 협업필터링(CF) 추천 - KNN

import numpy as np
import pandas as pd

# 데이터 읽어 오기 
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv('C:/RecoSys/Data/u.user', sep='|', names=u_cols, encoding='latin-1')
i_cols = ['movie_id', 'title', 'release date', 'video release date', 'IMDB URL', 'unknown', 
          'Action', 'Adventure', 'Animation', 'Children\'s', 'Comedy', 'Crime', 'Documentary', 
          'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 
          'Thriller', 'War', 'Western']
movies = pd.read_csv('C:/RecoSys/Data/u.item', sep='|', names=i_cols, encoding='latin-1')
r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv('C:/RecoSys/Data/u.data', sep='\t', names=r_cols, encoding='latin-1')
ratings

Unnamed: 0,user_id,movie_id,rating,timestamp
0,196,242,3,881250949
1,186,302,3,891717742
2,22,377,1,878887116
3,244,51,2,880606923
4,166,346,1,886397596
...,...,...,...,...
99995,880,476,3,880175444
99996,716,204,5,879795543
99997,276,1090,1,874795795
99998,13,225,2,882399156


In [2]:
# timestamp 제거 
ratings = ratings.drop('timestamp', axis=1)
ratings

Unnamed: 0,user_id,movie_id,rating
0,196,242,3
1,186,302,3
2,22,377,1
3,244,51,2
4,166,346,1
...,...,...,...
99995,880,476,3
99996,716,204,5
99997,276,1090,1
99998,13,225,2


In [3]:
# movie ID와 title 빼고 다른 데이터 제거
movies = movies[['movie_id', 'title']]
movies = movies.set_index('movie_id')
movies

Unnamed: 0_level_0,title
movie_id,Unnamed: 1_level_1
1,Toy Story (1995)
2,GoldenEye (1995)
3,Four Rooms (1995)
4,Get Shorty (1995)
5,Copycat (1995)
...,...
1678,Mat' i syn (1997)
1679,B. Monkey (1998)
1680,Sliding Doors (1998)
1681,You So Crazy (1994)


In [4]:
# train, test 데이터 분리
from sklearn.model_selection import train_test_split
x = ratings.copy()
x

Unnamed: 0,user_id,movie_id,rating
0,196,242,3
1,186,302,3
2,22,377,1
3,244,51,2
4,166,346,1
...,...,...,...
99995,880,476,3
99996,716,204,5
99997,276,1090,1
99998,13,225,2


In [5]:
y = ratings['user_id']
y

0        196
1        186
2         22
3        244
4        166
        ... 
99995    880
99996    716
99997    276
99998     13
99999     12
Name: user_id, Length: 100000, dtype: int64

In [6]:
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, stratify=y)

# 정확도(RMSE)를 계산하는 함수 
def RMSE(y_true, y_pred):
    return np.sqrt(np.mean((np.array(y_true) - np.array(y_pred))**2))

# 모델별 RMSE를 계산하는 함수 
def score(model, neighbor_size=0):
    id_pairs = zip(x_test['user_id'], x_test['movie_id'])
    y_pred = np.array([model(user, movie, neighbor_size) for (user, movie) in id_pairs])
    print(y_pred)
    y_true = np.array(x_test['rating'])
    print(y_true)
    return RMSE(y_true, y_pred)

In [7]:
#  train 데이터로 Full matrix 구하기 
rating_matrix = x_train.pivot(index='user_id', columns='movie_id', values='rating')
rating_matrix

movie_id,1,2,3,4,5,6,7,8,9,10,...,1670,1671,1672,1673,1675,1677,1678,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,3.0,,3.0,3.0,5.0,4.0,1.0,,3.0,...,,,,,,,,,,
2,4.0,,,,,,,,,2.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,3.0,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,,,,,,,,,,,...,,,,,,,,,,
940,,,,,,,4.0,,,,...,,,,,,,,,,
941,,,,,,,4.0,,,,...,,,,,,,,,,
942,,,,,,,,,,,...,,,,,,,,,,


In [8]:
# train set 사용자들의 Cosine similarities 계산
from sklearn.metrics.pairwise import cosine_similarity
matrix_dummy = rating_matrix.copy().fillna(0)
user_similarity = cosine_similarity(matrix_dummy, matrix_dummy)
user_similarity = pd.DataFrame(user_similarity, index=rating_matrix.index, columns=rating_matrix.index)
user_similarity

user_id,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,0.110895,0.051571,0.080832,0.281025,0.336667,0.337771,0.234709,0.062097,0.277682,...,0.255597,0.089437,0.196537,0.150845,0.103409,0.069808,0.270466,0.130549,0.097685,0.307499
2,0.110895,1.000000,0.036441,0.049208,0.028793,0.156617,0.091251,0.026594,0.049974,0.113598,...,0.099742,0.175905,0.258436,0.300978,0.255684,0.063620,0.160836,0.073665,0.106046,0.087932
3,0.051571,0.036441,1.000000,0.188331,0.028519,0.046928,0.048445,0.095104,0.057749,0.061774,...,0.027662,0.039028,0.109180,0.007875,0.051571,0.015684,0.121795,0.089381,0.083587,0.034838
4,0.080832,0.049208,0.188331,1.000000,0.013171,0.040320,0.061590,0.193213,0.133347,0.036981,...,0.043766,0.000000,0.039546,0.080815,0.074427,0.040239,0.188303,0.131041,0.084194,0.076614
5,0.281025,0.028793,0.028519,0.013171,1.000000,0.162666,0.279900,0.216450,0.013933,0.190443,...,0.268815,0.091546,0.085396,0.016888,0.110600,0.072526,0.156892,0.083863,0.082992,0.249492
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,0.069808,0.063620,0.015684,0.040239,0.072526,0.049876,0.053299,0.092995,0.053210,0.034151,...,0.054374,0.359603,0.180841,0.077395,0.382519,1.000000,0.075787,0.145888,0.017115,0.126565
940,0.270466,0.160836,0.121795,0.188303,0.156892,0.212245,0.267911,0.213552,0.132230,0.287181,...,0.225526,0.029014,0.154313,0.142063,0.119810,0.075787,1.000000,0.167069,0.134408,0.173896
941,0.130549,0.073665,0.089381,0.131041,0.083863,0.044536,0.069429,0.144434,0.110901,0.069202,...,0.027668,0.222505,0.243587,0.132323,0.170221,0.145888,0.167069,1.000000,0.074315,0.077655
942,0.097685,0.106046,0.083587,0.084194,0.082992,0.289005,0.192741,0.098278,0.122678,0.148008,...,0.185817,0.051108,0.056816,0.117635,0.041271,0.017115,0.134408,0.074315,1.000000,0.107898


In [9]:
##### (1) 

# Neighbor size를 정해서 예측치를 계산하는 함수 
def cf_knn(user_id, movie_id, neighbor_size=0):
    if movie_id in rating_matrix:
        # 현재 사용자와 다른 사용자 간의 similarity 가져오기
        sim_scores = user_similarity[user_id].copy()
        # 현재 영화에 대한 모든 사용자의 rating값 가져오기
        movie_ratings = rating_matrix[movie_id].copy()
        # 현재 영화를 평가하지 않은 사용자의 index 가져오기
        none_rating_idx = movie_ratings[movie_ratings.isnull()].index
        # 현재 영화를 평가하지 않은 사용자의 rating (null) 제거
        movie_ratings = movie_ratings.drop(none_rating_idx)
        # 현재 영화를 평가하지 않은 사용자의 similarity값 제거
        sim_scores = sim_scores.drop(none_rating_idx)
##### (2) Neighbor size가 지정되지 않은 경우        
        if neighbor_size == 0:          
            # 현재 영화를 평가한 모든 사용자의 가중평균값 구하기
            mean_rating = np.dot(sim_scores, movie_ratings) / sim_scores.sum()
##### (3) Neighbor size가 지정된 경우
        else:                       
            # 해당 영화를 평가한 사용자가 최소 2명이 되는 경우에만 계산
            if len(sim_scores) > 1: 
                # 지정된 neighbor size 값과 해당 영화를 평가한 총사용자 수 중 작은 것으로 결정
                neighbor_size = min(neighbor_size, len(sim_scores))
                # array로 바꾸기 (argsort를 사용하기 위함)
                sim_scores = np.array(sim_scores)
                movie_ratings = np.array(movie_ratings)
                # 유사도를 순서대로 정렬
                user_idx = np.argsort(sim_scores)
                # 유사도를 neighbor size만큼 받기
                sim_scores = sim_scores[user_idx][-neighbor_size:]
                # 영화 rating을 neighbor size만큼 받기
                movie_ratings = movie_ratings[user_idx][-neighbor_size:]
                # 최종 예측값 계산 
                mean_rating = np.dot(sim_scores, movie_ratings) / sim_scores.sum()
            else:
                mean_rating = 3.0
    else:
        mean_rating = 3.0
    return mean_rating

# 정확도 계산
score(cf_knn, neighbor_size=30)

[3.73443991 2.86498997 3.99478412 ... 3.17969611 3.85241072 3.54040441]
[5 5 4 ... 4 4 5]


1.011019211454133

In [10]:
##### (4) 주어진 사용자에 대해 추천받기 
# 전체 데이터로 full matrix와 cosine similarity 구하기

rating_matrix = ratings.pivot_table(values='rating', index='user_id', columns='movie_id')
from sklearn.metrics.pairwise import cosine_similarity
matrix_dummy = rating_matrix.copy().fillna(0)
user_similarity = cosine_similarity(matrix_dummy, matrix_dummy)
user_similarity = pd.DataFrame(user_similarity, index=rating_matrix.index, columns=rating_matrix.index)
user_similarity

user_id,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,0.166931,0.047460,0.064358,0.378475,0.430239,0.440367,0.319072,0.078138,0.376544,...,0.369527,0.119482,0.274876,0.189705,0.197326,0.118095,0.314072,0.148617,0.179508,0.398175
2,0.166931,1.000000,0.110591,0.178121,0.072979,0.245843,0.107328,0.103344,0.161048,0.159862,...,0.156986,0.307942,0.358789,0.424046,0.319889,0.228583,0.226790,0.161485,0.172268,0.105798
3,0.047460,0.110591,1.000000,0.344151,0.021245,0.072415,0.066137,0.083060,0.061040,0.065151,...,0.031875,0.042753,0.163829,0.069038,0.124245,0.026271,0.161890,0.101243,0.133416,0.026556
4,0.064358,0.178121,0.344151,1.000000,0.031804,0.068044,0.091230,0.188060,0.101284,0.060859,...,0.052107,0.036784,0.133115,0.193471,0.146058,0.030138,0.196858,0.152041,0.170086,0.058752
5,0.378475,0.072979,0.021245,0.031804,1.000000,0.237286,0.373600,0.248930,0.056847,0.201427,...,0.338794,0.080580,0.094924,0.079779,0.148607,0.071459,0.239955,0.139595,0.152497,0.313941
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,0.118095,0.228583,0.026271,0.030138,0.071459,0.111852,0.107027,0.095898,0.039852,0.071460,...,0.066039,0.431154,0.258021,0.226449,0.432666,1.000000,0.087687,0.180029,0.043264,0.144250
940,0.314072,0.226790,0.161890,0.196858,0.239955,0.352449,0.329925,0.246883,0.120495,0.342961,...,0.327153,0.107024,0.187536,0.181317,0.175158,0.087687,1.000000,0.145152,0.261376,0.241028
941,0.148617,0.161485,0.101243,0.152041,0.139595,0.144446,0.059993,0.146145,0.143245,0.090305,...,0.046952,0.203301,0.288318,0.234211,0.313400,0.180029,0.145152,1.000000,0.101642,0.095120
942,0.179508,0.172268,0.133416,0.170086,0.152497,0.317328,0.282003,0.175322,0.092497,0.212330,...,0.226440,0.073513,0.089588,0.129554,0.099385,0.043264,0.261376,0.101642,1.000000,0.182465


In [18]:
def recommender(user, n_items):
    # 현재 사용자의 모든 아이템에 대한 예상 평점 계산
    predictions = []
    rated_index = rating_matrix.loc[user][rating_matrix.loc[user] > 0].index    # 이미 평가한 영화 확인
    items = rating_matrix.loc[user].drop(rated_index)
    for item in items.index:
        predictions.append(cf_knn(user, item))                   # 예상평점 계산
    recommendations = pd.Series(data=predictions, index=items.index, dtype=float)
    recommendations = recommendations.sort_values(ascending=False)[:n_items]    # 예상평점이 가장 높은 영화 선택
    recommended_items = movies.loc[recommendations.index]['title']
    return recommended_items

recommender(user=2, n_items=50)

movie_id
1653    Entertaining Angels: The Dorothy Day Story (1996)
1122                       They Made Me a Criminal (1939)
1201           Marlene Dietrich: Shadow and Light (1996) 
1189                                   Prefontaine (1997)
1293                                      Star Kid (1997)
1536                                 Aiqing wansui (1994)
814                         Great Day in Harlem, A (1994)
1599                        Someone Else's America (1995)
1467                 Saint of Fort Washington, The (1993)
1500                            Santa with Muscles (1996)
1594                                       Everest (1998)
1449                               Pather Panchali (1955)
1398                                          Anna (1996)
119                Maya Lin: A Strong Clear Vision (1994)
318                               Schindler's List (1993)
64                       Shawshank Redemption, The (1994)
169                            Wrong Trousers, The (1993)
483  

In [19]:
UBCF_items=recommender(user=2, n_items=50)
UBCF_items

movie_id
1653    Entertaining Angels: The Dorothy Day Story (1996)
1122                       They Made Me a Criminal (1939)
1201           Marlene Dietrich: Shadow and Light (1996) 
1189                                   Prefontaine (1997)
1293                                      Star Kid (1997)
1536                                 Aiqing wansui (1994)
814                         Great Day in Harlem, A (1994)
1599                        Someone Else's America (1995)
1467                 Saint of Fort Washington, The (1993)
1500                            Santa with Muscles (1996)
1594                                       Everest (1998)
1449                               Pather Panchali (1955)
1398                                          Anna (1996)
119                Maya Lin: A Strong Clear Vision (1994)
318                               Schindler's List (1993)
64                       Shawshank Redemption, The (1994)
169                            Wrong Trousers, The (1993)
483  

In [20]:
UBCF_items.to_csv('UBCF 평점순 정렬 5점~1점.csv', encoding='latin-1')

In [19]:
##### (5) 최적의 neighbor size 구하기

# train set으로 full matrix와 cosine similarity 구하기 
rating_matrix = x_train.pivot_table(values='rating', index='user_id', columns='movie_id')
rating_matrix

movie_id,1,2,3,4,5,6,7,8,9,10,...,1670,1671,1672,1673,1676,1677,1678,1679,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,,4.0,3.0,3.0,5.0,,1.0,5.0,3.0,...,,,,,,,,,,
2,,,,,,,,,,2.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,3.0,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,,,,,,,,,5.0,,...,,,,,,,,,,
940,,,,,,,,5.0,3.0,,...,,,,,,,,,,
941,5.0,,,,,,4.0,,,,...,,,,,,,,,,
942,,,,,,,,,,,...,,,,,,,,,,


최적의 이웃값 찾기

In [20]:
from sklearn.metrics.pairwise import cosine_similarity
matrix_dummy = rating_matrix.copy().fillna(0)
user_similarity = cosine_similarity(matrix_dummy, matrix_dummy)
user_similarity = pd.DataFrame(user_similarity, index=rating_matrix.index, columns=rating_matrix.index)
for neighbor_size in [10, 20, 30, 40, 50, 60]:
    print("Neighbor size = %d : RMSE = %.4f" % (neighbor_size, score(cf_knn, neighbor_size)))

Neighbor size = 10 : RMSE = 1.0285
Neighbor size = 20 : RMSE = 1.0155
Neighbor size = 30 : RMSE = 1.0133
Neighbor size = 40 : RMSE = 1.0132
Neighbor size = 50 : RMSE = 1.0136
Neighbor size = 60 : RMSE = 1.0145


In [None]:
from scipy.stats import mode
def cf_binary(user_id, place_id):
    if place_id in likes_matrix:
        #print("현재 장소", place_id)
        sim_scores = user_similarity[user_id].copy()
        #print(sim_scores)
        
        place_likes = likes_matrix[place_id].copy()
        #print(place_likes)
        
        none_likes_idx = place_likes[place_likes.isnull()].index
        
        #print(place_likes[place_likes.isnull()])
        #print(none_likes_idx)
        
        place_likes = place_likes.drop(none_likes_idx)
        #print(place_likes[place_likes.isnull()])
        
        
        sim_scores = sim_scores.drop(none_likes_idx)
        #print(sim_scores[sim_scores.isnull()])
        #if sim_scores.sum() != 0.0:
        #    print("0이 아닐 때 sim_scores.sum :", sim_scores.sum())
        #print(sim_scores)
        
        
        #weighted_sum = np.sum(sim_scores * place_likes)
        #similarity_sum = np.sum(np.abs(sim_scores))
        #predicted_likes = weighted_sum / similarity_sum
        
        if sim_scores.sum() != 0.0:
            # 교재에 있는 기존 방식. 합이 0이 아닐때만 가중평균
            predicted_likes = np.dot(sim_scores, place_likes) / sim_scores.sum()
        else:
            
            predicted_likes = mode(place_likes, keepdims=True).mode[0]
            #predicted_likes = 0.0
        
    else:
        predicted_likes = 0.0 # 특정 장소에 대한 좋아요 없는 경우 예측 불가
    return predicted_likes

In [74]:
# Created or modified on May 2022
# Author: 임일
# 협업필터링(CF) 추천 - 기본

import numpy as np
import pandas as pd

# 데이터 읽어 오기 
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv('C:/RecoSys/Data/u.user', sep='|', names=u_cols, encoding='latin-1')
i_cols = ['movie_id', 'title', 'release date', 'video release date', 'IMDB URL', 'unknown', 
          'Action', 'Adventure', 'Animation', 'Children\'s', 'Comedy', 'Crime', 'Documentary', 
          'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 
          'Thriller', 'War', 'Western']
movies = pd.read_csv('C:/RecoSys/Data/u.item', sep='|', names=i_cols, encoding='latin-1')
r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv('C:/RecoSys/Data/u.data', sep='\t', names=r_cols, encoding='latin-1')
# timestamp 제거 
ratings = ratings.drop('timestamp', axis=1)
ratings["rating"]=1
ratings


Unnamed: 0,user_id,movie_id,rating
0,196,242,1
1,186,302,1
2,22,377,1
3,244,51,1
4,166,346,1
...,...,...,...
99995,880,476,1
99996,716,204,1
99997,276,1090,1
99998,13,225,1


In [75]:
# movie ID와 title 빼고 다른 데이터 제거
movies = movies[['movie_id', 'title']]
movies

Unnamed: 0,movie_id,title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)
...,...,...
1677,1678,Mat' i syn (1997)
1678,1679,B. Monkey (1998)
1679,1680,Sliding Doors (1998)
1680,1681,You So Crazy (1994)


In [76]:
# train, test 데이터 분리
from sklearn.model_selection import train_test_split
x = ratings.copy()
y = ratings['user_id']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, stratify=y)

# 정확도(RMSE)를 계산하는 함수 
def RMSE(y_true, y_pred):
    return np.sqrt(np.mean((np.array(y_true) - np.array(y_pred))**2))

# 모델별 RMSE를 계산하는 함수 
def score(model):
    id_pairs = zip(x_test['user_id'], x_test['movie_id'])
    y_pred = np.array([model(user, movie) for (user, movie) in id_pairs])
    print(y_pred)
    y_true = np.array(x_test['rating'])
    print(y_true)
    return RMSE(y_true, y_pred)


In [77]:
# train 데이터로 Full matrix 구하기 
rating_matrix = x_train.pivot(index='user_id', columns='movie_id', values='rating')
rating_matrix

movie_id,1,2,3,4,5,6,7,8,9,10,...,1671,1672,1673,1674,1676,1678,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.0,1.0,,,,1.0,1.0,,1.0,1.0,...,,,,,,,,,,
2,1.0,,,,,,,,,1.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,1.0,1.0,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,,,,,,,,,1.0,,...,,,,,,,,,,
940,,,,,,,1.0,1.0,1.0,,...,,,,,,,,,,
941,,,,,,,1.0,,,,...,,,,,,,,,,
942,,,,,,,,,,,...,,,,,,,,,,


In [83]:
##### (1)

# train set의 모든 가능한 사용자 pair의 Cosine similarities 계산
from sklearn.metrics.pairwise import cosine_similarity
matrix_dummy = rating_matrix.copy().fillna(0)
user_similarity = cosine_similarity(matrix_dummy, matrix_dummy)
user_similarity = pd.DataFrame(user_similarity, index=rating_matrix.index, columns=rating_matrix.index)
user_similarity

user_id,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,0.113553,0.054672,0.082512,0.287506,0.284071,0.334395,0.189990,0.016981,0.262239,...,0.275272,0.169017,0.224412,0.127827,0.186704,0.126613,0.227006,0.140028,0.127611,0.318105
2,0.113553,1.000000,0.092106,0.139010,0.025764,0.234597,0.084843,0.111139,0.143040,0.125511,...,0.090174,0.191655,0.286417,0.323029,0.245737,0.145436,0.148361,0.147442,0.153562,0.052541
3,0.054672,0.092106,1.000000,0.294484,0.000000,0.086972,0.044934,0.141264,0.075755,0.066472,...,0.027290,0.029001,0.091014,0.114053,0.104116,0.000000,0.139686,0.039043,0.060996,0.027826
4,0.082512,0.139010,0.294484,1.000000,0.020593,0.037503,0.081379,0.213201,0.057166,0.040129,...,0.041187,0.043769,0.045787,0.172133,0.130946,0.000000,0.131762,0.117851,0.153429,0.020998
5,0.287506,0.025764,0.000000,0.020593,1.000000,0.139016,0.281546,0.118544,0.063571,0.148749,...,0.236641,0.032449,0.076375,0.031903,0.126202,0.057454,0.117220,0.021843,0.056873,0.287992
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,0.126613,0.145436,0.000000,0.000000,0.057454,0.091552,0.104061,0.049568,0.039873,0.069973,...,0.086182,0.305281,0.191614,0.120060,0.292265,1.000000,0.055141,0.164399,0.021403,0.117167
940,0.227006,0.148361,0.139686,0.131762,0.117220,0.257943,0.250909,0.185405,0.135582,0.333107,...,0.214903,0.124568,0.141171,0.122474,0.173916,0.055141,1.000000,0.083853,0.101889,0.169324
941,0.140028,0.147442,0.039043,0.117851,0.021843,0.179000,0.043158,0.150756,0.121268,0.063844,...,0.021843,0.185695,0.145693,0.228218,0.277778,0.164399,0.083853,1.000000,0.130189,0.044544
942,0.127611,0.153562,0.060996,0.153429,0.056873,0.217503,0.194780,0.117760,0.126302,0.099742,...,0.147871,0.072526,0.050580,0.118846,0.115723,0.021403,0.101889,0.130189,1.000000,0.127580


In [79]:
# 주어진 영화의 (movie_id) 가중평균 rating을 계산하는 함수, 
# 가중치는 주어진 사용자와 다른 사용자 간의 유사도(user_similarity)
def CF_simple(user_id, movie_id):
    if movie_id in rating_matrix:
        # 현재 사용자와 다른 사용자 간의 similarity 가져오기
        sim_scores = user_similarity[user_id].copy()
        # 현재 영화에 대한 모든 사용자의 rating값 가져오기
        movie_ratings = rating_matrix[movie_id].copy()
        # 현재 영화를 평가하지 않은 사용자의 index 가져오기
        none_rating_idx = movie_ratings[movie_ratings.isnull()].index
        # 현재 영화를 평가하지 않은 사용자의 rating (null) 제거
        movie_ratings = movie_ratings.dropna()
        # 현재 영화를 평가하지 않은 사용자의 similarity값 제거
        sim_scores = sim_scores.drop(none_rating_idx)
        # 현재 영화를 평가한 모든 사용자의 가중평균값 구하기
        mean_rating = np.dot(sim_scores, movie_ratings) / sim_scores.sum()
    else:
        mean_rating = 0.0
    return mean_rating

# 정확도 계산
score(CF_simple)



[1. 1. 1. ... 1. 1. 1.]
[1 1 1 ... 1 1 1]


0.03847076812334269

In [92]:
def find_similar_users_and_items(user_id, num_users=10, min_similarity=0.3):
    # 사용자 유사도 행렬에서 특정 사용자의 유사도 가져오기
    user_similarities = user_similarity[user_id]
    
    # 유사도를 기준으로 내림차순으로 정렬하여 상위 num_users 개의 사용자 선택
    similar_users_idx = user_similarities[user_similarities >= min_similarity].sort_values(ascending=False).index[:num_users]
    
    # 선택된 사용자들과 해당 유사도 반환
    similar_users_and_similarity = list(zip(similar_users_idx, user_similarities[similar_users_idx]))
    
    # 선택된 사용자들이 좋아하는 아이템의 ID 반환
    liked_items = set()
    for similar_user, _ in similar_users_and_similarity:
        # 비슷한 사용자가 좋아하는 아이템의 index 가져오기
        similar_user_likes = rating_matrix.loc[similar_user]
        liked_items.update(similar_user_likes[similar_user_likes > 0.5].index)
    
    return similar_users_and_similarity, liked_items

# 예시: user_id가 2인 사용자와 유사도가 0.5 이상인 상위 5명의 사용자 및 그들이 좋아하는 아이템의 ID 출력
similar_users_and_items, liked_items = find_similar_users_and_items(user_id=6, num_users=5)

# 찾은 비슷한 사용자들의 user_id와 유사도 출력
for similar_user, similarity in similar_users_and_items:
    print(f"사용자 {similar_user}: 유사도 {similarity}")

# 비슷한 사용자들이 좋아하는 아이템의 ID 출력
print(f"\n비슷한 사용자들이 좋아하는 아이템 ID: {liked_items}")


사용자 6: 유사도 0.9999999999999991
사용자 474: 유사도 0.43202346922864066
사용자 537: 유사도 0.4064186443489132
사용자 298: 유사도 0.39995003018989844
사용자 567: 유사도 0.39887445309541864

비슷한 사용자들이 좋아하는 아이템 ID: {1, 3, 4, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 19, 20, 22, 23, 24, 25, 26, 28, 30, 32, 39, 42, 44, 45, 46, 47, 50, 52, 53, 55, 56, 58, 59, 60, 61, 64, 65, 66, 68, 69, 70, 71, 73, 76, 79, 82, 85, 86, 87, 88, 89, 90, 92, 95, 96, 97, 98, 99, 100, 101, 102, 107, 109, 111, 116, 117, 118, 121, 123, 124, 125, 126, 127, 129, 131, 132, 133, 134, 135, 136, 137, 140, 141, 143, 147, 150, 151, 152, 153, 154, 156, 165, 168, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 185, 186, 187, 188, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 215, 216, 218, 221, 222, 223, 226, 230, 231, 234, 237, 238, 241, 242, 243, 244, 246, 248, 252, 255, 257, 258, 259, 261, 262, 265, 268, 269, 270, 271, 272, 273, 274, 275, 276, 277, 279, 281, 28

In [68]:
### (4) 주어진 사용자에 대해 추천받기 
# 전체 데이터로 full matrix와 cosine similarity 구하기 
rating_matrix = ratings.pivot_table(values='rating', index='user_id', columns='movie_id')
from sklearn.metrics.pairwise import cosine_similarity
matrix_dummy = rating_matrix.copy().fillna(0)
user_similarity = cosine_similarity(matrix_dummy, matrix_dummy)
user_similarity = pd.DataFrame(user_similarity, index=rating_matrix.index, columns=rating_matrix.index)

def recommender(user, n_items=10):
    # 현재 사용자의 모든 아이템에 대한 예상 평점 계산
    predictions = []
    rated_index = rating_matrix.loc[user][rating_matrix.loc[user] > 0].index    # 이미 평가한 영화 확인
    items = rating_matrix.loc[user].drop(rated_index)
    for item in items.index:
        predictions.append(CF_simple(user, item))                   # 예상평점 계산
    recommendations = pd.Series(data=predictions, index=items.index, dtype=float)
    recommendations = recommendations.sort_values(ascending=False)[:n_items]    # 예상평점이 가장 높은 영화 선택
    print(recommendations)
    recommended_items = movies.loc[recommendations.index]['title']
    return recommended_items

recommender(user=2, n_items=200)


movie_id
169     1.0
949     1.0
516     1.0
384     1.0
481     1.0
       ... 
488     1.0
1204    1.0
486     1.0
483     1.0
660     1.0
Length: 200, dtype: float64


movie_id
169         Cinema Paradiso (1988)
949                 Georgia (1995)
516               Manhattan (1979)
384               True Lies (1994)
481        Some Like It Hot (1959)
                   ...            
488               Notorious (1946)
1204      Secret Agent, The (1996)
486           Roman Holiday (1953)
483     Maltese Falcon, The (1941)
660               High Noon (1952)
Name: title, Length: 200, dtype: object

In [None]:

##### (5) 최적의 neighbor size 구하기

# train set으로 full matrix와 cosine similarity 구하기 
rating_matrix = x_train.pivot_table(values='rating', index='user_id', columns='movie_id')
from sklearn.metrics.pairwise import cosine_similarity
matrix_dummy = rating_matrix.copy().fillna(0)
user_similarity = cosine_similarity(matrix_dummy, matrix_dummy)
user_similarity = pd.DataFrame(user_similarity, index=rating_matrix.index, columns=rating_matrix.index)
for neighbor_size in [10, 20, 30, 40, 50, 60]:
    print("Neighbor size = %d : RMSE = %.4f" % (neighbor_size, score(cf_knn, neighbor_size)))
