In [15]:

import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity,

In [35]:
# 데이터 읽어 오기 
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv('../dataset/u.user', sep='|', names=u_cols, encoding='latin-1')
i_cols = ['movie_id', 'title', 'release date', 'video release date', 'IMDB URL', 'unknown', 
          'Action', 'Adventure', 'Animation', 'Children\'s', 'Comedy', 'Crime', 'Documentary', 
          'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 
          'Thriller', 'War', 'Western']
movies = pd.read_csv('../dataset/u.item', sep='|', names=i_cols, encoding='latin-1')
r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv('../dataset/u.data', sep='\t', names=r_cols, encoding='latin-1')


In [36]:
# timestamp 제거 
ratings = ratings.drop('timestamp', axis=1)

In [50]:
movies

Unnamed: 0,movie_id,title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)
...,...,...
1677,1678,Mat' i syn (1997)
1678,1679,B. Monkey (1998)
1679,1680,Sliding Doors (1998)
1680,1681,You So Crazy (1994)


In [38]:
# movie ID와 title 빼고 다른 데이터 제거
movies = movies[['movie_id', 'title']]

In [39]:
movies

Unnamed: 0,movie_id,title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)
...,...,...
1677,1678,Mat' i syn (1997)
1678,1679,B. Monkey (1998)
1679,1680,Sliding Doors (1998)
1680,1681,You So Crazy (1994)


In [40]:
# train, test 데이터 분리
from sklearn.model_selection import train_test_split
x = ratings.copy()
y = ratings['user_id']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, stratify=y)


In [41]:
#정확도(RMSE)를 계산하는 함수
def RMSE(y_true, y_pred):
    return np.sqrt(np.mean((np.array(y_true)-np.array(y_pred))**2))

In [54]:
# 모델별 RMSE를 계산하는 함수
def score(model):
    id_pairs = zip(x_test['user_id'],x_test['movie_id'])
    y_pred = np.array([model(user,movie) for (user,movie) in id_pairs])
    y_true = np.array(x_test['rating'])
    return RMSE(y_true,y_pred)


In [43]:
# train 데이터로 full matrix 구하기
rating_matrix = x_train.pivot(index='user_id',columns='movie_id',values='rating')

In [65]:
#train set의 모든 기ㅏ능한 사용자 pair의 cosine similarities계싼
matrix_dummy = rating_matrix.copy().fillna(0)
user_similarity = cosine_similarity(matrix_dummy,matrix_dummy)
user_similarity = pd.DataFrame(user_similarity, index = rating_matrix.index,columns=rating_matrix.index)


In [66]:
user_similarity

user_id,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,0.110460,0.025953,0.035808,0.254718,0.318631,0.345233,0.266241,0.065958,0.300434,...,0.288964,0.087463,0.237543,0.115658,0.167406,0.054412,0.249031,0.123613,0.140345,0.333594
2,0.110460,1.000000,0.084379,0.146000,0.040218,0.186254,0.092479,0.099562,0.074415,0.121291,...,0.111277,0.200547,0.318116,0.202929,0.233436,0.143618,0.119401,0.070430,0.110855,0.081297
3,0.025953,0.084379,1.000000,0.363089,0.000000,0.059204,0.054772,0.056583,0.021911,0.057356,...,0.022574,0.017817,0.099679,0.063449,0.059207,0.034331,0.123049,0.046827,0.142802,0.000000
4,0.035808,0.146000,0.363089,1.000000,0.013628,0.068517,0.065896,0.148897,0.059238,0.037430,...,0.024412,0.048169,0.123766,0.137231,0.069083,0.041252,0.176316,0.126600,0.149554,0.031242
5,0.254718,0.040218,0.000000,0.013628,1.000000,0.132462,0.241703,0.166189,0.077797,0.177708,...,0.252559,0.076170,0.084178,0.111813,0.140899,0.043120,0.145335,0.132332,0.052283,0.236552
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,0.054412,0.143618,0.034331,0.041252,0.043120,0.088575,0.103798,0.067018,0.054066,0.051536,...,0.062387,0.242678,0.173936,0.152305,0.349400,1.000000,0.072871,0.154834,0.042730,0.134970
940,0.249031,0.119401,0.123049,0.176316,0.145335,0.226858,0.265942,0.231798,0.034881,0.311075,...,0.237901,0.051053,0.092467,0.129289,0.106160,0.072871,1.000000,0.096909,0.150279,0.219989
941,0.123613,0.070430,0.046827,0.126600,0.132332,0.139692,0.025716,0.091411,0.136060,0.076684,...,0.049233,0.148414,0.172963,0.206032,0.269960,0.154834,0.096909,1.000000,0.029141,0.017502
942,0.140345,0.110855,0.142802,0.149554,0.052283,0.214596,0.222835,0.115028,0.061360,0.125543,...,0.155935,0.048508,0.053608,0.050541,0.045572,0.042730,0.150279,0.029141,1.000000,0.155814


In [55]:
#주어진 영화의 (movie_id) 가중평균 rating을 계산하는 함수,
#가중치는 주어진 사용자와 다른 사용자 간의 유사도(user_similarity)
def CF_simple(user_id, movie_id):
    if movie_id in rating_matrix:
        # 현재 사용자와 다른 사용자 간의 similarity 가져오기
        sim_scores = user_similarity[user_id].copy()
        # 현재 영화에 대한 모든 사용자의 rating값 가져오기
        movie_ratings = rating_matrix[movie_id].copy()
        # 현재 영화를 평가하지 않은 사용자의 index 가져오기
        none_rating_idx = movie_ratings[movie_ratings.isnull()].index
        # 현재 영화를 평가하지 않은 사용자의 rating (null) 제거
        movie_ratings = movie_ratings.dropna()
        # 현재 영화를 평가하지 않은 사용자의 similarity 값 제거
        sim_scores = sim_scores.drop(none_rating_idx)
        # 현재 영화를 평가한 모든 사용자의 가중평균값 구하기
        mean_rating = np.dot(sim_scores, movie_ratings) / sim_scores.sum()
    else:
        mean_rating = 3.0
    return mean_rating
        

In [56]:
score(CF_simple)

1.0176295278964103

In [67]:
# exercise
# 코사인 유사도 대신 피어슨 상관계수를 사용하여 CF 구현
user_similarity_pearson = matrix_dummy.transpose().corr(method='pearson')
def CF_pearson_simple(user_id,movie_id):
    if movie_id in rating_matrix:
        sim_scores = user_similarity_pearson[user_id].copy()
        movie_ratings = rating_matrix[movie_id].copy()
        none_rating_idx = movie_ratings[movie_ratings.isnull()].index
        movie_ratings = movie_ratings.dropna()
        sim_scores = sim_scores.drop(none_rating_idx)
        mean_rating = np.dot(sim_scores, movie_ratings) / sim_scores.sum()
    else:
        mean_rating= 3.0 
    return mean_rating

In [68]:
score(CF_pearson_simple)

1.0345990100700624