In [2]:
import numpy as np
import pandas as pd


In [3]:
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv('../dataset/u.user', sep='|', names=u_cols, encoding='latin-1')
i_cols = ['movie_id', 'title', 'release date', 'video release date', 'IMDB URL', 'unknown', 
          'Action', 'Adventure', 'Animation', 'Children\'s', 'Comedy', 'Crime', 'Documentary', 
          'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 
          'Thriller', 'War', 'Western']
movies = pd.read_csv('../dataset//u.item', sep='|', names=i_cols, encoding='latin-1')
r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv('../dataset//u.data', sep='\t', names=r_cols, encoding='latin-1')

In [4]:
# timestamp 제거
ratings = ratings.drop('timestamp', axis=1)
# movie Id 와 title 빼고 다른 데이터 제거
movies = movies[['movie_id','title']]


In [5]:
#train , test 데이터 분리
from sklearn.model_selection import train_test_split
x= ratings.copy()
y= ratings['user_id']
x_train, x_test, y_train,y_test = train_test_split(x,y,test_size=0.25, stratify=y)

In [6]:
# 정확도를 계산하는 함수
def RMSE(y_true,y_pred):
    return np.sqrt(np.mean((np.array(y_true)- np.array(y_pred))**2))

In [7]:
def score(model,neighbor_size = 0):
    id_pairs = zip(x_test['user_id'],x_test['movie_id'])
    y_pred = np.array([model(user,movie,neighbor_size) for (user,movie) in id_pairs])
    y_true = np.array(x_test['rating'])
    return RMSE(y_true,y_pred)

In [8]:
rating_matrix = x_train.pivot(index='user_id',columns='movie_id',values='rating')

In [9]:
rating_matrix

movie_id,1,2,3,4,5,6,7,8,9,10,...,1670,1671,1672,1673,1674,1676,1678,1679,1680,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,,,,5.0,4.0,1.0,5.0,,...,,,,,,,,,,
2,4.0,,,,,,,,,2.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,3.0,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,,,,,,,,,5.0,,...,,,,,,,,,,
940,,,,2.0,,,4.0,,3.0,,...,,,,,,,,,,
941,5.0,,,,,,4.0,,,,...,,,,,,,,,,
942,,,,,,,,,,,...,,,,,,,,,,


In [10]:
from sklearn.metrics.pairwise import cosine_similarity
matrix_dummy = rating_matrix.copy().fillna(0)
user_similarity = cosine_similarity(matrix_dummy,matrix_dummy)
user_similarity = pd.DataFrame(user_similarity, index= rating_matrix.index, columns=rating_matrix.index)

In [11]:
user_similarity

user_id,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,0.141476,0.046633,0.066776,0.243146,0.320334,0.319859,0.273042,0.069334,0.293632,...,0.293991,0.066081,0.255820,0.152114,0.173149,0.097286,0.258751,0.118072,0.158898,0.310567
2,0.141476,1.000000,0.056599,0.196538,0.047628,0.200335,0.107586,0.098163,0.100139,0.154134,...,0.105608,0.326509,0.271595,0.371978,0.313808,0.269006,0.188152,0.173386,0.138044,0.091409
3,0.046633,0.056599,1.000000,0.177231,0.000000,0.049426,0.053108,0.068721,0.074706,0.023908,...,0.027347,0.000000,0.156616,0.017705,0.100847,0.017827,0.118214,0.080230,0.073590,0.014613
4,0.066776,0.196538,0.177231,1.000000,0.029680,0.078517,0.061540,0.175534,0.128945,0.081310,...,0.055069,0.000000,0.098229,0.190150,0.159075,0.038292,0.196208,0.135401,0.122583,0.036619
5,0.243146,0.047628,0.000000,0.029680,1.000000,0.189476,0.252466,0.167757,0.067357,0.127979,...,0.238441,0.019709,0.063964,0.085139,0.091935,0.032004,0.164505,0.040187,0.106161,0.224075
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,0.097286,0.269006,0.017827,0.038292,0.032004,0.099869,0.061705,0.094102,0.000000,0.058403,...,0.018969,0.278016,0.224900,0.241653,0.304512,1.000000,0.088896,0.126508,0.038046,0.147191
940,0.258751,0.188152,0.118214,0.196208,0.164505,0.265074,0.200361,0.141238,0.074837,0.247750,...,0.260660,0.083213,0.164783,0.129278,0.149290,0.088896,1.000000,0.158954,0.212499,0.154082
941,0.118072,0.173386,0.080230,0.135401,0.040187,0.153329,0.035769,0.103713,0.049662,0.103140,...,0.026512,0.152582,0.298801,0.191259,0.196141,0.126508,0.158954,1.000000,0.104828,0.077914
942,0.158898,0.138044,0.073590,0.122583,0.106161,0.222263,0.210488,0.148358,0.099239,0.170685,...,0.161887,0.072834,0.064174,0.118754,0.091276,0.038046,0.212499,0.104828,1.000000,0.109149


In [12]:
rating_mean= rating_matrix.mean(axis=1)
rating_bias = (rating_matrix.T - rating_mean).T

In [13]:
def CF_knn_bias(user_id, movie_id, neighbor_size=0):
    if movie_id in rating_bias:
        # 현 user와 다른 사용자 간의 dbtkeh rkwudhrl
        sim_scores = user_similarity[user_id].copy()
        # 현 movie의 평점 편차 가져오기
        movie_ratings= rating_bias[movie_id].copy()
        # 현 movie에 대한 rating이 없는 사용자 삭제
        none_rating_idx = movie_ratings[movie_ratings.isnull()].index
        movie_ratings = movie_ratings.drop(none_rating_idx)
        sim_scores = sim_scores.drop(none_rating_idx)
        
        if neighbor_size == 0:
            # 편차로 예측 계산
            prediction = np.dot(sim_scores,movie_ratings)/sim_scores.sum()
            # 편차 예측값에 현 사용자의 평균 더하기
            prediction = prediction+rating_mean[user_id]
        
        else:
            if len(sim_scores) >1:
                # 지정된 neighbor size 값과 해당 영화를 평가한 총사용자 수 중 작은 것으로 진행
                neighbor_size = min(neighbor_size, len(sim_scores))
                # array로 바꾸기 (argsort를 사용하기 위함)
                sim_scores = np.array(sim_scores)
                movie_ratings = np.array(movie_ratings)
                #유사도를 순서대로 정렬
                user_idx = np.argsort(sim_scores)
                #유사도와 rating을 neighbor size만큼 받기
                sim_scores =  sim_scores[user_idx][-neighbor_size:]
                movie_ratings = movie_ratings[user_idx][-neighbor_size:]
                # 편차로 예측치 계산
                prediction = np.dot(sim_scores,movie_ratings)/ sim_scores.sum()
                # 예측값에 현 사용자의 평균 더하기
                prediction = prediction + rating_mean[user_id]
            else:
                prediction = rating_mean[user_id]
    else:
        prediction = rating_mean[user_id]
    return prediction


In [14]:
score(CF_knn_bias,20)

0.9422853264254089

In [64]:
# 연습하기
rating_matrix = ratings.pivot_table(index='user_id',columns='movie_id',values='rating')

In [65]:
matrix_dummy = rating_matrix.copy().fillna(0)
user_similarity = cosine_similarity(matrix_dummy,matrix_dummy)
user_similarity = pd.DataFrame(user_similarity, index=rating_matrix.index,columns=rating_matrix.index)

In [75]:
def recomender(user,n_items=10,neighbor_size=20):
    predictions=[]
    rated_index=rating_matrix.loc[user][rating_matrix.loc[user]>0].index
    items = rating_matrix.loc[user].drop(rated_index)
    for item in items.index:
        predictions.append(CF_knn_bias(user,item,neighbor_size))
    recomendations = pd.Series(data=predictions,index=items.index,dtype=float)
    recomendations = recomendations.sort_values(ascending=False)[:n_items]
    recomended_items = movies.loc[recomendations.index]['title']
    return recomended_items
        
    

In [76]:
recomender(user=1,n_items=5,neighbor_size=20)

movie_id
851                              Bloody Child, The (1996)
1643                              Sudden Manhattan (1996)
1463                  Stars Fell on Henrietta, The (1995)
1368    Forbidden Christ, The (Cristo proibito, Il) (1...
1293                     Ayn Rand: A Sense of Life (1997)
Name: title, dtype: object