In [1]:

import numpy as np
import pandas as pd

In [2]:
# 데이터 읽어 오기 
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv('../dataset/u.user', sep='|', names=u_cols, encoding='latin-1')
i_cols = ['movie_id', 'title', 'release date', 'video release date', 'IMDB URL', 'unknown', 
          'Action', 'Adventure', 'Animation', 'Children\'s', 'Comedy', 'Crime', 'Documentary', 
          'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 
          'Thriller', 'War', 'Western']
movies = pd.read_csv('../dataset/u.item', sep='|', names=i_cols, encoding='latin-1')
r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv('../dataset/u.data', sep='\t', names=r_cols, encoding='latin-1')

# timestamp 제거 
ratings = ratings.drop('timestamp', axis=1)
# movie ID와 title 빼고 다른 데이터 제거
movies = movies[['movie_id', 'title']]

In [5]:
# train, test 데이터 분리
from sklearn.model_selection import train_test_split
x = ratings.copy()
y = ratings['user_id']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, stratify=y)

In [6]:
def RMSE(y_true, y_pred):
    return np.sqrt(np.mean((np.array(y_true)-np.array(y_pred))**2))

In [7]:
def score(model, neighbor_size=0):
    id_pairs = zip(x_test['user_id'],x_test['movie_id'])
    y_pred = np.array([model(user,movie,neighbor_size)for (user,movie) in id_pairs])
    y_true = np.array(x_test['rating'])
    return RMSE(y_true,y_pred)

In [8]:
rating_matrix = x_train.pivot(index = 'user_id',columns = 'movie_id', values = 'rating')

In [11]:
rating_matrix

movie_id,1,2,3,4,5,6,7,8,9,10,...,1669,1670,1671,1672,1673,1674,1675,1676,1680,1681
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,,3.0,...,,,,,,,,,,
2,,,,,,,,,,,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,,,,,,,,,5.0,,...,,,,,,,,,,
940,,,,2.0,,,,5.0,3.0,,...,,,,,,,,,,
941,5.0,,,,,,4.0,,,,...,,,,,,,,,,
942,,,,,,,,,,,...,,,,,,,,,,


In [14]:
#train set 사용자들의 cosine similarites 계산
from sklearn.metrics.pairwise import cosine_similarity
matrix_dummy = rating_matrix.copy().fillna(0)
user_similarity = cosine_similarity(matrix_dummy,matrix_dummy)
user_similarity = pd.DataFrame(user_similarity,index=rating_matrix.index,columns=rating_matrix.index)

In [15]:
#neighbor size를 정해서 예측치를 계산하는 함수
def cf_knn(user_id, movie_id,neighbor_size = 0):
    if movie_id in rating_matrix:
        #현재 사용자와 다른 사용자 간의 similarity 가져오기
        sim_scores = user_similarity[user_id].copy()
        # 현재 영화에 대한 모든 사용자의 rating 가져오기
        movie_ratings = rating_matrix[movie_id].copy()
        # 현재 영화를 평가하지 않은 사용자의 index 가져오기
        none_rating_idx = movie_ratings[movie_ratings.isnull()].index 
        # 현재 영화를 평가하지 않은 사용자의 rating (null) 제거
        movie_ratings = movie_ratings.drop(none_rating_idx)
        # 현재 영화를 평가하지 않은 사용자의 similarity 값 제거
        sim_scores = sim_scores.drop(none_rating_idx)
        #neighbor size가 지정되지 않은 경우
        if neighbor_size == 0:
            #현재 영화를 평가한 모든 사용자의 가중평균값 구하기
            mean_rating = np.dot(sim_scores,movie_ratings)/sim_scores.sum()
        # neighbor size가 지정된 경우    
        else:
            if len(sim_scores) > 1:
                # 지정된 neighbor size값과 해당 영화를 평가한 총 사용자 수 중 작은 것으로 사용
                neighbor_size = min(neighbor_size,len(sim_scores))
                # array로 바꾸기 (argsort를 사용하기 위함)
                sim_scores = np.array(sim_scores)
                movie_ratings = np.array(movie_ratings)
                # 유사도를 순서대로 정렬
                user_idx =np.argsort(sim_scores)
                # 유사도를 neighbor size 만큼 받기
                sim_scores = movie_ratings[user_idx][-neighbor_size:]
                # 영화 rating을 neighbor size만큼 받기
                movie_ratings = movie_ratings[user_idx][-neighbor_size:]
                # 최족 예측값 계산
                mean_rating = np.dot(sim_scores,movie_ratings) / sim_scores.sum()
            else:
                mean_rating =3.0
    else:
        mean_rating = 3.0
    return mean_rating
                

In [16]:
score(cf_knn,neighbor_size=30)

1.0521965306565177

In [17]:
# 추천 받기
rating_matrix = ratings.pivot_table(values='rating',index='user_id', columns='movie_id')


In [19]:
matrix_dummy = rating_matrix.copy().fillna(0)
user_similarity= cosine_similarity(matrix_dummy,matrix_dummy)
user_similarity = pd.DataFrame(user_similarity,index=rating_matrix.index,columns=rating_matrix.index)

In [20]:
def recom_movie(user_id,n_items,neighbor_size=30):
    # 현 사용자가 평가한 영화 가져오기
    user_movie = rating_matrix.loc[user_id].copy()
    for movie in rating_matrix:
        # 현 사용자가 이미 평가한 영화는 제외(평점을 0으로)
        if pd.notnull(user_movie.loc[movie]):
            user_movie.loc[movie]=0
        #현 사용자가 평가하지 않은 영화의 예상 평점 계싼
        else:
            user_movie.loc[movie] = cf_knn(user_id,movie,neighbor_size)
    # 영화를 예상 평점에 따라 정렬해서 제목을 뽑아서 돌려줌
    movie_sort = user_movie.sort_values(ascending=False)[:n_items]
    recom_movies =  movies.loc[movie_sort.index]
    recommendations = recom_movies['title']
    return recommendations

recom_movie(user_id=2,n_items=5,neighbor_size=30)

movie_id
1293                     Ayn Rand: A Sense of Life (1997)
1500    Prisoner of the Mountains (Kavkazsky Plennik) ...
1467                                     Cure, The (1995)
1189                              That Old Feeling (1997)
318                       Everyone Says I Love You (1996)
Name: title, dtype: object

In [21]:
# 최적의 neighbor size 구하기
#train set으로 full matrix와 cosine similarity 구하기
rating_matrix = x_train.pivot_table(values='rating',index='user_id',columns='movie_id')


In [22]:
matrix_dummy = rating_matrix.copy().fillna(0)
user_similarity = cosine_similarity(matrix_dummy,matrix_dummy)
user_similarity = pd.DataFrame(user_similarity,index=rating_matrix.index,columns=rating_matrix.index)

In [24]:
for neighbor_size in [10,20,30,40,50,60]:
    print("Neigbor size = %d : RMSE = %.4f"%(neighbor_size,score(cf_knn,neighbor_size)))

Neigbor size = 10 : RMSE = 1.0587
Neigbor size = 20 : RMSE = 1.0500
Neigbor size = 30 : RMSE = 1.0522
Neigbor size = 40 : RMSE = 1.0534
Neigbor size = 50 : RMSE = 1.0551
Neigbor size = 60 : RMSE = 1.0567


In [26]:
for neighbor_size in [10,20,30,40,50,60]:
    print(f"Neigbor size = {neighbor_size} : RMSE ={score(cf_knn,neighbor_size):.4f}")

Neigbor size = 10 : RMSE =1.0587
Neigbor size = 20 : RMSE =1.0500
Neigbor size = 30 : RMSE =1.0522
Neigbor size = 40 : RMSE =1.0534
Neigbor size = 50 : RMSE =1.0551
Neigbor size = 60 : RMSE =1.0567
