In [1]:
# 주피터노트북 목차 만들기
# 참고: https://mingchin.tistory.com/139

# pip install jupyter_contrib_nbextensions
# ! jupyter contrib nbextension install --user

# 목차
>## 1. 이웃 기반 협업필터링
>>### 1.1. def cf_knn을 활용한 이웃 기반 협업필터링
>>>#### 1.1.1. 데이터 읽어오고 정리하기
>>>#### 1.1.2. 함수 def cf_knn
>>### 1.2. 실제 추천 알고리즘에 적용하기
>>>#### 1.1.1. 데이터 읽어오고 정리하기
>>>#### 1.2.2. 함수 def cf_knn
>>### 1.3. 최적의 K의 개수 구하기
>>>#### 1.1.1. 데이터 읽어오고 정리하기
>>>#### 1.3.2. neighbor_size 조절해 가며 for문 돌리기

# 1. 이웃 기반 협업필터링

## 1.1. def cf_knn을 활용한 이웃 기반 협업필터링

<요약>  
기본적인 협업 필터링은 베스트셀러 방식으로, 추천해주고자 하는 사용자와 유사도를 비교하는 그룹이 사용자 전체이다.  
이럴경우 모든 사용자에게 동일한 항목만을 추천하게 되는 이슈가 발생한다.  
위의 이슈를 해결하기 위해 **유사도가 높은 [Neighbor size]명의 가중평균을 예상 평점**으로 정해준다  
  
  
<코드 요약>  

>- 데이터 읽어오고 정리(RMSE 함수와 score(RMSE 사용)함수)하기
>- def cf_knn(user_id, movie_id, neighbor_size)
>    - 전달받은 movie_id에 대해 점수를 부여한 사람의 값이 rating_matrix에 존재할 때
>        - movie_id에 대한 평점이 없는 사람 제거하기
>        - 이웃의 크기가 1 이상일 때
>            - movie_id에 대해 평가한 사람이 1보다 클 때 (mean_rating = np.dot(sim_scores, movie_ratings) / sim_scores.sum())  
>            (유사도가 높은 [Neighbor size]명의 사람들의 평점에, 유사도를 가중치 삼아 가중평균을 구한다)
>            - movie_id에 대해 평가한 사람이 1이하일 때 (mean_rating = 3.0)
>        - 이웃의 크기가 0일때 (mean_rating = np.dot(sim_scores, movie_ratings) / sim_scores.sum())  
>            (Neighbor size가 없기 때문에 전체 데이터를 이용해 가중평균을 구한다)
>    - 전달받은 movie_id에 대해 점수를 부여한 사람의 값이 rating_matrix에 존재하지 않을 때 (mean_rating = 3.0)
>- 예상 평점값 도출 (mean_rating)  
  
--------------------------------------------------  
    
<가정>  
  
코드에 대한 이해를 돕기 위해 예시로,  
user_id = 2, movie_id = 2, neighbor_size = 30  
값을 넣는다고 가정하고 코드 실행 

### 1.1.1. 데이터 읽어오고 정리하기  
사용해야하는 데이터를 읽어온다  
user에는 user_id에 대한 정보가 들어있고,  
movies에는 movie_id에 대한 정보,  
ratings에는 각 사용자들이 영화에 부여한 평점에 대한 정보가 들어있다

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
# 데이터 읽어 오기 
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv('C:/RecoSys/Data/u.user', sep='|', names=u_cols, encoding='latin-1')
i_cols = ['movie_id', 'title', 'release date', 'video release date', 'IMDB URL', 'unknown', 
          'Action', 'Adventure', 'Animation', 'Children\'s', 'Comedy', 'Crime', 'Documentary', 
          'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 
          'Thriller', 'War', 'Western']
movies = pd.read_csv('C:/RecoSys/Data/u.item', sep='|', names=i_cols, encoding='latin-1')
r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv('C:/RecoSys/Data/u.data', sep='\t', names=r_cols, encoding='latin-1')

ratings = ratings.drop('timestamp', axis=1)

movies = movies[['movie_id', 'title']]

In [4]:
users

Unnamed: 0,user_id,age,sex,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213
...,...,...,...,...,...
938,939,26,F,student,33319
939,940,32,M,administrator,02215
940,941,20,M,student,97229
941,942,48,F,librarian,78209


In [5]:
movies

Unnamed: 0,movie_id,title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)
...,...,...
1677,1678,Mat' i syn (1997)
1678,1679,B. Monkey (1998)
1679,1680,Sliding Doors (1998)
1680,1681,You So Crazy (1994)


In [6]:
ratings

Unnamed: 0,user_id,movie_id,rating
0,196,242,3
1,186,302,3
2,22,377,1
3,244,51,2
4,166,346,1
...,...,...,...
99995,880,476,3
99996,716,204,5
99997,276,1090,1
99998,13,225,2


#### - train셋과 test셋 분리하기
**train데이터로 알고리즘**을 제작 후, **test로 정확도를 측정**해야 정확한 검증을 할 수 있다  
따라서 데이터를 train셋과 test셋으로 분리해준다

In [7]:
# train, test 데이터 분리
x = ratings.copy()
y = ratings['user_id']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, stratify=y)

In [8]:
# 정확도(RMSE)를 계산하는 함수 
def RMSE(y_true, y_pred):
    return np.sqrt(np.mean((np.array(y_true) - np.array(y_pred))**2))

# 협업필터링 알고리즘별 RMSE를 계산하는 함수 
def score(model, neighbor_size=0):
    id_pairs = zip(x_test['user_id'], x_test['movie_id'])
    y_pred = np.array([model(user, movie, neighbor_size) for (user, movie) in id_pairs])
    y_true = np.array(x_test['rating'])
    return RMSE(y_true, y_pred)

#### - full matrix 제작과 코사인유사도 구하기
train용으로 분리가 완료된 x_train 데이터를 full matrix로 구현해준다(=rating_matrix)  
rating_matrix를 이용해 코사인 유사도를 구해준다(=user_similarity)

- **rating_matrix (x_train의 full matrix)**

In [9]:
rating_matrix = x_train.pivot(index='user_id', columns='movie_id', values='rating')
rating_matrix

movie_id,1,2,3,4,5,6,7,8,9,10,...,1670,1671,1672,1673,1674,1675,1676,1677,1678,1681
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,,3.0,5.0,4.0,1.0,,3.0,...,,,,,,,,,,
2,4.0,,,,,,,,,2.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,3.0,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,,,,,,,,,5.0,,...,,,,,,,,,,
940,,,,2.0,,,4.0,5.0,,,...,,,,,,,,,,
941,5.0,,,,,,4.0,,,,...,,,,,,,,,,
942,,,,,,,,,,,...,,,,,,,,,,


- **user_similarity (rating_matrix 활용해 구한 코사인유사도)**

In [10]:
matrix_dummy = rating_matrix.copy().fillna(0)
user_similarity = cosine_similarity(matrix_dummy, matrix_dummy)
user_similarity = pd.DataFrame(user_similarity, index=rating_matrix.index, columns=rating_matrix.index)
user_similarity

user_id,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,0.131093,0.041712,0.074419,0.297778,0.277424,0.359269,0.271226,0.105531,0.285891,...,0.278856,0.104049,0.210892,0.152483,0.138520,0.026504,0.214480,0.145569,0.148141,0.313842
2,0.131093,1.000000,0.079939,0.154818,0.073655,0.124635,0.087005,0.099552,0.171636,0.106596,...,0.129594,0.192282,0.274897,0.324624,0.194301,0.171857,0.161779,0.122178,0.152653,0.073519
3,0.041712,0.079939,1.000000,0.161887,0.027538,0.077411,0.043176,0.027565,0.023423,0.034831,...,0.041613,0.000000,0.071516,0.013107,0.092645,0.019544,0.125719,0.023819,0.107224,0.034682
4,0.074419,0.154818,0.161887,1.000000,0.029091,0.064499,0.110985,0.195515,0.077323,0.044370,...,0.030527,0.000000,0.056992,0.124611,0.099556,0.000000,0.113372,0.138391,0.131070,0.062806
5,0.297778,0.073655,0.027538,0.029091,1.000000,0.182064,0.308228,0.203534,0.031042,0.204193,...,0.291675,0.036064,0.067177,0.077818,0.125322,0.000000,0.204814,0.108907,0.178644,0.235068
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,0.026504,0.171857,0.019544,0.000000,0.000000,0.091063,0.061726,0.029630,0.055076,0.043166,...,0.055664,0.257774,0.164268,0.218939,0.286054,1.000000,0.028841,0.080652,0.017289,0.081084
940,0.214480,0.161779,0.125719,0.113372,0.204814,0.208515,0.263774,0.145275,0.070857,0.250945,...,0.217651,0.022946,0.111088,0.120689,0.099943,0.028841,1.000000,0.130052,0.212477,0.208370
941,0.145569,0.122178,0.023819,0.138391,0.108907,0.113732,0.046193,0.121877,0.107398,0.104513,...,0.021200,0.175461,0.237306,0.249401,0.162960,0.080652,0.130052,1.000000,0.105352,0.078091
942,0.148141,0.152653,0.107224,0.131070,0.178644,0.253979,0.256321,0.140499,0.063886,0.198158,...,0.158149,0.051582,0.079911,0.086570,0.064998,0.017289,0.212477,0.105352,1.000000,0.147553


여기까지 만들어진 DataFrame을(rating_matrix, user_similarity 등) 함수 **def cf_knn**에 활용해 예상 평점을 계산한다

### 1.1.2. 함수 def cf_knn
def cf_knn = **유사도가 높은 [Neighbor size]명의 가중평균**을 활용해 예측 평점을 계산하는 함수다

- user_id = 2, movie_id = 2, neighbor_size = 30 이라고 가정한다

In [11]:
user_id = 2
movie_id = 2
neighbor_size = 30

#### - 전달받은 movie_id가 rating_matrix에 존재하는지 확인이 필요하다
rating_matrix는 x_train을 활용해 만든 데이터 셋이다.  
따라서 탐색을 원하는 movie_id에 대한 정보가 들어있지 않을 수 있다  
만약 movie_id가 rating_matrix에 존재하지 않는다면(Null), 평균 점수로 정한 3.0점을 평점으로 부여한다
#### - 전달받은movie_id가 rating_matrix에 존재한다면, user_id에 대한 코사인 유사도와 movie_id에 대한 평점들을 받아온다
user_id와 다른 사용자의 코사인 유사도가 담긴 user_similarity를 추출해 sim_scores에 담아준다  
movie_id에 대해 다른 사용자들이 부여한 평점을 rating_matrix에서 추출해 movie_ratings에 담아준다

#### - movie_id에 대해 평가하지 않은 사용자의 정보는 사용 데이터에서 제외시켜준다
movie_id에 평점을 부여하지 않은 사용자들을  
movie_id의 평점 모음인 movie_ratings와 user_id와 다른 사용자간의 유사도 모음인 sim_scores에서 제외시켜준다

현재는 user_id = 2, movie_id = 2, neighbor_size = 30 이라고 가정하기 때문에  
movie_id에 대해 평점을 부여한 사람들의 정보(평점, 유사도)만을 받아준다 -> 'movie_ratings', 'sim_scores'

In [12]:
if movie_id in rating_matrix:
    sim_scores = user_similarity[user_id].copy()
    movie_ratings = rating_matrix[movie_id].copy()
    none_rating_idx = movie_ratings[movie_ratings.isnull()].index
    movie_ratings = movie_ratings.drop(none_rating_idx)
    sim_scores = sim_scores.drop(none_rating_idx)

- movie_ratings (movie_id에 대한 평점이 Null이 아닌 사람들의 **평점 모음**)

In [13]:
movie_ratings

user_id
1      3.0
5      3.0
13     3.0
22     2.0
30     3.0
      ... 
899    3.0
916    3.0
924    3.0
934    4.0
943    5.0
Name: 2, Length: 100, dtype: float64

- sim_scores (movie_id에 대한 평점이 Null이 아닌 사람들의 **유사도 모음**)

In [14]:
sim_scores

user_id
1      0.131093
5      0.073655
13     0.156633
22     0.043395
30     0.243811
         ...   
899    0.098960
916    0.088363
924    0.208126
934    0.129594
943    0.073519
Name: 2, Length: 100, dtype: float64

#### - Neighbor size(이웃의 크기)가 0인 경우와 1 이상인 경우
여기서 Neighbor size는 후에 'user_id와 유사도가 높은 [Neighbor size]개'에 사용된다 
만약 Neighbor size가 0이라면 유사도가 높은 0개의 유사도를 구하게 되고, 그것은 의미없는 값이다  
  
- 따라서 **Neighbor size가 0**일땐 범위를 따로 정하지 않고,  
사용자 범위를 전체로 삼아 movie_id에 대한 평점(movie_ratings)에 유사도를(sim_scores)를 가중치 삼아 가중평균을 예상 점수로 부여한다
  
  
- 반대로 **Neighbor size가 1이상**이라면,  
sim_scores 관련 if문으로 이동하게 된다
  
  
#### - Neighbor size가 1 이상이면 그 후 sim_scores관련 if문이 등장한다
sim_scores는 평점을 부여한 사람들의 user_id에 대한 유사도 모음
sim_scores가 1이거나 그 이하인 경우는 연산에서 문제가 생긴다고 한다.  
  
- 따라서 **sim_score가 1이거나 그 이하**인 경우는 강제로  
평균 점수로 정한, 3.0점을 부여한다
  
  
- 반대로 **sim_scores가 1이상**인 경우에는  
사용자와 유사도가 높은 [Neighbor size]개를 범위로 잡은 후  
movie_id에 대한 평점(movie_ratings)에 유사도를(sim_scores)를 가중치 삼아 가중평균을 예상 점수로 부여한다

현재는 user_id = 2, movie_id = 2, neighbor_size = 30 이라고 가정하고 있다  
Neighbor size는 0보다 크고, sim_scores 역시 1보다 큰 상태이다  
따라서 **사용자와 유사도가 높은 [Neighbor size]명의 가중평균**을 구해준다

In [15]:
# 전달받은 movie_id에 대해 평가를 한 사람의 모음(sim_scores)의 수가 많은게 정상이지만,
# neighbor_size의 크기가 클 수도 있기 때문에 min 계산 후 작은 값을 선택해주기
neighbor_size = min(neighbor_size, len(sim_scores))

# 유사도의 상위 [Neighbor size]개 추출하기
sim_scores = np.array(sim_scores)
movie_ratings = np.array(movie_ratings)
user_idx = np.argsort(sim_scores)
sim_scores = sim_scores[user_idx][-neighbor_size:]
movie_ratings = movie_ratings[user_idx][-neighbor_size:]

# 선별된 유사도 상위 [Neighbor size]개의 가중 평균 구하기
# 영화 평점에 유사도를 가중치로 구해준다
mean_rating = np.dot(sim_scores, movie_ratings) / sim_scores.sum()
    
mean_rating

3.1218025883378777

코드 원본

In [16]:
def cf_knn(user_id, movie_id, neighbor_size=0):
    if movie_id in rating_matrix:
        sim_scores = user_similarity[user_id].copy()
        movie_ratings = rating_matrix[movie_id].copy()
        none_rating_idx = movie_ratings[movie_ratings.isnull()].index
        movie_ratings = movie_ratings.drop(none_rating_idx)
        sim_scores = sim_scores.drop(none_rating_idx)
        if neighbor_size == 0:
            mean_rating = np.dot(sim_scores, movie_ratings) / sim_scores.sum()
        else:
            if len(sim_scores) > 1:
                neighbor_size = min(neighbor_size, len(sim_scores))
                sim_scores = np.array(sim_scores)
                movie_ratings = np.array(movie_ratings)
                user_idx = np.argsort(sim_scores)
                sim_scores = sim_scores[user_idx][-neighbor_size:]
                movie_ratings = movie_ratings[user_idx][-neighbor_size:]
                mean_rating = np.dot(sim_scores, movie_ratings) / sim_scores.sum()
            else:
                mean_rating = 3.0
    else:
        mean_rating = 3.0
    return mean_rating

## 1.2. 실제 추천 알고리즘에 적용하기

<요약>  
기본 틀이 되는 def cf_knn(이웃 기반 협업알고리즘)을 활용했다.  
원하는 user_id의 평점에 대한 데이터를 불러온다.  
그 후 평점이 이미 부여되어 있는 컬럼에 대해서는 0을 삽입하고, 평점이 부여되어있지 않은 경우에는 def cf_knn 함수를 돌려준다.  
def df_knn으로 인해 예상 평점을 부여받은 영화들 중 원하는 상위 개수(n_item)만큼 추출해준다(=movie_sort)  
해당 movie_sort에서 타이틀만 추출해준다  

<코드 요약>
>- 데이터 읽어오고 정리(RMSE 함수와 score(RMSE 사용)함수)하기
>- def recom_movie(user_id, n_items, neighbor_size)
>    - 전달받은 user_id에 대해 rating_matrix를 for문 돌리기 
>        - user_id가 이미 평가한 항목은 0점 대체해 넣기
>        - user_id가 평가하지 않은 항목이라면, def cf_knn 함수 돌려 예상 평점 구하기
>    - 예상 평점이 구해졌다면 상위 n_items개 만큼 뽑아주기
>    - 상위 n_items의 타이틀을 추출하기
>- 타이틀 출력
  
--------------------------------------------------  
    
<가정>  
  
코드에 대한 이해를 돕기 위해  
user_id = 2, neighbor_size = 30, n_items = 5 라고 가정한다

### 1.2.1. 데이터 읽어오고 정리하기  
def cf_knn과 동일한 과정이다  
이 경우는 알고리즘의 정확도를 측정하는 것이 아닌,  
실제로 추천을 해야하는 경우라고 가정하기 때문에 train셋과 test셋으로 구분하지 않는다.  
이에따라 전체 데이터를 full matrix로 만들어주고, 코사인유사도 또한 전체 데이터를 기준으로 구해준다  
  
  ------------------------  
  
사용해야하는 데이터를 읽어온다  
해당 user에는 user_id에 대한 정보가 들어있고,  
movies에는 movie_id에 대한 정보,
ratings에는 각 사용자들이 영화에 부여한 평점에 대한 정보가 들어있다

In [17]:
# 위 def df_knn의 '데이터 읽어오고 정리하기'와 같은 과정

# 데이터 읽어 오기 
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv('C:/RecoSys/Data/u.user', sep='|', names=u_cols, encoding='latin-1')
i_cols = ['movie_id', 'title', 'release date', 'video release date', 'IMDB URL', 'unknown', 
          'Action', 'Adventure', 'Animation', 'Children\'s', 'Comedy', 'Crime', 'Documentary', 
          'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 
          'Thriller', 'War', 'Western']
movies = pd.read_csv('C:/RecoSys/Data/u.item', sep='|', names=i_cols, encoding='latin-1')
r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv('C:/RecoSys/Data/u.data', sep='\t', names=r_cols, encoding='latin-1')

# timestamp 제거 
ratings = ratings.drop('timestamp', axis=1)
# movie ID와 title 빼고 다른 데이터 제거
movies = movies[['movie_id', 'title']]
rating_matrix = ratings.pivot_table(values='rating', index='user_id', columns='movie_id')
matrix_dummy = rating_matrix.copy().fillna(0)
user_similarity = cosine_similarity(matrix_dummy, matrix_dummy)
user_similarity = pd.DataFrame(user_similarity, index=rating_matrix.index, columns=rating_matrix.index)

- **rating_matrix (전체 유저와 평점 데이터를 full martix로 만들어 준 것)**

In [18]:
rating_matrix

movie_id,1,2,3,4,5,6,7,8,9,10,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,3.0,...,,,,,,,,,,
2,4.0,,,,,,,,,2.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,4.0,3.0,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,,,,,,,,,5.0,,...,,,,,,,,,,
940,,,,2.0,,,4.0,5.0,3.0,,...,,,,,,,,,,
941,5.0,,,,,,4.0,,,,...,,,,,,,,,,
942,,,,,,,,,,,...,,,,,,,,,,


- **user_similarity (전체 유저의 코사인 유사도를 구한 것)**

In [19]:
user_similarity

user_id,1,2,3,4,5,6,7,8,9,10,...,934,935,936,937,938,939,940,941,942,943
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,0.166931,0.047460,0.064358,0.378475,0.430239,0.440367,0.319072,0.078138,0.376544,...,0.369527,0.119482,0.274876,0.189705,0.197326,0.118095,0.314072,0.148617,0.179508,0.398175
2,0.166931,1.000000,0.110591,0.178121,0.072979,0.245843,0.107328,0.103344,0.161048,0.159862,...,0.156986,0.307942,0.358789,0.424046,0.319889,0.228583,0.226790,0.161485,0.172268,0.105798
3,0.047460,0.110591,1.000000,0.344151,0.021245,0.072415,0.066137,0.083060,0.061040,0.065151,...,0.031875,0.042753,0.163829,0.069038,0.124245,0.026271,0.161890,0.101243,0.133416,0.026556
4,0.064358,0.178121,0.344151,1.000000,0.031804,0.068044,0.091230,0.188060,0.101284,0.060859,...,0.052107,0.036784,0.133115,0.193471,0.146058,0.030138,0.196858,0.152041,0.170086,0.058752
5,0.378475,0.072979,0.021245,0.031804,1.000000,0.237286,0.373600,0.248930,0.056847,0.201427,...,0.338794,0.080580,0.094924,0.079779,0.148607,0.071459,0.239955,0.139595,0.152497,0.313941
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,0.118095,0.228583,0.026271,0.030138,0.071459,0.111852,0.107027,0.095898,0.039852,0.071460,...,0.066039,0.431154,0.258021,0.226449,0.432666,1.000000,0.087687,0.180029,0.043264,0.144250
940,0.314072,0.226790,0.161890,0.196858,0.239955,0.352449,0.329925,0.246883,0.120495,0.342961,...,0.327153,0.107024,0.187536,0.181317,0.175158,0.087687,1.000000,0.145152,0.261376,0.241028
941,0.148617,0.161485,0.101243,0.152041,0.139595,0.144446,0.059993,0.146145,0.143245,0.090305,...,0.046952,0.203301,0.288318,0.234211,0.313400,0.180029,0.145152,1.000000,0.101642,0.095120
942,0.179508,0.172268,0.133416,0.170086,0.152497,0.317328,0.282003,0.175322,0.092497,0.212330,...,0.226440,0.073513,0.089588,0.129554,0.099385,0.043264,0.261376,0.101642,1.000000,0.182465


### 1.2.2. 함수 def recom_movie

def cf_knn을 활용한 함수다.  
평점이 부여되지 않은 영화들만을 간추리고,  
상위 [Neighbor size]개를 기준으로 예상 평점(가중 평균)을 구한 후  
상위 [n_items]개 만큼 영화를 추천해준다

- user_id = 2, neighbor_size = 30, n_items = 5 라고 가정한다

In [20]:
def recom_movie(user_id, n_items, neighbor_size):
    # user_id가 평가한 영화 평점 가져오기
    user_movie = rating_matrix.loc[user_id].copy()
    for movie in rating_matrix:
        # user_id가 평점을 부여한 영화는 제외시키기(부여되어 있던 평점을 0점처리 해주기)        
        if pd.notnull(user_movie.loc[movie]):
            user_movie.loc[movie] = 0
        # user_id가 평점을 부여하지 않은 항목은 def cf_knn 돌려주기
        else:
            user_movie.loc[movie] = cf_knn(user_id, movie, neighbor_size)
    # 위에서 구한 평점 중 상위 [n_items]개 만큼을 추천함
    movie_sort = user_movie.sort_values(ascending=False)[:n_items]
    recom_movies = movies.loc[movie_sort.index]
    recommendations = recom_movies['title']
    return recommendations

recom_movie(2, 5, 30)

movie_id
1293                     Ayn Rand: A Sense of Life (1997)
1189                              That Old Feeling (1997)
1500    Prisoner of the Mountains (Kavkazsky Plennik) ...
1467                                     Cure, The (1995)
318                       Everyone Says I Love You (1996)
Name: title, dtype: object

## 1.3. 최적의 K의 개수 구하기

### 1.3.1. 데이터 읽어오고 정리하기

In [21]:
# 위 def df_knn의 '데이터 읽어오고 정리하기'와 같은 과정

# 데이터 읽어 오기 
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
users = pd.read_csv('C:/RecoSys/Data/u.user', sep='|', names=u_cols, encoding='latin-1')
i_cols = ['movie_id', 'title', 'release date', 'video release date', 'IMDB URL', 'unknown', 
          'Action', 'Adventure', 'Animation', 'Children\'s', 'Comedy', 'Crime', 'Documentary', 
          'Drama', 'Fantasy', 'Film-Noir', 'Horror', 'Musical', 'Mystery', 'Romance', 'Sci-Fi', 
          'Thriller', 'War', 'Western']
movies = pd.read_csv('C:/RecoSys/Data/u.item', sep='|', names=i_cols, encoding='latin-1')
r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']
ratings = pd.read_csv('C:/RecoSys/Data/u.data', sep='\t', names=r_cols, encoding='latin-1')

# timestamp 제거 
ratings = ratings.drop('timestamp', axis=1)
# movie ID와 title 빼고 다른 데이터 제거
movies = movies[['movie_id', 'title']]

# train, test 데이터 분리
from sklearn.model_selection import train_test_split
x = ratings.copy()
y = ratings['user_id']
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.25, stratify=y)

# 정확도(RMSE)를 계산하는 함수 
def RMSE(y_true, y_pred):
    return np.sqrt(np.mean((np.array(y_true) - np.array(y_pred))**2))

# 모델별 RMSE를 계산하는 함수 
def score(model, neighbor_size=0):
    id_pairs = zip(x_test['user_id'], x_test['movie_id'])
    y_pred = np.array([model(user, movie, neighbor_size) for (user, movie) in id_pairs])
    y_true = np.array(x_test['rating'])
    return RMSE(y_true, y_pred)

# train 데이터로 Full matrix 구하기 
rating_matrix = x_train.pivot(index='user_id', columns='movie_id', values='rating')

# train set 사용자들의 Cosine similarities 계산
matrix_dummy = rating_matrix.copy().fillna(0)
user_similarity = cosine_similarity(matrix_dummy, matrix_dummy)
user_similarity = pd.DataFrame(user_similarity, index=rating_matrix.index, columns=rating_matrix.index)

### 1.3.2. neighbor_size 조절해 가며 for문 돌리기
활용하는 알고리즘은  
def cf_knn(이웃 기반 협업알고리즘)이다

In [22]:
for neighbor_size in [10, 20, 30, 40, 50, 60]:
    print("Neighbor size = %d : RMSE = %.4f" % (neighbor_size, score(cf_knn, neighbor_size)))

Neighbor size = 10 : RMSE = 1.0295
Neighbor size = 20 : RMSE = 1.0135
Neighbor size = 30 : RMSE = 1.0125
Neighbor size = 40 : RMSE = 1.0125
Neighbor size = 50 : RMSE = 1.0125
Neighbor size = 60 : RMSE = 1.0135
