In [1]:
# 주피터노트북 목차 만들기
# 참고: https://mingchin.tistory.com/139

# pip install jupyter_contrib_nbextensions
# ! jupyter contrib nbextension install --user

# 목차
> # 1. 아이템 기반 협업필터링
>> ## 1.1. 데이터 읽어오고 정리하기
>> ## 1.2. 함수 CF_IBCF

# 1. 아이템 기반 협업필터링
UBCF로 불리는 사용자 기반 협업필터링은  
취향이 비슷한 이웃 사용자를 알아내고, 그 그룹에 속한 사용자들이 공통적으로 좋은 평가를 한 아이템을 추천해주는 방식이다.  
  
이와 반대로 아이템 기반 협업필터링(=IBCF)은  
사용자들의 평가 패턴을 바탕으로 아이템 간 유사도를 구해 특정 아이템을 추천하는 방식이다.  
즉, user_id가 평가한 다른 아이템의 평점에 movie_id와의 유사도를 가중치로 삼아 예측 평점을 구하는 방식이다.  
  
책에서 설명한 방식은  
기존에 사용하던 사용자 기반 협업필터링에서  
사용자-아이템 full matrix를 전치시킨 뒤 동일한 알고리즘을 그대로 적용하는 방식이다.

## 1.1. 데이터 읽어오고 정리하기

In [2]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity

In [3]:
# Load the user, movie and rating tables from their delimited text files.
# The files carry no header row, so the column names are supplied explicitly.
u_cols = ['user_id', 'age', 'sex', 'occupation', 'zip_code']
i_cols = [
    'movie_id', 'title', 'release date', 'video release date', 'IMDB URL',
    'unknown', 'Action', 'Adventure', 'Animation', 'Children\'s', 'Comedy',
    'Crime', 'Documentary', 'Drama', 'Fantasy', 'Film-Noir', 'Horror',
    'Musical', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller', 'War', 'Western',
]
r_cols = ['user_id', 'movie_id', 'rating', 'timestamp']

users = pd.read_csv(
    'C:/RecoSys/Data/u.user', sep='|', names=u_cols, encoding='latin-1'
)
movies = pd.read_csv(
    'C:/RecoSys/Data/u.item', sep='|', names=i_cols, encoding='latin-1'
)
ratings = pd.read_csv(
    'C:/RecoSys/Data/u.data', sep='\t', names=r_cols, encoding='latin-1'
)

# Keep only the columns used below: ratings without the timestamp,
# and movies reduced to id and title.
ratings = ratings.drop(columns='timestamp')
movies = movies[['movie_id', 'title']]

In [4]:
users

Unnamed: 0,user_id,age,sex,occupation,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043
2,3,23,M,writer,32067
3,4,24,M,technician,43537
4,5,33,F,other,15213
...,...,...,...,...,...
938,939,26,F,student,33319
939,940,32,M,administrator,02215
940,941,20,M,student,97229
941,942,48,F,librarian,78209


In [5]:
movies

Unnamed: 0,movie_id,title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,3,Four Rooms (1995)
3,4,Get Shorty (1995)
4,5,Copycat (1995)
...,...,...
1677,1678,Mat' i syn (1997)
1678,1679,B. Monkey (1998)
1679,1680,Sliding Doors (1998)
1680,1681,You So Crazy (1994)


In [6]:
ratings

Unnamed: 0,user_id,movie_id,rating
0,196,242,3
1,186,302,3
2,22,377,1
3,244,51,2
4,166,346,1
...,...,...,...
99995,880,476,3
99996,716,204,5
99997,276,1090,1
99998,13,225,2


#### - train셋과 test셋 분리하기
**train데이터로 알고리즘**을 제작 후, **test로 정확도를 측정**해야 정확한 검증을 할 수 있다  
따라서 데이터를 train셋과 test셋으로 분리해준다

In [7]:
# Split the ratings into train and test sets, holding out 25% for testing.
# The split is stratified on user_id (passed as `stratify`).
x = ratings.copy()
y = ratings['user_id']
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    stratify=y,
    test_size=0.25,
)

In [8]:
def RMSE(y_true, y_pred):
    """Return the root-mean-square error between two rating sequences.

    Both arguments are converted to NumPy arrays, subtracted element-wise,
    and the square root of the mean squared difference is returned.
    """
    residuals = np.array(y_true) - np.array(y_pred)
    mean_squared_error = np.mean(residuals ** 2)
    return np.sqrt(mean_squared_error)

def score(model, neighbor_size=0):
    """Return the RMSE of `model` over the held-out `x_test` ratings.

    `model` is called as ``model(user_id, movie_id, neighbor_size)`` once for
    every (user_id, movie_id) pair in `x_test`; the resulting predictions are
    compared with the `rating` column of `x_test`.
    """
    predictions = [
        model(uid, mid, neighbor_size)
        for uid, mid in zip(x_test['user_id'], x_test['movie_id'])
    ]
    actual = np.array(x_test['rating'])
    return RMSE(actual, np.array(predictions))

#### - full matrix 제작
train용으로 분리가 완료된 x_train 데이터를 full matrix로 구현해준다(=rating_matrix)  

In [9]:
# Reshape the training ratings into a full matrix: one row per user_id,
# one column per movie_id, cell value = rating (NaN where no rating exists).
rating_matrix = x_train.pivot(
    values='rating',
    index='user_id',
    columns='movie_id',
)
rating_matrix

movie_id,1,2,3,4,5,6,7,8,9,10,...,1670,1671,1672,1674,1675,1678,1679,1680,1681,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,3.0,,5.0,,1.0,5.0,3.0,...,,,,,,,,,,
2,,,,,,,,,,2.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,3.0,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,,,,,,,,,5.0,,...,,,,,,,,,,
940,,,,,,,4.0,5.0,3.0,,...,,,,,,,,,,
941,5.0,,,,,,4.0,,,,...,,,,,,,,,,
942,,,,,,,,,,,...,,,,,,,,,,


#### - 아이템 기반 유사도 구하기
full matrix로 제작된 rating_matrix를 전치 시켜준다.  
그 후 코사인유사도 계산을 해주어 각 아이템 별 유사도를 구해준다

In [10]:
# Transpose the user x movie matrix so that each row is a movie.
rating_matrix_t = rating_matrix.T

# Missing ratings are NaN; replace them with 0 before computing similarities.
matrix_dummy = rating_matrix_t.fillna(0)

# Pairwise cosine similarity between movie rows, labelled by movie_id on
# both axes.
item_similarity = pd.DataFrame(
    cosine_similarity(matrix_dummy, matrix_dummy),
    index=rating_matrix_t.index,
    columns=rating_matrix_t.index,
)
item_similarity

movie_id,1,2,3,4,5,6,7,8,9,10,...,1670,1671,1672,1674,1675,1678,1679,1680,1681,1682
movie_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,1.000000,0.364852,0.271193,0.366629,0.204571,0.107443,0.490546,0.371756,0.395881,0.189358,...,0.0,0.0,0.052659,0.0,0.000000,0.0,0.0,0.0,0.052659,0.052659
2,0.364852,1.000000,0.239312,0.407180,0.258409,0.058275,0.287533,0.259534,0.209195,0.150960,...,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.000000,0.087519
3,0.271193,0.239312,1.000000,0.234620,0.186796,0.113371,0.281393,0.157647,0.231571,0.146259,...,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.000000,0.110357
4,0.366629,0.407180,0.234620,1.000000,0.275205,0.100976,0.396508,0.408698,0.321200,0.227332,...,0.0,0.0,0.065528,0.0,0.109213,0.0,0.0,0.0,0.065528,0.087370
5,0.204571,0.258409,0.186796,0.275205,1.000000,0.017432,0.246567,0.191381,0.213224,0.033486,...,0.0,0.0,0.000000,0.0,0.000000,0.0,0.0,0.0,0.000000,0.114541
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1678,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.000000,0.0,0.000000,1.0,1.0,1.0,0.000000,0.000000
1679,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.000000,0.0,0.000000,1.0,1.0,1.0,0.000000,0.000000
1680,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.0,0.000000,0.0,0.000000,1.0,1.0,1.0,0.000000,0.000000
1681,0.052659,0.000000,0.000000,0.065528,0.000000,0.000000,0.058843,0.095660,0.066091,0.000000,...,0.0,0.0,1.000000,0.0,0.000000,0.0,0.0,0.0,1.000000,0.000000


## 1.2. 함수 CF_IBCF
기존의 사용자 기반 협업 필터링과 과정은 동일하다.  
다만 기반이 되는 df는 아이템 중심으로 구성된 rating_matrix_t(rating_matrix를 전치한 것)와 item_similarity다.

user_id = 2, movie_id = 2 라고 가정한다

In [11]:
# Example (user, movie) pair used to walk through the prediction step by step.
user_id, movie_id = 2, 2

코드의 진행은 아래와 같다.

1. 인자로 받은 movie_id가 item_similarity에 존재하는지 확인한다.  
train set으로 만든 df이기 때문에 원하는 movie_id가 존재하지 않을 수도 있다.  
존재하지 않는 경우에는 기본 평점 값으로 정한 3.0점을 예측 평점으로 사용한다.
  
    
2. movie_id와 다른 영화의 유사도를 가져온다 (=sim_scores)  
  
    
3. user_id가 평가한 모든 영화에 대한 평점을 가져온다. (=user_rating)  
  
    
4. user_rating 중 값이 없는 항목은 제거해주고, 동일한 아이템을 sim_scores에서도 제거해준다  
  
    
5. 살아남은 user_rating(각 항목의 평점값)에 sim_scores(아이템별 유사도)를 가중치 삼아 가중평균을 구하고 그것을 예상 평점으로 정한다

In [12]:
# Predict the rating of `movie_id` by `user_id` as the similarity-weighted
# average of the ratings that user gave to other movies.
# Falls back to the default rating 3.0 when no prediction can be computed:
#   - the movie or the user does not appear in the training matrices, or
#   - the similarities of the movies the user rated sum to 0 (dividing by
#     that sum would otherwise yield NaN).
if movie_id in item_similarity and user_id in rating_matrix_t:
    # Similarity of the target movie to every other movie.
    sim_scores = item_similarity[movie_id]
    # All ratings given by this user (NaN for movies not rated).
    user_rating = rating_matrix_t[user_id]
    # Drop the movies the user has not rated from both series so that the
    # remaining ratings and weights line up.
    non_rating_idx = user_rating[user_rating.isnull()].index
    user_rating = user_rating.dropna()
    sim_scores = sim_scores.drop(non_rating_idx)
    weight_sum = sim_scores.sum()
    if weight_sum > 0:
        mean_rating = np.dot(sim_scores, user_rating) / weight_sum
    else:
        # No rated movie has any similarity to the target movie.
        mean_rating = 3.0
else:
    mean_rating = 3.0

mean_rating

3.7801056022069566