## 사용자 기반 협업 필터링(User-based Collaborative Filtering)

In [51]:
# conda activate base
# conda install -c conda-forge scikit-surprise
import numpy as np
import pandas as pd

In [52]:
from surprise import Dataset
# Load the built-in MovieLens 100k ratings and view them as a DataFrame.
# NOTE(review): prompt=False presumably suppresses the download confirmation
# prompt - confirm against the Surprise documentation.
ratings_columns = ['user-id', 'movie-id', 'rating', 'timestamp']
data = Dataset.load_builtin('ml-100k', prompt=False)
df = pd.DataFrame(data.raw_ratings, columns=ratings_columns)
df.head()

Unnamed: 0,user-id,movie-id,rating,timestamp
0,196,242,3.0,881250949
1,186,302,3.0,891717742
2,22,377,1.0,878887116
3,244,51,2.0,880606923
4,166,346,1.0,886397596


In [53]:
# Show the DataFrame dimensions as (rows, columns).
df.shape

(100000, 4)

In [54]:
# Print each column's dtype and non-null count.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype  
---  ------     --------------   -----  
 0   user-id    100000 non-null  object 
 1   movie-id   100000 non-null  object 
 2   rating     100000 non-null  float64
 3   timestamp  100000 non-null  object 
dtypes: float64(1), object(3)
memory usage: 3.1+ MB


### 1. Adjacency Matrix 생성 (인접 행렬 생성)
- 행은 사용자
- 열은 영화
- 값은 시청 여부(0/1) 또는 평점

In [55]:
# Convert the raw rating tuples into a single integer ndarray. Following the
# column names used for the DataFrame, each row is
# (user-id, movie-id, rating, timestamp).
raw_data = np.array(data.raw_ratings, dtype=int)
raw_data

array([[      196,       242,         3, 881250949],
       [      186,       302,         3, 891717742],
       [       22,       377,         1, 878887116],
       ...,
       [      276,      1090,         1, 874795795],
       [       13,       225,         2, 882399156],
       [       12,       203,         3, 879959583]])

In [56]:
# Column-wise minimum and maximum, to check the value range of every field.
raw_data.min(axis=0), raw_data.max(axis=0)

(array([        1,         1,         1, 874724710]),
 array([      943,      1682,         5, 893286638]))

In [57]:
# Make user-id and movie-id zero-based (their original minimum is 1) so they
# can be used directly as row/column indices.
raw_data[:, :2] = raw_data[:, :2] - 1
raw_data[:5]

array([[      195,       241,         3, 881250949],
       [      185,       301,         3, 891717742],
       [       21,       376,         1, 878887116],
       [      243,        50,         2, 880606923],
       [      165,       345,         1, 886397596]])

In [58]:
# Matrix dimensions. The ids are used directly as zero-based indices, so the
# required size is (largest id + 1). Counting unique ids would under-size the
# matrix whenever the id range has gaps, and indexing
# adj_matrix[user_id, movie_id] would then raise IndexError.
nrows = int(raw_data[:, 0].max()) + 1  # number of users
ncols = int(raw_data[:, 1].max()) + 1  # number of movies

#### 1) 영화 시청 유무

In [59]:
# Binary watch matrix: 1 where the user has a rating for the movie
# (treated as "watched"), 0 otherwise.
adj_matrix = np.zeros((nrows, ncols), dtype=int)
watched_users, watched_movies = raw_data[:, 0], raw_data[:, 1]
adj_matrix[watched_users, watched_movies] = 1
adj_matrix[:5]

array([[1, 1, 1, ..., 0, 0, 0],
       [1, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [1, 1, 0, ..., 0, 0, 0]])

In [60]:
# Use user 0 as the reference user for the comparisons.
my_id = 0
my_vector = adj_matrix[my_id]

In [61]:
# For 0/1 vectors the dot product counts the movies both users watched, so it
# works as a similarity score (larger means more similar).
# Compare the reference user with user 10 and with user 20.
my_vector @ adj_matrix[10], my_vector @ adj_matrix[20]

(71, 42)

In [62]:
# Find the user whose watch vector overlaps most with the reference user.
# Every user is visited and the reference user is skipped explicitly via
# my_id; starting the range at 1 was only correct while my_id == 0.
best_score, best_match_id = 0, 0

for other_id, other_vector in enumerate(adj_matrix):
    if other_id == my_id:
        continue
    dot = np.dot(my_vector, other_vector)
    if dot > best_score:  # strict '>' keeps the earliest user on ties
        best_score, best_match_id = dot, other_id

best_score, best_match_id

(183, 275)

In [63]:
# Show the same slice (movie indices 100-109) of both vectors side by side.
best_vector = adj_matrix[best_match_id]
sample = slice(100, 110)
my_vector[sample], best_vector[sample]

(array([1, 1, 1, 1, 1, 1, 1, 1, 1, 1]), array([1, 0, 0, 1, 0, 0, 0, 1, 1, 0]))

In [64]:
# Recommend movies the reference user has not watched but the best-matching
# user has.
recommend_list = [
    movie_idx
    for movie_idx, (mine, theirs) in enumerate(zip(my_vector, best_vector))
    if mine == 0 and theirs == 1
]

len(recommend_list), recommend_list[:10]

(335, [272, 273, 275, 280, 281, 283, 287, 288, 289, 290])

#### 2) 평점 점수를 주는 경우

In [65]:
# Rating matrix: each cell holds the user's rating for the movie; cells with
# no rating stay 0.
adj_matrix = np.zeros((nrows, ncols), dtype=int)
rated_users, rated_movies, rated_scores = raw_data[:, 0], raw_data[:, 1], raw_data[:, 2]
adj_matrix[rated_users, rated_movies] = rated_scores
adj_matrix[:5]

array([[5, 3, 4, ..., 0, 0, 0],
       [4, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [4, 3, 0, ..., 0, 0, 0]])

- Case 1 : 유클리드 거리

In [66]:
# Find the user with the smallest Euclidean distance to the reference user.
# The search starts from +inf instead of a magic upper bound (10000), and the
# reference user is skipped via my_id rather than by starting the range at 1.
# NOTE(review): unrated movies are stored as 0, so they contribute to the
# distance as if they were a rating of 0.
best_score, best_match_id = np.inf, 0
my_vector = adj_matrix[my_id]

for other_id, other_vector in enumerate(adj_matrix):
    if other_id == my_id:
        continue
    euc = np.linalg.norm(my_vector - other_vector)
    if euc < best_score:  # strict '<' keeps the earliest user on ties
        best_score, best_match_id = euc, other_id

best_score, best_match_id

(55.06359959174482, 737)

In [68]:
# Recommend movies the reference user has not rated but the best-matching
# user gave the top rating (5).
best_vector = adj_matrix[best_match_id]
recommend_list = [
    movie_idx
    for movie_idx, (mine, theirs) in enumerate(zip(my_vector, best_vector))
    if mine == 0 and theirs == 5
]

len(recommend_list), recommend_list[:10]

(6, [312, 317, 384, 407, 526, 602])

- Case 2: 코사인 유사도

In [70]:
def cos_similarity(v1, v2):
    """Return the cosine similarity between two 1-D vectors.

    Args:
        v1: First vector (array-like of numbers).
        v2: Second vector, same length as ``v1``.

    Returns:
        ``dot(v1, v2) / (||v1|| * ||v2||)``. When either vector has zero
        norm the similarity is undefined; ``0.0`` is returned instead of
        dividing by zero (which would yield ``nan`` and a RuntimeWarning).
    """
    v1_norm = np.sqrt(np.sum(np.square(v1)))
    v2_norm = np.sqrt(np.sum(np.square(v2)))
    denominator = v1_norm * v2_norm
    if denominator == 0:
        # A user with no ratings has no direction to compare against.
        return 0.0
    return np.dot(v1, v2) / denominator

In [71]:
# Find the user with the highest cosine similarity to the reference user.
# The search starts at -1, the lowest possible cosine similarity. The
# reference user is skipped explicitly via my_id; starting the range at 1 was
# only correct while my_id == 0.
best_score, best_match_id = -1, 0
my_vector = adj_matrix[my_id]

for other_id, other_vector in enumerate(adj_matrix):
    if other_id == my_id:
        continue
    cos_sim = cos_similarity(my_vector, other_vector)
    if cos_sim > best_score:  # strict '>' keeps the earliest user on ties
        best_score, best_match_id = cos_sim, other_id

best_score, best_match_id

(0.569065731527988, 915)