# Best seller 방식 추천

In [1]:
# 사용자별 영화 평점 파일을 DataFrame으로 읽기
import os
import pandas as pd

base_src = 'drive/MyDrive/Source/Data'
rating_src = os.path.join(base_src,'Rating.csv')
u_cols = ['user_id','work_id','rating']
ratings = pd.read_csv(rating_src,
                    sep=',', # 구분자
                    names=u_cols,
                    encoding='latin-1')
ratings = ratings.set_index('user_id') 
ratings = ratings.drop('user_id',axis=0) # index 중복되어 삭제
ratings.head() # 상위 5개 행 출력

Unnamed: 0_level_0,work_id,rating
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,469,3
1,272,4
1,299,5
1,337,2
2,349,3


In [7]:
# 영화 정보 파일을 DataFrame으로 읽기
base_src = 'drive/MyDrive/Source/Data'
movie_src = os.path.join(base_src,'Movie.csv')
i_cols = ['work_id','title_kr',
          'sf','action','adult','adventure','animation','comedy', 'criminal','documentary','drama','family ','fantasy',
          'horror','music','musical','mystery','performance','romance','thriller','variety','war','western',
        ]
movies = pd.read_csv(movie_src,
                    sep='|',
                    names=i_cols,
                    encoding='utf-8')
movies = movies.set_index('work_id')
movies = movies.drop('work_id',axis=0) # index 중복되어 삭제
movies.head()

Unnamed: 0_level_0,title_kr,sf,action,adult,adventure,animation,comedy,criminal,documentary,drama,...,horror,music,musical,mystery,performance,romance,thriller,variety,war,western
work_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,택배기사,True,True,False,False,False,False,False,False,True,...,False,False,False,True,False,False,True,False,False,False
2,"어쩌다 마주친, 그대",False,False,False,False,False,False,False,False,True,...,False,False,False,False,False,True,False,False,False,False
3,【최애의 아이】,False,False,False,False,True,False,False,False,False,...,False,True,False,False,False,False,True,False,False,False
4,구미호뎐 1938,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,True,False,False,False,False
5,애프터썬,False,False,False,False,False,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False


In [10]:
# 인기 제품 방식 추천 function
def recom_movie(n_items): # 몇개의 아이템을 추천할지
  movie_mean = ratings.groupby(['work_id'])['rating'].mean() # rating을 movie_id 기준으로 그룹화 후 평균
  movie_sort = movie_mean.sort_values(ascending=False)[:n_items] # 정렬 후 몇개를 뽑아낼 건지
  recom_movies = movies.loc[movie_sort.index] # 인덱스를 기준으로 movie 데이터프레임에서 조회하기
  recommendations = recom_movies['title_kr'] # 타이틀만 뽑아주기
  return recommendations
print(type(ratings))
recom_movie(5)

<class 'pandas.core.frame.DataFrame'>


work_id
478                명량
531        퀸카로 살아남는 법
602          프렌치 디스패치
290    스타워즈: 비전스 볼륨 2
417    더 울프 오브 월 스트리트
Name: title_kr, dtype: object

In [11]:
# best-seller 방식 추천
# 100K의 영화 평점에 대해서 실제값과 best-seller 방식으로 구한 예측값의 RMSE를 계산
import numpy as np

def RMSE(y_true, y_pred):
  return np.sqrt(np.mean((np.array(y_true, dtype=float) - np.array(y_pred))**2))

# 정확도 계산
rmse = []
movie_mean = ratings.groupby(['work_id'])['rating'].mean()

for user in set(ratings.index):
  y_true = ratings.loc[user]['rating'] # 모든 영화의 평점
  #best-seller 방식으로
  y_pred = movie_mean[ratings.loc[user]['work_id']]
  accuracy = RMSE(y_true,y_pred)
  rmse.append(accuracy)

# RMSE 계산
print(np.mean(rmse))

15.49480247062167


# 협업 필터링 (CF 알고리즘)

## ✨ 접근
어떤 아이템에 대해 비슷한 취향을 가진 사람들은 다른 아이템 또한 비슷한 취향을 가질 것이다.

## ✨ 원리
취향이 비슷한 사람들의 집단이 존재한다고 가정

## ✨ 방법
#### 모든 사용자 간 평가의 유사도 계산
#### -> 추천 대상과 다른 사용자간 유사도 추출
#### -> 추천 대상이 평가하지 않은 아이템에 대한 예상 평가값 계산 
(평가값 = 다른 사용자의 평가 * 다른 사용자 유사도)
#### -> 아이템 중에서 예상 평가값 가장 높은 N개 추천

In [12]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

# 사용자별 영화 평점 파일을 DataFrame으로 읽기
import os
import pandas as pd

base_src = 'drive/MyDrive/Source/Data'
rating_src = os.path.join(base_src,'Rating.csv')
u_cols = ['user_id','work_id','rating']
ratings = pd.read_csv(rating_src,
                    sep=',', # 구분자
                    names=u_cols,
                    encoding='latin-1')
ratings = ratings.set_index('user_id') 
ratings.reset_index(inplace=True)
ratings.rename(columns={'index': 'index_column'}, inplace=True)
ratings = ratings.iloc[1:]
ratings.head() # 상위 5개 행 출력




Unnamed: 0,user_id,work_id,rating
1,1,469,3
2,1,272,4
3,1,299,5
4,1,337,2
5,2,349,3


In [14]:
# 영화 정보 파일을 DataFrame으로 읽기
base_src = 'drive/MyDrive/Source/Data'
movie_src = os.path.join(base_src,'Movie.csv')
i_cols = ['work_id','title_kr',
          'sf','action','adult','adventure','animation','comedy', 'criminal','documentary','drama','family ','fantasy',
          'horror','music','musical','mystery','performance','romance','thriller','variety','war','western',
        ]
movies = pd.read_csv(movie_src,
                    sep='|',
                    names=i_cols,
                    encoding='utf-8')
movies = movies.set_index('work_id')
movies = movies.drop('work_id',axis=0) # index 중복되어 삭제
movies.head() # 상위 5개 행 출력

Unnamed: 0_level_0,title_kr,sf,action,adult,adventure,animation,comedy,criminal,documentary,drama,...,horror,music,musical,mystery,performance,romance,thriller,variety,war,western
work_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,택배기사,True,True,False,False,False,False,False,False,True,...,False,False,False,True,False,False,True,False,False,False
2,"어쩌다 마주친, 그대",False,False,False,False,False,False,False,False,True,...,False,False,False,False,False,True,False,False,False,False
3,【최애의 아이】,False,False,False,False,True,False,False,False,False,...,False,True,False,False,False,False,True,False,False,False
4,구미호뎐 1938,False,False,False,False,False,False,False,False,False,...,False,False,False,False,False,True,False,False,False,False
5,애프터썬,False,False,False,False,False,False,False,False,True,...,False,False,False,False,False,False,False,False,False,False


In [15]:
# 정확도(RMSE)를 계산하는 함수
def RMSE(y_true,y_pred):
  return np.sqrt(np.mean((np.array(y_true, dtype=float) - np.array(y_pred))**2))

# 모델명 RMSE를 계산하는 함수
def score(model):
  # 테스트 데이터의 user_id와 work_id 간 pairs를 맞춰 튜플형원소 리스트 데이터 만들기
  id_pairs = zip(x_test['user_id'],x_test['work_id'])
  # 모든 사용자-영화 짝에 대하여 주어진 예측모델에 의해 예측값 계산 및 리스트형 데이터 생성
  y_pred = np.array([model(user,movie) for (user,movie) in id_pairs])
  # 실제 평점값
  y_true = np.array(x_test['rating'])
  return RMSE(y_true,y_pred)

# 유사도 지표 (Correalation with User)

### 상관계수
- 가장 이해하기 쉬운 유사도
- -1~1 사이의 값

### ✅ 코사인 유사도
- 협업 필터링에서 널리 쓰이는 유사도
- 각 아이템 -> 하나의 차원, 사용자의 평가값 -> 좌표값
  - ex) x축: 아이템1, y축: 아이템2, 좌표: 평가값
- 두 사용자의 평가값 유사할수록 -> theta 작아지고, 코사인 커짐
- -1~1 사이의 값
- 데이터가 이진값일 때는 -> **타니모토 계수** 사용 권장

In [16]:
### 데이터셋 만들기 ###
x = ratings.copy() # 원본 훼손 방지
y = ratings['user_id']

x_train,x_test,y_train,y_test = train_test_split(x,y,
                                                 test_size=0.25, # 테스트 데이터 25%
                                                 stratify=y)

ratings_matrix = x_train.pivot(index='user_id', #피벗 테이블 제작
                              columns='work_id',
                              values='rating')

### 코사인 유사도 계산 ###
from sklearn.metrics.pairwise import cosine_similarity
matrix_dummy = ratings_matrix.copy().fillna(0)
user_similarity = cosine_similarity(matrix_dummy, matrix_dummy) # 코사인 유사도 구하기
user_similarity = pd.DataFrame(user_similarity, 
                               index=ratings_matrix.index,
                               columns = ratings_matrix.index)

user_similarity

user_id,1,10,11,12,13,14,15,16,17,18,19,2,20,3,4,5,6,7,8,9
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.141421,0.254164,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
17,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.205798,0.0,0.0,0.0,0.0,0.0,0.0
18,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [17]:
### 주어진 영화(movie_id)의 가중평균 rating을 계산하는 함수 ###
def CF_simple(user_id,work_id):
  if int(work_id) in ratings_matrix.columns: # 해당 work_id가 있다면
    sim_scores = user_similarity[user_id].copy() # 주어진 사용자와 다른 사용자의 유사도 추출
    movie_ratings = ratings_matrix[work_id].copy() # 주어진 영화와 다른 영화의 유사도 추출
    # 영화 평가를 하지 않은 사용자 가중 평균에서 제외
    none_rating_idx = movie_ratings[movie_ratings.isnull()].index
    movie_ratings = movie_ratings.dropna() # 제외
    sim_scores = sim_scores.drop(none_rating_idx) # 제외
    mean_rating = np.dot(sim_scores, movie_ratings)/sim_scores.sum()
  else: # work_id가 포함되지 않았다면 기본값
    mean_rating = 3.0
  return mean_rating

### 정확도 계산 ###
score(CF_simple)

1.0488088481701516

평가 경향이 비슷한(유사도가 높은) 사람에게 가중치를 더 주어서 평균을 계산하면 더 정확한 예측이 가능하다.

# 이웃을 고려한 CF

In [32]:
# 유사집단의 크기를 미리 정하기 위해서 기존 score함수에 neighbors_size 인자값 추가
def knn_score(model, neighbors_size=0):
  # 테스트 데이터의 user_id와 work_id 간 pairs를 맞춰 튜플형원소 리스트 데이터 만들기
  id_pairs = zip(x_test['user_id'],x_test['work_id'])
  # 모든 사용자-영화 짝에 대하여 주어진 예측모델에 의해 예측값 계산 및 리스트형 데이터 생성
  y_pred = np.array([model(user,work,neighbors_size) for (user,work) in id_pairs])
  # 실제 평점값
  y_true = np.array(x_test['rating'])
  return RMSE(y_true,y_pred)

In [33]:
######### neightbor size를 정해서 예측치를 계산하는 함수 #########
def CF_knn(user_id,work_id,neighbors_size=0):
  if int(work_id) in ratings_matrix.columns: # 해당 영화가 존재하면
    # 주어진 사용자(user_id)와 다른 사용자의 유사도 추출
    sim_scores = user_similarity[user_id].copy()
    # 주어진 영화(movie_id)와 다른 사용자의 유사도 추출
    movie_ratings = ratings_matrix[work_id].copy()
    # 주어진 영화에 대해서 평가하지 않은 사용자를 가중 평균 계산에서 제외하기 위해 인덱스 추출
    none_rating_idx = movie_ratings[movie_ratings.isnull()].index
    # 주어진 영화를 평가하지 않은 사용자와의 유사도 제거
    movie_ratings = movie_ratings.dropna()
    # 주어진 영화를 평가하지 않은 사용자와의 유사도 제거
    sim_scores = sim_scores.drop(none_rating_idx)

    ### neighbors_size가 지정되지 않은 경우 ###
    if neighbors_size == 0: # neighbor_size가 0이면 기존의 simple CF와 같음
      mean_rating = np.dot(sim_scores,movie_ratings) / sim_scores.sum()

    ### neighbors_size가 지정된 경우 ###
    else:
      if len(sim_scores) > 1:
        # neighbor_size와 sim_score 중에 작은 걸 택해야 분리 가능
        neighbors_size = min(neighbors_size,len(sim_scores)) 
        sim_scores = np.array(sim_scores)
        movie_ratings = np.array(movie_ratings)
        user_idx = np.argsort(sim_scores) #sim_scores 오름차순 인덱스
        sim_scores = sim_scores[user_idx][-neighbors_size:] # 정렬된 것을 뒤부터 뽑아냄
        movie_ratings = movie_ratings[user_idx][-neighbors_size:]
        mean_rating = np.dot(sim_scores,movie_ratings) / sim_scores.sum()
      else:
        mean_rating=3.0

  # train/test set의 분할에 따라 ratings_matrix에 해당 영화 없으면 기본값
  else:
    mean_rating=3.0
  return mean_rating

# 정확도 계산
knn_score(CF_knn,neighbors_size=30)


1.0488088481701516

In [34]:
### 실제 주어진 사용자에 대해 추천을 받는 기능 구현 ###
# train data말고 full matrix(전체 데이터셋)로 추천하기 
ratings_matrix = ratings.pivot_table(values='rating',
                              index='user_id',
                              columns='work_id')

### train set의 모든 가능한 사용자 pair의 코사인 유사도 계산 ###
matrix_dummy = ratings_matrix.copy().fillna(0)
user_similarity = cosine_similarity(matrix_dummy, matrix_dummy)
user_similarity = pd.DataFrame(user_similarity, 
                               index=ratings_matrix.index,
                               columns = ratings_matrix.index)

def recom_movie_by_CF_knn(user_id,n_items,neighbors_size=30):
  user_movie = ratings_matrix.loc[user_id].copy()

  for movie in ratings_matrix.columns:
    if pd.notnull(user_movie.loc[movie]): # 사용자가 해당 영화를 봤으면
      user_movie.loc[movie] = 0 # 추천 리스트에서 제외
    else:
      user_movie.loc[movie] = CF_knn(user_id,movie,neighbors_size)

  movie_sort = user_movie.sort_values(ascending=False)[:n_items] # 내림차순
  recom_movies = movies.loc[movie_sort.index] # 인덱스 반환
  recommendations = recom_movies['title_kr']
  return recommendations


In [35]:
recom_movie_by_CF_knn(user_id='15',n_items=5,neighbors_size=30) # 테스팅


work_id
246         멋진 세계
413            박쥐
459    바람계곡의 나우시카
456           아키라
453           무뢰한
Name: title_kr, dtype: object

## 최적의 이웃 크기 결정
집단의 크기를 너무 크게 하면 best-seller 방식과 다르지 않다
집단의 크기를 너무 작게 하면 신뢰성이 떨어진다

-> `overfitting`

In [29]:
####### neighbors_size가 10,20,30,40,50,60인 경우에 대해 RMSE 계산 #######
for neighbors_size in [10,20,30,40,50,60]:
  print('Neighbor size = %d : RMSE = %.4f'%(neighbors_size,knn_score(CF_knn, neighbors_size)))

Neighbor size = 10 : RMSE = 1.0488
Neighbor size = 20 : RMSE = 1.0488
Neighbor size = 30 : RMSE = 1.0488
Neighbor size = 40 : RMSE = 1.0488
Neighbor size = 50 : RMSE = 1.0488
Neighbor size = 60 : RMSE = 1.0488


# 사용자의 평가경향을 고려한 CF

1. 각 사용자의 평점평균 계산
2. 평점 -> 각 사용자의 평균에서의 차이로 변환
`(평점 - 해당 사용자의 평점 평균)`
3. 평점 편차의 예측값 계산
`(평가값 = 평점편차 * 다른 사용자 유사도)`
4. 실제 예측값  = 평점편차 예측값 + 평점평균

In [45]:
### 사용자 평가 경향 고려
ratings_mean = ratings_matrix.mean(axis=1)
ratings_bias = (ratings_matrix.T - ratings_mean).T ## .T : 전치 연산(행과 열 바꿔줌)

###
rating_binary_1 = np.array(ratings_matrix>0).astype(float) # 평점이 존재하는 것들만 float로 형변환
rating_binary_2 = rating_binary_1.T

counts = np.dot(rating_binary_1,rating_binary_2)
counts = pd.DataFrame(counts,
                      index=ratings_matrix.index,
                      columns=ratings_matrix.index).fillna(0)

counts

user_id,1,10,11,12,13,14,15,16,17,18,19,2,20,3,4,5,6,7,8,9
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
11,0.0,0.0,4.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
12,0.0,0.0,0.0,4.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
13,0.0,0.0,1.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
14,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
15,0.0,0.0,0.0,1.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
16,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
17,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0
18,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [46]:
### 신뢰도 가중을 고려한 함수 ###
def CF_knn_bias_sig(user_id,movie_id,neighbors_size=0):
  if movie_id in ratings_bias.columns:
    sim_scores = user_similarity[user_id].copy()
    movie_ratings = ratings_bias[movie_id].copy()

    no_rating = movie_ratings.isnull() # null값인 것을 True로 표시
    common_counts = counts[user_id]
    low_significance = common_counts < SIG_LEVEL # 정해둔 신뢰도보다 작은 사람만 True로 표시
    none_rating_idx = movie_ratings[no_rating | low_significance].index
  
    movie_ratings = movie_ratings.drop(none_rating_idx)
    sim_scores = sim_scores.drop(none_rating_idx)

    if neighbors_size==0:
      prediction = np.dot(sim_scores,movie_ratings) / sim_scores.sum()
      # 평점편차 예측값 + 평점평균
      prediction = prediction + ratings_mean[user_id]
    
    else:
      if len(sim_scores) > MIN_RATINGS: # 현재 영화를 평가한 사람의 수가 MIN_RATINGS보다 클 때만
        neighbors_size = min(neighbors_size,len(sim_scores))
        sim_scores = np.array(sim_scores)
        movie_ratings = np.array(movie_ratings)
        user_idx = np.argsort(sim_scores)
        sim_scores = sim_scores[user_idx][-neighbors_size:]
        movie_ratings = movie_ratings[user_idx][-neighbors_size:]
        prediction = np.dot(sim_scores,movie_ratings) / sim_scores.sum()
        prediction = prediction + ratings_mean[user_id]
      else:
        # 원래 내던 평점 평균으로
        prediction = ratings_mean[user_id]
  else:
    prediction = ratings_mean[user_id]

  # 정확도 올리기
  if prediction <=1:
    prediction = 1
  elif prediction >=5:
    prediction = 5

  return prediction

SIG_LEVEL = 3
MIN_RATINGS = 3

knn_score(CF_knn_bias_sig,30)

0.9762812094883317

# 아이템 기반 CF (IBCF)
- 계산이 빠름
- 업데이트에 대한 결과 영향이 적다

-> 데이터 크기가 크고 충분한 정보가 없는 경우에 적합하다

In [36]:
####### 아이템 기반 CF #######
from sklearn.metrics.pairwise import cosine_similarity
ratings_matrix_t = np.transpose(ratings_matrix)
matrix_dummy = ratings_matrix_t.copy().fillna(0)
item_similarity = cosine_similarity(matrix_dummy, matrix_dummy)
item_similarity = pd.DataFrame(item_similarity,
                               index=ratings_matrix_t.index,
                               columns=ratings_matrix_t.index)

def CF_IBCF(user_id,movie_id):
  if movie_id in item_similarity.columns:
    sim_scores = item_similarity[movie_id]
    user_ratings = ratings_matrix_t[user_id]
    none_rating_idx = user_ratings[user_ratings.isnull()].index
    user_ratings = user_ratings.dropna()
    sim_scores = sim_scores.drop(none_rating_idx)
    mean_ratings = np.dot(sim_scores,user_ratings) / sim_scores.sum()
  else:
    mean_ratings = 3.0
  return mean_ratings

score(CF_IBCF)

0.9722416631711565

In [38]:
def recom_movie_by_CF_IBCF(user_id,n_items,neighbors_size=30):
  user_movie = ratings_matrix.loc[user_id].copy()

  for movie in ratings_matrix.columns:
    if pd.notnull(user_movie.loc[movie]): # 사용자가 해당 영화를 봤으면
      user_movie.loc[movie] = 0 # 추천 리스트에서 제외
    else:
      user_movie.loc[movie] = CF_IBCF(user_id,movie)

  movie_sort = user_movie.sort_values(ascending=False)[:n_items] # 내림차순
  recom_movies = movies.loc[movie_sort.index] # 인덱스 반환
  recommendations = recom_movies['title_kr']
  return recommendations

recom_movie_by_CF_IBCF(user_id='15',n_items=5) # 테스팅

  mean_ratings = np.dot(sim_scores,user_ratings) / sim_scores.sum()


work_id
276             진짜가 나타났다!
527               문라이즈 킹덤
536               주술회전 1기
290        스타워즈: 비전스 볼륨 2
335    베벌리힐스의 진짜 주부들 시즌 8
Name: title_kr, dtype: object

가장 정확도가 높음