<a href="https://colab.research.google.com/github/stebechoi/CP2/blob/Hwkdir/5.%20(MovieLense100K)_SVD.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# MovieLens 데이터를 이용한 영화 추천 시스템 (CF - SVD)

## SVD

협업 필터링(Collaborative Filtering)
- 메모리 기반 memory based
  - 유저 기반 user based
  - 아이템 기반 item based
- 모델 기반 machine learning
  - 잠재요인 기반 latent factor based
    - **SVD (Singular Value Decomposition)**
    - MF (Matrix Factorization)
    - AutoEncoder (Latent Feature)
    - SVM (Support Vector Machine)
  - 기타 분류/회귀 기반
  - 딥러닝 기반
    - NCF (Neural Collaborative Filtering)
      - DCN (Deep Cross Network)
      - Wide & Deep
    - DeepFM (Deep Factorization Machine)

[분류 참고](https://velog.io/@dlskawns/%EC%B6%94%EC%B2%9C%EC%8B%9C%EC%8A%A4%ED%85%9C-%EC%9D%B4%EB%A1%A0-%EC%BB%A8%ED%85%90%EC%B8%A0-%EA%B8%B0%EB%B0%98%EC%B6%94%EC%B2%9CCB-%ED%98%91%EC%97%85%ED%95%84%ED%84%B0%EB%A7%81CF)

### 설치 및 데이터 로드

In [None]:
# Imports needed by this setup cell.
import os
from google.colab import drive

# Mount Google Drive so files stored there are reachable from the Colab runtime.
drive.mount('/content/gdrive')

# Change the working directory to the project folder on Drive.
os.chdir('/content/gdrive/My Drive/Colab Notebooks/CP/CP2')

In [None]:
# 추천 시스템 개발을 위한 라이브러리 surprise 설치
!pip install scikit-surprise

In [None]:
# Show the current working directory.
os.getcwd()

In [None]:
import pandas as pd
import numpy as np

In [None]:
# Load the ratings table from a CSV in the current working directory and preview the first rows.
df = pd.read_csv('movielense_rt100.csv')
df.head()

Unnamed: 0,user_id,movie_id,movie_title,rating,TotalRatingCount,release_year,release_month,rating_year,rating_month,age_group,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,308,1,Toy Story (1995),4,452,1995,1,1998,2,6,...,0,0,0,0,0,0,0,0,0,0
1,308,4,Get Shorty (1995),5,209,1995,1,1998,2,6,...,0,0,0,0,0,0,0,0,0,0
2,308,7,Twelve Monkeys (1995),4,392,1995,1,1998,2,6,...,0,0,0,0,0,0,1,0,0,0
3,308,8,Babe (1995),5,219,1995,1,1998,2,6,...,0,0,0,0,0,0,0,0,0,0
4,308,9,Dead Man Walking (1995),4,299,1995,1,1998,2,6,...,0,0,0,0,0,0,0,0,0,0


### 라이브러리 이용

In [None]:
from surprise import Reader, Dataset, SVD
from surprise.model_selection import cross_validate

# NOTE(review): `model` is not referenced by any later cell visible in this notebook;
# the seeded `svd = SVD(random_state=42)` instance is the one that gets evaluated and fitted.
model = SVD()

In [None]:
# Surprise's Reader class is used to convert the data into the form Surprise requires.
# Check the observed (min, max) ratings against Reader's default rating_scale of (1, 5).
df['rating'].min(), df['rating'].max()

(1, 5)

In [None]:
# (1, 5) is Reader's default rating_scale; it is spelled out here for readability.
reader = Reader(rating_scale=(1, 5))

# load_from_df requires exactly three columns (user id, item id, rating);
# passing any other number of columns raises an error.
rating_triples = df[['user_id', 'movie_id', 'rating']]
data = Dataset.load_from_df(rating_triples, reader=reader)

In [None]:
# Inspect the object returned by Dataset.load_from_df.
data

<surprise.dataset.DatasetAutoFolds at 0x7f4d6c16bfa0>

In [None]:
# SVD model with a fixed random_state (42) so repeated runs start from the same seed.
svd = SVD(random_state=42)

#### 평가

In [None]:
# Evaluation: 5-fold cross-validation reporting RMSE and MAE for each fold.
cross_validate(
    svd,
    data,
    measures=['rmse', 'mae'],
    cv=5,  # number of cross-validation folds
    verbose=True,
)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9133  0.9210  0.9162  0.9091  0.9123  0.9144  0.0040  
MAE (testset)     0.7175  0.7277  0.7162  0.7126  0.7162  0.7181  0.0051  
Fit time          0.60    0.61    0.60    0.62    0.61    0.61    0.01    
Test time         0.08    0.06    0.06    0.24    0.10    0.11    0.07    


{'test_rmse': array([0.9132622 , 0.92104128, 0.91623311, 0.90906676, 0.91228373]),
 'test_mae': array([0.71753021, 0.72772219, 0.71615652, 0.71260476, 0.71624628]),
 'fit_time': (0.5997698307037354,
  0.6050868034362793,
  0.6041910648345947,
  0.618941068649292,
  0.6100926399230957),
 'test_time': (0.0800929069519043,
  0.0649423599243164,
  0.06336259841918945,
  0.23633241653442383,
  0.09553670883178711)}

#### 예측

In [None]:
# Refit svd on a trainset built from the whole dataset before making predictions.
trainset = data.build_full_trainset() # use every available rating
svd.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x7f4d6bfec310>

In [None]:
# Example: the (user_id, movie_id, rating) rows that belong to user_id 1.
df.loc[df['user_id'] == 1, ['user_id', 'movie_id', 'rating']]

Unnamed: 0,user_id,movie_id,rating
12688,1,1,5
12689,1,2,3
12690,1,4,3
12691,1,7,4
12692,1,8,1
...,...,...,...
12857,1,268,5
12858,1,269,5
12859,1,270,5
12860,1,271,2


In [None]:
# Ratings submitted by user_id 1.
a = df.loc[df['user_id'] == 1, ['user_id', 'movie_id', 'rating']]
# An empty result means user 1 has no rating for movie_id 6 (i.e. has not watched it).
a.loc[a['movie_id'] == 6]

Unnamed: 0,user_id,movie_id,rating


In [None]:
# Prediction
svd.predict(1,6) # estimate the rating user_id==1 would give movie_id==6; est≈3.77 in the recorded output

Prediction(uid=1, iid=6, r_ui=None, est=3.7671015751164965, details={'was_impossible': False})

In [None]:
# Comparison
svd.predict(1, 272, 3) # r_ui=3 is supplied as user_id==1's actual rating for movie_id==272; the model's estimate is est≈4.23

Prediction(uid=1, iid=272, r_ui=3, est=4.231383014701638, details={'was_impossible': False})

In [None]:
# r_ui=5 is the rating user 1 gave movie 268 in the table shown earlier.
svd.predict(1, 268, 5) # uid = user id, iid = item id

Prediction(uid=1, iid=268, r_ui=5, est=4.511971879079819, details={'was_impossible': False})

In [None]:
# Pull only the estimated rating out of the Prediction namedtuple (field `est`, index 3).
svd.predict(1, 268, 5).est

4.511971879079819

### 추천

In [None]:
# Narrow the frame to the id, title and rating columns used in the recommendation section.
df_lu = df[['user_id','movie_id', 'movie_title', 'rating']]
df_lu

In [None]:
# Largest movie_id present in the data.
df_lu['movie_id'].max()

1047

In [None]:
# Lookup table used to match the movie_id of each recommended movie to its title:
# one row per movie_id, ordered by movie_id, with a fresh 0..n-1 index.
movie_lst = (
    df_lu[['movie_id', 'movie_title']]
    .drop_duplicates(subset=['movie_id'])
    .sort_values('movie_id')
    .reset_index(drop=True)
)
movie_lst

Unnamed: 0,movie_id,movie_title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)
2,4,Get Shorty (1995)
3,7,Twelve Monkeys (1995)
4,8,Babe (1995)
...,...,...
337,928,"Craft, The (1996)"
338,1012,Private Parts (1997)
339,1016,Con Air (1997)
340,1028,Grumpier Old Men (1995)


In [None]:
# recommender(user_id, num_of_recommendation)
def recommender(user_id, num, min_rating=3):
  """Print the titles of the best-scoring movies a user has not rated yet.

  Candidates are every movie_id in ``df_lu`` for which ``user_id`` has no
  row. Each candidate is scored once with ``svd.predict``; candidates whose
  estimate is strictly greater than ``min_rating`` are kept, sorted by
  estimate in descending order, and the first ``num`` titles are printed.

  Uses the notebook-level objects ``df_lu`` (ratings table), ``svd``
  (Surprise model) and ``movie_lst`` (movie_id / movie_title table).

  Args:
    user_id: User id as it appears in ``df_lu['user_id']``.
    num: Maximum number of titles to print.
    min_rating: Exclusive lower bound on the predicted rating. Defaults to
      3, the threshold that used to be hard-coded.

  Returns:
    None. Output goes to stdout, one title per line, separated by
    '++++++++++' lines.
  """
  all_movies = set(df_lu['movie_id'])  # avoid shadowing the builtin `all`
  seen = set(df_lu.loc[df_lu['user_id'] == user_id, 'movie_id'])
  unseen = all_movies - seen  # movies the user already rated are excluded

  # Score every candidate exactly once and reuse the estimate for both the
  # threshold test and the ranking.
  scored = []
  for m in unseen:
    est = svd.predict(user_id, m).est
    if est > min_rating:  # strictly greater than the threshold
      scored.append((m, est))

  est_rt = pd.DataFrame(scored, columns=['movie_id', 'est_rating'])
  est_rt = est_rt.sort_values('est_rating', ascending=False)  # highest estimate first
  top_ids = est_rt['movie_id'].head(num)

  # Build the id -> title lookup once; if an id is repeated the first title wins.
  titles = movie_lst.drop_duplicates(subset=['movie_id']).set_index('movie_id')['movie_title']

  print('Movie Recommendation:')
  print('++++++++++')
  for x in top_ids:
    print(titles.loc[x])
    print('++++++++++')

In [None]:
# Example: print up to 6 recommendations for user_id 47.
recommender(47, 6)

Movie Recommendation:
++++++++++
Close Shave, A (1995)
++++++++++
Usual Suspects, The (1995)
++++++++++
Wrong Trousers, The (1993)
++++++++++
Good Will Hunting (1997)
++++++++++
Rear Window (1954)
++++++++++
Schindler's List (1993)
++++++++++
