# 추천 시스템

- 사용자 집단별 추천
    - 성별

## 전처리

In [1]:
import pandas as pd
import numpy as np

In [2]:
## Users: load the user table from users.csv and preview the first two rows.
users = pd.read_csv('users.csv')
users.head(2)

Unnamed: 0,user_id,age,gender,job,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043


In [3]:
## Movie ratings: load the rating table from ratings.csv and preview two rows.
ratings = pd.read_csv('ratings.csv')
ratings.head(2)

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,55,5,875072688
1,1,203,4,878542231


In [4]:
## Drop the timestamp column in place; it is not used after this point.
ratings.drop(columns=['timestamp'], inplace=True)
ratings.head(2)

Unnamed: 0,user_id,movie_id,rating
0,1,55,5
1,1,203,4


In [5]:
## Movie information: load the movie table from movies.csv and preview two rows.
movies = pd.read_csv('movies.csv')
movies.head(2)

Unnamed: 0,movie_id,title,release date,imdb url,action,adventure,animation,children,comedy,crime,...,fantasy,film-noir,horror,musical,mystery,romance,sci-fi,thriller,war,western
0,1,Toy Story (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?GoldenEye%20(...,1,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [6]:
## Trim the columns: keep only movie_id and title.
movies = movies.loc[:, ['movie_id', 'title']]
movies.head(2)

Unnamed: 0,movie_id,title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)


## 데이터 셋 분리
- Train data
    - 모델을 학습하기 위한 데이터 셋
    - 학습은 최적의 파라미터를 찾는 것
    - 학습을 위한 데이터
- Test data
    - 모델의 '최종 성능'을 평가하기 위한 데이터 셋
    - 모델 학습에 관여하지 않음
- Train data로 학습하고, Test data로 최종 성능 평가

### train, test set 분리

In [7]:
## Inputs for the train / test split.
from sklearn.model_selection import train_test_split

y = ratings['user_id']  ## user ids, passed to the split as the target
x = ratings.copy()      ## copy of the full ratings table, passed as the features

In [8]:
## Split the ratings into a training set and a test set.
##   x_train         : data used to build the model
##   x_test          : data held out for checking accuracy
##   y_train, y_test : the corresponding user ids
## No random_state is passed, so the split differs from run to run.
x_train, x_test, y_train, y_test = train_test_split(
    x,               ## feature data
    y,               ## target data
    test_size=0.25,  ## fraction of the data held out as the test set
    stratify=y,      ## split train / test according to the values of y
)

## Gender 기준 추천 모델

In [9]:
## Merge x_train with users so every training rating carries the rater's
## age, gender, job and zip_code.
## The join key is spelled out: without `on`, pandas joins on every column
## name the two tables happen to share, which would silently change the
## result if another common column were ever added.
merged_ratings = pd.merge(x_train, users, on='user_id', how='inner')
merged_ratings[:2]

Unnamed: 0,user_id,movie_id,rating,age,gender,job,zip_code
0,533,451,2,43,M,librarian,2324
1,533,568,5,43,M,librarian,2324


In [10]:
## Mean rating per movie and per gender, computed on the training ratings.
## The result is a Series indexed by (movie_id, gender).
g_mean = merged_ratings.groupby(['movie_id', 'gender'])['rating'].mean()
g_mean

movie_id  gender
1         F         3.842697
          M         3.859848
2         F         3.333333
          M         3.151163
3         F         2.857143
                      ...   
1677      F         3.000000
1678      M         1.000000
1679      M         3.000000
1681      M         3.000000
1682      M         3.000000
Name: rating, Length: 3032, dtype: float64

In [11]:
## Index the user table by user_id (in place) so a user's row can be
## fetched with users.loc[user_id].
users.set_index(keys='user_id', inplace=True)
users.head(2)

Unnamed: 0_level_0,age,gender,job,zip_code
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,24,M,technician,85711
2,53,F,other,94043


In [41]:
## x_train --> full matrix: one row per user_id, one column per movie_id,
## each cell holding that user's rating (NaN where there is no training rating).
rating_matrix = x_train.pivot(
    index='user_id',
    columns='movie_id',
    values='rating',
)
rating_matrix.head(2)

movie_id,1,2,3,4,5,6,7,8,9,10,...,1669,1671,1672,1673,1675,1677,1678,1679,1680,1682
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,,,,,,5.0,3.0,...,,,,,,,,,,
2,4.0,,,,,,,,,2.0,...,,,,,,,,,,


## 정확도 (Accuracy)

- 오차(잔차) : 실제값 - 예측값 (y - y^)
- 오차(잔차)의 합 : 양의 오차와 음의 오차가 서로 상쇄되어 0이 되거나 음수가 될 수 있음 -> 그래서 오차 제곱의 합을 구함
- 평균 제곱 오차(MSE : Mean Squared Error)
- 평균 제곱근 오차(RMSE : Root Mean Squared Error)

### RMSE 정의

In [42]:
## Accuracy metric: root mean squared error (RMSE).
def RMSE(y_true, y_pred):
    """Return the root mean squared error between observed and predicted values.

    Args:
        y_true: Array-like of observed values.
        y_pred: Array-like of predicted values, in the same order as ``y_true``.

    Returns:
        The square root of the mean of the squared element-wise differences.
    """
    errors = np.array(y_true) - np.array(y_pred)
    return np.sqrt(np.mean(errors ** 2))

### 정확도 계산

In [43]:
## Compute the RMSE of a model, i.e. the accuracy of that model's predictions.
def score(model):
    """Return the RMSE of ``model`` on the held-out ratings in ``x_test``.

    Args:
        model: Callable ``model(user_id, movie_id)`` that returns the
            predicted rating for that user / movie pair.

    Returns:
        The RMSE between the ratings in ``x_test`` and the predictions.
    """
    id_pairs = zip(x_test['user_id'], x_test['movie_id'])

    ## Use the model that was passed in, so any prediction function with the
    ## same (user_id, movie_id) signature can be scored.
    y_pred = np.array([model(user, movie) for (user, movie) in id_pairs])
    print(y_pred)

    y_true = np.array(x_test['rating'])

    return RMSE(y_true, y_pred)

In [44]:
## Gender-based recommendation model: predicts a rating from the mean rating
## that users of the same gender gave to the movie.
def cf_gender(user_id, movie_id):
    """Return the predicted rating of ``movie_id`` for ``user_id``.

    The prediction is the ``g_mean`` entry for the movie and the user's
    gender. It falls back to 3.0 when the movie is not a column of
    ``rating_matrix`` or when ``g_mean`` has no entry for that gender.

    Args:
        user_id: Label of the user's row in ``users``.
        movie_id: Id of the movie to predict a rating for.

    Returns:
        The gender-specific mean rating, or 3.0 as the fallback.
    """
    fallback_rating = 3.0

    ## Movie has no column in rating_matrix: nothing to base a prediction on.
    if movie_id not in rating_matrix:
        return fallback_rating

    gender = users.loc[user_id].gender
    means_for_movie = g_mean[movie_id]

    ## No mean stored for this gender on this movie.
    if gender not in means_for_movie:
        return fallback_rating

    return means_for_movie[gender]

In [45]:
## Evaluate the gender-based model: prints the predictions and returns the RMSE on x_test.
score(cf_gender)

[4.22727273 2.67924528 3.7        ... 4.18518519 3.01086957 2.83333333]


1.029090380110449