# 추천 시스템
- 사용자 집단별 추천
    - 성별

## 전처리

In [30]:
import pandas as pd
import numpy as np

In [2]:
# Users: load the user table and preview the first two rows
users = pd.read_csv('users.csv')
users.head(2)

Unnamed: 0,user_id,age,gender,job,zip_code
0,1,24,M,technician,85711
1,2,53,F,other,94043


In [3]:
# Movie ratings: load the rating table and preview the first two rows
ratings = pd.read_csv('ratings.csv')
ratings.head(2)

Unnamed: 0,user_id,movie_id,rating,timestamp
0,1,55,5,875072688
1,1,203,4,878542231


In [4]:
# Drop the timestamp column.
# Rebinding the result (instead of inplace=True) together with
# errors='ignore' keeps this cell safe to re-run: a second execution no
# longer raises KeyError because the column is already gone.
ratings = ratings.drop(columns='timestamp', errors='ignore')
ratings[:2]

Unnamed: 0,user_id,movie_id,rating
0,1,55,5
1,1,203,4


In [5]:
# Movie information: load the movie table and preview the first two rows
movies = pd.read_csv('movies.csv')
movies.head(2)

Unnamed: 0,movie_id,title,release date,imdb url,action,adventure,animation,children,comedy,crime,...,fantasy,film-noir,horror,musical,mystery,romance,sci-fi,thriller,war,western
0,1,Toy Story (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?Toy%20Story%2...,0,0,1,1,1,0,...,0,0,0,0,0,0,0,0,0,0
1,2,GoldenEye (1995),01-Jan-1995,http://us.imdb.com/M/title-exact?GoldenEye%20(...,1,1,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0


In [6]:
# Trim the columns: only movie_id and title are used from here on
movies = movies.loc[:, ['movie_id', 'title']]
movies.head(2)

Unnamed: 0,movie_id,title
0,1,Toy Story (1995)
1,2,GoldenEye (1995)


## 데이터셋 분리
- Train data 
    - 모델을 학습시키는 데 사용하는 데이터셋
    - 학습 : 데이터에 가장 잘 맞는 최적의 파라미터를 찾는 과정
- Test data
    - 모델의 '최종 성능을 평가하기 위한 데이터셋'
    - 모델 학습에 관여하지 않음
- Train data로 학습하고, Test data로 최종 성능 평가

In [None]:
# pip install scikit-learn

In [8]:
# train, test set 분리
from sklearn.model_selection import train_test_split

# x is an independent copy of the ratings frame; y is its user_id column
x, y = ratings.copy(), ratings['user_id']

In [23]:
# x_train: rows used for training, x_test: rows held out for evaluation
# y_train, y_test: the matching user ids
# Optional keyword arguments, left disabled in this cell:
#   random_state=1  -> fixes the random seed so the split is the same every run
#   shuffle=True    -> shuffles the data once more before splitting
x_train, x_test, y_train, y_test = train_test_split(
    x,               # feature data
    y,               # target data
    test_size=0.25,  # test data size
    stratify=y,      # split with respect to the target (user id)
)

In [10]:
x.shape

(100000, 3)

In [11]:
# 전체 data 75%
x_train.shape

(75000, 3)

In [12]:
# 전체 data 25%
x_test.shape

(25000, 3)

In [13]:
y_train.shape

(75000,)

In [14]:
y_test.shape

(25000,)

In [15]:
# random_state = 1
# First run
x_train

Unnamed: 0,user_id,movie_id,rating
45399,406,433,3
58708,524,525,3
5749,57,871,3
10029,94,690,4
90738,860,26,3
...,...,...,...
78117,716,472,3
2421,14,507,4
28827,283,210,5
32374,305,582,4


In [16]:
# random_state = 1
# Second run
x_train

Unnamed: 0,user_id,movie_id,rating
45399,406,433,3
58708,524,525,3
5749,57,871,3
10029,94,690,4
90738,860,26,3
...,...,...,...
78117,716,472,3
2421,14,507,4
28827,283,210,5
32374,305,582,4


In [18]:
# random_state not set
# First run
x_train

Unnamed: 0,user_id,movie_id,rating
12749,121,427,4
57578,506,216,4
91257,864,49,3
2024,13,668,1
47092,417,436,3
...,...,...,...
11318,103,181,4
94513,889,39,2
80047,745,177,3
47529,423,355,3


In [20]:
# random_state not set
# Second run
# NOTE(review): the output recorded for this cell is identical to the first
# unseeded run; a re-split without random_state would normally differ, so
# confirm the split cell was actually re-executed before this display.
x_train

Unnamed: 0,user_id,movie_id,rating
12749,121,427,4
57578,506,216,4
91257,864,49,3
2024,13,668,1
47092,417,436,3
...,...,...,...
11318,103,181,4
94513,889,39,2
80047,745,177,3
47529,423,355,3


In [21]:
# stratify not set
# Number of ratings per user in the full ratings frame
ratings.groupby('user_id').size()

user_id
1      272
2       62
3       54
4       24
5      175
      ... 
939     49
940    107
941     22
942     79
943    168
Length: 943, dtype: int64

In [22]:
# stratify not set
# Number of ratings per user in the training split
x_train.groupby('user_id').size()

user_id
1      210
2       47
3       31
4       20
5      133
      ... 
939     32
940     81
941     18
942     55
943    120
Length: 943, dtype: int64

In [24]:
# stratify=y
# Number of ratings per user in the full ratings frame
ratings.groupby('user_id').size()

user_id
1      272
2       62
3       54
4       24
5      175
      ... 
939     49
940    107
941     22
942     79
943    168
Length: 943, dtype: int64

In [25]:
# stratify=y
# Number of ratings per user in the training split
x_train.groupby('user_id').size()

user_id
1      204
2       46
3       41
4       18
5      131
      ... 
939     37
940     80
941     17
942     59
943    126
Length: 943, dtype: int64

## 정확도 (Accuracy)
- 10분 동안 줄넘기 횟수 

이름 | 홍길동 | 박보검 | 이미자
-- | -- | -- | --
예측 | 50 | 35 | 40
실제 | 60 | 20 | 45

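- 오차(잔차) : 실제값 - 예측값 ($y - \hat{y}$)
    - 오차 : 10, -15, 5
- 오차(잔차) 합 : 양수와 음수가 서로 상쇄되어 0이 되거나 음수가 될 수 있음 -> 따라서 오차를 제곱한 뒤 합을 구함
- 평균 제곱 오차 (Mean Squared Error : MSE) : $\mathrm{MSE} = \frac{1}{n}\sum_{i=1}^{n}(y_i - \hat{y}_i)^2$
- 평균 제곱근 오차 (Root Mean Squared Error : RMSE) : $\mathrm{RMSE} = \sqrt{\mathrm{MSE}}$ => 0에 가까울수록 좋음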

In [28]:
# MSE: average of the squared errors from the jump-rope example
errors = [10, -15, 5]
mse = sum(e ** 2 for e in errors) / len(errors)
mse

116.66666666666667

In [29]:
# RMSE: the square root of mse
import math
math.sqrt(mse)

10.801234497346433

### RMSE 정의

In [32]:
# Accuracy metric: root-mean-square error
def RMSE(y_true, y_pred):
    """Return the root-mean-square error between actual and predicted values.

    Both arguments are converted to NumPy arrays, subtracted element-wise,
    squared, averaged, and the square root of that average is returned.
    """
    residuals = np.array(y_true) - np.array(y_pred)
    mean_squared = np.mean(residuals ** 2)
    return np.sqrt(mean_squared)

### 모델별 예측치의 정확도 계산

In [41]:
# Compute the RMSE of a model's predictions on the test split
def score(model):
    """Return the RMSE of ``model`` over the ratings in ``x_test``.

    ``model`` is a callable taking ``(user_id, movie_id)`` and returning a
    predicted rating. It is called once for every row of ``x_test`` and the
    predictions are compared against the ``rating`` column with ``RMSE``.
    The predictions and their shape are printed for inspection.
    """
    id_pairs = zip(x_test['user_id'], x_test['movie_id'])
    # Evaluate the model that was passed in, not a hard-coded one.
    y_pred = np.array([model(user, movie) for (user, movie) in id_pairs])  # predicted values
    print('y_pred shape >>', y_pred.shape)
    print('y_pred >>', y_pred)

    y_true = np.array(x_test['rating'])  # actual values

    return RMSE(y_true, y_pred)

In [40]:
x_test[:2]

Unnamed: 0,user_id,movie_id,rating
76683,705,2,3
7876,76,121,2


In [31]:
# zip pairs the items of both lists position by position; each z is a tuple
for z in zip([1, 3, 5], ['a', 'b', 'c']):
    print(z)

(1, 'a')
(3, 'b')
(5, 'c')


## 모델

In [34]:
# Mean rating of each movie, computed from the training split
train_mean = x_train.groupby('movie_id')['rating'].mean()

In [36]:
train_mean[:3]

movie_id
1    3.886905
2    3.196078
3    3.119403
Name: rating, dtype: float64

### best-seller 모델 : 예측

In [42]:
# Baseline model: use each movie's mean rating as the prediction
def best_seller(user_id, movie_id):
    """Return the mean rating of ``movie_id`` looked up in ``train_mean``.

    ``user_id`` is accepted so the function matches the ``(user, movie)``
    model signature, but it is not used. When ``movie_id`` is not present in
    ``train_mean`` a message is printed and 3.0 is returned instead.
    """
    try:
        rating = train_mean[movie_id]
    except KeyError:
        # Only a missing movie is handled here; any other exception
        # (including KeyboardInterrupt during a long scoring loop) propagates.
        print('movie_id 없음 >>', movie_id)
        rating = 3.0
    return rating

## best-seller 정확도 측정

In [43]:
score(best_seller)

movie_id 없음 >> 1682
movie_id 없음 >> 1595
movie_id 없음 >> 1611
movie_id 없음 >> 1505
movie_id 없음 >> 857
movie_id 없음 >> 1571
movie_id 없음 >> 1611
movie_id 없음 >> 1651
movie_id 없음 >> 1669
movie_id 없음 >> 1494
movie_id 없음 >> 1566
movie_id 없음 >> 1625
movie_id 없음 >> 1572
movie_id 없음 >> 1554
movie_id 없음 >> 1638
movie_id 없음 >> 1236
movie_id 없음 >> 1636
movie_id 없음 >> 1630
movie_id 없음 >> 1493
movie_id 없음 >> 1575
movie_id 없음 >> 1590
movie_id 없음 >> 711
movie_id 없음 >> 1563
movie_id 없음 >> 1554
movie_id 없음 >> 1343
movie_id 없음 >> 1320
movie_id 없음 >> 1306
movie_id 없음 >> 1601
movie_id 없음 >> 1596
movie_id 없음 >> 1565
movie_id 없음 >> 1624
movie_id 없음 >> 1656
movie_id 없음 >> 784
movie_id 없음 >> 1306
movie_id 없음 >> 1661
movie_id 없음 >> 1576
movie_id 없음 >> 1520
movie_id 없음 >> 1616
movie_id 없음 >> 1582
movie_id 없음 >> 1590
movie_id 없음 >> 1564
movie_id 없음 >> 1665
movie_id 없음 >> 784
movie_id 없음 >> 1637
movie_id 없음 >> 1453
movie_id 없음 >> 1510
movie_id 없음 >> 1649
movie_id 없음 >> 1587
movie_id 없음 >> 1604
movie_id 없음 >> 1567
movi

1.0251895018421908

## Gender 기준 추천 모델

In [None]:
## 영화별 성별별 평점 평균 계산


In [47]:
# Toy example: merging two frames keeps only the ids present in both
one = pd.DataFrame(
    {'id': [1, 2, 3], 'name': ['홍길동', '이미자', '박보검']}
)
two = pd.DataFrame(
    {'id': [1, 2, 5], 'hire_date': [2020, 2010, 2002]}
)
one.merge(two, how='inner', on='id')

Unnamed: 0,id,name,hire_date
0,1,홍길동,2020
1,2,이미자,2010


In [49]:
# Merge the training ratings with the user table.
# The join key and join type are spelled out so the result does not depend
# on which column names the two frames happen to have in common.
merge_ratings = pd.merge(x_train, users, on='user_id', how='inner')
merge_ratings

Unnamed: 0,user_id,movie_id,rating,age,gender,job,zip_code
0,463,1067,2,48,F,healthcare,75218
1,463,275,5,48,F,healthcare,75218
2,463,16,4,48,F,healthcare,75218
3,463,274,3,48,F,healthcare,75218
4,463,1014,2,48,F,healthcare,75218
...,...,...,...,...,...,...,...
74995,444,269,4,51,F,lawyer,53202
74996,444,328,5,51,F,lawyer,53202
74997,444,748,1,51,F,lawyer,53202
74998,444,906,4,51,F,lawyer,53202


In [56]:
# Mean rating for every (movie_id, gender) pair
g_mean = merge_ratings.groupby(['movie_id', 'gender'])['rating'].mean()
g_mean

movie_id  gender
1         F         3.831325
          M         3.905138
2         F         3.200000
          M         3.195402
3         F         3.000000
                      ...   
1677      F         3.000000
1678      M         1.000000
1679      M         3.000000
1680      M         2.000000
1681      M         3.000000
Name: rating, Length: 3025, dtype: float64

In [57]:
x_train.shape

(75000, 3)

In [58]:
users.shape

(943, 5)

In [59]:
# Index the user table by user_id so a row can be fetched with users.loc[user_id].
# The guard keeps the cell safe to re-run: once user_id is the index it is no
# longer a column, and calling set_index again would raise KeyError.
if users.index.name != 'user_id':
    users = users.set_index('user_id')
users[:2]

Unnamed: 0_level_0,age,gender,job,zip_code
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,24,M,technician,85711
2,53,F,other,94043


In [61]:
x_train[:2]

Unnamed: 0,user_id,movie_id,rating
52561,463,1067,2
4591,43,241,4


In [64]:
# x_train -> full matrix
# rows: user_id, columns: movie_id, cells: rating
by_user_and_movie = x_train.set_index(['user_id', 'movie_id'])['rating']
rating_matrix = by_user_and_movie.unstack('movie_id')
rating_matrix

movie_id,1,2,3,4,5,6,7,8,9,10,...,1672,1673,1674,1675,1676,1677,1678,1679,1680,1681
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,3.0,4.0,,,5.0,4.0,,5.0,,...,,,,,,,,,,
2,4.0,,,,,,,,,2.0,...,,,,,,,,,,
3,,,,,,,,,,,...,,,,,,,,,,
4,,,,,,,,,,,...,,,,,,,,,,
5,,3.0,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
939,,,,,,,,,5.0,,...,,,,,,,,,,
940,,,,,,,4.0,5.0,3.0,,...,,,,,,,,,,
941,5.0,,,,,,4.0,,,,...,,,,,,,,,,
942,,,,,,,,,,,...,,,,,,,,,,


## 정확도 계산

In [63]:
# Compute the RMSE of a model's predictions on the test split
def score(model):
    """Return the RMSE of ``model`` over the ratings in ``x_test``.

    ``model`` is called as ``model(user_id, movie_id)`` for every row of
    ``x_test``; the collected predictions are printed and then compared with
    the ``rating`` column through ``RMSE``.
    """
    predictions = []
    for user, movie in zip(x_test['user_id'], x_test['movie_id']):
        predictions.append(model(user, movie))
    y_pred = np.array(predictions)
    print(y_pred)

    actual = np.array(x_test['rating'])
    return RMSE(actual, y_pred)

In [65]:
# Gender-based recommendation model
# Returns the per-gender mean rating as the prediction
def cf_gender(user_id, movie_id):
    """Predict a rating from the mean rating given by the user's gender group.

    If ``movie_id`` is not a column of ``rating_matrix`` the prediction is
    3.0. Otherwise the user's gender is read from ``users`` and the matching
    entry of ``g_mean`` for that movie is returned, or 3.0 when that gender
    has no entry for the movie.
    """
    fallback = 3.0
    if movie_id not in rating_matrix:
        return fallback

    user_gender = users.loc[user_id]['gender']
    if user_gender not in g_mean[movie_id]:
        return fallback
    return g_mean[movie_id][user_gender]

In [66]:
score(cf_gender)

[3.2        3.42083333 3.61206897 ... 3.93121693 3.82926829 3.44444444]


1.0343673193226834