# 회귀 기반 추천 시스템

In [11]:
!pip install catboost



In [12]:
import pandas as pd

In [13]:
# Load the travel survey table, then print its (rows, columns) shape followed
# by a plain-text preview of the first five rows.
travel_df = pd.read_csv('./data/travel.csv')
print(travel_df.shape, travel_df.head(), sep='\n')

(34572, 15)
  GENDER  AGE_GRP  TRAVEL_STYL_1  TRAVEL_STYL_2  TRAVEL_STYL_3  TRAVEL_STYL_4  \
0      남     30.0            1.0            4.0            2.0            2.0   
1      남     20.0            4.0            1.0            5.0            1.0   
2      여     50.0            4.0            1.0            2.0            4.0   
3      남     30.0            1.0            1.0            1.0            5.0   
4      여     20.0            5.0            3.0            3.0            3.0   

   TRAVEL_STYL_5  TRAVEL_STYL_6  TRAVEL_STYL_7  TRAVEL_STYL_8  \
0            6.0            2.0            2.0            7.0   
1            1.0            4.0            1.0            6.0   
2            3.0            3.0            2.0            3.0   
3            6.0            3.0            5.0            7.0   
4            3.0            3.0            3.0            5.0   

   TRAVEL_MOTIVE_1  TRAVEL_COMPANIONS_NUM VISIT_AREA_NM MVMN_NM  DGSTFN  
0              3.0                  

In [14]:
# Integer-coded survey answers that the CSV reader parsed as floats
# (the preview shows values such as 30.0 and 4.0); cast them to int in one step.
int_coded_cols = [
    'AGE_GRP',
    'TRAVEL_STYL_1', 'TRAVEL_STYL_2', 'TRAVEL_STYL_3', 'TRAVEL_STYL_4',
    'TRAVEL_STYL_5', 'TRAVEL_STYL_6', 'TRAVEL_STYL_7', 'TRAVEL_STYL_8',
    'TRAVEL_MOTIVE_1', 'TRAVEL_COMPANIONS_NUM',
]
travel_df[int_coded_cols] = travel_df[int_coded_cols].astype(int)

In [15]:
from sklearn.model_selection import train_test_split

# DGSTFN is the regression target; every remaining column is a model input.
target_col = 'DGSTFN'
y = travel_df[target_col]
X = travel_df.drop(columns=target_col)

# Hold out 20% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=0,
)

In [17]:
from catboost import Pool

# Names of the columns CatBoost should treat as categorical.
# The eight TRAVEL_STYL_n names are generated instead of spelled out one by one.
cat_features = [
    'GENDER',
    'AGE_GRP',
    *[f'TRAVEL_STYL_{n}' for n in range(1, 9)],
    'TRAVEL_MOTIVE_1',
    'TRAVEL_COMPANIONS_NUM',
    'VISIT_AREA_NM',
    'MVMN_NM',
]

# Bundle features and labels into CatBoost Pool objects for training and evaluation.
X_train_pool = Pool(data=X_train, label=y_train, cat_features=cat_features)
X_test_pool = Pool(data=X_test, label=y_test, cat_features=cat_features)

In [18]:
from catboost import CatBoostRegressor

# Hyperparameters for the CatBoost regressor.
cb_params = {
    'n_estimators': 100,      # number of boosting iterations (trees in the ensemble)
    'depth': 5,               # maximum depth of each individual tree
    'learning_rate': 0.03,    # step size applied to each boosting iteration
    'loss_function': 'RMSE',  # training loss (the default)
    'eval_metric': 'RMSE',    # metric reported on the eval set (the default)
}
cb_reg = CatBoostRegressor(**cb_params)

# Progress is logged every 10 iterations.
# NOTE(review): the held-out test pool doubles as the eval set here, so the
# "best" iteration reported in the log is chosen on test data.
cb_reg.fit(X_train_pool, eval_set=X_test_pool, verbose=10)


0:	learn: 0.8348252	test: 0.8360693	best: 0.8360693 (0)	total: 223ms	remaining: 22.1s
10:	learn: 0.8309097	test: 0.8313066	best: 0.8313066 (10)	total: 870ms	remaining: 7.04s
20:	learn: 0.8284089	test: 0.8281688	best: 0.8281688 (20)	total: 1.39s	remaining: 5.22s
30:	learn: 0.8266281	test: 0.8259954	best: 0.8259954 (30)	total: 1.9s	remaining: 4.22s
40:	learn: 0.8253390	test: 0.8242814	best: 0.8242814 (40)	total: 2.37s	remaining: 3.41s
50:	learn: 0.8241862	test: 0.8230994	best: 0.8230994 (50)	total: 2.87s	remaining: 2.76s
60:	learn: 0.8232098	test: 0.8221181	best: 0.8221181 (60)	total: 3.36s	remaining: 2.15s
70:	learn: 0.8224472	test: 0.8213216	best: 0.8213216 (70)	total: 3.85s	remaining: 1.57s
80:	learn: 0.8216428	test: 0.8205121	best: 0.8205121 (80)	total: 4.36s	remaining: 1.02s
90:	learn: 0.8210018	test: 0.8198809	best: 0.8198809 (90)	total: 4.83s	remaining: 477ms
99:	learn: 0.8203473	test: 0.8193664	best: 0.8193664 (99)	total: 5.22s	remaining: 0us

bestTest = 0.8193664476
bestIteratio

<catboost.core.CatBoostRegressor at 0x1907f37dd90>

In [19]:
# Pair each training column with the importance score the fitted model assigned to it.
feature_names = X_train.columns
importance_scores = cb_reg.feature_importances_
col_importance = pd.DataFrame(
    {'column': feature_names, 'importance': importance_scores}
)

col_importance

Unnamed: 0,column,importance
0,GENDER,0.771593
1,AGE_GRP,14.510488
2,TRAVEL_STYL_1,5.528368
3,TRAVEL_STYL_2,8.48565
4,TRAVEL_STYL_3,6.9543
5,TRAVEL_STYL_4,3.46487
6,TRAVEL_STYL_5,4.786226
7,TRAVEL_STYL_6,7.467811
8,TRAVEL_STYL_7,1.247661
9,TRAVEL_STYL_8,11.272213


### 추천 시스템 구축
1. 방문지 목록을 생성
2. 사용자 특성 입력
3. 가상 만족도 예측
4. 만족도가 높은 순으로 추천

In [20]:
# Distinct destination names, in order of first appearance; these are the
# candidates scored by the recommender.
visit_ares = pd.unique(travel_df['VISIT_AREA_NM'])
visit_ares.shape

(10711,)

In [21]:
# Profile of one hypothetical traveller, laid out in the same column order as
# X_train. The second-to-last slot ('방문지') is only a placeholder for the
# destination name and is never mutated.
user_input = ['여', 60,4,4,4,4,4,4,4,4,1,2,'방문지','자가용']

# Build one candidate row per destination: identical traveller profile, with
# VISIT_AREA_NM replaced by each known destination in turn.
candidate_df = pd.DataFrame(
    [user_input] * len(visit_ares),
    columns=X_train.columns,
)
candidate_df['VISIT_AREA_NM'] = visit_ares

# Score every candidate in a single batched predict call instead of one call
# per destination. The result is converted to a plain list of floats so that
# downstream cells see the same type as before.
pred_results = cb_reg.predict(candidate_df).tolist()

pred_results[:10]

[4.306107212369755,
 4.134672247211243,
 4.256663752108525,
 4.2466825925724025,
 4.088213814198101,
 4.0794092856928925,
 4.088213814198101,
 4.141207260897324,
 4.256663752108525,
 4.200830663195209]

In [22]:
# One row per destination with its predicted satisfaction score.
ranking_columns = {
    'VISIT_AREA_NM': visit_ares,
    'DGSTFN_PRED': pred_results,
}
result_df = pd.DataFrame(ranking_columns)

# Show the ten destinations with the highest predicted satisfaction.
ranked_df = result_df.sort_values(by='DGSTFN_PRED', ascending=False)
ranked_df.head(10)

Unnamed: 0,VISIT_AREA_NM,DGSTFN_PRED
193,여울목게스트하우스,4.308569
3450,한라산국립공원 영실탐방로,4.308569
1037,올레길 21코스,4.308569
676,파라다이스시티,4.308569
216,제주신라호텔,4.308569
60,산지해장국,4.308569
3288,귤꽃다락,4.308569
1303,영종씨사이드 레일바이크,4.308569
78,제주1번가더테라스오피스텔,4.308569
3936,한라산국립공원 관음사탐방로,4.308569
