# concept : Simple is the best!
- 딥러닝 사용하지 않음
- 외부데이터 사용하지 않음
- **최소한의 노력으로 최대한의 결과를 얻어보자!**
    - competition의 목표가 모든 매수고객수를 맞추는 것이었다면 더 많은 모델을 사용했을 것이다.
    - 하지만 상위 3개만을 맞추는 것이 목표였기에 굳이 많은 수의 모델, 딥러닝은 필요하지 않다고 판단하였다.

## 0 make data, feature engineering
- 다소 복잡

## 1 feature selection
- shap value importance
- model feature importance

## 2 hyperparameter optimization
- bayesian optimization

## 3 final modeling
- stacking
    - 전체 데이터 대상 **catboost model 1개**
        - 매수고객이 0이 아닌 sample에 가중치 (만든 dataset이 0이 많은 sparse한 데이터였기에)
    - **6월 매수고객 데이터**
        - 시계열 예측에서는 아직까지도 기초적인 통계 모델들의 ensemble이 많이 사용되므로, 예측 대상 월인 7월의 직전 달인 6월의 매수고객수 데이터를 그대로 사용
- 총 2개의 모델이용

## 결과
- 공개 리더보드 점수: 81.1881

## 추가적인 발전 가능성
- 외부 데이터 활용
- rnn 계열의 딥러닝 이용
- 최적의 하이퍼파라미터
- 더 많은 수의 모델링

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Load the raw CSV inputs. `index_col=0` makes the first CSV column the
# DataFrame index for the trade and stock tables; the answer sheet is read
# with the default integer index.
trade = pd.read_csv('trade_train.csv', index_col=0)
stock = pd.read_csv('stocks.csv', index_col=0)
answer = pd.read_csv('answer_sheet.csv')

from makeDataSet2 import makeDataset, makeCV, encoding, makeSub
from myModel import modelCatboost, modelLightgbm, linear
from sklearn.metrics import mean_squared_error

# Feature subset handed to the CatBoost run as `selected_feature`.
# Order is kept exactly as originally written in case downstream code
# depends on column order.
best_feature = [
    # stock identifier
    '종목번호',
    # trading volume / trading amount aggregates
    '거래량_mean_weekdiff41',
    '거래금액_mean',
    '거래량4_mean',
    # customer group identifier
    '그룹번호',
    # buyer count and its rolling statistics (suffix 2 / 3)
    '매수고객수',
    '매수고객수rolling_mean2',
    '매수고객수rolling_mean3',
    '매수고객수rolling_std2',
    '매수고객수rolling_std3',
    '매수고객수rolling_max2',
    '매수고객수rolling_max3',
    '매수고객수rolling_min2',
    '매수고객수rolling_min3',
    # buyer count differences (suffix 1 / 2)
    '매수고객수diff1',
    '매수고객수diff2',
]

# Pre-computed feature table, read from disk.
df = pd.read_csv('df_rolling23_diff12_MinMax.csv')

# Train / validation / test split over every row of the feature table.
# `train_` lists the periods used for training (presumably YYYYMM months,
# 2019-09 through 2020-04 — confirm against makeCV).
# NOTE(review): this call uses reset_index() WITHOUT drop=True, so the old
# index is kept as an extra 'index' column, whereas the per-group calls below
# drop it. Confirm whether makeCV depends on that column before unifying.
(X_train_cat, y_train_cat,
 X_val_cat, y_val_cat,
 X_test_cat) = makeCV(
    df.reset_index(),
    train_=[201909, 201910, 201911, 201912, 202001, 202002, 202003, 202004],
    use_catboost=True,
)

# The same split, restricted to rows whose 'group_mse' value is 1.
(X_train_cat1, y_train_cat1,
 X_val_cat1, y_val_cat1,
 X_test_cat1) = makeCV(
    df.loc[df['group_mse'].eq(1)].reset_index(drop=True),
    train_=[201909, 201910, 201911, 201912, 202001, 202002, 202003, 202004],
    use_catboost=True,
)

# Rows whose 'group_mse' value is 2.
(X_train_cat2, y_train_cat2,
 X_val_cat2, y_val_cat2,
 X_test_cat2) = makeCV(
    df.loc[df['group_mse'].eq(2)].reset_index(drop=True),
    train_=[201909, 201910, 201911, 201912, 202001, 202002, 202003, 202004],
    use_catboost=True,
)

# Rows whose 'group_mse' value is 3.
(X_train_cat3, y_train_cat3,
 X_val_cat3, y_val_cat3,
 X_test_cat3) = makeCV(
    df.loc[df['group_mse'].eq(3)].reset_index(drop=True),
    train_=[201909, 201910, 201911, 201912, 202001, 202002, 202003, 202004],
    use_catboost=True,
)


weight_idx : [0, 6144, 12336, 18528, 24864, 31248, 37680, 44112]
weight_idx : [0, 3840, 7710, 11580, 15540, 19530, 23550, 27570]
weight_idx : [0, 1408, 2827, 4246, 5698, 7161, 8635, 10109]
weight_idx : [0, 896, 1799, 2702, 3626, 4557, 5495, 6433]


# Catboost

In [4]:
# catboost
# CatBoost hyperparameters for the single model trained on the full split.
params = dict(
    iterations=10000,
    learning_rate=0.02,
    random_seed=42,
    use_best_model=True,         # paired with the validation set passed below
    task_type='GPU',
    border_count=254,
    depth=10,
    early_stopping_rounds=1000,
    eval_metric='RMSE',
)

# Columns passed to modelCatboost as categorical features.
category_cols = ['종목번호', '그룹번호']

# Disabled alternative kept for reference: an explicit weight mapping
# (keys match the start offsets printed as `weight_idx` by makeCV).
# weight = {0:1, 6144:1, 12336:1, 18528:1, 24864:3, 31248:3, 37680:3, 44112:3}

# Train on the full split using only the columns in `best_feature`.
# NOTE(review): `targetWeight=3` presumably up-weights samples with a
# non-zero target, as described in the notebook notes — confirm in
# myModel.modelCatboost.
(model_cat_4,
 pred_train_cat_4,
 pred_val_cat_4,
 pred_test_cat_4) = modelCatboost(
    X_train_cat, y_train_cat,
    X_val_cat, y_val_cat,
    X_test_cat,
    category_cols,
    params=params,
    selected_feature=best_feature,
    weight=None,
    targetWeight=3,
)

0:	learn: 35.3785594	test: 23.4101445	best: 23.4101445 (0)	total: 25.1ms	remaining: 4m 11s
100:	learn: 33.7180573	test: 22.6301584	best: 22.6301584 (100)	total: 1.54s	remaining: 2m 30s
200:	learn: 31.1775157	test: 21.4715785	best: 21.4715785 (200)	total: 3.23s	remaining: 2m 37s
300:	learn: 28.6363993	test: 19.8888450	best: 19.8888450 (300)	total: 4.93s	remaining: 2m 38s
400:	learn: 27.1869963	test: 18.6249374	best: 18.6249374 (400)	total: 6.66s	remaining: 2m 39s
500:	learn: 25.8386882	test: 17.0858795	best: 17.0858795 (500)	total: 8.47s	remaining: 2m 40s
600:	learn: 24.8136373	test: 15.6561338	best: 15.6561338 (600)	total: 10.2s	remaining: 2m 38s
700:	learn: 24.0467817	test: 14.1753337	best: 14.1753337 (700)	total: 11.8s	remaining: 2m 36s
800:	learn: 23.1500636	test: 12.4023318	best: 12.4023318 (800)	total: 13.5s	remaining: 2m 34s
900:	learn: 22.1610521	test: 10.6542070	best: 10.6542070 (900)	total: 15.2s	remaining: 2m 33s
1000:	learn: 21.7482402	test: 9.8957888	best: 9.8957044 (999)	t

# 6월 매수고객수

In [17]:
# Baseline "model": take the '매수고객수' (buyer count) feature column of each
# split as-is and use it as a prediction; these series become the first
# stacking input below.
# NOTE(review): the notebook notes describe this as the previous month's
# (June) buyer count — confirm that makeCV lags this column relative to the
# target.
pred_train_month = X_train_cat['매수고객수']
pred_val_month = X_val_cat['매수고객수']
pred_test_month = X_test_cat['매수고객수']

# stacking

In [8]:
from sklearn.linear_model import LinearRegression

# Second-level (stacking) inputs, one frame per split: the '매수고객수'
# baseline column plus a 'cat2' column holding the CatBoost predictions.
stacking_df = pd.DataFrame(pred_train_month).assign(cat2=pred_train_cat_4)
stacking_df_val = pd.DataFrame(pred_val_month).assign(cat2=pred_val_cat_4)
stacking_df_test = pd.DataFrame(pred_test_month).assign(cat2=pred_test_cat_4)

# Fit the linear blender on the training split, then report its RMSE on the
# validation split.
# NOTE(review): pred_train_cat_4 appears to be CatBoost's prediction on the
# same rows it was trained on; if so, the blender sees optimistic inputs here.
# Out-of-fold predictions would avoid that — confirm in myModel.modelCatboost.
stackModel = LinearRegression().fit(stacking_df, y_train_cat)
stack_pred = stackModel.predict(stacking_df_val)
print(np.sqrt(mean_squared_error(y_val_cat, stack_pred)))


6.430457686541004


In [9]:
# Pairwise correlation between the two stacking inputs on the validation split.
stacking_df_val.corr()

Unnamed: 0,매수고객수,cat2
매수고객수,1.0,0.960886
cat2,0.960886,1.0


In [11]:
# Combined train + validation stacking data.
# NOTE(review): finalStack_X / finalStack_y are built here but never used —
# the final blender below is fit on the validation split only. Fitting on
# validation alone avoids the training-split CatBoost predictions, so this may
# be deliberate; confirm the intent and either use these frames or drop them.
finalStack_X = pd.concat((stacking_df, stacking_df_val), axis=0)
finalStack_y = pd.concat((y_train_cat, y_val_cat), axis=0)

# Final blender: fit on the validation split, then predict the test split.
stackModel = LinearRegression().fit(stacking_df_val, y_val_cat)
stack_pred = stackModel.predict(stacking_df_test)

In [12]:
# Blend weights of the final linear model, in column order:
# ['매수고객수' baseline, 'cat2' CatBoost prediction].
stackModel.coef_

array([1.05638472, 0.19672514])

In [13]:
# Turn the stacked test predictions into the answer table via makeSub
# (presumably the top-3 stocks per group, judging by the output columns)
# and write it to CSV without the index column.
cat_answer = makeSub(X_test_cat, stack_pred)
cat_answer.to_csv('1003_stacking_cat.csv', index=False)


In [16]:
# Show the last rows of the answer table as a quick sanity check.
cat_answer.tail()

Unnamed: 0,그룹명,종목번호1,종목번호2,종목번호3
43,MAD44,A005930,A005935,A105560
44,MAD45,A005930,A035420,A035720
45,MAD46,A005930,A035720,A105560
46,MAD47,A005930,A005935,A017670
47,MAD48,A005930,A035420,A051910
