<font color="#CC3D3D"><p>
# Modeling
    
<font color="black"><p>
- [Read Data](#Read-Data)
- [Merge All Features](#Merge-All-Features)
- [Feature Selection](#Feature-Selection)
- [LGBM_BO Modeling](#LGBM_BO-Modeling)
- [Deployment](#Deployment)
- [Model Ensemble](#Model-Ensemble)

In [1]:
# Data Wrangling
import pandas as pd
from pandas import Series, DataFrame
import numpy as np

# Visualization
import matplotlib.pylab as plt
from matplotlib import font_manager, rc
import seaborn as sns
%matplotlib inline

# EDA
import klib

# Preprocessing & Feature Engineering
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.impute import SimpleImputer 
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import PowerTransformer
from sklearn.feature_selection import SelectPercentile
from sklearn import base
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.model_selection import StratifiedKFold
from sklearn.experimental import enable_iterative_imputer  # still experimental 
from sklearn.impute import IterativeImputer
from sklearn.feature_selection import RFE


# Hyperparameter Optimization
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

# Modeling
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.svm import SVR
from sklearn.ensemble import VotingRegressor
from sklearn.ensemble import StackingRegressor
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from catboost import CatBoostRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import ARDRegression
from sklearn.linear_model import BayesianRidge

# Evaluation
from sklearn.metrics import roc_auc_score
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error

# Utility
import os
import time
import random
import sys, warnings
if not sys.warnoptions: warnings.simplefilter("ignore")
from IPython.display import Image
# import pickle
from tqdm import tqdm
import platform
from itertools import combinations
from scipy.stats.mstats import gmean
from tensorflow import keras

# from bayes_opt import BayesianOptimization

# Read Data

In [82]:
# 피쳐 불러오기
num_features_train = pd.read_csv('../input/num_features_train_new_4.csv')
num_features_test = pd.read_csv('../input/num_features_test_new_4.csv')
onehot_features_train = pd.read_csv('../input/onehot_features_train.csv', encoding='cp949')
onehot_features_test = pd.read_csv('../input/onehot_features_test.csv', encoding='cp949')
w2v_features_train = pd.read_csv('../input/w2v_features_train.csv', encoding='cp949')
w2v_features_test = pd.read_csv('../input/w2v_features_test.csv', encoding='cp949')

In [83]:
target = pd.read_csv('../input/y_train.csv', encoding = 'cp949').age

In [84]:
num_features_train.shape

(21587, 1498)

In [85]:
onehot_features_train.shape

(21587, 2309)

In [86]:
w2v_features_test.shape

(14380, 1950)

# Merge All Features

In [37]:
all_features_train =  pd.concat([num_features_train, onehot_features_train, w2v_features_train], axis = 1)
all_features_test =  pd.concat([num_features_test, onehot_features_test, w2v_features_test], axis = 1)

In [38]:
all_features_train = all_features_train.iloc[:, 1:]
all_features_test = all_features_test.iloc[:, 1:]

In [39]:
print(all_features_train.shape, all_features_test.shape, target.shape)

(21587, 5756) (14380, 5756) (21587,)


# Feature Selection

In [41]:
clf = LGBMRegressor(random_state = 0)        

In [42]:
from sklearn.feature_selection import SelectFromModel

In [43]:
all_features_train_fs = all_features_train
all_features_test_fs = all_features_test

In [44]:
smf = SelectFromModel(clf, threshold='3.0*mean')
smf.fit(all_features_train_fs, target)

X_new = smf.transform(all_features_train_fs)
X_te_new = smf.transform(all_features_test_fs)

feature_selection_idx = smf.get_support()
feature_selection_name = all_features_train_fs.columns[feature_selection_idx]

In [45]:
all_features_train_fs.columns[feature_selection_idx]

Index(['e', '베스트셀러_총구매액_평균', '베스트셀러_총구매액_최대구매액', '베스트셀러_총할인액_평균',
       '베스트셀러_총할인액_변동계수', '단독상품구매비율', '총구매액', '평균할인금액', '평균할인율', '6시이후_구매건수',
       ...
       'part_v236', 'part_v238', 'part_v256', 'part_v257', 'part_v267',
       'part_v283', 'part_v287', 'part_v290', 'part_v291', 'part_v297'],
      dtype='object', length=638)

In [46]:
X_new = pd.DataFrame(X_new)
X_te_new = pd.DataFrame(X_te_new)

X_new.columns = feature_selection_name
X_te_new.columns = feature_selection_name

In [47]:
from sklearn.model_selection import cross_val_score

scores = cross_val_score(clf, X_new, target, scoring='neg_mean_squared_error', cv=5)
print('교차 검증별 정확도:', np.round(scores, 4))
print('평균 검증 정확도:', np.round(np.mean(scores), 4))
print('RMSE:', np.sqrt(-np.mean(scores)))

교차 검증별 정확도: [-65.7411 -67.9135 -66.996  -63.572  -63.9369]
평균 검증 정확도: -65.6319
RMSE: 8.1013518158507


# LGBM_BO Modeling

In [48]:
X_train, X_val, y_train, y_val = train_test_split(X_new, target, test_size=0.3, random_state = 0)

In [49]:
from bayes_opt import BayesianOptimization

In [50]:
bayesian_params = {
    'max_depth':(8, 16),
    'num_leaves':(24, 64),
    'min_child_samples':(10, 200),
    'min_child_weight':(1, 50),
    'subsample':(0.5, 1),
    'colsample_bytree':(0.5, 1),
    'max_bin':(10, 500),
    'reg_lambda':(0.001, 10),
    'reg_alpha':(0.01, 50)
}

In [51]:
def lgb_rmse_eval(max_depth, num_leaves, min_child_samples, min_child_weight, subsample, 
                colsample_bytree, max_bin, reg_lambda, reg_alpha):
    
    params = {
        "n_estimators":2000, 
        "learning_rate":0.02,
        'max_depth':int(round(max_depth)),
        'num_leaves':int(round(num_leaves)),
        'min_child_samples': int(round(min_child_samples)),
        'min_child_weight': int(round(min_child_weight)),
        'subsample':max(min(subsample, 1), 0),
        'colsample_bytree':max(min(colsample_bytree, 1), 0),
        'reg_lambda': max(reg_lambda,0),
        'reg_alpha': max(reg_alpha, 0)
    }
    
    lgb_model = LGBMRegressor(**params)
    lgb_model.fit(X_train, y_train, eval_set=[(X_train, y_train), (X_val, y_val)], eval_metric= 'RMSE', verbose= 100, 
                early_stopping_rounds= 100)
    valid_pred = lgb_model.predict(X_val)
    RMSE = np.sqrt(mean_squared_error(y_val, valid_pred))
    
    return RMSE

In [52]:
lgbBO = BayesianOptimization(f = lgb_rmse_eval, pbounds=bayesian_params, random_state=0)
lgbBO.maximize(init_points=5, n_iter=25)

|   iter    |  target   | colsam... |  max_bin  | max_depth | min_ch... | min_ch... | num_le... | reg_alpha | reg_la... | subsample |
-------------------------------------------------------------------------------------------------------------------------------------
Training until validation scores don't improve for 100 rounds
[100]	training's rmse: 7.6077	training's l2: 57.8771	valid_1's rmse: 8.31281	valid_1's l2: 69.1029
[200]	training's rmse: 6.78038	training's l2: 45.9735	valid_1's rmse: 8.08869	valid_1's l2: 65.4269
[300]	training's rmse: 6.20824	training's l2: 38.5422	valid_1's rmse: 8.03432	valid_1's l2: 64.5503
[400]	training's rmse: 5.73251	training's l2: 32.8616	valid_1's rmse: 8.01072	valid_1's l2: 64.1717
[500]	training's rmse: 5.32387	training's l2: 28.3436	valid_1's rmse: 7.99862	valid_1's l2: 63.978
[600]	training's rmse: 4.95762	training's l2: 24.578	valid_1's rmse: 7.9929	valid_1's l2: 63.8865
[700]	training's rmse: 4.63271	training's l2: 21.462	valid_1's rmse: 7.991

[300]	training's rmse: 6.28834	training's l2: 39.5432	valid_1's rmse: 8.05283	valid_1's l2: 64.8481
[400]	training's rmse: 5.8646	training's l2: 34.3935	valid_1's rmse: 8.03081	valid_1's l2: 64.4939
[500]	training's rmse: 5.51181	training's l2: 30.38	valid_1's rmse: 8.0217	valid_1's l2: 64.3477
[600]	training's rmse: 5.2069	training's l2: 27.1118	valid_1's rmse: 8.01642	valid_1's l2: 64.2629
[700]	training's rmse: 4.93614	training's l2: 24.3654	valid_1's rmse: 8.013	valid_1's l2: 64.2081
Early stopping, best iteration is:
[654]	training's rmse: 5.05848	training's l2: 25.5882	valid_1's rmse: 8.01162	valid_1's l2: 64.186
| [0m 8       [0m | [0m 8.012   [0m | [0m 0.8113  [0m | [0m 427.9   [0m | [0m 11.38   [0m | [0m 166.2   [0m | [0m 29.22   [0m | [0m 50.44   [0m | [0m 3.07    [0m | [0m 5.492   [0m | [0m 0.8455  [0m |
Training until validation scores don't improve for 100 rounds
[100]	training's rmse: 7.61265	training's l2: 57.9524	valid_1's rmse: 8.30686	valid_1's l

[400]	training's rmse: 6.66691	training's l2: 44.4476	valid_1's rmse: 8.05114	valid_1's l2: 64.8209
[500]	training's rmse: 6.39593	training's l2: 40.9079	valid_1's rmse: 8.03778	valid_1's l2: 64.606
[600]	training's rmse: 6.14872	training's l2: 37.8067	valid_1's rmse: 8.02787	valid_1's l2: 64.4467
[700]	training's rmse: 5.92343	training's l2: 35.087	valid_1's rmse: 8.01867	valid_1's l2: 64.2991
[800]	training's rmse: 5.71402	training's l2: 32.65	valid_1's rmse: 8.01743	valid_1's l2: 64.2791
[900]	training's rmse: 5.51652	training's l2: 30.432	valid_1's rmse: 8.01619	valid_1's l2: 64.2594
Early stopping, best iteration is:
[879]	training's rmse: 5.55647	training's l2: 30.8743	valid_1's rmse: 8.01543	valid_1's l2: 64.2471
| [0m 15      [0m | [0m 8.015   [0m | [0m 0.9147  [0m | [0m 386.5   [0m | [0m 10.52   [0m | [0m 111.1   [0m | [0m 49.14   [0m | [0m 25.02   [0m | [0m 10.08   [0m | [0m 7.33    [0m | [0m 0.9481  [0m |
Training until validation scores don't improve f

[400]	training's rmse: 5.64587	training's l2: 31.8759	valid_1's rmse: 8.02322	valid_1's l2: 64.3721
[500]	training's rmse: 5.2434	training's l2: 27.4932	valid_1's rmse: 8.01323	valid_1's l2: 64.2119
[600]	training's rmse: 4.88189	training's l2: 23.8328	valid_1's rmse: 8.01307	valid_1's l2: 64.2093
Early stopping, best iteration is:
[522]	training's rmse: 5.15704	training's l2: 26.5951	valid_1's rmse: 8.01137	valid_1's l2: 64.182
| [0m 22      [0m | [0m 8.011   [0m | [0m 0.9423  [0m | [0m 432.9   [0m | [0m 13.85   [0m | [0m 166.8   [0m | [0m 23.29   [0m | [0m 55.58   [0m | [0m 5.256   [0m | [0m 4.266   [0m | [0m 0.67    [0m |
Training until validation scores don't improve for 100 rounds
[100]	training's rmse: 7.76621	training's l2: 60.3141	valid_1's rmse: 8.34583	valid_1's l2: 69.6529
[200]	training's rmse: 7.03334	training's l2: 49.4678	valid_1's rmse: 8.11877	valid_1's l2: 65.9144
[300]	training's rmse: 6.53564	training's l2: 42.7146	valid_1's rmse: 8.05897	valid

[500]	training's rmse: 5.47868	training's l2: 30.0159	valid_1's rmse: 8.00136	valid_1's l2: 64.0218
[600]	training's rmse: 5.13072	training's l2: 26.3243	valid_1's rmse: 7.99751	valid_1's l2: 63.9602
[700]	training's rmse: 4.81238	training's l2: 23.159	valid_1's rmse: 7.99961	valid_1's l2: 63.9938
Early stopping, best iteration is:
[629]	training's rmse: 5.03323	training's l2: 25.3334	valid_1's rmse: 7.9955	valid_1's l2: 63.928
| [0m 29      [0m | [0m 7.996   [0m | [0m 0.6775  [0m | [0m 429.1   [0m | [0m 13.64   [0m | [0m 165.2   [0m | [0m 12.66   [0m | [0m 48.56   [0m | [0m 19.51   [0m | [0m 3.755   [0m | [0m 0.5494  [0m |
Training until validation scores don't improve for 100 rounds
[100]	training's rmse: 7.83416	training's l2: 61.3741	valid_1's rmse: 8.35543	valid_1's l2: 69.8132
[200]	training's rmse: 7.14765	training's l2: 51.0888	valid_1's rmse: 8.12316	valid_1's l2: 65.9858
[300]	training's rmse: 6.69147	training's l2: 44.7757	valid_1's rmse: 8.06273	valid_

In [53]:
# dictionary에 있는 target값을 모두 추출
target_list = []

for result in lgbBO.res:
    target = result['target']
    target_list.append(target)
print(target_list)
# 가장 큰 target 값을 가지는 순번(index)를 추출
print('maximum target index:', np.argmin(np.array(target_list)))

[7.990879514186974, 8.021120875248982, 8.023113318279556, 8.004575929540204, 8.01100880431051, 8.014589828768006, 8.002746037444211, 8.011617391949414, 7.998953863504376, 8.01473894000794, 8.01096612614202, 8.011240405523802, 7.999181685297426, 8.016977059471685, 8.01542891730467, 8.00405997689988, 8.021857081131763, 8.005241412686667, 8.01161848776005, 8.006667952105282, 8.023062128468782, 8.011369512964752, 8.0112572494732, 8.011818670691497, 8.020704514002414, 8.021736521608966, 8.022392987625485, 8.027441730349006, 7.99550135571294, 8.001665451789307]
maximum target index: 0


In [54]:
# 가장 큰 target값을 가지는 index값을 기준으로 res에서 해당 parameter 추출. 
max_dict = lgbBO.res[np.argmin(np.array(target_list))]
print(max_dict)

{'target': 7.990879514186974, 'params': {'colsample_bytree': 0.7744067519636624, 'max_bin': 360.44278952248555, 'max_depth': 12.822107008573152, 'min_child_samples': 113.52780476941041, 'min_child_weight': 21.75908516760633, 'num_leaves': 49.835764522666246, 'reg_alpha': 21.884984691022, 'reg_lambda': 8.917838234820016, 'subsample': 0.9818313802505146}}


In [55]:
target = pd.read_csv('../input/y_train.csv', encoding = 'cp949').age

In [56]:
from sklearn.model_selection import KFold
ftr = X_new
target = target

def train_apps_all_with_oof(ftr, ttmp_arget, nfolds=5):
    ftr = ftr
    tmp_target = target

    # nfolds 개의 cross validatin fold set을 가지는 KFold 생성 
    folds = KFold(n_splits = nfolds, shuffle=True, random_state=0)
    
    # Out of Folds로 학습된 모델의 validation set을 예측하여 결과 확률을 담을 array 생성.
    # validation set가 n_split갯수만큼 있으므로 크기는 ftr_app의 크기가 되어야 함. 
    oof_preds = np.zeros((ftr.shape[0],))  
    
    # Ouf of Folds로 학습된 모델의 test dataset을 예측하여 결과 확률을 담을 array 생성. 
    test_preds = np.zeros(((X_te_new.shape[0],)))
    
    # n_estimators를 4000까지 확대. 
    clf = LGBMRegressor(
                nthread=4,
                n_estimators=4000,
                learning_rate=0.01,
                max_depth=16,
                num_leaves=41,
                colsample_bytree=0.504,
                subsample=0.612,
                max_bin=441,
                reg_alpha=24.647,
                reg_lambda=7.605,
                min_child_weight=28,
                min_child_samples=184,
                silent=-1,
                verbose=-1,
                )

    # nfolds 번 cross validation Iteration 반복하면서 OOF 방식으로 학습 및 테스트 데이터 예측
    for fold_idx, (train_idx, valid_idx) in enumerate(folds.split(ftr)):
        print('##### iteration ', fold_idx, ' 시작')
        # 학습용 데이터 세트의 인덱스와 검증용 데이터 세트의 인덱스 추출하여 이를 기반으로 학습/검증 데이터 추출
        train_x = ftr.iloc[train_idx, :]
        train_y = tmp_target.iloc[train_idx]
        valid_x = ftr.iloc[valid_idx, :]
        valid_y = tmp_target.iloc[valid_idx]
        
        # 추출된 학습/검증 데이터 세트로 모델 학습. early_stopping은 200으로 증가. 
        clf.fit(train_x, train_y, eval_set=[(train_x, train_y), (valid_x, valid_y)], eval_metric= 'RMSE', verbose= 200, 
                early_stopping_rounds= 200)
        # 검증 데이터 세트로 예측된 확률 저장. 사용되지는 않음. 
        #oof_preds[valid_idx] = clf.predict(valid_x, num_iteration=clf.best_iteration_)       
        # 학습된 모델로 테스트 데이터 세트에 예측 확률 계산. 
        # nfolds 번 반복 실행하므로 평균 확률을 구하기 위해 개별 수행시 마다 수행 횟수로 나눈 확률을 추후에 더해서 최종 평균 확률 계산. 
        test_preds += clf.predict(X_te_new, num_iteration=clf.best_iteration_)/folds.n_splits
        
        
    return clf, test_preds

In [57]:
clf, test_preds = train_apps_all_with_oof(ftr, target, nfolds=5)

##### iteration  0  시작
Training until validation scores don't improve for 200 rounds
[200]	training's rmse: 7.82631	training's l2: 61.2511	valid_1's rmse: 8.37478	valid_1's l2: 70.137
[400]	training's rmse: 7.15004	training's l2: 51.1231	valid_1's rmse: 8.11886	valid_1's l2: 65.9159
[600]	training's rmse: 6.70633	training's l2: 44.9749	valid_1's rmse: 8.05434	valid_1's l2: 64.8723
[800]	training's rmse: 6.33284	training's l2: 40.1048	valid_1's rmse: 8.02955	valid_1's l2: 64.4737
[1000]	training's rmse: 5.99961	training's l2: 35.9954	valid_1's rmse: 8.0125	valid_1's l2: 64.2002
[1200]	training's rmse: 5.69879	training's l2: 32.4762	valid_1's rmse: 8.00032	valid_1's l2: 64.005
[1400]	training's rmse: 5.42277	training's l2: 29.4064	valid_1's rmse: 7.99923	valid_1's l2: 63.9877
Early stopping, best iteration is:
[1327]	training's rmse: 5.52113	training's l2: 30.4829	valid_1's rmse: 7.99767	valid_1's l2: 63.9628
##### iteration  1  시작
Training until validation scores don't improve for 200 r

# Deployment

In [64]:
import pickle
import joblib

In [58]:
IDtest = num_features_test.custid.unique()

In [59]:
fname = 'submissions_0614_6_lgbmBO.csv'
submissions = pd.concat([pd.Series(IDtest, name="custid"), pd.Series(test_preds, name="age")] ,axis=1)

In [60]:
submissions.to_csv(fname, index=False)
print("'{}' is ready to submit." .format(fname))

'submissions_0614_6_lgbmBO.csv' is ready to submit.


In [62]:
pikle_data = (np.array(X_train), np.array(X_val), y_train, y_val, np.array(all_features_test), np.array(IDtest))

In [65]:
with open('DNN_features.pkl', 'wb') as f:
    pickle.dump(pikle_data, f)

# Model Ensemble

In [66]:
dnn_model = pd.read_csv('../submissions/dnn_submission_06150340_최종.csv')
lgbm_model = pd.read_csv('../submissions/submissions_0614_6_lgbmBO.csv')

In [67]:
submission_ours = (dnn_model['age'] + lgbm_model['age']) / 2

In [68]:
submission_1st = pd.read_csv('../submissions/submission_1st.csv')
submission_2nd = pd.read_csv('../submissions/submission_2nd.csv')
submission_3rd = pd.read_csv('../submissions/submission_3rd.csv')

In [69]:
professor_model = pd.read_csv('../submissions/5. dnn_submission_8.15961.csv')

In [74]:
pred = submission_ours*0.4 + submission_1st['age']*0.3 + submission_2nd['age']*0.2 + submission_3rd['age']*0.1
pred = pred*0.6 + professor_model['age']*0.4

In [76]:
fname = 'submissions_0615_19_최종.csv'
submissions = pd.concat([pd.Series(IDtest, name="custid"), pd.Series(pred, name="age")] ,axis=1)
submissions.to_csv(fname, index=False)
print("'{}' is ready to submit." .format(fname))

'submissions_0615_19_최종.csv' is ready to submit.


# <font color="#CC3D3D"> END