## Stacking Ensenble을 활용한 자율주행 센서 안테나 성능예측
> 팀명 : 될때까지간다리
>
> 작성일 : '22.08.31
>
> 개발환경 : Jupyter Notebook

In [3]:
# hyper parameter tuning을 위한 패키지 설치
!pip install optuna
!pip install catboost
!pip install skranger
!pip install ngboost
!pip install lightgbm
!pip install hyperopt
!pip install easydict





In [7]:
# 기본 modules
import pandas as pd
import random
import os
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import tqdm

# 머신러닝 modules
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_error
from sklearn.metrics import make_scorer
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import KFold

from lightgbm import LGBMRegressor
from ngboost import NGBRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from sklearn.linear_model import ElasticNet, LinearRegression, Lasso, Ridge
from catboost import CatBoostRegressor, Pool
from skranger.ensemble import RangerForestRegressor
from sklearn.neighbors import RadiusNeighborsRegressor
from ngboost.scores import LogScore

from hyperopt import fmin, hp, tpe
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.inspection import permutation_importance

# 모듈화된 함수 사용
import utils.preprocessing as preprocessing
import utils.utils as utils
import utils.stacking as stk

# argparse 사용
import argparse
import pandas as pd
from utils.utils import seed_everything
from utils.preprocessing import load_data
import utils.params as params

ModuleNotFoundError: No module named 'utils'

## Data Load

In [5]:
ys = ['Y_01', 'Y_02', 'Y_03', 'Y_04', 'Y_05', 
      'Y_06', 'Y_07', 'Y_08', 'Y_09', 'Y_10', 
      'Y_11', 'Y_12', 'Y_13', 'Y_14']
ys_bounds = [[0.2, 2], [0.2, 2.1], [0.2, 2.1], 
             [7, 19], [22, 36.5], [-19.2, 19], 
             [2.4, 4], [-29.2, -24], [-29.2, -24],
             [-30.6, -20], [19.6, 26.6], [-29.2, -24],
             [-29.2, -24], [-29.2, -24]]

In [6]:
utils.seed_everything(utils.Config.seed)

train_df = pd.read_csv('Data/train.csv')
test_x = pd.read_csv('Data/test.csv')
train_x, train_y = preprocessing.dataset_split_X_y(train_df)

cols_with_zero_variance = preprocessing.zero_variance(train_x) # 분산이 0 (통과 여부)
train_x = train_x.drop(cols_with_zero_variance, axis = 1)
test_x = test_x.drop(cols_with_zero_variance, axis = 1)

train_x = train_x.drop(['X_10', 'X_11'], axis = 1) # 결측치가 많음 (결측치 = 0, 공지사항)
test_x = test_x.drop(['X_10', 'X_11'], axis = 1)

test_x = test_x.drop('ID', axis=1)

NameError: name 'utils' is not defined

## Model
- 모델별 개별학습(타겟 Y_01~Y_14) 반복

### LGBM

In [None]:
# Parameter Tunning
space_lgbm = {
    'n_estimators' : hp.quniform('n_estimators', 100, 1500, 1),
    'max_depth': hp.quniform('max_depth', 5, 250, 1),
    'num_leaves': hp.quniform('num_leaves', 20, 200, 5),
    'min_child_samples': hp.quniform('min_child_samples', 10, 150, 5),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.3, 1.0),
    'subsample': hp.uniform('subsample', 0.3, 1.0),
    'min_split_gain': hp.uniform('min_split_gain', 0, 0.7),
    'scale_pos_weight': hp.uniform('scale_pos_weight', 1, 10),
    'reg_alpha': hp.uniform('reg_alpha', 0, 500),
    'reg_lambda': hp.uniform('reg_lambda', 0, 500),
    'learning_rate': hp.loguniform('learning_rate', np.log(0.01), np.log(0.5)),
}

best = fmin(fn = params.lgbm_objective,
            space = space_lgbm,
            algo = tpe.suggest,
            verbose = 10,
            max_evals = 200)

print(best)
best['n_estimators'] = int(best['n_estimators'])
best['num_leaves'] = int(best['num_leaves'])
best['max_depth'] = int(best['max_depth'])
best['min_child_samples'] = int(best['min_child_samples'])

In [46]:
import easydict

if __name__ == '__main__':
    def dict() :
        args = easydict.EasyDict({'train' : 'Data/train.csv',
                                 'trest' : 'Data/test.csv',
                                 'sub' : 'Data/submssion.csv',
                                 'seed' : 42
                                 })

In [48]:
args = dict()
seed_everything(args.seed)

train_x, train_y, test_x = load_data(args.train, args.test)

AttributeError: 'NoneType' object has no attribute 'seed'

### Ensenble by LGBM
- 타겟 Y_01 ~ Y_14 개별학습 결과 입력

In [9]:
lg_1 = {'colsample_bytree': 0.572280100273023, 'learning_rate': 0.010283635038627429, 'max_depth': 180, 'min_child_samples': 135, 'min_split_gain': 0.04511227284338413, 'n_estimators': 900, 'num_leaves': 70, 'reg_alpha': 4.406681827912319, 'reg_lambda': 20.4785600448913, 'scale_pos_weight': 8.302374117433086, 'subsample': 0.1688669888026464}
lg_2 =  {'colsample_bytree': 0.7641322280477741, 'learning_rate': 0.010977205425053654, 'max_depth': 90, 'min_child_samples': 75, 'min_split_gain': 0.13379952895779884, 'n_estimators': 900, 'num_leaves': 80, 'reg_alpha': 1.9214119194170154, 'reg_lambda': 14.454450236504218, 'scale_pos_weight': 2.171961031806387, 'subsample': 0.9552593593877317}
lg_3 = {'colsample_bytree': 0.5504769098255781,  'learning_rate': 0.019653385015120244, 'max_depth': 220, 'min_child_samples': 25, 'min_split_gain': 0.1273611040963466, 'n_estimators': 470, 'num_leaves': 160, 'reg_alpha': 3.5549669150756706, 'reg_lambda': 39.88636182674132, 'scale_pos_weight': 12.46696320152359, 'subsample': 0.7590007450921917}
lg_4 = {'colsample_bytree': 0.5597537952569402, 'learning_rate': 0.02374663979814546, 'max_depth': 32, 'min_child_samples': 100, 'min_split_gain': 0.12211426885216736, 'n_estimators': 1263, 'num_leaves': 200, 'reg_alpha': 14.606693962963451, 'reg_lambda': 299.52278825209424, 'scale_pos_weight': 7.7785016838070735, 'subsample': 0.6254745287838821}
lg_5 = {'colsample_bytree': 0.4311015575880258, 'learning_rate': 0.01749725932551278, 'max_depth': 53, 'min_child_samples': 15, 'min_split_gain': 0.2820951740673634, 'n_estimators': 974, 'num_leaves': 165, 'reg_alpha': 9.604623064885754, 'reg_lambda': 12.314490508636432, 'scale_pos_weight': 6.6422956907936825, 'subsample': 0.7390190399971659}
lg_6 = {'colsample_bytree': 0.6889745043181079, 'learning_rate': 0.06146161938790444, 'max_depth': 89, 'min_child_samples': 10, 'min_split_gain': 0.669592868575692, 'n_estimators': 1169, 'num_leaves': 175, 'reg_alpha': 11.405277636150856, 'reg_lambda': 112.37954230084294, 'scale_pos_weight': 5.932435783263877, 'subsample': 0.8265223228903998}  
lg_7 = {'colsample_bytree': 0.8663251864650988, 'learning_rate': 0.018110306887688978, 'max_depth': 166, 'min_child_samples': 50, 'min_split_gain': 0.025403061552667243, 'n_estimators': 1080, 'num_leaves': 100, 'reg_alpha': 2.0131018839563666, 'reg_lambda': 63.56640846106552, 'scale_pos_weight': 1.8584564419776715, 'subsample': 0.7643028435523616}
lg_8 = {'colsample_bytree': 0.8970390757241629, 'learning_rate': 0.03571726260659087, 'max_depth': 164, 'min_child_samples': 30, 'min_split_gain': 0.2863362850926679, 'n_estimators': 740, 'num_leaves': 100, 'reg_alpha': 1.1167159754886287, 'reg_lambda': 280.9798636389436, 'scale_pos_weight': 4.75867892931176, 'subsample': 0.681716202670263}
lg_9 = {'n_estimators': 900, 'max_depth': 86, 'num_leaves': 150, 'min_child_samples': 85, 'colsample_bytree': 0.90507, 'subsample': 0.62362, 'min_split_gain': 0.21034, 'scale_pos_weight': 8.77311, 'reg_alpha': 0.07069, 'reg_lambda': 499.10672, 'learning_rate': 0.04679}
lg_10 = {'colsample_bytree': 0.8350973419202665, 'learning_rate': 0.03134966396365972, 'max_depth': 114, 'min_child_samples': 20, 'min_split_gain': 0.24406788869557822, 'n_estimators': 454, 'num_leaves': 115, 'reg_alpha': 1.0870546166564243, 'reg_lambda': 346.21163772786895, 'scale_pos_weight': 5.81617865285278, 'subsample': 0.45612075761336973}
lg_11 = {'colsample_bytree': 0.7285829045071064, 'learning_rate': 0.019839273085108612, 'max_depth': 71, 'min_child_samples': 50, 'min_split_gain': 0.35567737788276876, 'n_estimators': 970, 'num_leaves': 140, 'reg_alpha': 0.27353134227182774, 'reg_lambda': 157.85749037224548, 'scale_pos_weight': 5.956126991298146, 'subsample': 0.7509931500532172}
lg_12 = {'colsample_bytree': 0.6115826698158419, 'learning_rate': 0.010052927231718068, 'max_depth': 71, 'min_child_samples': 85, 'min_split_gain': 0.12003011548878659, 'n_estimators': 1300, 'num_leaves': 120, 'reg_alpha': 1.3013867029804251, 'reg_lambda': 269.3915696845848, 'scale_pos_weight': 5.290961082236748, 'subsample': 0.7542724715058367}
lg_13 = {'colsample_bytree': 0.9511047907962863, 'learning_rate': 0.023257873709858216, 'max_depth': 58, 'min_child_samples': 80, 'min_split_gain': 0.21488153574891886, 'n_estimators': 1300, 'num_leaves': 150, 'reg_alpha': 0.33761852089148814, 'reg_lambda': 57.05291849099506, 'scale_pos_weight': 2.0801436555772854, 'subsample': 0.5580106548214563}
lg_14 = {'colsample_bytree': 0.8851122740930837, 'learning_rate': 0.013136814152245062, 'max_depth': 249, 'min_child_samples': 65, 'min_split_gain': 0.2072264172906347, 'n_estimators': 450, 'num_leaves': 135, 'reg_alpha': 0.642890771203696, 'reg_lambda': 45.624663648443345, 'scale_pos_weight': 6.400746088779947, 'subsample': 0.30084274480143686}

In [10]:
params = []
params.append(lg_1)
params.append(lg_2)
params.append(lg_3)
params.append(lg_4)
params.append(lg_5)
params.append(lg_6)
params.append(lg_7)
params.append(lg_8)
params.append(lg_9)
params.append(lg_10)
params.append(lg_11)
params.append(lg_12)
params.append(lg_13)
params.append(lg_14)

cols = train_y.columns.tolist()

In [11]:
model = LGBMRegressor(n_jobs=-1, random_state=1, **lg_1)
model.fit(train_x, train_y['Y_01'])
pred1 = model.predict(test_x)

model = LGBMRegressor(n_jobs=-1, random_state=1, **lg_2)
model.fit(train_x, train_y['Y_02'])
pred2 = model.predict(test_x)

model = LGBMRegressor(n_jobs=-1, random_state=1, **lg_3)
model.fit(train_x, train_y['Y_03'])
pred3 = model.predict(test_x)

model = LGBMRegressor(n_jobs=-1, random_state=1, **lg_4)
model.fit(train_x, train_y['Y_04'])
pred4 = model.predict(test_x)

model = LGBMRegressor(n_jobs=-1, random_state=1, **lg_5)
model.fit(train_x, train_y['Y_05'])
pred5 = model.predict(test_x)

model = LGBMRegressor(n_jobs=-1, random_state=1, **lg_6)
model.fit(train_x, train_y['Y_06'])
pred6 = model.predict(test_x)


model = LGBMRegressor(n_jobs=-1, random_state=1, **lg_7)
model.fit(train_x, train_y['Y_07'])
pred7 = model.predict(test_x)


model = LGBMRegressor(n_jobs=-1, random_state=1, **lg_8)
model.fit(train_x, train_y['Y_08'])
pred8 = model.predict(test_x)

model = LGBMRegressor(n_jobs=-1, random_state=1, **lg_9)
model.fit(train_x, train_y['Y_09'])
pred9 = model.predict(test_x)

model = LGBMRegressor(n_jobs=-1, random_state=1, **lg_10)
model.fit(train_x, train_y['Y_10'])
pred10 = model.predict(test_x)

model = LGBMRegressor(n_jobs=-1, random_state=1, **lg_11)
model.fit(train_x, train_y['Y_11'])
pred11 = model.predict(test_x)

model = LGBMRegressor(n_jobs=-1, random_state=1, **lg_12)
model.fit(train_x, train_y['Y_12'])
pred12 = model.predict(test_x)

model = LGBMRegressor(n_jobs=-1, random_state=1, **lg_13)
model.fit(train_x, train_y['Y_13'])
pred13 = model.predict(test_x)

model = LGBMRegressor(n_jobs=-1, random_state=1, **lg_14)
model.fit(train_x, train_y['Y_14'])
pred14 = model.predict(test_x)

### csv파일로 만들기

In [12]:
sub = pd.read_csv('./sample_submission.csv')
sub

FileNotFoundError: [Errno 2] No such file or directory: './sample_submission.csv'

In [None]:
sub['Y_01'] = pred1
sub['Y_02'] = pred2
sub['Y_03'] = pred3
sub['Y_04'] = pred4
sub['Y_05'] = pred5
sub['Y_06'] = pred6
sub['Y_07'] = pred7
sub['Y_08'] = pred8
sub['Y_09'] = pred9
sub['Y_10'] = pred10
sub['Y_11'] = pred11
sub['Y_12'] = pred12
sub['Y_13'] = pred13
sub['Y_14'] = pred14
sub.to_csv('./LGBM_Ensenble.csv',index=False)