# Stacking Ensenble을 활용한 자율주행 센서 안테나 성능예측
> 팀명 : 될때까지간다리
>
> 작성일 : '22.08.31
>
> 개발환경 : Jupyter Notebook
>
> 결과 : 상위 4%

# Development Environment Setting

In [3]:
# hyper parameter tuning을 위한 패키지 설치
!pip install optuna
!pip install catboost
!pip install skranger
!pip install ngboost
!pip install lightgbm
!pip install hyperopt





In [4]:
# 기본 modules
import pandas as pd
import random
import os
import warnings
warnings.filterwarnings('ignore')
import numpy as np
import tqdm

# 머신러닝 modules
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import mean_squared_error
from sklearn.metrics import make_scorer
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import KFold

from lightgbm import LGBMRegressor
from ngboost import NGBRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, ExtraTreesRegressor
from sklearn.tree import DecisionTreeRegressor
from xgboost import XGBRegressor
from sklearn.linear_model import ElasticNet, LinearRegression, Lasso, Ridge
from catboost import CatBoostRegressor, Pool
from skranger.ensemble import RangerForestRegressor
from sklearn.neighbors import RadiusNeighborsRegressor
from ngboost.scores import LogScore

from hyperopt import fmin, hp, tpe
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
from sklearn.inspection import permutation_importance

# 모듈화된 함수 사용
import utils.preprocessing as preprocessing
import utils.utils as utils
import utils.stacking as stk
import utils.params as params

# Utils

In [5]:
# seed 고정
def seed_everything(seed): 
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

In [205]:
class Config:
    seed = 42
    epochs = 1 #200
    cv=2 #10
    test_size = 0.2

In [7]:
# Y_Feature별 NRMSE의 총합
def lg_nrmse(gt, preds):
    """
    @Description: Metric used in this project
    @Params1: gt, pandas dataframe
    @Param2: preds, pandas dataframe
    @Return: nrmse score
    """
    preds = pd.DataFrame(preds)
    all_nrmse = []
    for idx in range(0,14):
        rmse = mean_squared_error(gt.iloc[:,idx], preds.iloc[:,idx], squared=False)
        nrmse = rmse/np.mean(np.abs(gt.iloc[:,idx]))
        all_nrmse.append(nrmse)
    score = 1.2 * np.sum(all_nrmse[:8]) + 1.0 * np.sum(all_nrmse[8:15]) # Y_01 ~ Y_08 까지 20% 가중치 부여
    return score


In [8]:
# Y_Feature 개별 NRMSE 계산
def lg_individual_nrmse(gt, preds):
    """
    @Description: Metric used in this project (individual)
    @Params1: gt, pandas dataframe
    @Param2: preds, pandas dataframe
    @Return: nrmse score
    """
    rmse = mean_squared_error(gt, preds, squared=False)
    nrmse = rmse/np.mean(np.abs(gt))
    return nrmse

In [9]:
# 데이터에서 X_feature, Y_feature 구분
def dataset_split_X_y(df):    
    """
    @Description: split data into features and labels
    @Param: df, pandas dataframe with columns starting with X for features and Y for labels
    @Return: features and labels in pandas dataframes
    """
    xs = df.filter(regex='X') # Input : X Feature
    ys = df.filter(regex='Y') # Output : Y Feature
    return xs, ys

In [10]:
# DF 안의 결측치(NA)를 확인
def check_for_NAs(df, show=False):
    """
    @Description: checks for the NAs in the dataframe
    @Param1: df, pandas dataframe
    @Param2: show, boolean indicating whether NaN data are also necessary as a part of the output
    @Return: name of the columns with NaN
    """
    nan_values = df.loc[:, df.isnull().any()]
    if show:
        return df[df.isna().any(axis=1)]
    return list(nan_values.columns)

In [11]:
# DF 안의 결측치(NA)를 확인
def check_for_NAs(df, show=False):
    """
    @Description: checks for the NAs in the dataframe
    @Param1: df, pandas dataframe
    @Param2: show, boolean indicating whether NaN data are also necessary as a part of the output
    @Return: name of the columns with NaN
    """
    nan_values = df.loc[:, df.isnull().any()]
    if show:
        return df[df.isna().any(axis=1)]
    return list(nan_values.columns)

In [12]:
# 분산이 0인 feature 탐색
def zero_variance(train_x):
    """
    @Description: check for zero_variance
    @Param1: df, pandas dataframe
    @Return: names of the columns with zero variance
    """
    result = []
    for col in train_x.columns:
        if train_x[col].var() == 0:
            result.append(col)
    return result

In [13]:
# 가장 높은 상관계수값과 feature 탐색
def get_top_correlation(df, n=10):
    """
    @Description: print out top correlated features
    @Param1: df, pandas dataframe
    @Param2: n, number of lines to print 
    @Return: pandas series
    """
    pairs = set()
    for idx1 in range(0, df.shape[1]):
        for idx2 in range(0, idx1+1):
            pairs.add((df.columns[idx1], df.columns[idx2]))
    corr = df.corr().abs().unstack()
    corr = corr.drop(labels=pairs).sort_values(ascending=False)
    return corr[0:n]

In [14]:
# 정규화 진행 및 이상치 탐색
def find_outlier_zscore(data, threshold = 3):
    mean = np.mean(data)
    std = np.std(data)
    zs = [(y - mean) / std for y in data]
    masks = np.where(np.abs(zs) > threshold)
    return masks[0]

In [15]:
# histogram 시각화
def adjacent_histogram_boxplot(feature_var, figsize = (7, 5)):
    """
    @Description: plot histogram and boxplot in next to each other
    @Param1: feature_var, pandas series 
    @Param2: figsize, size of the figure 
    """
    fig, (hist_plot, box_plot) = plt.subplots(nrows=2, sharex=True, gridspec_kw={'height_ratios':(.85,.15)}, figsize=figsize)
    sns.distplot(feature_var, kde=True, ax=hist_plot, kde_kws= {"linewidth":1.5}) 
    sns.boxplot(feature_var, ax=box_plot, linewidth = 1, width = 0.5)
    hist_plot.set_ylabel('')    
    hist_plot.set_xlabel('')
    box_plot.set_xlabel('')
    hist_plot.tick_params(labelsize=8)
    box_plot.tick_params(labelsize=8)
    fig.suptitle(feature_var.name, fontsize = 10)
    hist_plot.axvline(np.mean(feature_var),color='red',linestyle='-',lw = 1.5)
    hist_plot.axvline(np.median(feature_var),color='green',linestyle='--',lw = 1.5)

In [16]:
# data 불러오기
def load_data(train, test):
    train_df = pd.read_csv(train)
    test_df = pd.read_csv(test)

    train_x, train_y = dataset_split_X_y(train_df)
    cols_with_zero_variance = zero_variance(train_x) # 분산이 0 (통과 여부)
    train_x = train_x.drop(cols_with_zero_variance, axis=1)
    
    test_df = test_df.drop(cols_with_zero_variance, axis=1)

    train_x = train_x.drop(['X_10', 'X_11'], axis = 1) # 결측치가 많음 (결측치 = 0, 공지사항)
    test_df = test_df.drop(['X_10', 'X_11'], axis = 1)

    test_df = test_df.drop('ID', axis=1) 

    return train_x, train_y, test_df

# Load Data

In [17]:
utils.seed_everything(utils.Config.seed)

train_df = pd.read_csv('Data/train.csv')
test_x = pd.read_csv('Data/test.csv')
train_x, train_y = preprocessing.dataset_split_X_y(train_df)

cols_with_zero_variance = preprocessing.zero_variance(train_x) # 분산이 0 (통과 여부)
train_x = train_x.drop(cols_with_zero_variance, axis = 1)
test_x = test_x.drop(cols_with_zero_variance, axis = 1)

train_x = train_x.drop(['X_10', 'X_11'], axis = 1) # 결측치가 많음 http://localhost:8888/notebooks/20220801%20LG%20AI%20Research%20%EC%9E%90%EC%9C%A8%EC%A3%BC%ED%96%89%20%EC%84%BC%EC%84%9C%EC%9D%98%20%EC%95%88%ED%85%8C%EB%82%98%20%EC%84%B1%EB%8A%A5%20%EC%98%88%EC%B8%A1%20AI%20%EA%B2%BD%EC%A7%84%EB%8C%80%ED%9A%8C/Model_Submit/Submit1_Stacking%20ensemble_Full_Version.ipynb#(결측치 = 0, 공지사항)
test_x = test_x.drop(['X_10', 'X_11'], axis = 1)

test_x = test_x.drop('ID', axis=1)

# Model

In [160]:
# Y_01 ~ Y_14 반복을 위한 List
target = train_y.columns.tolist()

## LGBM

In [161]:
# Parameter Setting
def lgbm_objective(params, target):
    params = {
        'n_estimators': int(params['n_estimators']),
        'max_depth': int(params['max_depth']),
        'num_leaves': int(params['num_leaves']),
        'min_child_samples': int(params['min_child_samples']),
        'colsample_bytree': '{:.5f}'.format(params['colsample_bytree']),
        'subsample': '{:.5f}'.format(params['subsample']),
        'min_split_gain': '{:.5f}'.format(params['min_split_gain']),
        'scale_pos_weight': '{:.5f}'.format(params['scale_pos_weight']),
        'reg_alpha': '{:.5f}'.format(params['reg_alpha']),
        'reg_lambda': '{:.5f}'.format(params['reg_lambda']),
        'learning_rate': '{:.5f}'.format(params['learning_rate']),   
    }

    model = LGBMRegressor(
        n_jobs = -1,
        random_state = 1,
        **params
    )

    losses = np.sqrt(-cross_val_score(model, train_x, train_y[target], cv=Config.cv, scoring='neg_mean_squared_error')) # cross_val_score : 교차검증
    losses = losses / np.mean(np.abs(train_y[target]))
    print("NRMSE Loss {:.5f} params {}".format(losses.mean(), params))
    return losses.mean()

In [158]:
# Parameter Tunning
space_lgbm = {
    'n_estimators' : hp.quniform('n_estimators', 10, 30, 5), # 100, 1500,1
    'max_depth': hp.quniform('max_depth', 5, 250, 1),
    'num_leaves': hp.quniform('num_leaves', 20, 200, 5),
    'min_child_samples': hp.quniform('min_child_samples', 10, 150, 5),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.3, 1.0),
    'subsample': hp.uniform('subsample', 0.3, 1.0),
    'min_split_gain': hp.uniform('min_split_gain', 0, 0.7),
    'scale_pos_weight': hp.uniform('scale_pos_weight', 1, 10),
    'reg_alpha': hp.uniform('reg_alpha', 0, 500),
    'reg_lambda': hp.uniform('reg_lambda', 0, 500),
    'learning_rate': hp.loguniform('learning_rate', np.log(0.01), np.log(0.5)),
}

In [159]:
# Y_01 ~ Y_14 반복

best_params_lgbm= []

for idx in range(len(target)) :
    
    lgbm_objective_lambda = lambda params : lgbm_objective(params, target = target[idx])
    
    best = fmin(fn = lgbm_objective_lambda,
            space = space_lgbm,
            algo = tpe.suggest,
            max_evals = 1) # 200
    
    
    best['n_estimators'] = int(best['n_estimators'])
    best['num_leaves'] = int(best['num_leaves'])
    best['max_depth'] = int(best['max_depth'])
    best['min_child_samples'] = int(best['min_child_samples'])
    best_params_lgbm.append(best)
    print(target[idx])
    print(best)

NRMSE Loss 0.26193 params {'n_estimators': 10, 'max_depth': 203, 'num_leaves': 170, 'min_child_samples': 120, 'colsample_bytree': '0.39658', 'subsample': '0.59458', 'min_split_gain': '0.51338', 'scale_pos_weight': '1.65109', 'reg_alpha': '53.98910', 'reg_lambda': '49.41766', 'learning_rate': '0.03674'}
100%|██████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.10s/trial, best loss: 0.2619324646898002]
Y_01
{'colsample_bytree': 0.39658347909403524, 'learning_rate': 0.03673708428131537, 'max_depth': 203, 'min_child_samples': 120, 'min_split_gain': 0.5133775816035386, 'n_estimators': 10, 'num_leaves': 170, 'reg_alpha': 53.98909751244874, 'reg_lambda': 49.417658170536505, 'scale_pos_weight': 1.6510881557018906, 'subsample': 0.5945769051885113}
NRMSE Loss 0.36391 params {'n_estimators': 30, 'max_depth': 241, 'num_leaves': 60, 'min_child_samples': 80, 'colsample_bytree': '0.78958', 'subsample': '0.95996', 'min_split_gain': '0.31844', 'scale_pos_weight': '4.05843', 'reg_alp

NRMSE Loss 0.02473 params {'n_estimators': 25, 'max_depth': 40, 'num_leaves': 180, 'min_child_samples': 120, 'colsample_bytree': '0.83230', 'subsample': '0.76652', 'min_split_gain': '0.65547', 'scale_pos_weight': '1.91277', 'reg_alpha': '397.92068', 'reg_lambda': '307.19096', 'learning_rate': '0.02474'}
100%|████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.40s/trial, best loss: 0.024725736754998236]
Y_12
{'colsample_bytree': 0.8323020179353042, 'learning_rate': 0.024736956232586874, 'max_depth': 40, 'min_child_samples': 120, 'min_split_gain': 0.6554688453585785, 'n_estimators': 25, 'num_leaves': 180, 'reg_alpha': 397.9206823626536, 'reg_lambda': 307.1909564217311, 'scale_pos_weight': 1.9127717216686992, 'subsample': 0.7665181526249765}
NRMSE Loss 0.02488 params {'n_estimators': 15, 'max_depth': 68, 'num_leaves': 110, 'min_child_samples': 35, 'colsample_bytree': '0.36902', 'subsample': '0.65841', 'min_split_gain': '0.62838', 'scale_pos_weight': '9.25762', 'reg_alph

## Catboost Regressor

In [78]:
# Parameter Setting
def cat_objective(params, target):
    params = {
        'n_estimators': int(params['n_estimators']),
        'depth': int(params['depth']),
        'learning_rate': params['learning_rate'],   
        'l2_leaf_reg': params['l2_leaf_reg'],
        'max_bin': int(params['max_bin']),
        'min_data_in_leaf': int(params['min_data_in_leaf']),
        'random_strength': params['random_strength'],
        'fold_len_multiplier': params['fold_len_multiplier'],
        
    }

    model = CatBoostRegressor(
        logging_level='Silent',
        **params
    )

    losses = np.sqrt(-cross_val_score(model, train_x, train_y[target], cv=Config.cv, scoring='neg_mean_squared_error'))
    losses = losses / np.mean(np.abs(train_y[target]))
    print("NRMSE Loss {:.5f} params {}".format(losses.mean(), params))
    return losses.mean()

In [81]:
# Parameter Tunning
space_catboost = {
    'n_estimators' : hp.quniform('n_estimators', 10, 20, 5), #100, 300, 50
    'depth': hp.quniform("depth", 2, 16, 1),
    'learning_rate': hp.uniform('learning_rate', 0.01, 0.3),
    'l2_leaf_reg': hp.uniform('l2_leaf_reg', 3, 8),
    'max_bin' : hp.quniform('max_bin', 1, 254, 1),
    'min_data_in_leaf' : hp.quniform('min_data_in_leaf', 2, 700, 1),
    'random_strength' : hp.loguniform('random_strength', np.log(0.005), np.log(5)),
    'fold_len_multiplier' : hp.loguniform('fold_len_multiplier', np.log(1.01), np.log(2.5)),
}

In [82]:
# Y_01 ~ Y_14 반복

best_params_cat= []

for idx in range(len(target)) :
    
    cat_objective_lambda = lambda params : cat_objective(params, target = target[idx])
    
    best = fmin(fn = cat_objective_lambda,
            space = space_catboost,
            algo = tpe.suggest,
            verbose = 100,
            max_evals = 1)
    
    best_params_cat.append(best)
    print(target[idx])
    print(best)

NRMSE Loss 0.25774 params {'n_estimators': 20, 'depth': 3, 'learning_rate': 0.1901721693222083, 'l2_leaf_reg': 3.5202853420904185, 'max_bin': 128, 'min_data_in_leaf': 400, 'random_strength': 0.015078229232325525, 'fold_len_multiplier': 1.3446311829393718}
100%|█████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.69s/trial, best loss: 0.25773876062886714]
Y_01
{'depth': 3.0, 'fold_len_multiplier': 1.3446311829393718, 'l2_leaf_reg': 3.5202853420904185, 'learning_rate': 0.1901721693222083, 'max_bin': 128.0, 'min_data_in_leaf': 400.0, 'n_estimators': 20.0, 'random_strength': 0.015078229232325525}
NRMSE Loss 0.35981 params {'n_estimators': 15, 'depth': 4, 'learning_rate': 0.16883024869452154, 'l2_leaf_reg': 6.607539761800004, 'max_bin': 102, 'min_data_in_leaf': 534, 'random_strength': 2.45096006131398, 'fold_len_multiplier': 1.1898791666291697}
100%|████████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.58s/trial, best loss: 0.35981310462888]
Y_02
{'de

100%|████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.77s/trial, best loss: 0.024403933061224188]
Y_14
{'depth': 5.0, 'fold_len_multiplier': 1.8978300118431357, 'l2_leaf_reg': 4.248006763440869, 'learning_rate': 0.04789080066265784, 'max_bin': 175.0, 'min_data_in_leaf': 502.0, 'n_estimators': 15.0, 'random_strength': 0.009171366406988821}


## Extra Tree Regressor

In [162]:
# Y_01 ~ Y_14 반복을 위한 List
target = ['Y_01', 'Y_02', 'Y_03','Y_04','Y_05','Y_06','Y_07','Y_08','Y_09','Y_10','Y_11','Y_12','Y_13','Y_14']

In [163]:
# Parameter Setting
def extra_objective(params, target):
    params = {
        'n_estimators': int(params['n_estimators']),
        'max_depth': int(params['max_depth']),
        #'min_samples_split': int(params['min_samples_split']),
        'min_samples_leaf': int(params['min_samples_leaf']),
        'min_weight_fraction_leaf': params['min_weight_fraction_leaf'],
        'max_features': params['max_features'],
        'max_leaf_nodes': int(params['max_leaf_nodes']),
        'min_impurity_decrease': params['min_impurity_decrease'],
        'bootstrap': params['bootstrap'],
        'ccp_alpha': params['ccp_alpha'],  
    }

    model = ExtraTreesRegressor(
        n_jobs = -1,
        verbose = 0,
        random_state = 1,
        **params
    )

    losses = np.sqrt(-cross_val_score(model, train_x, train_y[target], cv=Config.cv, scoring='neg_mean_squared_error'))
    losses = losses / np.mean(np.abs(train_y[target]))
    print("NRMSE Loss {:.5f} params {}".format(losses.mean(), params))
    return losses.mean()

In [164]:
# Parameter Tunning
space_extra = {
    'n_estimators' : hp.quniform('n_estimators', 10, 20, 5), # 100, 1500, 50
    'max_depth': hp.quniform('max_depth', 3, 50, 1),
    #'min_samples_split': hp.quniform('min_samples_split', 0.5, 1, 0.5), # 1 이하여야 함
    'min_samples_leaf': hp.quniform('min_samples_leaf', 5, 50, 1),
    'min_weight_fraction_leaf': hp.uniform('min_weight_fraction_leaf', 0.01, 0.5),
    'max_features': hp.choice('max_features', ['sqrt', 'log2', None, 'auto']),
    'max_leaf_nodes': hp.quniform('max_leaf_nodes', 3, 30, 1),
    'min_impurity_decrease': hp.uniform('min_impurity_decrease', 0, 200),
    'bootstrap':  hp.choice('bootstrap', [True, False]),
    'ccp_alpha': hp.uniform('ccp_alpha', 0.01, 1.0),
}

In [165]:
# Y_01 ~ Y_14 반복

best_params_extra= []

for idx in range(len(target)) :
    
    extra_objective_lambda = lambda params : extra_objective(params, target = target[idx])
    
    best = fmin(fn = extra_objective_lambda,
            space = space_extra,
            algo = tpe.suggest,
            verbose = 100,
            max_evals = 1) #200
    
    best['n_estimators'] = int(best['n_estimators'])
    best['max_depth'] = int(best['max_depth'])
    best['max_leaf_nodes'] = int(best['max_leaf_nodes'])
    best_params_extra.append(best)
    print(target[idx])
    print(best)

NRMSE Loss 0.26302 params {'n_estimators': 10, 'max_depth': 6, 'min_samples_leaf': 11, 'min_weight_fraction_leaf': 0.3301713104021617, 'max_features': 'log2', 'max_leaf_nodes': 16, 'min_impurity_decrease': 5.086342801287125, 'bootstrap': True, 'ccp_alpha': 0.562204083307845}
100%|█████████████████████████████████████████████████| 1/1 [00:06<00:00,  6.28s/trial, best loss: 0.26301810737744125]
Y_01
{'bootstrap': 0, 'ccp_alpha': 0.562204083307845, 'max_depth': 6, 'max_features': 1, 'max_leaf_nodes': 16, 'min_impurity_decrease': 5.086342801287125, 'min_samples_leaf': 11.0, 'min_weight_fraction_leaf': 0.3301713104021617, 'n_estimators': 10}
NRMSE Loss 0.36548 params {'n_estimators': 15, 'max_depth': 29, 'min_samples_leaf': 21, 'min_weight_fraction_leaf': 0.2742311968427292, 'max_features': 'auto', 'max_leaf_nodes': 28, 'min_impurity_decrease': 68.41529138540776, 'bootstrap': False, 'ccp_alpha': 0.14102518464702204}
100%|█████████████████████████████████████████████████| 1/1 [00:01<00:00,  

100%|█████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.86s/trial, best loss: 0.02497875953096713]
Y_13
{'bootstrap': 1, 'ccp_alpha': 0.9257024580701733, 'max_depth': 11, 'max_features': 3, 'max_leaf_nodes': 18, 'min_impurity_decrease': 26.189493967836363, 'min_samples_leaf': 9.0, 'min_weight_fraction_leaf': 0.07041991083746443, 'n_estimators': 15}
NRMSE Loss 0.02500 params {'n_estimators': 15, 'max_depth': 37, 'min_samples_leaf': 21, 'min_weight_fraction_leaf': 0.1706274283513835, 'max_features': None, 'max_leaf_nodes': 7, 'min_impurity_decrease': 88.19158265380365, 'bootstrap': True, 'ccp_alpha': 0.6049504118308293}
100%|████████████████████████████████████████████████| 1/1 [00:01<00:00,  1.89s/trial, best loss: 0.024997457562067973]
Y_14
{'bootstrap': 0, 'ccp_alpha': 0.6049504118308293, 'max_depth': 37, 'max_features': 2, 'max_leaf_nodes': 7, 'min_impurity_decrease': 88.19158265380365, 'min_samples_leaf': 21.0, 'min_weight_fraction_leaf': 0.1706274283513835, 'n_

## NGBR

In [166]:
# Parameter Setting
def ngbr_objective(params, target):
    params = {
        'n_estimators': int(params['n_estimators']),
        'learning_rate': params['learning_rate'],
        'natural_gradient': params['natural_gradient'],
        'col_sample': float(params['col_sample']),
        'minibatch_frac': float(params['minibatch_frac']),
        'tol': float(params['tol']),
    }

    model = NGBRegressor(
        verbose = 100,
        random_state = 1,
        **params
    )

    losses = np.sqrt(-cross_val_score(model, train_x, train_y[target], cv=Config.cv, scoring='neg_mean_squared_error'))
    losses = losses / np.mean(np.abs(train_y[target]))
    print("NRMSE Loss {:.5f} params {}".format(losses.mean(), params))
    return losses.mean()

In [167]:
# Parameter Tunning
space_ngboost = {
    'n_estimators': hp.quniform('n_estimators', 10, 30, 10), #100, 500, 10
    'learning_rate': hp.uniform('learning_rate', 0.01, 0.3),
    'natural_gradient': hp.choice('natural_gradient', [True, False]),
    'col_sample': hp.quniform('col_sample', 0, 1, 0.01),
    'minibatch_frac': hp.quniform('minibatch_frac', 0, 1, 0.01),
    'tol': hp.uniform('tol', 1e-6, 3e-4),
}

In [168]:
# Y_01 ~ Y_14 반복

best_params_ngbr= []

for idx in range(len(target)) :
    
    ngbr_objective_lambda = lambda params : ngbr_objective(params, target = target[idx])
    
    best = fmin(fn = ngbr_objective_lambda,
            space = space_ngboost,
            algo = tpe.suggest,
            verbose = 1000,#10
            max_evals = 1) #200
    
    best['n_estimators'] = int(best['n_estimators'])
    best_params_ngbr.append(best)
    print(target[idx])
    print(best)

[iter 0] loss=0.3836 val_loss=0.0000 scale=0.2500 norm=0.6399                                                          
[iter 0] loss=0.3873 val_loss=0.0000 scale=0.2500 norm=0.6449                                                          
[iter 0] loss=0.3999 val_loss=0.0000 scale=0.2500 norm=0.6455                                                          
[iter 0] loss=0.4002 val_loss=0.0000 scale=0.2500 norm=0.6443                                                          
[iter 0] loss=0.3965 val_loss=0.0000 scale=0.2500 norm=0.6409                                                          
[iter 0] loss=0.4000 val_loss=0.0000 scale=0.2500 norm=0.6388                                                          
[iter 0] loss=0.3893 val_loss=0.0000 scale=0.2500 norm=0.6395                                                          
[iter 0] loss=0.3865 val_loss=0.0000 scale=0.2500 norm=0.6364                                                          
[iter 0] loss=0.3918 val_loss=0.0000 sca

[iter 0] loss=1.9588 val_loss=0.0000 scale=1.0000 norm=1.6477                                                          
[iter 0] loss=1.9400 val_loss=0.0000 scale=0.5000 norm=0.8370                                                          
[iter 0] loss=2.0129 val_loss=0.0000 scale=0.5000 norm=0.8591                                                          
[iter 0] loss=2.0499 val_loss=0.0000 scale=0.5000 norm=0.8705                                                          
[iter 0] loss=2.1251 val_loss=0.0000 scale=1.0000 norm=1.7705                                                          
[iter 0] loss=2.1251 val_loss=0.0000 scale=1.0000 norm=1.7688                                                          
[iter 0] loss=2.1241 val_loss=0.0000 scale=1.0000 norm=1.7719                                                          
[iter 0] loss=2.1243 val_loss=0.0000 scale=1.0000 norm=1.7722                                                          
[iter 0] loss=2.1254 val_loss=0.0000 sca

100%|█████████████████████████████████████████████████| 1/1 [00:06<00:00,  6.71s/trial, best loss: 0.03968090179749164]
Y_10
{'col_sample': 0.25, 'learning_rate': 0.18575424412705777, 'minibatch_frac': 0.45, 'n_estimators': 10, 'natural_gradient': 1, 'tol': 8.375247761925755e-05}
[iter 0] loss=1.2248 val_loss=0.0000 scale=1.0000 norm=1.5014                                                          
[iter 0] loss=1.2342 val_loss=0.0000 scale=1.0000 norm=1.4935                                                          
[iter 0] loss=1.2324 val_loss=0.0000 scale=1.0000 norm=1.4954                                                          
[iter 0] loss=1.2322 val_loss=0.0000 scale=1.0000 norm=1.4947                                                          
[iter 0] loss=1.2353 val_loss=0.0000 scale=1.0000 norm=1.4919                                                          
[iter 0] loss=1.2381 val_loss=0.0000 scale=1.0000 norm=1.4871                                                          

## Stacking Ensenble
- 모델별 개별학습(타겟 Y_01~Y_14) 반복

In [228]:
# Y_01 ~ Y_14 반복을 위한 List
target = train_y.columns.tolist()

In [229]:
def get_stacking_base_datasets(model, train_x, train_y, col,test, params):
    kf = KFold(n_splits=2, shuffle=False)
    train_fold_pred = np.zeros((train_x.shape[0],1))
    test_pred = np.zeros((test.shape[0],10))
    
    
    for folder_counter, (train_index, valid_index) in enumerate(kf.split(train_x)):
        print('Fold : ', folder_counter, ' Start')
        X_tr = train_x.loc[train_index]
        y_tr = train_y[col].loc[train_index]
        X_te = train_x.loc[valid_index] 
        
        if model == 'cat':
          model = CatBoostRegressor(random_state=1,
                                    **params)
        
        elif model == 'extra':
          model = ExtraTreesRegressor(random_state=1, 
                                      **params)

        elif model == 'ngbr':
          model = NGBRegressor(random_state = 1)
        
        elif model == 'lgbm':
          model = LGBMRegressor(random_state=1, n_jobs=-1, 
                                **params)

        model.fit(X_tr, y_tr)
        train_fold_pred[valid_index, :] = model.predict(X_te).reshape(-1,1) 
        test_pred[:, folder_counter] = model.predict(test) 
        
    test_pred_mean = np.mean(test_pred, axis=1).reshape(-1,1)
    
    return train_fold_pred, test_pred_mean 

In [236]:
# Y_01 ~ Y_14 반복
target = train_y.columns.tolist()
print('Outside target: ', target)
for idx in range(len(target)) :
    print('Before target : ', target)
    print('index :', idx)
    print('After target: ', target)
    stack_final = []
    
    # Y_01 ~ Y_14의 best parameter 가져오기
    # xx_train, xx_test = get_stacking_base_datasets('cat', train_x, train_y, col=target[idx[, test=test_x, params = best_params_cat[idx])
    zz_train, zz_test = get_stacking_base_datasets('lgbm', train_x, train_y, col=target[idx], test=test_x, params = best_params_lgbm[idx])
    qq_train, qq_test = get_stacking_base_datasets('ngbr', train_x, train_y, col=target[idx], test=test_x, params = best_params_ngbr[idx])
    yy_train, yy_test = get_stacking_base_datasets('extra', train_x, train_y, col=target[idx], test=test_x, params = best_params_extra[idx])
    
    Stack_final_X_train = np.concatenate((xx_train, yy_train, zz_train, qq_train), axis=1)
    Stack_final_X_test = np.concatenate((xx_test, yy_test, zz_test, qq_test), axis=1)
    
    # final model 선택
    lr_final = LGBMRegressor(**best_params_lgbm[idx])
    lr_final.fit(Stack_final_X_train, train_y[target[idx]])
    stack_final_csv = lr_final.predict(Stack_final_X_test)
    stack_final.append(stack_final_csv)

Outside target:  ['Y_01', 'Y_02', 'Y_03', 'Y_04', 'Y_05', 'Y_06', 'Y_07', 'Y_08', 'Y_09', 'Y_10', 'Y_11', 'Y_12', 'Y_13', 'Y_14']
Before target :  ['Y_01', 'Y_02', 'Y_03', 'Y_04', 'Y_05', 'Y_06', 'Y_07', 'Y_08', 'Y_09', 'Y_10', 'Y_11', 'Y_12', 'Y_13', 'Y_14']
index : 0
After target:  ['Y_01', 'Y_02', 'Y_03', 'Y_04', 'Y_05', 'Y_06', 'Y_07', 'Y_08', 'Y_09', 'Y_10', 'Y_11', 'Y_12', 'Y_13', 'Y_14']
Fold :  0  Start
Fold :  1  Start
Fold :  0  Start
[iter 0] loss=0.3653 val_loss=0.0000 scale=1.0000 norm=0.6084
[iter 100] loss=0.3336 val_loss=0.0000 scale=1.0000 norm=0.5934
[iter 200] loss=0.3180 val_loss=0.0000 scale=1.0000 norm=0.5931
[iter 300] loss=0.3082 val_loss=0.0000 scale=1.0000 norm=0.5924
[iter 400] loss=0.3000 val_loss=0.0000 scale=2.0000 norm=1.1830
Fold :  1  Start
[iter 0] loss=0.4208 val_loss=0.0000 scale=1.0000 norm=0.6925
[iter 100] loss=0.3824 val_loss=0.0000 scale=1.0000 norm=0.6345
[iter 200] loss=0.3697 val_loss=0.0000 scale=1.0000 norm=0.6211
[iter 300] loss=0.3608 val

KeyboardInterrupt: 

In [None]:
sub = pd.read_csv('Data/sample_submission.csv')

for idx in range(len(target)) :
    
    sub[target[idx]] = stack_final[idx]

sub.to_csv('Data/stack_Ensenble.csv', index=False)

## Data Save(to CSV)

In [None]:
sub = pd.read_csv('Data/sample_submission.csv')
sub['Y_01'] = stack_final1
sub['Y_02'] = stack_final2
sub['Y_03'] = stack_final3
sub['Y_04'] = stack_final4
sub['Y_05'] = stack_final5
sub['Y_06'] = stack_final6
sub['Y_07'] = stack_final7
sub['Y_08'] = stack_final8
sub['Y_09'] = stack_final9
sub['Y_10'] = stack_final10
sub['Y_11'] = stack_final11
sub['Y_12'] = stack_final12
sub['Y_13'] = stack_final13
sub['Y_14'] = stack_final14
sub.to_csv('Data/stack_Ensenble.csv', index=False)