### Import Libraries

In [None]:
import pandas as pd
import random
import os
import numpy as np
import time
import seaborn as sns
import matplotlib.pyplot as plt
from functools import partial
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from hyperopt import fmin, hp, tpe, Trials, STATUS_OK
from hyperopt.pyll.base import scope
from hyperopt.pyll.stochastic import sample
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer

import math
from sklearn.linear_model import LinearRegression
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

import statsmodels.formula.api as sm
from statsmodels.api import add_constant
import statsmodels.api as sm

from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFromModel

  import pandas.util.testing as tm


In [None]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
seed_everything(42) # Seed 고정

### Usable Functions 

In [None]:
def dataset_split_X_y(df):    
    """
    @Description: split data into features and labels
    @Param: df, pandas dataframe with columns starting with X for features and Y for labels
    @Return: features and labels in pandas dataframes
    """
    xs = df.filter(regex='X') # Input : X Feature
    ys = df.filter(regex='Y') # Output : Y Feature
    return xs, ys

In [None]:
def check_for_NAs(df, show=False):
    """
    @Description: checks for the NAs in the dataframe
    @Param1: df, pandas dataframe
    @Param2: show, boolean indicating whether NaN data are also necessary as a part of the output
    @Return: name of the columns with NaN
    """
    nan_values = df.loc[:, df.isnull().any()]
    if show:
        return df[df.isna().any(axis=1)]
    return list(nan_values.columns)

In [None]:
def check_for_label_bound(df, labels, bound):
    """
    @Description: check bound is inbetween min and max
    @Param1: df, pandas dataframe
    @Param2: labels, list of column names 
    @Param3: thres: list of bounds
    @Return: names of the columns not within the bound
    """
    n = len(labels)
    result = []
    for idx in range(n):
        col = labels[idx]
        thres = bound[idx]
        extracted_column = df[col]
        if not extracted_column.between(thres[0], thres[1]).all():
            result.append(labels[idx])
    if len(result) == 0:
        print('everything is within the bound')
    return result

In [None]:
def zero_variance(df):
    """
    @Description: check for zero_variance
    @Param1: df, pandas dataframe
    @Return: names of the columns with zero variance
    """
    result = []
    for col in df.columns:
        if df[col].var() == 0:
            result.append(col)
    return result

In [None]:
def get_top_correlation(df, n=10):
    """
    @Description: print out top correlated features
    @Param1: df, pandas dataframe
    @Param2: n, number of lines to print 
    @Return: pandas series
    """
    pairs = set()
    for idx1 in range(0, df.shape[1]):
        for idx2 in range(0, idx1+1):
            pairs.add((df.columns[idx1], df.columns[idx2]))
    corr = df.corr().abs().unstack()
    corr = corr.drop(labels=pairs).sort_values(ascending=False)
    return corr[0:n]

In [None]:
def adjacent_histogram_boxplot(feature_var, figsize = (7, 5)):
    """
    @Description: plot histogram and boxplot in next to each other
    @Param1: feature_var, pandas series 
    @Param2: figsize, size of the figure 
    """
    fig, (hist_plot, box_plot) = plt.subplots(nrows=2, sharex=True, gridspec_kw={'height_ratios':(.85,.15)}, figsize=figsize)
    sns.distplot(feature_var, kde=True, ax=hist_plot, kde_kws= {"linewidth":1.5}) 
    sns.boxplot(feature_var, ax=box_plot, linewidth = 1, width = 0.5)
    hist_plot.set_ylabel('')    
    hist_plot.set_xlabel('')
    box_plot.set_xlabel('')
    hist_plot.tick_params(labelsize=8)
    box_plot.tick_params(labelsize=8)
    fig.suptitle(feature_var.name, fontsize = 10)
    hist_plot.axvline(np.mean(feature_var),color='red',linestyle='-',lw = 1.5)
    hist_plot.axvline(np.median(feature_var),color='green',linestyle='--',lw = 1.5)
    

In [None]:
def lg_nrmse(gt, preds):
    """
    @Description: Metric used in this project
    @Params1: gt, pandas dataframe
    @Param2: preds, pandas dataframe
    @Return: nrmse score
    """
    # 각 Y Feature별 NRMSE 총합
    # Y_01 ~ Y_08 까지 20% 가중치 부여
    preds = pd.DataFrame(preds)
    all_nrmse = []
    for idx in range(0,14):
        rmse = mean_squared_error(gt.iloc[:,idx], preds.iloc[:,idx], squared=False)
        nrmse = rmse/np.mean(np.abs(gt.iloc[:,idx]))
        all_nrmse.append(nrmse)
    score = 1.2 * np.sum(all_nrmse[:8]) + 1.0 * np.sum(all_nrmse[8:15])
    return score

In [None]:
def rmse(gt, preds):
    """
    @Description: Metric used in this project
    @Params1: gt, pandas dataframe
    @Param2: preds, pandas dataframe
    @Return: rmse score
    """
    MSE = np.square(np.subtract(gt,preds)).mean() 
    RMSE = math.sqrt(MSE)
    return RMSE

In [None]:
def dataset_split_X_y(df):    
    """
    @Description: split data into features and labels
    @Param: df, pandas dataframe with columns starting with X for features and Y for labels
    @Return: features and labels in pandas dataframes
    """
    xs = df.filter(regex='X') # Input : X Feature
    ys = df.filter(regex='Y') # Output : Y Feature
    return xs, ys

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
train_df = pd.read_csv('/content/drive/MyDrive/LG AIMERS/Data/train.csv')
test_x = pd.read_csv('/content/drive/MyDrive/LG AIMERS/Data/test.csv')
train_x, train_y = dataset_split_X_y(train_df)
cols_with_zero_variance = zero_variance(train_x)
train_x = train_x.drop(cols_with_zero_variance, axis = 1)
test_x = test_x.drop(cols_with_zero_variance, axis = 1)

ys = ['Y_01', 'Y_02', 'Y_03', 'Y_04', 'Y_05', 
      'Y_06', 'Y_07', 'Y_08', 'Y_09', 'Y_10', 
      'Y_11', 'Y_12', 'Y_13', 'Y_14']
ys_bounds = [[0.2, 2], [0.2, 2.1], [0.2, 2.1], 
             [7, 19], [22, 36.5], [-19.2, 19], 
             [2.4, 4], [-29.2, -24], [-29.2, -24],
             [-30.6, -20], [19.6, 26.6], [-29.2, -24],
             [-29.2, -24], [-29.2, -24]]

In [None]:
def randomforest(data, target, significance_level=0.05):
   
    results = []
    rfg = RandomForestRegressor(n_estimators=300, random_state=0, n_jobs=-1)
    feature_names = data.columns
    rfg.fit(data, target)
    for feature, score in zip(feature_names, rfg.feature_importances_):
        if score < np.mean(rfg.feature_importances_):
            results.append(feature)
            print('Add {0} with importances {1}'.format(results[-1], score))
    return results

results = []
rfg = RandomForestRegressor(n_estimators=100, random_state=0, n_jobs=-1)
feature_names = train_x.columns
rfg.fit(train_x, train_y)
for feature, score in zip(feature_names, rfg.feature_importances_):
    if score < np.mean(rfg.feature_importances_):
        results.append(feature)
        
train_x = train_x.drop(results, axis = 1)
test_x = test_x.drop(results, axis = 1)

In [None]:
c = randomforest(train_x, train_y['Y_01'])

Add X_01 with importances 0.01245000288477721
Add X_02 with importances 0.0025628592516788937
Add X_06 with importances 0.01154380587598866
Add X_10 with importances 3.9324542702187516e-05
Add X_11 with importances 3.734773767361109e-05
Add X_12 with importances 0.012487157808772957
Add X_13 with importances 0.014202677349582556
Add X_14 with importances 0.010837569002242808
Add X_15 with importances 0.011314470378828238
Add X_16 with importances 0.013818411324905936
Add X_17 with importances 0.011156176964059569
Add X_18 with importances 0.011715529722404762
Add X_24 with importances 0.011226256404550713
Add X_25 with importances 0.012007338831116455
Add X_26 with importances 0.012381738061369154
Add X_27 with importances 0.013218174855795625
Add X_28 with importances 0.013299005014523605
Add X_29 with importances 0.013733120134394561
Add X_30 with importances 0.010451917688336749
Add X_31 with importances 0.01743025233948091
Add X_32 with importances 0.009146866968648056
Add X_33 wit

In [None]:
c1 = randomforest(train_x, train_y['Y_02'])

Add X_01 with importances 0.012615563987862082
Add X_02 with importances 0.002460209866888607
Add X_06 with importances 0.011394433074745258
Add X_10 with importances 3.910730848084107e-05
Add X_11 with importances 5.474704145283462e-05
Add X_12 with importances 0.012638570964467417
Add X_13 with importances 0.01395210804363875
Add X_14 with importances 0.01090216517718437
Add X_15 with importances 0.011071273710437939
Add X_16 with importances 0.014619569742510495
Add X_17 with importances 0.01053051126150058
Add X_18 with importances 0.011525268662298215
Add X_24 with importances 0.011141585673508226
Add X_25 with importances 0.012102308838966493
Add X_26 with importances 0.011695986960323155
Add X_27 with importances 0.013953823274845336
Add X_28 with importances 0.01328677562726545
Add X_29 with importances 0.013888690648523614
Add X_30 with importances 0.01054683938022205
Add X_31 with importances 0.016822411553657955
Add X_32 with importances 0.009637588760561264
Add X_33 with im

In [None]:
c2 = randomforest(train_x, train_y['Y_03'])

Add X_01 with importances 0.01263230441550626
Add X_02 with importances 0.002572160566606318
Add X_06 with importances 0.011289848063343369
Add X_10 with importances 1.7643640864117383e-05
Add X_11 with importances 2.332669244112006e-05
Add X_12 with importances 0.012347158875194443
Add X_13 with importances 0.014286564174403434
Add X_14 with importances 0.010730960385463263
Add X_15 with importances 0.011456368173764057
Add X_16 with importances 0.014143783731400102
Add X_17 with importances 0.01060861446767299
Add X_18 with importances 0.010695125600220274
Add X_24 with importances 0.011686107402487272
Add X_25 with importances 0.012587156453110123
Add X_26 with importances 0.013060150889688045
Add X_27 with importances 0.013996330443885737
Add X_28 with importances 0.014189036353270368
Add X_29 with importances 0.014190857923489463
Add X_30 with importances 0.010764332921713634
Add X_31 with importances 0.01773743171502607
Add X_32 with importances 0.009896171766361553
Add X_38 with

In [None]:
c3 = randomforest(train_x, train_y['Y_04'])

Add X_01 with importances 0.01376657301024169
Add X_02 with importances 0.002397458110692369
Add X_06 with importances 0.01054164965351651
Add X_10 with importances 0.00011310301035007127
Add X_11 with importances 5.963209646440389e-05
Add X_12 with importances 0.013237800757802893
Add X_13 with importances 0.01540844135970036
Add X_14 with importances 0.011636391734598859
Add X_15 with importances 0.011379511865530146
Add X_16 with importances 0.01322235815295064
Add X_17 with importances 0.01048687855684486
Add X_18 with importances 0.009647735797332852
Add X_24 with importances 0.0109623739483436
Add X_25 with importances 0.011670838579466585
Add X_26 with importances 0.01239777222972077
Add X_27 with importances 0.01250081167666636
Add X_28 with importances 0.014392833487154836
Add X_29 with importances 0.016176678637225012
Add X_30 with importances 0.011828065410302449
Add X_31 with importances 0.016482725566317795
Add X_33 with importances 0.016781814642283354
Add X_36 with impor

In [None]:
c4 = randomforest(train_x, train_y['Y_05'])

Add X_01 with importances 0.013272845788352031
Add X_02 with importances 0.0025165424351917386
Add X_06 with importances 0.011280495116635822
Add X_10 with importances 0.0003614449430324953
Add X_11 with importances 0.00011048434968032068
Add X_12 with importances 0.01394902026801178
Add X_13 with importances 0.01563383495864782
Add X_14 with importances 0.01135554672698857
Add X_15 with importances 0.011977867110328045
Add X_16 with importances 0.014230112005194613
Add X_17 with importances 0.01104929414029547
Add X_18 with importances 0.010886068702584004
Add X_24 with importances 0.011464262825215728
Add X_25 with importances 0.011987154824578274
Add X_26 with importances 0.012800414532255859
Add X_27 with importances 0.013017130497744644
Add X_28 with importances 0.013766649065133056
Add X_29 with importances 0.013742774934301781
Add X_30 with importances 0.010636926993914383
Add X_31 with importances 0.017060055580508274
Add X_32 with importances 0.011240118178907351
Add X_33 with

In [None]:
c5 = randomforest(train_x, train_y['Y_06'])

Add X_01 with importances 0.012493331611818178
Add X_02 with importances 0.0022126944317238063
Add X_06 with importances 0.010383093320728575
Add X_10 with importances 0.00012852136023725413
Add X_11 with importances 6.529409870547312e-05
Add X_12 with importances 0.007383341274197827
Add X_13 with importances 0.01368366594788132
Add X_14 with importances 0.0140643004397642
Add X_15 with importances 0.012825107536653472
Add X_16 with importances 0.014511821970092929
Add X_17 with importances 0.009341391317427067
Add X_18 with importances 0.006366665041406453
Add X_20 with importances 0.017645237953312874
Add X_22 with importances 0.01707065250269406
Add X_24 with importances 0.009095368570980398
Add X_25 with importances 0.009706541345031084
Add X_26 with importances 0.012426607473244266
Add X_27 with importances 0.013802542157487337
Add X_28 with importances 0.01103757663951131
Add X_29 with importances 0.010923349145761672
Add X_30 with importances 0.009403710743198795
Add X_31 with 

In [None]:
c6 = randomforest(train_x, train_y['Y_07'])

Add X_01 with importances 0.012929978695922429
Add X_02 with importances 0.0024969007051968803
Add X_06 with importances 0.0120319318154422
Add X_10 with importances 0.00021358420155208094
Add X_11 with importances 0.0001902828112009901
Add X_12 with importances 0.012802654549775213
Add X_13 with importances 0.016845673010682922
Add X_14 with importances 0.010705882984816098
Add X_15 with importances 0.011000304088361196
Add X_16 with importances 0.01429347857539222
Add X_17 with importances 0.010662724806904582
Add X_18 with importances 0.009928405137903348
Add X_24 with importances 0.011414226479638969
Add X_25 with importances 0.011694847786296987
Add X_26 with importances 0.012887412073440454
Add X_27 with importances 0.01327822851282106
Add X_28 with importances 0.012877345572408402
Add X_29 with importances 0.014363861094196174
Add X_30 with importances 0.01068084942682043
Add X_31 with importances 0.017504045582609845
Add X_32 with importances 0.012123410236815478
Add X_33 with 

In [None]:
c7 = randomforest(train_x, train_y['Y_08'])

Add X_01 with importances 0.012259204353501894
Add X_02 with importances 0.002303231020018249
Add X_06 with importances 0.011324445539692413
Add X_10 with importances 0.000721886754711759
Add X_11 with importances 0.00042743579929581995
Add X_12 with importances 0.012845288284351773
Add X_13 with importances 0.012944639855093702
Add X_14 with importances 0.010756179259046263
Add X_15 with importances 0.010689091525266084
Add X_16 with importances 0.014033516714980805
Add X_17 with importances 0.010131252341244923
Add X_18 with importances 0.01006570536332329
Add X_24 with importances 0.01120777188407022
Add X_25 with importances 0.011819459704716989
Add X_26 with importances 0.011962650289760728
Add X_27 with importances 0.01260925326709378
Add X_28 with importances 0.012927386830939077
Add X_29 with importances 0.013163999609803665
Add X_30 with importances 0.015176138746685575
Add X_31 with importances 0.016335000386840805
Add X_33 with importances 0.01748792559113626
Add X_34 with i

In [None]:
c8 = randomforest(train_x, train_y['Y_09'])

Add X_01 with importances 0.012316513968299766
Add X_02 with importances 0.0023043893703214298
Add X_06 with importances 0.011285395830022962
Add X_10 with importances 0.0007083154763853559
Add X_11 with importances 0.0003182733588021092
Add X_12 with importances 0.012819462407845985
Add X_13 with importances 0.01319770698650505
Add X_14 with importances 0.010830174944422413
Add X_15 with importances 0.010635953073439099
Add X_16 with importances 0.013757989079755976
Add X_17 with importances 0.010424183586450341
Add X_18 with importances 0.009639878831021895
Add X_24 with importances 0.011227949999230996
Add X_25 with importances 0.012022536415571074
Add X_26 with importances 0.01202822430481856
Add X_27 with importances 0.012889584620582099
Add X_28 with importances 0.0129207655601927
Add X_29 with importances 0.013719565051189219
Add X_30 with importances 0.016115598747213952
Add X_31 with importances 0.01608634814345197
Add X_33 with importances 0.017191892886104933
Add X_34 with i

In [None]:
c9 = randomforest(train_x, train_y['Y_10'])

Add X_01 with importances 0.012496385105451728
Add X_02 with importances 0.002085952284477409
Add X_06 with importances 0.01086991910381641
Add X_10 with importances 0.0006748106010617073
Add X_11 with importances 0.0001982712108797771
Add X_12 with importances 0.010954503943940836
Add X_13 with importances 0.012190419003014635
Add X_14 with importances 0.01282285447761473
Add X_15 with importances 0.010956495926932642
Add X_16 with importances 0.014786646256214226
Add X_17 with importances 0.009856626058360579
Add X_18 with importances 0.013686316169358234
Add X_24 with importances 0.010667846500570014
Add X_25 with importances 0.011692733458795911
Add X_26 with importances 0.01148877080099819
Add X_27 with importances 0.012799090755924027
Add X_28 with importances 0.011503699324345999
Add X_29 with importances 0.012409476937370933
Add X_30 with importances 0.010906415545365644
Add X_31 with importances 0.014883304961731111
Add X_32 with importances 0.01025370980142332
Add X_33 with i

In [None]:
c10 = randomforest(train_x, train_y['Y_11'])

Add X_01 with importances 0.01347368226639672
Add X_02 with importances 0.002441841960644487
Add X_06 with importances 0.012106404777667905
Add X_10 with importances 0.0007762322034743624
Add X_11 with importances 0.00029001071839188303
Add X_12 with importances 0.012792028926486112
Add X_13 with importances 0.013268050767554156
Add X_14 with importances 0.011252094315725785
Add X_15 with importances 0.011559029750861484
Add X_16 with importances 0.01408699358786535
Add X_17 with importances 0.01190343708133048
Add X_18 with importances 0.010446286887645533
Add X_24 with importances 0.012065324907296358
Add X_25 with importances 0.01209743651294949
Add X_26 with importances 0.012027683240631193
Add X_27 with importances 0.012951456941854959
Add X_28 with importances 0.013553965443749424
Add X_29 with importances 0.014096802039121692
Add X_30 with importances 0.010996706616836717
Add X_31 with importances 0.017276454954148954
Add X_32 with importances 0.013386620641381583
Add X_33 with 

In [None]:
c11 = randomforest(train_x, train_y['Y_12'])

Add X_01 with importances 0.01244513553893562
Add X_02 with importances 0.002275624414385845
Add X_06 with importances 0.01113508794729718
Add X_10 with importances 0.0007311396421055812
Add X_11 with importances 0.0005002110233057915
Add X_12 with importances 0.013086691900787368
Add X_13 with importances 0.013124720347023014
Add X_14 with importances 0.010831638779374432
Add X_15 with importances 0.010889145145056052
Add X_16 with importances 0.01380102226339367
Add X_17 with importances 0.010250905890490412
Add X_18 with importances 0.009807318727502653
Add X_24 with importances 0.011325968193850497
Add X_25 with importances 0.011978023347945418
Add X_26 with importances 0.012001366557823055
Add X_27 with importances 0.012357666976973996
Add X_28 with importances 0.01273007715335945
Add X_29 with importances 0.013343683727680246
Add X_30 with importances 0.01706652868330419
Add X_31 with importances 0.01634428716634436
Add X_33 with importances 0.017739422689821205
Add X_34 with imp

In [None]:
c12 = randomforest(train_x, train_y['Y_13'])

Add X_01 with importances 0.01230703553762008
Add X_02 with importances 0.002283098225751027
Add X_06 with importances 0.011246418854090647
Add X_10 with importances 0.0008015981902647291
Add X_11 with importances 0.0004697735268785984
Add X_12 with importances 0.012795440887068825
Add X_13 with importances 0.013206408467811922
Add X_14 with importances 0.010836484851906596
Add X_15 with importances 0.010702457650362512
Add X_16 with importances 0.013653727536070005
Add X_17 with importances 0.01018526372192157
Add X_18 with importances 0.009860503982919756
Add X_24 with importances 0.011283766098715122
Add X_25 with importances 0.011596225270091642
Add X_26 with importances 0.01201608155899099
Add X_27 with importances 0.012686915778237252
Add X_28 with importances 0.012797520238420429
Add X_29 with importances 0.01347708755140643
Add X_30 with importances 0.014660420307066397
Add X_31 with importances 0.016261879642158014
Add X_33 with importances 0.017308316247033044
Add X_34 with i

In [None]:
c13 = randomforest(train_x, train_y['Y_14'])

Add X_01 with importances 0.012245062359205122
Add X_02 with importances 0.002240706013047394
Add X_06 with importances 0.011416886099604092
Add X_10 with importances 0.0007396788931235352
Add X_11 with importances 0.00039293841213766597
Add X_12 with importances 0.012780993211948729
Add X_13 with importances 0.013160180798809162
Add X_14 with importances 0.010858541008096332
Add X_15 with importances 0.010758485437333414
Add X_16 with importances 0.013625558363780073
Add X_17 with importances 0.01046236738322703
Add X_18 with importances 0.009941503653944205
Add X_24 with importances 0.011172037710221516
Add X_25 with importances 0.011921755455348489
Add X_26 with importances 0.012089786612042701
Add X_27 with importances 0.012791432030657947
Add X_28 with importances 0.012894763243226633
Add X_29 with importances 0.01346032179792741
Add X_30 with importances 0.015394745757275959
Add X_31 with importances 0.016294121569461632
Add X_33 with importances 0.01785391873881103
Add X_34 with

### Train

In [None]:
train_x_for_y01, test_x_for_y01 = train_x[c], test_x[c]
train_y_01 = train_y['Y_01']

In [None]:
def objective(params):

    params = {
        'n_estimators': int(params['n_estimators']),
        'max_depth': int(params['max_depth']),
        'num_leaves': int(params['num_leaves']),
        'min_child_samples': int(params['min_child_samples']),
        'colsample_bytree': '{:.3f}'.format(params['colsample_bytree']),
        'subsample': '{:.3f}'.format(params['subsample']),
        'min_split_gain': '{:.3f}'.format(params['min_split_gain']),
        'scale_pos_weight': '{:.3f}'.format(params['scale_pos_weight']),
        'reg_alpha': '{:.3f}'.format(params['reg_alpha']),
        'reg_lambda': '{:.3f}'.format(params['reg_lambda']),
        'learning_rate': '{:.3f}'.format(params['learning_rate']),
        
    }
    
    model = LGBMRegressor(
        n_jobs = -1,
        random_state = 42,
        **params
    )
    
    loss = -cross_val_score(model, train_x_for_y01, train_y_01, cv=10, scoring=make_scorer(rmse, greater_is_better=False)).mean()
    print("RMSE Loss {:.3f} params {}".format(loss, params))
    return loss

In [None]:
space = {
    'n_estimators' : hp.quniform('n_estimators', 100, 1500, 50),
    'max_depth': hp.quniform('max_depth', 3, 100, 1),
    'num_leaves': hp.quniform('num_leaves', 20, 100, 10),
    'min_child_samples': hp.quniform('min_child_samples', 10, 300, 10),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.3, 1.0),
    'subsample': hp.uniform('subsample', 0.3, 1.0),
    'min_split_gain': hp.uniform('min_split_gain', 0, 0.7),
    'scale_pos_weight': hp.uniform('scale_pos_weight', 1, 10),
    'reg_alpha': hp.uniform('reg_alpha', 0, 100),
    'reg_lambda': hp.uniform('reg_lambda', 0, 100),
    'learning_rate': hp.loguniform('learning_rate', np.log(0.01), np.log(0.5)),
}

# rstate = np.random.RandomState(42)
best_y01 = fmin(fn = objective,
            space = space,
            algo = tpe.suggest,
            max_evals = 100)

RMSE Loss 0.354 params {'n_estimators': 1050, 'max_depth': 14, 'num_leaves': 70, 'min_child_samples': 200, 'colsample_bytree': '0.972', 'subsample': '0.921', 'min_split_gain': '0.503', 'scale_pos_weight': '3.918', 'reg_alpha': '63.004', 'reg_lambda': '80.080', 'learning_rate': '0.150'}
RMSE Loss 0.352 params {'n_estimators': 600, 'max_depth': 7, 'num_leaves': 40, 'min_child_samples': 230, 'colsample_bytree': '0.374', 'subsample': '0.801', 'min_split_gain': '0.173', 'scale_pos_weight': '2.106', 'reg_alpha': '17.288', 'reg_lambda': '93.451', 'learning_rate': '0.056'}
RMSE Loss 0.354 params {'n_estimators': 550, 'max_depth': 96, 'num_leaves': 30, 'min_child_samples': 110, 'colsample_bytree': '0.610', 'subsample': '0.688', 'min_split_gain': '0.358', 'scale_pos_weight': '9.435', 'reg_alpha': '53.774', 'reg_lambda': '72.148', 'learning_rate': '0.174'}
RMSE Loss 0.353 params {'n_estimators': 1150, 'max_depth': 53, 'num_leaves': 40, 'min_child_samples': 110, 'colsample_bytree': '0.377', 'subsa

In [None]:
train_x_for_y02, test_x_for_y02 = train_x[c1], test_x[c1]
train_y_02 = train_y['Y_02']

In [None]:
def objective(params):

    params = {
        'n_estimators': int(params['n_estimators']),
        'max_depth': int(params['max_depth']),
        'num_leaves': int(params['num_leaves']),
        'min_child_samples': int(params['min_child_samples']),
        'colsample_bytree': '{:.3f}'.format(params['colsample_bytree']),
        'subsample': '{:.3f}'.format(params['subsample']),
        'min_split_gain': '{:.3f}'.format(params['min_split_gain']),
        'scale_pos_weight': '{:.3f}'.format(params['scale_pos_weight']),
        'reg_alpha': '{:.3f}'.format(params['reg_alpha']),
        'reg_lambda': '{:.3f}'.format(params['reg_lambda']),
        'learning_rate': '{:.3f}'.format(params['learning_rate']),
        
    }
    
    model = LGBMRegressor(
        n_jobs = -1,
        random_state = 42,
        **params
    )
    
    loss = -cross_val_score(model, train_x_for_y02, train_y_02, cv=10, scoring=make_scorer(rmse, greater_is_better=False)).mean()
    print("RMSE Loss {:.3f} params {}".format(loss, params))
    return loss

In [None]:
space = {
    'n_estimators' : hp.quniform('n_estimators', 100, 1500, 50),
    'max_depth': hp.quniform('max_depth', 3, 100, 1),
    'num_leaves': hp.quniform('num_leaves', 20, 100, 10),
    'min_child_samples': hp.quniform('min_child_samples', 10, 300, 10),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.3, 1.0),
    'subsample': hp.uniform('subsample', 0.3, 1.0),
    'min_split_gain': hp.uniform('min_split_gain', 0, 0.7),
    'scale_pos_weight': hp.uniform('scale_pos_weight', 1, 10),
    'reg_alpha': hp.uniform('reg_alpha', 0, 100),
    'reg_lambda': hp.uniform('reg_lambda', 0, 100),
    'learning_rate': hp.loguniform('learning_rate', np.log(0.01), np.log(0.5)),
}

# rstate = np.random.RandomState(42)
best_y02 = fmin(fn = objective,
            space = space,
            algo = tpe.suggest,
            max_evals = 100)

RMSE Loss 0.381 params {'n_estimators': 1150, 'max_depth': 38, 'num_leaves': 70, 'min_child_samples': 130, 'colsample_bytree': '0.834', 'subsample': '0.981', 'min_split_gain': '0.146', 'scale_pos_weight': '3.068', 'reg_alpha': '15.109', 'reg_lambda': '1.511', 'learning_rate': '0.119'}
RMSE Loss 0.383 params {'n_estimators': 400, 'max_depth': 95, 'num_leaves': 50, 'min_child_samples': 260, 'colsample_bytree': '0.792', 'subsample': '0.511', 'min_split_gain': '0.139', 'scale_pos_weight': '6.762', 'reg_alpha': '94.590', 'reg_lambda': '44.095', 'learning_rate': '0.272'}
RMSE Loss 0.381 params {'n_estimators': 1400, 'max_depth': 76, 'num_leaves': 50, 'min_child_samples': 280, 'colsample_bytree': '0.695', 'subsample': '0.758', 'min_split_gain': '0.246', 'scale_pos_weight': '5.183', 'reg_alpha': '18.175', 'reg_lambda': '69.668', 'learning_rate': '0.153'}
RMSE Loss 0.382 params {'n_estimators': 150, 'max_depth': 78, 'num_leaves': 60, 'min_child_samples': 70, 'colsample_bytree': '0.455', 'subsam

In [None]:
train_x_for_y03, test_x_for_y03 = train_x[c2], test_x[c2]
train_y_03 = train_y['Y_03']

In [None]:
def objective(params):

    params = {
        'n_estimators': int(params['n_estimators']),
        'max_depth': int(params['max_depth']),
        'num_leaves': int(params['num_leaves']),
        'min_child_samples': int(params['min_child_samples']),
        'colsample_bytree': '{:.3f}'.format(params['colsample_bytree']),
        'subsample': '{:.3f}'.format(params['subsample']),
        'min_split_gain': '{:.3f}'.format(params['min_split_gain']),
        'scale_pos_weight': '{:.3f}'.format(params['scale_pos_weight']),
        'reg_alpha': '{:.3f}'.format(params['reg_alpha']),
        'reg_lambda': '{:.3f}'.format(params['reg_lambda']),
        'learning_rate': '{:.3f}'.format(params['learning_rate']),
        
    }
    
    model = LGBMRegressor(
        n_jobs = -1,
        random_state = 42,
        **params
    )
    
    loss = -cross_val_score(model, train_x_for_y03, train_y_03, cv=10, scoring=make_scorer(rmse, greater_is_better=False)).mean()
    print("RMSE Loss {:.3f} params {}".format(loss, params))
    return loss

In [None]:
space = {
    'n_estimators' : hp.quniform('n_estimators', 100, 1500, 50),
    'max_depth': hp.quniform('max_depth', 3, 100, 1),
    'num_leaves': hp.quniform('num_leaves', 20, 100, 10),
    'min_child_samples': hp.quniform('min_child_samples', 10, 300, 10),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.3, 1.0),
    'subsample': hp.uniform('subsample', 0.3, 1.0),
    'min_split_gain': hp.uniform('min_split_gain', 0, 0.7),
    'scale_pos_weight': hp.uniform('scale_pos_weight', 1, 10),
    'reg_alpha': hp.uniform('reg_alpha', 0, 100),
    'reg_lambda': hp.uniform('reg_lambda', 0, 100),
    'learning_rate': hp.loguniform('learning_rate', np.log(0.01), np.log(0.5)),
}

# rstate = np.random.RandomState(42)
best_y03 = fmin(fn = objective,
            space = space,
            algo = tpe.suggest,
            max_evals = 100)

RMSE Loss 0.359 params {'n_estimators': 1200, 'max_depth': 80, 'num_leaves': 70, 'min_child_samples': 20, 'colsample_bytree': '0.353', 'subsample': '0.595', 'min_split_gain': '0.444', 'scale_pos_weight': '9.852', 'reg_alpha': '42.765', 'reg_lambda': '76.651', 'learning_rate': '0.013'}
RMSE Loss 0.359 params {'n_estimators': 1200, 'max_depth': 22, 'num_leaves': 90, 'min_child_samples': 260, 'colsample_bytree': '0.693', 'subsample': '0.617', 'min_split_gain': '0.410', 'scale_pos_weight': '5.464', 'reg_alpha': '41.766', 'reg_lambda': '55.916', 'learning_rate': '0.093'}
RMSE Loss 0.358 params {'n_estimators': 400, 'max_depth': 53, 'num_leaves': 30, 'min_child_samples': 260, 'colsample_bytree': '0.731', 'subsample': '0.492', 'min_split_gain': '0.580', 'scale_pos_weight': '8.828', 'reg_alpha': '12.277', 'reg_lambda': '15.104', 'learning_rate': '0.203'}
RMSE Loss 0.358 params {'n_estimators': 450, 'max_depth': 10, 'num_leaves': 80, 'min_child_samples': 160, 'colsample_bytree': '0.714', 'subsa

In [None]:
train_x_for_y04, test_x_for_y04 = train_x[c3], test_x[c3]
train_y_04 = train_y['Y_04']

In [None]:
def objective(params):

    params = {
        'n_estimators': int(params['n_estimators']),
        'max_depth': int(params['max_depth']),
        'num_leaves': int(params['num_leaves']),
        'min_child_samples': int(params['min_child_samples']),
        'colsample_bytree': '{:.3f}'.format(params['colsample_bytree']),
        'subsample': '{:.3f}'.format(params['subsample']),
        'min_split_gain': '{:.3f}'.format(params['min_split_gain']),
        'scale_pos_weight': '{:.3f}'.format(params['scale_pos_weight']),
        'reg_alpha': '{:.3f}'.format(params['reg_alpha']),
        'reg_lambda': '{:.3f}'.format(params['reg_lambda']),
        'learning_rate': '{:.3f}'.format(params['learning_rate']),
        
    }
    
    model = LGBMRegressor(
        n_jobs = -1,
        random_state = 42,
        **params
    )
    
    loss = -cross_val_score(model, train_x_for_y04, train_y_04, cv=10, scoring=make_scorer(rmse, greater_is_better=False)).mean()
    print("RMSE Loss {:.3f} params {}".format(loss, params))
    return loss

In [None]:
space = {
    'n_estimators' : hp.quniform('n_estimators', 100, 1500, 50),
    'max_depth': hp.quniform('max_depth', 3, 100, 1),
    'num_leaves': hp.quniform('num_leaves', 20, 100, 10),
    'min_child_samples': hp.quniform('min_child_samples', 10, 300, 10),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.3, 1.0),
    'subsample': hp.uniform('subsample', 0.3, 1.0),
    'min_split_gain': hp.uniform('min_split_gain', 0, 0.7),
    'scale_pos_weight': hp.uniform('scale_pos_weight', 1, 10),
    'reg_alpha': hp.uniform('reg_alpha', 0, 100),
    'reg_lambda': hp.uniform('reg_lambda', 0, 100),
    'learning_rate': hp.loguniform('learning_rate', np.log(0.01), np.log(0.5)),
}

# rstate = np.random.RandomState(42)
best_y04 = fmin(fn = objective,
            space = space,
            algo = tpe.suggest,
            max_evals = 100)

RMSE Loss 2.649 params {'n_estimators': 200, 'max_depth': 28, 'num_leaves': 40, 'min_child_samples': 280, 'colsample_bytree': '0.802', 'subsample': '0.729', 'min_split_gain': '0.158', 'scale_pos_weight': '5.830', 'reg_alpha': '74.206', 'reg_lambda': '28.646', 'learning_rate': '0.100'}
RMSE Loss 2.648 params {'n_estimators': 400, 'max_depth': 79, 'num_leaves': 80, 'min_child_samples': 80, 'colsample_bytree': '0.796', 'subsample': '0.441', 'min_split_gain': '0.360', 'scale_pos_weight': '4.818', 'reg_alpha': '63.132', 'reg_lambda': '0.552', 'learning_rate': '0.057'}
RMSE Loss 2.668 params {'n_estimators': 1150, 'max_depth': 26, 'num_leaves': 30, 'min_child_samples': 140, 'colsample_bytree': '0.399', 'subsample': '0.844', 'min_split_gain': '0.223', 'scale_pos_weight': '7.735', 'reg_alpha': '15.221', 'reg_lambda': '57.856', 'learning_rate': '0.066'}
RMSE Loss 2.646 params {'n_estimators': 1050, 'max_depth': 45, 'num_leaves': 90, 'min_child_samples': 100, 'colsample_bytree': '0.583', 'subsam

In [None]:
train_x_for_y05, test_x_for_y05 = train_x[c4], test_x[c4]
train_y_05 = train_y['Y_05']

In [None]:
def objective(params):

    params = {
        'n_estimators': int(params['n_estimators']),
        'max_depth': int(params['max_depth']),
        'num_leaves': int(params['num_leaves']),
        'min_child_samples': int(params['min_child_samples']),
        'colsample_bytree': '{:.3f}'.format(params['colsample_bytree']),
        'subsample': '{:.3f}'.format(params['subsample']),
        'min_split_gain': '{:.3f}'.format(params['min_split_gain']),
        'scale_pos_weight': '{:.3f}'.format(params['scale_pos_weight']),
        'reg_alpha': '{:.3f}'.format(params['reg_alpha']),
        'reg_lambda': '{:.3f}'.format(params['reg_lambda']),
        'learning_rate': '{:.3f}'.format(params['learning_rate']),
        
    }
    
    model = LGBMRegressor(
        n_jobs = -1,
        random_state = 42,
        **params
    )
    
    loss = -cross_val_score(model, train_x_for_y05, train_y_05, cv=10, scoring=make_scorer(rmse, greater_is_better=False)).mean()
    print("RMSE Loss {:.3f} params {}".format(loss, params))
    return loss

In [None]:
space = {
    'n_estimators' : hp.quniform('n_estimators', 100, 1500, 50),
    'max_depth': hp.quniform('max_depth', 3, 100, 1),
    'num_leaves': hp.quniform('num_leaves', 20, 100, 10),
    'min_child_samples': hp.quniform('min_child_samples', 10, 300, 10),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.3, 1.0),
    'subsample': hp.uniform('subsample', 0.3, 1.0),
    'min_split_gain': hp.uniform('min_split_gain', 0, 0.7),
    'scale_pos_weight': hp.uniform('scale_pos_weight', 1, 10),
    'reg_alpha': hp.uniform('reg_alpha', 0, 100),
    'reg_lambda': hp.uniform('reg_lambda', 0, 100),
    'learning_rate': hp.loguniform('learning_rate', np.log(0.01), np.log(0.5)),
}

# rstate = np.random.RandomState(42)
best_y05 = fmin(fn = objective,
            space = space,
            algo = tpe.suggest,
            max_evals = 100)

RMSE Loss 2.522 params {'n_estimators': 750, 'max_depth': 83, 'num_leaves': 20, 'min_child_samples': 90, 'colsample_bytree': '0.496', 'subsample': '0.539', 'min_split_gain': '0.209', 'scale_pos_weight': '9.492', 'reg_alpha': '67.283', 'reg_lambda': '32.333', 'learning_rate': '0.029'}
RMSE Loss 2.544 params {'n_estimators': 400, 'max_depth': 49, 'num_leaves': 60, 'min_child_samples': 120, 'colsample_bytree': '0.663', 'subsample': '0.572', 'min_split_gain': '0.284', 'scale_pos_weight': '8.662', 'reg_alpha': '48.109', 'reg_lambda': '26.412', 'learning_rate': '0.249'}
RMSE Loss 2.571 params {'n_estimators': 650, 'max_depth': 52, 'num_leaves': 40, 'min_child_samples': 180, 'colsample_bytree': '0.527', 'subsample': '0.558', 'min_split_gain': '0.691', 'scale_pos_weight': '8.049', 'reg_alpha': '0.821', 'reg_lambda': '66.356', 'learning_rate': '0.233'}
RMSE Loss 2.533 params {'n_estimators': 800, 'max_depth': 53, 'num_leaves': 100, 'min_child_samples': 80, 'colsample_bytree': '0.646', 'subsampl

In [None]:
train_x_for_y06, test_x_for_y06 = train_x[c5], test_x[c5]
train_y_06 = train_y['Y_06']

In [None]:
def objective(params):

    params = {
        'n_estimators': int(params['n_estimators']),
        'max_depth': int(params['max_depth']),
        'num_leaves': int(params['num_leaves']),
        'min_child_samples': int(params['min_child_samples']),
        'colsample_bytree': '{:.3f}'.format(params['colsample_bytree']),
        'subsample': '{:.3f}'.format(params['subsample']),
        'min_split_gain': '{:.3f}'.format(params['min_split_gain']),
        'scale_pos_weight': '{:.3f}'.format(params['scale_pos_weight']),
        'reg_alpha': '{:.3f}'.format(params['reg_alpha']),
        'reg_lambda': '{:.3f}'.format(params['reg_lambda']),
        'learning_rate': '{:.3f}'.format(params['learning_rate']),
        
    }
    
    model = LGBMRegressor(
        n_jobs = -1,
        random_state = 42,
        **params
    )
    
    loss = -cross_val_score(model, train_x_for_y06, train_y_06, cv=10, scoring=make_scorer(rmse, greater_is_better=False)).mean()
    print("RMSE Loss {:.3f} params {}".format(loss, params))
    return loss

In [None]:
space = {
    'n_estimators' : hp.quniform('n_estimators', 100, 1500, 50),
    'max_depth': hp.quniform('max_depth', 3, 100, 1),
    'num_leaves': hp.quniform('num_leaves', 20, 100, 10),
    'min_child_samples': hp.quniform('min_child_samples', 10, 300, 10),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.3, 1.0),
    'subsample': hp.uniform('subsample', 0.3, 1.0),
    'min_split_gain': hp.uniform('min_split_gain', 0, 0.7),
    'scale_pos_weight': hp.uniform('scale_pos_weight', 1, 10),
    'reg_alpha': hp.uniform('reg_alpha', 0, 100),
    'reg_lambda': hp.uniform('reg_lambda', 0, 100),
    'learning_rate': hp.loguniform('learning_rate', np.log(0.01), np.log(0.5)),
}

# rstate = np.random.RandomState(42)
best_y06 = fmin(fn = objective,
            space = space,
            algo = tpe.suggest,
            max_evals = 100)

RMSE Loss 1.581 params {'n_estimators': 200, 'max_depth': 89, 'num_leaves': 70, 'min_child_samples': 140, 'colsample_bytree': '0.452', 'subsample': '0.445', 'min_split_gain': '0.102', 'scale_pos_weight': '8.469', 'reg_alpha': '71.101', 'reg_lambda': '76.626', 'learning_rate': '0.056'}
RMSE Loss 1.582 params {'n_estimators': 500, 'max_depth': 67, 'num_leaves': 100, 'min_child_samples': 240, 'colsample_bytree': '0.640', 'subsample': '0.740', 'min_split_gain': '0.080', 'scale_pos_weight': '8.323', 'reg_alpha': '91.105', 'reg_lambda': '53.046', 'learning_rate': '0.020'}
RMSE Loss 1.581 params {'n_estimators': 700, 'max_depth': 66, 'num_leaves': 50, 'min_child_samples': 40, 'colsample_bytree': '0.794', 'subsample': '0.610', 'min_split_gain': '0.502', 'scale_pos_weight': '8.433', 'reg_alpha': '45.079', 'reg_lambda': '39.597', 'learning_rate': '0.115'}
RMSE Loss 1.581 params {'n_estimators': 450, 'max_depth': 96, 'num_leaves': 60, 'min_child_samples': 120, 'colsample_bytree': '0.998', 'subsam

In [None]:
train_x_for_y07, test_x_for_y07 = train_x[c6], test_x[c6]
train_y_07 = train_y['Y_07']

In [None]:
def objective(params):

    params = {
        'n_estimators': int(params['n_estimators']),
        'max_depth': int(params['max_depth']),
        'num_leaves': int(params['num_leaves']),
        'min_child_samples': int(params['min_child_samples']),
        'colsample_bytree': '{:.3f}'.format(params['colsample_bytree']),
        'subsample': '{:.3f}'.format(params['subsample']),
        'min_split_gain': '{:.3f}'.format(params['min_split_gain']),
        'scale_pos_weight': '{:.3f}'.format(params['scale_pos_weight']),
        'reg_alpha': '{:.3f}'.format(params['reg_alpha']),
        'reg_lambda': '{:.3f}'.format(params['reg_lambda']),
        'learning_rate': '{:.3f}'.format(params['learning_rate']),
        
    }
    
    model = LGBMRegressor(
        n_jobs = -1,
        random_state = 42,
        **params
    )
    
    loss = -cross_val_score(model, train_x_for_y07, train_y_07, cv=10, scoring=make_scorer(rmse, greater_is_better=False)).mean()
    print("RMSE Loss {:.3f} params {}".format(loss, params))
    return loss

In [None]:
space = {
    'n_estimators' : hp.quniform('n_estimators', 100, 1500, 50),
    'max_depth': hp.quniform('max_depth', 3, 100, 1),
    'num_leaves': hp.quniform('num_leaves', 20, 100, 10),
    'min_child_samples': hp.quniform('min_child_samples', 10, 300, 10),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.3, 1.0),
    'subsample': hp.uniform('subsample', 0.3, 1.0),
    'min_split_gain': hp.uniform('min_split_gain', 0, 0.7),
    'scale_pos_weight': hp.uniform('scale_pos_weight', 1, 10),
    'reg_alpha': hp.uniform('reg_alpha', 0, 100),
    'reg_lambda': hp.uniform('reg_lambda', 0, 100),
    'learning_rate': hp.loguniform('learning_rate', np.log(0.01), np.log(0.5)),
}

# rstate = np.random.RandomState(42)
best_y07 = fmin(fn = objective,
            space = space,
            algo = tpe.suggest,
            max_evals = 100)

RMSE Loss 0.415 params {'n_estimators': 150, 'max_depth': 17, 'num_leaves': 80, 'min_child_samples': 100, 'colsample_bytree': '0.547', 'subsample': '0.392', 'min_split_gain': '0.162', 'scale_pos_weight': '6.408', 'reg_alpha': '2.944', 'reg_lambda': '3.495', 'learning_rate': '0.209'}
RMSE Loss 0.415 params {'n_estimators': 750, 'max_depth': 71, 'num_leaves': 70, 'min_child_samples': 200, 'colsample_bytree': '0.756', 'subsample': '0.532', 'min_split_gain': '0.374', 'scale_pos_weight': '8.329', 'reg_alpha': '8.335', 'reg_lambda': '11.624', 'learning_rate': '0.396'}
RMSE Loss 0.415 params {'n_estimators': 1450, 'max_depth': 24, 'num_leaves': 80, 'min_child_samples': 240, 'colsample_bytree': '0.538', 'subsample': '0.510', 'min_split_gain': '0.521', 'scale_pos_weight': '6.984', 'reg_alpha': '12.822', 'reg_lambda': '6.887', 'learning_rate': '0.112'}
RMSE Loss 0.415 params {'n_estimators': 550, 'max_depth': 68, 'num_leaves': 100, 'min_child_samples': 70, 'colsample_bytree': '0.761', 'subsample

In [None]:
train_x_for_y08, test_x_for_y08 = train_x[c7], test_x[c7]
train_y_08 = train_y['Y_08']

In [None]:
def objective(params):

    params = {
        'n_estimators': int(params['n_estimators']),
        'max_depth': int(params['max_depth']),
        'num_leaves': int(params['num_leaves']),
        'min_child_samples': int(params['min_child_samples']),
        'colsample_bytree': '{:.3f}'.format(params['colsample_bytree']),
        'subsample': '{:.3f}'.format(params['subsample']),
        'min_split_gain': '{:.3f}'.format(params['min_split_gain']),
        'scale_pos_weight': '{:.3f}'.format(params['scale_pos_weight']),
        'reg_alpha': '{:.3f}'.format(params['reg_alpha']),
        'reg_lambda': '{:.3f}'.format(params['reg_lambda']),
        'learning_rate': '{:.3f}'.format(params['learning_rate']),
        
    }
    
    model = LGBMRegressor(
        n_jobs = -1,
        random_state = 42,
        **params
    )
    
    loss = -cross_val_score(model, train_x_for_y08, train_y_08, cv=10, scoring=make_scorer(rmse, greater_is_better=False)).mean()
    print("RMSE Loss {:.3f} params {}".format(loss, params))
    return loss

In [None]:
space = {
    'n_estimators' : hp.quniform('n_estimators', 100, 1500, 50),
    'max_depth': hp.quniform('max_depth', 3, 100, 1),
    'num_leaves': hp.quniform('num_leaves', 20, 100, 10),
    'min_child_samples': hp.quniform('min_child_samples', 10, 300, 10),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.3, 1.0),
    'subsample': hp.uniform('subsample', 0.3, 1.0),
    'min_split_gain': hp.uniform('min_split_gain', 0, 0.7),
    'scale_pos_weight': hp.uniform('scale_pos_weight', 1, 10),
    'reg_alpha': hp.uniform('reg_alpha', 0, 100),
    'reg_lambda': hp.uniform('reg_lambda', 0, 100),
    'learning_rate': hp.loguniform('learning_rate', np.log(0.01), np.log(0.5)),
}

# rstate = np.random.RandomState(42)
best_y08 = fmin(fn = objective,
            space = space,
            algo = tpe.suggest,
            max_evals = 100)

RMSE Loss 0.642 params {'n_estimators': 250, 'max_depth': 12, 'num_leaves': 50, 'min_child_samples': 120, 'colsample_bytree': '0.790', 'subsample': '0.496', 'min_split_gain': '0.443', 'scale_pos_weight': '5.303', 'reg_alpha': '49.998', 'reg_lambda': '38.205', 'learning_rate': '0.403'}
RMSE Loss 0.642 params {'n_estimators': 950, 'max_depth': 41, 'num_leaves': 70, 'min_child_samples': 240, 'colsample_bytree': '0.376', 'subsample': '0.626', 'min_split_gain': '0.432', 'scale_pos_weight': '4.153', 'reg_alpha': '39.012', 'reg_lambda': '60.437', 'learning_rate': '0.215'}
RMSE Loss 0.640 params {'n_estimators': 500, 'max_depth': 44, 'num_leaves': 20, 'min_child_samples': 180, 'colsample_bytree': '0.587', 'subsample': '0.906', 'min_split_gain': '0.024', 'scale_pos_weight': '4.841', 'reg_alpha': '24.495', 'reg_lambda': '13.440', 'learning_rate': '0.089'}
RMSE Loss 0.641 params {'n_estimators': 1250, 'max_depth': 49, 'num_leaves': 90, 'min_child_samples': 290, 'colsample_bytree': '0.940', 'subsa

In [None]:
train_x_for_y09, test_x_for_y09 = train_x[c8], test_x[c8]
train_y_09 = train_y['Y_09']

In [None]:
def objective(params):

    params = {
        'n_estimators': int(params['n_estimators']),
        'max_depth': int(params['max_depth']),
        'num_leaves': int(params['num_leaves']),
        'min_child_samples': int(params['min_child_samples']),
        'colsample_bytree': '{:.3f}'.format(params['colsample_bytree']),
        'subsample': '{:.3f}'.format(params['subsample']),
        'min_split_gain': '{:.3f}'.format(params['min_split_gain']),
        'scale_pos_weight': '{:.3f}'.format(params['scale_pos_weight']),
        'reg_alpha': '{:.3f}'.format(params['reg_alpha']),
        'reg_lambda': '{:.3f}'.format(params['reg_lambda']),
        'learning_rate': '{:.3f}'.format(params['learning_rate']),
        
    }
    
    model = LGBMRegressor(
        n_jobs = -1,
        random_state = 42,
        **params
    )
    
    loss = -cross_val_score(model, train_x_for_y09, train_y_09, cv=10, scoring=make_scorer(rmse, greater_is_better=False)).mean()
    print("RMSE Loss {:.3f} params {}".format(loss, params))
    return loss

In [None]:
space = {
    'n_estimators' : hp.quniform('n_estimators', 100, 1500, 50),
    'max_depth': hp.quniform('max_depth', 3, 100, 1),
    'num_leaves': hp.quniform('num_leaves', 20, 100, 10),
    'min_child_samples': hp.quniform('min_child_samples', 10, 300, 10),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.3, 1.0),
    'subsample': hp.uniform('subsample', 0.3, 1.0),
    'min_split_gain': hp.uniform('min_split_gain', 0, 0.7),
    'scale_pos_weight': hp.uniform('scale_pos_weight', 1, 10),
    'reg_alpha': hp.uniform('reg_alpha', 0, 100),
    'reg_lambda': hp.uniform('reg_lambda', 0, 100),
    'learning_rate': hp.loguniform('learning_rate', np.log(0.01), np.log(0.5)),
}

# rstate = np.random.RandomState(42)
best_y09 = fmin(fn = objective,
            space = space,
            algo = tpe.suggest,
            max_evals = 100)

RMSE Loss 0.635 params {'n_estimators': 950, 'max_depth': 39, 'num_leaves': 80, 'min_child_samples': 130, 'colsample_bytree': '0.563', 'subsample': '0.310', 'min_split_gain': '0.620', 'scale_pos_weight': '3.084', 'reg_alpha': '8.533', 'reg_lambda': '31.509', 'learning_rate': '0.050'}
RMSE Loss 0.638 params {'n_estimators': 350, 'max_depth': 95, 'num_leaves': 50, 'min_child_samples': 120, 'colsample_bytree': '0.560', 'subsample': '0.949', 'min_split_gain': '0.407', 'scale_pos_weight': '2.500', 'reg_alpha': '80.190', 'reg_lambda': '98.797', 'learning_rate': '0.011'}
RMSE Loss 0.640 params {'n_estimators': 100, 'max_depth': 87, 'num_leaves': 80, 'min_child_samples': 170, 'colsample_bytree': '0.499', 'subsample': '0.515', 'min_split_gain': '0.486', 'scale_pos_weight': '7.220', 'reg_alpha': '0.330', 'reg_lambda': '2.086', 'learning_rate': '0.356'}
RMSE Loss 0.636 params {'n_estimators': 1450, 'max_depth': 92, 'num_leaves': 30, 'min_child_samples': 90, 'colsample_bytree': '0.807', 'subsample

In [None]:
train_x_for_y10, test_x_for_y10 = train_x[c9], test_x[c9]
train_y_10 = train_y['Y_10']

In [None]:
def objective(params):

    params = {
        'n_estimators': int(params['n_estimators']),
        'max_depth': int(params['max_depth']),
        'num_leaves': int(params['num_leaves']),
        'min_child_samples': int(params['min_child_samples']),
        'colsample_bytree': '{:.3f}'.format(params['colsample_bytree']),
        'subsample': '{:.3f}'.format(params['subsample']),
        'min_split_gain': '{:.3f}'.format(params['min_split_gain']),
        'scale_pos_weight': '{:.3f}'.format(params['scale_pos_weight']),
        'reg_alpha': '{:.3f}'.format(params['reg_alpha']),
        'reg_lambda': '{:.3f}'.format(params['reg_lambda']),
        'learning_rate': '{:.3f}'.format(params['learning_rate']),
        
    }
    
    model = LGBMRegressor(
        n_jobs = -1,
        random_state = 42,
        **params
    )
    
    loss = -cross_val_score(model, train_x_for_y10, train_y_10, cv=10, scoring=make_scorer(rmse, greater_is_better=False)).mean()
    print("RMSE Loss {:.3f} params {}".format(loss, params))
    return loss

In [None]:
space = {
    'n_estimators' : hp.quniform('n_estimators', 100, 1500, 50),
    'max_depth': hp.quniform('max_depth', 3, 100, 1),
    'num_leaves': hp.quniform('num_leaves', 20, 100, 10),
    'min_child_samples': hp.quniform('min_child_samples', 10, 300, 10),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.3, 1.0),
    'subsample': hp.uniform('subsample', 0.3, 1.0),
    'min_split_gain': hp.uniform('min_split_gain', 0, 0.7),
    'scale_pos_weight': hp.uniform('scale_pos_weight', 1, 10),
    'reg_alpha': hp.uniform('reg_alpha', 0, 100),
    'reg_lambda': hp.uniform('reg_lambda', 0, 100),
    'learning_rate': hp.loguniform('learning_rate', np.log(0.01), np.log(0.5)),
}

# rstate = np.random.RandomState(42)
best_y10 = fmin(fn = objective,
            space = space,
            algo = tpe.suggest,
            max_evals = 100)

RMSE Loss 0.888 params {'n_estimators': 700, 'max_depth': 89, 'num_leaves': 50, 'min_child_samples': 170, 'colsample_bytree': '0.616', 'subsample': '0.376', 'min_split_gain': '0.291', 'scale_pos_weight': '8.786', 'reg_alpha': '56.899', 'reg_lambda': '54.542', 'learning_rate': '0.021'}
RMSE Loss 0.886 params {'n_estimators': 1300, 'max_depth': 98, 'num_leaves': 30, 'min_child_samples': 60, 'colsample_bytree': '0.958', 'subsample': '0.812', 'min_split_gain': '0.373', 'scale_pos_weight': '4.872', 'reg_alpha': '5.261', 'reg_lambda': '21.046', 'learning_rate': '0.048'}
RMSE Loss 0.891 params {'n_estimators': 1000, 'max_depth': 62, 'num_leaves': 30, 'min_child_samples': 170, 'colsample_bytree': '0.816', 'subsample': '0.986', 'min_split_gain': '0.489', 'scale_pos_weight': '2.497', 'reg_alpha': '95.089', 'reg_lambda': '29.719', 'learning_rate': '0.019'}
RMSE Loss 0.890 params {'n_estimators': 150, 'max_depth': 11, 'num_leaves': 40, 'min_child_samples': 230, 'colsample_bytree': '0.431', 'subsam

In [None]:
train_x_for_y11, test_x_for_y11 = train_x[c10], test_x[c10]
train_y_11 = train_y['Y_11']

In [None]:
def objective(params):

    params = {
        'n_estimators': int(params['n_estimators']),
        'max_depth': int(params['max_depth']),
        'num_leaves': int(params['num_leaves']),
        'min_child_samples': int(params['min_child_samples']),
        'colsample_bytree': '{:.3f}'.format(params['colsample_bytree']),
        'subsample': '{:.3f}'.format(params['subsample']),
        'min_split_gain': '{:.3f}'.format(params['min_split_gain']),
        'scale_pos_weight': '{:.3f}'.format(params['scale_pos_weight']),
        'reg_alpha': '{:.3f}'.format(params['reg_alpha']),
        'reg_lambda': '{:.3f}'.format(params['reg_lambda']),
        'learning_rate': '{:.3f}'.format(params['learning_rate']),
        
    }
    
    model = LGBMRegressor(
        n_jobs = -1,
        random_state = 42,
        **params
    )
    
    loss = -cross_val_score(model, train_x_for_y11, train_y_11, cv=10, scoring=make_scorer(rmse, greater_is_better=False)).mean()
    print("RMSE Loss {:.3f} params {}".format(loss, params))
    return loss

In [None]:
space = {
    'n_estimators' : hp.quniform('n_estimators', 100, 1500, 50),
    'max_depth': hp.quniform('max_depth', 3, 100, 1),
    'num_leaves': hp.quniform('num_leaves', 20, 100, 10),
    'min_child_samples': hp.quniform('min_child_samples', 10, 300, 10),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.3, 1.0),
    'subsample': hp.uniform('subsample', 0.3, 1.0),
    'min_split_gain': hp.uniform('min_split_gain', 0, 0.7),
    'scale_pos_weight': hp.uniform('scale_pos_weight', 1, 10),
    'reg_alpha': hp.uniform('reg_alpha', 0, 100),
    'reg_lambda': hp.uniform('reg_lambda', 0, 100),
    'learning_rate': hp.loguniform('learning_rate', np.log(0.01), np.log(0.5)),
}

# rstate = np.random.RandomState(42)
best_y11 = fmin(fn = objective,
            space = space,
            algo = tpe.suggest,
            max_evals = 100)

RMSE Loss 0.826 params {'n_estimators': 250, 'max_depth': 52, 'num_leaves': 30, 'min_child_samples': 70, 'colsample_bytree': '0.431', 'subsample': '0.670', 'min_split_gain': '0.643', 'scale_pos_weight': '5.469', 'reg_alpha': '88.244', 'reg_lambda': '17.563', 'learning_rate': '0.037'}
RMSE Loss 0.823 params {'n_estimators': 1250, 'max_depth': 45, 'num_leaves': 50, 'min_child_samples': 170, 'colsample_bytree': '0.956', 'subsample': '0.419', 'min_split_gain': '0.387', 'scale_pos_weight': '9.069', 'reg_alpha': '8.019', 'reg_lambda': '34.329', 'learning_rate': '0.052'}
RMSE Loss 0.826 params {'n_estimators': 1500, 'max_depth': 4, 'num_leaves': 90, 'min_child_samples': 90, 'colsample_bytree': '0.383', 'subsample': '0.502', 'min_split_gain': '0.362', 'scale_pos_weight': '9.326', 'reg_alpha': '68.829', 'reg_lambda': '35.894', 'learning_rate': '0.207'}
RMSE Loss 0.826 params {'n_estimators': 100, 'max_depth': 88, 'num_leaves': 60, 'min_child_samples': 230, 'colsample_bytree': '0.624', 'subsampl

In [None]:
train_x_for_y12, test_x_for_y12 = train_x[c11], test_x[c11]
train_y_12 = train_y['Y_12']

In [None]:
def objective(params):

    params = {
        'n_estimators': int(params['n_estimators']),
        'max_depth': int(params['max_depth']),
        'num_leaves': int(params['num_leaves']),
        'min_child_samples': int(params['min_child_samples']),
        'colsample_bytree': '{:.3f}'.format(params['colsample_bytree']),
        'subsample': '{:.3f}'.format(params['subsample']),
        'min_split_gain': '{:.3f}'.format(params['min_split_gain']),
        'scale_pos_weight': '{:.3f}'.format(params['scale_pos_weight']),
        'reg_alpha': '{:.3f}'.format(params['reg_alpha']),
        'reg_lambda': '{:.3f}'.format(params['reg_lambda']),
        'learning_rate': '{:.3f}'.format(params['learning_rate']),
        
    }
    
    model = LGBMRegressor(
        n_jobs = -1,
        random_state = 42,
        **params
    )
    
    loss = -cross_val_score(model, train_x_for_y12, train_y_12, cv=10, scoring=make_scorer(rmse, greater_is_better=False)).mean()
    print("RMSE Loss {:.3f} params {}".format(loss, params))
    return loss

In [None]:
space = {
    'n_estimators' : hp.quniform('n_estimators', 100, 1500, 50),
    'max_depth': hp.quniform('max_depth', 3, 100, 1),
    'num_leaves': hp.quniform('num_leaves', 20, 100, 10),
    'min_child_samples': hp.quniform('min_child_samples', 10, 300, 10),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.3, 1.0),
    'subsample': hp.uniform('subsample', 0.3, 1.0),
    'min_split_gain': hp.uniform('min_split_gain', 0, 0.7),
    'scale_pos_weight': hp.uniform('scale_pos_weight', 1, 10),
    'reg_alpha': hp.uniform('reg_alpha', 0, 100),
    'reg_lambda': hp.uniform('reg_lambda', 0, 100),
    'learning_rate': hp.loguniform('learning_rate', np.log(0.01), np.log(0.5)),
}

# rstate = np.random.RandomState(42)
best_y12 = fmin(fn = objective,
            space = space,
            algo = tpe.suggest,
            max_evals = 100)

RMSE Loss 0.640 params {'n_estimators': 950, 'max_depth': 26, 'num_leaves': 60, 'min_child_samples': 190, 'colsample_bytree': '0.929', 'subsample': '0.702', 'min_split_gain': '0.616', 'scale_pos_weight': '6.608', 'reg_alpha': '84.760', 'reg_lambda': '96.082', 'learning_rate': '0.085'}
RMSE Loss 0.637 params {'n_estimators': 350, 'max_depth': 14, 'num_leaves': 40, 'min_child_samples': 100, 'colsample_bytree': '0.836', 'subsample': '0.890', 'min_split_gain': '0.125', 'scale_pos_weight': '2.452', 'reg_alpha': '21.845', 'reg_lambda': '65.125', 'learning_rate': '0.085'}
RMSE Loss 0.638 params {'n_estimators': 350, 'max_depth': 99, 'num_leaves': 20, 'min_child_samples': 50, 'colsample_bytree': '0.736', 'subsample': '0.401', 'min_split_gain': '0.643', 'scale_pos_weight': '3.845', 'reg_alpha': '21.388', 'reg_lambda': '57.349', 'learning_rate': '0.055'}
RMSE Loss 0.638 params {'n_estimators': 1400, 'max_depth': 65, 'num_leaves': 80, 'min_child_samples': 250, 'colsample_bytree': '0.313', 'subsam

In [None]:
train_x_for_y13, test_x_for_y13 = train_x[c12], test_x[c12]
train_y_13 = train_y['Y_13']

In [None]:
def objective(params):

    params = {
        'n_estimators': int(params['n_estimators']),
        'max_depth': int(params['max_depth']),
        'num_leaves': int(params['num_leaves']),
        'min_child_samples': int(params['min_child_samples']),
        'colsample_bytree': '{:.3f}'.format(params['colsample_bytree']),
        'subsample': '{:.3f}'.format(params['subsample']),
        'min_split_gain': '{:.3f}'.format(params['min_split_gain']),
        'scale_pos_weight': '{:.3f}'.format(params['scale_pos_weight']),
        'reg_alpha': '{:.3f}'.format(params['reg_alpha']),
        'reg_lambda': '{:.3f}'.format(params['reg_lambda']),
        'learning_rate': '{:.3f}'.format(params['learning_rate']),
        
    }
    
    model = LGBMRegressor(
        n_jobs = -1,
        random_state = 42,
        **params
    )
    
    loss = -cross_val_score(model, train_x_for_y13, train_y_13, cv=10, scoring=make_scorer(rmse, greater_is_better=False)).mean()
    print("RMSE Loss {:.3f} params {}".format(loss, params))
    return loss

In [None]:
space = {
    'n_estimators' : hp.quniform('n_estimators', 100, 1500, 50),
    'max_depth': hp.quniform('max_depth', 3, 100, 1),
    'num_leaves': hp.quniform('num_leaves', 20, 100, 10),
    'min_child_samples': hp.quniform('min_child_samples', 10, 300, 10),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.3, 1.0),
    'subsample': hp.uniform('subsample', 0.3, 1.0),
    'min_split_gain': hp.uniform('min_split_gain', 0, 0.7),
    'scale_pos_weight': hp.uniform('scale_pos_weight', 1, 10),
    'reg_alpha': hp.uniform('reg_alpha', 0, 100),
    'reg_lambda': hp.uniform('reg_lambda', 0, 100),
    'learning_rate': hp.loguniform('learning_rate', np.log(0.01), np.log(0.5)),
}

# rstate = np.random.RandomState(42)
best_y13 = fmin(fn = objective,
            space = space,
            algo = tpe.suggest,
            max_evals = 100)

RMSE Loss 0.636 params {'n_estimators': 1450, 'max_depth': 35, 'num_leaves': 90, 'min_child_samples': 10, 'colsample_bytree': '0.610', 'subsample': '0.385', 'min_split_gain': '0.362', 'scale_pos_weight': '1.839', 'reg_alpha': '23.497', 'reg_lambda': '90.990', 'learning_rate': '0.031'}
RMSE Loss 0.639 params {'n_estimators': 650, 'max_depth': 76, 'num_leaves': 30, 'min_child_samples': 20, 'colsample_bytree': '0.738', 'subsample': '0.567', 'min_split_gain': '0.353', 'scale_pos_weight': '1.770', 'reg_alpha': '93.751', 'reg_lambda': '92.044', 'learning_rate': '0.110'}
RMSE Loss 0.636 params {'n_estimators': 550, 'max_depth': 77, 'num_leaves': 90, 'min_child_samples': 270, 'colsample_bytree': '0.979', 'subsample': '0.429', 'min_split_gain': '0.240', 'scale_pos_weight': '6.553', 'reg_alpha': '1.232', 'reg_lambda': '65.362', 'learning_rate': '0.054'}
RMSE Loss 0.637 params {'n_estimators': 1300, 'max_depth': 70, 'num_leaves': 60, 'min_child_samples': 120, 'colsample_bytree': '0.672', 'subsamp

In [None]:
train_x_for_y14, test_x_for_y14 = train_x[c13], test_x[c13]
train_y_14 = train_y['Y_14']

In [None]:
def objective(params):

    params = {
        'n_estimators': int(params['n_estimators']),
        'max_depth': int(params['max_depth']),
        'num_leaves': int(params['num_leaves']),
        'min_child_samples': int(params['min_child_samples']),
        'colsample_bytree': '{:.3f}'.format(params['colsample_bytree']),
        'subsample': '{:.3f}'.format(params['subsample']),
        'min_split_gain': '{:.3f}'.format(params['min_split_gain']),
        'scale_pos_weight': '{:.3f}'.format(params['scale_pos_weight']),
        'reg_alpha': '{:.3f}'.format(params['reg_alpha']),
        'reg_lambda': '{:.3f}'.format(params['reg_lambda']),
        'learning_rate': '{:.3f}'.format(params['learning_rate']),
        
    }
    
    model = LGBMRegressor(
        n_jobs = -1,
        random_state = 42,
        **params
    )
    
    loss = -cross_val_score(model, train_x_for_y14, train_y_14, cv=10, scoring=make_scorer(rmse, greater_is_better=False)).mean()
    print("RMSE Loss {:.3f} params {}".format(loss, params))
    return loss

In [None]:
space = {
    'n_estimators' : hp.quniform('n_estimators', 100, 1500, 50),
    'max_depth': hp.quniform('max_depth', 3, 100, 1),
    'num_leaves': hp.quniform('num_leaves', 20, 100, 10),
    'min_child_samples': hp.quniform('min_child_samples', 10, 300, 10),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.3, 1.0),
    'subsample': hp.uniform('subsample', 0.3, 1.0),
    'min_split_gain': hp.uniform('min_split_gain', 0, 0.7),
    'scale_pos_weight': hp.uniform('scale_pos_weight', 1, 10),
    'reg_alpha': hp.uniform('reg_alpha', 0, 100),
    'reg_lambda': hp.uniform('reg_lambda', 0, 100),
    'learning_rate': hp.loguniform('learning_rate', np.log(0.01), np.log(0.5)),
}

# rstate = np.random.RandomState(42)
best_y14 = fmin(fn = objective,
            space = space,
            algo = tpe.suggest,
            max_evals = 100)

RMSE Loss 0.640 params {'n_estimators': 1450, 'max_depth': 16, 'num_leaves': 90, 'min_child_samples': 280, 'colsample_bytree': '0.736', 'subsample': '0.807', 'min_split_gain': '0.610', 'scale_pos_weight': '4.321', 'reg_alpha': '73.457', 'reg_lambda': '50.216', 'learning_rate': '0.043'}
RMSE Loss 0.639 params {'n_estimators': 100, 'max_depth': 57, 'num_leaves': 90, 'min_child_samples': 210, 'colsample_bytree': '0.914', 'subsample': '0.933', 'min_split_gain': '0.448', 'scale_pos_weight': '9.491', 'reg_alpha': '74.628', 'reg_lambda': '57.799', 'learning_rate': '0.416'}
RMSE Loss 0.638 params {'n_estimators': 550, 'max_depth': 45, 'num_leaves': 80, 'min_child_samples': 50, 'colsample_bytree': '0.888', 'subsample': '0.355', 'min_split_gain': '0.616', 'scale_pos_weight': '6.144', 'reg_alpha': '40.026', 'reg_lambda': '99.110', 'learning_rate': '0.042'}
RMSE Loss 0.638 params {'n_estimators': 250, 'max_depth': 67, 'num_leaves': 70, 'min_child_samples': 150, 'colsample_bytree': '0.742', 'subsam

In [None]:
a=1.2*(0.3461105172166437+0.3771880158156553+0.35396054039411357+2.6051619313773107+1.5073251849357232+1.509877272936923+0.4108223450760097+0.6354741784324317)
print(a+4.199029623458831)

13.494133606880604


### Save Prediction

In [None]:
submit = pd.read_csv('./sample_submission.csv')
for idx, col in enumerate(submit.columns):
    if col=='ID':
        continue
    submit[col] = preds[:,idx-1]
submit.to_csv('./submission_4.csv', index = False)