### Import Libraries

In [None]:
import pandas as pd
import random
import os
import numpy as np
import time
import seaborn as sns
import matplotlib.pyplot as plt
from functools import partial
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from hyperopt import fmin, hp, tpe, Trials, STATUS_OK
from hyperopt.pyll.base import scope
from hyperopt.pyll.stochastic import sample
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer

import math
from sklearn.linear_model import LinearRegression
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import KFold, StratifiedKFold
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

import statsmodels.formula.api as sm
from statsmodels.api import add_constant
import statsmodels.api as sm

from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SelectFromModel

  import pandas.util.testing as tm


In [None]:
def seed_everything(seed):
    """
    @Description: seed the random number generators used in this notebook
    @Param: seed, value handed to random.seed and np.random.seed and stored (as text) in PYTHONHASHSEED
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    for seeder in (random.seed, np.random.seed):
        seeder(seed)


seed_everything(42)  # fix the seed

### Usable Functions 

In [None]:
def dataset_split_X_y(df):
    """
    @Description: split data into features and labels
    @Param: df, pandas dataframe with columns starting with X for features and Y for labels
    @Return: features and labels in pandas dataframes (columns keep their original order)
    """
    # Anchor the patterns at the start of the name: an unanchored 'X' / 'Y'
    # matches the letter anywhere, so a column such as 'MAX_Y' would end up
    # in both the feature table and the label table.
    xs = df.filter(regex='^X')  # Input : X Feature
    ys = df.filter(regex='^Y')  # Output : Y Feature
    return xs, ys

In [None]:
def check_for_NAs(df, show=False):
    """
    @Description: report where the dataframe contains missing values
    @Param1: df, pandas dataframe
    @Param2: show, when True return the rows that hold at least one NaN instead of the column names
    @Return: list of column names containing NaN, or (show=True) the dataframe rows containing NaN
    """
    if show:
        rows_with_nan = df.isna().any(axis=1)
        return df[rows_with_nan]
    column_has_nan = df.isnull().any()
    return list(df.columns[column_has_nan.to_numpy()])

In [None]:
def check_for_label_bound(df, labels, bound):
    """
    @Description: find the columns holding at least one value outside their allowed range
    @Param1: df, pandas dataframe
    @Param2: labels, list of column names
    @Param3: bound, list of [lower, upper] pairs, one per entry of labels (both ends inclusive)
    @Return: names of the columns not within the bound; a message is printed when the list is empty
    """
    out_of_bound = []
    for position, label in enumerate(labels):
        limits = bound[position]
        values = df[label]
        within = values.between(limits[0], limits[1])
        if within.all():
            continue
        out_of_bound.append(label)
    if not out_of_bound:
        print('everything is within the bound')
    return out_of_bound

In [None]:
def zero_variance(df):
    """
    @Description: check for zero_variance (columns whose non-missing values are all identical)
    @Param1: df, pandas dataframe
    @Return: names of the columns with zero variance, in column order
    """
    result = []
    for col in df.columns:
        observed = df[col].dropna()
        # Count distinct values instead of testing var() == 0: the floating
        # point variance of a constant column such as [0.1, 0.1, 0.1] can come
        # out as a tiny non-zero number, so the equality test misses it.
        # At least two observations are required, matching the sample variance,
        # which is undefined (NaN) for fewer.
        if len(observed) > 1 and observed.nunique() == 1:
            result.append(col)
    return result

In [None]:
def get_top_correlation(df, n=10):
    """
    @Description: rank the distinct column pairs by absolute correlation
    @Param1: df, pandas dataframe
    @Param2: n, number of pairs to keep
    @Return: pandas series indexed by (column, column) pairs, highest absolute correlation first
    """
    columns = df.columns
    # The diagonal and one triangle of the correlation matrix carry no extra
    # information, so those (column, column) labels are removed before ranking.
    redundant = {
        (columns[later], columns[earlier])
        for later in range(df.shape[1])
        for earlier in range(later + 1)
    }
    abs_corr = df.corr().abs().unstack()
    ranked = abs_corr.drop(labels=redundant).sort_values(ascending=False)
    return ranked.iloc[:n]

In [None]:
def adjacent_histogram_boxplot(feature_var, figsize = (7, 5)):
    """
    @Description: draw a histogram (with KDE) on top of a boxplot, both sharing the x axis
    @Param1: feature_var, pandas series (its name becomes the figure title)
    @Param2: figsize, size of the figure
    """
    layout = {'height_ratios': (.85, .15)}
    fig, axes = plt.subplots(nrows=2, sharex=True, gridspec_kw=layout, figsize=figsize)
    ax_hist, ax_box = axes

    sns.distplot(feature_var, kde=True, ax=ax_hist, kde_kws={"linewidth": 1.5})
    sns.boxplot(feature_var, ax=ax_box, linewidth=1, width=0.5)

    # Strip the axis labels and shrink the ticks on both panels
    ax_hist.set_ylabel('')
    for panel in (ax_hist, ax_box):
        panel.set_xlabel('')
        panel.tick_params(labelsize=8)
    fig.suptitle(feature_var.name, fontsize=10)

    # Mean as a solid red line, median as a dashed green line
    markers = ((np.mean, 'red', '-'), (np.median, 'green', '--'))
    for statistic, line_color, line_style in markers:
        ax_hist.axvline(statistic(feature_var), color=line_color, linestyle=line_style, lw=1.5)

In [None]:
def lg_nrmse(gt, preds):
    """
    @Description: Metric used in this project (weighted sum of per-label normalized RMSE)
    @Param1: gt, ground truth with the 14 labels as columns (dataframe or 2-D array-like)
    @Param2: preds, predictions laid out like gt (dataframe or 2-D array-like)
    @Return: nrmse score
    @Raise: ValueError when gt and preds have a different number of rows
    """
    gt = pd.DataFrame(gt)
    preds = pd.DataFrame(preds)
    if gt.shape[0] != preds.shape[0]:
        raise ValueError(
            'gt and preds must have the same number of rows, got {} and {}'.format(
                gt.shape[0], preds.shape[0]))

    # Sum of the NRMSE of every Y feature;
    # Y_01 through Y_08 get an extra 20% weight.
    all_nrmse = []
    for idx in range(0, 14):
        # Work on plain arrays so rows are compared by position, not by index label.
        truth = gt.iloc[:, idx].to_numpy(dtype=float)
        estimate = preds.iloc[:, idx].to_numpy(dtype=float)
        # RMSE is computed with NumPy: the `squared=False` switch of
        # sklearn's mean_squared_error is not available in recent releases.
        col_rmse = np.sqrt(np.mean(np.square(truth - estimate)))
        all_nrmse.append(col_rmse / np.mean(np.abs(truth)))
    score = 1.2 * np.sum(all_nrmse[:8]) + 1.0 * np.sum(all_nrmse[8:])
    return score

In [None]:
def rmse(gt, preds):
    """
    @Description: root mean squared error between ground truth and predictions
    @Param1: gt, ground-truth values (array-like; the mean of the squared errors must be a single number)
    @Param2: preds, predicted values, same length as gt
    @Return: rmse score as a float
    """
    residuals = np.subtract(gt, preds)
    mean_squared = np.square(residuals).mean()
    return math.sqrt(mean_squared)

In [None]:
def dataset_split_X_y(df):
    """
    @Description: split data into features and labels
    @Param: df, pandas dataframe with columns starting with X for features and Y for labels
    @Return: features and labels in pandas dataframes (columns keep their original order)
    """
    # Anchor the patterns at the start of the name: an unanchored 'X' / 'Y'
    # matches the letter anywhere, so a column such as 'MAX_Y' would end up
    # in both the feature table and the label table.
    xs = df.filter(regex='^X')  # Input : X Feature
    ys = df.filter(regex='^Y')  # Output : Y Feature
    return xs, ys

In [None]:
# Mount Google Drive inside the Colab runtime at /content/drive; the CSV files
# read by this notebook are addressed through that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
# Read the training table (features + labels) and the test features from the mounted drive
train_df = pd.read_csv('/content/drive/MyDrive/LG AIMERS/Data/train.csv')
test_x = pd.read_csv('/content/drive/MyDrive/LG AIMERS/Data/test.csv')

# Separate features from labels, then remove the features that are constant
# in the training data from both the train and the test feature tables
train_x, train_y = dataset_split_X_y(train_df)
cols_with_zero_variance = zero_variance(train_x)
train_x = train_x.drop(columns=cols_with_zero_variance)
test_x = test_x.drop(columns=cols_with_zero_variance)

# Label names Y_01 ... Y_14
ys = ['Y_%02d' % number for number in range(1, 15)]
# [lower, upper] pair per label -- presumably parallel to `ys`; confirm where the pairs come from
ys_bounds = [
    [0.2, 2],
    [0.2, 2.1],
    [0.2, 2.1],
    [7, 19],
    [22, 36.5],
    [-19.2, 19],
    [2.4, 4],
    [-29.2, -24],
    [-29.2, -24],
    [-30.6, -20],
    [19.6, 26.6],
    [-29.2, -24],
    [-29.2, -24],
    [-29.2, -24],
]

In [None]:
def forward_selection(data, target, significance_level=0.05):
    """
    @Description: greedy forward feature selection; each round fits one OLS model (with intercept)
                  per remaining column and adds the column whose coefficient has the smallest
                  p-value, until no candidate falls below the significance level
    @Param1: data, pandas dataframe of candidate features
    @Param2: target, pandas series with the label to explain
    @Param3: significance_level, a column is only added while its p-value is below this threshold
    @Return: list of selected column names in the order they were added
    """
    best_features = []
    selected = set()
    while True:
        # Walk the candidates in column order. Taking them from a set made the
        # order (and with it the winner among equal p-values) vary between runs.
        remaining_features = [col for col in data.columns if col not in selected]
        if not remaining_features:
            break
        new_pval = pd.Series(index=remaining_features, dtype='float64')
        for new_column in remaining_features:
            design = sm.add_constant(data[best_features + [new_column]])
            model = sm.OLS(target, design).fit()
            new_pval[new_column] = model.pvalues[new_column]
        min_p_value = new_pval.min()
        # A NaN minimum compares False here as well, which also ends the search
        if min_p_value < significance_level:
            chosen = new_pval.idxmin()
            best_features.append(chosen)
            selected.add(chosen)
            print('Add {0} with p-value {1}'.format(best_features[-1], min_p_value))
        else:
            break
    return best_features

In [None]:
# Forward-select the features for label Y_01; `c` receives the chosen column
# names in the order they were added (the unnumbered `c` belongs to Y_01).
c = forward_selection(train_x, train_y['Y_01'])

  x = pd.concat(x[::order], 1)


Add X_22 with p-value 1.1400284769076106e-50
Add X_19 with p-value 3.6220140027816325e-39
Add X_18 with p-value 6.489774701847175e-42
Add X_52 with p-value 2.9686534765521732e-37
Add X_17 with p-value 7.076051130298434e-24
Add X_45 with p-value 1.435789467650014e-17
Add X_13 with p-value 3.22321133309267e-16
Add X_49 with p-value 5.896808040711008e-14
Add X_51 with p-value 3.3126558338071085e-11
Add X_40 with p-value 3.414439138844369e-08
Add X_46 with p-value 1.6707370005119338e-07
Add X_03 with p-value 5.840649654392577e-06
Add X_06 with p-value 6.433326755967228e-08
Add X_20 with p-value 6.550174368600346e-07
Add X_05 with p-value 2.861701684660636e-06
Add X_27 with p-value 3.051658139025568e-07
Add X_24 with p-value 4.241919612358081e-06
Add X_07 with p-value 0.00014833534850787816
Add X_21 with p-value 0.002885478726137532
Add X_15 with p-value 0.0016617876414716526
Add X_43 with p-value 0.00226969899616824
Add X_42 with p-value 2.3141286024724186e-09
Add X_33 with p-value 0.00796

In [None]:
# Forward-select the features for label Y_02; note the numbering offset: `c1` belongs to Y_02.
c1 = forward_selection(train_x, train_y['Y_02'])

  x = pd.concat(x[::order], 1)


Add X_22 with p-value 2.7258872299128843e-120
Add X_18 with p-value 6.084698083722552e-46
Add X_43 with p-value 1.763126188859107e-31
Add X_17 with p-value 7.455380138339882e-20
Add X_51 with p-value 3.2716512127228855e-17
Add X_42 with p-value 1.1771624877924978e-14
Add X_49 with p-value 3.798346499812959e-09
Add X_13 with p-value 1.341161042626847e-08
Add X_40 with p-value 1.6749578595245583e-07
Add X_05 with p-value 1.9851492448899915e-07
Add X_52 with p-value 2.2137148346356748e-07
Add X_06 with p-value 2.069800168454424e-06
Add X_45 with p-value 2.325197651753006e-05
Add X_27 with p-value 2.911437756533645e-05
Add X_24 with p-value 6.983905784570828e-09
Add X_25 with p-value 8.360488314239118e-06
Add X_56 with p-value 0.00014255608716371424
Add X_09 with p-value 0.00023570197602712885
Add X_19 with p-value 0.0002949490047556248
Add X_20 with p-value 0.00014831636285593083
Add X_46 with p-value 0.0013789830533236832
Add X_16 with p-value 0.03426491073596752
Add X_15 with p-value 0.

In [None]:
# Forward-select the features for label Y_03; note the numbering offset: `c2` belongs to Y_03.
c2 = forward_selection(train_x, train_y['Y_03'])

  x = pd.concat(x[::order], 1)


Add X_18 with p-value 2.610805058403531e-34
Add X_17 with p-value 1.3744846651432461e-42
Add X_43 with p-value 1.060620200461944e-23
Add X_22 with p-value 3.2556397674511835e-22
Add X_19 with p-value 4.884965152395843e-32
Add X_49 with p-value 7.026053703956741e-18
Add X_51 with p-value 1.737752238739539e-16
Add X_05 with p-value 6.555308032950936e-11
Add X_13 with p-value 3.981132904251287e-08
Add X_42 with p-value 4.228078657901075e-08
Add X_24 with p-value 1.3382914170147482e-07
Add X_25 with p-value 1.1543257482310695e-11
Add X_52 with p-value 2.8844731295377286e-06
Add X_06 with p-value 1.5775200696654002e-05
Add X_40 with p-value 0.00011697918559217049
Add X_46 with p-value 0.00016501211587278017
Add X_45 with p-value 0.000462033642117044
Add X_56 with p-value 0.0006740499265707652
Add X_32 with p-value 0.0013218288932816311
Add X_03 with p-value 0.0015692204963771847
Add X_27 with p-value 0.0021111415044620424
Add X_33 with p-value 0.004047917753638909
Add X_01 with p-value 0.00

In [None]:
# Forward-select the features for label Y_04; note the numbering offset: `c3` belongs to Y_04.
c3 = forward_selection(train_x, train_y['Y_04'])

  x = pd.concat(x[::order], 1)


Add X_30 with p-value 3.309248119330999e-58
Add X_21 with p-value 7.919885160133404e-77
Add X_32 with p-value 1.1319962562662815e-17
Add X_16 with p-value 3.136420560690022e-12
Add X_17 with p-value 2.779778861217942e-31
Add X_12 with p-value 1.165362752569123e-12
Add X_19 with p-value 6.9312190951533015e-12
Add X_03 with p-value 2.0508189513191614e-13
Add X_24 with p-value 3.2103930044270073e-13
Add X_54 with p-value 5.050579231742754e-09
Add X_50 with p-value 1.157346567266427e-18
Add X_52 with p-value 6.151597325892205e-10
Add X_56 with p-value 9.57876406297284e-06
Add X_22 with p-value 1.0508171752638482e-05
Add X_09 with p-value 0.00017786599734697219
Add X_25 with p-value 0.00013437898635880853
Add X_06 with p-value 0.00032573608738830604
Add X_40 with p-value 0.0003144440980365648
Add X_44 with p-value 0.0007526130832283015
Add X_42 with p-value 3.2583384287418217e-12
Add X_51 with p-value 0.00043878527725522103
Add X_55 with p-value 0.00017256836013757016
Add X_18 with p-value 

In [None]:
# Forward-select the features for label Y_05; note the numbering offset: `c4` belongs to Y_05.
c4 = forward_selection(train_x, train_y['Y_05'])

  x = pd.concat(x[::order], 1)


Add X_13 with p-value 3.327492761558441e-36
Add X_56 with p-value 3.3182106354189956e-19
Add X_32 with p-value 2.4194768221105086e-10
Add X_21 with p-value 7.940843747711714e-12
Add X_12 with p-value 7.092929103581809e-11
Add X_15 with p-value 7.487080622374793e-11
Add X_18 with p-value 6.606043004116294e-24
Add X_17 with p-value 6.125713797927264e-27
Add X_33 with p-value 5.942885335215171e-10
Add X_29 with p-value 1.928453036018365e-09
Add X_54 with p-value 5.448446260701209e-09
Add X_40 with p-value 2.115763506664737e-08
Add X_50 with p-value 3.5651620358502465e-08
Add X_09 with p-value 1.4710100118265272e-08
Add X_44 with p-value 1.4358224020930618e-06
Add X_49 with p-value 9.809145988674683e-06
Add X_51 with p-value 9.970979004025396e-05
Add X_55 with p-value 3.981094373857387e-06
Add X_01 with p-value 0.00022058888276723225
Add X_30 with p-value 8.302074086710868e-05
Add X_19 with p-value 0.000518788142696434
Add X_52 with p-value 0.002483938505242094
Add X_28 with p-value 0.0028

In [None]:
# Forward-select the features for label Y_06; note the numbering offset: `c5` belongs to Y_06.
c5 = forward_selection(train_x, train_y['Y_06'])

  x = pd.concat(x[::order], 1)


Add X_19 with p-value 7.156269204552303e-37
Add X_30 with p-value 3.718130253477482e-11
Add X_56 with p-value 8.635630693992347e-11
Add X_05 with p-value 3.302805011519708e-07
Add X_06 with p-value 1.555729317738427e-07
Add X_26 with p-value 3.6366201105298813e-06
Add X_49 with p-value 4.1323666142567965e-06
Add X_17 with p-value 2.6095620082458575e-06
Add X_18 with p-value 8.518702387126051e-07
Add X_14 with p-value 1.700514070946896e-10
Add X_16 with p-value 0.0005305494425387348
Add X_44 with p-value 2.36541330998519e-06
Add X_13 with p-value 0.001965028924941161
Add X_43 with p-value 0.001245517213631335
Add X_51 with p-value 0.007103674433167382
Add X_55 with p-value 0.0010864562578339323
Add X_50 with p-value 0.008623813716580088
Add X_09 with p-value 0.00663049124409724
Add X_03 with p-value 0.021976127634274752
Add X_22 with p-value 0.013263347976531024


In [None]:
# Forward-select the features for label Y_07; note the numbering offset: `c6` belongs to Y_07.
c6 = forward_selection(train_x, train_y['Y_07'])

  x = pd.concat(x[::order], 1)


Add X_19 with p-value 7.511834743583558e-77
Add X_13 with p-value 2.1742573828051777e-74
Add X_32 with p-value 2.669686027284441e-12
Add X_21 with p-value 4.331741555330308e-13
Add X_15 with p-value 7.1358013324539e-08
Add X_18 with p-value 3.8507318621243855e-09
Add X_03 with p-value 3.736292553153236e-10
Add X_17 with p-value 4.5719600347693195e-06
Add X_50 with p-value 6.7730997462574516e-06
Add X_54 with p-value 1.4557832055546847e-11
Add X_41 with p-value 5.258823983974556e-05
Add X_24 with p-value 1.0447319375317652e-05
Add X_12 with p-value 1.2920764913781428e-10
Add X_14 with p-value 2.4121547899804517e-05
Add X_38 with p-value 7.499353924792789e-05
Add X_31 with p-value 0.000179255734133822
Add X_09 with p-value 0.001289735154545115
Add X_25 with p-value 0.0024261317541953292
Add X_56 with p-value 0.0034515801527796076
Add X_52 with p-value 0.0005891483175484549
Add X_06 with p-value 0.006347743145948686
Add X_05 with p-value 0.0021071475135274707
Add X_22 with p-value 0.00112

In [None]:
# Forward-select the features for label Y_08; note the numbering offset: `c7` belongs to Y_08.
c7 = forward_selection(train_x, train_y['Y_08'])

  x = pd.concat(x[::order], 1)


Add X_20 with p-value 8.064405592255072e-194
Add X_32 with p-value 8.563319852298222e-79
Add X_16 with p-value 1.2927186662628716e-48
Add X_30 with p-value 2.1442073108204747e-28
Add X_03 with p-value 1.893882538301511e-27
Add X_42 with p-value 2.711263393515821e-21
Add X_22 with p-value 7.427615271891987e-16
Add X_52 with p-value 1.0831695267944014e-11
Add X_40 with p-value 3.1797513812015806e-10
Add X_17 with p-value 7.664905914715629e-07
Add X_18 with p-value 3.067278435605502e-08
Add X_15 with p-value 1.2485455412677663e-21
Add X_10 with p-value 1.9500784154946596e-05
Add X_51 with p-value 2.9088043434401783e-05
Add X_55 with p-value 2.603566592769475e-06
Add X_09 with p-value 2.030702678306937e-05
Add X_27 with p-value 8.596111512871953e-05
Add X_56 with p-value 0.000233379026175985
Add X_41 with p-value 0.0004410965988038377
Add X_14 with p-value 0.00021282886528026912
Add X_54 with p-value 0.001294451748949147
Add X_53 with p-value 0.0037817255024213286
Add X_35 with p-value 0.0

In [None]:
# Forward-select the features for label Y_09; note the numbering offset: `c8` belongs to Y_09.
c8 = forward_selection(train_x, train_y['Y_09'])

  x = pd.concat(x[::order], 1)


Add X_20 with p-value 3.303745859012631e-170
Add X_30 with p-value 4.874625725504182e-78
Add X_16 with p-value 1.4962477028088325e-49
Add X_42 with p-value 1.684232100758233e-23
Add X_03 with p-value 9.921925794998253e-23
Add X_32 with p-value 2.2528372272585126e-17
Add X_22 with p-value 1.1050554486631773e-13
Add X_52 with p-value 5.052290386776377e-11
Add X_40 with p-value 1.5091485330771489e-09
Add X_17 with p-value 5.308882792544909e-09
Add X_18 with p-value 1.9567731417759238e-07
Add X_15 with p-value 2.0270531076463994e-17
Add X_51 with p-value 3.335840245783594e-05
Add X_55 with p-value 2.077603411053329e-05
Add X_09 with p-value 3.3201466138828304e-05
Add X_12 with p-value 8.379216456091157e-05
Add X_41 with p-value 8.320484838407599e-05
Add X_10 with p-value 0.0001443719382128691
Add X_56 with p-value 0.0006158415765597566
Add X_54 with p-value 0.0008239545236230096
Add X_14 with p-value 0.0019689890510152276
Add X_53 with p-value 0.004245497458822195
Add X_35 with p-value 0.0

In [None]:
# Forward-select the features for label Y_10; note the numbering offset: `c9` belongs to Y_10.
c9 = forward_selection(train_x, train_y['Y_10'])

  x = pd.concat(x[::order], 1)


Add X_18 with p-value 3.8925528205858604e-182
Add X_15 with p-value 1.4198721665663633e-122
Add X_22 with p-value 4.364231804724337e-55
Add X_42 with p-value 1.0796848582778564e-29
Add X_30 with p-value 5.2887456600769236e-30
Add X_56 with p-value 1.1018784303128356e-27
Add X_49 with p-value 6.532797057515857e-22
Add X_03 with p-value 3.3594763727207626e-17
Add X_09 with p-value 1.2898947251242368e-13
Add X_51 with p-value 2.342955154320033e-09
Add X_16 with p-value 1.532651803690288e-08
Add X_27 with p-value 2.6763341806401787e-09
Add X_24 with p-value 1.7239669339023892e-06
Add X_41 with p-value 5.324899121513288e-07
Add X_32 with p-value 4.850007247287381e-06
Add X_45 with p-value 4.9677751559296245e-06
Add X_17 with p-value 2.0789465612074192e-06
Add X_21 with p-value 4.4821344969079156e-07
Add X_01 with p-value 0.00022321460584083492
Add X_40 with p-value 0.0002218914346341664
Add X_55 with p-value 0.0006914188581099249
Add X_52 with p-value 4.160778631294178e-06
Add X_54 with p-v

In [None]:
# Forward-select the features for label Y_11; note the numbering offset: `c10` belongs to Y_11.
c10 = forward_selection(train_x, train_y['Y_11'])

  x = pd.concat(x[::order], 1)


Add X_17 with p-value 5.776319495476359e-37
Add X_18 with p-value 6.053232562189295e-27
Add X_15 with p-value 2.4434062940996466e-20
Add X_30 with p-value 4.593153644019237e-13
Add X_51 with p-value 2.3394098590191522e-11
Add X_42 with p-value 4.3243669824949055e-08
Add X_03 with p-value 2.3206823242211729e-07
Add X_01 with p-value 2.149164389617397e-08
Add X_56 with p-value 4.954434116832316e-06
Add X_05 with p-value 1.8039203837241304e-05
Add X_54 with p-value 2.029677776249466e-05
Add X_31 with p-value 1.9047910835143056e-05
Add X_21 with p-value 3.269282194449437e-05
Add X_52 with p-value 0.00021662496200102534
Add X_55 with p-value 0.00018156231585695322
Add X_10 with p-value 0.0005850880540251812
Add X_50 with p-value 0.0008617107781659628
Add X_32 with p-value 0.0062435994203327875
Add X_09 with p-value 0.006899521673595579
Add X_33 with p-value 0.013401104316046533
Add X_35 with p-value 0.021727475862918573


In [None]:
# Forward-select the features for label Y_12; note the numbering offset: `c11` belongs to Y_12.
c11 = forward_selection(train_x, train_y['Y_12'])

  x = pd.concat(x[::order], 1)


Add X_20 with p-value 4.8166100304271086e-175
Add X_30 with p-value 4.545344554335348e-77
Add X_16 with p-value 1.1898696204864055e-52
Add X_42 with p-value 1.3927604726821786e-22
Add X_03 with p-value 9.958468648592394e-25
Add X_32 with p-value 4.8960703724866806e-17
Add X_22 with p-value 6.94541354385559e-15
Add X_56 with p-value 2.6427436969182496e-12
Add X_40 with p-value 2.8758983062441156e-10
Add X_17 with p-value 9.218429627597743e-07
Add X_18 with p-value 4.56601334807422e-07
Add X_15 with p-value 2.770717210756484e-20
Add X_51 with p-value 5.926487702045288e-06
Add X_09 with p-value 6.7792010238484e-05
Add X_13 with p-value 8.34720112700803e-05
Add X_10 with p-value 9.230413373510234e-05
Add X_41 with p-value 0.0003607118938552539
Add X_55 with p-value 0.0006850787007824842
Add X_52 with p-value 4.0888387292711004e-06
Add X_12 with p-value 0.0012538497284008787
Add X_54 with p-value 0.0021603422466656766
Add X_53 with p-value 0.008297108374959138
Add X_35 with p-value 0.020172

In [None]:
# Forward-select the features for label Y_13; note the numbering offset: `c12` belongs to Y_13.
c12 = forward_selection(train_x, train_y['Y_13'])

  x = pd.concat(x[::order], 1)


Add X_20 with p-value 1.2216548512567081e-178
Add X_30 with p-value 2.2760998953825906e-78
Add X_16 with p-value 2.382101713266921e-53
Add X_42 with p-value 1.4051959968409598e-22
Add X_03 with p-value 1.1732269981454084e-24
Add X_32 with p-value 8.569953126546789e-18
Add X_22 with p-value 2.299935960732824e-15
Add X_52 with p-value 9.060189608887956e-12
Add X_40 with p-value 7.865879627931812e-10
Add X_17 with p-value 3.31012332737539e-07
Add X_18 with p-value 7.759526051153441e-08
Add X_15 with p-value 1.1173570812998269e-19
Add X_51 with p-value 7.729434641768872e-06
Add X_09 with p-value 1.1449321255542603e-05
Add X_55 with p-value 2.4903269298456495e-05
Add X_10 with p-value 3.6685211765929355e-05
Add X_56 with p-value 0.00028965258426459195
Add X_41 with p-value 0.00034415160865322364
Add X_14 with p-value 0.00043099814811720924
Add X_54 with p-value 0.0025059441528852457
Add X_12 with p-value 0.005135929378935063
Add X_53 with p-value 0.006411858306363702
Add X_35 with p-value 0

In [None]:
# Forward-select the features for label Y_14; note the numbering offset: `c13` belongs to Y_14.
c13 = forward_selection(train_x, train_y['Y_14'])

  x = pd.concat(x[::order], 1)


Add X_20 with p-value 8.140712496571332e-170
Add X_32 with p-value 2.331948794902241e-78
Add X_16 with p-value 2.8095620135471246e-44
Add X_30 with p-value 7.967418552631219e-30
Add X_03 with p-value 1.231350962079994e-21
Add X_42 with p-value 1.911586696221342e-19
Add X_22 with p-value 3.552519056754831e-14
Add X_52 with p-value 2.1772394090481097e-13
Add X_40 with p-value 2.630642386333253e-08
Add X_17 with p-value 3.34854299365015e-08
Add X_18 with p-value 5.117122829988325e-08
Add X_15 with p-value 6.501710357167612e-21
Add X_51 with p-value 1.7734821832969167e-06
Add X_55 with p-value 2.766721961880589e-05
Add X_09 with p-value 1.7349762726790312e-05
Add X_27 with p-value 0.00018570730104244655
Add X_10 with p-value 0.00031955451637124974
Add X_56 with p-value 0.000629458779571061
Add X_54 with p-value 0.0005193697261166049
Add X_41 with p-value 0.0009083785638562541
Add X_13 with p-value 0.0017106768647488168
Add X_53 with p-value 0.0031177199335901138
Add X_35 with p-value 0.014

### Train

In [None]:
# Keep only the columns forward-selected for Y_01 (list `c`) in both feature
# tables, and pick the matching label column.
train_x_for_y01 = train_x[c]
test_x_for_y01 = test_x[c]
train_y_01 = train_y['Y_01']

In [None]:
def objective(params):
    """
    @Description: hyperopt objective for label Y_01; scores an LGBMRegressor built from the sampled
                  hyper-parameters with 10-fold cross-validation on the module-level
                  train_x_for_y01 / train_y_01
    @Param: params, dict with one sampled value per key of the search space
    @Return: mean RMSE over the folds (lower is better); the loss and parameters are also printed
    """
    integer_keys = ('n_estimators', 'max_depth', 'num_leaves', 'min_child_samples')
    float_keys = ('colsample_bytree', 'subsample', 'min_split_gain',
                  'scale_pos_weight', 'reg_alpha', 'reg_lambda', 'learning_rate')

    # Sampled values are cast to int where an integer is needed. The remaining
    # values are rounded to three decimals but kept as floats: formatting them
    # into strings would hand LightGBM text where it expects numbers.
    params = {
        **{key: int(params[key]) for key in integer_keys},
        **{key: round(float(params[key]), 3) for key in float_keys},
    }

    model = LGBMRegressor(
        n_jobs = -1,
        random_state = 42,
        **params
    )

    # greater_is_better=False makes the scorer return -rmse, so the mean is
    # negated once more to obtain the positive loss that fmin minimizes.
    loss = -cross_val_score(model, train_x_for_y01, train_y_01, cv=10, scoring=make_scorer(rmse, greater_is_better=False)).mean()
    print("RMSE Loss {:.3f} params {}".format(loss, params))
    return loss

In [None]:
# Search space: the quniform entries are cast to int inside `objective`,
# the others are sampled as floats (learning_rate on a log scale).
space = dict(
    n_estimators=hp.quniform('n_estimators', 100, 1500, 50),
    max_depth=hp.quniform('max_depth', 3, 100, 1),
    num_leaves=hp.quniform('num_leaves', 20, 100, 10),
    min_child_samples=hp.quniform('min_child_samples', 10, 300, 10),
    colsample_bytree=hp.uniform('colsample_bytree', 0.3, 1.0),
    subsample=hp.uniform('subsample', 0.3, 1.0),
    min_split_gain=hp.uniform('min_split_gain', 0, 0.7),
    scale_pos_weight=hp.uniform('scale_pos_weight', 1, 10),
    reg_alpha=hp.uniform('reg_alpha', 0, 100),
    reg_lambda=hp.uniform('reg_lambda', 0, 100),
    learning_rate=hp.loguniform('learning_rate', np.log(0.01), np.log(0.5)),
)

# 30 TPE evaluations of whichever `objective` is currently defined.
# NOTE(review): no rstate is passed, so the trials are presumably not reproducible between runs.
best_y01 = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=30)

RMSE Loss 0.347 params {'n_estimators': 1350, 'max_depth': 46, 'num_leaves': 90, 'min_child_samples': 140, 'colsample_bytree': '0.383', 'subsample': '0.698', 'min_split_gain': '0.194', 'scale_pos_weight': '7.440', 'reg_alpha': '9.528', 'reg_lambda': '33.366', 'learning_rate': '0.040'}
RMSE Loss 0.349 params {'n_estimators': 850, 'max_depth': 31, 'num_leaves': 90, 'min_child_samples': 80, 'colsample_bytree': '0.417', 'subsample': '0.978', 'min_split_gain': '0.488', 'scale_pos_weight': '6.130', 'reg_alpha': '81.456', 'reg_lambda': '12.488', 'learning_rate': '0.079'}
RMSE Loss 0.349 params {'n_estimators': 250, 'max_depth': 54, 'num_leaves': 90, 'min_child_samples': 50, 'colsample_bytree': '0.681', 'subsample': '0.815', 'min_split_gain': '0.157', 'scale_pos_weight': '2.894', 'reg_alpha': '93.852', 'reg_lambda': '33.720', 'learning_rate': '0.347'}
RMSE Loss 0.349 params {'n_estimators': 550, 'max_depth': 61, 'num_leaves': 90, 'min_child_samples': 270, 'colsample_bytree': '0.764', 'subsampl

In [None]:
import hyperopt

# Search space: the quniform entries are cast to int inside `objective`,
# the others are sampled as floats (learning_rate on a log scale).
space = {
    'n_estimators' : hp.quniform('n_estimators', 100, 1500, 50),
    'max_depth': hp.quniform('max_depth', 3, 100, 1),
    'num_leaves': hp.quniform('num_leaves', 20, 100, 10),
    'min_child_samples': hp.quniform('min_child_samples', 10, 300, 10),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.3, 1.0),
    'subsample': hp.uniform('subsample', 0.3, 1.0),
    'min_split_gain': hp.uniform('min_split_gain', 0, 0.7),
    'scale_pos_weight': hp.uniform('scale_pos_weight', 1, 10),
    'reg_alpha': hp.uniform('reg_alpha', 0, 100),
    'reg_lambda': hp.uniform('reg_lambda', 0, 100),
    'learning_rate': hp.loguniform('learning_rate', np.log(0.01), np.log(0.5)),
}

# Passing np.random.default_rng(...) unconditionally failed with an
# AttributeError before the first trial. hyperopt changed the type it expects
# for `rstate` (Generator from release 0.2.7 on, RandomState before), so the
# matching generator is chosen from the installed version.
_hyperopt_release = tuple(
    int(piece) for piece in hyperopt.__version__.split('.')[:3] if piece.isdigit()
)
if _hyperopt_release >= (0, 2, 7):
    rstate = np.random.default_rng(123)
else:
    rstate = np.random.RandomState(123)

# 30 seeded TPE evaluations of whichever `objective` is currently defined
best_y01 = fmin(fn = objective,
            space = space,
            algo = tpe.suggest,
            max_evals = 30,
            rstate = rstate
           )

  0%|          | 0/30 [00:00<?, ?it/s, best loss: ?]


AttributeError: ignored

In [None]:
# Inspect what fmin returned: the attribute listing in the cell output is that of a dict.
dir(best_y01)

['__class__',
 '__contains__',
 '__delattr__',
 '__delitem__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setitem__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 'clear',
 'copy',
 'fromkeys',
 'get',
 'items',
 'keys',
 'pop',
 'popitem',
 'setdefault',
 'update',
 'values']

In [None]:
# NOTE(review): this call raises ValueError (see the cell output). lg_nrmse expects
# tables of ground-truth and predicted labels, but it is given a scalar loss and
# the dict returned by fmin.
lg_nrmse(0.34624102053756733, best_y01)

ValueError: ignored

In [None]:
# Keep only the columns forward-selected for Y_02 (list `c1`) in both feature
# tables, and pick the matching label column.
train_x_for_y02 = train_x[c1]
test_x_for_y02 = test_x[c1]
train_y_02 = train_y['Y_02']

In [None]:
def objective(params):
    """
    @Description: hyperopt objective for label Y_02; scores an LGBMRegressor built from the sampled
                  hyper-parameters with 10-fold cross-validation on the module-level
                  train_x_for_y02 / train_y_02
    @Param: params, dict with one sampled value per key of the search space
    @Return: mean RMSE over the folds (lower is better); the loss and parameters are also printed
    """
    integer_keys = ('n_estimators', 'max_depth', 'num_leaves', 'min_child_samples')
    float_keys = ('colsample_bytree', 'subsample', 'min_split_gain',
                  'scale_pos_weight', 'reg_alpha', 'reg_lambda', 'learning_rate')

    # Sampled values are cast to int where an integer is needed. The remaining
    # values are rounded to three decimals but kept as floats: formatting them
    # into strings would hand LightGBM text where it expects numbers.
    params = {
        **{key: int(params[key]) for key in integer_keys},
        **{key: round(float(params[key]), 3) for key in float_keys},
    }

    model = LGBMRegressor(
        n_jobs = -1,
        random_state = 42,
        **params
    )

    # greater_is_better=False makes the scorer return -rmse, so the mean is
    # negated once more to obtain the positive loss that fmin minimizes.
    loss = -cross_val_score(model, train_x_for_y02, train_y_02, cv=10, scoring=make_scorer(rmse, greater_is_better=False)).mean()
    print("RMSE Loss {:.3f} params {}".format(loss, params))
    return loss

In [None]:
# Search space: the quniform entries are cast to int inside `objective`,
# the others are sampled as floats (learning_rate on a log scale).
space = dict(
    n_estimators=hp.quniform('n_estimators', 100, 1500, 50),
    max_depth=hp.quniform('max_depth', 3, 100, 1),
    num_leaves=hp.quniform('num_leaves', 20, 100, 10),
    min_child_samples=hp.quniform('min_child_samples', 10, 300, 10),
    colsample_bytree=hp.uniform('colsample_bytree', 0.3, 1.0),
    subsample=hp.uniform('subsample', 0.3, 1.0),
    min_split_gain=hp.uniform('min_split_gain', 0, 0.7),
    scale_pos_weight=hp.uniform('scale_pos_weight', 1, 10),
    reg_alpha=hp.uniform('reg_alpha', 0, 100),
    reg_lambda=hp.uniform('reg_lambda', 0, 100),
    learning_rate=hp.loguniform('learning_rate', np.log(0.01), np.log(0.5)),
)

# 30 TPE evaluations of whichever `objective` is currently defined.
# NOTE(review): no rstate is passed, so the trials are presumably not reproducible between runs.
best_y02 = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=30)

RMSE Loss 0.379 params {'n_estimators': 1100, 'max_depth': 97, 'num_leaves': 40, 'min_child_samples': 120, 'colsample_bytree': '0.398', 'subsample': '0.395', 'min_split_gain': '0.479', 'scale_pos_weight': '7.486', 'reg_alpha': '41.489', 'reg_lambda': '83.824', 'learning_rate': '0.202'}
RMSE Loss 0.379 params {'n_estimators': 1100, 'max_depth': 46, 'num_leaves': 60, 'min_child_samples': 200, 'colsample_bytree': '0.712', 'subsample': '0.564', 'min_split_gain': '0.143', 'scale_pos_weight': '7.577', 'reg_alpha': '92.996', 'reg_lambda': '48.819', 'learning_rate': '0.100'}
RMSE Loss 0.379 params {'n_estimators': 250, 'max_depth': 48, 'num_leaves': 40, 'min_child_samples': 40, 'colsample_bytree': '0.956', 'subsample': '0.959', 'min_split_gain': '0.379', 'scale_pos_weight': '4.849', 'reg_alpha': '72.191', 'reg_lambda': '73.354', 'learning_rate': '0.323'}
RMSE Loss 0.380 params {'n_estimators': 150, 'max_depth': 80, 'num_leaves': 70, 'min_child_samples': 280, 'colsample_bytree': '0.535', 'subsa

In [None]:
# Inspect what fmin returned: the attribute listing in the cell output is that of a dict.
dir(best_y02)

['__class__',
 '__contains__',
 '__delattr__',
 '__delitem__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setitem__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 'clear',
 'copy',
 'fromkeys',
 'get',
 'items',
 'keys',
 'pop',
 'popitem',
 'setdefault',
 'update',
 'values']

In [None]:
# Keep only the columns forward-selected for Y_03 (list `c2`) in both feature
# tables, and pick the matching label column.
train_x_for_y03 = train_x[c2]
test_x_for_y03 = test_x[c2]
train_y_03 = train_y['Y_03']

In [None]:
def objective(params):
    """
    @Description: hyperopt objective for label Y_03; scores an LGBMRegressor built from the sampled
                  hyper-parameters with 10-fold cross-validation on the module-level
                  train_x_for_y03 / train_y_03
    @Param: params, dict with one sampled value per key of the search space
    @Return: mean RMSE over the folds (lower is better); the loss and parameters are also printed
    """
    integer_keys = ('n_estimators', 'max_depth', 'num_leaves', 'min_child_samples')
    float_keys = ('colsample_bytree', 'subsample', 'min_split_gain',
                  'scale_pos_weight', 'reg_alpha', 'reg_lambda', 'learning_rate')

    # Sampled values are cast to int where an integer is needed. The remaining
    # values are rounded to three decimals but kept as floats: formatting them
    # into strings would hand LightGBM text where it expects numbers.
    params = {
        **{key: int(params[key]) for key in integer_keys},
        **{key: round(float(params[key]), 3) for key in float_keys},
    }

    model = LGBMRegressor(
        n_jobs = -1,
        random_state = 42,
        **params
    )

    # greater_is_better=False makes the scorer return -rmse, so the mean is
    # negated once more to obtain the positive loss that fmin minimizes.
    loss = -cross_val_score(model, train_x_for_y03, train_y_03, cv=10, scoring=make_scorer(rmse, greater_is_better=False)).mean()
    print("RMSE Loss {:.3f} params {}".format(loss, params))
    return loss

In [None]:
# Search space: the quniform entries are cast to int inside `objective`,
# the others are sampled as floats (learning_rate on a log scale).
space = dict(
    n_estimators=hp.quniform('n_estimators', 100, 1500, 50),
    max_depth=hp.quniform('max_depth', 3, 100, 1),
    num_leaves=hp.quniform('num_leaves', 20, 100, 10),
    min_child_samples=hp.quniform('min_child_samples', 10, 300, 10),
    colsample_bytree=hp.uniform('colsample_bytree', 0.3, 1.0),
    subsample=hp.uniform('subsample', 0.3, 1.0),
    min_split_gain=hp.uniform('min_split_gain', 0, 0.7),
    scale_pos_weight=hp.uniform('scale_pos_weight', 1, 10),
    reg_alpha=hp.uniform('reg_alpha', 0, 100),
    reg_lambda=hp.uniform('reg_lambda', 0, 100),
    learning_rate=hp.loguniform('learning_rate', np.log(0.01), np.log(0.5)),
)

# 30 TPE evaluations of whichever `objective` is currently defined.
# NOTE(review): no rstate is passed, so the trials are presumably not reproducible between runs.
best_y03 = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=30)

RMSE Loss 0.356 params {'n_estimators': 1500, 'max_depth': 16, 'num_leaves': 50, 'min_child_samples': 250, 'colsample_bytree': '0.868', 'subsample': '0.613', 'min_split_gain': '0.275', 'scale_pos_weight': '5.163', 'reg_alpha': '59.031', 'reg_lambda': '55.820', 'learning_rate': '0.119'}
RMSE Loss 0.356 params {'n_estimators': 400, 'max_depth': 89, 'num_leaves': 70, 'min_child_samples': 130, 'colsample_bytree': '0.693', 'subsample': '0.557', 'min_split_gain': '0.623', 'scale_pos_weight': '2.249', 'reg_alpha': '39.996', 'reg_lambda': '39.934', 'learning_rate': '0.022'}
RMSE Loss 0.356 params {'n_estimators': 350, 'max_depth': 33, 'num_leaves': 40, 'min_child_samples': 240, 'colsample_bytree': '0.577', 'subsample': '0.455', 'min_split_gain': '0.312', 'scale_pos_weight': '8.082', 'reg_alpha': '40.373', 'reg_lambda': '41.326', 'learning_rate': '0.263'}
RMSE Loss 0.355 params {'n_estimators': 850, 'max_depth': 59, 'num_leaves': 70, 'min_child_samples': 280, 'colsample_bytree': '0.817', 'subsa

In [None]:
# Keep only the columns forward-selected for Y_04 (list `c3`) in both feature
# tables, and pick the matching label column.
train_x_for_y04 = train_x[c3]
test_x_for_y04 = test_x[c3]
train_y_04 = train_y['Y_04']

In [None]:
def objective(params):
    """
    @Description: hyperopt objective, scores one LightGBM configuration on the Y_04 training data
    @Param: params, dict of sampled hyperparameters keyed as in the search space
    @Return: mean RMSE over 10 cross-validation folds (lower is better)
    """
    int_keys = ('n_estimators', 'max_depth', 'num_leaves', 'min_child_samples')
    float_keys = ('colsample_bytree', 'subsample', 'min_split_gain', 'scale_pos_weight',
                  'reg_alpha', 'reg_lambda', 'learning_rate')

    # Cast to real numeric types: ints for the quantised draws, floats rounded to
    # 3 decimals for the rest, so the model never receives numbers as strings.
    params = {
        **{key: int(params[key]) for key in int_keys},
        **{key: round(float(params[key]), 3) for key in float_keys},
    }
    # NOTE(review): LightGBM documents scale_pos_weight for binary/multiclassova only and
    # applies subsample only when subsample_freq > 0 - confirm both are worth tuning here.

    model = LGBMRegressor(n_jobs=-1, random_state=42, **params)

    # make_scorer(..., greater_is_better=False) flips the sign of rmse, so the
    # mean fold score is negated to get a positive loss back.
    scorer = make_scorer(rmse, greater_is_better=False)
    loss = -cross_val_score(model, train_x_for_y04, train_y_04, cv=10, scoring=scorer).mean()
    print("RMSE Loss {:.3f} params {}".format(loss, params))
    return loss

In [None]:
# Hyperopt search space for the LightGBM regressor; every label matches its key.
space = dict(
    # quniform draws are quantised to the given step; objective() casts them to int.
    n_estimators=hp.quniform('n_estimators', 100, 1500, 50),
    max_depth=hp.quniform('max_depth', 3, 100, 1),
    num_leaves=hp.quniform('num_leaves', 20, 100, 10),
    min_child_samples=hp.quniform('min_child_samples', 10, 300, 10),
    colsample_bytree=hp.uniform('colsample_bytree', 0.3, 1.0),
    subsample=hp.uniform('subsample', 0.3, 1.0),
    min_split_gain=hp.uniform('min_split_gain', 0, 0.7),
    scale_pos_weight=hp.uniform('scale_pos_weight', 1, 10),
    reg_alpha=hp.uniform('reg_alpha', 0, 100),
    reg_lambda=hp.uniform('reg_lambda', 0, 100),
    # Sampled on a log scale between 0.01 and 0.5.
    learning_rate=hp.loguniform('learning_rate', np.log(0.01), np.log(0.5)),
)

# 30 TPE trials for Y_04; no rstate is passed, so the search is not seeded.
best_y04 = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=30)

RMSE Loss 2.607 params {'n_estimators': 1150, 'max_depth': 25, 'num_leaves': 40, 'min_child_samples': 40, 'colsample_bytree': '0.375', 'subsample': '0.369', 'min_split_gain': '0.217', 'scale_pos_weight': '6.767', 'reg_alpha': '59.174', 'reg_lambda': '12.459', 'learning_rate': '0.015'}
RMSE Loss 2.621 params {'n_estimators': 950, 'max_depth': 86, 'num_leaves': 40, 'min_child_samples': 100, 'colsample_bytree': '0.993', 'subsample': '0.933', 'min_split_gain': '0.511', 'scale_pos_weight': '2.980', 'reg_alpha': '80.090', 'reg_lambda': '32.495', 'learning_rate': '0.219'}
RMSE Loss 2.631 params {'n_estimators': 1250, 'max_depth': 27, 'num_leaves': 80, 'min_child_samples': 190, 'colsample_bytree': '0.486', 'subsample': '0.731', 'min_split_gain': '0.290', 'scale_pos_weight': '8.847', 'reg_alpha': '91.349', 'reg_lambda': '69.108', 'learning_rate': '0.331'}
RMSE Loss 2.609 params {'n_estimators': 500, 'max_depth': 8, 'num_leaves': 40, 'min_child_samples': 210, 'colsample_bytree': '0.707', 'subsam

In [None]:
# Restrict both feature frames to the column selection c4 (defined outside this
# cell) and pick the Y_05 target column.
train_x_for_y05 = train_x[c4]
test_x_for_y05 = test_x[c4]
train_y_05 = train_y['Y_05']

In [None]:
def objective(params):
    """
    @Description: hyperopt objective, scores one LightGBM configuration on the Y_05 training data
    @Param: params, dict of sampled hyperparameters keyed as in the search space
    @Return: mean RMSE over 10 cross-validation folds (lower is better)
    """
    int_keys = ('n_estimators', 'max_depth', 'num_leaves', 'min_child_samples')
    float_keys = ('colsample_bytree', 'subsample', 'min_split_gain', 'scale_pos_weight',
                  'reg_alpha', 'reg_lambda', 'learning_rate')

    # Cast to real numeric types: ints for the quantised draws, floats rounded to
    # 3 decimals for the rest, so the model never receives numbers as strings.
    params = {
        **{key: int(params[key]) for key in int_keys},
        **{key: round(float(params[key]), 3) for key in float_keys},
    }
    # NOTE(review): LightGBM documents scale_pos_weight for binary/multiclassova only and
    # applies subsample only when subsample_freq > 0 - confirm both are worth tuning here.

    model = LGBMRegressor(n_jobs=-1, random_state=42, **params)

    # make_scorer(..., greater_is_better=False) flips the sign of rmse, so the
    # mean fold score is negated to get a positive loss back.
    scorer = make_scorer(rmse, greater_is_better=False)
    loss = -cross_val_score(model, train_x_for_y05, train_y_05, cv=10, scoring=scorer).mean()
    print("RMSE Loss {:.3f} params {}".format(loss, params))
    return loss

In [None]:
# Hyperopt search space for the LightGBM regressor; every label matches its key.
space = dict(
    # quniform draws are quantised to the given step; objective() casts them to int.
    n_estimators=hp.quniform('n_estimators', 100, 1500, 50),
    max_depth=hp.quniform('max_depth', 3, 100, 1),
    num_leaves=hp.quniform('num_leaves', 20, 100, 10),
    min_child_samples=hp.quniform('min_child_samples', 10, 300, 10),
    colsample_bytree=hp.uniform('colsample_bytree', 0.3, 1.0),
    subsample=hp.uniform('subsample', 0.3, 1.0),
    min_split_gain=hp.uniform('min_split_gain', 0, 0.7),
    scale_pos_weight=hp.uniform('scale_pos_weight', 1, 10),
    reg_alpha=hp.uniform('reg_alpha', 0, 100),
    reg_lambda=hp.uniform('reg_lambda', 0, 100),
    # Sampled on a log scale between 0.01 and 0.5.
    learning_rate=hp.loguniform('learning_rate', np.log(0.01), np.log(0.5)),
)

# 30 TPE trials for Y_05; no rstate is passed, so the search is not seeded.
best_y05 = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=30)

In [None]:
# Restrict both feature frames to the column selection c5 (defined outside this
# cell) and pick the Y_06 target column.
train_x_for_y06 = train_x[c5]
test_x_for_y06 = test_x[c5]
train_y_06 = train_y['Y_06']

In [None]:
def objective(params):
    """
    @Description: hyperopt objective, scores one LightGBM configuration on the Y_06 training data
    @Param: params, dict of sampled hyperparameters keyed as in the search space
    @Return: mean RMSE over 10 cross-validation folds (lower is better)
    """
    int_keys = ('n_estimators', 'max_depth', 'num_leaves', 'min_child_samples')
    float_keys = ('colsample_bytree', 'subsample', 'min_split_gain', 'scale_pos_weight',
                  'reg_alpha', 'reg_lambda', 'learning_rate')

    # Cast to real numeric types: ints for the quantised draws, floats rounded to
    # 3 decimals for the rest, so the model never receives numbers as strings.
    params = {
        **{key: int(params[key]) for key in int_keys},
        **{key: round(float(params[key]), 3) for key in float_keys},
    }
    # NOTE(review): LightGBM documents scale_pos_weight for binary/multiclassova only and
    # applies subsample only when subsample_freq > 0 - confirm both are worth tuning here.

    model = LGBMRegressor(n_jobs=-1, random_state=42, **params)

    # make_scorer(..., greater_is_better=False) flips the sign of rmse, so the
    # mean fold score is negated to get a positive loss back.
    scorer = make_scorer(rmse, greater_is_better=False)
    loss = -cross_val_score(model, train_x_for_y06, train_y_06, cv=10, scoring=scorer).mean()
    print("RMSE Loss {:.3f} params {}".format(loss, params))
    return loss

In [None]:
# Hyperopt search space for the LightGBM regressor; every label matches its key.
space = dict(
    # quniform draws are quantised to the given step; objective() casts them to int.
    n_estimators=hp.quniform('n_estimators', 100, 1500, 50),
    max_depth=hp.quniform('max_depth', 3, 100, 1),
    num_leaves=hp.quniform('num_leaves', 20, 100, 10),
    min_child_samples=hp.quniform('min_child_samples', 10, 300, 10),
    colsample_bytree=hp.uniform('colsample_bytree', 0.3, 1.0),
    subsample=hp.uniform('subsample', 0.3, 1.0),
    min_split_gain=hp.uniform('min_split_gain', 0, 0.7),
    scale_pos_weight=hp.uniform('scale_pos_weight', 1, 10),
    reg_alpha=hp.uniform('reg_alpha', 0, 100),
    reg_lambda=hp.uniform('reg_lambda', 0, 100),
    # Sampled on a log scale between 0.01 and 0.5.
    learning_rate=hp.loguniform('learning_rate', np.log(0.01), np.log(0.5)),
)

# 30 TPE trials for Y_06; no rstate is passed, so the search is not seeded.
best_y06 = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=30)

In [None]:
# Restrict both feature frames to the column selection c6 (defined outside this
# cell) and pick the Y_07 target column.
train_x_for_y07 = train_x[c6]
test_x_for_y07 = test_x[c6]
train_y_07 = train_y['Y_07']

In [None]:
def objective(params):
    """
    @Description: hyperopt objective, scores one LightGBM configuration on the Y_07 training data
    @Param: params, dict of sampled hyperparameters keyed as in the search space
    @Return: mean RMSE over 10 cross-validation folds (lower is better)
    """
    int_keys = ('n_estimators', 'max_depth', 'num_leaves', 'min_child_samples')
    float_keys = ('colsample_bytree', 'subsample', 'min_split_gain', 'scale_pos_weight',
                  'reg_alpha', 'reg_lambda', 'learning_rate')

    # Cast to real numeric types: ints for the quantised draws, floats rounded to
    # 3 decimals for the rest, so the model never receives numbers as strings.
    params = {
        **{key: int(params[key]) for key in int_keys},
        **{key: round(float(params[key]), 3) for key in float_keys},
    }
    # NOTE(review): LightGBM documents scale_pos_weight for binary/multiclassova only and
    # applies subsample only when subsample_freq > 0 - confirm both are worth tuning here.

    model = LGBMRegressor(n_jobs=-1, random_state=42, **params)

    # make_scorer(..., greater_is_better=False) flips the sign of rmse, so the
    # mean fold score is negated to get a positive loss back.
    scorer = make_scorer(rmse, greater_is_better=False)
    loss = -cross_val_score(model, train_x_for_y07, train_y_07, cv=10, scoring=scorer).mean()
    print("RMSE Loss {:.3f} params {}".format(loss, params))
    return loss

In [None]:
# Hyperopt search space for the LightGBM regressor; every label matches its key.
space = dict(
    # quniform draws are quantised to the given step; objective() casts them to int.
    n_estimators=hp.quniform('n_estimators', 100, 1500, 50),
    max_depth=hp.quniform('max_depth', 3, 100, 1),
    num_leaves=hp.quniform('num_leaves', 20, 100, 10),
    min_child_samples=hp.quniform('min_child_samples', 10, 300, 10),
    colsample_bytree=hp.uniform('colsample_bytree', 0.3, 1.0),
    subsample=hp.uniform('subsample', 0.3, 1.0),
    min_split_gain=hp.uniform('min_split_gain', 0, 0.7),
    scale_pos_weight=hp.uniform('scale_pos_weight', 1, 10),
    reg_alpha=hp.uniform('reg_alpha', 0, 100),
    reg_lambda=hp.uniform('reg_lambda', 0, 100),
    # Sampled on a log scale between 0.01 and 0.5.
    learning_rate=hp.loguniform('learning_rate', np.log(0.01), np.log(0.5)),
)

# 30 TPE trials for Y_07; no rstate is passed, so the search is not seeded.
best_y07 = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=30)

In [None]:
# Restrict both feature frames to the column selection c7 (defined outside this
# cell) and pick the Y_08 target column.
train_x_for_y08 = train_x[c7]
test_x_for_y08 = test_x[c7]
train_y_08 = train_y['Y_08']

In [None]:
def objective(params):
    """
    @Description: hyperopt objective, scores one LightGBM configuration on the Y_08 training data
    @Param: params, dict of sampled hyperparameters keyed as in the search space
    @Return: mean RMSE over 10 cross-validation folds (lower is better)
    """
    int_keys = ('n_estimators', 'max_depth', 'num_leaves', 'min_child_samples')
    float_keys = ('colsample_bytree', 'subsample', 'min_split_gain', 'scale_pos_weight',
                  'reg_alpha', 'reg_lambda', 'learning_rate')

    # Cast to real numeric types: ints for the quantised draws, floats rounded to
    # 3 decimals for the rest, so the model never receives numbers as strings.
    params = {
        **{key: int(params[key]) for key in int_keys},
        **{key: round(float(params[key]), 3) for key in float_keys},
    }
    # NOTE(review): LightGBM documents scale_pos_weight for binary/multiclassova only and
    # applies subsample only when subsample_freq > 0 - confirm both are worth tuning here.

    model = LGBMRegressor(n_jobs=-1, random_state=42, **params)

    # make_scorer(..., greater_is_better=False) flips the sign of rmse, so the
    # mean fold score is negated to get a positive loss back.
    scorer = make_scorer(rmse, greater_is_better=False)
    loss = -cross_val_score(model, train_x_for_y08, train_y_08, cv=10, scoring=scorer).mean()
    print("RMSE Loss {:.3f} params {}".format(loss, params))
    return loss

In [None]:
# Hyperopt search space for the LightGBM regressor; every label matches its key.
space = dict(
    # quniform draws are quantised to the given step; objective() casts them to int.
    n_estimators=hp.quniform('n_estimators', 100, 1500, 50),
    max_depth=hp.quniform('max_depth', 3, 100, 1),
    num_leaves=hp.quniform('num_leaves', 20, 100, 10),
    min_child_samples=hp.quniform('min_child_samples', 10, 300, 10),
    colsample_bytree=hp.uniform('colsample_bytree', 0.3, 1.0),
    subsample=hp.uniform('subsample', 0.3, 1.0),
    min_split_gain=hp.uniform('min_split_gain', 0, 0.7),
    scale_pos_weight=hp.uniform('scale_pos_weight', 1, 10),
    reg_alpha=hp.uniform('reg_alpha', 0, 100),
    reg_lambda=hp.uniform('reg_lambda', 0, 100),
    # Sampled on a log scale between 0.01 and 0.5.
    learning_rate=hp.loguniform('learning_rate', np.log(0.01), np.log(0.5)),
)

# 30 TPE trials for Y_08; no rstate is passed, so the search is not seeded.
best_y08 = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=30)

In [None]:
# Restrict both feature frames to the column selection c8 (defined outside this
# cell) and pick the Y_09 target column.
train_x_for_y09 = train_x[c8]
test_x_for_y09 = test_x[c8]
train_y_09 = train_y['Y_09']

In [None]:
def objective(params):
    """
    @Description: hyperopt objective, scores one LightGBM configuration on the Y_09 training data
    @Param: params, dict of sampled hyperparameters keyed as in the search space
    @Return: mean RMSE over 10 cross-validation folds (lower is better)
    """
    int_keys = ('n_estimators', 'max_depth', 'num_leaves', 'min_child_samples')
    float_keys = ('colsample_bytree', 'subsample', 'min_split_gain', 'scale_pos_weight',
                  'reg_alpha', 'reg_lambda', 'learning_rate')

    # Cast to real numeric types: ints for the quantised draws, floats rounded to
    # 3 decimals for the rest, so the model never receives numbers as strings.
    params = {
        **{key: int(params[key]) for key in int_keys},
        **{key: round(float(params[key]), 3) for key in float_keys},
    }
    # NOTE(review): LightGBM documents scale_pos_weight for binary/multiclassova only and
    # applies subsample only when subsample_freq > 0 - confirm both are worth tuning here.

    model = LGBMRegressor(n_jobs=-1, random_state=42, **params)

    # make_scorer(..., greater_is_better=False) flips the sign of rmse, so the
    # mean fold score is negated to get a positive loss back.
    scorer = make_scorer(rmse, greater_is_better=False)
    loss = -cross_val_score(model, train_x_for_y09, train_y_09, cv=10, scoring=scorer).mean()
    print("RMSE Loss {:.3f} params {}".format(loss, params))
    return loss

In [None]:
# Hyperopt search space for the LightGBM regressor; every label matches its key.
space = dict(
    # quniform draws are quantised to the given step; objective() casts them to int.
    n_estimators=hp.quniform('n_estimators', 100, 1500, 50),
    max_depth=hp.quniform('max_depth', 3, 100, 1),
    num_leaves=hp.quniform('num_leaves', 20, 100, 10),
    min_child_samples=hp.quniform('min_child_samples', 10, 300, 10),
    colsample_bytree=hp.uniform('colsample_bytree', 0.3, 1.0),
    subsample=hp.uniform('subsample', 0.3, 1.0),
    min_split_gain=hp.uniform('min_split_gain', 0, 0.7),
    scale_pos_weight=hp.uniform('scale_pos_weight', 1, 10),
    reg_alpha=hp.uniform('reg_alpha', 0, 100),
    reg_lambda=hp.uniform('reg_lambda', 0, 100),
    # Sampled on a log scale between 0.01 and 0.5.
    learning_rate=hp.loguniform('learning_rate', np.log(0.01), np.log(0.5)),
)

# 300 TPE trials for Y_09 (the neighbouring targets use 30 or 100); no rstate is
# passed, so the search is not seeded.
best_y09 = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=300)

RMSE Loss 0.632 params {'n_estimators': 1000, 'max_depth': 84, 'num_leaves': 50, 'min_child_samples': 200, 'colsample_bytree': '0.749', 'subsample': '0.820', 'min_split_gain': '0.644', 'scale_pos_weight': '9.749', 'reg_alpha': '22.972', 'reg_lambda': '6.934', 'learning_rate': '0.024'}
RMSE Loss 0.633 params {'n_estimators': 750, 'max_depth': 92, 'num_leaves': 90, 'min_child_samples': 240, 'colsample_bytree': '0.413', 'subsample': '0.764', 'min_split_gain': '0.647', 'scale_pos_weight': '8.257', 'reg_alpha': '56.544', 'reg_lambda': '1.552', 'learning_rate': '0.110'}
RMSE Loss 0.640 params {'n_estimators': 1400, 'max_depth': 78, 'num_leaves': 50, 'min_child_samples': 80, 'colsample_bytree': '0.564', 'subsample': '0.488', 'min_split_gain': '0.314', 'scale_pos_weight': '8.521', 'reg_alpha': '0.365', 'reg_lambda': '42.150', 'learning_rate': '0.440'}
RMSE Loss 0.634 params {'n_estimators': 1450, 'max_depth': 8, 'num_leaves': 50, 'min_child_samples': 260, 'colsample_bytree': '0.626', 'subsampl

KeyboardInterrupt: ignored

In [None]:
# Restrict both feature frames to the column selection c9 (defined outside this
# cell) and pick the Y_10 target column.
train_x_for_y10 = train_x[c9]
test_x_for_y10 = test_x[c9]
train_y_10 = train_y['Y_10']

In [None]:
def objective(params):
    """
    @Description: hyperopt objective, scores one LightGBM configuration on the Y_10 training data
    @Param: params, dict of sampled hyperparameters keyed as in the search space
    @Return: mean RMSE over 10 cross-validation folds (lower is better)
    """
    int_keys = ('n_estimators', 'max_depth', 'num_leaves', 'min_child_samples')
    float_keys = ('colsample_bytree', 'subsample', 'min_split_gain', 'scale_pos_weight',
                  'reg_alpha', 'reg_lambda', 'learning_rate')

    # Cast to real numeric types: ints for the quantised draws, floats rounded to
    # 3 decimals for the rest, so the model never receives numbers as strings.
    params = {
        **{key: int(params[key]) for key in int_keys},
        **{key: round(float(params[key]), 3) for key in float_keys},
    }
    # NOTE(review): LightGBM documents scale_pos_weight for binary/multiclassova only and
    # applies subsample only when subsample_freq > 0 - confirm both are worth tuning here.

    model = LGBMRegressor(n_jobs=-1, random_state=42, **params)

    # make_scorer(..., greater_is_better=False) flips the sign of rmse, so the
    # mean fold score is negated to get a positive loss back.
    scorer = make_scorer(rmse, greater_is_better=False)
    loss = -cross_val_score(model, train_x_for_y10, train_y_10, cv=10, scoring=scorer).mean()
    print("RMSE Loss {:.3f} params {}".format(loss, params))
    return loss

In [None]:
# Hyperopt search space for the LightGBM regressor; every label matches its key.
space = dict(
    # quniform draws are quantised to the given step; objective() casts them to int.
    n_estimators=hp.quniform('n_estimators', 100, 1500, 50),
    max_depth=hp.quniform('max_depth', 3, 100, 1),
    num_leaves=hp.quniform('num_leaves', 20, 100, 10),
    min_child_samples=hp.quniform('min_child_samples', 10, 300, 10),
    colsample_bytree=hp.uniform('colsample_bytree', 0.3, 1.0),
    subsample=hp.uniform('subsample', 0.3, 1.0),
    min_split_gain=hp.uniform('min_split_gain', 0, 0.7),
    scale_pos_weight=hp.uniform('scale_pos_weight', 1, 10),
    reg_alpha=hp.uniform('reg_alpha', 0, 100),
    reg_lambda=hp.uniform('reg_lambda', 0, 100),
    # Sampled on a log scale between 0.01 and 0.5.
    learning_rate=hp.loguniform('learning_rate', np.log(0.01), np.log(0.5)),
)

# 100 TPE trials for Y_10; no rstate is passed, so the search is not seeded.
best_y10 = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=100)

RMSE Loss 0.868 params {'n_estimators': 1150, 'max_depth': 30, 'num_leaves': 90, 'min_child_samples': 70, 'colsample_bytree': '0.351', 'subsample': '0.906', 'min_split_gain': '0.410', 'scale_pos_weight': '3.181', 'reg_alpha': '48.766', 'reg_lambda': '73.623', 'learning_rate': '0.014'}
RMSE Loss 0.858 params {'n_estimators': 300, 'max_depth': 59, 'num_leaves': 60, 'min_child_samples': 70, 'colsample_bytree': '0.700', 'subsample': '0.347', 'min_split_gain': '0.607', 'scale_pos_weight': '6.645', 'reg_alpha': '9.376', 'reg_lambda': '22.040', 'learning_rate': '0.039'}
RMSE Loss 0.868 params {'n_estimators': 600, 'max_depth': 22, 'num_leaves': 90, 'min_child_samples': 60, 'colsample_bytree': '0.779', 'subsample': '0.351', 'min_split_gain': '0.131', 'scale_pos_weight': '2.901', 'reg_alpha': '75.942', 'reg_lambda': '37.190', 'learning_rate': '0.231'}
RMSE Loss 0.864 params {'n_estimators': 1050, 'max_depth': 96, 'num_leaves': 50, 'min_child_samples': 190, 'colsample_bytree': '0.629', 'subsampl

In [None]:
# Restrict both feature frames to the column selection c10 (defined outside this
# cell) and pick the Y_11 target column.
train_x_for_y11 = train_x[c10]
test_x_for_y11 = test_x[c10]
train_y_11 = train_y['Y_11']

In [None]:
def objective(params):
    """
    @Description: hyperopt objective, scores one LightGBM configuration on the Y_11 training data
    @Param: params, dict of sampled hyperparameters keyed as in the search space
    @Return: mean RMSE over 10 cross-validation folds (lower is better)
    """
    int_keys = ('n_estimators', 'max_depth', 'num_leaves', 'min_child_samples')
    float_keys = ('colsample_bytree', 'subsample', 'min_split_gain', 'scale_pos_weight',
                  'reg_alpha', 'reg_lambda', 'learning_rate')

    # Cast to real numeric types: ints for the quantised draws, floats rounded to
    # 3 decimals for the rest, so the model never receives numbers as strings.
    params = {
        **{key: int(params[key]) for key in int_keys},
        **{key: round(float(params[key]), 3) for key in float_keys},
    }
    # NOTE(review): LightGBM documents scale_pos_weight for binary/multiclassova only and
    # applies subsample only when subsample_freq > 0 - confirm both are worth tuning here.

    model = LGBMRegressor(n_jobs=-1, random_state=42, **params)

    # make_scorer(..., greater_is_better=False) flips the sign of rmse, so the
    # mean fold score is negated to get a positive loss back.
    scorer = make_scorer(rmse, greater_is_better=False)
    loss = -cross_val_score(model, train_x_for_y11, train_y_11, cv=10, scoring=scorer).mean()
    print("RMSE Loss {:.3f} params {}".format(loss, params))
    return loss

In [None]:
# Hyperopt search space for the LightGBM regressor; every label matches its key.
space = dict(
    # quniform draws are quantised to the given step; objective() casts them to int.
    n_estimators=hp.quniform('n_estimators', 100, 1500, 50),
    max_depth=hp.quniform('max_depth', 3, 100, 1),
    num_leaves=hp.quniform('num_leaves', 20, 100, 10),
    min_child_samples=hp.quniform('min_child_samples', 10, 300, 10),
    colsample_bytree=hp.uniform('colsample_bytree', 0.3, 1.0),
    subsample=hp.uniform('subsample', 0.3, 1.0),
    min_split_gain=hp.uniform('min_split_gain', 0, 0.7),
    scale_pos_weight=hp.uniform('scale_pos_weight', 1, 10),
    reg_alpha=hp.uniform('reg_alpha', 0, 100),
    reg_lambda=hp.uniform('reg_lambda', 0, 100),
    # Sampled on a log scale between 0.01 and 0.5.
    learning_rate=hp.loguniform('learning_rate', np.log(0.01), np.log(0.5)),
)

# 100 TPE trials for Y_11; no rstate is passed, so the search is not seeded.
best_y11 = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=100)

RMSE Loss 0.820 params {'n_estimators': 950, 'max_depth': 53, 'num_leaves': 90, 'min_child_samples': 120, 'colsample_bytree': '0.855', 'subsample': '0.948', 'min_split_gain': '0.321', 'scale_pos_weight': '6.265', 'reg_alpha': '3.251', 'reg_lambda': '51.747', 'learning_rate': '0.015'}
RMSE Loss 0.820 params {'n_estimators': 1250, 'max_depth': 51, 'num_leaves': 30, 'min_child_samples': 50, 'colsample_bytree': '0.836', 'subsample': '0.827', 'min_split_gain': '0.453', 'scale_pos_weight': '2.097', 'reg_alpha': '36.130', 'reg_lambda': '99.908', 'learning_rate': '0.089'}
RMSE Loss 0.821 params {'n_estimators': 300, 'max_depth': 92, 'num_leaves': 60, 'min_child_samples': 100, 'colsample_bytree': '0.892', 'subsample': '0.793', 'min_split_gain': '0.364', 'scale_pos_weight': '6.905', 'reg_alpha': '45.037', 'reg_lambda': '13.959', 'learning_rate': '0.132'}
RMSE Loss 0.821 params {'n_estimators': 250, 'max_depth': 95, 'num_leaves': 80, 'min_child_samples': 180, 'colsample_bytree': '0.627', 'subsamp

In [None]:
# Restrict both feature frames to the column selection c11 (defined outside this
# cell) and pick the Y_12 target column.
train_x_for_y12 = train_x[c11]
test_x_for_y12 = test_x[c11]
train_y_12 = train_y['Y_12']

In [None]:
def objective(params):
    """
    @Description: hyperopt objective, scores one LightGBM configuration on the Y_12 training data
    @Param: params, dict of sampled hyperparameters keyed as in the search space
    @Return: mean RMSE over 10 cross-validation folds (lower is better)
    """
    int_keys = ('n_estimators', 'max_depth', 'num_leaves', 'min_child_samples')
    float_keys = ('colsample_bytree', 'subsample', 'min_split_gain', 'scale_pos_weight',
                  'reg_alpha', 'reg_lambda', 'learning_rate')

    # Cast to real numeric types: ints for the quantised draws, floats rounded to
    # 3 decimals for the rest, so the model never receives numbers as strings.
    params = {
        **{key: int(params[key]) for key in int_keys},
        **{key: round(float(params[key]), 3) for key in float_keys},
    }
    # NOTE(review): LightGBM documents scale_pos_weight for binary/multiclassova only and
    # applies subsample only when subsample_freq > 0 - confirm both are worth tuning here.

    model = LGBMRegressor(n_jobs=-1, random_state=42, **params)

    # make_scorer(..., greater_is_better=False) flips the sign of rmse, so the
    # mean fold score is negated to get a positive loss back.
    scorer = make_scorer(rmse, greater_is_better=False)
    loss = -cross_val_score(model, train_x_for_y12, train_y_12, cv=10, scoring=scorer).mean()
    print("RMSE Loss {:.3f} params {}".format(loss, params))
    return loss

In [None]:
# Hyperopt search space for the LightGBM regressor; every label matches its key.
space = dict(
    # quniform draws are quantised to the given step; objective() casts them to int.
    n_estimators=hp.quniform('n_estimators', 100, 1500, 50),
    max_depth=hp.quniform('max_depth', 3, 100, 1),
    num_leaves=hp.quniform('num_leaves', 20, 100, 10),
    min_child_samples=hp.quniform('min_child_samples', 10, 300, 10),
    colsample_bytree=hp.uniform('colsample_bytree', 0.3, 1.0),
    subsample=hp.uniform('subsample', 0.3, 1.0),
    min_split_gain=hp.uniform('min_split_gain', 0, 0.7),
    scale_pos_weight=hp.uniform('scale_pos_weight', 1, 10),
    reg_alpha=hp.uniform('reg_alpha', 0, 100),
    reg_lambda=hp.uniform('reg_lambda', 0, 100),
    # Sampled on a log scale between 0.01 and 0.5.
    learning_rate=hp.loguniform('learning_rate', np.log(0.01), np.log(0.5)),
)

# 100 TPE trials for Y_12; no rstate is passed, so the search is not seeded.
best_y12 = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=100)

RMSE Loss 0.636 params {'n_estimators': 300, 'max_depth': 35, 'num_leaves': 90, 'min_child_samples': 260, 'colsample_bytree': '0.656', 'subsample': '0.455', 'min_split_gain': '0.218', 'scale_pos_weight': '1.522', 'reg_alpha': '86.043', 'reg_lambda': '66.811', 'learning_rate': '0.075'}
RMSE Loss 0.636 params {'n_estimators': 900, 'max_depth': 41, 'num_leaves': 60, 'min_child_samples': 130, 'colsample_bytree': '0.905', 'subsample': '0.401', 'min_split_gain': '0.116', 'scale_pos_weight': '1.499', 'reg_alpha': '94.508', 'reg_lambda': '43.977', 'learning_rate': '0.224'}
RMSE Loss 0.635 params {'n_estimators': 1000, 'max_depth': 68, 'num_leaves': 40, 'min_child_samples': 210, 'colsample_bytree': '0.387', 'subsample': '0.581', 'min_split_gain': '0.043', 'scale_pos_weight': '1.947', 'reg_alpha': '69.458', 'reg_lambda': '7.956', 'learning_rate': '0.012'}
RMSE Loss 0.636 params {'n_estimators': 550, 'max_depth': 15, 'num_leaves': 80, 'min_child_samples': 90, 'colsample_bytree': '0.534', 'subsamp

In [None]:
# Restrict both feature frames to the column selection c12 (defined outside this
# cell) and pick the Y_13 target column.
train_x_for_y13 = train_x[c12]
test_x_for_y13 = test_x[c12]
train_y_13 = train_y['Y_13']

In [None]:
def objective(params):
    """
    @Description: hyperopt objective, scores one LightGBM configuration on the Y_13 training data
    @Param: params, dict of sampled hyperparameters keyed as in the search space
    @Return: mean RMSE over 10 cross-validation folds (lower is better)
    """
    int_keys = ('n_estimators', 'max_depth', 'num_leaves', 'min_child_samples')
    float_keys = ('colsample_bytree', 'subsample', 'min_split_gain', 'scale_pos_weight',
                  'reg_alpha', 'reg_lambda', 'learning_rate')

    # Cast to real numeric types: ints for the quantised draws, floats rounded to
    # 3 decimals for the rest, so the model never receives numbers as strings.
    params = {
        **{key: int(params[key]) for key in int_keys},
        **{key: round(float(params[key]), 3) for key in float_keys},
    }
    # NOTE(review): LightGBM documents scale_pos_weight for binary/multiclassova only and
    # applies subsample only when subsample_freq > 0 - confirm both are worth tuning here.

    model = LGBMRegressor(n_jobs=-1, random_state=42, **params)

    # make_scorer(..., greater_is_better=False) flips the sign of rmse, so the
    # mean fold score is negated to get a positive loss back.
    scorer = make_scorer(rmse, greater_is_better=False)
    loss = -cross_val_score(model, train_x_for_y13, train_y_13, cv=10, scoring=scorer).mean()
    print("RMSE Loss {:.3f} params {}".format(loss, params))
    return loss

In [None]:
# Hyperopt search space for the LightGBM regressor; every label matches its key.
space = dict(
    # quniform draws are quantised to the given step; objective() casts them to int.
    n_estimators=hp.quniform('n_estimators', 100, 1500, 50),
    max_depth=hp.quniform('max_depth', 3, 100, 1),
    num_leaves=hp.quniform('num_leaves', 20, 100, 10),
    min_child_samples=hp.quniform('min_child_samples', 10, 300, 10),
    colsample_bytree=hp.uniform('colsample_bytree', 0.3, 1.0),
    subsample=hp.uniform('subsample', 0.3, 1.0),
    min_split_gain=hp.uniform('min_split_gain', 0, 0.7),
    scale_pos_weight=hp.uniform('scale_pos_weight', 1, 10),
    reg_alpha=hp.uniform('reg_alpha', 0, 100),
    reg_lambda=hp.uniform('reg_lambda', 0, 100),
    # Sampled on a log scale between 0.01 and 0.5.
    learning_rate=hp.loguniform('learning_rate', np.log(0.01), np.log(0.5)),
)

# 100 TPE trials for Y_13; no rstate is passed, so the search is not seeded.
best_y13 = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=100)

RMSE Loss 0.635 params {'n_estimators': 1050, 'max_depth': 68, 'num_leaves': 20, 'min_child_samples': 260, 'colsample_bytree': '0.405', 'subsample': '0.916', 'min_split_gain': '0.485', 'scale_pos_weight': '8.841', 'reg_alpha': '86.211', 'reg_lambda': '30.149', 'learning_rate': '0.011'}
RMSE Loss 0.633 params {'n_estimators': 750, 'max_depth': 17, 'num_leaves': 20, 'min_child_samples': 170, 'colsample_bytree': '0.828', 'subsample': '0.744', 'min_split_gain': '0.178', 'scale_pos_weight': '6.869', 'reg_alpha': '35.879', 'reg_lambda': '88.069', 'learning_rate': '0.043'}
RMSE Loss 0.634 params {'n_estimators': 550, 'max_depth': 74, 'num_leaves': 50, 'min_child_samples': 140, 'colsample_bytree': '0.610', 'subsample': '0.329', 'min_split_gain': '0.325', 'scale_pos_weight': '8.941', 'reg_alpha': '64.732', 'reg_lambda': '77.875', 'learning_rate': '0.028'}
RMSE Loss 0.636 params {'n_estimators': 800, 'max_depth': 57, 'num_leaves': 30, 'min_child_samples': 180, 'colsample_bytree': '0.397', 'subsa

In [None]:
# Restrict both feature frames to the column selection c13 (defined outside this
# cell) and pick the Y_14 target column.
train_x_for_y14 = train_x[c13]
test_x_for_y14 = test_x[c13]
train_y_14 = train_y['Y_14']

In [None]:
def objective(params):
    """
    @Description: hyperopt objective, scores one LightGBM configuration on the Y_14 training data
    @Param: params, dict of sampled hyperparameters keyed as in the search space
    @Return: mean RMSE over 10 cross-validation folds (lower is better)
    """
    int_keys = ('n_estimators', 'max_depth', 'num_leaves', 'min_child_samples')
    float_keys = ('colsample_bytree', 'subsample', 'min_split_gain', 'scale_pos_weight',
                  'reg_alpha', 'reg_lambda', 'learning_rate')

    # Cast to real numeric types: ints for the quantised draws, floats rounded to
    # 3 decimals for the rest, so the model never receives numbers as strings.
    params = {
        **{key: int(params[key]) for key in int_keys},
        **{key: round(float(params[key]), 3) for key in float_keys},
    }
    # NOTE(review): LightGBM documents scale_pos_weight for binary/multiclassova only and
    # applies subsample only when subsample_freq > 0 - confirm both are worth tuning here.

    model = LGBMRegressor(n_jobs=-1, random_state=42, **params)

    # make_scorer(..., greater_is_better=False) flips the sign of rmse, so the
    # mean fold score is negated to get a positive loss back.
    scorer = make_scorer(rmse, greater_is_better=False)
    loss = -cross_val_score(model, train_x_for_y14, train_y_14, cv=10, scoring=scorer).mean()
    print("RMSE Loss {:.3f} params {}".format(loss, params))
    return loss

In [None]:
# Hyperopt search space for the LightGBM regressor; every label matches its key.
space = dict(
    # quniform draws are quantised to the given step; objective() casts them to int.
    n_estimators=hp.quniform('n_estimators', 100, 1500, 50),
    max_depth=hp.quniform('max_depth', 3, 100, 1),
    num_leaves=hp.quniform('num_leaves', 20, 100, 10),
    min_child_samples=hp.quniform('min_child_samples', 10, 300, 10),
    colsample_bytree=hp.uniform('colsample_bytree', 0.3, 1.0),
    subsample=hp.uniform('subsample', 0.3, 1.0),
    min_split_gain=hp.uniform('min_split_gain', 0, 0.7),
    scale_pos_weight=hp.uniform('scale_pos_weight', 1, 10),
    reg_alpha=hp.uniform('reg_alpha', 0, 100),
    reg_lambda=hp.uniform('reg_lambda', 0, 100),
    # Sampled on a log scale between 0.01 and 0.5.
    learning_rate=hp.loguniform('learning_rate', np.log(0.01), np.log(0.5)),
)

# 100 TPE trials for Y_14; no rstate is passed, so the search is not seeded.
best_y14 = fmin(fn=objective, space=space, algo=tpe.suggest, max_evals=100)

RMSE Loss 0.636 params {'n_estimators': 550, 'max_depth': 32, 'num_leaves': 100, 'min_child_samples': 100, 'colsample_bytree': '0.718', 'subsample': '0.717', 'min_split_gain': '0.665', 'scale_pos_weight': '7.079', 'reg_alpha': '67.295', 'reg_lambda': '27.512', 'learning_rate': '0.249'}
RMSE Loss 0.636 params {'n_estimators': 1500, 'max_depth': 89, 'num_leaves': 60, 'min_child_samples': 70, 'colsample_bytree': '0.844', 'subsample': '0.437', 'min_split_gain': '0.383', 'scale_pos_weight': '4.335', 'reg_alpha': '90.568', 'reg_lambda': '94.046', 'learning_rate': '0.032'}
RMSE Loss 0.636 params {'n_estimators': 1050, 'max_depth': 27, 'num_leaves': 80, 'min_child_samples': 80, 'colsample_bytree': '0.957', 'subsample': '0.780', 'min_split_gain': '0.135', 'scale_pos_weight': '5.955', 'reg_alpha': '84.138', 'reg_lambda': '84.736', 'learning_rate': '0.089'}
RMSE Loss 0.634 params {'n_estimators': 450, 'max_depth': 15, 'num_leaves': 80, 'min_child_samples': 250, 'colsample_bytree': '0.369', 'subsa

In [None]:
# Total of six hard-coded scores, added left to right.
# NOTE(review): presumably the best CV RMSE values for Y_09..Y_14 - TODO confirm.
a = (
    0.6304426511675747
    + 0.852249093889939
    + 0.8190750074839114
    + 0.6330751007984236
    + 0.6315881554183201
    + 0.6325996147006622
)
print(a)

4.199029623458831


### Save Prediction

In [None]:
# Fill the sample submission with the predictions and write it back to disk.
submit = pd.read_csv('./sample_submission.csv')
for idx, col in enumerate(submit.columns):
    # Every non-ID column takes the preds column one position to its left;
    # presumably 'ID' is the first column of the sample file - TODO confirm.
    if col != 'ID':
        submit[col] = preds[:, idx - 1]
submit.to_csv('./submission_4.csv', index=False)