### Import Libraries

In [2]:
import pandas as pd
import random
import os
import numpy as np
from functools import partial
from lightgbm import LGBMRegressor
from hyperopt import fmin, hp, tpe, Trials, STATUS_OK
from sklearn.model_selection import cross_val_score
from sklearn import metrics
from sklearn.metrics import mean_squared_error
from sklearn.metrics import make_scorer
from sklearn.multioutput import MultiOutputRegressor
from sklearn.model_selection import KFold

import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler

In [3]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
seed_everything(42) # Seed 고정

In [4]:
def dataset_split_X_y(df):    
    """
    @Description: split data into features and labels
    @Param: df, pandas dataframe with columns starting with X for features and Y for labels
    @Return: features and labels in pandas dataframes
    """
    xs = df.filter(regex='X') # Input : X Feature
    ys = df.filter(regex='Y') # Output : Y Feature
    return xs, ys

In [5]:
def zero_variance(df):
    """
    @Description: check for zero_variance
    @Param1: df, pandas dataframe
    @Return: names of the columns with zero variance
    """
    result = []
    for col in df.columns:
        if df[col].var() == 0:
            result.append(col)
    return result

In [6]:
def get_top_correlation(df, n=10):
    """
    @Description: print out top correlated features
    @Param1: df, pandas dataframe
    @Param2: n, number of lines to print 
    @Return: pandas series
    """
    pairs = set()
    for idx1 in range(0, df.shape[1]):
        for idx2 in range(0, idx1+1):
            pairs.add((df.columns[idx1], df.columns[idx2]))
    corr = df.corr().abs().unstack()
    corr = corr.drop(labels=pairs).sort_values(ascending=False)
    return corr[0:n]

In [7]:
def lg_nrmse(gt, preds):
    """
    @Description: Metric used in this project
    @Params1: gt, pandas dataframe
    @Param2: preds, pandas dataframe
    @Return: nrmse score
    """
    # 각 Y Feature별 NRMSE 총합
    # Y_01 ~ Y_08 까지 20% 가중치 부여
    preds = pd.DataFrame(preds)
    all_nrmse = []
    for idx in range(0,14):
        rmse = mean_squared_error(gt.iloc[:,idx], preds.iloc[:,idx], squared=False)
        nrmse = rmse/np.mean(np.abs(gt.iloc[:,idx]))
        all_nrmse.append(nrmse)
    score = 1.2 * np.sum(all_nrmse[:8]) + 1.0 * np.sum(all_nrmse[8:15])
    return score

In [236]:
train_df = pd.read_csv('data/train.csv')
test_x = pd.read_csv('data/test.csv')
y_feature_spec_info = pd.read_csv('data/meta/y_feature_spec_info.csv')

train_x, train_y = dataset_split_X_y(train_df)

cols_with_zero_variance = zero_variance(train_x) # 분산이 0 (통과 여부)
train_x = train_x.drop(cols_with_zero_variance, axis = 1)
test_x = test_x.drop(cols_with_zero_variance, axis = 1)

train_x = train_x.drop(['X_10', 'X_11'], axis = 1) # 결측치가 많음 (결측치 = 0, 공지사항)
test_x = test_x.drop(['X_10', 'X_11'], axis = 1)

test_x = test_x.drop('ID', axis=1)

In [237]:
ys = ['Y_01', 'Y_02', 'Y_03', 'Y_04', 'Y_05', 
      'Y_06', 'Y_07', 'Y_08', 'Y_09', 'Y_10', 
      'Y_11', 'Y_12', 'Y_13', 'Y_14']
ys_bounds = [[0.2, 2], [0.2, 2.1], [0.2, 2.1], 
             [7, 19], [22, 36.5], [-19.2, 19], 
             [2.4, 4], [-29.2, -24], [-29.2, -24],
             [-30.6, -20], [19.6, 26.6], [-29.2, -24],
             [-29.2, -24], [-29.2, -24]]

In [238]:
df_indicator = pd.DataFrame()
for idx in range(len(ys)):
    y_series = ~train_y[ys[idx]].between(ys_bounds[idx][0], ys_bounds[idx][1])
    df_indicator = pd.concat([df_indicator, y_series.astype(int)], axis = 1)

In [239]:
lst = []
for i in df_indicator.columns:
    lst.append(df_indicator[df_indicator[i] == 1].index)
ans=set()
for i in lst:
    for k in i:
        ans.add(k)
print(len(ans))

ans = list(ans)
ans.sort()
train_data_spec = train_df.loc[ans, :]

3917


In [240]:
train_data_norm = train_df.drop(train_data.index)

In [241]:
len(train_data_norm)

train_x_norm, train_y_norm = dataset_split_X_y(train_data_norm)

In [242]:
len(train_data_norm)

train_x_spec, train_y_spec = dataset_split_X_y(train_data_spec)

In [243]:
msk = np.random.rand(len(train_x_norm)) < 0.8

tv_train_x_norm = train_x_norm[msk]
tv_valid_x_norm = train_x_norm[~msk]
tv_train_y_norm = train_y_norm[msk]
tv_valid_y_norm = train_y_norm[~msk]

tv_train_x_norm = tv_train_x_norm.reset_index()
tv_valid_x_norm = tv_valid_x_norm.reset_index()
tv_train_y_norm = tv_train_y_norm.reset_index()
tv_valid_y_norm = tv_valid_y_norm.reset_index()

In [244]:
msk = np.random.rand(len(train_x_spec)) < 0.8

tv_train_x_spec = train_x_spec[msk]
tv_valid_x_spec = train_x_spec[~msk]
tv_train_y_spec = train_y_spec[msk]
tv_valid_y_spec = train_y_spec[~msk]

tv_train_x_spec = tv_train_x_spec.reset_index()
tv_valid_x_spec = tv_valid_x_spec.reset_index()
tv_train_y_spec = tv_train_y_spec.reset_index()
tv_valid_y_spec = tv_valid_y_spec.reset_index()

In [245]:
print(len(tv_train_x_norm))
print(len(tv_train_x_spec))

28664
3156


In [246]:
tv_train_x = pd.concat([tv_train_x_norm, tv_train_x_spec], axis=0)
tv_valid_x = pd.concat([tv_valid_x_norm, tv_valid_x_spec], axis=0)
tv_train_y = pd.concat([tv_train_y_norm, tv_train_y_spec], axis=0)
tv_valid_y = pd.concat([tv_valid_y_norm, tv_valid_y_spec], axis=0)


In [247]:
tv_train_x.reset_index(inplace = True)
tv_valid_x.reset_index(inplace = True)
tv_train_y.reset_index(inplace = True)
tv_valid_y.reset_index(inplace = True)

In [248]:
train_x

Unnamed: 0,X_01,X_02,X_03,X_05,X_06,X_07,X_08,X_09,X_12,X_13,...,X_45,X_46,X_49,X_50,X_51,X_52,X_53,X_54,X_55,X_56
0,70.544,103.320,67.47,101.892,74.983,29.45,62.38,245.71,4.34,0.18,...,0.29,1463,9706.03,137.043591,135.359219,147.837968,134.313475,125.605427,136.721425,125.028256
1,69.524,103.321,65.17,101.944,72.943,28.73,61.23,233.61,4.38,0.18,...,0.13,1463,10423.43,133.736691,135.979817,149.924692,123.630583,127.893337,143.322659,124.877308
2,72.583,103.320,64.07,103.153,72.943,28.81,105.77,272.20,4.36,0.15,...,0.14,1468,10948.53,132.805112,131.055355,146.814592,128.939070,127.012195,140.395688,122.238232
3,71.563,103.320,67.57,101.971,77.022,28.92,115.21,255.36,4.33,0.21,...,0.22,1469,15007.03,134.138760,133.239422,139.720132,132.260824,130.723186,147.624829,134.875225
4,69.524,103.320,63.57,101.981,70.904,29.68,103.38,241.46,4.35,0.16,...,0.22,1469,11051.03,142.728970,136.620022,134.853555,134.760252,125.647793,139.331105,123.272762
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39602,66.465,103.320,62.27,103.150,66.825,30.20,77.83,298.05,4.36,0.15,...,0.11,1469,60630.73,129.965741,130.807148,133.481737,125.273130,121.780933,133.780110,129.029812
39603,66.465,103.321,62.77,102.021,66.825,29.21,102.25,270.67,4.40,0.13,...,0.12,1458,60763.43,127.633885,120.158764,142.667802,122.465490,122.987209,143.090741,122.811413
39604,68.504,103.320,64.67,103.144,68.864,29.96,102.61,198.07,4.38,0.14,...,0.13,1459,8813.33,132.501286,136.893025,134.419328,129.115431,130.920147,140.489232,119.166699
39605,66.465,103.320,63.67,102.025,67.845,30.30,112.60,275.52,4.33,0.16,...,0.11,1469,62222.33,128.189679,121.495930,141.288011,130.141676,125.518825,136.603634,124.525929


In [249]:
tv_train_x = tv_train_x.iloc[:, 2:]
tv_train_y = tv_train_y.iloc[:, 2:]
tv_valid_y = tv_valid_y.iloc[:, 2:]
tv_valid_x = tv_valid_x.iloc[:, 2:]

In [250]:
tv_train_x.columns

Index(['X_01', 'X_02', 'X_03', 'X_04', 'X_05', 'X_06', 'X_07', 'X_08', 'X_09',
       'X_10', 'X_11', 'X_12', 'X_13', 'X_14', 'X_15', 'X_16', 'X_17', 'X_18',
       'X_19', 'X_20', 'X_21', 'X_22', 'X_23', 'X_24', 'X_25', 'X_26', 'X_27',
       'X_28', 'X_29', 'X_30', 'X_31', 'X_32', 'X_33', 'X_34', 'X_35', 'X_36',
       'X_37', 'X_38', 'X_39', 'X_40', 'X_41', 'X_42', 'X_43', 'X_44', 'X_45',
       'X_46', 'X_47', 'X_48', 'X_49', 'X_50', 'X_51', 'X_52', 'X_53', 'X_54',
       'X_55', 'X_56'],
      dtype='object')

In [251]:
#cols_with_zero_variance = zero_variance(tv_train_x) # 분산이 0 (통과 여부)
#tv_train_x = tv_train_x.drop(cols_with_zero_variance, axis = 1)
#tv_valid_x = tv_train_x.drop(cols_with_zero_variance, axis = 1)

tv_train_x = tv_train_x.drop(['X_10', 'X_11'], axis = 1) # 결측치가 많음 (결측치 = 0, 공지사항)
tv_valid_x = tv_valid_x.drop(['X_10', 'X_11'], axis = 1)

#tv_valid_x = tv_valid_x.drop('ID', axis=1)

In [252]:
print(len(tv_valid_x) + len(tv_train_x))
print(len(tv_train_y) + len(tv_valid_y))

39607
39607


In [212]:
def objective(params):
    params = {
        'n_estimators': int(params['n_estimators']),
        'max_depth': int(params['max_depth']),
        'num_leaves': int(params['num_leaves']),
        'min_child_samples': int(params['min_child_samples']),
        'colsample_bytree': '{:.3f}'.format(params['colsample_bytree']),
        'subsample': '{:.3f}'.format(params['subsample']),
        'min_split_gain': '{:.3f}'.format(params['min_split_gain']),
        'scale_pos_weight': '{:.3f}'.format(params['scale_pos_weight']),
        'reg_alpha': '{:.3f}'.format(params['reg_alpha']),
        'reg_lambda': '{:.3f}'.format(params['reg_lambda']),
        'learning_rate': '{:.3f}'.format(params['learning_rate']),
        
    }
    
    model = MultiOutputRegressor(LGBMRegressor(n_jobs = -1, random_state = 1, **params))
    
    loss = -cross_val_score(model, tv_train_x, tv_train_y, cv=10, scoring=make_scorer(lg_nrmse, greater_is_better=False)).mean()
    print("NRMSE Loss {:.5f} params {}".format(loss, params))
    return loss

In [213]:
space = {
    'n_estimators' : hp.quniform('n_estimators', 100, 1500, 50),
    'max_depth': hp.quniform('max_depth', 3, 100, 1),
    'num_leaves': hp.quniform('num_leaves', 20, 100, 10),
    'min_child_samples': hp.quniform('min_child_samples', 10, 300, 10),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.3, 1.0),
    'subsample': hp.uniform('subsample', 0.3, 1.0),
    'min_split_gain': hp.uniform('min_split_gain', 0, 0.7),
    'scale_pos_weight': hp.uniform('scale_pos_weight', 1, 10),
    'reg_alpha': hp.uniform('reg_alpha', 0, 100),
    'reg_lambda': hp.uniform('reg_lambda', 0, 100),
    'learning_rate': hp.loguniform('learning_rate', np.log(0.01), np.log(0.5)),
}

best = fmin(fn = objective,
            space = space,
            algo = tpe.suggest,
            max_evals = 200,
            rstate=np.random.default_rng(1))

NRMSE Loss 2.65674 params {'n_estimators': 1350, 'max_depth': 47, 'num_leaves': 20, 'min_child_samples': 240, 'colsample_bytree': '0.715', 'subsample': '0.657', 'min_split_gain': '0.655', 'scale_pos_weight': '4.117', 'reg_alpha': '22.649', 'reg_lambda': '55.509', 'learning_rate': '0.042'}
NRMSE Loss 3.07150 params {'n_estimators': 1500, 'max_depth': 13, 'num_leaves': 90, 'min_child_samples': 110, 'colsample_bytree': '0.538', 'subsample': '0.901', 'min_split_gain': '0.652', 'scale_pos_weight': '7.202', 'reg_alpha': '1.693', 'reg_lambda': '75.762', 'learning_rate': '0.159'}
NRMSE Loss 3.30260 params {'n_estimators': 250, 'max_depth': 67, 'num_leaves': 80, 'min_child_samples': 170, 'colsample_bytree': '0.387', 'subsample': '0.902', 'min_split_gain': '0.436', 'scale_pos_weight': '8.879', 'reg_alpha': '86.379', 'reg_lambda': '88.854', 'learning_rate': '0.039'}
NRMSE Loss 2.55534 params {'n_estimators': 600, 'max_depth': 27, 'num_leaves': 60, 'min_child_samples': 120, 'colsample_bytree': '0.

KeyboardInterrupt: 

In [277]:
best = {'colsample_bytree': 0.572280100273023, 'learning_rate': 0.010283635038627429, 'max_depth': 180, 'min_child_samples': 135, 'min_split_gain': 0.04511227284338413, 'n_estimators': 900, 'num_leaves': 70, 'reg_alpha': 4.406681827912319, 'reg_lambda': 20.4785600448913, 'scale_pos_weight': 8.302374117433086, 'subsample': 0.1688669888026464}
model = MultiOutputRegressor(LGBMRegressor(n_jobs = -1, random_state = 1, **best))
    
loss = -cross_val_score(model, tv_train_x, tv_train_y, cv=10, scoring=make_scorer(lg_nrmse, greater_is_better=False)).mean()
print("NRMSE Loss {:.5f} params {}".format(loss, best))

KeyboardInterrupt: 

In [253]:
print(len(tv_train_x.columns))
print(len(tv_valid_x.columns))

54
54


In [254]:
best = {'colsample_bytree': 0.572280100273023, 'learning_rate': 0.010283635038627429, 'max_depth': 180, 'min_child_samples': 135, 'min_split_gain': 0.04511227284338413, 'n_estimators': 900, 'num_leaves': 70, 'reg_alpha': 4.406681827912319, 'reg_lambda': 20.4785600448913, 'scale_pos_weight': 8.302374117433086, 'subsample': 0.1688669888026464}
model = MultiOutputRegressor(LGBMRegressor(n_jobs = -1, random_state = 1, **best))
model.fit(tv_train_x, tv_train_y)
preds = model.predict(tv_valid_x)

In [255]:
a = pd.DataFrame(preds)

In [256]:
a

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11,12,13
0,1.446507,1.137792,1.045768,15.227431,32.615332,16.906100,2.986165,-25.947657,-25.765378,-21.902065,24.735737,-25.784795,-25.740885,-25.757329
1,1.431742,1.120024,1.024030,15.029590,32.390215,17.209713,2.988233,-25.713384,-25.715275,-21.841914,24.962425,-25.646701,-25.655308,-25.678052
2,1.495924,1.186009,1.061395,15.356338,32.514368,17.159899,3.057053,-25.780801,-25.768229,-21.768285,24.933223,-25.678485,-25.705325,-25.711040
3,1.415409,1.065247,1.009585,14.930267,31.881809,16.961192,3.162955,-25.805136,-25.773855,-22.165272,24.812976,-25.742998,-25.743181,-25.733091
4,1.337675,1.047920,0.996762,14.976072,31.944545,16.610842,3.130619,-25.796795,-25.873355,-22.223934,24.511776,-25.771818,-25.752090,-25.767825
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7782,1.384294,0.985944,1.057341,14.019708,31.076711,16.661411,3.037990,-26.491520,-26.492135,-22.881096,24.335104,-26.388963,-26.385404,-26.398349
7783,1.268924,0.955923,0.967320,13.928643,31.355935,16.787182,3.146530,-26.378061,-26.412763,-22.671448,24.517360,-26.362269,-26.357743,-26.332813
7784,1.236473,0.938163,0.913796,13.440733,30.950467,16.624488,3.174044,-26.565908,-26.560427,-22.905764,24.302540,-26.471408,-26.476264,-26.477470
7785,1.377455,0.996994,1.022282,13.212770,30.823362,16.697535,3.158726,-26.419834,-26.473437,-22.692769,24.258464,-26.369343,-26.392119,-26.392700


In [258]:
submit = pd.read_csv('data/validation_test_submission.csv')
for idx, col in enumerate(submit.columns):
    if col=='ID':
        continue
    submit[col] = preds[:,idx-1]
#submit.to_csv('data/param_test.csv', index = False)

In [271]:
submit.head()

df_indicator = pd.DataFrame()

for i, k in enumerate(submit.columns):
    if k == 'ID':
        continue
    y_series = ~submit[k].between(y_feature_spec_info['Min'][i-1], y_feature_spec_info['Max'][i-1])
    if i == 1:
        df_indicator = y_series
    else:
        df_indicator = df_indicator + y_series


In [272]:
df_indicator

0       False
1       False
2       False
3       False
4       False
        ...  
7782    False
7783    False
7784    False
7785    False
7786    False
Length: 7787, dtype: bool

In [273]:
print(df_indicator.value_counts())
df_indicator[df_indicator==True] = 1
df_indicator[df_indicator==False] = 0

False    7787
dtype: int64


In [274]:
df_indicator

0       0
1       0
2       0
3       0
4       0
       ..
7782    0
7783    0
7784    0
7785    0
7786    0
Length: 7787, dtype: object

In [114]:
tv_valid_x['X_57']

0       1
1       1
2       1
3       1
4       1
       ..
7914    1
7915    1
7916    1
7917    1
7918    1
Name: X_57, Length: 7919, dtype: object

In [275]:
tv_valid_x['X_57'] = df_indicator



In [276]:
tv_valid_x

Unnamed: 0,X_01,X_02,X_03,X_04,X_05,X_06,X_07,X_08,X_09,X_12,...,X_48,X_49,X_50,X_51,X_52,X_53,X_54,X_55,X_56,X_57
0,71.563,103.320,67.57,1,101.971,77.022,28.92,115.21,255.36,4.33,...,1,15007.03,134.138760,133.239422,139.720132,132.260824,130.723186,147.624829,134.875225,0
1,71.563,103.320,68.77,1,101.957,77.022,29.19,65.61,226.13,4.33,...,1,13529.73,138.896565,131.893040,146.084222,124.647261,133.378628,133.070508,127.483059,0
2,72.583,103.321,68.97,1,101.886,73.963,27.28,100.07,201.50,4.32,...,1,18198.33,135.728131,136.153523,142.999094,133.327017,132.920208,132.970944,127.394512,0
3,71.563,103.321,66.97,1,101.894,73.963,29.54,70.54,232.43,4.38,...,1,10175.73,138.719342,141.135945,148.968607,129.596365,134.140392,140.277272,129.608166,0
4,68.504,103.320,66.27,1,102.060,73.963,29.27,1489.31,37.58,4.36,...,1,11701.33,132.915585,126.253651,134.689798,131.627813,128.283962,137.340878,127.799259,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7782,66.465,103.320,63.47,1,101.991,66.825,29.25,113.56,309.77,4.34,...,1,10788.33,137.863515,127.060688,141.395921,130.144148,129.194988,149.254658,134.306645,0
7783,66.465,103.320,61.37,1,103.124,66.825,30.09,75.55,271.60,4.41,...,1,9520.33,134.625760,131.708607,148.079164,134.663763,133.885074,144.587681,130.180798,0
7784,67.485,103.320,62.57,1,103.147,67.845,29.92,91.48,262.52,4.39,...,1,11219.83,129.077313,127.901684,139.193048,121.558905,129.625019,135.090886,132.432675,0
7785,66.465,103.320,66.57,1,102.004,70.904,29.68,102.63,293.71,4.42,...,1,9590.43,135.558888,130.808529,149.619753,133.522978,130.025181,147.548352,138.629078,0


In [278]:
tv_valid_x['X_57'] = tv_valid_x['X_57'].astype('int')

In [279]:
tv_valid_x.dtypes

X_01    float64
X_02    float64
X_03    float64
X_04      int64
X_05    float64
X_06    float64
X_07    float64
X_08    float64
X_09    float64
X_12    float64
X_13    float64
X_14    float64
X_15    float64
X_16    float64
X_17    float64
X_18    float64
X_19    float64
X_20    float64
X_21    float64
X_22    float64
X_23      int64
X_24    float64
X_25    float64
X_26    float64
X_27    float64
X_28    float64
X_29    float64
X_30    float64
X_31    float64
X_32    float64
X_33    float64
X_34    float64
X_35    float64
X_36    float64
X_37    float64
X_38    float64
X_39    float64
X_40    float64
X_41    float64
X_42    float64
X_43    float64
X_44    float64
X_45    float64
X_46      int64
X_47      int64
X_48      int64
X_49    float64
X_50    float64
X_51    float64
X_52    float64
X_53    float64
X_54    float64
X_55    float64
X_56    float64
X_57      int32
dtype: object

In [280]:
len(tv_train_x)
len(tv_valid_x)

7787

In [281]:
best = {'colsample_bytree': 0.572280100273023, 'learning_rate': 0.010283635038627429, 'max_depth': 180, 'min_child_samples': 135, 'min_split_gain': 0.04511227284338413, 'n_estimators': 900, 'num_leaves': 70, 'reg_alpha': 4.406681827912319, 'reg_lambda': 20.4785600448913, 'scale_pos_weight': 8.302374117433086, 'subsample': 0.1688669888026464}
model = MultiOutputRegressor(LGBMRegressor(n_jobs = -1, random_state = 1, **best))
    
loss = -cross_val_score(model, tv_valid_x, tv_valid_y, cv=10, scoring=make_scorer(lg_nrmse, greater_is_better=False)).mean()
print("NRMSE Loss {:.5f} params {}".format(loss, best))

NRMSE Loss 1.86358 params {'colsample_bytree': 0.572280100273023, 'learning_rate': 0.010283635038627429, 'max_depth': 180, 'min_child_samples': 135, 'min_split_gain': 0.04511227284338413, 'n_estimators': 900, 'num_leaves': 70, 'reg_alpha': 4.406681827912319, 'reg_lambda': 20.4785600448913, 'scale_pos_weight': 8.302374117433086, 'subsample': 0.1688669888026464}


NRMSE Loss 1.89188 params {'colsample_bytree': 0.572280100273023, 'learning_rate': 0.010283635038627429, 'max_depth': 180, 'min_child_samples': 135, 'min_split_gain': 0.04511227284338413, 'n_estimators': 900, 'num_leaves': 70, 'reg_alpha': 4.406681827912319, 'reg_lambda': 20.4785600448913, 'scale_pos_weight': 8.302374117433086, 'subsample': 0.1688669888026464}
