In [2]:
import numpy as np
import pandas as pd
import math

import seaborn as sns
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
from scipy.stats import chi2_contingency
from sklearn.preprocessing import StandardScaler

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

In [3]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict

def get_model_cv_prediction(model, X_data, y_target):
    """Print the mean RMSE of ``model`` over a 5-fold cross-validation.

    Args:
        model: Estimator forwarded to ``cross_val_score``.
        X_data: Feature data forwarded to ``cross_val_score``.
        y_target: Target values forwarded to ``cross_val_score``.

    Returns:
        None. The class name of ``model`` and the averaged RMSE are printed.
    """
    # The scorer reports negated MSE, so flip the sign before the square root.
    fold_neg_mse = cross_val_score(
        model,
        X_data,
        y_target,
        scoring="neg_mean_squared_error",
        cv=5,
    )
    fold_rmse = np.sqrt(-fold_neg_mse)

    print('##### ', model.__class__.__name__, ' #####')
    print(' 5 교차 검증의 평균 RMSE : {0:.3f} '.format(fold_rmse.mean()))

In [13]:
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error

def get_stacking_base_datasets(model, X_train_n, y_train_n, X_test_n, n_folds):
    """Build the stacking features that one base model contributes.

    The training rows are split into ``n_folds`` consecutive folds. For every
    fold the model is fitted on the remaining rows and used to predict both
    the held-out rows (out-of-fold predictions) and the whole test set.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``. It is
            refitted in place once per fold, so after the call it holds the
            fit of the last fold.
        X_train_n: Training features, indexable by integer arrays along the
            first axis and exposing ``shape``.
        y_train_n: Training targets, 1-D or a single-column 2-D array-like.
        X_test_n: Test features accepted by ``model.predict``.
        n_folds: Number of K-fold splits.

    Returns:
        Tuple ``(train_fold_pred, test_pred_mean)``: the out-of-fold
        predictions with shape ``(n_train, 1)`` and the per-row mean of the
        ``n_folds`` test predictions with shape ``(n_test, 1)``.
    """
    # Positional indexing below must not be confused with label lookup, and
    # estimators expect a 1-D target rather than an (n, 1) column vector.
    y_train_n = np.asarray(y_train_n)
    if y_train_n.ndim == 2 and y_train_n.shape[1] == 1:
        y_train_n = y_train_n.ravel()

    # random_state is meaningless without shuffling, and recent scikit-learn
    # raises ValueError when it is combined with shuffle=False.
    kf = KFold(n_splits=n_folds, shuffle=False)
    train_fold_pred = np.zeros((X_train_n.shape[0], 1))
    test_pred = np.zeros((X_test_n.shape[0], n_folds))

    for fold_idx, (train_index, valid_index) in enumerate(kf.split(X_train_n)):
        X_tr = X_train_n[train_index]
        y_tr = y_train_n[train_index]
        X_te = X_train_n[valid_index]

        model.fit(X_tr, y_tr)

        # Flatten/reshape explicitly so that both (n,) and (n, 1) predictions
        # fit their destination slots.
        train_fold_pred[valid_index, :] = np.reshape(model.predict(X_te), (-1, 1))
        test_pred[:, fold_idx] = np.ravel(model.predict(X_test_n))

    test_pred_mean = np.mean(test_pred, axis=1).reshape(-1, 1)

    return train_fold_pred, test_pred_mean

In [5]:
# Load the three CSV files; each table is indexed by its `id` column.
train, test, submission = (
    pd.read_csv(csv_path, index_col='id')
    for csv_path in (
        '../data/train.csv',
        '../data/test.csv',
        '../data/sample_submission.csv',
    )
)

In [6]:
# Keep the rows that have no Y18 value, then drop the excluded X columns
# together with the (here entirely null) Y18 column.
train_upper = train.loc[train['Y18'].isnull()].drop(
    ['X04', 'X14', 'X16', 'X19', 'X36', 'Y18'],
    axis=1,
)

In [7]:
# Show which columns remain in train_upper after the drop above.
train_upper.columns

Index(['X00', 'X01', 'X02', 'X03', 'X05', 'X06', 'X07', 'X08', 'X09', 'X10',
       'X11', 'X12', 'X13', 'X15', 'X17', 'X18', 'X20', 'X21', 'X22', 'X23',
       'X24', 'X25', 'X26', 'X27', 'X28', 'X29', 'X30', 'X31', 'X32', 'X33',
       'X34', 'X35', 'X37', 'X38', 'X39', 'Y00', 'Y01', 'Y02', 'Y03', 'Y04',
       'Y05', 'Y06', 'Y07', 'Y08', 'Y09', 'Y10', 'Y11', 'Y12', 'Y13', 'Y14',
       'Y15', 'Y16', 'Y17'],
      dtype='object')

In [8]:
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.linear_model import Lasso

# Base learners shared by every stacking run in this notebook.

# 'reg:squarederror' replaces 'reg:linear', which XGBoost reports as
# deprecated. The legacy arguments silent / nthread / seed / missing were all
# passed as None and are simply left at their defaults.
xgbr = XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.7, gamma=0,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=3, min_child_weight=1, n_estimators=1500,
             n_jobs=1, objective='reg:squarederror', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
             subsample=1, verbosity=1)

# NOTE(review): colsample_bylevel does not appear to be a documented LightGBM
# parameter (colsample_bytree / colsample_bynode are) - confirm whether it has
# any effect before relying on it.
lgbmr = LGBMRegressor(colsample_bylevel= 0.5, colsample_bytree= 0.7, max_depth= 3, n_estimators= 1500)

lasso_reg = Lasso(alpha= 0.001)

In [81]:
def three_model_auto(X_train_n, X_test_n, y_train_n, n_folds=5):
    """Stack the predictions of the Lasso, XGBoost and LightGBM base models.

    Each of the module-level estimators ``lasso_reg``, ``xgbr`` and ``lgbmr``
    is passed through ``get_stacking_base_datasets``; the resulting single
    columns are concatenated side by side.

    Args:
        X_train_n: Training features forwarded to ``get_stacking_base_datasets``.
        X_test_n: Test features forwarded to ``get_stacking_base_datasets``.
        y_train_n: Training targets forwarded to ``get_stacking_base_datasets``.
        n_folds: Number of folds used for every base model. Defaults to 5,
            the value that was previously hard-coded.

    Returns:
        Tuple ``(Stack_final_X_train, Stack_final_X_test)`` whose columns are
        ordered (lasso, xgb, lgbm).
    """
    train_columns = []
    test_columns = []

    # The order fixes the column layout that the meta-model is trained on.
    for base_model in (lasso_reg, xgbr, lgbmr):
        fold_train, fold_test = get_stacking_base_datasets(
            base_model, X_train_n, y_train_n, X_test_n, n_folds
        )
        train_columns.append(fold_train)
        test_columns.append(fold_test)

    Stack_final_X_train = np.concatenate(train_columns, axis=1)
    Stack_final_X_test = np.concatenate(test_columns, axis=1)

    return Stack_final_X_train, Stack_final_X_test

In [9]:
columnList = train_upper.columns
target_var_list = [
    'Y00', 'Y01', 'Y02', 'Y03', 'Y04', 'Y05', 'Y06', 'Y07', 'Y08',
    'Y09', 'Y10', 'Y11', 'Y12', 'Y13', 'Y14', 'Y15', 'Y16', 'Y17',
]

# Every column that is not one of the Y00-Y17 targets is a model input.
input_var = [column for column in columnList if column not in target_var_list]

In [27]:
# Feature matrix and the Y00 target taken from the rows without Y18.
X_features = train_upper.loc[:, input_var]
y_target = train_upper.loc[:, 'Y00']

# Preview the training inputs next to the same columns of the test set.
print("X_features :: ", X_features.head())
print("test_input :: ", test.loc[:, input_var].head())


X_features ::      X00    X01  X02  X03     X05    X06   X07     X08     X09  X10  ...  \
id                                                                  ...   
0   9.7  988.8  1.2  0.6  1009.3  989.6  12.2  1009.9  1009.8  0.0  ...   
1   9.3  988.9  1.7  1.9  1009.3  989.6  12.1  1010.0  1009.9  0.0  ...   
2   9.4  989.0  1.1  2.3  1009.2  989.7  12.1  1010.1  1010.1  0.0  ...   
3   9.4  988.9  1.5  0.7  1009.2  989.6  12.0  1010.0  1010.0  0.0  ...   
4   9.2  988.9  0.8  1.7  1009.2  989.7  12.0  1010.1  1010.0  0.0  ...   

       X29   X30  X31   X32     X33  X34    X35   X37   X38  X39  
id                                                                
0   1001.6  69.1  8.2  10.7  1010.1  0.0  256.4  77.2  62.6  0.0  
1   1001.7  70.3  8.3  10.3  1010.1  0.0  215.4  77.3  63.5  0.0  
2   1001.6  71.5  8.0   9.7  1010.0  0.0  235.2  77.3  63.9  0.0  
3   1001.5  73.2  7.7   9.4  1010.1  0.0  214.0  77.5  64.5  0.0  
4   1001.5  74.3  7.4   9.4  1010.1  0.0  174.9  78.0  65

In [66]:
columnList = train_upper.columns
target_var_list = [
    'Y00', 'Y01', 'Y02', 'Y03', 'Y04', 'Y05', 'Y06', 'Y07', 'Y08',
    'Y09', 'Y10', 'Y11', 'Y12', 'Y13', 'Y14', 'Y15', 'Y16', 'Y17',
]

# Model inputs are the columns that are not one of the Y00-Y17 targets.
input_var = [column for column in columnList if column not in target_var_list]

# Remove the same X columns from the test set that were removed from train_upper.
test_input = test.drop(columns=['X04', 'X14', 'X16', 'X19', 'X36'])
X_features = train_upper.loc[:, input_var]
X_features

Unnamed: 0_level_0,X00,X01,X02,X03,X05,X06,X07,X08,X09,X10,...,Y08,Y09,Y10,Y11,Y12,Y13,Y14,Y15,Y16,Y17
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,9.7,988.8,1.2,0.6,1009.3,989.6,12.2,1009.9,1009.8,0.0,...,10.0,7.0,7.5,7.0,9.0,10.0,9.5,9.0,8.0,9.0
1,9.3,988.9,1.7,1.9,1009.3,989.6,12.1,1010.0,1009.9,0.0,...,9.5,6.5,7.5,7.0,8.5,10.0,9.5,9.0,7.5,9.0
2,9.4,989.0,1.1,2.3,1009.2,989.7,12.1,1010.1,1010.1,0.0,...,9.0,6.5,7.5,6.5,8.0,9.5,9.5,8.5,7.5,8.5
3,9.4,988.9,1.5,0.7,1009.2,989.6,12.0,1010.0,1010.0,0.0,...,9.0,6.0,7.0,6.0,8.0,9.5,9.0,8.5,7.5,8.5
4,9.2,988.9,0.8,1.7,1009.2,989.7,12.0,1010.1,1010.0,0.0,...,8.5,6.0,7.0,6.0,7.5,9.5,9.0,8.5,7.5,8.5
5,9.1,988.9,1.3,0.7,1009.1,989.6,12.0,1010.0,1010.0,0.0,...,8.5,6.0,7.0,6.0,7.5,9.0,9.0,8.5,7.0,8.5
6,9.1,988.8,2.4,1.9,1009.1,989.6,12.0,1010.1,1010.0,0.0,...,8.5,5.5,6.5,5.5,7.5,9.0,8.5,8.0,7.0,8.0
7,8.9,988.9,1.0,1.2,1009.2,989.7,11.9,1010.1,1010.1,0.0,...,8.5,5.5,6.0,5.5,7.5,9.0,8.5,8.0,7.0,8.0
8,9.0,988.9,1.5,0.1,1009.2,989.6,11.9,1010.0,1010.1,0.0,...,8.0,5.0,6.0,5.0,7.0,9.0,8.5,8.0,7.0,8.0
9,8.8,988.9,1.4,0.4,1009.4,989.6,11.8,1010.1,1010.1,0.0,...,8.0,4.5,5.5,5.0,7.0,8.5,8.5,8.0,7.0,8.0


In [78]:
from sklearn.metrics import mean_squared_error

def stack_auto(X_features, X_test, y_target):
    """Fit a LightGBM meta-model on the stacked base-model predictions.

    Args:
        X_features: Training features; must expose ``.values``.
        X_test: Test features; must expose ``.values``. They are only passed
            on to ``three_model_auto``; the stacked test block built from
            them is not used or returned here.
        y_target: Training targets; must expose ``.values``.

    Returns:
        The fitted ``LGBMRegressor`` meta-model.
    """
    # Only the stacked training block is needed to fit the meta-model.
    stacked_train, _ = three_model_auto(
        X_features.values, X_test.values, y_target.values
    )

    meta_model_lgbmr = LGBMRegressor(colsample_bylevel= 0.5, colsample_bytree= 0.7, max_depth= 3, n_estimators= 1500)
    meta_model_lgbmr.fit(stacked_train, y_target)

    # In-sample score: the predictions are made on the very rows the
    # meta-model was fitted on.
    train_pred = meta_model_lgbmr.predict(stacked_train)
    rmse = np.sqrt(mean_squared_error(y_target, train_pred))
    print('스태킹 회귀 모델 최종 RMSE :: ',rmse)

    return meta_model_lgbmr

In [82]:
X_features = train_upper[input_var]

# Rows that do have a Y18 value, with the same X columns removed as in train_upper.
trainNotNull = train.loc[train['Y18'].notnull()].drop(
    ['X04', 'X14', 'X16', 'X19', 'X36'], axis=1
)

# The input matrices do not change inside the loop: only target columns are
# written, and input_var excludes every name in target_var_list.
X_train_n = train_upper[input_var].values
X_test_n = trainNotNull[input_var].values

# For each of Y00-Y17: fit the stack on train_upper and write the predictions
# for the Y18 rows into trainNotNull.
for target_col in target_var_list:
    print(target_col)

    y_target = train_upper[target_col]
    model = stack_auto(X_features, test_input, y_target)

    # Rebuild the stacked features, this time with trainNotNull as the
    # prediction set, because stack_auto only returns the meta-model.
    y_train_n = y_target.values
    final_X_train, final_X_test = three_model_auto(X_train_n, X_test_n, y_train_n)

    pred = model.predict(final_X_test)
    trainNotNull[target_col] = pred

win64_release_0.90/src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
스태킹 회귀 모델 최종 RMSE ::  1.7090953643756879
Y07
스태킹 회귀 모델 최종 RMSE ::  2.4046914136651436
Y08
스태킹 회귀 모델 최종 RMSE ::  2.373834363585924
Y09
스태킹 회귀 모델 최종 RMSE ::  1.4428999027855829
Y10
스태킹 회귀 모델 최종 RMSE ::  1.6410069191781216
Y11
스태킹 회귀 모델 최종 RMSE ::  1.9523652540121246
Y12
스태킹 회귀 모델 최종 RMSE ::  2.0584313502918388
Y13
스태킹 회귀 모델 최종 RMSE ::  0.9872671590028124
Y14
스태킹 회귀 모델 최종 RMSE ::  1.3614650232705945
Y15
스태킹 회귀 모델 최종 RMSE ::  1.2604191356174996
Y16
스태킹 회귀 모델 최종 RMSE ::  1.437818611401416
Y17
스태킹 회귀 모델 최종 RMSE ::  1.885828630100493


In [83]:
# Display trainNotNull after the Y00-Y17 columns were overwritten with predictions.
trainNotNull

Unnamed: 0_level_0,X00,X01,X02,X03,X05,X06,X07,X08,X09,X10,...,Y09,Y10,Y11,Y12,Y13,Y14,Y15,Y16,Y17,Y18
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4320,19.3,987.7,0.9,2.2,1007.7,988.0,20.8,1007.8,1007.4,0.0,...,18.024362,19.016616,18.707396,18.266830,19.493777,18.833328,19.157150,19.492274,18.524280,20.5
4321,19.0,987.6,1.9,2.2,1007.7,988.0,20.6,1007.8,1007.4,0.0,...,18.561098,19.156632,18.771770,18.634934,19.344874,18.758819,18.861898,19.356637,19.266974,20.5
4322,19.1,987.6,2.0,1.4,1007.8,988.1,20.5,1007.9,1007.4,0.0,...,18.764516,20.171874,18.485021,19.980204,19.372882,18.669037,18.772897,19.309103,18.497922,20.5
4323,19.2,987.7,1.8,1.5,1007.9,988.1,20.5,1007.9,1007.5,0.0,...,18.850351,18.986766,18.266430,19.834118,19.344874,18.669037,19.011264,19.235606,18.875372,20.5
4324,19.2,987.8,1.4,1.4,1007.9,988.1,20.5,1007.8,1007.6,0.0,...,18.838112,20.171874,18.525876,19.405385,19.159680,18.646099,19.061131,19.316650,18.962914,20.5
4325,19.0,987.9,1.5,1.2,1007.8,988.1,20.4,1008.0,1007.7,0.0,...,18.826109,19.557012,18.406596,19.839679,19.159680,18.773703,18.967355,19.465124,18.906076,20.5
4326,19.1,987.9,1.2,1.0,1007.8,988.1,20.3,1008.0,1007.7,0.0,...,18.561098,19.767994,18.218662,19.437144,19.249303,18.679527,18.923413,19.256234,18.557909,20.5
4327,19.3,987.8,0.7,0.6,1007.8,988.1,20.4,1008.0,1007.6,0.0,...,18.493920,20.112263,18.266430,19.744001,19.283814,18.648551,19.423429,19.601100,18.962066,20.5
4328,19.1,987.7,2.7,0.1,1007.7,988.1,20.4,1008.0,1007.5,0.0,...,18.564053,19.402727,18.425798,20.171782,19.301722,18.648551,19.080972,19.601100,18.939172,20.5
4329,18.8,987.7,1.2,0.8,1007.6,988.1,20.4,1008.0,1007.5,0.0,...,18.308893,18.236791,18.652354,19.019723,19.283814,18.761271,18.772897,19.256234,19.035289,20.5


In [85]:
# Repeat the Y00-Y17 prediction step for the test set, this time training on
# the trainNotNull rows.

trainNotNull_X_features = trainNotNull[input_var]

# Only target columns are written inside the loop, so both input matrices
# can be built once up front.
X_train_n = trainNotNull[input_var].values
X_test_n = test[input_var].values

for target_col in target_var_list:
    print(target_col)

    trainNotNull_y_target = trainNotNull[target_col]
    model = stack_auto(trainNotNull_X_features, test_input, trainNotNull_y_target)

    # stack_auto only returns the meta-model, so the stacked test features
    # are rebuilt here before predicting.
    y_train_n = trainNotNull_y_target.values
    final_X_train, final_X_test = three_model_auto(X_train_n, X_test_n, y_train_n)

    pred = model.predict(final_X_test)
    test[target_col] = pred


4_release_0.90/src/objective/regression_obj.cu:152: reg:linear is now deprecated in favor of reg:squarederror.
스태킹 회귀 모델 최종 RMSE ::  0.6420223162478942
Y07
스태킹 회귀 모델 최종 RMSE ::  0.8567847365235595
Y08
스태킹 회귀 모델 최종 RMSE ::  0.9612652265499358
Y09
스태킹 회귀 모델 최종 RMSE ::  0.47663732159265226
Y10
스태킹 회귀 모델 최종 RMSE ::  0.6085186610790525
Y11
스태킹 회귀 모델 최종 RMSE ::  0.8842396135571593
Y12
스태킹 회귀 모델 최종 RMSE ::  0.7407231447003322
Y13
스태킹 회귀 모델 최종 RMSE ::  0.3974545216861552
Y14
스태킹 회귀 모델 최종 RMSE ::  0.5492027282958716
Y15
스태킹 회귀 모델 최종 RMSE ::  0.6103677216787506
Y16
스태킹 회귀 모델 최종 RMSE ::  0.5372330056569986
Y17
스태킹 회귀 모델 최종 RMSE ::  0.8196228165729676


In [86]:
# Display the test frame, which now also carries the predicted Y00-Y17 columns.
test

Unnamed: 0_level_0,X00,X01,X02,X03,X04,X05,X06,X07,X08,X09,...,Y08,Y09,Y10,Y11,Y12,Y13,Y14,Y15,Y16,Y17
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4752,18.7,987.4,1.5,1.3,0.0,1006.8,987.7,21.1,1007.5,1007.2,...,20.810480,18.589973,20.834686,21.158426,19.074035,19.743733,18.931820,19.938210,18.908477,18.973299
4753,18.8,987.4,1.1,1.2,0.0,1006.7,987.7,21.2,1007.5,1007.2,...,20.640954,18.945430,20.254233,20.704838,19.493299,19.437392,18.778500,19.938210,18.974492,18.877379
4754,19.0,987.4,1.3,1.2,0.0,1006.6,987.6,21.2,1007.4,1007.2,...,20.646925,19.059014,20.343408,20.234506,19.887911,19.153711,18.753222,19.826470,18.963576,18.987541
4755,18.7,987.4,2.3,0.8,0.0,1006.6,987.6,21.1,1007.4,1007.2,...,20.157757,18.755443,19.991351,20.279525,19.463265,19.437392,18.774489,19.811864,18.664110,18.889356
4756,18.4,987.4,1.1,0.7,0.0,1006.7,987.7,20.9,1007.5,1007.2,...,20.397901,18.552885,20.256246,21.104111,19.059931,19.043804,18.744309,19.826409,18.649848,18.916155
4757,18.6,987.4,1.3,0.0,0.0,1006.8,987.7,20.7,1007.5,1007.2,...,19.761211,18.480741,20.098883,18.742098,19.493299,18.812625,18.772141,19.702203,18.838739,18.837903
4758,18.6,987.3,1.0,0.8,0.0,1006.7,987.6,20.6,1007.4,1007.1,...,19.835520,18.325949,20.126553,19.717526,18.504135,18.619740,18.783610,19.763850,18.600157,18.864702
4759,18.7,987.5,1.5,0.8,0.0,1006.7,987.5,20.5,1007.4,1007.3,...,19.755429,18.209733,20.098883,18.583011,18.443301,18.540791,18.833859,19.643970,18.940482,18.916155
4760,18.5,987.5,1.2,1.6,0.0,1006.8,987.4,20.4,1007.3,1007.4,...,19.656321,18.346893,19.993771,18.640271,19.194002,18.542074,18.552544,19.643970,18.189084,18.604016
4761,18.5,987.5,1.5,0.0,0.0,1006.8,987.5,20.4,1007.4,1007.4,...,19.673058,18.131959,19.726530,19.046563,19.041268,18.283820,18.425272,19.324241,18.057854,18.446468


In [87]:
# Final stage: predict Y18 from the Y00-Y17 columns.
input_var_y = ['Y00', 'Y01', 'Y02', 'Y03', 'Y04', 'Y05', 'Y06', 'Y07', 'Y08', 'Y09', 'Y10', 'Y11', 'Y12', 'Y13', 'Y14', 'Y15', 'Y16', 'Y17']
target_var_y = ['Y18']

trainNotNull_features_y = trainNotNull[input_var_y]
# Select the target by name so it is a 1-D Series. Indexing with the
# one-element list produced an (n, 1) frame, i.e. a column-vector target for
# every estimator.
trainNotNull_y_target = trainNotNull[target_var_y[0]]

# stack_auto requires an X_test argument but only returns the meta-model, so
# the training features are passed as a placeholder.
modelBB = stack_auto(trainNotNull_features_y, trainNotNull_features_y, trainNotNull_y_target)

trainNotNull_X_train_n = trainNotNull_features_y.values
trainNotNull_X_test_n = test[input_var_y].values
trainNotNull_y_train_n = trainNotNull_y_target.values

# Rebuild the stacked features with the real test set as the prediction set.
trainNotNull_final_X_train, trainNotNull_final_X_test = three_model_auto(trainNotNull_X_train_n, trainNotNull_X_test_n, trainNotNull_y_train_n)

pred = modelBB.predict(trainNotNull_final_X_test)
pred

스태킹 회귀 모델 최종 RMSE ::  0.9289497864980866


array([20.99288906, 20.88184469, 20.92225391, ..., 28.88505748,
       29.45419682, 29.36736865])

In [88]:
# Display the Y18 predictions produced by modelBB for the test set.
pred

array([20.99288906, 20.88184469, 20.92225391, ..., 28.88505748,
       29.45419682, 29.36736865])

In [89]:
# Pair each test id with its predicted Y18 and write the result without the
# positional index.
outputDf = pd.DataFrame({'id': test.index.values, 'Y18': pred})
outputDf.to_csv('submission.csv', index=False)