In [1]:
import numpy as np
import pandas as pd
import math

import seaborn as sns
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
from scipy.stats import chi2_contingency
from sklearn.preprocessing import StandardScaler

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

In [2]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error

def get_model_cv_prediction(model, X_data, y_target):
    """Print the mean RMSE of ``model`` over a 5-fold cross-validation.

    Scores are obtained from ``cross_val_score`` with the
    ``neg_mean_squared_error`` scorer. Each fold's score is negated and
    square-rooted to give a per-fold RMSE, and the mean over the folds is
    printed together with the model's class name. Nothing is returned.

    Args:
        model: Estimator handed to ``cross_val_score``.
        X_data: Feature data handed to ``cross_val_score``.
        y_target: Target values handed to ``cross_val_score``.
    """
    fold_neg_mse = cross_val_score(
        model, X_data, y_target, scoring="neg_mean_squared_error", cv=5
    )
    # The scorer reports negated MSE, so flip the sign before the square root.
    fold_rmse = np.sqrt(-fold_neg_mse)
    mean_rmse = fold_rmse.mean()

    print('##### ', model.__class__.__name__, ' #####')
    print(' 5 교차 검증의 평균 RMSE : {0:.3f} '.format(mean_rmse))

def get_stacking_base_datasets(model, X_train_n, y_train_n, X_test_n, n_folds):
    """Build out-of-fold and test-set predictions of one base model for stacking.

    The training rows are split into ``n_folds`` consecutive (unshuffled)
    folds. For every fold the model is re-fitted on the remaining rows and
    then used to predict (a) the held-out rows and (b) the whole test set.

    Args:
        model: Estimator exposing ``fit(X, y)`` and ``predict(X)``. It is
            fitted in place, so after the call it holds the last fold's fit.
        X_train_n: Training features, indexable by an integer index array
            (e.g. a NumPy array) and exposing ``shape``.
        y_train_n: Training targets, indexable the same way.
        X_test_n: Test features passed to ``model.predict``.
        n_folds: Number of K-fold splits.

    Returns:
        tuple: ``(train_fold_pred, test_pred_mean)``. ``train_fold_pred`` has
        shape ``(n_train, 1)`` and holds the out-of-fold predictions;
        ``test_pred_mean`` has shape ``(n_test, 1)`` and holds the test
        predictions averaged over the folds.
    """
    # With shuffle=False a random_state has no effect, and recent scikit-learn
    # releases raise ValueError when both are given, so it is left out. The
    # resulting splits are the same consecutive folds as before.
    kf = KFold(n_splits=n_folds, shuffle=False)
    train_fold_pred = np.zeros((X_train_n.shape[0], 1))
    test_pred = np.zeros((X_test_n.shape[0], n_folds))

    for folder_counter, (train_index, valid_index) in enumerate(kf.split(X_train_n)):
        X_tr = X_train_n[train_index]
        y_tr = y_train_n[train_index]
        X_te = X_train_n[valid_index]

        model.fit(X_tr, y_tr)

        # Out-of-fold predictions fill only the rows held out in this fold.
        train_fold_pred[valid_index, :] = np.reshape(model.predict(X_te), (-1, 1))
        # Flatten so a model that returns an (n, 1) column still fits the
        # one-dimensional column slot reserved for this fold.
        test_pred[:, folder_counter] = np.ravel(model.predict(X_test_n))

    # One test prediction per row: the average over the per-fold models.
    test_pred_mean = np.mean(test_pred, axis=1).reshape(-1, 1)

    return train_fold_pred, test_pred_mean

In [3]:
# Load the train, test and sample-submission CSVs, using each file's 'id'
# column as the DataFrame index.
train, test, submission = (
    pd.read_csv(csv_path, index_col='id')
    for csv_path in (
        '../data/train.csv',
        '../data/test.csv',
        '../data/sample_submission.csv',
    )
)

In [4]:
# Keep only the rows whose Y18 is missing, then remove five X columns along
# with the Y18 column itself (which is entirely null for these rows).
train_upper = train[train['Y18'].isnull()].drop(
    columns=['X04', 'X14', 'X16', 'X19', 'X36', 'Y18']
)

In [5]:
# Display the column labels of train_upper.
train_upper.columns

Index(['X00', 'X01', 'X02', 'X03', 'X05', 'X06', 'X07', 'X08', 'X09', 'X10',
       'X11', 'X12', 'X13', 'X15', 'X17', 'X18', 'X20', 'X21', 'X22', 'X23',
       'X24', 'X25', 'X26', 'X27', 'X28', 'X29', 'X30', 'X31', 'X32', 'X33',
       'X34', 'X35', 'X37', 'X38', 'X39', 'Y00', 'Y01', 'Y02', 'Y03', 'Y04',
       'Y05', 'Y06', 'Y07', 'Y08', 'Y09', 'Y10', 'Y11', 'Y12', 'Y13', 'Y14',
       'Y15', 'Y16', 'Y17'],
      dtype='object')

In [6]:
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.linear_model import Lasso

# Base learners used by three_model_auto.

# 'reg:squarederror' is the current name of the deprecated 'reg:linear'
# objective. The deprecated aliases `silent`, `nthread` and `seed` (all None
# before) and `missing=None` are no longer passed; `verbosity`, `n_jobs` and
# `random_state` already carry the corresponding settings.
xgbr = XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.7, gamma=0,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=3, min_child_weight=1, n_estimators=1500,
             n_jobs=1, objective='reg:squarederror', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
             subsample=1, verbosity=1)

# NOTE(review): `colsample_bylevel` does not appear among LightGBM's
# documented parameters and is probably ignored — confirm before relying on it.
lgbmr = LGBMRegressor(colsample_bylevel= 0.5, colsample_bytree= 0.7, max_depth= 3, n_estimators= 1500)

lasso_reg = Lasso(alpha= 0.001)

In [7]:
def three_model_auto(X_train_n, X_test_n, y_train_n):
    """Stack the predictions of the Lasso, XGBoost and LightGBM base models.

    Each of the module-level models ``lasso_reg``, ``xgbr`` and ``lgbmr`` is
    run, in that order, through ``get_stacking_base_datasets`` with 5 folds.
    The resulting prediction columns are placed side by side.

    Args:
        X_train_n: Training features passed to ``get_stacking_base_datasets``.
        X_test_n: Test features passed to ``get_stacking_base_datasets``.
        y_train_n: Training targets passed to ``get_stacking_base_datasets``.

    Returns:
        tuple: ``(Stack_final_X_train, Stack_final_X_test)``, the per-model
        train-side and test-side outputs concatenated along axis 1
        (column order: Lasso, XGBoost, LightGBM).
    """
    n_splits = 5
    train_columns, test_columns = [], []

    for base_model in (lasso_reg, xgbr, lgbmr):
        oof_pred, test_pred = get_stacking_base_datasets(
            base_model, X_train_n, y_train_n, X_test_n, n_splits
        )
        train_columns.append(oof_pred)
        test_columns.append(test_pred)

    return (
        np.concatenate(train_columns, axis=1),
        np.concatenate(test_columns, axis=1),
    )

In [8]:
columnList = train_upper.columns
target_var_list = [
    'Y00', 'Y01', 'Y02', 'Y03', 'Y04', 'Y05', 'Y06', 'Y07', 'Y08',
    'Y09', 'Y10', 'Y11', 'Y12', 'Y13', 'Y14', 'Y15', 'Y16', 'Y17',
]

# Every column that is not one of the listed target names is a model input.
input_var = [col for col in columnList if col not in target_var_list]

In [10]:
# Feature matrix (the input columns) and the Y00 column as the target.
X_features, y_target = train_upper[input_var], train_upper['Y00']

# X_train, X_test, y_train, y_test = train_test_split(X_features, y_target, test_size=0.3, random_state=0)
# print("X_train :: ",X_train.head())
# print("X_test :: ",X_test.head())
# print("y_train :: ",y_train.head())
# print("y_test :: ",y_test.head())


# print("X_features :: ",X_features.head())
# print("test_input :: ",test[input_var].head())

In [14]:
columnList = train_upper.columns
y_list = [
    'Y00', 'Y01', 'Y02', 'Y03', 'Y04', 'Y05', 'Y06', 'Y07', 'Y08',
    'Y09', 'Y10', 'Y11', 'Y12', 'Y13', 'Y14', 'Y15', 'Y16', 'Y17',
]
# The two Y columns the loops in the later cells iterate over.
target_var_list = ['Y15', 'Y16']

# Inputs are all columns of train_upper that are not named in y_list.
input_var = [col for col in columnList if col not in y_list]

# Drop from the test frame the same five X columns removed from train_upper.
test_input = test.drop(columns=['X04', 'X14', 'X16', 'X19', 'X36'])
X_features = train_upper[input_var]
# X_features

In [13]:
from sklearn.metrics import mean_squared_error

def stack_auto(X_features, X_test, y_target):
    """Fit a LightGBM meta-model on the stacked base-model predictions.

    The ``.values`` of the three inputs are passed to ``three_model_auto``.
    Only the train-side stack it returns is used here; the test-side stack is
    discarded. A fresh ``LGBMRegressor`` is fitted on that train-side stack
    against ``y_target``, and the RMSE of its predictions on the very same
    rows (i.e. an in-sample figure) is printed.

    Args:
        X_features: Training features; must expose ``.values``.
        X_test: Test features; must expose ``.values``. Forwarded to
            ``three_model_auto`` only.
        y_target: Training target; must expose ``.values``.

    Returns:
        The fitted meta-model.
    """
    stacked_train, _stacked_test_unused = three_model_auto(
        X_features.values, X_test.values, y_target.values
    )

    meta_params = {
        'colsample_bylevel': 0.5,
        'colsample_bytree': 0.7,
        'max_depth': 3,
        'n_estimators': 1500,
    }
    meta_model_lgbmr = LGBMRegressor(**meta_params)
    meta_model_lgbmr.fit(stacked_train, y_target)

    # Score on the rows the meta-model was just fitted on.
    in_sample_pred = meta_model_lgbmr.predict(stacked_train)
    rmse = np.sqrt(mean_squared_error(y_target, in_sample_pred))
    print('스태킹 회귀 모델 최종 RMSE :: ',rmse)

    return meta_model_lgbmr

In [16]:
X_features = train_upper[input_var]

# Rows that do have a Y18 value, with five X columns removed.
trainNotNull = train[train['Y18'].notnull()].drop(
    columns=['X04', 'X14', 'X16', 'X19', 'X36']
)

for target_col in target_var_list:
    print(target_col)

    # Meta-model for this target, fitted on the rows of train_upper.
    y_target = train_upper[target_col]
    model = stack_auto(X_features, test_input, y_target)

    # Rebuild the stacked features, this time with the Y18 rows on the
    # "test" side so the meta-model can be applied to them.
    X_train_n, X_test_n, y_train_n = (
        train_upper[input_var].values,
        trainNotNull[input_var].values,
        train_upper[target_col].values,
    )
    final_X_train, final_X_test = three_model_auto(X_train_n, X_test_n, y_train_n)

    # Write the prediction into this target's column of the Y18 rows.
    pred = model.predict(final_X_test)
    trainNotNull[target_col] = pred

Y15
스태킹 회귀 모델 최종 RMSE ::  1.2604191356174996
Y16
스태킹 회귀 모델 최종 RMSE ::  1.437818611401416


In [18]:
# test

trainNotNull_X_features = trainNotNull[input_var]

for target_col in target_var_list:
    print(target_col)

    # Meta-model for this target, fitted on the rows of trainNotNull.
    trainNotNull_y_target = trainNotNull[target_col]
    model = stack_auto(trainNotNull_X_features, test_input, trainNotNull_y_target)

    # Rebuild the stacked features with the test frame on the "test" side.
    X_train_n, X_test_n, y_train_n = (
        trainNotNull[input_var].values,
        test[input_var].values,
        trainNotNull[target_col].values,
    )
    final_X_train, final_X_test = three_model_auto(X_train_n, X_test_n, y_train_n)

    # Add the prediction to the test frame as a new column for this target.
    pred = model.predict(final_X_test)
    test[target_col] = pred

Y15
스태킹 회귀 모델 최종 RMSE ::  0.6103677216787506
Y16
스태킹 회귀 모델 최종 RMSE ::  0.5372330056569986


In [20]:
input_var_y = ['Y15', 'Y16']
target_var_y = ['Y18']

# target_var_y is a list, so the target selected here is a one-column
# DataFrame rather than a Series.
trainNotNull_features_y, trainNotNull_y_target = (
    trainNotNull[input_var_y],
    trainNotNull[target_var_y],
)
# The training features are passed for both feature arguments.
modelBB = stack_auto(trainNotNull_features_y, trainNotNull_features_y, trainNotNull_y_target)

# Rebuild the stacked features with the test frame's Y15/Y16 on the "test" side.
trainNotNull_X_train_n, trainNotNull_X_test_n, trainNotNull_y_train_n = (
    trainNotNull[input_var_y].values,
    test[input_var_y].values,
    trainNotNull[target_var_y].values,
)

trainNotNull_final_X_train, trainNotNull_final_X_test = three_model_auto(
    trainNotNull_X_train_n, trainNotNull_X_test_n, trainNotNull_y_train_n
)

pred = modelBB.predict(trainNotNull_final_X_test)
pred

스태킹 회귀 모델 최종 RMSE ::  1.0377650112415577


array([21.09660948, 21.45818682, 21.54536619, ..., 30.21326435,
       26.48961455, 27.25603648])

In [21]:
# Two-column table (test ids and the Y18 prediction), written without the
# DataFrame's own index.
outputDf = pd.DataFrame({'id': test.index, 'Y18': pred})
outputDf.to_csv('submission.csv', index=False)