In [1]:
import numpy as np
import pandas as pd
import math

import seaborn as sns
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
from scipy.stats import chi2_contingency
from sklearn.preprocessing import StandardScaler

import warnings
warnings.filterwarnings('ignore')

%matplotlib inline

In [2]:
train = pd.read_csv('../data/train.csv', index_col='id')
test = pd.read_csv('../data/test.csv', index_col='id')
submission = pd.read_csv('../data/sample_submission.csv', index_col='id')

In [3]:
train_upper = train[train['Y18'].isnull()]
train_upper = train_upper.drop(['X04', 'X14', 'X16', 'X19', 'X36', 'Y18'], axis=1)

In [4]:
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.linear_model import Lasso

xgbr = XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.7, gamma=0,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=None, n_estimators=1500,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=None, subsample=1, verbosity=1)

lgbmr = LGBMRegressor(colsample_bylevel= 0.5, colsample_bytree= 0.7, max_depth= 3, n_estimators= 1500)

lasso_reg = Lasso(alpha= 0.001)

In [5]:
columnList = train_upper.columns
target_var_list = ['Y00', 'Y01', 'Y02', 'Y03', 'Y04', 'Y05', 'Y06', 'Y07', 'Y08', 'Y09', 'Y10', 'Y11', 'Y12', 'Y13', 'Y14', 'Y15', 'Y16', 'Y17']

input_var = list(filter(lambda x: target_var_list.count(x) == 0, columnList))

In [6]:
X_features = train_upper[input_var]
X_target = train_upper['Y00']

In [33]:
from sklearn.metrics import mean_squared_error

def model_auto(model, features, target, test):
    model.fit(features, target)
    pred = model.predict(test)
    
    return pred

def model_list_auto(models, features, target, input_test):
    print("model_list_auto working")

    predList = []
    testPredList = []
    for model in models:
        model.fit(features, target)

        pred = model.predict(features)
        testPred = model.predict(input_test)

        predList.append(pred)
        testPredList.append(testPred)

    transposed = np.array(predList).T.tolist()
    testTransposed = np.array(testPredList).T.tolist()

    df = pd.DataFrame(transposed) 
    testDf = pd.DataFrame(testTransposed) 

    return df, testDf

def stack_model_pred(model, new_features, target, input_test):
    model.fit(new_features, target)
    testPred = model.predict(new_features)

    mse = mean_squared_error(target, testPred)
    rmse = np.sqrt(mse)

    print('스태킹 회귀 모델 최종 MSE :: ',mse)
    print('스태킹 회귀 모델 최종 RMSE :: ',rmse)

    pred = model.predict(input_test)
    
    return model, pred


In [16]:
trainY18 = train[train['Y18'].notnull()]
trainY18 = trainY18.drop(['X04', 'X14', 'X16', 'X19', 'X36'], axis=1)
trainY18.head()

Unnamed: 0_level_0,X00,X01,X02,X03,X05,X06,X07,X08,X09,X10,...,Y09,Y10,Y11,Y12,Y13,Y14,Y15,Y16,Y17,Y18
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4320,19.3,987.7,0.9,2.2,1007.7,988.0,20.8,1007.8,1007.4,0.0,...,,,,,,,,,,20.5
4321,19.0,987.6,1.9,2.2,1007.7,988.0,20.6,1007.8,1007.4,0.0,...,,,,,,,,,,20.5
4322,19.1,987.6,2.0,1.4,1007.8,988.1,20.5,1007.9,1007.4,0.0,...,,,,,,,,,,20.5
4323,19.2,987.7,1.8,1.5,1007.9,988.1,20.5,1007.9,1007.5,0.0,...,,,,,,,,,,20.5
4324,19.2,987.8,1.4,1.4,1007.9,988.1,20.5,1007.8,1007.6,0.0,...,,,,,,,,,,20.5


In [17]:
models = [xgbr, lgbmr, lasso_reg]

X_features = train_upper[input_var]
X_test = test[input_var]

trainY18_test = trainY18[input_var]

# target_var_list = ['Y00', 'Y01', 'Y02', 'Y03', 'Y04', 'Y05', 'Y06', 'Y07', 'Y08', 'Y09', 'Y10', 'Y11', 'Y12', 'Y13', 'Y14', 'Y15', 'Y16', 'Y17']
target_var_list = ['Y15', 'Y16']
# target_var_list = ['Y00']

for target_col in target_var_list:
    print(target_col)
    X_target = train_upper[target_col]
    df, X_test_df = model_list_auto(models, X_features, X_target, X_test)
    new_model, pred = stack_model_pred(lgbmr, df, X_target, X_test_df)

    df, y18_df = model_list_auto(models, X_features, X_target, trainY18_test)
    y18_new_model, y18_pred = stack_model_pred(lgbmr, df, X_target, y18_df)
    # y18_pred = new_model.predict(y_18_df)

    test[target_col] = pred
    trainY18[target_col] = y18_pred

Y15
model_list_auto working
스태킹 회귀 모델 최종 MSE ::  0.05020387290126828
스태킹 회귀 모델 최종 RMSE ::  0.22406220765954324
model_list_auto working
스태킹 회귀 모델 최종 MSE ::  0.05020387290126828
스태킹 회귀 모델 최종 RMSE ::  0.22406220765954324
Y16
model_list_auto working
스태킹 회귀 모델 최종 MSE ::  0.07725853144750126
스태킹 회귀 모델 최종 RMSE ::  0.2779541894764338
model_list_auto working
스태킹 회귀 모델 최종 MSE ::  0.07725853144750126
스태킹 회귀 모델 최종 RMSE ::  0.2779541894764338


In [30]:
# trainY18.head()
test.head()

Unnamed: 0_level_0,X00,X01,X02,X03,X04,X05,X06,X07,X08,X09,...,X33,X34,X35,X36,X37,X38,X39,Y00,Y15,Y16
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4752,18.7,987.4,1.5,1.3,0.0,1006.8,987.7,21.1,1007.5,1007.2,...,1007.6,0.0,194.7,0.0,84.0,59.9,0.0,22.004439,18.550034,18.822753
4753,18.8,987.4,1.1,1.2,0.0,1006.7,987.7,21.2,1007.5,1007.2,...,1007.5,0.0,173.1,0.0,84.0,60.5,0.0,21.977006,18.829327,19.027689
4754,19.0,987.4,1.3,1.2,0.0,1006.6,987.6,21.2,1007.4,1007.2,...,1007.5,0.0,208.6,0.0,85.0,60.8,0.0,22.022443,18.806921,18.6131
4755,18.7,987.4,2.3,0.8,0.0,1006.6,987.6,21.1,1007.4,1007.2,...,1007.4,0.0,185.0,0.0,85.8,61.2,0.0,22.001407,18.412259,18.205663
4756,18.4,987.4,1.1,0.7,0.0,1006.7,987.7,20.9,1007.5,1007.2,...,1007.5,0.0,162.8,0.0,87.3,61.7,0.0,21.584307,18.65569,18.649193


In [35]:
models = [xgbr, lgbmr, lasso_reg]
final_input_var = ['Y15', 'Y16']
final_target_var = ['Y18']

final_test = test[final_input_var]
final_features = trainY18[final_input_var]

output = []
for target_col in final_target_var:
    print(target_col)
    final_target = trainY18[target_col]
    df, final_test_df = model_list_auto(models, final_features, final_target, final_test)
    new_model, pred = stack_model_pred(lgbmr, df, final_target, final_test_df)

    output = pred

print('output res :: ', output)
print('output res :: ', len(output))


Y18
model_list_auto working
스태킹 회귀 모델 최종 MSE ::  0.03332872616376597
스태킹 회귀 모델 최종 RMSE ::  0.18256156814556007
output res ::  [20.17712705 20.37246313 21.19021489 ... 24.44192943 23.4631205
 23.76295941]
output res ::  11520


In [36]:
outputDf = pd.DataFrame()
outputDf['id'] = test.index
outputDf['Y18'] = output
outputDf.to_csv('submission.csv', index=False)