In [85]:
import numpy as np
import pandas as pd
import math

import seaborn as sns
from sklearn.model_selection import train_test_split
from matplotlib import pyplot as plt
from scipy.stats import chi2_contingency
from lightgbm import LGBMRegressor
from sklearn.preprocessing import StandardScaler

%matplotlib inline
import warnings
warnings.filterwarnings('ignore')


In [86]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error

def get_model_cv_prediction(model, X_data, y_target):
    neg_mse_scores = cross_val_score(model, X_data, y_target, scoring="neg_mean_squared_error", cv = 5)
    rmse_scores = np.sqrt(-1 * neg_mse_scores)
    avg_rmse = np.mean(rmse_scores)
    print('##### ', model.__class__.__name__, ' #####')
    print(' 5 교차 검증의 평균 RMSE : {0:.3f} '.format(avg_rmse))

def get_stacking_base_datasets(model, X_train_n, y_train_n, X_test_n, n_folds):
    kf = KFold(n_splits=n_folds, shuffle=False, random_state=0)
    train_fold_pred = np.zeros((X_train_n.shape[0], 1))
    test_pred = np.zeros((X_test_n.shape[0], n_folds))
    # print(model.__class__.__name__, ' model 시작 ')

    for folder_counter, (train_index, valid_index) in enumerate(kf.split(X_train_n)):
        # print('\t 폴드 세트: ', folder_counter, ' 시작 ')
        X_tr = X_train_n[train_index]
        y_tr = y_train_n[train_index]
        X_te = X_train_n[valid_index]

        model.fit(X_tr, y_tr)

        train_fold_pred[valid_index, :] = model.predict(X_te).reshape(-1, 1)
        test_pred[:, folder_counter] = model.predict(X_test_n)

    test_pred_mean = np.mean(test_pred, axis=1).reshape(-1, 1)

    return train_fold_pred, test_pred_mean

def rmse_expm1(pred, true):
    return -np.sqrt(np.mean((np.expm1(pred)-np.expm1(true))**2))

def evaluate(model, x_data, y_data):
    x_train, x_val, y_train, y_val = train_test_split(x_data, y_data, random_state=0)
    model.fit(x_train, y_train, eval_set=[(x_val, y_val)], early_stopping_rounds=100, verbose=False)
    val_pred = model.predict(x_val)
    score = rmse_expm1(val_pred, y_val)
    return score

def rfe(x_data, y_data, method, ratio=0.9, min_feats=40):
    feats = x_data.columns.tolist()
    archive = pd.DataFrame(columns=['model', 'n_feats', 'feats', 'score'])
    while True:
        model = LGBMRegressor(objective='regression', num_iterations=10**5)
        x_train, x_val, y_train, y_val = train_test_split(x_data[feats], y_data, random_state=0)
        model.fit(x_train, y_train, eval_set=[(x_val, y_val)], early_stopping_rounds=100, verbose=False)
        val_pred = model.predict(x_val)
        score = rmse_expm1(val_pred, y_val)
        n_feats = len(feats)
        print(n_feats, score)
        archive = archive.append({'model': model, 'n_feats': n_feats, 'feats': feats, 'score': score}, ignore_index=True)
        if method == 'basic':
            feat_imp = pd.Series(model.feature_importances_, index=feats).sort_values(ascending=False)
        elif method == 'perm':
            perm = PermutationImportance(model, random_state=0).fit(x_val, y_val)
            feat_imp = pd.Series(perm.feature_importances_, index=feats).sort_values(ascending=False)
        elif method == 'shap':
            explainer = shap.TreeExplainer(model)
            shap_values = explainer.shap_values(x_data[feats])
            feat_imp = pd.Series(np.abs(shap_values).mean(axis=0), index=feats).sort_values(ascending=False)
        next_n_feats = int(n_feats * ratio)
        if next_n_feats < min_feats:
            break
        else:
            feats = feat_imp.iloc[:next_n_feats].index.tolist()
    return archive

In [87]:
train = pd.read_csv('../data/train.csv', index_col='id')
test = pd.read_csv('../data/test.csv', index_col='id')
submission = pd.read_csv('../data/sample_submission.csv', index_col='id')

In [88]:
trainMinute = (train.index%144).astype(int)
testMinute = (test.index%144).astype(int)

train['min'] = trainMinute
test['min'] = testMinute

In [89]:
features = ['X00', 'X01', 'X02', 'X03', 'X05', 'X06', 'X07', 'X08', 'X09', 'X10', 'X11', 'X12', 'X13', 'X15', 'X17', 'X18', 'X20', 'X21', 'X22', 'X23', 'X24', 'X25', 'X26', 'X27', 'X28', 'X29', 'X30', 'X31', 'X32', 'X33', 'X34', 'X35', 'X37', 'X38', 'X39', 'min']

In [90]:
import shap
import eli5
from eli5.sklearn import PermutationImportance

In [91]:
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor
from sklearn.linear_model import Lasso

xgbr = XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.7, gamma=0,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=None, n_estimators=1500,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=None, subsample=1, verbosity=1)

lgbmr = LGBMRegressor(objective='regression', num_iterations=10**5)

lasso_reg = Lasso(alpha= 0.001)

model_list = [xgbr, lgbmr, lasso_reg]

In [92]:
trainIsNull = train[train['Y18'].isnull()]


x_data = trainIsNull[features]
y_data = trainIsNull['Y00']
x_train, x_val, y_train, y_val = train_test_split(x_data, y_data, random_state=0)

xgbr.fit(x_train, y_train, eval_set=[(x_val, y_val)], early_stopping_rounds=100, verbose=False)

val_pred = xgbr.predict(x_val)
score = rmse_expm1(val_pred, y_val)
score

## min 변수가 없으면 -2025780998808.443
## 있으면 -1839436923909.5493



-1576211398048.5164

In [93]:
def get_perm_cols(model, x_val, y_val, train_df, target_col_name):
    features = ['X00', 'X01', 'X02', 'X03', 'X05', 'X06', 'X07', 'X08', 'X09', 'X10', 'X11', 'X12', 'X13', 'X15', 'X17', 'X18', 'X20', 'X21', 'X22', 'X23', 'X24', 'X25', 'X26', 'X27', 'X28', 'X29', 'X30', 'X31', 'X32', 'X33', 'X34', 'X35', 'X37', 'X38', 'X39', 'min']

    ## perm 작업을 하기전에 model fit을 수행함
    x_data = train_df[features]
    y_data = train_df['Y00']
    x_train, x_val, y_train, y_val = train_test_split(x_data, y_data, random_state=0)
    model.fit(x_train, y_train, eval_set=[(x_val, y_val)], early_stopping_rounds=100, verbose=False)

    ## fit 된 model을 permutation 작업수행

    perm = PermutationImportance(model, random_state=0).fit(x_val, y_val)
    perm_feat_imp = pd.Series(perm.feature_importances_, index=features).sort_values(ascending=False)
    return perm_feat_imp

In [97]:
def check_best_col_num(model, x_val, y_val, train_df, target_col_name):
    perm_feat_imp = get_perm_cols(model, x_val, y_val, train_df, target_col_name)
    perm_cols = []
    for i in range(10, 20, 1):
        eval_res = evaluate(model, train_df[perm_feat_imp.iloc[:i].index], train_df[target_col_name])
        perm_cols.append((i, eval_res))

    sorted = pd.DataFrame(perm_cols, columns=['col_num', 'score']).sort_values('score', ascending=False)
    print(sorted)
    col_num = sorted['col_num'].iloc[0]

    return col_num, perm_feat_imp


In [95]:
trainIsNull.head()

Unnamed: 0_level_0,X00,X01,X02,X03,X04,X05,X06,X07,X08,X09,...,Y10,Y11,Y12,Y13,Y14,Y15,Y16,Y17,Y18,min
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
0,9.7,988.8,1.2,0.6,0.0,1009.3,989.6,12.2,1009.9,1009.8,...,7.5,7.0,9.0,10.0,9.5,9.0,8.0,9.0,,0
1,9.3,988.9,1.7,1.9,0.0,1009.3,989.6,12.1,1010.0,1009.9,...,7.5,7.0,8.5,10.0,9.5,9.0,7.5,9.0,,1
2,9.4,989.0,1.1,2.3,0.0,1009.2,989.7,12.1,1010.1,1010.1,...,7.5,6.5,8.0,9.5,9.5,8.5,7.5,8.5,,2
3,9.4,988.9,1.5,0.7,0.0,1009.2,989.6,12.0,1010.0,1010.0,...,7.0,6.0,8.0,9.5,9.0,8.5,7.5,8.5,,3
4,9.2,988.9,0.8,1.7,0.0,1009.2,989.7,12.0,1010.1,1010.0,...,7.0,6.0,7.5,9.5,9.0,8.5,7.5,8.5,,4


In [98]:
import warnings
warnings.filterwarnings('ignore')

target_var_list = ['Y00', 'Y01', 'Y02', 'Y03', 'Y04', 'Y05', 'Y06', 'Y07', 'Y08', 'Y09', 'Y10', 'Y11', 'Y12', 'Y13', 'Y14', 'Y15', 'Y16', 'Y17']
features = ['X00', 'X01', 'X02', 'X03', 'X05', 'X06', 'X07', 'X08', 'X09', 'X10', 'X11', 'X12', 'X13', 'X15', 'X17', 'X18', 'X20', 'X21', 'X22', 'X23', 'X24', 'X25', 'X26', 'X27', 'X28', 'X29', 'X30', 'X31', 'X32', 'X33', 'X34', 'X35', 'X37', 'X38', 'X39', 'min']

# target_var_list = ['Y00']

train_y18 = train[train['Y18'].notnull()]

for target_col in target_var_list:
    print('target col :: ', target_col)
    x_data = trainIsNull[features]
    y_data = trainIsNull[target_col]
    x_train, x_val, y_train, y_val = train_test_split(x_data, y_data, random_state=0)

    best_col_num, perm_col_list = check_best_col_num(xgbr, x_val, y_val, trainIsNull, target_col)
    print('best col num :: ', best_col_num)

    x_train_new, x_val_new, y_train_new, y_val_new = train_test_split(trainIsNull[perm_col_list.iloc[:best_col_num].index], trainIsNull[target_col], random_state=1)

    xgbr.fit(x_train_new, y_train_new, eval_set=[(x_val_new, y_val_new)], early_stopping_rounds=100, verbose=False)
    train_y18[target_col] = xgbr.predict(train_y18[perm_col_list.iloc[:best_col_num].index])
    test[target_col] = xgbr.predict(test[perm_col_list.iloc[:best_col_num].index])


error.
   col_num         score
5       15 -3.925593e+16
3       13 -4.142074e+16
6       16 -4.184887e+16
4       14 -4.435507e+16
7       17 -4.597893e+16
9       19 -4.723600e+16
2       12 -4.775811e+16
8       18 -4.855887e+16
0       10 -5.589821e+16
1       11 -5.809743e+16
best col num ::15
target col ::Y10
   col_num         score
6       16 -1.918828e+19
5       15 -2.573654e+19
2       12 -2.741512e+19
1       11 -2.823848e+19
9       19 -2.885414e+19
4       14 -3.077464e+19
8       18 -3.265026e+19
0       10 -3.308065e+19
7       17 -3.777007e+19
3       13 -4.227569e+19
best col num ::16
target col ::Y11
   col_num         score
8       18 -2.937788e+21
9       19 -3.092248e+21
5       15 -3.128679e+21
1       11 -3.135752e+21
7       17 -3.156932e+21
4       14 -3.191240e+21
6       16 -3.215333e+21
0       10 -3.221476e+21
3       13 -3.245179e+21
2       12 -3.324821e+21
best col num ::18
target col ::Y12
   col_num         score
0       10 -2.267521e+17
5       15 -2

In [99]:
train_y18

Unnamed: 0_level_0,X00,X01,X02,X03,X04,X05,X06,X07,X08,X09,...,Y10,Y11,Y12,Y13,Y14,Y15,Y16,Y17,Y18,min
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4320,19.3,987.7,0.9,2.2,0.0,1007.7,988.0,20.8,1007.8,1007.4,...,19.235428,18.011063,18.785843,19.100283,18.818462,18.864697,18.864325,18.235130,20.5,0
4321,19.0,987.6,1.9,2.2,0.0,1007.7,988.0,20.6,1007.8,1007.4,...,18.998430,18.059540,18.944035,18.961758,18.895243,18.608625,19.028246,18.421999,20.5,1
4322,19.1,987.6,2.0,1.4,0.0,1007.8,988.1,20.5,1007.9,1007.4,...,18.921997,17.734970,18.990133,18.812595,18.593117,18.661751,18.332323,18.745970,20.5,2
4323,19.2,987.7,1.8,1.5,0.0,1007.9,988.1,20.5,1007.9,1007.5,...,19.075909,17.969610,19.186008,18.804304,18.582626,18.487194,18.536533,18.570435,20.5,3
4324,19.2,987.8,1.4,1.4,0.0,1007.9,988.1,20.5,1007.8,1007.6,...,19.167698,18.118151,19.230677,18.976982,18.605314,18.566994,18.252573,18.678518,20.5,4
4325,19.0,987.9,1.5,1.2,0.0,1007.8,988.1,20.4,1008.0,1007.7,...,18.873762,17.872196,19.140951,19.080946,18.605314,18.505583,18.437763,18.644232,20.5,5
4326,19.1,987.9,1.2,1.0,0.0,1007.8,988.1,20.3,1008.0,1007.7,...,19.195274,18.109852,19.014959,19.119270,18.746696,18.943382,18.578840,18.611666,20.5,6
4327,19.3,987.8,0.7,0.6,0.0,1007.8,988.1,20.4,1008.0,1007.6,...,19.054167,18.027884,19.089655,19.127529,18.733000,19.061148,18.586298,18.684757,20.5,7
4328,19.1,987.7,2.7,0.1,0.0,1007.7,988.1,20.4,1008.0,1007.5,...,19.118275,18.074217,19.062916,19.153437,18.749903,19.023693,18.865437,18.560026,20.5,8
4329,18.8,987.7,1.2,0.8,0.0,1007.6,988.1,20.4,1008.0,1007.5,...,18.680033,17.927664,19.059134,19.043219,18.519201,18.365210,18.420572,18.614449,20.5,9


In [None]:
x_train_new, x_val_new, y_train_new, y_val_new = train_test_split(train_y18[perm_col_list.iloc[:best_col_num].index], train_y18[target_col], random_state=1)

print(x_train_new, x_val_new, y_train_new, y_val_new)

In [None]:
perm_cols = []
for i in range(10, 30, 1):
    eval_res = evaluate(xgbr, trainIsNull[perm_feat_imp.iloc[:i].index], trainIsNull['Y00'])
    print(i, eval_res)
    perm_cols.append((i, eval_res))

In [None]:
sorted = pd.DataFrame(perm_cols, columns=['col_num', 'score']).sort_values('score', ascending=False)
col_num = sorted['col_num'].iloc[0]
col_num
## col_num = 16

In [None]:
eval_res = evaluate(xgbr, trainIsNull[perm_feat_imp.iloc[:col_num].index], trainIsNull['Y00'])
print(col_num, eval_res)