In [45]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import rc, rcParams
import seaborn as sns
import glob
from sklearn.model_selection import train_test_split, StratifiedKFold ,KFold
# import lightgbm
from lightgbm import LGBMRegressor
import datetime

import warnings
# Silence all warnings so the long training logs stay readable.
# NOTE(review): this also hides deprecation warnings from pandas / LightGBM.
warnings.filterwarnings('ignore')

plt.style.use('ggplot')
# Use the fully qualified option name: the short pattern 'max_columns' is
# ambiguous on pandas versions that also define 'styler.render.max_columns',
# where set_option raises OptionError for it.
pd.set_option('display.max_columns', 100)
pd.set_option("display.precision", 4)
rcParams['figure.figsize'] = (16, 8)
# NOTE(review): 'AppleGothic' is presumably only available on macOS - confirm
# the font exists on the machine that runs this notebook.
rc('font', family='AppleGothic')

# training data load

In [3]:
# Directories that hold the training and inference CSV files.
train_path, infer_path = './trainingdata', './inferencedata'

In [4]:
# Timestamp suffix shared by the training and inference file names.
time_str = '20201225_105925'

# Build both file locations first, then read them.
train_csv = os.path.join(train_path, f'training_{time_str}.csv')
infer_csv = os.path.join(infer_path, f'inference_{time_str}.csv')

train = pd.read_csv(train_csv)
X_test = pd.read_csv(infer_csv)

# train test split

In [5]:
# The validation set was evaluated only once (single hold-out split).
# Features are every column except the last two; the second-to-last column is
# used as the first target and the last column as the second target.
# Both calls share the same test_size and random_state.
holdout_features = train.iloc[:, :-2]
split_options = dict(test_size=0.3, random_state=0)

X_train_1, X_valid_1, Y_train_1, Y_valid_1 = train_test_split(
    holdout_features, train.iloc[:, -2], **split_options
)
X_train_2, X_valid_2, Y_train_2, Y_valid_2 = train_test_split(
    holdout_features, train.iloc[:, -1], **split_options
)

In [6]:
# Quantile levels 0.1 .. 0.9 used by the hold-out training loop.
quantiles = [step / 10 for step in range(1, 10)]

# training code

In [23]:
# Full training matrix (every column except the last two) and the two
# target columns, used by the cross-validated training below.
X_train = train.iloc[:, :-2]
target_1, target_2 = train['Target1'], train['Target2']

In [85]:
def LGBM_reg(X_train, y_train, n_splits=5, random_state=None):
    """Cross-validate one LightGBM quantile regressor per quantile level.

    For every fold of a shuffled K-fold split, a separate ``LGBMRegressor``
    with ``objective='quantile'`` is fitted for each quantile in 0.1 .. 0.9,
    early-stopped on the fold's validation part, and scored with ``pinball``.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Feature matrix; rows are selected positionally with ``.iloc``.
    y_train : pandas.Series
        Target values, selected positionally with ``.iloc``.
    n_splits : int, default 5
        Number of folds.
    random_state : int or None, default None
        Seed for the fold shuffling. ``None`` keeps the original unseeded
        behaviour; pass an integer for reproducible folds.

    Returns
    -------
    oof_preds : numpy.ndarray
        Out-of-fold predictions (rounded to 2 decimals), one row per sample
        and one column per quantile.
    fold_metric : list
        For each fold, the list of per-quantile pinball losses.
    loss_ls : list
        For each fold, the mean of its per-quantile pinball losses.
    feature_importance_df : pandas.DataFrame
        Feature importances averaged over all folds, indexed by feature name
        with one column per quantile.
    """
    quantiles = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]

    # Author's note (translated): it was unclear why the StratifiedKFold
    # variant (n_splits=5, shuffle=True) did not work here.
    # NOTE(review): StratifiedKFold expects discrete class labels, so a
    # continuous regression target is presumably the reason - confirm.
    fold = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    oof_preds = np.zeros([X_train.shape[0], len(quantiles)])

    # Running sum of (importance / n_splits) so the final value is the mean
    # over folds. The previous code assigned instead of accumulating, which
    # kept only the last fold's importance divided by 5.
    importance_mean = np.zeros([len(X_train.columns), len(quantiles)])

    fold_metric = []
    loss_ls = []
    # training
    target = y_train
    for fold_, (train_idx, valid_idx) in enumerate(
        fold.split(np.array(X_train), target)
    ):

        train_x, train_y = X_train.iloc[train_idx], target.iloc[train_idx]
        valid_x, valid_y = X_train.iloc[valid_idx], target.iloc[valid_idx]

        fold_preds = {}
        for idx, q in enumerate(quantiles):
            print(f'\nquantile: {q}\n')
            # NOTE(review): per the LightGBM docs `bagging_fraction` and
            # `subsample` are aliases, and bagging only takes effect when a
            # non-zero bagging frequency is set - confirm the intent.
            model = LGBMRegressor(objective='quantile', alpha=q,
                                  n_estimators=10000, bagging_fraction=0.7,
                                  learning_rate=0.027, subsample=0.7)

            # NOTE(review): `early_stopping_rounds` / `verbose` as fit()
            # arguments depend on the installed LightGBM version - confirm.
            model.fit(train_x, train_y, eval_metric=['quantile'],
                      eval_set=[(valid_x, valid_y)],
                      early_stopping_rounds=300, verbose=1500)

            # Predict once and reuse the result for both containers.
            valid_pred = model.predict(valid_x).round(2)
            oof_preds[valid_idx, idx] = valid_pred
            fold_preds[q] = valid_pred

            # feature importance
            importance_mean[:, idx] += model.feature_importances_ / n_splits

        oof_df = pd.DataFrame(fold_preds)
        metric = pinball(oof_df, valid_y)
        loss = np.mean(metric)
        print(f'\n FOLD {fold_}의 total pinball loss: {loss}\n')
        fold_metric.append(metric)
        loss_ls.append(loss)

        print('\n===================================================')
        print(f'FOLD {fold_} Success')
        print('===================================================\n')

    feature_importance_df = pd.DataFrame(
        importance_mean,
        index=X_train.columns.tolist(),
        columns=quantiles,
    )

    return oof_preds, fold_metric, loss_ls, feature_importance_df

# Metric
- pinball loss

In [83]:
def pinball(pred, actual):
    """Compute the mean pinball (quantile) loss for every quantile column.

    Rows of ``pred`` and ``actual`` are matched by position, not by index
    label. For a quantile ``q``, prediction ``z`` and actual value ``y`` the
    per-row loss is ``(y - z) * q`` when ``y >= z`` and ``(z - y) * (1 - q)``
    otherwise.

    Parameters
    ----------
    pred : pandas.DataFrame
        One column per quantile; each column label is the numeric quantile
        level used in the loss.
    actual : pandas.Series or array-like
        Observed values, one per row of ``pred``.

    Returns
    -------
    list
        Mean pinball loss of each quantile column, in column order. The mean
        over all quantiles is printed as a side effect.

    Raises
    ------
    ValueError
        If ``pred`` and ``actual`` do not have the same number of rows.
    """
    # Work on plain arrays so index labels cannot affect the row pairing.
    y = np.ravel(np.asarray(actual))
    if len(y) != len(pred):
        raise ValueError(
            f'pred has {len(pred)} rows but actual has {len(y)} values'
        )

    pinball_loss = []
    for q in pred.columns.tolist():
        z = pred[q].to_numpy()
        # Under-prediction is weighted by q, over-prediction by (1 - q).
        row_loss = np.where(y >= z, (y - z) * q, (z - y) * (1 - q))
        pinball_loss.append(np.mean(row_loss))

    fin = np.mean(pinball_loss)
    print(f'Pinball Loss: {fin}')

    return pinball_loss

# Training

In [86]:
# Cross-validated quantile models for Target1.
(oof_preds,
 fold_metric,
 loss_ls,
 feature_importance_df) = LGBM_reg(X_train, target_1)


quantile: 0.1

Training until validation scores don't improve for 300 rounds
Early stopping, best iteration is:
[320]	valid_0's quantile: 1.33946

quantile: 0.2

Training until validation scores don't improve for 300 rounds
Early stopping, best iteration is:
[367]	valid_0's quantile: 2.12593

quantile: 0.3

Training until validation scores don't improve for 300 rounds
Early stopping, best iteration is:
[1131]	valid_0's quantile: 2.45818

quantile: 0.4

Training until validation scores don't improve for 300 rounds
[1500]	valid_0's quantile: 2.59418
[3000]	valid_0's quantile: 2.57776
[4500]	valid_0's quantile: 2.56952
Early stopping, best iteration is:
[4722]	valid_0's quantile: 2.56899

quantile: 0.5

Training until validation scores don't improve for 300 rounds
[1500]	valid_0's quantile: 2.51072
[3000]	valid_0's quantile: 2.48715
[4500]	valid_0's quantile: 2.47627
[6000]	valid_0's quantile: 2.47261
Early stopping, best iteration is:
[6665]	valid_0's quantile: 2.47019

quantile: 0.6

T

Early stopping, best iteration is:
[444]	valid_0's quantile: 1.35999

quantile: 0.2

Training until validation scores don't improve for 300 rounds
[1500]	valid_0's quantile: 2.13934
Early stopping, best iteration is:
[1719]	valid_0's quantile: 2.12902

quantile: 0.3

Training until validation scores don't improve for 300 rounds
[1500]	valid_0's quantile: 2.51735
[3000]	valid_0's quantile: 2.4929
Early stopping, best iteration is:
[3993]	valid_0's quantile: 2.48132

quantile: 0.4

Training until validation scores don't improve for 300 rounds
[1500]	valid_0's quantile: 2.62689
[3000]	valid_0's quantile: 2.59884
[4500]	valid_0's quantile: 2.59108
[6000]	valid_0's quantile: 2.58498
[7500]	valid_0's quantile: 2.57537
Early stopping, best iteration is:
[8624]	valid_0's quantile: 2.57198

quantile: 0.5

Training until validation scores don't improve for 300 rounds
[1500]	valid_0's quantile: 2.53713
[3000]	valid_0's quantile: 2.52419
[4500]	valid_0's quantile: 2.50926
[6000]	valid_0's quantile

In [87]:
# Cross-validated quantile models for Target2.
(oof_preds2,
 fold_metric2,
 loss_ls2,
 feature_importance_df2) = LGBM_reg(X_train, target_2)


quantile: 0.1

Training until validation scores don't improve for 300 rounds
Early stopping, best iteration is:
[399]	valid_0's quantile: 1.3966

quantile: 0.2

Training until validation scores don't improve for 300 rounds
Early stopping, best iteration is:
[919]	valid_0's quantile: 2.22785

quantile: 0.3

Training until validation scores don't improve for 300 rounds
[1500]	valid_0's quantile: 2.60929
Early stopping, best iteration is:
[1506]	valid_0's quantile: 2.60908

quantile: 0.4

Training until validation scores don't improve for 300 rounds
[1500]	valid_0's quantile: 2.75828
[3000]	valid_0's quantile: 2.73819
[4500]	valid_0's quantile: 2.72541
[6000]	valid_0's quantile: 2.72027
Early stopping, best iteration is:
[6176]	valid_0's quantile: 2.71899

quantile: 0.5

Training until validation scores don't improve for 300 rounds
[1500]	valid_0's quantile: 2.65648
Early stopping, best iteration is:
[2585]	valid_0's quantile: 2.63826

quantile: 0.6

Training until validation scores don'

Early stopping, best iteration is:
[899]	valid_0's quantile: 2.25115

quantile: 0.3

Training until validation scores don't improve for 300 rounds
Early stopping, best iteration is:
[716]	valid_0's quantile: 2.69157

quantile: 0.4

Training until validation scores don't improve for 300 rounds
[1500]	valid_0's quantile: 2.80904
[3000]	valid_0's quantile: 2.778
Early stopping, best iteration is:
[3935]	valid_0's quantile: 2.77231

quantile: 0.5

Training until validation scores don't improve for 300 rounds
[1500]	valid_0's quantile: 2.72055
[3000]	valid_0's quantile: 2.69148
[4500]	valid_0's quantile: 2.6764
[6000]	valid_0's quantile: 2.66782
[7500]	valid_0's quantile: 2.65919
Early stopping, best iteration is:
[8247]	valid_0's quantile: 2.65791

quantile: 0.6

Training until validation scores don't improve for 300 rounds
[1500]	valid_0's quantile: 2.42316
[3000]	valid_0's quantile: 2.40175
Early stopping, best iteration is:
[3939]	valid_0's quantile: 2.3981

quantile: 0.7

Training unti

In [89]:
# Average the per-fold losses of each target, then average the two targets.
day7, day8 = np.mean(loss_ls), np.mean(loss_ls2)
final = np.mean([day7, day8])
print(
    f'day7 pinball loss: {day7}',
    f'day8 pinball loss: {day8}',
    f'Total pinball loss: {final}',
    sep='\n',
)

day7 pinball loss: 1.9162037225315487
day8 pinball loss: 2.0084542828882768
Total pinball loss: 1.9623290027099127


# Recent CV value

In [55]:
# Get the model and the predictions in (a) - (b)
def LGBM(q, X_train, Y_train, X_valid, Y_valid, X_test):
    """Fit a single quantile LightGBM regressor and predict with it.

    The model is trained on ``(X_train, Y_train)`` with the quantile
    objective at level ``q`` and early-stopped on ``(X_valid, Y_valid)``.

    Returns a tuple ``(pred, model, pred_valid)``: the predictions for
    ``X_test``, the fitted model, and the predictions for ``X_valid``.
    Both prediction sets are pandas Series rounded to two decimals.
    """
    # (a) Modeling
    regressor = LGBMRegressor(
        objective='quantile',
        alpha=q,
        n_estimators=10000,
        bagging_fraction=0.7,
        learning_rate=0.027,
        subsample=0.7,
    )
    regressor.fit(
        X_train,
        Y_train,
        eval_metric=['quantile'],
        eval_set=[(X_valid, Y_valid)],
        early_stopping_rounds=300,
        verbose=500,
    )

    # (b) Predictions
    def _rounded_prediction(features):
        # Wrap the rounded model output in a Series with a default index.
        return pd.Series(regressor.predict(features).round(2))

    return _rounded_prediction(X_test), regressor, _rounded_prediction(X_valid)

# Target prediction: train one model per quantile and collect the predictions

def train_data(X_train, Y_train, X_valid, Y_valid, X_test):
    """Train one quantile model per level in the module-level ``quantiles``.

    Each quantile is fitted with ``LGBM`` on ``(X_train, Y_train)`` using
    ``(X_valid, Y_valid)`` for early stopping.

    Returns
    -------
    LGBM_models : list
        The fitted models, in the order of ``quantiles``.
    LGBM_actual_pred : pandas.DataFrame
        Predictions for ``X_test``, one column per quantile.
    LGBM_valid_pred : pandas.DataFrame
        Predictions for ``X_valid``, one column per quantile.
    """
    LGBM_models = []
    test_preds = []
    valid_preds = []

    for q in quantiles:
        print(q)
        pred, model, pred_valid = LGBM(q, X_train, Y_train, X_valid, Y_valid, X_test)
        LGBM_models.append(model)
        test_preds.append(pred)
        valid_preds.append(pred_valid)

    # Concatenate once at the end instead of growing a DataFrame inside the
    # loop; an empty quantile list still yields empty frames as before.
    LGBM_actual_pred = pd.concat(test_preds, axis=1) if test_preds else pd.DataFrame()
    LGBM_valid_pred = pd.concat(valid_preds, axis=1) if valid_preds else pd.DataFrame()

    LGBM_actual_pred.columns = quantiles
    LGBM_valid_pred.columns = quantiles

    return LGBM_models, LGBM_actual_pred, LGBM_valid_pred

In [56]:
# Target1: fit the per-quantile models on the first hold-out split.
models_1, results_1, results_1_valid = train_data(
    X_train=X_train_1,
    Y_train=Y_train_1,
    X_valid=X_valid_1,
    Y_valid=Y_valid_1,
    X_test=X_test,
)

0.1
Training until validation scores don't improve for 300 rounds
[500]	valid_0's quantile: 1.37212
Early stopping, best iteration is:
[676]	valid_0's quantile: 1.37108
0.2
Training until validation scores don't improve for 300 rounds
[500]	valid_0's quantile: 2.16051
[1000]	valid_0's quantile: 2.14876
[1500]	valid_0's quantile: 2.14286
[2000]	valid_0's quantile: 2.13669
[2500]	valid_0's quantile: 2.13719
Early stopping, best iteration is:
[2316]	valid_0's quantile: 2.13589
0.3
Training until validation scores don't improve for 300 rounds
[500]	valid_0's quantile: 2.57354
[1000]	valid_0's quantile: 2.54514
[1500]	valid_0's quantile: 2.53945
[2000]	valid_0's quantile: 2.53633
[2500]	valid_0's quantile: 2.53497
[3000]	valid_0's quantile: 2.53294
[3500]	valid_0's quantile: 2.5275
Early stopping, best iteration is:
[3505]	valid_0's quantile: 2.5274
0.4
Training until validation scores don't improve for 300 rounds
[500]	valid_0's quantile: 2.6792
[1000]	valid_0's quantile: 2.65925
[1500]	va

In [57]:
# Target2: fit the per-quantile models on the second hold-out split.
models_2, results_2, results_2_valid = train_data(
    X_train=X_train_2,
    Y_train=Y_train_2,
    X_valid=X_valid_2,
    Y_valid=Y_valid_2,
    X_test=X_test,
)

0.1
Training until validation scores don't improve for 300 rounds
Early stopping, best iteration is:
[182]	valid_0's quantile: 1.4093
0.2
Training until validation scores don't improve for 300 rounds
[500]	valid_0's quantile: 2.2503
[1000]	valid_0's quantile: 2.23588
Early stopping, best iteration is:
[1173]	valid_0's quantile: 2.23198
0.3
Training until validation scores don't improve for 300 rounds
[500]	valid_0's quantile: 2.66486
[1000]	valid_0's quantile: 2.64927
[1500]	valid_0's quantile: 2.63853
Early stopping, best iteration is:
[1398]	valid_0's quantile: 2.63812
0.4
Training until validation scores don't improve for 300 rounds
[500]	valid_0's quantile: 2.8087
[1000]	valid_0's quantile: 2.77833
[1500]	valid_0's quantile: 2.76475
Early stopping, best iteration is:
[1651]	valid_0's quantile: 2.76252
0.5
Training until validation scores don't improve for 300 rounds
[500]	valid_0's quantile: 2.72555
[1000]	valid_0's quantile: 2.6931
Early stopping, best iteration is:
[1174]	valid_0

In [68]:
# Hold-out pinball loss per target; the final expression (displayed as the
# cell output) is the mean of the two per-target means.
loss1, loss2 = (
    pinball(results_1_valid, Y_valid_1),
    pinball(results_2_valid, Y_valid_2),
)
np.mean([np.mean(per_quantile) for per_quantile in (loss1, loss2)])

Pinball Loss: 1.932411897632206
Pinball Loss: 2.0322045251092526


1.9823082113707293