# library

In [2]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import rc, rcParams
import seaborn as sns
import glob
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
# import lightgbm
from lightgbm import LGBMRegressor
import datetime

import warnings
# Silence library warnings so the long training logs stay readable.
# NOTE(review): this also hides deprecation warnings from pandas / LightGBM.
warnings.filterwarnings('ignore')

plt.style.use('ggplot')
# Use the fully qualified option name: the bare 'max_columns' pattern matches
# more than one option on newer pandas releases and raises OptionError.
pd.set_option('display.max_columns', 100)
pd.set_option("display.precision", 4)
rcParams['figure.figsize'] = (16, 8)
# NOTE(review): AppleGothic is presumably chosen for Korean glyphs in plot labels and
# is a macOS font; on other platforms matplotlib will fall back to its default font.
rc('font', family='AppleGothic')

In [4]:
# Project directories, all relative to the notebook's working directory.
train_path, infer_path = "./trainingdata", "./inferencedata"
submission_path = "./submission"
importance_path = "./feature_importance"
oof_path = "./oof_preds"
pil_path = "./magok"

In [64]:
def pinball(pred, actual):
    """Compute the pinball (quantile) loss of a set of quantile forecasts.

    Parameters
    ----------
    pred : pandas.DataFrame
        One column per quantile. Each column label must be the numeric
        quantile level itself (e.g. 0.1, 0.5), because the label is used
        directly as ``q`` in the loss formula.
    actual : pandas.Series or 1-D array-like
        Observed values. Rows are matched to ``pred`` by position; index
        labels are ignored.

    Returns
    -------
    list of float
        Mean pinball loss of each quantile column, in column order.

    Raises
    ------
    ValueError
        If ``pred`` and ``actual`` do not have the same number of rows.

    Notes
    -----
    Prints the average of the per-quantile losses as a side effect.
    """
    quantile_col = pred.columns.tolist()

    # Work on plain arrays so alignment is purely positional.
    y = np.asarray(actual, dtype=float).ravel()
    if len(y) != len(pred):
        raise ValueError(
            f'pred has {len(pred)} rows but actual has {len(y)} values'
        )

    pinball_loss = []
    for q in quantile_col:
        z = pred[q].to_numpy(dtype=float)
        # Under-prediction (y >= z) is weighted by q, over-prediction by (1 - q).
        per_row = np.where(y >= z, (y - z) * q, (z - y) * (1 - q))
        pinball_loss.append(np.mean(per_row))

    fin = np.mean(pinball_loss)
    print(f'Pinball Loss: {fin}')

    return pinball_loss

In [92]:
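# Uncomment and run once to (re)initialise the results log. It is left commented out
# because `consistency_df` accumulates one row per experiment further down, and
# re-running this line would discard the rows collected so far.
# consistency_df = pd.DataFrame()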

# Day4일차 -> 5,6일차 예측하는 inference 데이터셋 구축

In [40]:
# Raw competition data.
tr_base_path, te_base_path = "./data/train", "./data/test"

# Derived datasets and outputs.
train_path = "./trainingdata"
infer_path = "./inferencedata"
cv_lb_path = "./cv_lb_path"
submission_path = "./submission"

In [34]:
# Load the 81 per-sample test files (0.csv ... 80.csv) from `te_base_path`
# and stack them into a single frame. The original row index of each file is kept.
t_test = [
    pd.read_csv(os.path.join(te_base_path, f'{i}.csv'))
    for i in range(81)
]

test = pd.concat(t_test)
df_test = test.copy()

In [35]:
# TARGET values recorded for Day 5 and Day 6 in the test files; they are used
# later as the `actual` argument when scoring the test predictions.
day5_pred = df_test.loc[df_test['Day'] == 5, 'TARGET']
day6_pred = df_test.loc[df_test['Day'] == 6, 'TARGET']

In [56]:
# Day-5 values followed by day-6 values. pd.concat replaces Series.append,
# which was removed in pandas 2.0; the result is identical.
test_final = pd.concat([day5_pred, day6_pred])

# data load

In [176]:
# Timestamp tag identifying which saved training / inference pair to load.
time_str = '20210110_185002'

# NOTE: `test` is rebound here; from this point it holds the inference features
# rather than the raw test files loaded earlier.
train = pd.read_csv(os.path.join(cv_lb_path, f'training_{time_str}.csv'))
test = pd.read_csv(os.path.join(cv_lb_path, f'inference_{time_str}.csv'))


# CV 모델링

In [177]:
# Hold out 30% of the rows for validation, once per forecast horizon.
# Both calls use the same data and random_state, so the two splits contain the same rows.
train_x, valid_x, train_y, valid_y = train_test_split(
    train.drop(columns=['1day_after_target', '2day_after_target']),
    train['1day_after_target'],
    test_size=0.3,
    random_state=0,
)
train_x2, valid_x2, train_y2, valid_y2 = train_test_split(
    train.drop(columns=['1day_after_target', '2day_after_target']),
    train['2day_after_target'],
    test_size=0.3,
    random_state=0,
)
test_df = test.copy()
# Validation targets for horizon 1 followed by horizon 2. pd.concat replaces
# Series.append, which was removed in pandas 2.0.
valid_final = pd.concat([valid_y, valid_y2])

In [119]:
def lgbm_predict(train_x, valid_x, train_y, valid_y, test, quantiles=None):
    """Fit one LightGBM quantile regressor per quantile and predict with each.

    Parameters
    ----------
    train_x, train_y
        Features and target used for fitting.
    valid_x, valid_y
        Hold-out features and target; used as the early-stopping evaluation
        set and as the input for the returned validation predictions.
    test
        Feature frame to predict on; must expose ``.shape``.
    quantiles : sequence of float, optional
        Quantile levels to fit. Defaults to 0.1, 0.2, ..., 0.9.

    Returns
    -------
    (oof_df, test_df) : tuple of pandas.DataFrame
        Predictions for ``valid_x`` and for ``test``, rounded to 2 decimals,
        with one column per quantile (the column label is the quantile level).
        Both frames use a fresh positional index, not the index of the inputs.
    """
    if quantiles is None:
        quantiles = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
    quantiles = list(quantiles)

    test_df = pd.DataFrame(np.zeros([test.shape[0], len(quantiles)]))
    test_df.columns = quantiles

    oof_df = pd.DataFrame()
    for q in quantiles:
        print(f'\nquantile: {q}\n')
        # NOTE(review): `bagging_fraction` and `subsample` are aliases of the same
        # LightGBM parameter, and bagging only takes effect when a bagging frequency
        # is also set -- confirm whether row subsampling was intended here.
        model = LGBMRegressor(objective='quantile',
                              alpha=q,
                              n_estimators=10000,
                              bagging_fraction=0.7,
                              learning_rate=0.027,
                              subsample=0.7
                             )

        # NOTE(review): `early_stopping_rounds` and `verbose` as fit() keywords were
        # removed in LightGBM 4.0; switch to callbacks when upgrading.
        model.fit(train_x, train_y, eval_metric=['quantile'],
                  eval_set=[(valid_x, valid_y)], early_stopping_rounds=300, verbose=1500)

        oof_df[q] = model.predict(valid_x).round(2)
        test_df[q] = model.predict(test).round(2)

    return oof_df, test_df

In [178]:
# Fit the quantile models on the 1-day-ahead target; `ood_df` holds the
# validation predictions and `test_df` the predictions for the inference set.
ood_df, test_df = lgbm_predict(train_x, valid_x, train_y, valid_y, test)


quantile: 0.1

Training until validation scores don't improve for 300 rounds
[1500]	valid_0's quantile: 1.26988
Early stopping, best iteration is:
[1612]	valid_0's quantile: 1.26845

quantile: 0.2

Training until validation scores don't improve for 300 rounds
[1500]	valid_0's quantile: 1.93887
[3000]	valid_0's quantile: 1.91751
Early stopping, best iteration is:
[3559]	valid_0's quantile: 1.9122

quantile: 0.3

Training until validation scores don't improve for 300 rounds
[1500]	valid_0's quantile: 2.27315
[3000]	valid_0's quantile: 2.22181
[4500]	valid_0's quantile: 2.20452
Early stopping, best iteration is:
[4849]	valid_0's quantile: 2.2013

quantile: 0.4

Training until validation scores don't improve for 300 rounds
[1500]	valid_0's quantile: 2.38717
[3000]	valid_0's quantile: 2.33957
[4500]	valid_0's quantile: 2.30791
[6000]	valid_0's quantile: 2.29302
[7500]	valid_0's quantile: 2.2761
[9000]	valid_0's quantile: 2.26726
Did not meet early stopping. Best iteration is:
[10000]	valid

In [179]:
# Same procedure for the 2-day-ahead target.
ood_df2, test_df2 = lgbm_predict(train_x2, valid_x2, train_y2, valid_y2, test)


quantile: 0.1

Training until validation scores don't improve for 300 rounds
[1500]	valid_0's quantile: 1.33809
Early stopping, best iteration is:
[1458]	valid_0's quantile: 1.3377

quantile: 0.2

Training until validation scores don't improve for 300 rounds
[1500]	valid_0's quantile: 2.06149
Early stopping, best iteration is:
[1961]	valid_0's quantile: 2.0411

quantile: 0.3

Training until validation scores don't improve for 300 rounds
[1500]	valid_0's quantile: 2.37936
[3000]	valid_0's quantile: 2.31897
[4500]	valid_0's quantile: 2.29752
[6000]	valid_0's quantile: 2.27882
[7500]	valid_0's quantile: 2.2667
[9000]	valid_0's quantile: 2.26114
Early stopping, best iteration is:
[9541]	valid_0's quantile: 2.25926

quantile: 0.4

Training until validation scores don't improve for 300 rounds
[1500]	valid_0's quantile: 2.50565
[3000]	valid_0's quantile: 2.4214
[4500]	valid_0's quantile: 2.39187
[6000]	valid_0's quantile: 2.37688
[7500]	valid_0's quantile: 2.36887
[9000]	valid_0's quantile: 

In [180]:
# Stack validation predictions in the same order as `valid_final` (horizon 1, then 2).
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
oof_final = pd.concat([ood_df, ood_df2])

In [181]:
# Stack inference predictions in the same order as `test_final` (day 5, then day 6).
# pd.concat replaces DataFrame.append, which was removed in pandas 2.0.
test_pred = pd.concat([test_df, test_df2])

In [182]:
# Per-quantile pinball loss on the hold-out split (recorded below as "CV").
loss = pinball(oof_final,valid_final)

Pinball Loss: 1.7914637216419795


In [183]:
# Per-quantile pinball loss against the day-5/day-6 TARGET values from the
# test files (recorded below as "LB").
loss2 = pinball(test_pred,test_final)

Pinball Loss: 2.2942480309448374


In [184]:
# One-row summary of this experiment: dataset tag, mean hold-out loss,
# mean test-file loss, and the columns present in the training frame.
consistency_df0 = pd.DataFrame({
    'time_str': [time_str],
    'CV': [np.mean(loss)],
    'LB': [np.mean(loss2)],
    'column': [train.columns.tolist()],
})

In [185]:
# Add this run's row to the results log. The log is created on first use so the
# cell also works on a fresh kernel, and pd.concat replaces DataFrame.append,
# which was removed in pandas 2.0.
if 'consistency_df' in globals():
    consistency_df = pd.concat([consistency_df, consistency_df0])
else:
    consistency_df = consistency_df0.copy()

In [186]:
# Display the accumulated CV vs. LB scores, one row per recorded experiment.
consistency_df

Unnamed: 0,time_str,CV,LB,column
0,20210110_171258,1.857,2.3242,"[Hour, DHI, DNI, WS, RH, T, TARGET, 1day_after..."
0,20210110_174452,1.8067,2.3105,"[Hour, DHI, DNI, WS, RH, T, TARGET, 1day_after..."
0,20210110_174840,1.7843,2.3047,"[Hour, DHI, DNI, WS, RH, T, TARGET, 1day_after..."
0,20210110_175753,1.7937,2.3056,"[Hour, DHI, DNI, WS, RH, T, TARGET, 1day_after..."
0,20210110_180210,1.7801,2.306,"[Hour, DHI, DNI, WS, RH, T, TARGET, 1day_after..."
0,20210110_181203,1.7971,2.31,"[Hour, DHI, DNI, WS, RH, T, TARGET, 1day_after..."
0,20210110_185002,1.7915,2.2942,"[Hour, DHI, DNI, WS, RH, T, TARGET, 1day_after..."
