In [114]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import rc, rcParams
import seaborn as sns
import glob
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
# import lightgbm
from lightgbm import LGBMRegressor
import datetime

import warnings
# Silences every warning for the rest of the session; this keeps the long
# training logs readable but also hides deprecation notices.
warnings.filterwarnings('ignore')

plt.style.use('ggplot')
# Use fully qualified option keys: the abbreviated 'max_columns' pattern matches
# more than one option on recent pandas releases and raises OptionError there.
pd.set_option('display.max_columns', 100)
pd.set_option('display.precision', 4)
rcParams['figure.figsize'] = (16, 8)
# NOTE(review): AppleGothic is a macOS font, presumably chosen so Korean labels
# render in plots -- confirm and pick a fallback when running on another OS.
rc('font', family='AppleGothic')

In [115]:
# Folders holding the saved training / inference feature CSVs
# (relative to the notebook's working directory).
train_path, infer_path = './trainingdata', './inferencedata'
# Folder the submission CSVs are written to.
submission_path = './submission'
# Folder holding the alternative 'pilv2' train/test CSVs.
pil_path = './magok'

In [193]:
# Two independent copies of the sample submission: one per model variant,
# each filled in place further down.
sample_submission_file = './data/sample_submission.csv'
submission = pd.read_csv(sample_submission_file)
submission2 = pd.read_csv(sample_submission_file)

In [194]:
# Timestamp suffix selecting which saved training/inference CSV pair to load;
# it is reused later when naming the submission files.
time_str = '20210102_215824'

train_file = os.path.join(train_path, f'training_{time_str}.csv')
infer_file = os.path.join(infer_path, f'inference_{time_str}.csv')

train = pd.read_csv(train_file)
X_test = pd.read_csv(infer_file)

In [195]:
# "_day23" variant of the same timestamped training/inference CSV pair.
train_day23_file = os.path.join(train_path, f'training_{time_str}_day23.csv')
infer_day23_file = os.path.join(infer_path, f'inference_{time_str}_day23.csv')

train_day23 = pd.read_csv(train_day23_file)
X_test_day23 = pd.read_csv(infer_day23_file)

# 필령 path

In [161]:
# Pil-ryeong's data path.
# NOTE(review): this rebinds `train` and `X_test`, so when the notebook is run
# top to bottom it replaces the frames loaded from train_path / infer_path.
# Confirm which data set the cells below are meant to use.
pil_train_file = os.path.join(pil_path, 'train_pilv2.csv')
pil_test_file = os.path.join(pil_path, 'test_pilv2.csv')

train = pd.read_csv(pil_train_file)
X_test = pd.read_csv(pil_test_file)

In [102]:
# Cast 'Hour' to pandas' categorical dtype in both the training and the
# inference frame so the two stay consistent.
for frame in (train, X_test):
    frame['Hour'] = frame['Hour'].astype('category')

In [103]:
# Separate the two forecast targets from the feature columns.
target_columns = ['1day_after_target', '2day_after_target']
X_train = train.drop(columns=target_columns)
target_1, target_2 = (train[name] for name in target_columns)

# Metric

In [119]:
def pinball(pred, actual):
    """Mean pinball (quantile) loss for every quantile column of ``pred``.

    Parameters
    ----------
    pred : pandas.DataFrame
        One column per quantile. Each column *label* is used as the numeric
        quantile level ``q`` (e.g. ``0.1`` ... ``0.9``).
    actual : array-like
        Observed values, one per row of ``pred``. Rows are matched by
        position, not by index label.

    Returns
    -------
    list
        One mean loss per column of ``pred``, in column order. Callers average
        this list to obtain the overall score.

    Raises
    ------
    ValueError
        If ``pred`` and ``actual`` do not have the same number of rows.
    """
    # Dropping the index labels gives positional alignment between the two inputs.
    y_true = np.asarray(actual, dtype=float).ravel()
    if len(y_true) != len(pred):
        raise ValueError(
            f'pred has {len(pred)} rows but actual has {len(y_true)} values'
        )

    pinball_loss = []
    for col_pos, q in enumerate(pred.columns.tolist()):
        z = pred.iloc[:, col_pos].to_numpy(dtype=float)
        # Under-prediction (y >= z) is weighted by q, over-prediction by (1 - q).
        per_row = np.where(y_true >= z, (y_true - z) * q, (z - y_true) * (1 - q))
        pinball_loss.append(np.mean(per_row))

    return pinball_loss

# Inference

In [196]:
# Re-derive the feature matrix and both targets from the current `train` frame.
label_columns = ['1day_after_target', '2day_after_target']
X_train = train.drop(columns=label_columns)
target_1, target_2 = (train[name] for name in label_columns)

In [197]:
# Same feature/target split for the "_day23" training frame.
label_columns_day23 = ['1day_after_target', '2day_after_target']
X_train_day23 = train_day23.drop(columns=label_columns_day23)
target_1_day23, target_2_day23 = (
    train_day23[name] for name in label_columns_day23
)

In [198]:
def LGBM_reg(X_train, y_train, test, quantiles=None, n_splits=7, random_state=None):
    """Fit one LightGBM quantile regressor per quantile inside a shuffled K-fold loop.

    For every fold and every quantile a fresh ``LGBMRegressor`` with the
    ``quantile`` objective is trained with early stopping on the held-out part
    of the fold. Its (2-decimal rounded) predictions are stored as out-of-fold
    predictions and accumulated into a fold-averaged test prediction.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Training features.
    y_train : pandas.Series
        Training target, row-aligned with ``X_train``.
    test : pandas.DataFrame
        Features to predict for; must be accepted by ``model.predict``.
    quantiles : sequence of float, optional
        Quantile levels to fit. Defaults to 0.1, 0.2, ..., 0.9.
    n_splits : int, default 7
        Number of K-fold splits.
    random_state : int or None, default None
        Seed forwarded to ``KFold`` and ``LGBMRegressor``. ``None`` keeps the
        previous unseeded behaviour; pass an int for reproducible runs.

    Returns
    -------
    oof_preds : numpy.ndarray, shape (len(X_train), len(quantiles))
        Out-of-fold predictions, one column per quantile.
    fold_metric : list of list
        For each fold, the per-quantile pinball losses returned by ``pinball``.
    test_df : pandas.DataFrame
        Test predictions averaged over folds; columns are the quantile levels.
    """
    if quantiles is None:
        quantiles = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
    else:
        quantiles = list(quantiles)

    # StratifiedKFold was tried earlier but it only supports classification
    # targets, so a plain shuffled KFold is used for this continuous target.
    fold = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    oof_preds = np.zeros([X_train.shape[0], len(quantiles)])

    test_df = pd.DataFrame(np.zeros([test.shape[0], len(quantiles)]))
    test_df.columns = quantiles

    fold_metric = []
    # training
    target = y_train
    for fold_, (train_idx, valid_idx) in enumerate(fold.split(X_train, target)):

        train_x, train_y = X_train.iloc[train_idx], target.iloc[train_idx]
        valid_x, valid_y = X_train.iloc[valid_idx], target.iloc[valid_idx]

        fold_valid_preds = {}
        for idx, q in enumerate(quantiles):
            print(f'\nquantile: {q}\n')
            # `subsample` is LightGBM's sklearn-style alias of `bagging_fraction`,
            # so only one of the two is passed.
            # NOTE(review): bagging is only applied when `subsample_freq` > 0,
            # which is not set here -- confirm whether row subsampling was intended.
            model = LGBMRegressor(objective='quantile',
                                  alpha=q,
                                  n_estimators=10000,
                                  learning_rate=0.027,
                                  subsample=0.7,
                                  random_state=random_state
                                 )

            # NOTE(review): `early_stopping_rounds` / `verbose` as fit() keywords
            # were removed in LightGBM 4.0; newer versions need the equivalent
            # callbacks instead.
            model.fit(train_x, train_y, eval_metric = ['quantile'],
              eval_set=[(valid_x, valid_y)], early_stopping_rounds=300, verbose=1500)

            # Predict the validation part once and reuse it for both outputs.
            valid_pred = model.predict(valid_x).round(2)
            oof_preds[valid_idx, idx] = valid_pred
            fold_valid_preds[q] = valid_pred

            # Each fold contributes 1 / n_splits of the final test prediction.
            test_df[q] += model.predict(test).round(2) / fold.n_splits

        oof_df = pd.DataFrame(fold_valid_preds)
        metric = pinball(oof_df, valid_y)
        loss = np.mean(metric)
        print(f'\n FOLD {fold_}의 total pinball loss: {loss}\n')
        fold_metric.append(metric)

        print('\n===================================================')
        print(f'FOLD {fold_} Success')
        print('===================================================\n')

    return oof_preds, fold_metric, test_df

In [199]:
# Models for the '1day_after_target' column.
oof_preds, fold_metric, test_df = LGBM_reg(
    X_train, target_1, X_test
)


quantile: 0.1

Training until validation scores don't improve for 300 rounds
[1500]	valid_0's quantile: 1.32643
Early stopping, best iteration is:
[1601]	valid_0's quantile: 1.32622

quantile: 0.2

Training until validation scores don't improve for 300 rounds
[1500]	valid_0's quantile: 2.02028
[3000]	valid_0's quantile: 1.9883
[4500]	valid_0's quantile: 1.96886
Early stopping, best iteration is:
[4311]	valid_0's quantile: 1.96713

quantile: 0.3

Training until validation scores don't improve for 300 rounds
[1500]	valid_0's quantile: 2.37266
[3000]	valid_0's quantile: 2.31448
[4500]	valid_0's quantile: 2.28605
Early stopping, best iteration is:
[4304]	valid_0's quantile: 2.28441

quantile: 0.4

Training until validation scores don't improve for 300 rounds
[1500]	valid_0's quantile: 2.50997
[3000]	valid_0's quantile: 2.44239
[4500]	valid_0's quantile: 2.3895
[6000]	valid_0's quantile: 2.3674
[7500]	valid_0's quantile: 2.35391
[9000]	valid_0's quantile: 2.34247
Did not meet early stoppin

[1500]	valid_0's quantile: 1.2828
Early stopping, best iteration is:
[1525]	valid_0's quantile: 1.28227

quantile: 0.2

Training until validation scores don't improve for 300 rounds
[1500]	valid_0's quantile: 1.96875
Early stopping, best iteration is:
[1499]	valid_0's quantile: 1.96872

quantile: 0.3

Training until validation scores don't improve for 300 rounds
[1500]	valid_0's quantile: 2.3131
[3000]	valid_0's quantile: 2.27674
[4500]	valid_0's quantile: 2.24946
[6000]	valid_0's quantile: 2.23137
Early stopping, best iteration is:
[6614]	valid_0's quantile: 2.22574

quantile: 0.4

Training until validation scores don't improve for 300 rounds
[1500]	valid_0's quantile: 2.43447
[3000]	valid_0's quantile: 2.37082
[4500]	valid_0's quantile: 2.34699
[6000]	valid_0's quantile: 2.32865
[7500]	valid_0's quantile: 2.31772
[9000]	valid_0's quantile: 2.3091
Did not meet early stopping. Best iteration is:
[10000]	valid_0's quantile: 2.30474

quantile: 0.5

Training until validation scores don't 

Early stopping, best iteration is:
[5407]	valid_0's quantile: 2.01278

quantile: 0.3

Training until validation scores don't improve for 300 rounds
[1500]	valid_0's quantile: 2.41676
[3000]	valid_0's quantile: 2.37547
[4500]	valid_0's quantile: 2.3554
[6000]	valid_0's quantile: 2.34097
[7500]	valid_0's quantile: 2.33285
[9000]	valid_0's quantile: 2.32264
Did not meet early stopping. Best iteration is:
[9998]	valid_0's quantile: 2.31741

quantile: 0.4

Training until validation scores don't improve for 300 rounds
[1500]	valid_0's quantile: 2.55255
[3000]	valid_0's quantile: 2.50267
[4500]	valid_0's quantile: 2.47716
[6000]	valid_0's quantile: 2.46612
[7500]	valid_0's quantile: 2.45586
[9000]	valid_0's quantile: 2.44038
Did not meet early stopping. Best iteration is:
[10000]	valid_0's quantile: 2.43419

quantile: 0.5

Training until validation scores don't improve for 300 rounds
[1500]	valid_0's quantile: 2.48727
[3000]	valid_0's quantile: 2.43084
[4500]	valid_0's quantile: 2.40129
[6000

In [200]:
# Models for the '2day_after_target' column.
oof_preds2, fold_metric2, test_df2 = LGBM_reg(
    X_train, target_2, X_test
)


quantile: 0.1

Training until validation scores don't improve for 300 rounds
Early stopping, best iteration is:
[625]	valid_0's quantile: 1.37585

quantile: 0.2

Training until validation scores don't improve for 300 rounds
[1500]	valid_0's quantile: 2.13685
[3000]	valid_0's quantile: 2.10464
[4500]	valid_0's quantile: 2.09002
Early stopping, best iteration is:
[5583]	valid_0's quantile: 2.07999

quantile: 0.3

Training until validation scores don't improve for 300 rounds
[1500]	valid_0's quantile: 2.49549
[3000]	valid_0's quantile: 2.44578
[4500]	valid_0's quantile: 2.41934
[6000]	valid_0's quantile: 2.40853
[7500]	valid_0's quantile: 2.39629
[9000]	valid_0's quantile: 2.39047
Did not meet early stopping. Best iteration is:
[10000]	valid_0's quantile: 2.3888

quantile: 0.4

Training until validation scores don't improve for 300 rounds
[1500]	valid_0's quantile: 2.59633
[3000]	valid_0's quantile: 2.54563
[4500]	valid_0's quantile: 2.50997
[6000]	valid_0's quantile: 2.48761
[7500]	vali

[1500]	valid_0's quantile: 2.09394
[3000]	valid_0's quantile: 2.06319
[4500]	valid_0's quantile: 2.0539
Early stopping, best iteration is:
[4571]	valid_0's quantile: 2.05369

quantile: 0.3

Training until validation scores don't improve for 300 rounds
[1500]	valid_0's quantile: 2.49244
[3000]	valid_0's quantile: 2.41614
[4500]	valid_0's quantile: 2.39137
[6000]	valid_0's quantile: 2.37714
[7500]	valid_0's quantile: 2.36456
[9000]	valid_0's quantile: 2.35724
Did not meet early stopping. Best iteration is:
[9927]	valid_0's quantile: 2.35408

quantile: 0.4

Training until validation scores don't improve for 300 rounds
[1500]	valid_0's quantile: 2.6229
[3000]	valid_0's quantile: 2.56804
[4500]	valid_0's quantile: 2.53448
[6000]	valid_0's quantile: 2.51214
[7500]	valid_0's quantile: 2.49988
[9000]	valid_0's quantile: 2.49059
Did not meet early stopping. Best iteration is:
[9999]	valid_0's quantile: 2.48641

quantile: 0.5

Training until validation scores don't improve for 300 rounds
[1500]	

[1500]	valid_0's quantile: 2.19441
[3000]	valid_0's quantile: 2.16292
[4500]	valid_0's quantile: 2.13726
Early stopping, best iteration is:
[4239]	valid_0's quantile: 2.13643

quantile: 0.3

Training until validation scores don't improve for 300 rounds
[1500]	valid_0's quantile: 2.53887
[3000]	valid_0's quantile: 2.48149
[4500]	valid_0's quantile: 2.45837
[6000]	valid_0's quantile: 2.44169
[7500]	valid_0's quantile: 2.43243
[9000]	valid_0's quantile: 2.4259
Did not meet early stopping. Best iteration is:
[9997]	valid_0's quantile: 2.42297

quantile: 0.4

Training until validation scores don't improve for 300 rounds
[1500]	valid_0's quantile: 2.62817
[3000]	valid_0's quantile: 2.57379
[4500]	valid_0's quantile: 2.54563
[6000]	valid_0's quantile: 2.5298
[7500]	valid_0's quantile: 2.51249
[9000]	valid_0's quantile: 2.50497
Did not meet early stopping. Best iteration is:
[9995]	valid_0's quantile: 2.49749

quantile: 0.5

Training until validation scores don't improve for 300 rounds
[1500]	

In [201]:
# '1day_after_target' models trained on the "_day23" data.
oof_preds_day23, fold_metric_day23, test_df_day23 = LGBM_reg(
    X_train_day23, target_1_day23, X_test_day23
)


quantile: 0.1

Training until validation scores don't improve for 300 rounds
[1500]	valid_0's quantile: 1.30925
Early stopping, best iteration is:
[2142]	valid_0's quantile: 1.30315

quantile: 0.2

Training until validation scores don't improve for 300 rounds
[1500]	valid_0's quantile: 2.03113
[3000]	valid_0's quantile: 2.00939
Early stopping, best iteration is:
[3497]	valid_0's quantile: 2.00288

quantile: 0.3

Training until validation scores don't improve for 300 rounds
[1500]	valid_0's quantile: 2.38949
[3000]	valid_0's quantile: 2.31302
[4500]	valid_0's quantile: 2.28437
[6000]	valid_0's quantile: 2.26775
[7500]	valid_0's quantile: 2.25704
[9000]	valid_0's quantile: 2.24702
Did not meet early stopping. Best iteration is:
[10000]	valid_0's quantile: 2.24448

quantile: 0.4

Training until validation scores don't improve for 300 rounds
[1500]	valid_0's quantile: 2.45351
[3000]	valid_0's quantile: 2.39786
[4500]	valid_0's quantile: 2.37165
[6000]	valid_0's quantile: 2.3501
[7500]	val

[1500]	valid_0's quantile: 2.03606
[3000]	valid_0's quantile: 2.01084
Early stopping, best iteration is:
[3361]	valid_0's quantile: 2.00843

quantile: 0.3

Training until validation scores don't improve for 300 rounds
[1500]	valid_0's quantile: 2.39594
[3000]	valid_0's quantile: 2.33087
[4500]	valid_0's quantile: 2.30898
[6000]	valid_0's quantile: 2.30034
[7500]	valid_0's quantile: 2.29693
[9000]	valid_0's quantile: 2.29366
Early stopping, best iteration is:
[8777]	valid_0's quantile: 2.29345

quantile: 0.4

Training until validation scores don't improve for 300 rounds
[1500]	valid_0's quantile: 2.54631
[3000]	valid_0's quantile: 2.48096
[4500]	valid_0's quantile: 2.44651
[6000]	valid_0's quantile: 2.42706
[7500]	valid_0's quantile: 2.41487
[9000]	valid_0's quantile: 2.40346
Did not meet early stopping. Best iteration is:
[9983]	valid_0's quantile: 2.40095

quantile: 0.5

Training until validation scores don't improve for 300 rounds
[1500]	valid_0's quantile: 2.46803
[3000]	valid_0's q

[1500]	valid_0's quantile: 2.10281
[3000]	valid_0's quantile: 2.07679
Early stopping, best iteration is:
[4177]	valid_0's quantile: 2.06499

quantile: 0.3

Training until validation scores don't improve for 300 rounds
[1500]	valid_0's quantile: 2.51778
[3000]	valid_0's quantile: 2.47445
[4500]	valid_0's quantile: 2.44052
[6000]	valid_0's quantile: 2.42721
[7500]	valid_0's quantile: 2.41741
[9000]	valid_0's quantile: 2.4089
Early stopping, best iteration is:
[9666]	valid_0's quantile: 2.40566

quantile: 0.4

Training until validation scores don't improve for 300 rounds
[1500]	valid_0's quantile: 2.61996
[3000]	valid_0's quantile: 2.56091
[4500]	valid_0's quantile: 2.53096
[6000]	valid_0's quantile: 2.5168
[7500]	valid_0's quantile: 2.50551
[9000]	valid_0's quantile: 2.49688
Did not meet early stopping. Best iteration is:
[10000]	valid_0's quantile: 2.49465

quantile: 0.5

Training until validation scores don't improve for 300 rounds
[1500]	valid_0's quantile: 2.55365
[3000]	valid_0's qu

In [202]:
# '2day_after_target' models trained on the "_day23" data.
oof_preds2_day23, fold_metric2_day23, test_df2_day23 = LGBM_reg(
    X_train_day23, target_2_day23, X_test_day23
)


quantile: 0.1

Training until validation scores don't improve for 300 rounds
[1500]	valid_0's quantile: 1.3026
Early stopping, best iteration is:
[1733]	valid_0's quantile: 1.29951

quantile: 0.2

Training until validation scores don't improve for 300 rounds
[1500]	valid_0's quantile: 2.02654
[3000]	valid_0's quantile: 1.99502
Early stopping, best iteration is:
[4027]	valid_0's quantile: 1.98609

quantile: 0.3

Training until validation scores don't improve for 300 rounds
[1500]	valid_0's quantile: 2.36729
[3000]	valid_0's quantile: 2.31607
[4500]	valid_0's quantile: 2.29947
[6000]	valid_0's quantile: 2.29023
[7500]	valid_0's quantile: 2.27907
[9000]	valid_0's quantile: 2.26962
Did not meet early stopping. Best iteration is:
[9964]	valid_0's quantile: 2.2661

quantile: 0.4

Training until validation scores don't improve for 300 rounds
[1500]	valid_0's quantile: 2.45013
[3000]	valid_0's quantile: 2.39361
[4500]	valid_0's quantile: 2.36738
[6000]	valid_0's quantile: 2.35198
[7500]	valid

[1500]	valid_0's quantile: 1.30292
Early stopping, best iteration is:
[1429]	valid_0's quantile: 1.30216

quantile: 0.2

Training until validation scores don't improve for 300 rounds
[1500]	valid_0's quantile: 2.0112
[3000]	valid_0's quantile: 1.98002
[4500]	valid_0's quantile: 1.95031
[6000]	valid_0's quantile: 1.94342
[7500]	valid_0's quantile: 1.93581
[9000]	valid_0's quantile: 1.92644
Did not meet early stopping. Best iteration is:
[9840]	valid_0's quantile: 1.92206

quantile: 0.3

Training until validation scores don't improve for 300 rounds
[1500]	valid_0's quantile: 2.36677
[3000]	valid_0's quantile: 2.31429
[4500]	valid_0's quantile: 2.28538
[6000]	valid_0's quantile: 2.26342
[7500]	valid_0's quantile: 2.24832
[9000]	valid_0's quantile: 2.2417
Did not meet early stopping. Best iteration is:
[9999]	valid_0's quantile: 2.23657

quantile: 0.4

Training until validation scores don't improve for 300 rounds
[1500]	valid_0's quantile: 2.44631
[3000]	valid_0's quantile: 2.40236
[4500]	

Early stopping, best iteration is:
[2043]	valid_0's quantile: 1.31892

quantile: 0.2

Training until validation scores don't improve for 300 rounds
[1500]	valid_0's quantile: 2.0973
[3000]	valid_0's quantile: 2.06857
Early stopping, best iteration is:
[3651]	valid_0's quantile: 2.06158

quantile: 0.3

Training until validation scores don't improve for 300 rounds
[1500]	valid_0's quantile: 2.46813
[3000]	valid_0's quantile: 2.425
[4500]	valid_0's quantile: 2.41028
[6000]	valid_0's quantile: 2.39555
[7500]	valid_0's quantile: 2.38559
[9000]	valid_0's quantile: 2.37652
Did not meet early stopping. Best iteration is:
[10000]	valid_0's quantile: 2.37154

quantile: 0.4

Training until validation scores don't improve for 300 rounds
[1500]	valid_0's quantile: 2.5713
[3000]	valid_0's quantile: 2.52564
[4500]	valid_0's quantile: 2.49658
[6000]	valid_0's quantile: 2.48289
[7500]	valid_0's quantile: 2.47422
[9000]	valid_0's quantile: 2.46672
Did not meet early stopping. Best iteration is:
[9992]	v

# submission

In [179]:
# Row masks for the two forecast days, taken from the submission ids.
day7_rows = submission.id.str.contains("Day7")
day8_rows = submission.id.str.contains("Day8")

# Write the quantile predictions into every column from "q_0.1" onwards:
# Day7 rows get the 1-day-ahead model output, Day8 rows the 2-day-ahead one.
submission.loc[day7_rows, "q_0.1":] = test_df.sort_index().values
submission.loc[day8_rows, "q_0.1":] = test_df2.sort_index().values
submission

Unnamed: 0,id,q_0.1,q_0.2,q_0.3,q_0.4,q_0.5,q_0.6,q_0.7,q_0.8,q_0.9
0,0.csv_Day7_0h00m,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.csv_Day7_0h30m,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.csv_Day7_1h00m,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.csv_Day7_1h30m,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.csv_Day7_2h00m,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
7771,80.csv_Day8_21h30m,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7772,80.csv_Day8_22h00m,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7773,80.csv_Day8_22h30m,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7774,80.csv_Day8_23h00m,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [180]:
# Same fill as for `submission`, using the models trained on the "_day23" data.
for day_tag, day_preds in (("Day7", test_df_day23), ("Day8", test_df2_day23)):
    submission2.loc[submission2.id.str.contains(day_tag), "q_0.1":] = \
        day_preds.sort_index().values
submission2

Unnamed: 0,id,q_0.1,q_0.2,q_0.3,q_0.4,q_0.5,q_0.6,q_0.7,q_0.8,q_0.9
0,0.csv_Day7_0h00m,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.csv_Day7_0h30m,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.csv_Day7_1h00m,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.csv_Day7_1h30m,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.csv_Day7_2h00m,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
7771,80.csv_Day8_21h30m,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7772,80.csv_Day8_22h00m,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7773,80.csv_Day8_22h30m,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7774,80.csv_Day8_23h00m,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# Ensemble

In [191]:
# Weighted blend of the two submissions: 0.6 * submission + 0.4 * submission2,
# applied to every column after the first ('id') one.
sub_value = submission.iloc[:, 1:] * 0.6 + submission2.iloc[:, 1:] * 0.4

# Re-attach the id column in front of the blended quantile columns.
submission3 = pd.concat([submission[['id']], sub_value], axis=1)

# 제출

In [157]:
# The file name reuses the `time_str` of the loaded training data rather than
# the current time, so the submission can be matched to the data it came from.
submission_file = os.path.join(submission_path, f'제출용_{time_str}.csv')
submission.to_csv(submission_file, index=False)

In [192]:
# Write the ensemble (weighted-average) submission.
ensemble_file = os.path.join(
    submission_path, f'제출용_{time_str}_ensemble_가중평균3.csv'
)
submission3.to_csv(ensemble_file, index=False)