In [12]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import rc, rcParams
import seaborn as sns
import glob
from sklearn.model_selection import train_test_split, StratifiedKFold, KFold
# import lightgbm
from lightgbm import LGBMRegressor
import datetime

import warnings
# Silence every warning for the whole session (this also hides deprecation
# notices from pandas / LightGBM, so re-enable it when debugging).
warnings.filterwarnings('ignore')

# Plot / display defaults used by the rest of the notebook.
plt.style.use('ggplot')
# Use the fully-qualified option key: the short form 'max_columns' is resolved by
# pattern matching and becomes ambiguous (OptionError) on pandas versions that
# also define 'styler.render.max_columns'.
pd.set_option('display.max_columns', 100)
pd.set_option("display.precision", 4)
rcParams['figure.figsize'] = (16, 8)
# NOTE(review): 'AppleGothic' is a macOS font; on other platforms matplotlib
# will presumably fall back to its default font — confirm if Korean labels matter.
rc('font', family='AppleGothic')

In [44]:
# Directory the training CSV is read from.
train_path = './trainingdata'
# Directory the inference (test feature) CSV is read from.
infer_path = './inferencedata'
# Directory the final submission CSV is written to.
submission_path = './submission'

In [42]:
# Load the sample submission; its rows are later overwritten with the model's
# quantile predictions before being saved.
submission = pd.read_csv('./data/sample_submission.csv')

In [4]:
# Timestamp suffix selecting which training / inference file pair to load.
time_str = '20201225_105925'

# Training table (features plus the 'Target1' / 'Target2' columns used below).
train = pd.read_csv(os.path.join(train_path, f'training_{time_str}.csv'))
# Feature table to predict on.
X_test = pd.read_csv(os.path.join(infer_path, f'inference_{time_str}.csv'))

# Metric

In [35]:
def pinball(pred, actual):
    """Compute the mean pinball (quantile) loss for each quantile column.

    Parameters
    ----------
    pred : pandas.DataFrame
        One column per quantile. The column *labels* are used as the quantile
        levels, so they must be numeric (e.g. ``0.1 ... 0.9``).
    actual : pandas.Series or array-like
        Observed values. Rows are paired with ``pred`` by position, not by
        index label.

    Returns
    -------
    list of float
        Mean pinball loss per quantile, in the column order of ``pred``.

    Raises
    ------
    ValueError
        If ``pred`` and ``actual`` do not have the same number of rows.
    """
    quantile_levels = np.asarray(pred.columns.tolist(), dtype=float)
    predicted = pred.to_numpy(dtype=float)            # (n_rows, n_quantiles)
    observed = np.asarray(actual, dtype=float).reshape(-1)

    if observed.shape[0] != predicted.shape[0]:
        raise ValueError(
            f'pred has {predicted.shape[0]} rows but actual has '
            f'{observed.shape[0]}'
        )

    # Broadcast the observations against every quantile column at once.
    residual = observed[:, None] - predicted
    # Under-prediction (y >= z) is weighted by q, over-prediction by (1 - q).
    per_row_loss = np.where(
        residual >= 0,
        residual * quantile_levels,
        -residual * (1 - quantile_levels),
    )

    return per_row_loss.mean(axis=0).tolist()

# Inference

In [5]:
# The two regression targets, each modelled separately further down.
target_1, target_2 = train['Target1'], train['Target2']
# Every column except the last two is used as a feature.
# NOTE(review): assumes the target columns are the final two columns of
# `train` — confirm against the training file.
X_train = train.iloc[:, :-2]

In [36]:
def LGBM_reg(X_train, y_train, test, quantiles=None, n_splits=5, random_state=42):
    """Fit one LightGBM quantile regressor per quantile with K-fold validation.

    For each fold and each quantile a model is trained on the fold's training
    part, early-stopped on the held-out part, and then used to predict both
    the held-out rows (out-of-fold predictions) and ``test`` (averaged over
    folds). All predictions are rounded to 2 decimals before being stored.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Training features; rows are selected positionally.
    y_train : pandas.Series
        Training target, aligned row-by-row with ``X_train``.
    test : pandas.DataFrame
        Feature rows to predict on.
    quantiles : sequence of float, optional
        Quantile levels to fit. Defaults to ``0.1, 0.2, ..., 0.9``.
    n_splits : int, default 5
        Number of cross-validation folds.
    random_state : int or None, default 42
        Seed for the shuffled fold split so repeated runs give the same
        folds. Pass ``None`` for an unseeded shuffle.

    Returns
    -------
    oof_preds : numpy.ndarray, shape (len(X_train), len(quantiles))
        Out-of-fold predictions.
    fold_metric : list of list of float
        For each fold, the per-quantile pinball losses on the held-out part.
    test_df : pandas.DataFrame
        Fold-averaged test predictions, one column per quantile.
    """
    if quantiles is None:
        quantiles = [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
    quantiles = list(quantiles)

    # Original author's note (translated): "Why doesn't inference with
    # random-parameter samples work?" — refers to the disabled split below.
    # NOTE(review): presumably StratifiedKFold rejected the continuous target.
    # fold = StratifiedKFold(n_splits=5, shuffle = True)

    fold = KFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    oof_preds = np.zeros([X_train.shape[0], len(quantiles)])
    # Running sum of (prediction / n_splits), i.e. the mean over folds.
    test_preds = np.zeros([test.shape[0], len(quantiles)])

    fold_metric = []
    # training
    target = y_train
    for fold_, (train_idx, valid_idx) in enumerate(
        fold.split(np.array(X_train), target)
    ):

        train_x, train_y = X_train.iloc[train_idx], target.iloc[train_idx]
        valid_x, valid_y = X_train.iloc[valid_idx], target.iloc[valid_idx]

        for idx, q in enumerate(quantiles):
            print(f'\nquantile: {q}\n')
            # `subsample` is LightGBM's alias of `bagging_fraction`; the value
            # is passed once instead of under both names.
            model = LGBMRegressor(objective='quantile', alpha=q,
                                  n_estimators=10000, learning_rate=0.027,
                                  subsample=0.7)

            # NOTE(review): `early_stopping_rounds` / `verbose` are accepted by
            # fit() in the LightGBM version that produced this notebook's
            # output; newer releases may require callbacks — confirm on upgrade.
            model.fit(train_x, train_y, eval_metric=['quantile'],
                      eval_set=[(valid_x, valid_y)],
                      early_stopping_rounds=300, verbose=1500)

            # Predict the held-out rows once and reuse the result.
            oof_preds[valid_idx, idx] = model.predict(valid_x).round(2)
            test_preds[:, idx] += model.predict(test).round(2) / fold.n_splits

        # Score this fold on its own held-out rows only.
        oof_df = pd.DataFrame(oof_preds[valid_idx], columns=quantiles)
        metric = pinball(oof_df, valid_y)
        loss = np.mean(metric)
        print(f'\n FOLD {fold_}의 total pinball loss: {loss}\n')
        fold_metric.append(metric)

        print('\n===================================================')
        print(f'FOLD {fold_} Success')
        print('===================================================\n')

    test_df = pd.DataFrame(test_preds, columns=quantiles)

    return oof_preds, fold_metric, test_df

In [37]:
# Cross-validated quantile models for Target1.
oof_preds, fold_metric, test_df = LGBM_reg(X_train, target_1, X_test)


quantile: 0.1

Training until validation scores don't improve for 300 rounds
[1500]	valid_0's quantile: 1.35396
Early stopping, best iteration is:
[1789]	valid_0's quantile: 1.35139

quantile: 0.2

Training until validation scores don't improve for 300 rounds
Early stopping, best iteration is:
[1014]	valid_0's quantile: 2.119

quantile: 0.3

Training until validation scores don't improve for 300 rounds
[1500]	valid_0's quantile: 2.49295
Early stopping, best iteration is:
[2354]	valid_0's quantile: 2.48577

quantile: 0.4

Training until validation scores don't improve for 300 rounds
[1500]	valid_0's quantile: 2.60496
[3000]	valid_0's quantile: 2.58878
[4500]	valid_0's quantile: 2.58095
[6000]	valid_0's quantile: 2.57474
[7500]	valid_0's quantile: 2.57165
Early stopping, best iteration is:
[7288]	valid_0's quantile: 2.57121

quantile: 0.5

Training until validation scores don't improve for 300 rounds
[1500]	valid_0's quantile: 2.5131
[3000]	valid_0's quantile: 2.49394
[4500]	valid_0's q

[1500]	valid_0's quantile: 2.08245
Early stopping, best iteration is:
[2164]	valid_0's quantile: 2.07375

quantile: 0.3

Training until validation scores don't improve for 300 rounds
[1500]	valid_0's quantile: 2.45433
Early stopping, best iteration is:
[1401]	valid_0's quantile: 2.45412

quantile: 0.4

Training until validation scores don't improve for 300 rounds
[1500]	valid_0's quantile: 2.59082
[3000]	valid_0's quantile: 2.56299
[4500]	valid_0's quantile: 2.55039
[6000]	valid_0's quantile: 2.54367
[7500]	valid_0's quantile: 2.53664
Early stopping, best iteration is:
[7447]	valid_0's quantile: 2.53636

quantile: 0.5

Training until validation scores don't improve for 300 rounds
[1500]	valid_0's quantile: 2.50308
[3000]	valid_0's quantile: 2.47341
[4500]	valid_0's quantile: 2.4587
[6000]	valid_0's quantile: 2.44867
[7500]	valid_0's quantile: 2.44509
[9000]	valid_0's quantile: 2.44099
Early stopping, best iteration is:
[9619]	valid_0's quantile: 2.43951

quantile: 0.6

Training until v

In [40]:
# Cross-validated quantile models for Target2.
oof_preds2, fold_metric2, test_df2 = LGBM_reg(X_train, target_2, X_test)


quantile: 0.1

Training until validation scores don't improve for 300 rounds
Early stopping, best iteration is:
[1086]	valid_0's quantile: 1.36559

quantile: 0.2

Training until validation scores don't improve for 300 rounds
Early stopping, best iteration is:
[681]	valid_0's quantile: 2.18399

quantile: 0.3

Training until validation scores don't improve for 300 rounds
[1500]	valid_0's quantile: 2.59448
[3000]	valid_0's quantile: 2.57828
Early stopping, best iteration is:
[2705]	valid_0's quantile: 2.57803

quantile: 0.4

Training until validation scores don't improve for 300 rounds
[1500]	valid_0's quantile: 2.71621
[3000]	valid_0's quantile: 2.68641
[4500]	valid_0's quantile: 2.66715
Early stopping, best iteration is:
[4428]	valid_0's quantile: 2.66679

quantile: 0.5

Training until validation scores don't improve for 300 rounds
[1500]	valid_0's quantile: 2.61659
[3000]	valid_0's quantile: 2.59397
[4500]	valid_0's quantile: 2.57819
[6000]	valid_0's quantile: 2.57401
Early stopping, 

[1500]	valid_0's quantile: 2.64224
[3000]	valid_0's quantile: 2.60913
Early stopping, best iteration is:
[2701]	valid_0's quantile: 2.60824

quantile: 0.4

Training until validation scores don't improve for 300 rounds
[1500]	valid_0's quantile: 2.76467
[3000]	valid_0's quantile: 2.7443
[4500]	valid_0's quantile: 2.73103
[6000]	valid_0's quantile: 2.72132
[7500]	valid_0's quantile: 2.71776
Early stopping, best iteration is:
[8066]	valid_0's quantile: 2.71682

quantile: 0.5

Training until validation scores don't improve for 300 rounds
[1500]	valid_0's quantile: 2.67756
Early stopping, best iteration is:
[1295]	valid_0's quantile: 2.67677

quantile: 0.6

Training until validation scores don't improve for 300 rounds
[1500]	valid_0's quantile: 2.39929
[3000]	valid_0's quantile: 2.38359
Early stopping, best iteration is:
[3026]	valid_0's quantile: 2.38331

quantile: 0.7

Training until validation scores don't improve for 300 rounds
[1500]	valid_0's quantile: 1.97144
Early stopping, best ite

# Submission

In [None]:
# Rows whose id contains "Day7" receive the Target1 predictions, rows whose id
# contains "Day8" receive the Target2 predictions; every column from "q_0.1"
# onward is overwritten.
for day_tag, quantile_preds in (("Day7", test_df), ("Day8", test_df2)):
    day_rows = submission.id.str.contains(day_tag)
    submission.loc[day_rows, "q_0.1":] = quantile_preds.sort_index().values
submission

In [45]:
# Timestamp the file name so repeated submissions do not overwrite each other.
time_str = datetime.datetime.today().strftime('%Y%m%d_%H%M%S')

# to_csv does not create missing folders, so make sure the target exists.
os.makedirs(submission_path, exist_ok=True)

# The file-name prefix is Korean for "for submission".
submission.to_csv(
    os.path.join(submission_path, f'제출용_{time_str}.csv'),
    index=False
)