In [64]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import rc, rcParams
import seaborn as sns
import glob
from sklearn.model_selection import train_test_split
import lightgbm
import datetime

import warnings
warnings.filterwarnings('ignore')

# Notebook-wide plotting style and pandas display options.
plt.style.use('ggplot')
# Use the fully qualified option name: the bare 'max_columns' pattern is matched against
# every registered option, and newer pandas versions also ship
# 'styler.render.max_columns', which makes the short form ambiguous (OptionError).
pd.set_option('display.max_columns', 100)
pd.set_option("display.precision", 4)
rcParams['figure.figsize'] = (16, 8)
# NOTE(review): 'AppleGothic' is presumably chosen for Korean glyphs on macOS; on other
# platforms matplotlib will fall back to its default font - confirm before sharing.
rc('font', family='AppleGothic')

In [63]:
# Locations of the training data, the test files and the output folder for submissions.
tr_base_path, te_base_path, submission_path = (
    './data/train',
    './data/test',
    './submission',
)

In [6]:
# Read the training table and a single test file ('1.csv').
df, test = (
    pd.read_csv(os.path.join(folder, file_name))
    for folder, file_name in ((tr_base_path, 'train.csv'), (te_base_path, '1.csv'))
)

# Dacon baseline 방법론과 기존에 생각했던 방법론 체크

- 내가 생각한 것과는 차이가 있네
- 그리고 얘네 방식을 잘 이해할 수가 없는데?
    - target1: Day7 예측
        - 각 행별로 하루 전(48행 전)의 데이터로 현 시점의 Target을 예측
        - 즉, 하루 뒤의 데이터를 예측하는 데이터 셋 구축
    - target2: Day8 예측
        - 각 행별로 하루 전(48행 전)의 데이터로 하루 뒤(48행 뒤)의 Target을 예측
        - 즉, 하루전의 데이터로 이틀 뒤의 데이터를 예측하는 데이터 셋 구축

In [18]:
# Training table, the list of test files, and the sample submission to be filled in.
train = pd.read_csv(os.path.join(tr_base_path, 'train.csv'))
# glob (unlike os.listdir) returns every match with its directory prefix attached, but the
# order of the matches is arbitrary. All paths share the same prefix, so sorting by
# (length, text) yields 0.csv, 1.csv, ..., 9.csv, 10.csv, ... - the same file order the
# submission ids follow.
test_files = sorted(glob.glob('./data/test/*.csv'), key=lambda path: (len(path), path))
submission = pd.read_csv('./data/sample_submission.csv')

## Helper functions

In [120]:
# feature shift
# N(lag)행 만큼 데이터를 밀어냄
def create_lag_feats(data, lags, cols, horizon=48):
    """Add lagged copies of ``cols`` plus the two prediction targets.

    For every column in ``cols`` and every lag in ``lags`` a new column
    ``<col>_lag_<lag>`` is created holding the column shifted down by ``lag``
    rows (the first ``lag`` rows become NaN).

    Two target columns are derived from ``TARGET``:

    * ``Target1`` - ``TARGET`` at the current row.
    * ``Target2`` - ``TARGET`` ``horizon`` rows ahead; the trailing rows that
      have no future value are forward-filled with the last available one.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame; must contain ``TARGET`` and every column in ``cols``.
        It is not modified.
    lags : iterable of int
        Row offsets to shift by.
    cols : iterable of str
        Columns to create lag features for.
    horizon : int, default 48
        Number of rows ``Target2`` looks ahead (the notebook treats 48 rows
        as one day).

    Returns
    -------
    (pandas.DataFrame, list of str)
        A copy of ``data`` with the new columns, and the names of the lag
        columns in creation order.
    """
    temp = data.copy()
    lag_cols = []

    for col in cols:
        for lag in lags:
            lag_name = f'{col}_lag_{lag}'
            temp[lag_name] = temp[col].shift(lag)
            lag_cols.append(lag_name)

    # The targets do not depend on col/lag, so they are built once instead of being
    # reassigned on every loop iteration.
    temp['Target1'] = temp['TARGET']
    # .ffill() replaces the deprecated fillna(method='ffill'): each missing tail value
    # takes the last known value above it.
    temp['Target2'] = temp['TARGET'].shift(-horizon).ffill()

    return temp, lag_cols

def preprocess_data(data, target_lags=[48], weather_lags=[48], is_train=True):
    """Build the model matrix of lagged features (and targets when training).

    Lag features are created for ``TARGET`` (using ``target_lags``) and for the
    weather columns ``DHI``, ``DNI``, ``WS``, ``RH``, ``T`` (using
    ``weather_lags``) via ``create_lag_feats``.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw frame containing ``Hour``, ``TARGET`` and the weather columns.
    target_lags, weather_lags : list of int
        Row offsets for the lag features. The default lists are only read,
        never mutated.
    is_train : bool, default True
        When truthy the ``Target1`` / ``Target2`` columns are appended after
        the features; otherwise only the features are returned.

    Returns
    -------
    pandas.DataFrame
        ``Hour`` + lag columns (+ targets), with every row containing a NaN
        dropped - i.e. the leading rows that have no lagged value.
    """
    # create_lag_feats works on its own copy, so `data` is left untouched.
    temp, target_lag_cols = create_lag_feats(data, target_lags, ['TARGET'])
    temp, weather_lag_cols = create_lag_feats(
        temp, weather_lags, ['DHI', 'DNI', 'WS', 'RH', 'T']
    )

    selected = ['Hour'] + target_lag_cols + weather_lag_cols
    # A plain truthiness check always returns a frame; the former
    # `== True` / `== False` pair silently returned None for any other value.
    if is_train:
        selected = selected + ['Target1', 'Target2']

    return temp[selected].dropna()

In [121]:
# Training matrix: 48-row lags of the target and weather columns, plus both targets.
df_train = preprocess_data(
    train,
    is_train=True,
    weather_lags=[48],
    target_lags=[48],
)

In [31]:
# So the rows are shifted by 48 (one day), and the shifted data is used to predict the
# target value one day later?
# Ah... the Day column itself carries no meaning here.
df_train.head(48)

Unnamed: 0,Hour,TARGET_lag_48,DHI_lag_48,DNI_lag_48,WS_lag_48,RH_lag_48,T_lag_48,Target1,Target2
48,0,0.0,0.0,0.0,1.5,69.08,-12.0,0.0,0.0
49,0,0.0,0.0,0.0,1.5,69.06,-12.0,0.0,0.0
50,1,0.0,0.0,0.0,1.6,71.78,-12.0,0.0,0.0
51,1,0.0,0.0,0.0,1.6,71.75,-12.0,0.0,0.0
52,2,0.0,0.0,0.0,1.6,75.2,-12.0,0.0,0.0
53,2,0.0,0.0,0.0,1.5,69.29,-11.0,0.0,0.0
54,3,0.0,0.0,0.0,1.5,72.56,-11.0,0.0,0.0
55,3,0.0,0.0,0.0,1.4,72.55,-11.0,0.0,0.0
56,4,0.0,0.0,0.0,1.3,74.62,-11.0,0.0,0.0
57,4,0.0,0.0,0.0,1.3,74.61,-11.0,0.0,0.0


In [81]:
# Test set: run every test file through the same preprocessing as the training data.
df_test = []

# glob order is arbitrary; all paths share one directory prefix, so sorting by
# (length, text) restores 0.csv, 1.csv, ..., 10.csv, ... and the stacked rows follow the
# same file order as the submission ids.
for test_file in sorted(test_files, key=lambda path: (len(path), path)):
    raw = pd.read_csv(test_file)
    lagged = preprocess_data(raw, target_lags=[48], weather_lags=[48], is_train=False)
    # NOTE(review): after the 48-row lag, the last 48 rows hold the values recorded one
    # day before the final day of the file (Day5 00:00-23:30 in the author's words), and
    # those are what end up predicting Day7 and Day8 - confirm this is intended.
    df_test.append(lagged.iloc[-48:])

X_test = pd.concat(df_test)
X_test.shape

(3888, 7)

In [122]:
# Test matrix built by reading the 81 test files explicitly in numeric order and keeping
# the last 48 preprocessed rows of each one.
df_test = [
    preprocess_data(
        pd.read_csv('./data/test/' + str(file_no) + '.csv'),
        target_lags=[48],
        weather_lags=[48],
        is_train=False,
    ).iloc[-48:]
    for file_no in range(81)
]

X_test = pd.concat(df_test)
X_test.shape

(3888, 7)

In [136]:
# Test matrix from the raw last 48 rows of each of the 81 test files (no lagging step):
# the raw columns are relabelled with the training feature names.
df_test = [
    pd.read_csv('./data/test/' + str(file_no) + '.csv').iloc[-48:]
    for file_no in range(81)
]

X_test = pd.concat(df_test).drop(columns=['Day', 'Minute'])

# Put the raw columns in the order of the training features, then take over the
# training feature names (every df_train column except the trailing two targets).
X_test = X_test[['Hour','TARGET','DHI','DNI','WS','RH','T']]
X_test.columns = df_train.columns.tolist()[:-2]

# Cast the two irradiance columns to float64.
X_test = X_test.astype({'DHI_lag_48': 'float64', 'DNI_lag_48': 'float64'})

X_test.shape

(3888, 7)

In [124]:
# Number of test rows that repeat an earlier row exactly.
X_test.duplicated(keep='first').sum()

45

In [138]:
# 70/30 random hold-out split, one per target. Features are every column except the
# last two; Target1 is the second-to-last column and Target2 the last.
_features = df_train.iloc[:, :-2]
_split_options = {'test_size': 0.3, 'random_state': 0}

X_train_1, X_valid_1, Y_train_1, Y_valid_1 = train_test_split(
    _features, df_train.iloc[:, -2], **_split_options
)
X_train_2, X_valid_2, Y_train_2, Y_valid_2 = train_test_split(
    _features, df_train.iloc[:, -1], **_split_options
)

In [139]:
# Quantile levels 0.1 ... 0.9, one model (and one submission column) per level.
quantiles = [tenth / 10 for tenth in range(1, 10)]

In [140]:
from lightgbm import LGBMRegressor

# Get the model and the predictions in (a) - (b)
def LGBM(q, X_train, Y_train, X_valid, Y_valid, X_test):
    """Fit one LightGBM quantile regressor and predict the test set.

    Parameters
    ----------
    q : float
        Quantile level, passed to LightGBM as ``alpha`` of the quantile objective.
    X_train, Y_train
        Training features and target.
    X_valid, Y_valid
        Hold-out set used as the evaluation set that drives early stopping.
    X_test
        Features to predict.

    Returns
    -------
    pred : pandas.Series
        Predictions for ``X_test`` rounded to 2 decimals.
    model : LGBMRegressor
        The fitted model.
    """
    # (a) Modeling
    # `bagging_fraction` is LightGBM's alias for `subsample`; passing both with the same
    # value was redundant, so only `subsample` is kept.
    # NOTE(review): LightGBM documents that bagging is only enabled when
    # bagging_freq / subsample_freq is non-zero, so subsample=0.7 presumably has no
    # effect as configured - confirm whether subsample_freq should be set.
    model = LGBMRegressor(
        objective='quantile',
        alpha=q,
        n_estimators=10000,
        learning_rate=0.027,
        subsample=0.7,
    )

    # Early stopping and periodic logging go through callbacks; the former
    # `early_stopping_rounds=` / `verbose=` keyword arguments of fit() are no longer
    # accepted by current LightGBM releases.
    model.fit(
        X_train,
        Y_train,
        eval_metric=['quantile'],
        eval_set=[(X_valid, Y_valid)],
        callbacks=[
            lightgbm.early_stopping(stopping_rounds=300),
            lightgbm.log_evaluation(period=500),
        ],
    )

    # (b) Predictions
    pred = pd.Series(model.predict(X_test).round(2))
    return pred, model

# Target 예측

def train_data(X_train, Y_train, X_valid, Y_valid, X_test):
    """Train one quantile model per level in the module-level ``quantiles`` list.

    Parameters
    ----------
    X_train, Y_train
        Training features and target.
    X_valid, Y_valid
        Hold-out set forwarded to ``LGBM`` for early stopping.
    X_test
        Features to predict.

    Returns
    -------
    LGBM_models : list
        The fitted models, in the order of ``quantiles``.
    LGBM_actual_pred : pandas.DataFrame
        One column of test predictions per quantile level; the columns are
        labelled with the levels themselves.
    """
    LGBM_models = []
    predictions = []

    for q in quantiles:
        print(q)
        pred, model = LGBM(q, X_train, Y_train, X_valid, Y_valid, X_test)
        LGBM_models.append(model)
        predictions.append(pred)

    # Concatenate once instead of growing a frame inside the loop (which re-copied every
    # earlier column on each pass and started from an empty DataFrame).
    if predictions:
        LGBM_actual_pred = pd.concat(predictions, axis=1)
    else:
        # No quantile levels configured: keep returning an empty frame.
        LGBM_actual_pred = pd.DataFrame()
    LGBM_actual_pred.columns = quantiles

    return LGBM_models, LGBM_actual_pred

In [141]:
# Target1: fit the nine quantile models and predict the test matrix.
models_1, results_1 = train_data(
    X_train=X_train_1,
    Y_train=Y_train_1,
    X_valid=X_valid_1,
    Y_valid=Y_valid_1,
    X_test=X_test,
)

0.1
Training until validation scores don't improve for 300 rounds
[500]	valid_0's quantile: 1.37212
Early stopping, best iteration is:
[676]	valid_0's quantile: 1.37108
0.2
Training until validation scores don't improve for 300 rounds
[500]	valid_0's quantile: 2.16051
[1000]	valid_0's quantile: 2.14876
[1500]	valid_0's quantile: 2.14286
[2000]	valid_0's quantile: 2.13669
[2500]	valid_0's quantile: 2.13719
Early stopping, best iteration is:
[2316]	valid_0's quantile: 2.13589
0.3
Training until validation scores don't improve for 300 rounds
[500]	valid_0's quantile: 2.57354
[1000]	valid_0's quantile: 2.54514
[1500]	valid_0's quantile: 2.53945
[2000]	valid_0's quantile: 2.53633
[2500]	valid_0's quantile: 2.53497
[3000]	valid_0's quantile: 2.53294
[3500]	valid_0's quantile: 2.5275
Early stopping, best iteration is:
[3505]	valid_0's quantile: 2.5274
0.4
Training until validation scores don't improve for 300 rounds
[500]	valid_0's quantile: 2.6792
[1000]	valid_0's quantile: 2.65925
[1500]	va

In [142]:
# Target2: fit the nine quantile models and predict the test matrix.
models_2, results_2 = train_data(
    X_train=X_train_2,
    Y_train=Y_train_2,
    X_valid=X_valid_2,
    Y_valid=Y_valid_2,
    X_test=X_test,
)

0.1
Training until validation scores don't improve for 300 rounds
Early stopping, best iteration is:
[182]	valid_0's quantile: 1.4093
0.2
Training until validation scores don't improve for 300 rounds
[500]	valid_0's quantile: 2.2503
[1000]	valid_0's quantile: 2.23588
Early stopping, best iteration is:
[1173]	valid_0's quantile: 2.23198
0.3
Training until validation scores don't improve for 300 rounds
[500]	valid_0's quantile: 2.66486
[1000]	valid_0's quantile: 2.64927
[1500]	valid_0's quantile: 2.63853
Early stopping, best iteration is:
[1398]	valid_0's quantile: 2.63812
0.4
Training until validation scores don't improve for 300 rounds
[500]	valid_0's quantile: 2.8087
[1000]	valid_0's quantile: 2.77833
[1500]	valid_0's quantile: 2.76475
Early stopping, best iteration is:
[1651]	valid_0's quantile: 2.76252
0.5
Training until validation scores don't improve for 300 rounds
[500]	valid_0's quantile: 2.72555
[1000]	valid_0's quantile: 2.6931
Early stopping, best iteration is:
[1174]	valid_0

In [143]:
# Both prediction tables should have one row per test row and one column per quantile.
print(*(table.shape for table in (results_1, results_2)))

(3888, 9) (3888, 9)


In [144]:
# Write the Target1 predictions into the Day7 rows and the Target2 predictions into the
# Day8 rows, filling every column from "q_0.1" onwards.
for day_tag, predictions in (("Day7", results_1), ("Day8", results_2)):
    day_rows = submission.id.str.contains(day_tag)
    submission.loc[day_rows, "q_0.1":] = predictions.sort_index().values
submission

Unnamed: 0,id,q_0.1,q_0.2,q_0.3,q_0.4,q_0.5,q_0.6,q_0.7,q_0.8,q_0.9
0,0.csv_Day7_0h00m,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.csv_Day7_0h30m,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.csv_Day7_1h00m,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.csv_Day7_1h30m,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.csv_Day7_2h00m,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
7771,80.csv_Day8_21h30m,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7772,80.csv_Day8_22h00m,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7773,80.csv_Day8_22h30m,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7774,80.csv_Day8_23h00m,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [145]:
# Timestamp (to the second) used in the file name so successive runs get distinct files.
time_str = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')

# to_csv does not create missing directories, so make sure the output folder exists first.
os.makedirs(submission_path, exist_ok=True)

submission.to_csv(
    os.path.join(submission_path, f'제출용_{time_str}.csv'),
    index=False
)

- 기존 베이스라인보다 로직에 맞게 제출한 것 같은데 왜 더 떨어지지...