In [6]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import rc, rcParams
import seaborn as sns
import glob
from sklearn.model_selection import train_test_split
import lightgbm
import datetime

import warnings
# NOTE(review): this silences every warning for the rest of the session, including
# pandas deprecation warnings. Consider narrowing the filter once the notebook is stable.
warnings.filterwarnings('ignore')

plt.style.use('ggplot')
# Use the fully-qualified option key. The abbreviated 'max_columns' is resolved by
# pattern matching and becomes ambiguous on pandas versions that also define
# 'styler.render.max_columns', which raises OptionError.
pd.set_option('display.max_columns', 100)
pd.set_option("display.precision", 4)
rcParams['figure.figsize'] = (16, 8)
# NOTE(review): 'AppleGothic' is presumably chosen so Korean labels render; it is
# typically only installed on macOS -- confirm a fallback font on other platforms.
rc('font', family='AppleGothic')

In [7]:
# Input / output locations, all relative to the notebook's working directory.
# Directory that holds train.csv.
tr_base_path = './data/train'
# Directory that holds the numbered test CSV files (0.csv, 1.csv, ...).
te_base_path = './data/test'
# NOTE(review): not referenced in the visible cells; presumably where submissions are written.
submission_path = './submission'
# Output directory for the timestamped inference feature CSV.
infer_path = './inferencedata'
# Output directory for the timestamped training feature CSV.
train_path = './trainingdata'

In [8]:
# Load the training table and the sample submission file.
train = pd.read_csv(os.path.join(tr_base_path,'train.csv'))
submission = pd.read_csv('./data/sample_submission.csv')
# Listing file names: unlike os.listdir, glob returns each name with its path attached.
# test_files = glob.glob('./data/test/*.csv')

# Preprocessing

In [14]:
# Feature shift: push each selected column down by N (lag) rows.
def create_lag_feats(data, lags, cols):
    """Add lagged copies of ``cols`` plus the two target columns to a copy of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Input frame. Must contain a ``'TARGET'`` column and every column in ``cols``.
        It is not modified; the function works on a copy.
    lags : iterable of int
        Row offsets passed to ``Series.shift``. Each produces one column per entry
        in ``cols``.
    cols : iterable of str
        Names of the columns to lag.

    Returns
    -------
    (pandas.DataFrame, list of str)
        The augmented copy and the names of the lag columns that were added, in
        ``cols``-major / ``lags``-minor order. Lag columns are named
        ``'<col>_lag_<lag>'``.

    Notes
    -----
    ``Target1`` is ``TARGET`` itself and ``Target2`` is ``TARGET`` shifted up by 48
    rows. The trailing rows that have no value 48 rows ahead are forward-filled
    with the last available value. Both target columns are created even when
    ``lags`` or ``cols`` is empty.
    """
    temp = data.copy()

    # The targets do not depend on the lag loops, so build them exactly once
    # instead of re-assigning them on every (col, lag) iteration.
    temp['Target1'] = temp['TARGET']
    # Forward fill: rows left empty by the upward shift take the preceding value.
    # ``.ffill()`` replaces the deprecated ``fillna(method='ffill')`` spelling.
    temp['Target2'] = temp['TARGET'].shift(-48).ffill()

    lag_cols = []
    for col in cols:
        for lag in lags:
            lag_name = f'{col}_lag_{lag}'
            temp[lag_name] = temp[col].shift(lag)
            lag_cols.append(lag_name)

    return temp, lag_cols

def preprocess_data(data, target_lags=[48], weather_lags=[48], is_train=True):
    """Build the model feature table (and, for training, the targets) from raw data.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw frame containing ``Hour``, ``Minute``, ``TARGET``, ``DHI``, ``DNI``,
        ``WS``, ``RH`` and ``T`` columns. It is not modified.
    target_lags : list of int, default [48]
        Lags applied to ``TARGET``. The default list is only iterated, never mutated.
    weather_lags : list of int, default [48]
        Lags applied to the weather columns ``DHI``, ``DNI``, ``WS``, ``RH``, ``T``.
        The default list is only iterated, never mutated.
    is_train : bool, default True
        When truthy, ``Target1`` and ``Target2`` are appended as the last two
        columns. When falsy, only the feature columns are returned.

    Returns
    -------
    pandas.DataFrame
        ``Hour``, ``Minute``, the target lag columns, the weather lag columns and,
        for training, ``Target1`` / ``Target2``. Rows containing any NaN (for
        example the leading rows emptied by the lag shift) are dropped.
    """
    # create_lag_feats works on its own copy, so ``data`` is left untouched
    # without an extra defensive copy here.
    temp, target_lag_cols = create_lag_feats(data, target_lags, ['TARGET'])
    temp, weather_lag_cols = create_lag_feats(temp, weather_lags, ['DHI', 'DNI', 'WS', 'RH', 'T'])

    # Minute is kept alongside Hour.
    selected = ['Hour', 'Minute'] + target_lag_cols + weather_lag_cols

    # Decide on truthiness so a non-bool flag can no longer fall through both
    # branches and silently return None.
    if is_train:
        selected = selected + ['Target1', 'Target2']

    return temp[selected].dropna()

In [15]:
# Build a new training set from the previous day's data (lag of 48 rows).
# However... with the current approach, inference should be run on the Day 6 test data,
# but it effectively infers from Day 5; oddly, performance actually got worse.
# NOTE(review): translated from the original note; it does not say which variant got worse -- confirm with the author.
df_train = preprocess_data(train, target_lags=[48], weather_lags=[48], is_train=True)

In [16]:
# Build the test set.
# Each numbered test file contributes only its last 48 rows. Those raw values are
# used directly as the "*_lag_48" features, so they are relabelled with the
# training feature names instead of going through preprocess_data.
N_TEST_FILES = 81
TEST_SOURCE_COLS = ['Hour', 'Minute', 'TARGET', 'DHI', 'DNI', 'WS', 'RH', 'T']

df_test = []

for i in range(N_TEST_FILES):
    # Reuse the configured test directory rather than repeating the literal path.
    file_path = os.path.join(te_base_path, str(i) + '.csv')
    temp = pd.read_csv(file_path).iloc[-48:]
    df_test.append(temp)

# Selecting the wanted columns already leaves out 'Day', so no in-place drop is needed.
X_test = pd.concat(df_test)[TEST_SOURCE_COLS]
# Positional relabel: every training column except the two trailing targets.
X_test.columns = df_train.columns.tolist()[:-2]

# One astype call returns a new frame, avoiding column assignment on a sliced frame.
X_test = X_test.astype({'DHI_lag_48': 'float64', 'DNI_lag_48': 'float64'})

X_test.shape

(3888, 8)

# EDA

# Feature Engineering

In [13]:
# Feature column names: every df_train column except the two trailing target columns.
base_columns = df_train.columns[:-2].tolist()

In [17]:
# Display the engineered training frame (pandas truncates the rendering to the first and last rows).
df_train

Unnamed: 0,Hour,Minute,TARGET_lag_48,DHI_lag_48,DNI_lag_48,WS_lag_48,RH_lag_48,T_lag_48,Target1,Target2
48,0,0,0.0,0.0,0.0,1.5,69.08,-12.0,0.0,0.0
49,0,30,0.0,0.0,0.0,1.5,69.06,-12.0,0.0,0.0
50,1,0,0.0,0.0,0.0,1.6,71.78,-12.0,0.0,0.0
51,1,30,0.0,0.0,0.0,1.6,71.75,-12.0,0.0,0.0
52,2,0,0.0,0.0,0.0,1.6,75.20,-12.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...
52555,21,30,0.0,0.0,0.0,2.4,68.38,-2.0,0.0,0.0
52556,22,0,0.0,0.0,0.0,2.4,71.09,-3.0,0.0,0.0
52557,22,30,0.0,0.0,0.0,2.2,71.11,-3.0,0.0,0.0
52558,23,0,0.0,0.0,0.0,2.1,74.99,-4.0,0.0,0.0


# Save

In [18]:
# One timestamp is shared by both files so a training/inference pair can be matched by name.
time_str = datetime.datetime.now().strftime('%Y%m%d_%H%M%S')

# DataFrame.to_csv does not create missing parent directories, so make sure the
# output folders exist before writing.
for out_dir in (train_path, infer_path):
    os.makedirs(out_dir, exist_ok=True)

# Training features + targets.
df_train.to_csv(
    os.path.join(train_path, f'training_{time_str}.csv'),
    index=False
)

# Inference features.
X_test.to_csv(
    os.path.join(infer_path, f'inference_{time_str}.csv'),
    index=False
)