In [17]:
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import rc, rcParams
import seaborn as sns
import glob

import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this also hides deprecation warnings raised by the libraries used
# below — consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

# Apply matplotlib's 'ggplot' style sheet to all figures.
plt.style.use('ggplot')
# Show up to 100 columns when rendering a DataFrame. The fully-qualified key is
# required: the bare pattern 'max_columns' also matches 'styler.render.max_columns'
# on pandas >= 1.4, which makes set_option raise OptionError.
pd.set_option('display.max_columns', 100)
# Render floats with 4 places after the decimal point.
pd.set_option("display.precision", 4)
# Default figure size as (width, height).
rcParams['figure.figsize'] = (16, 8)
# NOTE(review): 'AppleGothic' is presumably selected so Korean text renders in plots;
# it looks like a macOS-specific font, so other platforms may fall back to a
# different family — confirm before running elsewhere.
rc('font', family='AppleGothic')

In [4]:
# Directories (relative to the notebook) that hold the training CSV and the test CSV files.
tr_base_path = './data/train'
te_base_path = './data/test'

In [6]:
# Load the training set.
# NOTE(review): the same file is read again into `train` in a later cell; one of
# the two loads looks redundant.
df = pd.read_csv(os.path.join(tr_base_path, 'train.csv'))

In [15]:
# Load a single test file ('1.csv') to inspect its layout.
test = pd.read_csv(os.path.join(te_base_path, '1.csv'))

In [16]:
# Preview the sample test file.
test

Unnamed: 0,Day,Hour,Minute,DHI,DNI,WS,RH,T,TARGET
0,0,0,0,0,0,2.8,70.34,-6.1,0.0
1,0,0,30,0,0,2.9,71.97,-6.4,0.0
2,0,1,0,0,0,2.9,69.77,-6.7,0.0
3,0,1,30,0,0,2.9,71.95,-7.1,0.0
4,0,2,0,0,0,2.9,69.48,-7.4,0.0
...,...,...,...,...,...,...,...,...,...
331,6,21,30,0,0,1.7,43.65,-5.2,0.0
332,6,22,0,0,0,1.5,40.62,-4.9,0.0
333,6,22,30,0,0,1.5,40.62,-4.9,0.0
334,6,23,0,0,0,1.5,39.21,-4.9,0.0


In [9]:
# Preview the training frame.
df

Unnamed: 0,Day,Hour,Minute,DHI,DNI,WS,RH,T,TARGET
0,0,0,0,0,0,1.5,69.08,-12,0.0
1,0,0,30,0,0,1.5,69.06,-12,0.0
2,0,1,0,0,0,1.6,71.78,-12,0.0
3,0,1,30,0,0,1.6,71.75,-12,0.0
4,0,2,0,0,0,1.6,75.20,-12,0.0
...,...,...,...,...,...,...,...,...,...
52555,1094,21,30,0,0,2.4,70.70,-4,0.0
52556,1094,22,0,0,0,2.4,66.79,-4,0.0
52557,1094,22,30,0,0,2.2,66.78,-4,0.0
52558,1094,23,0,0,0,2.1,67.72,-4,0.0


# Dacon baseline 방법론과 기존에 생각했던 방법론 체크

- 내가 생각한 것과는 차이가 있네
- 그리고 얘네 방식을 잘 이해할 수가 없는데?
    - target1: Day7 예측
        - 각 행마다 하루 전(48행 전)의 데이터로 현재 시점의 Target을 예측
        - 즉, 하루 뒤의 Target을 예측하는 데이터셋 구축
    - target2: Day8 예측
        - 각 행마다 하루 전(48행 전)의 데이터로 하루 뒤(48행 뒤)의 Target을 예측
        - 즉, 하루 전의 데이터로 이틀 뒤의 Target을 예측하는 데이터셋 구축

In [18]:
train = pd.read_csv(os.path.join(tr_base_path, 'train.csv'))


def _test_file_order(path):
    """Sort key: numeric file stems in numeric order, any other name after them."""
    stem = os.path.splitext(os.path.basename(path))[0]
    return (0, int(stem), '') if stem.isdecimal() else (1, 0, stem)


# Collect the test file names. Unlike os.listdir, glob returns each match with its
# directory path attached — but in arbitrary order. Sort numerically by file stem
# (so '10.csv' follows '9.csv', not '1.csv') so that anything built by iterating
# over this list has a reproducible row order.
test_files = sorted(glob.glob(os.path.join(te_base_path, '*.csv')), key=_test_file_order)

In [21]:
# Load sample_submission.csv (presumably the submission template; it is not used
# in the cells visible here).
submission = pd.read_csv('./data/sample_submission.csv')

## helper function

In [24]:
# Feature shifting: push each selected column down by N (= lag) rows.
def create_lag_feats(data, lags, cols):
    """Return a copy of ``data`` with lagged feature columns and both target columns.

    For every ``col`` in ``cols`` and every ``lag`` in ``lags`` a column named
    ``'<col>_lag_<lag>'`` is added, holding ``data[col]`` shifted down by ``lag`` rows
    (the first ``lag`` rows are NaN). After the lag columns, two target columns are
    written:

    * ``Target1`` - a copy of ``TARGET`` at the same row.
    * ``Target2`` - ``TARGET`` taken 48 rows later; the trailing rows that have no
      value 48 rows ahead are forward-filled.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain ``TARGET`` and every column named in ``cols``. Not modified.
    lags : iterable of int
        Row offsets passed to ``Series.shift``.
    cols : iterable of str
        Columns to create lagged copies of.

    Returns
    -------
    (pandas.DataFrame, list of str)
        The augmented copy and the names of the lag columns, in creation order.

    Raises
    ------
    KeyError
        If ``TARGET`` or one of ``cols`` is missing from ``data``.
    """
    temp = data.copy()

    lag_cols = []
    for col in cols:
        for lag in lags:
            lag_col = col + '_lag_%s' % lag
            temp[lag_col] = temp[col].shift(lag)
            lag_cols.append(lag_col)

    # The targets do not depend on col/lag, so they are built once, outside the loop;
    # this also guarantees they exist when `lags` or `cols` is empty.
    temp['Target1'] = temp['TARGET']
    # Forward fill: each missing value takes the last valid value above it.
    # `.ffill()` replaces the deprecated `fillna(method='ffill')` spelling.
    # NOTE(review): the last 48 rows therefore all receive the same final value
    # rather than a real observation — consider leaving them NaN so a later
    # dropna() removes them.
    temp['Target2'] = temp['TARGET'].shift(-48).ffill()

    return temp, lag_cols

def preprocess_data(data, target_lags=[48], weather_lags=[48], is_train=True):
    """Build the model input table from a raw frame.

    Lagged copies of ``TARGET`` (using ``target_lags``) and of the weather columns
    ``DHI``, ``DNI``, ``WS``, ``RH``, ``T`` (using ``weather_lags``) are created with
    ``create_lag_feats``. The result keeps ``Hour`` plus those lag columns and, when
    ``is_train`` is truthy, also ``Target1`` and ``Target2``. Rows containing any NaN
    in the kept columns are dropped.

    Parameters
    ----------
    data : pandas.DataFrame
        Raw frame; it is copied, not modified.
    target_lags, weather_lags : list of int
        Row offsets for the lag features. The list defaults are only read, never
        mutated.
    is_train : bool
        Truthy to include the target columns, falsy to return features only.

    Returns
    -------
    pandas.DataFrame
    """
    temp = data.copy()

    # Both modes need exactly the same lag features; only the kept columns differ.
    temp, target_lag_cols = create_lag_feats(temp, target_lags, ['TARGET'])
    temp, weather_lag_cols = create_lag_feats(temp, weather_lags, ['DHI', 'DNI', 'WS', 'RH', 'T'])

    selected = ['Hour'] + target_lag_cols + weather_lag_cols
    # Plain truthiness instead of `== True` / `== False`, so a non-bool flag can no
    # longer fall through both branches and make the function return None.
    if is_train:
        selected = selected + ['Target1', 'Target2']

    return temp[selected].dropna()

In [26]:
# Build the training table with a 48-row lag for both the target and the weather columns.
df_train = preprocess_data(train, target_lags=[48], weather_lags=[48], is_train=True)

In [31]:
# So the rows are shifted by one day (48 rows), i.e. each row pairs the previous
# day's data with that row's target value?
# Ah... the Day column carries no meaning here.
df_train.iloc[:48]

Unnamed: 0,Hour,TARGET_lag_48,DHI_lag_48,DNI_lag_48,WS_lag_48,RH_lag_48,T_lag_48,Target1,Target2
48,0,0.0,0.0,0.0,1.5,69.08,-12.0,0.0,0.0
49,0,0.0,0.0,0.0,1.5,69.06,-12.0,0.0,0.0
50,1,0.0,0.0,0.0,1.6,71.78,-12.0,0.0,0.0
51,1,0.0,0.0,0.0,1.6,71.75,-12.0,0.0,0.0
52,2,0.0,0.0,0.0,1.6,75.2,-12.0,0.0,0.0
53,2,0.0,0.0,0.0,1.5,69.29,-11.0,0.0,0.0
54,3,0.0,0.0,0.0,1.5,72.56,-11.0,0.0,0.0
55,3,0.0,0.0,0.0,1.4,72.55,-11.0,0.0,0.0
56,4,0.0,0.0,0.0,1.3,74.62,-11.0,0.0,0.0
57,4,0.0,0.0,0.0,1.3,74.61,-11.0,0.0,0.0


In [32]:
# First 48 raw training rows, for comparison with the lagged features.
train.iloc[:48]

Unnamed: 0,Day,Hour,Minute,DHI,DNI,WS,RH,T,TARGET
0,0,0,0,0,0,1.5,69.08,-12,0.0
1,0,0,30,0,0,1.5,69.06,-12,0.0
2,0,1,0,0,0,1.6,71.78,-12,0.0
3,0,1,30,0,0,1.6,71.75,-12,0.0
4,0,2,0,0,0,1.6,75.2,-12,0.0
5,0,2,30,0,0,1.5,69.29,-11,0.0
6,0,3,0,0,0,1.5,72.56,-11,0.0
7,0,3,30,0,0,1.4,72.55,-11,0.0
8,0,4,0,0,0,1.3,74.62,-11,0.0
9,0,4,30,0,0,1.3,74.61,-11,0.0


In [38]:
# The next 48 raw training rows (positions 48-95).
train.iloc[48:96]

Unnamed: 0,Day,Hour,Minute,DHI,DNI,WS,RH,T,TARGET
48,1,0,0,0,0,1.6,90.66,-10,0.0
49,1,0,30,0,0,1.6,90.68,-10,0.0
50,1,1,0,0,0,1.6,88.11,-11,0.0
51,1,1,30,0,0,1.6,88.11,-11,0.0
52,1,2,0,0,0,1.6,90.85,-11,0.0
53,1,2,30,0,0,1.6,90.84,-11,0.0
54,1,3,0,0,0,1.7,93.78,-12,0.0
55,1,3,30,0,0,1.7,93.77,-12,0.0
56,1,4,0,0,0,1.7,90.46,-12,0.0
57,1,4,30,0,0,1.6,90.46,-12,0.0


In [33]:
# test 데이터셋
# Test dataset
df_test = []

for test_file in test_files:
    temp = pd.read_csv(test_file)
    # Keep only the last 48 rows of the preprocessed file. Since every feature is
    # lagged by 48 rows, these rows hold the previous day's values — in effect only
    # the Day5 00:00-23:30 data — and that is what is used to predict Day7 and Day8.
    # (Author's note: "ah, now I get it.")
    # NOTE(review): in training, Target1 is TARGET at the same row as these lag-48
    # features and Target2 is TARGET 48 rows later, so the two horizons appear to be
    # one and two days after the feature day — confirm this lines up with the
    # intended Day7/Day8 forecast.
    temp = preprocess_data(temp, target_lags=[48], weather_lags=[48], is_train=False).iloc[-48:]
    df_test.append(temp)

# Stack the per-file feature rows into one frame and show its shape.
X_test = pd.concat(df_test)
X_test.shape

(3888, 7)

In [36]:
# Preview the stacked test features.
X_test

Unnamed: 0,Hour,TARGET_lag_48,DHI_lag_48,DNI_lag_48,WS_lag_48,RH_lag_48,T_lag_48
288,0,0.0,0.0,0.0,0.6,56.18,14.9
289,0,0.0,0.0,0.0,0.6,56.91,14.7
290,1,0.0,0.0,0.0,0.6,57.18,14.6
291,1,0.0,0.0,0.0,0.7,57.93,14.4
292,2,0.0,0.0,0.0,0.7,57.99,14.3
...,...,...,...,...,...,...,...
331,21,0.0,0.0,0.0,1.9,50.48,-5.1
332,22,0.0,0.0,0.0,2.0,49.75,-5.1
333,22,0.0,0.0,0.0,2.1,49.75,-5.1
334,23,0.0,0.0,0.0,2.1,49.22,-5.2


In [37]:
# Preview the full training table (lag features plus Target1/Target2).
df_train

Unnamed: 0,Hour,TARGET_lag_48,DHI_lag_48,DNI_lag_48,WS_lag_48,RH_lag_48,T_lag_48,Target1,Target2
48,0,0.0,0.0,0.0,1.5,69.08,-12.0,0.0,0.0
49,0,0.0,0.0,0.0,1.5,69.06,-12.0,0.0,0.0
50,1,0.0,0.0,0.0,1.6,71.78,-12.0,0.0,0.0
51,1,0.0,0.0,0.0,1.6,71.75,-12.0,0.0,0.0
52,2,0.0,0.0,0.0,1.6,75.20,-12.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...
52555,21,0.0,0.0,0.0,2.4,68.38,-2.0,0.0,0.0
52556,22,0.0,0.0,0.0,2.4,71.09,-3.0,0.0,0.0
52557,22,0.0,0.0,0.0,2.2,71.11,-3.0,0.0,0.0
52558,23,0.0,0.0,0.0,2.1,74.99,-4.0,0.0,0.0
