## Import

In [1]:
import pandas as pd
import numpy as np
import random
import os

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import LabelEncoder

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader 

from tqdm.auto import tqdm

  from .autonotebook import tqdm as notebook_tqdm


## Fixed Random Seed

In [2]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

seed_everything(42) # Seed 고정

In [3]:
def SMAPE(data):
    data["Symmetric Absolute Percentage Error"] = abs((data["Real"] - data["Prediction"]))/((abs(data["Real"]) + abs(data["Prediction"])) / 2) * 100
    smape = data["Symmetric Absolute Percentage Error"].mean()
    return smape

## Data Load

In [25]:
train_df = pd.read_csv('./data/train.csv')
test_df = pd.read_csv('./data/test.csv')
sample_submission = pd.read_csv('./data/sample_submission.csv')

In [16]:
train_df.head()

Unnamed: 0,num_date_time,건물번호,일시,기온(C),강수량(mm),풍속(m/s),습도(%),일조(hr),일사(MJ/m2),전력소비량(kWh)
0,1_20220601 00,1,20220601 00,18.6,,0.9,42.0,,,1085.28
1,1_20220601 01,1,20220601 01,18.0,,1.1,45.0,,,1047.36
2,1_20220601 02,1,20220601 02,17.7,,1.5,45.0,,,974.88
3,1_20220601 03,1,20220601 03,16.7,,1.4,48.0,,,953.76
4,1_20220601 04,1,20220601 04,18.4,,2.8,43.0,,,986.4


In [17]:
test_df.head()

Unnamed: 0,num_date_time,건물번호,일시,기온(C),강수량(mm),풍속(m/s),습도(%)
0,1_20220825 00,1,20220825 00,23.5,0.0,2.2,72
1,1_20220825 01,1,20220825 01,23.0,0.0,0.9,72
2,1_20220825 02,1,20220825 02,22.7,0.0,1.5,75
3,1_20220825 03,1,20220825 03,22.1,0.0,1.3,78
4,1_20220825 04,1,20220825 04,21.8,0.0,1.0,77


## Train Data Pre-processing

In [54]:
def preprocessing(df, target_column='전력소비량(kWh)'):
    # 일조, 일사 열 제거
    unused_columns = ['일조(hr)','일사(MJ/m2)']
    for uc in unused_columns:
        if uc in df.columns:
            df = df.drop(columns=[uc])
    if target_column in df.columns:
        y = df[target_column]
        df = df.drop(columns=[target_column])
    else:
        y = None
    print("===== 결측치")
    print(df.isna().sum())

    # 강수량 결측치 0.0으로 채우기
    df['강수량(mm)'].fillna(0.0, inplace=True)

    # 풍속, 습도 결측치 평균으로 채우고 반올림하기
    df['풍속(m/s)'].fillna(round(df['풍속(m/s)'].mean(),2), inplace=True)
    df['습도(%)'].fillna(round(df['습도(%)'].mean(),2), inplace=True)

    df['month'] = df['일시'].apply(lambda x : float(x[4:6]))
    df['day'] = df['일시'].apply(lambda x : float(x[6:8]))
    df['time'] = df['일시'].apply(lambda x : float(x[9:11]))
    df = df.drop(columns=['num_date_time', '건물번호', '일시'])
    return df, y

In [55]:
X_train, y_train = preprocessing(train_df)

===== 결측치
num_date_time         0
건물번호                  0
일시                    0
기온(C)                 0
강수량(mm)          160069
풍속(m/s)              19
습도(%)                 9
dtype: int64


In [56]:
X_test, y_test = preprocessing(test_df)

===== 결측치
num_date_time    0
건물번호             0
일시               0
기온(C)            0
강수량(mm)          0
풍속(m/s)          0
습도(%)            0
month            0
day              0
time             0
dtype: int64


In [57]:
X_train

Unnamed: 0,기온(C),강수량(mm),풍속(m/s),습도(%),month,day,time
0,18.6,0.0,0.9,42.0,6.0,1.0,0.0
1,18.0,0.0,1.1,45.0,6.0,1.0,1.0
2,17.7,0.0,1.5,45.0,6.0,1.0,2.0
3,16.7,0.0,1.4,48.0,6.0,1.0,3.0
4,18.4,0.0,2.8,43.0,6.0,1.0,4.0
...,...,...,...,...,...,...,...
203995,23.1,0.0,0.9,86.0,8.0,24.0,19.0
203996,22.4,0.0,1.3,86.0,8.0,24.0,20.0
203997,21.3,0.0,1.0,92.0,8.0,24.0,21.0
203998,21.0,0.0,0.3,94.0,8.0,24.0,22.0


In [36]:
X_test

Unnamed: 0,num_date_time,건물번호,일시,기온(C),강수량(mm),풍속(m/s),습도(%),month,day,time
0,1_20220825 00,1,20220825 00,23.5,0.0,2.2,72,8.0,25.0,0.0
1,1_20220825 01,1,20220825 01,23.0,0.0,0.9,72,8.0,25.0,1.0
2,1_20220825 02,1,20220825 02,22.7,0.0,1.5,75,8.0,25.0,2.0
3,1_20220825 03,1,20220825 03,22.1,0.0,1.3,78,8.0,25.0,3.0
4,1_20220825 04,1,20220825 04,21.8,0.0,1.0,77,8.0,25.0,4.0
...,...,...,...,...,...,...,...,...,...,...
16795,100_20220831 19,100,20220831 19,22.5,0.0,0.9,84,8.0,31.0,19.0
16796,100_20220831 20,100,20220831 20,20.7,0.0,0.4,95,8.0,31.0,20.0
16797,100_20220831 21,100,20220831 21,20.2,0.0,0.4,98,8.0,31.0,21.0
16798,100_20220831 22,100,20220831 22,20.1,0.0,1.1,97,8.0,31.0,22.0


## Hyperparameter Setting

In [138]:
# 하이퍼파라미터
input_size = 7  # feature의 개수
hidden_size = 64
num_layers = 2
output_size = 1
num_epochs = 5
window_size = 24  # 예측에 사용될 시간 윈도우 크기
batch_size = 64
learning_rate = 0.001

## Dataset

In [41]:
X_train

Unnamed: 0,num_date_time,건물번호,일시,기온(C),강수량(mm),풍속(m/s),습도(%),month,day,time
0,1_20220601 00,1,20220601 00,18.6,0.0,0.9,42.0,6.0,1.0,0.0
1,1_20220601 01,1,20220601 01,18.0,0.0,1.1,45.0,6.0,1.0,1.0
2,1_20220601 02,1,20220601 02,17.7,0.0,1.5,45.0,6.0,1.0,2.0
3,1_20220601 03,1,20220601 03,16.7,0.0,1.4,48.0,6.0,1.0,3.0
4,1_20220601 04,1,20220601 04,18.4,0.0,2.8,43.0,6.0,1.0,4.0
...,...,...,...,...,...,...,...,...,...,...
203995,100_20220824 19,100,20220824 19,23.1,0.0,0.9,86.0,8.0,24.0,19.0
203996,100_20220824 20,100,20220824 20,22.4,0.0,1.3,86.0,8.0,24.0,20.0
203997,100_20220824 21,100,20220824 21,21.3,0.0,1.0,92.0,8.0,24.0,21.0
203998,100_20220824 22,100,20220824 22,21.0,0.0,0.3,94.0,8.0,24.0,22.0


In [164]:
class TimeSeriesDataset(Dataset):
    def __init__(self, X, y, window_size):
        """
        Args:
            X: pd.DataFrame
            y: pd.Seires
        """
        # to numpy
        self.X = X
        if y is not None:
            print(len(self.X), len(y))
            assert len(self.X) == len(y), 'len(X) != len(y)'
            self.y = y.values
        else:
            self.y = y
        self.window_size = window_size

    def __len__(self):
        return len(self.X) - self.window_size

    def __getitem__(self, idx):
        x = torch.tensor(self.X[idx:idx+self.window_size], dtype=torch.float)
        if self.y is not None:
            y = torch.tensor(self.y[idx+self.window_size], dtype=torch.float)
            return x, y
        else:
            return x

def create_data_loader(X, y, window_size, batch_size):
    dataset = TimeSeriesDataset(X, y, window_size)
    data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=False)
    return data_loader

In [94]:
# normalization
scaler = MinMaxScaler()
train_data = scaler.fit_transform(X_train)

In [95]:
train_loader = create_data_loader(train_data, y_train, window_size, batch_size)

204000 204000


## Model Define

In [129]:
class LSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super(LSTM, self).__init__()

        self.hidden_size = hidden_size
        self.num_layers = num_layers

        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device) 
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)

        out, _ = self.lstm(x, (h0, c0))
        out = self.fc(out[:, -1, :])

        return out

In [139]:
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"current device: {device}")

model = LSTM(input_size, hidden_size, num_layers, output_size).to(device)

criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

current device: cpu


## Train

In [200]:
for epoch in range(3):
    for i, (inputs, labels) in enumerate(train_loader):
        inputs = inputs.to(device)
        labels = labels.unsqueeze(1).to(device)

        # Forward
        outputs = model(inputs)
        loss = criterion(outputs, labels)

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if (i+1) % 300 == 0:
            print ('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}' 
                   .format(epoch+1, num_epochs, i+1, len(train_loader), loss.item()))

Epoch [1/5], Step [300/3188], Loss: 14438441.0000
Epoch [1/5], Step [600/3188], Loss: 2671875.5000
Epoch [1/5], Step [900/3188], Loss: 1366537.1250
Epoch [1/5], Step [1200/3188], Loss: 3030152.5000
Epoch [1/5], Step [1500/3188], Loss: 1076236.1250
Epoch [1/5], Step [1800/3188], Loss: 3130744.0000
Epoch [1/5], Step [2100/3188], Loss: 8264.1748
Epoch [1/5], Step [2400/3188], Loss: 387863.5000
Epoch [1/5], Step [2700/3188], Loss: 4240804.0000
Epoch [1/5], Step [3000/3188], Loss: 369167.0000
Epoch [2/5], Step [300/3188], Loss: 13395888.0000
Epoch [2/5], Step [600/3188], Loss: 2280156.2500
Epoch [2/5], Step [900/3188], Loss: 1073284.6250
Epoch [2/5], Step [1200/3188], Loss: 2706743.5000
Epoch [2/5], Step [1500/3188], Loss: 809189.1250
Epoch [2/5], Step [1800/3188], Loss: 2701905.0000
Epoch [2/5], Step [2100/3188], Loss: 10944.8262
Epoch [2/5], Step [2400/3188], Loss: 241722.7969
Epoch [2/5], Step [2700/3188], Loss: 3708126.7500
Epoch [2/5], Step [3000/3188], Loss: 230960.7188
Epoch [3/5], S

## Test Data Pre-processing

In [152]:
X_test, y_test = preprocessing(test_df)

===== 결측치
num_date_time    0
건물번호             0
일시               0
기온(C)            0
강수량(mm)          0
풍속(m/s)          0
습도(%)            0
month            0
day              0
time             0
dtype: int64


In [153]:
X_test = pd.concat([X_train[204000-24:], X_test]).reset_index(drop=True)

In [155]:
X_test = scaler.transform(X_test) # train과 동일하게 scaling

In [None]:
# Dataset & DataLoader
test_dataset = TimeSeriesDataset(test_data, window_size)
test_loader = DataLoader(test_dataset, batch_size=1, shuffle=False)

In [165]:
test_loader = create_data_loader(X_test, None, window_size, batch_size)

## Test Dataset

## Inference

In [201]:
test_predictions = []
for inputs in test_loader:
    inputs = inputs.to(device)
    # Forward
    outputs = model(inputs)
    test_predictions.append(outputs)

In [179]:
torch.tensor(X_test, dtype=torch.float)

tensor([[0.4444, 0.0000, 0.0451,  ..., 1.0000, 0.7667, 0.0000],
        [0.4111, 0.0000, 0.0000,  ..., 1.0000, 0.7667, 0.0435],
        [0.4000, 0.0000, 0.0150,  ..., 1.0000, 0.7667, 0.0870],
        ...,
        [0.3741, 0.0000, 0.0301,  ..., 1.0000, 1.0000, 0.9130],
        [0.3704, 0.0000, 0.0827,  ..., 1.0000, 1.0000, 0.9565],
        [0.3333, 0.0000, 0.0075,  ..., 1.0000, 1.0000, 1.0000]])

In [178]:
model(torch.tensor(X_test, dtype=torch.float))

RuntimeError: For unbatched 2-D input, hx and cx should also be 2-D but got (3-D, 3-D) tensors

In [173]:
model(X_test)

TypeError: 'int' object is not callable

In [78]:
model.eval()

test_predictions = []

with torch.no_grad():
    for i in range(test_data.shape[0] - window_size):
        x = torch.Tensor(test_data[i:i+window_size,:]).to(device)
        new_x = model(x.view(1,window_size,-1))
        
        test_data[i+window_size,-1] = new_x # 입력 업데이트
        test_predictions.append(new_x.detach().cpu().numpy().item()) # 예측 결과 저장

## Submit

In [80]:
predictions = scaler.inverse_transform(test_data)[24:,-1] # 원래 scale로 복구

In [81]:
sample_submission['answer'] = predictions
sample_submission.head()

Unnamed: 0,num_date_time,answer
0,1_20220825 00,470.485145
1,1_20220825 01,509.742162
2,1_20220825 02,539.874985
3,1_20220825 03,556.041365
4,1_20220825 04,605.924235


In [82]:
sample_submission.to_csv('lstm_baseline_submission.csv', index=False)