In [1]:
import os
import random

import numpy as np
import pandas as pd

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import make_column_transformer
from sklearn.pipeline import Pipeline

import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader

import warnings
warnings.filterwarnings(action='ignore')

In [2]:
def fix_seed(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True

fix_seed(42)

In [3]:
DATA_DIR = '/Users/mungeonhui/git/AI_model/dacon/2023전력사용량예측AI경진대회/open'
train_csv = os.path.join(DATA_DIR, 'train.csv')
test_csv = os.path.join(DATA_DIR, 'test.csv')
building_csv = os.path.join(DATA_DIR, 'building_info.csv')

In [4]:
train_set = pd.read_csv(train_csv)
test_set = pd.read_csv(test_csv)
building_info = pd.read_csv(building_csv)

train_df = pd.merge(train_set, building_info, left_on='건물번호', right_on='건물번호')
test_df = pd.merge(test_set, building_info, left_on='건물번호', right_on='건물번호')

In [5]:
# feature, label 나누기
train_label = train_df['전력소비량(kWh)']
train_feature = train_df.drop(columns=['전력소비량(kWh)'])

train_label.shape, train_feature.shape

((204000,), (204000, 15))

In [33]:
class DropField(BaseEstimator, TransformerMixin):
    def __init__(self):
        print("initializing drop field")
        self.cols = ["num_date_time", "건물번호", "일조(hr)", "일사(MJ/m2)"]
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        columns = X.columns
        cols = [col for col in self.cols if col in columns]
        X_ = X.copy()
        X_ = X_.drop(columns=cols)
        return X_


class GetTimeData(BaseEstimator, TransformerMixin):
    def __init__(self):
        print("initializing time transformer")
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        X_ = X.copy()
        X_['month'] = X_['일시'].apply(lambda x : int(x[4:6]))
        X_['day'] = X_['일시'].apply(lambda x : int(x[6:8]))
        X_['time'] = X_['일시'].apply(lambda x : int(x[9:11]))
        X_ = X_.drop(columns=['일시'])
        return X_


class TextImputer(BaseEstimator, TransformerMixin):
    def __init__(self, ):
        print("initialising text transformer")
        self.cols = ["태양광용량(kW)", "ESS저장용량(kWh)", "PCS용량(kW)"]
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        X_ = X.copy()
        
        for col in self.cols:
            X_[col] = X_[col].replace('-', 0).astype("float64")
        return X_

class MeanImputer(BaseEstimator, TransformerMixin):
    def __init__(self,):
        self.imputer = SimpleImputer()
        self.cols = ['풍속(m/s)', '습도(%)']
        
    def fit(self, X, y=None):
        self.imputer.fit(X[self.cols])
        return self

    def transform(self, X):
        X_ = X.copy()
        X_[self.cols] = self.imputer.transform(X_[self.cols])
        return X_

class ValueImputer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.cols = ['강수량(mm)']
    
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X_ = X.copy()
        X_[self.cols] = X_[self.cols].fillna(0)
        return X_
    
scale_cols = [
    '풍속(m/s)', '습도(%)',
    '강수량(mm)', '기온(C)',
    '연면적(m2)', '냉방면적(m2)',
    '태양광용량(kW)', 'ESS저장용량(kWh)',
    'PCS용량(kW)', 'month', 'time', 'day'
]
    
column_transformer = make_column_transformer(
    (StandardScaler(), scale_cols),
    (OneHotEncoder(), ['건물유형']), 
    remainder='passthrough'
)

pipeline = Pipeline([
    ('drop_field', DropField()), 
    ('time_spliter', GetTimeData()),
    ('text_imputer', TextImputer()),
    ('mean_imputer', MeanImputer()),
    ('value_imputer', ValueImputer()),
    ('column_transformer', column_transformer),
])

initializing drop field
initializing time transformer
initialising text transformer


In [34]:
test_feature = train_feature.copy()
transformed = pipeline.fit_transform(test_feature)

In [40]:
class TimeSeriesDataset(Dataset):
    def __init__(self, data, window_size, is_train, labels=None):
        self.data = data
        self.window_size = window_size
        self.is_train = is_train
        if is_train:
            self.labels = labels
    
    def __len__(self):
        return len(self.data)
    
    def __getitem__(self, idx):
        quotient = idx // self.window_size
        remainder = idx % self.window_size
        label_idx = idx
        if remainder == 0:
            label_idx += self.window_size
        label_idx -= 1
            
        x = torch.concat([
            torch.tensor(self.data[idx:self.window_size * (quotient + 1), :], dtype=torch.float),
            torch.tensor(self.data[self.window_size * (quotient):idx, :], dtype=torch.float)
        ], dim=0)
        if self.is_train:
            y = torch.tensor(self.labels[label_idx], dtype=torch.float)
        else:
            y = None        
        return x, y


def create_data_loader(data, window_size=24, batch_size=64, is_train=False, labels=None):
    dataset = TimeSeriesDataset(data, window_size, is_train, labels=labels)
    data_loader = DataLoader(dataset, batch_size=batch_size, shuffle=False)
    return data_loader


In [41]:
class LSTM(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size):
        super(LSTM, self).__init__()
        
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.fc = nn.Sequential(
            nn.Linear(hidden_size, 1000),
            nn.Linear(1000, output_size)
        )
    
    def forward(self, x):
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(x.device)
        
        out, _ = self.lstm(x, (h0, c0))
        out = self.fc(out[:, -1, :])
        return out

In [42]:
input_size = 24 # feature의 수
hidden_size = 64
num_layers = 2
output_size = 1
num_epochs = 30
window_size = 24
batch_size = 64
learning_rate = 0.01
device = torch.device('mps' if torch.backends.mps.is_available() else 'cpu')
print(f"device is {device}")
print(f"MPS 장치를 지원하도록 build 되었는지: {torch.backends.mps.is_built()}")
print(f"MPS 장치가 사용 가능한지: {torch.backends.mps.is_available()}") 
!python -c 'import platform;print(platform.platform())'

device is mps
MPS 장치를 지원하도록 build 되었는지: True
MPS 장치가 사용 가능한지: True
macOS-13.5.1-arm64-arm-64bit


In [43]:
import time
import copy


model = LSTM(input_size, hidden_size, num_layers, output_size).to(device)

criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.9)

def train(dataloader, num_epochs, device):
    since = time.time()
    
    best_model_wts = copy.deepcopy(model.state_dict())
    best_loss = float('inf')
    
    train_loss_list = []
    
    model.train()
    
    for epoch in range(num_epochs):
        running_loss = 0.0
        iteration_count = 0
        
        for i, (inputs, labels) in enumerate(dataloader):
            iteration_count += len(inputs)
            inputs = inputs.to(device)
            labels = labels.to(device)
            
            # Forward
            preds = model(inputs)
            loss = criterion(preds, labels)
            
            # Backward
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            
            running_loss += loss.item() * inputs.size(0)
            
            if (i+1) % 1000 == 0:
                print ('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}' 
                    .format(epoch+1, num_epochs, i+1, len(dataloader), loss.item()))

        scheduler.step()
        
        epoch_loss = running_loss / iteration_count
        train_loss_list.append(epoch_loss)
    
        if best_loss > epoch_loss:
            print(f"Best model detected prev loss {best_loss} new loss {epoch_loss}")
            best_loss = epoch_loss
            best_model_wts = copy.deepcopy(model.state_dict())
    
    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))
    print('Best RMSE: {:4f}'.format(best_loss))

    # load best model weights
    model.load_state_dict(best_model_wts)
    return model, train_loss_list


In [49]:
train_loader = create_data_loader(transformed, window_size, batch_size, True, train_label.to_numpy())
best_model, train_losses = train(train_loader, num_epochs, device)

Epoch [1/30], Step [1000/3188], Loss: 41203828.0000
Epoch [1/30], Step [2000/3188], Loss: 66347.8281
Epoch [1/30], Step [3000/3188], Loss: 1816913.0000
Best model detected prev loss inf new loss 2649738.9288301165
Epoch [2/30], Step [1000/3188], Loss: 34675420.0000
Epoch [2/30], Step [2000/3188], Loss: 219814.8750
Epoch [2/30], Step [3000/3188], Loss: 2602596.5000
Epoch [3/30], Step [1000/3188], Loss: 50347652.0000
Epoch [3/30], Step [2000/3188], Loss: 518166.0000
Epoch [3/30], Step [3000/3188], Loss: 463499.1562
Epoch [4/30], Step [1000/3188], Loss: 48745908.0000
Epoch [4/30], Step [2000/3188], Loss: 366325.4375
Epoch [4/30], Step [3000/3188], Loss: 682140.5625
Epoch [5/30], Step [1000/3188], Loss: 39316840.0000
Epoch [5/30], Step [2000/3188], Loss: 218175.7500


KeyboardInterrupt: 

In [41]:
def evaluate(model, dataloader, device):
    model.eval()
    
    preds = []
    for inputs, _ in dataloader:
        inputs = inputs.to(device)
        
        pred = model(inputs)
        preds += pred.cpu().detach().numpy().flatten().tolist()
    
    return preds

train_pred = evaluate(best_model, train_loader, device)
train_pred

[1967.9224853515625,
 1949.348388671875,
 1938.040283203125,
 1909.1368408203125,
 1947.8382568359375,
 1959.7479248046875,
 1998.9208984375,
 2009.8302001953125,
 2026.4693603515625,
 2020.349853515625,
 2008.127685546875,
 2017.8560791015625,
 1993.3424072265625,
 1971.8951416015625,
 1982.6602783203125,
 1987.4810791015625,
 1989.1185302734375,
 1986.798828125,
 1996.712890625,
 2008.58740234375,
 2009.71533203125,
 2032.5,
 2029.6109619140625,
 2046.98828125,
 2069.7744140625,
 2123.124755859375,
 2162.072021484375,
 2088.489013671875,
 2056.927978515625,
 1994.8843994140625,
 1935.1673583984375,
 1984.8453369140625,
 1985.5836181640625,
 2003.9725341796875,
 2006.4134521484375,
 1996.1915283203125,
 1956.8990478515625,
 1944.0560302734375,
 1917.0892333984375,
 1862.2413330078125,
 1873.6143798828125,
 1895.4090576171875,
 1928.900634765625,
 1953.6298828125,
 1964.83056640625,
 1961.4482421875,
 1924.5106201171875,
 1949.3924560546875,
 1888.792724609375,
 2008.2318115234375,
 20