In [14]:
import os
import random
import sys

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import make_column_transformer
from sklearn.pipeline import Pipeline

from sktime.forecasting.model_selection import temporal_train_test_split
from sktime.utils.plotting import plot_series

import xgboost as xgb
from xgboost import XGBRegressor

import warnings
warnings.filterwarnings(action='ignore')

In [15]:
def fix_seed(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

fix_seed(42)

In [16]:
DATA_DIR = '/Users/mungeonhui/git/AI_model/dacon/2023전력사용량예측AI경진대회/open'
train_csv = os.path.join(DATA_DIR, 'train.csv')
test_csv = os.path.join(DATA_DIR, 'test.csv')
building_csv = os.path.join(DATA_DIR, 'building_info.csv')

In [17]:
train_set = pd.read_csv(train_csv)
test_set = pd.read_csv(test_csv)
building_info = pd.read_csv(building_csv)

train_df = pd.merge(train_set, building_info, left_on='건물번호', right_on='건물번호')
test_df = pd.merge(test_set, building_info, left_on='건물번호', right_on='건물번호')

In [18]:
# feature, label 나누기
train_label = train_df['전력소비량(kWh)']
train_feature = train_df.drop(columns=['전력소비량(kWh)'])

train_label.shape, train_feature.shape

((204000,), (204000, 15))

In [19]:
class GetTimeData(BaseEstimator, TransformerMixin):
    def __init__(self):
        print("initializing time transformer")
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        X_ = X.copy()
        X_['month'] = X_['일시'].apply(lambda x : int(x[4:6]))
        X_['day'] = X_['일시'].apply(lambda x : int(x[6:8]))
        X_['time'] = X_['일시'].apply(lambda x : int(x[9:11]))
        X_ = X_.drop(columns=['일시'])
        return X_


class TextImputer(BaseEstimator, TransformerMixin):
    def __init__(self, ):
        print("initialising text transformer")
        self.cols = ["태양광용량(kW)", "ESS저장용량(kWh)", "PCS용량(kW)"]
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        X_ = X.copy()
        
        for col in self.cols:
            X_[col] = X_[col].replace('-', 0).astype("float64")
        return X_

class MeanImputer(BaseEstimator, TransformerMixin):
    def __init__(self,):
        print("Initializing Mean Imputer")
        self.imputer = SimpleImputer()
        self.cols = ['풍속(m/s)', '습도(%)']
        
    def fit(self, X, y=None):
        self.imputer.fit(X[self.cols])
        return self

    def transform(self, X):
        X_ = X.copy()
        X_[self.cols] = self.imputer.transform(X_[self.cols])
        return X_

class ValueImputer(BaseEstimator, TransformerMixin):
    def __init__(self):
        print("Initializing Value Imputer")
        self.cols = ['강수량(mm)']
    
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X_ = X.copy()
        X_[self.cols] = X_[self.cols].fillna(0)
        return X_

class GetDrivenVar(BaseEstimator, TransformerMixin):
    def __init__(self):
        print("Initializing Get Driven variable estimator")
        
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X_ = X
        X_['THI'] = 9 / 5 * X_['기온(C)'] - 0.55   \
                    * (1 - X_['습도(%)'] / 100)    \
                    * (9 / 5 * X_['습도(%)'] - 26) \
                    + 32
                    
        return X_

class DropField(BaseEstimator, TransformerMixin):
    def __init__(self):
        print("initializing drop field")
        self.cols = ["num_date_time", "건물번호", "일조(hr)", "일사(MJ/m2)"]
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        columns = X.columns
        cols = [col for col in self.cols if col in columns]
        X_ = X.copy()
        X_ = X_.drop(columns=cols)
        return X_
      
      
scale_cols = [
    '풍속(m/s)', '습도(%)',
    '강수량(mm)', '기온(C)',
    '연면적(m2)', '냉방면적(m2)',
    '태양광용량(kW)', 'ESS저장용량(kWh)',
    'PCS용량(kW)', 'month', 'time', 'day', 'THI'
]
    
column_transformer = make_column_transformer(
    (StandardScaler(), scale_cols),
    (OneHotEncoder(), ['건물유형']), 
    remainder='passthrough'
)

pipeline = Pipeline([
    ('time_spliter', GetTimeData()),
    ('text_imputer', TextImputer()),
    ('mean_imputer', MeanImputer()),
    ('value_imputer', ValueImputer()),
    ('driven_variable', GetDrivenVar()),
    ('drop_field', DropField()), 
    ('column_transformer', column_transformer),
])

initializing time transformer
initialising text transformer
Initializing Mean Imputer
Initializing Value Imputer
Initializing Get Driven variable estimator
initializing drop field


In [20]:
transformed = pipeline.fit_transform(train_feature)
transformed_test = pipeline.fit_transform(test_df)

transformed.shape, transformed_test.shape


((204000, 25), (16800, 25))

In [21]:
def train_test_split(features, labels, val_hour):
    if val_hour == 0:
        return labels, None, features, None
    else:
        y_train, y_valid, x_train, x_valid = temporal_train_test_split(y=labels, X=features, test_size=val_hour)
        return y_train, y_valid, x_train, x_valid        

def SMAPE(y_true, y_pred):
    return np.mean((np.abs(y_true - y_pred)) / (np.abs(y_true) + np.abs(y_pred))) * 100

def weighted_mse(alpha=1):
    def weighted_mse_fixed(label, pred):
        residual = (label - pred).astype('float')
        grad = np.where(residual > 0, -2 * alpha * residual, -2 * residual)
        hess = np.where(residual > 0, 2 * alpha, 2.0)
        return grad, hess
    return weighted_mse_fixed

In [22]:
def fit_XGB(train, test, num, path='./', param=None, seed=42):
    features, labels = train
    y_train, y_valid, x_train, x_valid = train_test_split(features, labels, 24 * 7)
    
    if param is None:
        print(f"Setting Default Param")
        min_child_weight = 6
        max_depth = 5
        colsample_bytree = .8
        subsample = .9
    else:
        assert len(param) == 6, f"Param Invalid, param have to have 6 length but recieved param {len(param)} length"
        min_child_weight = param.min_child_weight
        max_depth = int(param.max_depth)
        colsample_bytree = param.colsample_bytree
        subsample = param.subsample
        
    xgb_reg = XGBRegressor(
        n_estimators=10000, eta=0.01,
        min_child_weight=min_child_weight,
        max_depth=max_depth,
        colsample_bytree=colsample_bytree,
        subsample=subsample,
        seed=seed,
        gpu_id=1,
        tree_method='gpu_hist',
        predictor='gpu_predictor'
    )
    xgb_reg.fit(
        x_train, y_train, eval_set=[(x_train, y_train), (x_valid, y_valid)],
        early_stopping_rounds=300,
        verbose=True
    )
    
    pred = xgb_reg.predict(x_valid)
    pred = pd.Series(pred)
    pred.index = np.arange(y_valid.index[0], y_valid.index[-1] + 1)
    
    # Test
    # 건물 유형에 따른 예측 결과 확인
    result = []
    
    for i in range(13, 25):
        x_test_i = test[test[i] == 1]
        y_pred_test = xgb_reg.predict(x_test_i)
        test_series = pd.Series(
            y_pred_test, 
            index=np.arange((y_valid.index.max() + 1), (y_valid.index.max() + 1 + len(y_pred_test)))
        )
        plot_series(y_train, y_valid, pred, test_series, markers=[',',',',',',','])
        plt.title(f"{num}_{i}")
        os.makedirs(os.path.join(path, 'results'), exist_ok=True)
        plt.savefig(os.path.join(path, 'results', f"{num}_{i}"))
        
        smape_val = SMAPE(y_valid, pred)
        print(f"best iterations: {xgb_reg.best_iteration}")        
        print(f"SMAPE: {smape_val}")
        result.append([smape_val, y_valid, pred, y_pred_test])
    
    return result

In [23]:

result = fit_XGB((transformed, train_label), transformed_test, 0)

Setting Default Param


XGBoostError: [15:34:29] /Users/runner/work/xgboost/xgboost/python-package/build/temp.macosx-11.0-arm64-cpython-38/xgboost/src/gbm/../common/common.h:239: XGBoost version not compiled with GPU support.
Stack trace:
  [bt] (0) 1   libxgboost.dylib                    0x0000000176be59f8 dmlc::LogMessageFatal::~LogMessageFatal() + 124
  [bt] (1) 2   libxgboost.dylib                    0x0000000176ccb248 xgboost::gbm::GBTree::ConfigureUpdaters() + 476
  [bt] (2) 3   libxgboost.dylib                    0x0000000176cc63a0 xgboost::gbm::GBTree::Configure(std::__1::vector<std::__1::pair<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>>, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>>>, std::__1::allocator<std::__1::pair<std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>>, std::__1::basic_string<char, std::__1::char_traits<char>, std::__1::allocator<char>>>>> const&) + 952
  [bt] (3) 4   libxgboost.dylib                    0x0000000176ce61e0 xgboost::LearnerConfiguration::Configure() + 1124
  [bt] (4) 5   libxgboost.dylib                    0x0000000176bfe86c XGBoosterBoostedRounds + 104
  [bt] (5) 6   libffi.8.dylib                      0x00000001031f804c ffi_call_SYSV + 76
  [bt] (6) 7   libffi.8.dylib                      0x00000001031f5834 ffi_call_int + 1404
  [bt] (7) 8   _ctypes.cpython-311-darwin.so       0x000000010345c140 _ctypes_callproc + 752
  [bt] (8) 9   _ctypes.cpython-311-darwin.so       0x00000001034564a4 PyCFuncPtr_call + 228

