In [1]:
!pip install sktime

Collecting sktime
  Downloading sktime-0.22.0-py3-none-any.whl (17.5 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m17.5/17.5 MB[0m [31m62.2 MB/s[0m eta [36m0:00:00[0m
Collecting scikit-base<0.6.0 (from sktime)
  Downloading scikit_base-0.5.1-py3-none-any.whl (118 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m118.7/118.7 kB[0m [31m12.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: scikit-base, sktime
Successfully installed scikit-base-0.5.1 sktime-0.22.0


In [2]:
import os
import random
import sys

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, OrdinalEncoder
from sklearn.compose import make_column_transformer
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.model_selection import GridSearchCV

from sktime.utils.plotting import plot_series

import xgboost as xgb
from xgboost import XGBRegressor

import warnings
warnings.filterwarnings(action='ignore')



In [3]:
def fix_seed(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

fix_seed(42)

In [4]:
DATA_DIR = '/kaggle/input/electro-data'
train_csv = os.path.join(DATA_DIR, 'train.csv')
test_csv = os.path.join(DATA_DIR, 'test.csv')
building_csv = os.path.join(DATA_DIR, 'building_info.csv')

In [5]:
train_set = pd.read_csv(train_csv)
test_set = pd.read_csv(test_csv)
building_info = pd.read_csv(building_csv)

train_df = pd.merge(train_set, building_info, left_on='건물번호', right_on='건물번호')
test_df = pd.merge(test_set, building_info, left_on='건물번호', right_on='건물번호')

In [6]:
# feature, label 나누기
train_label = train_df['전력소비량(kWh)']
train_feature = train_df.drop(columns=['전력소비량(kWh)'])

train_label.shape, train_feature.shape

((204000,), (204000, 15))

In [7]:
class GetTimeData(BaseEstimator, TransformerMixin):
    def __init__(self):
        print("initializing time transformer")

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X_ = X.copy()
        X_['month'] = X_['일시'].apply(lambda x : int(x[4:6]))
        X_['day'] = X_['일시'].apply(lambda x : int(x[6:8]))
        X_['time'] = X_['일시'].apply(lambda x : int(x[9:11]))
        X_ = X_.drop(columns=['일시'])
        return X_


class TextImputer(BaseEstimator, TransformerMixin):
    def __init__(self, ):
        print("initialising text transformer")
        self.cols = ["태양광용량(kW)", "ESS저장용량(kWh)", "PCS용량(kW)"]

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X_ = X.copy()

        for col in self.cols:
            X_[col] = X_[col].replace('-', 0).astype("float64")
        return X_

class MeanImputer(BaseEstimator, TransformerMixin):
    def __init__(self,):
        print("Initializing Mean Imputer")
        self.imputer = SimpleImputer()
        self.cols = ['풍속(m/s)', '습도(%)']

    def fit(self, X, y=None):
        self.imputer.fit(X[self.cols])
        return self

    def transform(self, X):
        X_ = X.copy()
        X_[self.cols] = self.imputer.transform(X_[self.cols])
        return X_

class ValueImputer(BaseEstimator, TransformerMixin):
    def __init__(self):
        print("Initializing Value Imputer")
        self.cols = ['강수량(mm)']

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X_ = X.copy()
        X_[self.cols] = X_[self.cols].fillna(0)
        return X_

class GetDrivenVar(BaseEstimator, TransformerMixin):
    def __init__(self):
        print("Initializing Get Driven variable estimator")

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X_ = X
        X_['THI'] = 9 / 5 * X_['기온(C)'] - 0.55   \
                    * (1 - X_['습도(%)'] / 100)    \
                    * (9 / 5 * X_['습도(%)'] - 26) \
                    + 32

        return X_

class DropField(BaseEstimator, TransformerMixin):
    def __init__(self):
        print("initializing drop field")
        self.cols = ["num_date_time", "건물번호", "일조(hr)", "일사(MJ/m2)"]

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        columns = X.columns
        cols = [col for col in self.cols if col in columns]
        X_ = X.copy()
        X_ = X_.drop(columns=cols)
        return X_


scale_cols = [
    '풍속(m/s)', '습도(%)',
    '강수량(mm)', '기온(C)',
    '연면적(m2)', '냉방면적(m2)',
    '태양광용량(kW)', 'ESS저장용량(kWh)',
    'PCS용량(kW)', 'THI'
]

column_transformer = make_column_transformer(
    (StandardScaler(), scale_cols),
    (OrdinalEncoder(), ['건물유형']),
    remainder='passthrough'
)

pipeline = Pipeline([
    ('time_spliter', GetTimeData()),
    ('text_imputer', TextImputer()),
    ('mean_imputer', MeanImputer()),
    ('value_imputer', ValueImputer()),
    ('driven_variable', GetDrivenVar()),
    ('drop_field', DropField()),
    ('column_transformer', column_transformer),
])

initializing time transformer
initialising text transformer
Initializing Mean Imputer
Initializing Value Imputer
Initializing Get Driven variable estimator
initializing drop field


In [8]:
transformed = pipeline.fit_transform(train_feature)
transformed_test = pipeline.transform(test_df)

transformed.shape, transformed_test.shape

((204000, 14), (16800, 14))

In [9]:
def change_order(array):
    times = array[:, -3:].copy()
    array[:, -1] = array[:, -4]
    array[:, -4:-1] = times
    return array

transformed = change_order(transformed)
transformed_test = change_order(transformed_test)

In [10]:
num_type = len(np.unique(transformed[:, -1]))
num_type

12

In [11]:
transformed_ = transformed.copy()
mask = np.logical_and(transformed_[:, -4] >= 8, transformed_[:, -3] >= 20)
x_valid, y_valid = transformed_[mask], train_label[mask]
x_train, y_train = transformed_[~mask], train_label[~mask]
x_train.shape, y_train.shape, x_valid.shape, y_valid.shape

((192000, 14), (192000,), (12000, 14), (12000,))

In [12]:
def SMAPE(y_true, y_pred):
    return np.mean((np.abs(y_true - y_pred)) / (np.abs(y_true) + np.abs(y_pred))) * 100

def MAE(y_true, y_pred):
    return np.mean(abs(y_true - y_pred))

In [13]:
def fit_XGB(
    train, valid,
    path='./', param=None, seed=42,
    early_stopping_rounds=300
):
    x_train, y_train = train
    x_valid, y_valid = valid

    models = []
    valid_score = []
    best_params = []
    for i in range(num_type):
        train_mask = x_train[:, -1] == i
        valid_mask = x_valid[:, -1] == i

        new_x_train, new_y_train = x_train[train_mask][:, :-1], y_train[train_mask]
        new_x_valid, new_y_valid = x_valid[valid_mask][:, :-1], y_valid[valid_mask]
        print(f"건물 유형 {i}")
        model, smape = grid_search((new_x_train, new_y_train), (new_x_valid, new_y_valid))

        models.append(model)
        best_params.append(model.get_params())
        valid_score.append(smape)

    return models, best_params, valid_score

import copy
def grid_search(train, valid):
    x_train, y_train = train
    x_valid, y_valid = valid

    default_param = {
        "learning_rate":0.01,
        "n_estimators":10000,
        "max_depth":5,
        "min_child_weight":1,
        "gamma":0,
        "subsample":0.8,
        "colsample_bytree":0.8,
        "objective":'reg:squarederror',
        "nthread":-1,
        "scale_pos_weight":1,
        "seed":42,
        "gpu_id":0,
        "tree_method":'gpu_hist',
        "predictor":'gpu_predictor'
    }
    params = {
        'max_depth': range(3,10,3),
        'min_child_weight': range(1,6,2),
        'gamma': [i/10.0 for i in range(0, 5)],
        'subsample':[i/10.0 for i in range(6,10)],
        'colsample_bytree':[i/10.0 for i in range(6,10)],
        'reg_alpha':[1e-5, 1e-2, 0.1, 1, 100]
    }

    best_model = []
    best_smape = float('inf')
    print(f"find params {len(params.keys())}")
    for key, values in params.items():
        for value in values:
            print(f"check param {key}: {value}")
            cur_params = copy.copy(default_param)
            cur_params[key] = value

            xgb = XGBRegressor(
                **cur_params,
            )
            xgb.fit(x_train, y_train)
            valid_pred = xgb.predict(x_valid)
            smape = SMAPE(y_valid, valid_pred)
            if best_smape > smape:
                print(f"Find best params SMAPE: {smape}")
                best_smape = smape
                best_model = xgb
                default_param = xgb.get_params()
    best_pred = best_model.predict(x_valid)
    smape = SMAPE(y_valid, best_pred)

    print(f"best iterations: {best_model.best_iteration}")
    print(f"SMAPE: {smape}")
    print(f"First 5 preds: {best_pred[:5]}")
    print(f"First 5 actual: {y_valid.values[:5]}")

    return best_model, smape

In [14]:
models, best_params, results = fit_XGB(
    (x_train, y_train),
    (x_valid, y_valid)
)

건물 유형 0
find params 6
check param max_depth: 3
Find best params SMAPE: 6.945647102639032
check param max_depth: 6
check param max_depth: 9
Find best params SMAPE: 6.535881597339184
check param min_child_weight: 1
check param min_child_weight: 3
check param min_child_weight: 5
check param gamma: 0.0
check param gamma: 0.1
check param gamma: 0.2
check param gamma: 0.3
check param gamma: 0.4
check param subsample: 0.6
Find best params SMAPE: 6.468421832095121
check param subsample: 0.7
check param subsample: 0.8
check param subsample: 0.9
check param colsample_bytree: 0.6
check param colsample_bytree: 0.7
check param colsample_bytree: 0.8
check param colsample_bytree: 0.9
check param reg_alpha: 1e-05
check param reg_alpha: 0.01
Find best params SMAPE: 6.468320879083903
check param reg_alpha: 0.1
check param reg_alpha: 1
check param reg_alpha: 100
best iterations: 9999
SMAPE: 6.468320879083903
First 5 preds: [2125.3958 2051.8215 1967.0786 1883.6881 1884.9529]
First 5 actual: [1921.44 1851.

In [15]:
def fit_xgb(train, test, params):
    assert len(params) == num_type
    
    x_train, y_train = train
    models = []
    for i, param in enumerate(params):
        print(f"건물 번호 {i} 학습시작")
        mask = x_train[:, -1] == i
        x_train_per_id, y_train_per_id = x_train[mask][:, :-1], y_train[mask]
        
        xgb = XGBRegressor(
            **param
        )
        xgb.fit(x_train_per_id, y_train_per_id)
        
        preds = xgb.predict(x_train_per_id)
        smape = SMAPE(y_train_per_id, preds)
        print(f"Train SMAPE Value: {smape}")
        
        models.append(xgb)
    
    predicted = []
    for i, model in enumerate(models):
        print(f"건물 번호 {i} 예측 시작")
        mask = test[:, -1] == i
        x_test_per_id = test[mask][:, :-1]
        preds = model.predict(x_test_per_id)
        predicted += list(preds)
        print(f"First 5 preds: {preds[:5]}")
        
    return models, predicted

In [16]:
models, predicted = fit_xgb((x_train, y_train), transformed_test, best_params)
submission = pd.read_csv(os.path.join(DATA_DIR, 'sample_submission.csv'))
submission['answer'] = predicted
submission.to_csv('/kaggle/working/xgb_submission4.csv', index=False)

건물 번호 0 학습시작
Train SMAPE Value: 0.3629444269475403
건물 번호 1 학습시작
Train SMAPE Value: 0.24747673691111055
건물 번호 2 학습시작
Train SMAPE Value: 0.1348179975042275
건물 번호 3 학습시작
Train SMAPE Value: 0.19257492466385662
건물 번호 4 학습시작
Train SMAPE Value: 1.4568398169668346
건물 번호 5 학습시작
Train SMAPE Value: 0.2427117099680049
건물 번호 6 학습시작
Train SMAPE Value: 0.2071741366298913
건물 번호 7 학습시작
Train SMAPE Value: 0.043508497944179506
건물 번호 8 학습시작
Train SMAPE Value: 0.2015375037612495
건물 번호 9 학습시작
Train SMAPE Value: 0.19025242973810355
건물 번호 10 학습시작
Train SMAPE Value: 0.10708929076526305
건물 번호 11 학습시작
Train SMAPE Value: 0.3537774415067843
건물 번호 0 예측 시작
First 5 preds: [1921.3185 1892.0387 1736.4286 1698.3036 1744.0267]
건물 번호 1 예측 시작
First 5 preds: [1283.4211 1212.3324 1229.7523 1225.2346 1335.1439]
건물 번호 2 예측 시작
First 5 preds: [8101.6064 7988.197  7707.436  7528.633  7505.6265]
건물 번호 3 예측 시작
First 5 preds: [9731.793 9713.965 9696.593 9664.512 9614.711]
건물 번호 4 예측 시작
First 5 preds: [1076.4344   934.87634 1043.6346