In [1]:
import pandas as pd
import numpy as np
import random
import os

from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_percentage_error
import warnings
warnings.filterwarnings("ignore")

In [2]:
def seed_everything(seed):
    """Seed Python's `random`, NumPy's global RNG and PYTHONHASHSEED with `seed`.

    seed: integer seed; it is stored in the environment as its string form.
    """
    os.environ['PYTHONHASHSEED'] = str(seed)
    # Both generators take the same integer seed.
    for set_seed in (random.seed, np.random.seed):
        set_seed(seed)

seed_everything(42) # Fix the random seeds for reproducibility

# Font setup for rendering Korean text in plots (disabled; uses a Windows font path)
# from matplotlib import font_manager, rc
# font_path = "C:/Windows/Fonts/H2GTRM.TTF"
# font = font_manager.FontProperties(fname=font_path).get_name()
# rc('font', family=font)

# feature importance
def vis_feature_importances_(est, X_train):
    """Draw a horizontal bar chart of the 20 largest feature importances.

    est: fitted estimator exposing `feature_importances_`.
    X_train: DataFrame whose columns label the importances, in the same order.
    """
    top20 = (
        pd.Series(est.feature_importances_, index=X_train.columns)
        .sort_values(ascending=False)
        .head(20)
    )
    plt.figure(figsize=(10,4))
    plt.title('Feature importances Top 20')
    # Importance on the x axis, feature name on the y axis.
    sns.barplot(x=top20, y=top20.index)
    plt.show()

## Data Load

In [3]:
# Mount Google Drive at /content/drive (Colab only); the CSVs are read from there below.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [67]:
# Folder containing the competition CSVs. The value below is the Colab / Google Drive
# location; set it to a local folder (e.g. 'input') when running outside Colab.
DATA_DIR = '/content/drive/MyDrive/work/_now/input'

train = pd.read_csv(f'{DATA_DIR}/train.csv')
test = pd.read_csv(f'{DATA_DIR}/test.csv')
building_info = pd.read_csv(f'{DATA_DIR}/building_info.csv')
submission = pd.read_csv(f'{DATA_DIR}/sample_submission.csv')
# Last expression of the cell: show the four shapes as a sanity check.
train.shape, test.shape, building_info.shape, submission.shape

((204000, 10), (16800, 7), (100, 7), (16800, 2))

## Train Data Pre-Processing

In [68]:
# Clean the building metadata.
# The three capacity columns use '-' as a "no value" placeholder; map it to 0 and
# convert the columns to float.
# Only cells that are exactly '-' (after stripping whitespace) are replaced, so a
# value that merely contains a '-' is left intact, and going through astype(str) /
# pd.to_numeric keeps the cell re-runnable once the columns are already numeric.
capacity_cols = ['태양광용량(kW)', 'ESS저장용량(kWh)', 'PCS용량(kW)']
for col in capacity_cols:
    as_text = building_info[col].astype(str).str.strip().replace('-', '0')
    building_info[col] = pd.to_numeric(as_text).astype(float)

In [69]:
# Attach the cleaned building metadata to every row (default inner join on building number).
train = pd.merge(train, building_info, on='건물번호')
test = pd.merge(test, building_info, on='건물번호')

In [70]:
# Split the timestamp into month, day and hour so the model can use the time structure.
# '일시' is sliced as text: characters 4-5 = month, 6-7 = day, 9-10 = hour.
for frame in (train, test):
    stamp = frame['일시']
    frame['month'] = stamp.str[4:6].astype('int64')
    frame['day'] = stamp.str[6:8].astype('int64')
    frame['time'] = stamp.str[9:11].astype('int64')

In [71]:
# Day of week, parsed from the first eight characters (the date part) of the timestamp.
for frame in (train, test):
    date_part = frame['일시'].str[:8]
    frame['weekday'] = pd.to_datetime(date_part).dt.weekday

In [43]:
# Remove outliers: keep only rows whose power consumption is above 1 kWh
train = train[train['전력소비량(kWh)'] > 1]

In [44]:
# Hand-picked readings to discard, keyed by `num_date_time`
# (the keys read as '<building>_<YYYYMMDD> <HH>'), one line per building.
manual_outlier_keys = [
    '56_20220608 16',
    '34_20220808 21',
    '10_20220607 06', '10_20220607 06',
    '31_20220808 13', '31_20220809 13', '31_20220808 14', '31_20220808 13',
    '8_20220806 05', '8_20220707 08', '8_20220628 09',
    '13_20220606 16', '13_20220724 06', '13_20220606 17', '13_20220607 07',
    '58_20220804 16',
    '65_20220807 18',
    '68_20220706 20', '68_20220706 19',
    '71_20220604 10', '71_20220810 15', '71_20220810 16',
    '73_20220707 12', '73_20220707 14', '73_20220707 11', '73_20220707 13', '73_20220707 15',
    '73_20220808 13', '73_20220803 14', '73_20220808 12', '73_20220803 15', '73_20220808 11',
]
is_manual_outlier = train['num_date_time'].isin(manual_outlier_keys)
train = train.drop(index=train.index[is_manual_outlier])

In [45]:
# Target encoding: min / mean / max of the training target per building type,
# left-joined onto both train and test as extra features.
target_encoding = (
    train.groupby('건물유형')['전력소비량(kWh)']
    .agg(khw_min='min', khw_mean='mean', khw_max='max')
    .reset_index()
)
train = pd.merge(train, target_encoding, on='건물유형', how='left')
test = pd.merge(test, target_encoding, on='건물유형', how='left')

In [46]:
# Discard every June row of buildings 1 and 16.
is_june_b1_b16 = train['건물번호'].isin([1,16]) & (train['month'] == 6)
train = train.drop(index=train.index[is_june_b1_b16])

In [47]:
# Flag probable closing days of department stores / outlets and discount marts.
# Step 1: mean and max consumption per building and calendar day.
no_work_day = train.groupby(['건물번호', '건물유형','month', 'day']).agg({'전력소비량(kWh)':['mean','max']}).reset_index()
no_work_day.columns = ['건물번호', '건물유형','month', 'day', 'khw_mean', 'khw_max']
# Step 2: for the two retail building types only, the average of the daily means per building.
total_mean = no_work_day[no_work_day['건물유형'].isin(['백화점및아울렛','할인마트'])].groupby(['건물번호']).khw_mean.mean().reset_index().rename(columns={'khw_mean':'b_mean'})

# Default inner merge: only buildings present in total_mean (the two retail types) remain.
no_work_day = no_work_day.merge(total_mean, on='건물번호')
# A day whose peak stays below the building's average daily mean is marked 1; other days stay NaN.
no_work_day.loc[no_work_day.b_mean > no_work_day.khw_max, '백화점휴무일'] = 1

# Left-join the flag back onto train; unmatched and unflagged rows become 0.
train = train.merge(no_work_day[['건물번호', '건물유형','month', 'day', '백화점휴무일']], on=['건물번호', '건물유형','month', 'day'], how='left')
train['백화점휴무일'] = train['백화점휴무일'].fillna(0).astype(int)
# Every test row gets 0 for this flag.
test['백화점휴무일'] = 0

In [48]:
# Cooling ratio: cooled area as a percentage of total floor area, truncated to an integer.
# The two raw area columns are removed once the ratio has been derived.
total_area, cooled_area = '연면적(m2)', '냉방면적(m2)'
train = (
    train
    .assign(**{'냉방비율': (train[cooled_area] / train[total_area] * 100).astype(int)})
    .drop(columns=[total_area, cooled_area])
)
test = (
    test
    .assign(**{'냉방비율': (test[cooled_area] / test[total_area] * 100).astype(int)})
    .drop(columns=[total_area, cooled_area])
)

In [49]:
# Fill missing values with 0
train = train.fillna(0)
test = test.fillna(0)

In [50]:
# Flag public holidays. Rows are kept; an earlier, disabled variant instead removed
# the holiday rows of '공공' / '병원' buildings.
# Dates flagged: 2022-06-06 and 2022-08-15. The list previously held 20220615 where
# the disabled variant used 20220815, so 15 August was never flagged.
# NOTE(review): 2022-06-01 (local election day) may also deserve a flag — confirm.
print(train.shape)
holiday_dates = ['20220606', '20220815']
for frame in (train, test):
    # Compare the date part of the timestamp directly; no temporary column is needed.
    frame['holiday'] = frame['일시'].str[:8].isin(holiday_dates).astype(int)
train.shape

(202528, 23)


(202528, 24)

In [51]:
## https://dacon.io/competitions/official/235736/codeshare/2743?page=1&dtype=recent
# Temperature-humidity (discomfort) index, with T in degrees C and RH in percent:
#   THI = 9/5*T - 0.55*(1 - RH/100)*(9/5*T - 26) + 32
# The last factor is temperature based; the earlier code put humidity there.
for frame in (train, test):
    temp_term = 9/5*frame['기온(C)']
    frame['THI'] = temp_term - 0.55*(1-frame['습도(%)']/100)*(temp_term-26)+32

In [52]:
def CDH(xs):
    """Cooling degree hours over a trailing window.

    For each position i, sums (x - 26) over the current reading and up to 11
    preceding ones (fewer at the start of the series).

    xs: 1-D numpy array of temperatures.
    Returns a numpy array with one sum per input element.
    """
    window_sums = [
        np.sum(xs[max(i - 11, 0):(i + 1)] - 26)
        for i in range(len(xs))
    ]
    return np.array(window_sums)

# Cooling degree hours per building.
# groupby(...).transform hands each building's temperatures to CDH and writes the
# result back aligned on the row index. The earlier version concatenated the
# per-building arrays and assigned them positionally, which is only correct while
# the rows of each building are contiguous and in `unique()` order.
# Rows are still expected to be in time order within each building.
for frame in (train, test):
    frame['CDH'] = (
        frame.groupby('건물번호')['기온(C)']
        .transform(lambda temps: CDH(temps.values))
    )

In [53]:
############## Apparent (wind-chill) temperature, https://www.weather.go.kr/plus/life/li_asset/HELP/basic/help_01_07.jsp
# Wind speed is scaled by 3.6 (m/s -> km/h) before being raised to the 0.16 power.
# NOTE(review): this looks like the cold-season wind-chill formula — confirm it is
# meant to be applied to this data.
for frame in (train, test):
    air_temp = frame['기온(C)']
    wind_term = (frame['풍속(m/s)']*3.6)**0.16
    frame['체감온도'] = 13.12 + 0.6215*air_temp - 11.37*wind_term + 0.3965*wind_term*air_temp

In [54]:
# Wind speed is no longer kept as a feature of its own.
train = train.drop(columns=['풍속(m/s)'])
test = test.drop(columns=['풍속(m/s)'])

In [55]:
#!pip install autogluon

In [56]:
from autogluon.tabular import TabularDataset, TabularPredictor

In [72]:
# Wrap the frames for AutoGluon without the identifier / timestamp columns; the
# sunshine and solar-radiation columns are additionally dropped from train.
# NOTE(review): this rebinds `train` / `test` to frames that no longer contain
# 'num_date_time' and '일시'. The XGBoost cells further down drop or select those
# columns from `train` / `test`, so they cannot run after this cell without
# reloading the data.
train_drop_cols = ['num_date_time', '일시', '일조(hr)', '일사(MJ/m2)']
test_drop_cols = ['num_date_time', '일시']
train = TabularDataset(train.drop(columns=train_drop_cols))
test = TabularDataset(test.drop(columns=test_drop_cols))

In [73]:
# Inspect the columns handed to AutoGluon.
# NOTE(review): the recorded output lists only raw, building-info and date columns
# (it still contains '풍속(m/s)' and has no 'THI' / 'CDH' / 'holiday'), which suggests
# the feature-engineering cells were not applied in the run that produced it —
# confirm with Restart & Run All.
train.columns

Index(['건물번호', '기온(C)', '강수량(mm)', '풍속(m/s)', '습도(%)', '전력소비량(kWh)', '건물유형',
       '연면적(m2)', '냉방면적(m2)', '태양광용량(kW)', 'ESS저장용량(kWh)', 'PCS용량(kW)',
       'month', 'day', 'time', 'weekday'],
      dtype='object')

In [74]:
%%time
predictor = TabularPredictor(label='전력소비량(kWh)', eval_metric='mae').fit(train, num_gpus=1)

No path specified. Models will be saved in: "AutogluonModels/ag-20230820_091210/"
	Consider setting `time_limit` to ensure training finishes within an expected duration or experiment with a small portion of `train_data` to identify an ideal `presets` and `hyperparameters` configuration.
Beginning AutoGluon training ...
AutoGluon will save models to "AutogluonModels/ag-20230820_091210/"
AutoGluon Version:  0.8.2
Python Version:     3.10.12
Operating System:   Linux
Platform Machine:   x86_64
Platform Version:   #1 SMP Fri Jun 9 10:57:30 UTC 2023
Disk Space Avail:   46.42 GB / 83.96 GB (55.3%)
Train Data Rows:    204000
Train Data Columns: 15
Label Column: 전력소비량(kWh)
Preprocessing data ...
AutoGluon infers your prediction problem is: 'regression' (because dtype of label-column == float and many unique label-values observed).
	Label info (max, min, mean, stddev): (25488.4, 0.0, 2451.03646, 2440.64886)
	If 'regression' is not the correct problem_type, please manually specify the problem_ty

[1000]	valid_set's l1: 176.167
[2000]	valid_set's l1: 153.253
[3000]	valid_set's l1: 141.801
[4000]	valid_set's l1: 133.955
[5000]	valid_set's l1: 128.495
[6000]	valid_set's l1: 124.485
[7000]	valid_set's l1: 121.028
[8000]	valid_set's l1: 118.318
[9000]	valid_set's l1: 116.074
[10000]	valid_set's l1: 113.918


	-113.9184	 = Validation score   (-mean_absolute_error)
	143.13s	 = Training   runtime
	3.16s	 = Validation runtime
Fitting model: LightGBM ...
	Training LightGBM with GPU, note that this may negatively impact model quality compared to CPU training.


[1000]	valid_set's l1: 132.541
[2000]	valid_set's l1: 107.28
[3000]	valid_set's l1: 96.2132
[4000]	valid_set's l1: 89.7736
[5000]	valid_set's l1: 85.2621
[6000]	valid_set's l1: 82.5657
[7000]	valid_set's l1: 79.8648
[8000]	valid_set's l1: 77.8949
[9000]	valid_set's l1: 76.2268
[10000]	valid_set's l1: 75.0438


	-75.0438	 = Validation score   (-mean_absolute_error)
	115.35s	 = Training   runtime
	1.9s	 = Validation runtime
Fitting model: RandomForestMSE ...
	-78.2092	 = Validation score   (-mean_absolute_error)
	296.47s	 = Training   runtime
	0.26s	 = Validation runtime
Fitting model: CatBoost ...
	Training CatBoost with GPU, note that this may negatively impact model quality compared to CPU training.
Default metric period is 5 because MAE is/are not implemented for GPU
	-105.7275	 = Validation score   (-mean_absolute_error)
	228.74s	 = Training   runtime
	0.05s	 = Validation runtime
Fitting model: ExtraTreesMSE ...
	-86.2608	 = Validation score   (-mean_absolute_error)
	91.96s	 = Training   runtime
	0.26s	 = Validation runtime
Fitting model: NeuralNetFastAI ...
	-197.8269	 = Validation score   (-mean_absolute_error)
	124.44s	 = Training   runtime
	0.04s	 = Validation runtime
Fitting model: XGBoost ...
	-77.0368	 = Validation score   (-mean_absolute_error)
	37.1s	 = Training   runtime
	4.24s	

[1000]	valid_set's l1: 95.4614
[2000]	valid_set's l1: 82.235
[3000]	valid_set's l1: 75.9093
[4000]	valid_set's l1: 72.286
[5000]	valid_set's l1: 69.271
[6000]	valid_set's l1: 67.0879
[7000]	valid_set's l1: 65.5779
[8000]	valid_set's l1: 64.2102
[9000]	valid_set's l1: 63.2307
[10000]	valid_set's l1: 62.488


	-62.4877	 = Validation score   (-mean_absolute_error)
	156.79s	 = Training   runtime
	2.48s	 = Validation runtime
Fitting model: WeightedEnsemble_L2 ...
	-61.3114	 = Validation score   (-mean_absolute_error)
	0.62s	 = Training   runtime
	0.0s	 = Validation runtime
AutoGluon training complete, total runtime = 1502.15s ... Best model: "WeightedEnsemble_L2"
TabularPredictor saved. To load, use: predictor = TabularPredictor.load("AutogluonModels/ag-20230820_091210/")


CPU times: user 26min 40s, sys: 1min 33s, total: 28min 14s
Wall time: 25min 2s


In [75]:
# Show the table of trained models with their validation scores and timings.
predictor.leaderboard(silent = True)

Unnamed: 0,model,score_val,pred_time_val,fit_time,pred_time_val_marginal,fit_time_marginal,stack_level,can_infer,fit_order
0,WeightedEnsemble_L2,-61.311425,6.977735,490.971763,0.000773,0.618631,2,True,12
1,LightGBMLarge,-62.487669,2.481737,156.785725,2.481737,156.785725,1,True,11
2,LightGBM,-75.043771,1.900702,115.353309,1.900702,115.353309,1,True,4
3,XGBoost,-77.036759,4.235417,37.098244,4.235417,37.098244,1,True,9
4,RandomForestMSE,-78.209182,0.259808,296.469163,0.259808,296.469163,1,True,5
5,ExtraTreesMSE,-86.260798,0.260829,91.95612,0.260829,91.95612,1,True,7
6,CatBoost,-105.727521,0.051066,228.741448,0.051066,228.741448,1,True,6
7,LightGBMXT,-113.91839,3.157548,143.133607,3.157548,143.133607,1,True,3
8,NeuralNetTorch,-130.226615,0.021319,274.118759,0.021319,274.118759,1,True,10
9,KNeighborsDist,-187.780316,0.121702,0.599805,0.121702,0.599805,1,True,2


In [76]:
# Print AutoGluon's summary of the fit; the call also returns the details as a dict.
predictor.fit_summary()

*** Summary of fit() ***
Estimated performance of each model:
                  model   score_val  pred_time_val    fit_time  pred_time_val_marginal  fit_time_marginal  stack_level  can_infer  fit_order
0   WeightedEnsemble_L2  -61.311425       6.977735  490.971763                0.000773           0.618631            2       True         12
1         LightGBMLarge  -62.487669       2.481737  156.785725                2.481737         156.785725            1       True         11
2              LightGBM  -75.043771       1.900702  115.353309                1.900702         115.353309            1       True          4
3               XGBoost  -77.036759       4.235417   37.098244                4.235417          37.098244            1       True          9
4       RandomForestMSE  -78.209182       0.259808  296.469163                0.259808         296.469163            1       True          5
5         ExtraTreesMSE  -86.260798       0.260829   91.956120                0.260829      

{'model_types': {'KNeighborsUnif': 'KNNModel',
  'KNeighborsDist': 'KNNModel',
  'LightGBMXT': 'LGBModel',
  'LightGBM': 'LGBModel',
  'RandomForestMSE': 'RFModel',
  'CatBoost': 'CatBoostModel',
  'ExtraTreesMSE': 'XTModel',
  'NeuralNetFastAI': 'NNFastAiTabularModel',
  'XGBoost': 'XGBoostModel',
  'NeuralNetTorch': 'TabularNeuralNetTorchModel',
  'LightGBMLarge': 'LGBModel',
  'WeightedEnsemble_L2': 'WeightedEnsembleModel'},
 'model_performance': {'KNeighborsUnif': -204.32044856640627,
  'KNeighborsDist': -187.7803162739258,
  'LightGBMXT': -113.91838973876955,
  'LightGBM': -75.04377075317383,
  'RandomForestMSE': -78.2091824506836,
  'CatBoost': -105.72752058862305,
  'ExtraTreesMSE': -86.2607983852539,
  'NeuralNetFastAI': -197.82689155273437,
  'XGBoost': -77.03675877685546,
  'NeuralNetTorch': -130.22661528808595,
  'LightGBMLarge': -62.487668689331066,
  'WeightedEnsemble_L2': -61.31142506445312},
 'model_best': 'WeightedEnsemble_L2',
 'model_paths': {'KNeighborsUnif': 'Autogl

In [77]:
# Name of the model AutoGluon reports as best; it is passed to predict() below.
model_to_use = predictor.get_model_best()
model_to_use

'WeightedEnsemble_L2'

In [78]:
# Predict the test rows with the selected model.
pred_y = predictor.predict(test, model=model_to_use)

In [79]:
# Store the predictions in the submission frame.
# NOTE(review): presumably pred_y is a Series indexed like `test`, in which case this
# assignment aligns on index — confirm the row order matches the sample submission.
submission['answer'] = pred_y

In [80]:
# Write the AutoGluon submission file (without the index column).
submission.to_csv('20230820-2.csv', index=False)

In [None]:
days = [18, 19, 20, 21, 22, 23, 24]

# Hold out 18-24 August as the validation set; all other rows are used for fitting.
is_holdout = (train['month'] == 8) & (train['day'].isin(days))
# Columns that are not model inputs: identifiers, two train-only weather columns, the target.
non_feature_cols = ['num_date_time', '일시', '일조(hr)', '일사(MJ/m2)', '전력소비량(kWh)']

X_train = train[~is_holdout].drop(columns=non_feature_cols)
y_train = train.loc[~is_holdout, '전력소비량(kWh)']

X_test = train[is_holdout].drop(columns=non_feature_cols)
y_test = train.loc[is_holdout, '전력소비량(kWh)']


print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(185758, 18) (16800, 18) (185758,) (16800,)


In [None]:
# Fill missing values with 0
X_train = X_train.fillna(0)
X_test = X_test.fillna(0)

## Regression Model Fit

In [None]:
from xgboost import XGBRegressor

#### The real objective is wrapped in a function taking alpha, so alpha is easy to tune.
# Custom objective that discourages the model from underestimating.
def weighted_mse(alpha = 1):
    """Build an asymmetric squared-error objective.

    alpha: weight applied where label > pred (under-prediction); other rows use weight 1.
    Returns a callable (label, pred) -> (grad, hess) of the weighted squared error
    with respect to pred.
    """
    def weighted_mse_fixed(label, pred):
        err = (label - pred).astype("float")
        # Per-element weight: alpha for under-predictions, 1 otherwise.
        scale = np.where(err > 0, alpha, 1)
        grad = np.asarray(-2 * scale * err)
        hess = np.asarray(2.0 * scale)
        return grad, hess
    return weighted_mse_fixed

In [None]:
%%time
metrics_type = pd.DataFrame()
for i in train['건물유형'].unique():
    temp_X_train = X_train[X_train['건물유형'] == i].drop(['건물유형'], axis=1)
    temp_y_train = y_train.loc[temp_X_train.index]
    temp_X_test = X_test[X_test['건물유형'] == i].drop(['건물유형'], axis=1)
    temp_y_test = y_test.loc[temp_X_test.index]

    model = XGBRegressor(random_state=42)
    model.set_params(**{'objective' : weighted_mse(100)})

    model.fit(temp_X_train, temp_y_train)
    y_preds = model.predict(temp_X_test)

    temp = pd.DataFrame([i, mean_absolute_percentage_error(temp_y_test, y_preds)]).T
    temp.columns = ['건물유형', '유형별_score']
    metrics_type = metrics_type.append(temp)
    display(temp)
    # vis_feature_importances_(model, temp_X_train)


Unnamed: 0,건물유형,유형별_score
0,건물기타,0.163063


Unnamed: 0,건물유형,유형별_score
0,공공,0.096262


Unnamed: 0,건물유형,유형별_score
0,대학교,0.072435


Unnamed: 0,건물유형,유형별_score
0,데이터센터,0.006914


Unnamed: 0,건물유형,유형별_score
0,백화점및아울렛,0.09306


Unnamed: 0,건물유형,유형별_score
0,병원,0.051668


Unnamed: 0,건물유형,유형별_score
0,상용,0.1179


Unnamed: 0,건물유형,유형별_score
0,아파트,0.090393


Unnamed: 0,건물유형,유형별_score
0,연구소,0.107763


Unnamed: 0,건물유형,유형별_score
0,지식산업센터,0.082045


Unnamed: 0,건물유형,유형별_score
0,할인마트,0.094304


Unnamed: 0,건물유형,유형별_score
0,호텔및리조트,0.131929


CPU times: user 56.3 s, sys: 169 ms, total: 56.5 s
Wall time: 32.6 s


In [None]:
%%time
metrics_bno = pd.DataFrame()
for i in train['건물번호'].unique():

    temp_X_train = X_train[X_train['건물번호'] == i].drop(['건물번호','건물유형'], axis=1)
    temp_y_train = y_train.loc[temp_X_train.index]
    temp_X_test = X_test[X_test['건물번호'] == i].drop(['건물번호','건물유형'], axis=1)
    temp_y_test = y_test.loc[temp_X_test.index]

    model = XGBRegressor(random_state=42)
    model.set_params(**{'objective' : weighted_mse(100)})

    model.fit(temp_X_train, temp_y_train)
    y_preds = model.predict(temp_X_test)

    temp = pd.DataFrame([i, mean_absolute_percentage_error(temp_y_test, y_preds)]).T
    temp.columns = ['건물번호', '건물별_score']
    metrics_bno = metrics_bno.append(temp)
    display(temp)
    vis_feature_importances_(model, temp_X_train)
    print()

Output hidden; open in https://colab.research.google.com to view.

In [None]:
# Make sure the building number is an integer before it is used as a merge key below.
metrics_bno['건물번호'] = metrics_bno['건물번호'].astype(int)

In [None]:
# Attach each building's type so the per-type scores can be joined on it.
metrics_bno = metrics_bno.merge(building_info[['건물유형','건물번호']], on=['건물번호'])

In [None]:
# Put per-building and per-type scores side by side (joined on building type); preview two rows.
metrics = metrics_bno.merge(metrics_type, on=['건물유형'])
metrics[:2]

Unnamed: 0,건물번호,건물별_score,건물유형,유형별_score
0,1,0.0782,건물기타,0.163063
1,2,0.124499,건물기타,0.163063


In [None]:
# Building numbers whose per-building model scored better (lower) than the per-type model
bno = metrics[metrics['건물별_score'] < metrics['유형별_score']]['건물번호'].values
len(bno), bno

(83,
 array([  1,   2,   4,   5,   6,   7,   8,   9,  10,  11,  12,  13,  15,
         16,  18,  20,  21,  22,  23,  24,  25,  26,  27,  29,  31,  32,
         33,  35,  36,  37,  38,  39,  41,  42,  43,  44,  45,  47,  48,
         49,  50,  52,  55,  56,  57,  58,  59,  60,  61,  62,  63,  64,
         66,  67,  68,  69,  70,  71,  72,  73,  74,  75,  76,  77,  78,
         79,  80,  81,  82,  83,  84,  85,  86,  88,  89,  90,  92,  93,
         94,  96,  97,  99, 100]))

In [None]:
# Mean hold-out MAPE of the per-type models and of the per-building models.
metrics_type['유형별_score'].mean(), metrics_bno['건물별_score'].mean()

(0.09231144061624053, 0.06948307684608199)

In [None]:
# Weighted average of the two mean scores, weighted by the share of buildings in `bno`.
# NOTE(review): 100 is presumably the total number of buildings — confirm. This blends the
# two overall means rather than averaging the score actually selected for each building.
metrics_type['유형별_score'].mean() * ((100-len(bno))/100) + metrics_bno['건물별_score'].mean() * (len(bno)/100)

0.07336389868700893

In [None]:
# Selected score per building: the per-type score by default,
# overwritten with the per-building score for the buildings listed in `bno`.
metrics['score'] = metrics['유형별_score']
metrics.loc[metrics['건물번호'].isin(bno), 'score'] = metrics['건물별_score']

In [None]:
# Boolean flag: True where the per-building score is lower than the per-type score.
metrics['건물이낫다'] = metrics['건물별_score'] < metrics['유형별_score']

In [None]:
# Export the comparison table to Excel.
# NOTE(review): the file name reads "largest error first", but `metrics` is not sorted here.
metrics.to_excel('오차큰것부터_20230802-1.xlsx')

## Inference & Submit

In [None]:
%%time
result_type = pd.DataFrame()
for i in train['건물유형'].unique():

    temp_X_train = train[train['건물유형'] == i].drop(['건물유형','num_date_time', '일시', '일조(hr)', '일사(MJ/m2)', '전력소비량(kWh)'], axis=1)
    temp_y_train = train.loc[temp_X_train.index][['전력소비량(kWh)']]

    model = XGBRegressor(random_state=42)
    model.set_params(**{'objective' : weighted_mse(100)})

    model.fit(temp_X_train, temp_y_train)

    temp_test = test[test['건물유형'] == i].drop(['건물유형','num_date_time', '일시'], axis=1)
    y_preds = model.predict(temp_test)
    result_type = result_type.append( pd.concat([test[test['건물유형'] == i].reset_index(), pd.DataFrame(y_preds, columns=['answer_유형별']).reset_index()], axis=1) )

result_type.shape

CPU times: user 1min 1s, sys: 179 ms, total: 1min 1s
Wall time: 36.8 s


(16800, 23)

In [None]:
%%time
result_bno = pd.DataFrame()
for i in train['건물번호'].unique():

    temp_X_train = train[train['건물번호'] == i].drop(['건물번호','건물유형','num_date_time', '일시', '일조(hr)', '일사(MJ/m2)', '전력소비량(kWh)'], axis=1)
    temp_y_train = train.loc[temp_X_train.index][['전력소비량(kWh)']]

    model = XGBRegressor(random_state=42)
    model.set_params(**{'objective' : weighted_mse(100)})

    model.fit(temp_X_train, temp_y_train)

    temp_test = test[test['건물번호'] == i].drop(['건물번호','건물유형','num_date_time', '일시'], axis=1)
    y_preds = model.predict(temp_test)
    result_bno = result_bno.append( pd.concat([test[test['건물번호'] == i].reset_index(), pd.DataFrame(y_preds, columns=['answer_건물별']).reset_index()], axis=1) )

result_bno.shape

CPU times: user 1min 26s, sys: 778 ms, total: 1min 26s
Wall time: 50.5 s


(16800, 23)

In [None]:
# Combine both prediction sets into the final submission.
# result_type / result_bno are ordered by loop iteration (building type / building), so
# the predictions are left-joined onto the key columns of `test`; the written file then
# follows the row order of `test` no matter how the loops were ordered.
key_cols = ['num_date_time', '건물번호']
result = (
    test[key_cols]
    .merge(result_type[key_cols + ['answer_유형별']], on=key_cols, how='left')
    .merge(result_bno[key_cols + ['answer_건물별']], on=key_cols, how='left')
)
# Default to the per-type prediction; use the per-building one for the buildings in `bno`.
result['answer'] = result['answer_유형별']
result.loc[result['건물번호'].isin(bno), 'answer'] = result['answer_건물별']
result[['num_date_time','answer']].to_csv('20230802-1.csv', index=False)