In [1]:
import pandas as pd
import numpy as np
import random
import os

from tqdm import tqdm
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_percentage_error
import warnings
warnings.filterwarnings("ignore")

In [2]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(42) # Seed 고정

# 한글 폰트 사용을 위해서 세팅
from matplotlib import font_manager, rc
font_path = "C:/Windows/Fonts/H2GTRM.TTF"
font = font_manager.FontProperties(fname=font_path).get_name()
rc('font', family=font)

# feature importance 
def vis_feature_importances_(est, X_train):
    ftr_importances_values = est.feature_importances_
    ftr_importances = pd.Series(ftr_importances_values,index=X_train.columns  )
    ftr_top20 = ftr_importances.sort_values(ascending=False)[:20]
    plt.figure(figsize=(10,4))
    plt.title('Feature importances Top 20')
    sns.barplot(x=ftr_top20 , y = ftr_top20.index)
    plt.show()
    
# https://dacon.io/forum/401657    
def smape(true, pred):
    v = 2 * abs(pred - true) / (abs(pred) + abs(true))
    output = np.mean(v) * 100
    return output    

## Data Load

In [3]:
# from google.colab import drive
# drive.mount('/content/drive')

# train = pd.read_csv('/content/drive/MyDrive/work/input/train.csv')
# test = pd.read_csv('/content/drive/MyDrive/work/input/test.csv')
# building_info = pd.read_csv('/content/drive/MyDrive/work/input/building_info.csv')
# submission = pd.read_csv('/content/drive/MyDrive/work/input/sample_submission.csv')
# train.shape, test.shape, building_info.shape, submission.shape

train = pd.read_csv('input/train.csv')
test = pd.read_csv('input/test.csv')
building_info = pd.read_csv('input/building_info.csv')
submission = pd.read_csv('input/sample_submission.csv')
train.shape, test.shape, building_info.shape, submission.shape

((204000, 10), (16800, 7), (100, 7), (16800, 2))

## Train Data Pre-Processing

In [4]:
# 빌딩 데이터 정리
## null 처리
building_info['태양광용량(kW)'] = building_info['태양광용량(kW)'].str.replace('-','0').astype(float)
building_info['ESS저장용량(kWh)'] = building_info['ESS저장용량(kWh)'].str.replace('-','0').astype(float)
building_info['PCS용량(kW)'] = building_info['PCS용량(kW)'].str.replace('-','0').astype(float)

In [5]:
# 정리된 빌딩과 merge
train = train.merge(building_info, on='건물번호')
test = test.merge(building_info, on='건물번호')

In [6]:
#시계열 특성을 학습에 반영하기 위해 일시를 월, 일, 시간으로 나눕니다
train['month'] = train['일시'].apply(lambda x : int(x[4:6]))
train['day'] = train['일시'].apply(lambda x : int(x[6:8]))
train['time'] = train['일시'].apply(lambda x : int(x[9:11]))

test['month'] = test['일시'].apply(lambda x : int(x[4:6]))
test['day'] = test['일시'].apply(lambda x : int(x[6:8]))
test['time'] = test['일시'].apply(lambda x : int(x[9:11]))

In [7]:
# 요일
train['weekday'] =  pd.to_datetime(train['일시'].str[:8]).dt.weekday
test['weekday'] =  pd.to_datetime(test['일시'].str[:8]).dt.weekday

In [8]:
# outlier 제거 
train = train[train['전력소비량(kWh)'] > 1]

In [9]:
train = train.drop(train[train['num_date_time'].isin(['56_20220608 16'
                                                  ,'34_20220808 21'
                                                  ,'10_20220607 06','10_20220607 06'
                                                  ,'31_20220808 13','31_20220809 13', '31_20220808 14','31_20220808 13'
                                                  , '8_20220806 05', '8_20220707 08', '8_20220628 09'
                                                  ,'13_20220606 16','13_20220724 06','13_20220606 17','13_20220607 07'
                                                  ,'58_20220804 16'
                                                  ,'65_20220807 18'
                                                  ,'68_20220706 20','68_20220706 19'
                                                  ,'71_20220604 10','71_20220810 15','71_20220810 16'
                                                  , '73_20220707 12', '73_20220707 14', '73_20220707 11','73_20220707 13', '73_20220707 15', '73_20220808 13','73_20220803 14', '73_20220808 12', '73_20220803 15','73_20220808 11'
                                                 ])].index)

In [10]:
target_encoding = train.groupby(['건물유형']).agg({'전력소비량(kWh)':['min', 'mean','max']}).reset_index()
target_encoding.columns = ['건물유형', 'khw_min', 'khw_mean', 'khw_max']
train = train.merge(target_encoding, on=['건물유형'], how='left')
test = test.merge(target_encoding, on=['건물유형'], how='left')

In [11]:
train.drop( train[(train['건물번호'].isin([1,16])) & (train['month'] == 6)].index, inplace=True)

In [12]:
no_work_day = train.groupby(['건물번호', '건물유형','month', 'day']).agg({'전력소비량(kWh)':['mean','max']}).reset_index()
no_work_day.columns = ['건물번호', '건물유형','month', 'day', 'khw_mean', 'khw_max']
total_mean = no_work_day[no_work_day['건물유형'].isin(['백화점및아울렛','할인마트'])].groupby(['건물번호']).khw_mean.mean().reset_index().rename(columns={'khw_mean':'b_mean'})

no_work_day = no_work_day.merge(total_mean, on='건물번호')
no_work_day.loc[no_work_day.b_mean > no_work_day.khw_max, '백화점휴무일'] = 1

train = train.merge(no_work_day[['건물번호', '건물유형','month', 'day', '백화점휴무일']], on=['건물번호', '건물유형','month', 'day'], how='left')
train['백화점휴무일'] = train['백화점휴무일'].fillna(0).astype(int)
test['백화점휴무일'] = 0

In [13]:
train['냉방비율'] = ((train['냉방면적(m2)'] / train['연면적(m2)'])*100 ).astype(int)
test['냉방비율'] = ((test['냉방면적(m2)'] / test['연면적(m2)'])*100 ).astype(int)

train = train.drop(columns=['연면적(m2)','냉방면적(m2)'])
test = test.drop(columns=['연면적(m2)','냉방면적(m2)'])

In [14]:
#결측값을 0으로 채웁니다
train = train.fillna(0)
test = test.fillna(0)

In [15]:
# 공휴일 제외 
print(train.shape)
train['dates'] = train['일시'].apply(lambda x : int(x[:8]))
# train = train[~((train['dates'].isin([20220606,20220615])) & (train['건물유형'].isin(['공공','병원'])))]
train['holiday'] = 0
test['holiday'] = 0
train.loc[train['dates'].isin([20220606,20220615,20220815]), 'holiday'] = 1

train.drop(['dates'], axis=1, inplace=True)
train.shape

(202528, 23)


(202528, 24)

In [16]:
## https://dacon.io/competitions/official/235736/codeshare/2743?page=1&dtype=recent
train['THI'] = 9/5*train['기온(C)'] - 0.55*(1-train['습도(%)']/100)*(9/5*train['습도(%)']-26)+32
test['THI'] = 9/5*test['기온(C)'] - 0.55*(1-test['습도(%)']/100)*(9/5*test['습도(%)']-26)+32

In [17]:
def CDH(xs):
    ys = []
    for i in range(len(xs)):
        if i < 11:
            ys.append(np.sum(xs[:(i+1)]-26))
        else:
            ys.append(np.sum(xs[(i-11):(i+1)]-26))
    return np.array(ys)

cdhs = np.array([])

for num in train['건물번호'].unique():
    temp = train[train['건물번호'] == num]
    cdh = CDH(temp['기온(C)'].values)
    cdhs = np.concatenate([cdhs, cdh])
train['CDH'] = cdhs

cdhs = np.array([])
for num in test['건물번호'].unique():
    temp = test[test['건물번호'] == num]
    cdh = CDH(temp['기온(C)'].values)
    cdhs = np.concatenate([cdhs, cdh])
test['CDH'] = cdhs

In [18]:
############## 체감온도, https://www.weather.go.kr/plus/life/li_asset/HELP/basic/help_01_07.jsp
train['체감온도'] = 13.12 + 0.6215*train['기온(C)'] - 11.37*(train['풍속(m/s)']*3.6)**0.16 + 0.3965*(train['풍속(m/s)']*3.6)**0.16*train['기온(C)']
test['체감온도'] = 13.12 + 0.6215*test['기온(C)'] - 11.37*(test['풍속(m/s)']*3.6)**0.16 + 0.3965*(test['풍속(m/s)']*3.6)**0.16*test['기온(C)']

In [19]:
train.drop('풍속(m/s)', axis=1, inplace=True)
test.drop('풍속(m/s)', axis=1, inplace=True)

In [20]:
# train_x = train.drop(columns=['num_date_time', '일시', '일조(hr)', '일사(MJ/m2)', '전력소비량(kWh)'])
# train_y = train['전력소비량(kWh)']

# test_x = test.drop(columns=['num_date_time', '일시'])

# X_train, X_test, y_train, y_test = train_test_split(train_x , train_y ,test_size=0.2, shuffle=True, random_state=42, stratify=train_x['건물번호'])
# print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

In [21]:
days = [18, 19, 20, 21, 22, 23, 24]

X_train = train[~((train['month'] == 8) & (train['day'].isin(days)))].drop(columns=['num_date_time', '일시', '일조(hr)', '일사(MJ/m2)', '전력소비량(kWh)'])
y_train = train[~((train['month'] == 8) & (train['day'].isin(days)))]['전력소비량(kWh)']

X_test = train[(train['month'] == 8) & (train['day'].isin(days))].drop(columns=['num_date_time', '일시', '일조(hr)', '일사(MJ/m2)', '전력소비량(kWh)'])
y_test = train[(train['month'] == 8) & (train['day'].isin(days))]['전력소비량(kWh)']


print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(185728, 21) (16800, 21) (185728,) (16800,)


In [22]:
#결측값을 0으로 채웁니다
X_train = X_train.fillna(0)
X_test = X_test.fillna(0)

## Regression Model Fit

In [23]:
#!pip install optuna
import optuna

In [111]:
def objective(trial):
    params = {
        "random_state": 42,
        "n_estimators": trial.suggest_int("n_estimators", 70, 120),
        "max_depth": trial.suggest_int("max_depth", 5, 25)
    }
    
    model = RandomForestRegressor(**params)
    print(model)
    model.fit(temp_X_train, temp_y_train)
    y_preds = model.predict(temp_X_test)
    return smape(temp_y_test, y_preds)

In [112]:
%%time 
metrics_type = pd.DataFrame()
for i in train['건물유형'].unique():
    temp_X_train = X_train[X_train['건물유형'] == i].drop(['건물유형'], axis=1)
    temp_y_train = y_train.loc[temp_X_train.index]
    temp_X_test = X_test[X_test['건물유형'] == i].drop(['건물유형'], axis=1)
    temp_y_test = y_test.loc[temp_X_test.index]
    
#     model = RandomForestRegressor(random_state=42)
#     model.fit(temp_X_train, temp_y_train)
#     y_preds = model.predict(temp_X_test)
#     temp = pd.DataFrame([i, smape(temp_y_test, y_preds)]).T
#     temp.columns = ['건물유형', '유형별_score']
    
    study = optuna.create_study()
    study.optimize(objective, n_trials=10)
    optuna.logging.set_verbosity(optuna.logging.CRITICAL)
    temp = pd.DataFrame([i, study.best_value, study.best_trial.params]).T
    temp.columns = ['건물유형', '유형별_score', '유형별_param']
    
    metrics_type = metrics_type.append(temp)
#     display(temp)
#     vis_feature_importances_(model, temp_X_train)    
    

RandomForestRegressor(max_depth=15, min_samples_split=0.047011378408343796,
                      n_estimators=108, random_state=42)
RandomForestRegressor(max_depth=17, min_samples_split=0.11266369862000802,
                      n_estimators=110, random_state=42)
RandomForestRegressor(max_depth=7, min_samples_split=0.37689940540858136,
                      n_estimators=94, random_state=42)
RandomForestRegressor(max_depth=15, min_samples_split=0.9575349552207315,
                      n_estimators=115, random_state=42)
RandomForestRegressor(max_depth=13, min_samples_split=0.7176489842863294,
                      n_estimators=72, random_state=42)
RandomForestRegressor(max_depth=20, min_samples_split=0.17943251436564056,
                      n_estimators=112, random_state=42)
RandomForestRegressor(max_depth=12, min_samples_split=0.5117399916614808,
                      n_estimators=87, random_state=42)
RandomForestRegressor(max_depth=24, min_samples_split=0.688338394137048,
         

RandomForestRegressor(max_depth=16, min_samples_split=0.09597293495480652,
                      n_estimators=103, random_state=42)
RandomForestRegressor(max_depth=19, min_samples_split=0.9683261044550276,
                      n_estimators=106, random_state=42)
RandomForestRegressor(max_depth=20, min_samples_split=0.5700340860160561,
                      n_estimators=117, random_state=42)
RandomForestRegressor(max_depth=13, min_samples_split=0.5914005707191317,
                      n_estimators=102, random_state=42)
RandomForestRegressor(max_depth=8, min_samples_split=0.9675535495212181,
                      n_estimators=102, random_state=42)
RandomForestRegressor(max_depth=21, min_samples_split=0.07348561937118725,
                      n_estimators=118, random_state=42)
RandomForestRegressor(max_depth=10, min_samples_split=0.3786801926624356,
                      n_estimators=115, random_state=42)
RandomForestRegressor(max_depth=25, min_samples_split=0.2746842337813402,
        

In [113]:
%%time 
metrics_bno = pd.DataFrame()
for i in train['건물번호'].unique():

    temp_X_train = X_train[X_train['건물번호'] == i].drop(['건물번호','건물유형'], axis=1)
    temp_y_train = y_train.loc[temp_X_train.index]
    temp_X_test = X_test[X_test['건물번호'] == i].drop(['건물번호','건물유형'], axis=1)
    temp_y_test = y_test.loc[temp_X_test.index]
        
#     model = RandomForestRegressor(random_state=42)
#     model.fit(temp_X_train, temp_y_train)
#     y_preds = model.predict(temp_X_test)
#     temp = pd.DataFrame([i, smape(temp_y_test, y_preds)]).T

    study = optuna.create_study()
    study.optimize(objective, n_trials=10)
    optuna.logging.set_verbosity(optuna.logging.CRITICAL)
    temp = pd.DataFrame([i, study.best_value, study.best_trial.params]).T
    temp.columns = ['건물번호', '건물별_score', '건물별_param']
    
    metrics_bno = metrics_bno.append(temp)
#     display(temp)
#     vis_feature_importances_(model, temp_X_train)    
#     print()

RandomForestRegressor(max_depth=14, min_samples_split=0.0892407500617679,
                      n_estimators=120, random_state=42)
RandomForestRegressor(max_depth=6, min_samples_split=0.3163859392025713,
                      n_estimators=84, random_state=42)
RandomForestRegressor(max_depth=9, min_samples_split=0.32535379944521037,
                      n_estimators=94, random_state=42)
RandomForestRegressor(max_depth=17, min_samples_split=0.8406389818960457,
                      n_estimators=110, random_state=42)
RandomForestRegressor(max_depth=20, min_samples_split=0.7363638106350939,
                      n_estimators=88, random_state=42)
RandomForestRegressor(max_depth=23, min_samples_split=0.8916826738391177,
                      n_estimators=79, random_state=42)
RandomForestRegressor(max_depth=12, min_samples_split=0.3136833522891648,
                      n_estimators=103, random_state=42)
RandomForestRegressor(max_depth=25, min_samples_split=0.4720858208747013,
              

RandomForestRegressor(max_depth=12, min_samples_split=0.25758799274732946,
                      n_estimators=116, random_state=42)
RandomForestRegressor(max_depth=20, min_samples_split=0.5236845761344554,
                      n_estimators=97, random_state=42)
RandomForestRegressor(max_depth=17, min_samples_split=0.5352942373784392,
                      n_estimators=120, random_state=42)
RandomForestRegressor(max_depth=5, min_samples_split=0.9021702900105918,
                      n_estimators=108, random_state=42)
RandomForestRegressor(max_depth=16, min_samples_split=0.8179867518620392,
                      n_estimators=119, random_state=42)
RandomForestRegressor(max_depth=18, min_samples_split=0.6516241481637205,
                      n_estimators=72, random_state=42)
RandomForestRegressor(max_depth=21, min_samples_split=0.7038512386005006,
                      n_estimators=110, random_state=42)
RandomForestRegressor(max_depth=19, min_samples_split=0.03738166649310748,
          

RandomForestRegressor(max_depth=12, min_samples_split=0.9698673838019747,
                      n_estimators=83, random_state=42)
RandomForestRegressor(max_depth=5, min_samples_split=0.5029078171977417,
                      n_estimators=78, random_state=42)
RandomForestRegressor(max_depth=10, min_samples_split=0.5969672482313595,
                      n_estimators=95, random_state=42)
RandomForestRegressor(max_depth=21, min_samples_split=0.06709211717689079,
                      n_estimators=71, random_state=42)
RandomForestRegressor(max_depth=25, min_samples_split=0.2913473523911573,
                      n_estimators=114, random_state=42)
RandomForestRegressor(max_depth=14, min_samples_split=0.05469836876197054,
                      n_estimators=90, random_state=42)
RandomForestRegressor(max_depth=25, min_samples_split=0.6220739731299019,
                      n_estimators=110, random_state=42)
RandomForestRegressor(max_depth=10, min_samples_split=0.946377346446522,
              

RandomForestRegressor(max_depth=18, min_samples_split=0.6775794656841253,
                      n_estimators=74, random_state=42)
RandomForestRegressor(max_depth=12, min_samples_split=0.6242896287892763,
                      n_estimators=98, random_state=42)
RandomForestRegressor(max_depth=6, min_samples_split=0.3517863376808503,
                      n_estimators=92, random_state=42)
RandomForestRegressor(max_depth=16, min_samples_split=0.3623836446479364,
                      n_estimators=76, random_state=42)
RandomForestRegressor(max_depth=7, min_samples_split=0.25615248564044624,
                      n_estimators=118, random_state=42)
RandomForestRegressor(max_depth=25, min_samples_split=0.24500539611295336,
                      n_estimators=72, random_state=42)
RandomForestRegressor(max_depth=23, min_samples_split=0.08706142077270596,
                      random_state=42)
RandomForestRegressor(max_depth=25, min_samples_split=0.8057453185765439,
                      n_estimat

RandomForestRegressor(max_depth=12, min_samples_split=0.04545452581385656,
                      n_estimators=91, random_state=42)
RandomForestRegressor(max_depth=20, min_samples_split=0.22153058402790637,
                      n_estimators=75, random_state=42)
RandomForestRegressor(max_depth=16, min_samples_split=0.4012844661133469,
                      n_estimators=90, random_state=42)
RandomForestRegressor(max_depth=5, min_samples_split=0.9947903514669513,
                      n_estimators=83, random_state=42)
RandomForestRegressor(max_depth=15, min_samples_split=0.23310496914549161,
                      n_estimators=105, random_state=42)
RandomForestRegressor(max_depth=19, min_samples_split=0.6189753080535818,
                      n_estimators=77, random_state=42)
RandomForestRegressor(max_depth=6, min_samples_split=0.8007900360641431,
                      n_estimators=111, random_state=42)
RandomForestRegressor(max_depth=25, min_samples_split=0.60447289689864,
               

RandomForestRegressor(max_depth=15, min_samples_split=0.018448616708266363,
                      n_estimators=83, random_state=42)
RandomForestRegressor(max_depth=15, min_samples_split=0.004632872074423755,
                      n_estimators=76, random_state=42)
RandomForestRegressor(max_depth=5, min_samples_split=0.8969410901332152,
                      n_estimators=85, random_state=42)
RandomForestRegressor(max_depth=7, min_samples_split=0.4595393739752863,
                      n_estimators=87, random_state=42)
RandomForestRegressor(max_depth=17, min_samples_split=0.20443126787515808,
                      n_estimators=76, random_state=42)
RandomForestRegressor(max_depth=20, min_samples_split=0.19741500673795664,
                      n_estimators=82, random_state=42)
RandomForestRegressor(max_depth=21, min_samples_split=0.8896987612214157,
                      n_estimators=106, random_state=42)
RandomForestRegressor(max_depth=24, min_samples_split=0.029659536816519783,
         

RandomForestRegressor(max_depth=7, min_samples_split=0.4899009989623607,
                      n_estimators=118, random_state=42)
RandomForestRegressor(max_depth=21, min_samples_split=0.8857004299852447,
                      n_estimators=108, random_state=42)
RandomForestRegressor(max_depth=24, min_samples_split=0.817924511605091,
                      n_estimators=98, random_state=42)
RandomForestRegressor(max_depth=20, min_samples_split=0.06748153075362262,
                      n_estimators=82, random_state=42)
RandomForestRegressor(max_depth=10, min_samples_split=0.9128176358001898,
                      n_estimators=108, random_state=42)
RandomForestRegressor(max_depth=10, min_samples_split=0.19756527880226082,
                      n_estimators=104, random_state=42)
RandomForestRegressor(max_depth=6, min_samples_split=0.9415585112598196,
                      n_estimators=97, random_state=42)
RandomForestRegressor(max_depth=10, min_samples_split=0.8520237051934225,
             

RandomForestRegressor(max_depth=10, min_samples_split=0.5987136480457926,
                      n_estimators=111, random_state=42)
RandomForestRegressor(max_depth=25, min_samples_split=0.5735901564143344,
                      n_estimators=79, random_state=42)
RandomForestRegressor(max_depth=21, min_samples_split=0.47213426463005503,
                      n_estimators=87, random_state=42)
RandomForestRegressor(max_depth=24, min_samples_split=0.12465161058945262,
                      n_estimators=113, random_state=42)
RandomForestRegressor(max_depth=24, min_samples_split=0.43153340458442324,
                      n_estimators=113, random_state=42)
RandomForestRegressor(max_depth=9, min_samples_split=0.7329641643356875,
                      n_estimators=109, random_state=42)
RandomForestRegressor(max_depth=7, min_samples_split=0.8420422464589826,
                      random_state=42)
RandomForestRegressor(max_depth=19, min_samples_split=0.5528663736124112,
                      n_esti

RandomForestRegressor(max_depth=9, min_samples_split=0.7428384571077299,
                      n_estimators=87, random_state=42)
RandomForestRegressor(max_depth=7, min_samples_split=0.07826727498118313,
                      n_estimators=120, random_state=42)
RandomForestRegressor(max_depth=14, min_samples_split=0.8009475440701144,
                      random_state=42)
RandomForestRegressor(max_depth=23, min_samples_split=0.406918676475304,
                      n_estimators=79, random_state=42)
RandomForestRegressor(max_depth=22, min_samples_split=0.6770447023911638,
                      n_estimators=85, random_state=42)
RandomForestRegressor(max_depth=25, min_samples_split=0.9188401705260603,
                      n_estimators=99, random_state=42)
RandomForestRegressor(max_depth=9, min_samples_split=0.6074883052114156,
                      n_estimators=80, random_state=42)
RandomForestRegressor(max_depth=15, min_samples_split=0.9452328701031811,
                      n_estimators=

RandomForestRegressor(max_depth=17, min_samples_split=0.9097757658661126,
                      n_estimators=117, random_state=42)
RandomForestRegressor(max_depth=8, min_samples_split=0.5310984157321954,
                      n_estimators=70, random_state=42)
RandomForestRegressor(max_depth=6, min_samples_split=0.39003577449872706,
                      n_estimators=97, random_state=42)
RandomForestRegressor(max_depth=13, min_samples_split=0.8182791099491828,
                      n_estimators=101, random_state=42)
RandomForestRegressor(max_depth=20, min_samples_split=0.663977645072652,
                      n_estimators=119, random_state=42)
RandomForestRegressor(max_depth=15, min_samples_split=0.127644636636138,
                      n_estimators=80, random_state=42)
RandomForestRegressor(max_depth=13, min_samples_split=0.4803292298262519,
                      n_estimators=110, random_state=42)
RandomForestRegressor(max_depth=23, min_samples_split=0.5441611482722551,
               

RandomForestRegressor(max_depth=24, min_samples_split=0.8425721074897737,
                      n_estimators=94, random_state=42)
RandomForestRegressor(max_depth=25, min_samples_split=0.32036839004185136,
                      n_estimators=77, random_state=42)
RandomForestRegressor(max_depth=17, min_samples_split=0.44035117792867773,
                      n_estimators=115, random_state=42)
RandomForestRegressor(max_depth=23, min_samples_split=0.1445498618430433,
                      n_estimators=112, random_state=42)
RandomForestRegressor(max_depth=12, min_samples_split=0.003232284865235413,
                      n_estimators=86, random_state=42)
RandomForestRegressor(max_depth=10, min_samples_split=0.34796970651011583,
                      n_estimators=85, random_state=42)
RandomForestRegressor(max_depth=20, min_samples_split=0.5963776130950175,
                      n_estimators=79, random_state=42)
RandomForestRegressor(max_depth=18, min_samples_split=0.8087271366009072,
         

RandomForestRegressor(max_depth=18, min_samples_split=0.29338264658393287,
                      n_estimators=103, random_state=42)
RandomForestRegressor(max_depth=14, min_samples_split=0.9742315388511171,
                      random_state=42)
RandomForestRegressor(max_depth=16, min_samples_split=0.556843103535987,
                      n_estimators=73, random_state=42)
RandomForestRegressor(max_depth=6, min_samples_split=0.6097351445418586,
                      n_estimators=73, random_state=42)
RandomForestRegressor(max_depth=21, min_samples_split=0.6650378642109296,
                      n_estimators=78, random_state=42)
RandomForestRegressor(max_depth=9, min_samples_split=0.21633267450302252,
                      n_estimators=87, random_state=42)
RandomForestRegressor(max_depth=10, min_samples_split=0.14696529075744447,
                      n_estimators=86, random_state=42)
RandomForestRegressor(max_depth=6, min_samples_split=0.8910217135672424,
                      n_estimator

RandomForestRegressor(max_depth=14, min_samples_split=0.6037996425313953,
                      n_estimators=73, random_state=42)
RandomForestRegressor(max_depth=11, min_samples_split=0.25889115309837474,
                      n_estimators=78, random_state=42)
RandomForestRegressor(max_depth=7, min_samples_split=0.34835075245238434,
                      n_estimators=107, random_state=42)
RandomForestRegressor(max_depth=19, min_samples_split=0.10331269404882137,
                      n_estimators=111, random_state=42)
RandomForestRegressor(max_depth=21, min_samples_split=0.962608945849689,
                      n_estimators=90, random_state=42)
RandomForestRegressor(max_depth=9, min_samples_split=0.26240993983720207,
                      n_estimators=105, random_state=42)
RandomForestRegressor(max_depth=17, min_samples_split=0.8897794462899272,
                      n_estimators=76, random_state=42)
RandomForestRegressor(max_depth=16, min_samples_split=0.22844311771587977,
           

RandomForestRegressor(max_depth=12, min_samples_split=0.9831774660198386,
                      n_estimators=114, random_state=42)
RandomForestRegressor(max_depth=17, min_samples_split=0.9534822809661168,
                      n_estimators=85, random_state=42)
RandomForestRegressor(max_depth=11, min_samples_split=0.5231244748597605,
                      n_estimators=70, random_state=42)
RandomForestRegressor(max_depth=25, min_samples_split=0.3575501097734777,
                      n_estimators=118, random_state=42)
RandomForestRegressor(max_depth=6, min_samples_split=0.18084517025712799,
                      n_estimators=73, random_state=42)
RandomForestRegressor(max_depth=25, min_samples_split=0.006308226748038659,
                      n_estimators=101, random_state=42)
RandomForestRegressor(max_depth=12, min_samples_split=0.6571303351503103,
                      n_estimators=87, random_state=42)
RandomForestRegressor(max_depth=5, min_samples_split=0.20746576992473653,
           

RandomForestRegressor(max_depth=10, min_samples_split=0.3836520479553288,
                      n_estimators=87, random_state=42)
RandomForestRegressor(max_depth=13, min_samples_split=0.42880364126940596,
                      n_estimators=103, random_state=42)
RandomForestRegressor(max_depth=5, min_samples_split=0.17232854777577533,
                      n_estimators=94, random_state=42)
RandomForestRegressor(max_depth=5, min_samples_split=0.6439572919887829,
                      n_estimators=93, random_state=42)
RandomForestRegressor(max_depth=19, min_samples_split=0.8023474332526261,
                      n_estimators=120, random_state=42)
RandomForestRegressor(max_depth=16, min_samples_split=0.994355320668841,
                      n_estimators=71, random_state=42)
RandomForestRegressor(max_depth=17, min_samples_split=0.12950371053923215,
                      n_estimators=80, random_state=42)
RandomForestRegressor(max_depth=24, min_samples_split=0.9551734357056599,
              

RandomForestRegressor(max_depth=25, min_samples_split=0.7180888431151375,
                      n_estimators=92, random_state=42)
RandomForestRegressor(max_depth=25, min_samples_split=0.16324432530992994,
                      n_estimators=87, random_state=42)
RandomForestRegressor(max_depth=8, min_samples_split=0.9129192052675025,
                      n_estimators=75, random_state=42)
RandomForestRegressor(max_depth=9, min_samples_split=0.3472807162373962,
                      n_estimators=90, random_state=42)
RandomForestRegressor(max_depth=19, min_samples_split=0.5529389149878191,
                      n_estimators=79, random_state=42)
RandomForestRegressor(max_depth=25, min_samples_split=0.4737121746896351,
                      n_estimators=78, random_state=42)
RandomForestRegressor(max_depth=20, min_samples_split=0.12695063272580054,
                      n_estimators=101, random_state=42)
RandomForestRegressor(max_depth=6, min_samples_split=0.09523547804026755,
               

In [114]:
metrics_bno['건물번호'] = metrics_bno['건물번호'].astype(int)

In [115]:
metrics_bno = metrics_bno.merge(building_info[['건물유형','건물번호']], on=['건물번호'])

In [116]:
metrics = metrics_bno.merge(metrics_type, on=['건물유형'])
metrics[:2]

Unnamed: 0,건물번호,건물별_score,건물별_param,건물유형,유형별_score,유형별_param
0,1,8.811794,"{'n_estimators': 120, 'max_depth': 14, 'min_sa...",건물기타,17.322529,"{'n_estimators': 108, 'max_depth': 15, 'min_sa..."
1,2,8.092768,"{'n_estimators': 118, 'max_depth': 8, 'min_sam...",건물기타,17.322529,"{'n_estimators': 108, 'max_depth': 15, 'min_sa..."


In [117]:
# 유형보다 건물이 더 좋은 스코어를 가지는 건물번호 list
bno = metrics[metrics['건물별_score'] < metrics['유형별_score']]['건물번호'].values
len(bno), bno

(81,
 array([ 1,  2,  3,  4,  5,  6,  8,  9, 10, 11, 12, 13, 14, 15, 16, 17, 18,
        19, 20, 21, 22, 23, 25, 26, 27, 28, 29, 30, 32, 33, 35, 36, 38, 39,
        41, 42, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 55, 56, 57, 58, 59,
        60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 73, 76, 77, 78, 79, 80,
        81, 82, 83, 84, 85, 86, 88, 89, 90, 91, 92, 96, 99]))

In [118]:
metrics_type['유형별_score'].mean(), metrics_bno['건물별_score'].mean()

(13.0683367361839, 6.96299198189438)

In [119]:
# 가중 평균 
metrics_type['유형별_score'].mean() * ((100-len(bno))/100) + metrics_bno['건물별_score'].mean() * (len(bno)/100)

8.123007485209389

In [33]:
# 5.003717955344962

In [34]:
metrics['score'] = metrics['유형별_score'] 
metrics.loc[metrics['건물번호'].isin(bno), 'score'] = metrics['건물별_score']

In [35]:
metrics['건물이낫다'] = metrics['건물별_score'] < metrics['유형별_score']

In [36]:
# metrics.to_excel('오차큰것부터_20230822-3.xlsx')

## Inference & Submit

In [58]:
# %%time
# result_type = pd.DataFrame()
# for i in train['건물유형'].unique():
    
#     temp_X_train = train[train['건물유형'] == i].drop(['건물유형','num_date_time', '일시', '일조(hr)', '일사(MJ/m2)', '전력소비량(kWh)'], axis=1)
#     temp_y_train = train.loc[temp_X_train.index][['전력소비량(kWh)']]
    
#     model = RandomForestRegressor(n_estimators=metrics_type[metrics_type['건물유형'] == i]['유형별_param'][0]['n_estimators']
#                                   , max_depth=metrics_type[metrics_type['건물유형'] == i]['유형별_param'][0]['max_depth']
#                                   , min_samples_leaf=metrics_type[metrics_type['건물유형'] == i]['유형별_param'][0]['min_samples_leaf']
#                                   , min_samples_split=metrics_type[metrics_type['건물유형'] == i]['유형별_param'][0]['min_samples_split']
#                                   , random_state=42)
#     print(model)
#     model.fit(temp_X_train, temp_y_train)
                                                 
#     temp_test = test[test['건물유형'] == i].drop(['건물유형','num_date_time', '일시'], axis=1)
#     y_preds = model.predict(temp_test)
#     result_type = result_type.append( pd.concat([test[test['건물유형'] == i].reset_index(), pd.DataFrame(y_preds, columns=['answer_유형별']).reset_index()], axis=1) )
    
# result_type.shape    

KeyError: 'max_depth'

In [38]:
# %%time
# result_bno = pd.DataFrame()
# for i in train['건물번호'].unique():
    
#     temp_X_train = train[train['건물번호'] == i].drop(['건물번호','건물유형','num_date_time', '일시', '일조(hr)', '일사(MJ/m2)', '전력소비량(kWh)'], axis=1)
#     temp_y_train = train.loc[temp_X_train.index][['전력소비량(kWh)']]
    
#     model = RandomForestRegressor(n_estimators=metrics_bno[metrics_bno['건물번호'] == i]['건물별_param'].values[0]['n_estimators']
#                                   , max_depth=metrics_bno[metrics_bno['건물번호'] == i]['건물별_param'].values[0]['max_depth']
#                                   , min_samples_leaf=metrics_bno[metrics_bno['건물번호'] == i]['건물별_param'].values[0]['min_samples_leaf']
#                                   , min_samples_split=metrics_bno[metrics_bno['건물번호'] == i]['건물별_param'].values[0]['min_samples_split']
#                                   , random_state=42)
#     print(model)
#     model.fit(temp_X_train, temp_y_train)
                                                 
#     temp_test = test[test['건물번호'] == i].drop(['건물번호','건물유형','num_date_time', '일시'], axis=1)
#     y_preds = model.predict(temp_test)
#     result_bno = result_bno.append( pd.concat([test[test['건물번호'] == i].reset_index(), pd.DataFrame(y_preds, columns=['answer_건물별']).reset_index()], axis=1) )
    
# result_bno.shape    

In [39]:
result = result_type[['num_date_time','건물번호','answer_유형별','weekday', 'time']].merge(result_bno[['num_date_time','건물번호','answer_건물별']], on=['num_date_time','건물번호'])
result['answer'] = result['answer_유형별'] 
result.loc[result['건물번호'].isin(bno), 'answer'] = result['answer_건물별']

# 후처리 
postprocessing = train.groupby(['건물번호', 'weekday','time'])['전력소비량(kWh)'].min().reset_index()
result = result.merge(postprocessing, on=['건물번호', 'weekday','time'])
result.loc[result['answer'] < result['전력소비량(kWh)'], 'answer'] = result['전력소비량(kWh)']

result[['num_date_time','answer']].to_csv('20230827-2_r.csv', index=False)