##### Symmetric Mean Absolute Percentage Error 정의

In [20]:
import numpy as np

def smape(true, predict):  # Symmetric Mean Absolute Percentage Error
    return 100/len(true) * np.sum(2 * np.abs(true - predict) / (np.abs(true) + np.abs(predict)))

##### 데이터 전처리

In [21]:
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
import datetime
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
import datetime


building_info = pd.read_csv('./data/building_info.csv')
test = pd.read_csv('./data/test.csv')
train = pd.read_csv('./data/train.csv')
sample = pd.read_csv('./data/sample_submission.csv')
display(train.head(1))
display(test.head(1))
display(building_info.head(1))

Unnamed: 0,num_date_time,건물번호,일시,기온(C),강수량(mm),풍속(m/s),습도(%),일조(hr),일사(MJ/m2),전력소비량(kWh)
0,1_20220601 00,1,20220601 00,18.6,,0.9,42.0,,,1085.28


Unnamed: 0,num_date_time,건물번호,일시,기온(C),강수량(mm),풍속(m/s),습도(%)
0,1_20220825 00,1,20220825 00,23.5,0.0,2.2,72


Unnamed: 0,건물번호,건물유형,연면적(m2),냉방면적(m2),태양광용량(kW),ESS저장용량(kWh),PCS용량(kW)
0,1,건물기타,110634.0,39570.0,-,-,-


In [22]:
building_info.replace('-', 0, inplace=True)  # 기존 데이터에서 Null 값이 - 로 표기되어 있는 부분 변환
onehotencoder = OneHotEncoder(sparse_output=False)
building_type = onehotencoder.fit_transform(building_info[['건물유형']])  # 건물유형을 onehotencoder를 활용하여 분석 가능한 데이터로 변환

temp = pd.DataFrame(building_type)
temp.columns = onehotencoder.categories_[0]

building = pd.concat([building_info, temp], axis=1)
del building['건물유형']
building = building.astype('float')
# building.head(1)

In [23]:
def data_preprocessing(data):  
    data.fillna(0, inplace=True)
    data['연'] = data['num_date_time'].apply(lambda x: int(x.split('_')[1][:4]))  # 연, 월, 일, 요일 데이터 생성
    data['월'] = data['num_date_time'].apply(lambda x: int(x.split('_')[1][4:6]))
    data['일'] = data['num_date_time'].apply(lambda x: int(x.split('_')[1][6:8]))
    data['시'] = data['num_date_time'].apply(lambda x: int(x.split(' ')[1]))
    data['요일'] = data['num_date_time'].apply(lambda x: datetime.date(int(x.split('_')[1][:4]), int(x.split('_')[1][4:6]), int(x.split('_')[1][6:8])).isoweekday())  # 월: 1, 일: 7
    del data['일시']
    del data['num_date_time']
    
    data = pd.merge(data, building, on='건물번호')
    del data['건물번호']
    display(data.head(3))
    return data

train.drop(['일조(hr)', '일사(MJ/m2)'], axis=1, inplace=True)
train = data_preprocessing(train)

Unnamed: 0,기온(C),강수량(mm),풍속(m/s),습도(%),전력소비량(kWh),연,월,일,시,요일,...,대학교,데이터센터,백화점및아울렛,병원,상용,아파트,연구소,지식산업센터,할인마트,호텔및리조트
0,18.6,0.0,0.9,42.0,1085.28,2022,6,1,0,3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,18.0,0.0,1.1,45.0,1047.36,2022,6,1,1,3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,17.7,0.0,1.5,45.0,974.88,2022,6,1,2,3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [24]:
y = train['전력소비량(kWh)']
del train['전력소비량(kWh)']

##### 변수 선정 코드

In [25]:
# # 변수 선정
# # 이거 자동화좀 해보자 계속 보고있는거 귀찮네
# # 자동화 작업 끝나면 XR 을 RFR로 변경해줘야함
# from itertools import *
# from sklearn.ensemble import RandomForestRegressor
# from sklearn.metrics import r2_score
# from sklearn.model_selection import train_test_split
# from tqdm import tqdm
# from xgboost import XGBRegressor

# def modeling(X_data, y_data):
#     X_train, X_test, y_train, y_test = train_test_split(X_data, y_data, test_size=0.4, random_state=987)
    
#     # XR = XGBRegressor()
#     # XR.fit(X_train, y_train)
#     # y_pred = XR.predict(X_test)
    
#     RFR = RandomForestRegressor()
#     RFR.fit(X_train, y_train)
#     y_pred = RFR.predict(X_test)
#     score = smape(y_test, y_pred)
#     return score


# origin_train = train.copy()
# # origin_train.drop(['연', '풍속(m/s)', '습도(%)', 'ESS저장용량(kWh)', '강수량(mm)'], axis=1, inplace=True) # XR 제거변수 선정
# # origin_train.drop(['연', '풍속(m/s)', '기온(C)', '습도(%)', '강수량(mm)', '태양광용량(kW)', 'PCS용량(kW)'], axis=1, inplace=True)  # RFR 제거변수 선정
# origin_train.drop(['연'], axis=1, inplace=True)

# start_score = modeling(origin_train, y)

# remove_columns = ['연']

# while True:
#     result = []
#     for i in tqdm(range(len(origin_train.columns) - 12)):
#         col = list(origin_train.columns[:i])
#         col.extend(origin_train.columns[i+1:])
#         X = origin_train[col]
        
#         score = modeling(X, y)
        
#         result.append((origin_train.columns[i], score))
        
#     result.sort(key=lambda x: x[1])
    
#     for i in result[:2]:
#         print(i)
#     print()
    
#     if result[0][1] < start_score:
#         print(f"{result[0][0]} 열을 제거한 결과 기존 score인 {start_score:.3f}보다 낮은 {result[0][1]:.3f} 값이 나왔습니다.")
#         remove_columns.append(result[0][0])
#         start_score = result[0][1]
#         origin_train.drop([result[0][0]], axis=1, inplace=True)
#     else:
#         print("현재 score보다 낮은 score 값을 가지는 후보가 없습니다.")
#         print("제거한 열은 ", end="")
#         for i in remove_columns[:-1]:
#             print(i, end=', ')
#         print(f"{remove_columns[-1]} 입니다.")
#         print("반복문을 종료합니다.")
#         break

In [30]:
import seaborn as sns
import matplotlib.pyplot as plt

# cmap = sns.light_palette("darkgray", as_cmap=True)
# sns.heatmap(data.corr(), annot=True, cmap=cmap)
# plt.show()
data[data.columns[:-12]].corr()

Unnamed: 0,월,일,시,요일,연면적(m2),냉방면적(m2),ESS저장용량(kWh)
월,1.0,-0.1333186,-8.672975e-17,-0.0377367,5.220941e-15,5.22373e-15,6.512695e-15
일,-0.1333186,1.0,2.2105570000000003e-17,0.02227065,-5.970802e-16,-6.037663e-16,1.905329e-16
시,-8.672975e-17,2.2105570000000003e-17,1.0,-2.866971e-18,-3.897512e-16,-3.556055e-16,-6.568382e-17
요일,-0.0377367,0.02227065,-2.866971e-18,1.0,1.69289e-15,1.685817e-15,6.936187e-16
연면적(m2),5.220941e-15,-5.970802e-16,-3.897512e-16,1.69289e-15,1.0,0.9988839,-0.02173528
냉방면적(m2),5.22373e-15,-6.037663e-16,-3.556055e-16,1.685817e-15,0.9988839,1.0,-0.01920016
ESS저장용량(kWh),6.512695e-15,1.905329e-16,-6.568382e-17,6.936187e-16,-0.02173528,-0.01920016,1.0


##### 모델 학습

In [31]:
columns = ['월', '일', '시', '요일', '냉방면적(m2)', 'ESS저장용량(kWh)', '건물기타', '공공', '대학교', '데이터센터',
           '백화점및아울렛', '병원', '상용', '아파트', '연구소', '지식산업센터', '할인마트', '호텔및리조트']

data = train[columns]

X_train, X_test, y_train, y_test = train_test_split(data, y, test_size=0.25, random_state=987)

In [32]:
RFR = RandomForestRegressor()
RFR.fit(X_train, y_train)
y_pred = RFR.predict(X_test)
smape(y_test, y_pred)

6.073335556817791

In [15]:
# from xgboost import XGBRegressor

# XR = XGBRegressor()
# XR.fit(X_train, y_train)
# y_pred = XR.predict(X_test)
# smape(y_test, y_pred)

# from lightgbm import LGBMRegressor

# LR = LGBMRegressor()
# LR.fit(X_train, y_train)
# y_pred = LR.predict(X_test)
# smape(y_test, y_pred)

##### 제출 자료 생성

In [19]:
test = pd.read_csv('./data/test.csv')
test_data = data_preprocessing(test)
test_data = test_data[columns]
answer = RFR.predict(test_data)

submission = pd.DataFrame()
submission['num_date_time'] = sample['num_date_time']
submission['answer'] = answer

now = datetime.datetime.now().strftime("%Y-%m-%d_%H시%M분%S초")
submission.to_csv(f'{now}.csv', index=False)

Unnamed: 0,기온(C),강수량(mm),풍속(m/s),습도(%),연,월,일,시,요일,연면적(m2),...,대학교,데이터센터,백화점및아울렛,병원,상용,아파트,연구소,지식산업센터,할인마트,호텔및리조트
0,23.5,0.0,2.2,72,2022,8,25,0,4,110634.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,23.0,0.0,0.9,72,2022,8,25,1,4,110634.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,22.7,0.0,1.5,75,2022,8,25,2,4,110634.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
