In [1]:
import pandas as pd
import numpy as np
import random
import os

from tqdm import tqdm

from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_percentage_error

import warnings
warnings.filterwarnings("ignore")

In [2]:
def seed_everything(seed):
    """Seed the global random number generators used in this notebook.

    Seeds Python's ``random`` module and NumPy's legacy global generator, and
    stores the seed (as a string) in the ``PYTHONHASHSEED`` environment variable.

    Args:
        seed: Integer seed applied to every generator.
    """
    # NOTE(review): PYTHONHASHSEED is normally read at interpreter start-up, so
    # setting it here presumably only affects child processes — confirm.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    random.seed(seed)


seed_everything(42)  # fix the seed for reproducibility

## Data Load

In [3]:
# Read the four competition CSVs from the local `input/` directory.
# On Google Colab, mount Drive first
# (`from google.colab import drive; drive.mount('/content/drive')`) and read the
# same four files from '/content/drive/MyDrive/work/input/' instead.
train, test, building_info, submission = (
    pd.read_csv(csv_path)
    for csv_path in (
        'input/train.csv',
        'input/test.csv',
        'input/building_info.csv',
        'input/sample_submission.csv',
    )
)

# Show the shape of every frame as a quick sanity check.
train.shape, test.shape, building_info.shape, submission.shape

((204000, 10), (16800, 7), (100, 7), (16800, 2))

## Train & Test Data Pre-Processing

In [4]:
# Clean up the building metadata.
# The three capacity columns use the placeholder '-' where no value is given;
# turn that placeholder into 0 and cast the columns to float.
# Series.replace matches whole cell values only, so a real number that happens
# to contain a hyphen (e.g. '-5' or '1e-3') is left intact, and the cell can be
# re-run after the columns are already numeric (the .str accessor would raise
# on a float column).
capacity_cols = ['태양광용량(kW)', 'ESS저장용량(kWh)', 'PCS용량(kW)']
for col in capacity_cols:
    building_info[col] = building_info[col].replace('-', '0').astype(float)

In [5]:
# Attach the cleaned building metadata to every train / test row.
# how='left' keeps every row of the left frame in its original order, so the
# test rows stay aligned one-to-one with the sample submission (predictions are
# written into it positionally) even if a building were missing from
# building_info; its metadata then becomes NaN instead of the row vanishing.
# validate='many_to_one' raises MergeError if building_info contains duplicate
# 건물번호 values, which would otherwise silently duplicate rows.
train = train.merge(building_info, on='건물번호', how='left', validate='many_to_one')
test = test.merge(building_info, on='건물번호', how='left', validate='many_to_one')

In [6]:
# Split the timestamp string into month, day and hour features so the model can
# pick up time-series patterns.
# The slice positions assume a 'YYYYMMDD HH' layout — TODO confirm against the data.
# Vectorised .str slicing replaces the per-row Python lambda, and a single loop
# keeps the train and test feature definitions identical.
for frame in (train, test):
    stamp = frame['일시']
    frame['month'] = stamp.str[4:6].astype('int64')
    frame['day'] = stamp.str[6:8].astype('int64')
    frame['time'] = stamp.str[9:11].astype('int64')  # hour part of the timestamp

In [7]:
# Day of week (pandas convention: Monday=0 ... Sunday=6), taken from the first
# eight characters of the timestamp.
# Passing an explicit format avoids relying on pandas' format inference for the
# 8-digit date string.
train['weekday'] = pd.to_datetime(train['일시'].str[:8], format='%Y%m%d').dt.weekday
test['weekday'] = pd.to_datetime(test['일시'].str[:8], format='%Y%m%d').dt.weekday

In [8]:
# Target encoding: min / mean / max of the power consumption per building type,
# computed on train and left-joined onto both train and test.
# NOTE(review): the statistics are computed from every training row, including
# rows that are later held out for validation — confirm this is acceptable.
target_col = '전력소비량(kWh)'
target_encoding = (
    train.groupby('건물유형')[target_col]
    .agg(['min', 'mean', 'max'])
    .reset_index()
)
target_encoding.columns = ['건물유형', 'khw_min', 'khw_mean', 'khw_max']

train = pd.merge(train, target_encoding, how='left', on='건물유형')
test = pd.merge(test, target_encoding, how='left', on='건물유형')

In [9]:
# Disabled experiment (every line below is commented out, so this cell does nothing).
# When active it would: take the rows of building 1, collect the dates on which
# 강수량(mm) exceeded 0.1, merge a binary `rain` flag for those dates onto the
# frame, and drop the helper `yyyymmdd` column together with 강수량(mm).
# The same steps are repeated for test.
# NOTE(review): the printed date ranges and `rain` value counts shown beneath
# this cell appear to be left over from a run when the code was still active —
# confirm, and clear the stale output or delete the cell.
# temp = train[train['건물번호'] == 1]
# temp['yyyymmdd'] = pd.to_datetime(temp['일시']).dt.strftime('%Y%m%d')
# rain_day = pd.DataFrame(temp[temp['강수량(mm)'] > 0.1].yyyymmdd.unique(), columns=['yyyymmdd'])
# rain_day['rain'] = 1

# train['yyyymmdd'] = pd.to_datetime(train['일시']).dt.strftime('%Y%m%d')
# train = train.merge(rain_day, on='yyyymmdd', how='left').fillna(0)
# train['rain'] = train['rain'].astype(int)
# print(train.yyyymmdd.min(), train.yyyymmdd.max())
# train.drop(['yyyymmdd','강수량(mm)'], axis=1, inplace=True)
# display(train['rain'].value_counts())

# temp = test[test['건물번호'] == 1]
# temp['yyyymmdd'] = pd.to_datetime(temp['일시']).dt.strftime('%Y%m%d')
# rain_day = pd.DataFrame(temp[temp['강수량(mm)'] > 0.1].yyyymmdd.unique(), columns=['yyyymmdd'])
# rain_day['rain'] = 1

# test['yyyymmdd'] = pd.to_datetime(test['일시']).dt.strftime('%Y%m%d')
# test = test.merge(rain_day, on='yyyymmdd', how='left').fillna(0)
# test['rain'] = test['rain'].astype(int)
# print(test.yyyymmdd.min(), test.yyyymmdd.max())
# test.drop(['yyyymmdd','강수량(mm)'], axis=1, inplace=True)
# display(test['rain'].value_counts())

20220601 20220824


0    105600
1     98400
Name: rain, dtype: int64

20220825 20220831


1    9600
0    7200
Name: rain, dtype: int64

In [12]:
# Build the model inputs and the target.
# The identifier and raw timestamp columns are dropped from both frames.
# train additionally loses the sunshine / solar-radiation columns and the target
# itself (presumably those columns do not exist in test — confirm).
id_cols = ['num_date_time', '일시']
train_only_cols = ['일조(hr)', '일사(MJ/m2)', '전력소비량(kWh)']

train_y = train['전력소비량(kWh)']
train_x = train.drop(columns=id_cols + train_only_cols)

test_x = test.drop(columns=id_cols)

In [13]:
# One-hot encode the building type (weekday is intentionally left out of the list
# and stays an integer feature).
one_hot_col = ['건물유형']#,'weekday']

# Encoding train and test independently lets each frame pick its own set of
# levels and its own drop_first baseline, so a level missing from one frame
# would produce mismatched (or silently mis-encoded) columns. Casting both
# frames to one categorical dtype built from the train levels makes get_dummies
# emit the same columns, in the same order, with the same dropped baseline.
# Levels are sorted to match the column order get_dummies produces for plain
# string columns; a level seen only in test becomes NaN and encodes as all zeros.
level_dtypes = {
    col: pd.CategoricalDtype(sorted(train_x[col].dropna().unique()))
    for col in one_hot_col
}
train_x = pd.get_dummies(train_x.astype(level_dtypes), columns=one_hot_col, drop_first=True)
test_x = pd.get_dummies(test_x.astype(level_dtypes), columns=one_hot_col, drop_first=True)

In [14]:
# Fill missing values with 0.
# test_x is filled the same way as train_x so that inference receives features
# prepared exactly like the training features; previously any NaN in test_x
# reached model.predict untouched.
train_x = train_x.fillna(0)
test_x = test_x.fillna(0)
# NOTE(review): a missing target is also replaced by 0 here; dropping such rows
# may be preferable because zero targets distort percentage errors — confirm.
train_y = train_y.fillna(0)

In [15]:
from sklearn.model_selection import train_test_split, KFold, cross_val_score

# Hold out a random 20% of the training rows for validation.
# NOTE(review): the rows are time-stamped hourly readings and are shuffled before
# splitting, so the hold-out rows come from the same period as the training
# rows; the resulting score may be optimistic for forecasting a future week —
# consider a time-based split.
X_train, X_test, y_train, y_test = train_test_split(
    train_x,
    train_y,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)
print(*(part.shape for part in (X_train, X_test, y_train, y_test)))

(163200, 28) (40800, 28) (163200,) (40800,)


## Regression Model Fit

In [16]:
%%time
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)
y_preds = model.predict(X_test)
mean_absolute_percentage_error(y_test, y_preds)

Wall time: 1min 35s


0.04086195531433143

## Inference & Submit

In [18]:
%%time
model = RandomForestRegressor(random_state=42)
model.fit(train_x, train_y)
preds = model.predict(test_x)
submission['answer'] = preds
submission.to_csv('20230719-3.csv', index=False)

Wall time: 1min 49s
