In [1]:
import pandas as pd
import numpy as np
import random
import os

import matplotlib.pyplot as plt
import seaborn as sns

from tqdm import tqdm
from statsmodels.tsa.arima.model import ARIMA

from sklearn.ensemble import RandomForestRegressor

import warnings
warnings.filterwarnings("ignore")

In [2]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(42) # Seed 고정

## Data Load

from google.colab import drive
drive.mount('/content/drive')

#### colab
train = pd.read_csv('/content/drive/MyDrive/work/input/train.csv')
test = pd.read_csv('/content/drive/MyDrive/work/input/test.csv')
building_info = pd.read_csv('/content/drive/MyDrive/work/input/building_info.csv')
submission = pd.read_csv('/content/drive/MyDrive/work/input/sample_submission.csv')
train.shape, test.shape, building_info.shape, submission.shape

In [3]:
train = pd.read_csv('input/train.csv')
test = pd.read_csv('input/test.csv')
building_info = pd.read_csv('input/building_info.csv')
submission = pd.read_csv('input/sample_submission.csv')
train.shape, test.shape, building_info.shape, submission.shape

((204000, 10), (16800, 7), (100, 7), (16800, 2))

In [4]:
print(train.columns)
print(test.columns)
print(building_info.columns)

Index(['num_date_time', '건물번호', '일시', '기온(C)', '강수량(mm)', '풍속(m/s)', '습도(%)',
       '일조(hr)', '일사(MJ/m2)', '전력소비량(kWh)'],
      dtype='object')
Index(['num_date_time', '건물번호', '일시', '기온(C)', '강수량(mm)', '풍속(m/s)', '습도(%)'], dtype='object')
Index(['건물번호', '건물유형', '연면적(m2)', '냉방면적(m2)', '태양광용량(kW)', 'ESS저장용량(kWh)',
       'PCS용량(kW)'],
      dtype='object')


## Train Data Pre-Processing

In [5]:
# 빌딩 데이터 정리
## null 처리
building_info['태양광용량(kW)'] = building_info['태양광용량(kW)'].str.replace('-','0').astype(float)
building_info['ESS저장용량(kWh)'] = building_info['ESS저장용량(kWh)'].str.replace('-','0').astype(float)
building_info['PCS용량(kW)'] = building_info['PCS용량(kW)'].str.replace('-','0').astype(float)

In [6]:
# 정리된 빌딩과 merge
train = train.merge(building_info, on='건물번호')
test = test.merge(building_info, on='건물번호')

In [7]:
#시계열 특성을 학습에 반영하기 위해 일시를 월, 일, 시간으로 나눕니다
train['month'] = train['일시'].apply(lambda x : int(x[4:6]))
train['day'] = train['일시'].apply(lambda x : int(x[6:8]))
train['time'] = train['일시'].apply(lambda x : int(x[9:11]))

test['month'] = test['일시'].apply(lambda x : int(x[4:6]))
test['day'] = test['일시'].apply(lambda x : int(x[6:8]))
test['time'] = test['일시'].apply(lambda x : int(x[9:11]))

In [8]:
# 요일 2023-07-17-2
train['weekday'] =  pd.to_datetime(train['일시'].str[:8]).dt.weekday
test['weekday'] =  pd.to_datetime(test['일시'].str[:8]).dt.weekday

In [9]:
# 2023-07-18-3 
# target_encoding
target_encoding = train.groupby(['건물번호','month', 'weekday','time']).agg({'전력소비량(kWh)':['min', 'mean','max']}).reset_index()
target_encoding.columns = ['건물번호','month', 'weekday','time', 'khw_min', 'khw_mean', 'khw_max']
target_encoding['khw_min'] = target_encoding['khw_min'] * 0.01
target_encoding['khw_mean'] = target_encoding['khw_mean'] * 0.01
target_encoding['khw_max'] = target_encoding['khw_max'] * 0.01
train = train.merge(target_encoding, on=['건물번호','month', 'weekday','time'], how='left')
test = test.merge(target_encoding, on=['건물번호','month', 'weekday','time'], how='left')

# 온습도
train['기온_습도'] = train['기온(C)'] * train['습도(%)']
test['기온_습도'] = test['기온(C)'] * test['습도(%)']

# 비오는 날 
temp = train[train['건물번호'] == 1]#[:24]
temp['yyyymmdd'] = pd.to_datetime(temp['일시']).dt.strftime('%Y%m%d')
rain_day = pd.DataFrame(temp[temp['강수량(mm)'] > 0.1].yyyymmdd.unique(), columns=['yyyymmdd'])
rain_day['rain'] = 1

train['yyyymmdd'] = pd.to_datetime(train['일시']).dt.strftime('%Y%m%d')
train = train.merge(rain_day, on='yyyymmdd', how='left').fillna(0)
train.drop('yyyymmdd', axis=1, inplace=True)

temp = test[test['건물번호'] == 1]#[:24]
temp['yyyymmdd'] = pd.to_datetime(temp['일시']).dt.strftime('%Y%m%d')
rain_day = pd.DataFrame(temp[temp['강수량(mm)'] > 0.1].yyyymmdd.unique(), columns=['yyyymmdd'])
rain_day['rain'] = 1

test['yyyymmdd'] = pd.to_datetime(test['일시']).dt.strftime('%Y%m%d')
test = test.merge(rain_day, on='yyyymmdd', how='left').fillna(0)
test.drop('yyyymmdd', axis=1, inplace=True)

# 기온 증감
train['diff'] = train['기온(C)'].diff().fillna(0)
test['diff'] = test['기온(C)'].diff().fillna(0)

# 기온 shift -2 
train['기온_shift2'] = train['기온(C)'].shift(-2)
train['기온_shift2'] = train['기온_shift2'].fillna(train['기온_shift2'].iloc[-3])

test['기온_shift2'] = test['기온(C)'].shift(-2)
test['기온_shift2'] = test['기온_shift2'].fillna(test['기온_shift2'].iloc[-3])

train.drop('기온(C)', axis=1, inplace=True)
test.drop('기온(C)', axis=1, inplace=True)

In [10]:
# 상관관계
#corr = train.corr()
#corr.to_excel('corr.xlsx')

In [11]:
train_x = train.drop(columns=['num_date_time', '일시', '일조(hr)', '일사(MJ/m2)', '전력소비량(kWh)'])
train_y = train['전력소비량(kWh)']

test_x = test.drop(columns=['num_date_time', '일시'])

In [12]:
# ont-hot
train_x = pd.get_dummies(train_x, columns=['건물유형','weekday', '태양광용량(kW)', 'ESS저장용량(kWh)','PCS용량(kW)'], drop_first=True)
test_x = pd.get_dummies(test_x, columns=['건물유형','weekday', '태양광용량(kW)', 'ESS저장용량(kWh)','PCS용량(kW)'], drop_first=True)

In [13]:
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.tree import DecisionTreeRegressor

In [14]:
e = ExtraTreesRegressor(n_jobs=-1, random_state=42)
d = DecisionTreeRegressor(random_state=42)
r = RandomForestRegressor(n_jobs=-1, random_state=42)

In [15]:
%%time
e.fit(train_x, train_y)
p1 = e.predict(test_x)

Wall time: 1min 49s


In [16]:
%%time
d.fit(train_x, train_y)
p2 = d.predict(test_x)

Wall time: 5.55 s


In [17]:
%%time
r.fit(train_x, train_y)
p3 = r.predict(test_x)

Wall time: 2min 4s


In [None]:
from sklearn.model_selection import train_test_split, KFold, cross_val_score
X_train, X_test, y_train, y_test = train_test_split(train_x , train_y ,test_size=0.2, shuffle=True, random_state=42)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

## Regression Model Fit

In [None]:
%%time
model = RandomForestRegressor(random_state=42)
model.fit(X_train, y_train)

In [None]:
y_preds = model.predict(X_test)

In [None]:
from sklearn.metrics import mean_absolute_percentage_error
mean_absolute_percentage_error(y_test, y_preds)

## Inference

In [None]:
model = RandomForestRegressor(random_state=42)
model.fit(train_x, train_y)

In [None]:
preds = model.predict(test_x)

## Submit

In [None]:
submission['answer'] = preds
submission

In [18]:
preds = (p1+p2+p3)/3
submission['answer'] = preds
submission

Unnamed: 0,num_date_time,answer
0,1_20220825 00,1890.030383
1,1_20220825 01,1785.873200
2,1_20220825 02,1660.972300
3,1_20220825 03,1590.257233
4,1_20220825 04,1651.570233
...,...,...
16795,100_20220831 19,1157.149700
16796,100_20220831 20,1007.652300
16797,100_20220831 21,961.301300
16798,100_20220831 22,804.941000


In [19]:
submission.to_csv('20230718-3.csv', index=False)