<a href="https://colab.research.google.com/github/noweahc/energy/blob/main/RandomForest.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [33]:
import random
import pandas as pd
import numpy as np
import os

from sklearn.ensemble import RandomForestRegressor

import warnings
warnings.filterwarnings(action='ignore')

In [34]:
def seed_everything(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

seed_everything(42) # Seed 고정

In [35]:
train = pd.read_csv('/content/drive/MyDrive/23energy/23data/train.csv',encoding = 'utf-8')
test = pd.read_csv('/content/drive/MyDrive/23energy/23data/test.csv',encoding = 'utf-8')
building = pd.read_csv('/content/drive/MyDrive/23energy/23data/building_info.csv',encoding = 'utf-8')

In [36]:
train = train.rename(columns={
    '건물번호': 'building_number',
    '일시': 'date_time',
    '기온(C)': 'temperature',
    '강수량(mm)': 'rainfall',
    '풍속(m/s)': 'windspeed',
    '습도(%)': 'humidity',
    '일조(hr)': 'sunshine',
    '일사(MJ/m2)': 'solar_radiation',
    '전력소비량(kWh)': 'power_consumption'
})
train.drop('num_date_time', axis = 1, inplace=True)

In [37]:
test = test.rename(columns={
    '건물번호': 'building_number',
    '일시': 'date_time',
    '기온(C)': 'temperature',
    '강수량(mm)': 'rainfall',
    '풍속(m/s)': 'windspeed',
    '습도(%)': 'humidity',
    '일조(hr)': 'sunshine',
    '일사(MJ/m2)': 'solar_radiation',
    '전력소비량(kWh)': 'power_consumption'
})
test.drop('num_date_time', axis = 1, inplace=True)

In [38]:
building = building.rename(columns={
    '건물번호': 'building_number',
    '건물유형': 'building_type',
    '연면적(m2)': 'total_area',
    '냉방면적(m2)': 'cooling_area',
    '태양광용량(kW)': 'solar_power_capacity',
    'ESS저장용량(kWh)': 'ess_capacity',
    'PCS용량(kW)': 'pcs_capacity'
})

In [39]:
train = pd.merge(train, building, on='building_number', how='left')
test = pd.merge(test, building, on='building_number', how='left')

In [40]:
def make_time(train):
    """
    시간 관련 변수를 추가하기 위한 함수
    """
    train['date_time'] = pd.to_datetime(train.date_time)

    train['month'] = train.date_time.dt.month                    # 월(숫자)
    train['day'] = train.date_time.dt.day                        # 일(숫자)
    train['hour'] = train.date_time.dt.hour                      # 시(숫자)
    train['weekday'] = train.date_time.dt.weekday                # 요일(숫자)
    train['dayofyear'] = train.date_time.dt.dayofyear            # 연 기준 몇일째(숫자)

    return train

In [41]:
train = make_time(train)
test = make_time(test)

In [42]:
#현충일,광복절 주말로 처리
train.loc[train.weekday >= 5, 'holiday'] = 1
train.loc[train.weekday < 5, 'holiday'] = 0

test.loc[test.weekday >= 5, 'holiday'] = 1
test.loc[test.weekday < 5, 'holiday'] = 0

In [44]:
#건물유형 숫자로 처리하는 라벨인코딩
def make_label_map(dataframe):
    label_maps = {}
    for col in dataframe.columns:
        if dataframe[col].dtype=='object':
            label_map = {'unknown':0}
            for i, key in enumerate(dataframe[col].unique()):
                label_map[key] = i+1  #새로 등장하는 유니크 값들에 대해 1부터 1씩 증가시켜 키값을 부여해줍니다.
            label_maps[col] = label_map
    print(label_maps)
    return label_maps


def label_encoder(dataframe, label_map):
    for col in dataframe.columns:
        if dataframe[col].dtype=='object':
            dataframe[col] = dataframe[col].map(label_map[col])
            dataframe[col] = dataframe[col].fillna(label_map[col]['unknown']) #혹시 모를 결측값은 unknown의 값(0)으로 채워줍니다.
    return dataframe

In [45]:
train_le = make_label_map(train[['building_type']])
builtype_df = label_encoder(train[['building_type']], train_le)
train['building_type'] = builtype_df[['building_type']]

{'building_type': {'unknown': 0, '건물기타': 1, '공공': 2, '대학교': 3, '데이터센터': 4, '백화점및아울렛': 5, '병원': 6, '상용': 7, '아파트': 8, '연구소': 9, '지식산업센터': 10, '할인마트': 11, '호텔및리조트': 12}}


In [46]:
#파생변수 : 체감온도
train['sensory_temp'] = 13.12 + 0.6215 * train['temperature'] - 11.37 * (train['windspeed'] ** 0.16) + 0.3965 * (train['windspeed'] ** 0.16) * train['temperature']
train

Unnamed: 0,building_number,date_time,temperature,rainfall,windspeed,humidity,sunshine,solar_radiation,power_consumption,building_type,...,solar_power_capacity,ess_capacity,pcs_capacity,month,day,hour,weekday,dayofyear,holiday,sensory_temp
0,1,2022-06-01 00:00:00,18.6,,0.9,42.0,,,1085.28,1,...,-,-,-,6,1,0,2,152,0.0,20.751584
1,1,2022-06-01 01:00:00,18.0,,1.1,45.0,,,1047.36,1,...,-,-,-,6,1,1,2,152,0.0,20.008954
2,1,2022-06-01 02:00:00,17.7,,1.5,45.0,,,974.88,1,...,-,-,-,6,1,2,2,152,0.0,19.476910
3,1,2022-06-01 03:00:00,16.7,,1.4,48.0,,,953.76,1,...,-,-,-,6,1,3,2,152,0.0,18.487958
4,1,2022-06-01 04:00:00,18.4,,2.8,43.0,,,986.40,1,...,-,-,-,6,1,4,2,152,0.0,19.751534
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
203995,100,2022-08-24 19:00:00,23.1,,0.9,86.0,0.5,,881.04,12,...,-,-,-,8,24,19,2,236,0.0,25.302757
203996,100,2022-08-24 20:00:00,22.4,,1.3,86.0,0.0,,798.96,12,...,-,-,-,8,24,20,2,236,0.0,24.446518
203997,100,2022-08-24 21:00:00,21.3,,1.0,92.0,,,825.12,12,...,-,-,-,8,24,21,2,236,0.0,23.433400
203998,100,2022-08-24 22:00:00,21.0,,0.3,94.0,,,640.08,12,...,-,-,-,8,24,22,2,236,0.0,23.661275


In [56]:
test['sensory_temp'] = 13.12 + 0.6215 * test['temperature'] - 11.37 * (test['windspeed'] ** 0.16) + 0.3965 * (test['windspeed'] ** 0.16) * test['temperature']
test

Unnamed: 0,building_number,date_time,temperature,rainfall,windspeed,humidity,building_type,total_area,cooling_area,solar_power_capacity,ess_capacity,pcs_capacity,month,day,hour,weekday,dayofyear,holiday,sensory_temp
0,1,2022-08-25 00:00:00,23.5,0.0,2.2,72,1,110634.00,39570.00,-,-,-,8,25,0,3,237,0.0,25.397063
1,1,2022-08-25 01:00:00,23.0,0.0,0.9,72,1,110634.00,39570.00,-,-,-,8,25,1,3,237,0.0,25.201620
2,1,2022-08-25 02:00:00,22.7,0.0,1.5,75,1,110634.00,39570.00,-,-,-,8,25,2,3,237,0.0,24.699788
3,1,2022-08-25 03:00:00,22.1,0.0,1.3,78,1,110634.00,39570.00,-,-,-,8,25,3,3,237,0.0,24.136018
4,1,2022-08-25 04:00:00,21.8,0.0,1.0,77,1,110634.00,39570.00,-,-,-,8,25,4,3,237,0.0,23.942400
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16795,100,2022-08-31 19:00:00,22.5,0.0,0.9,84,12,57497.84,40035.23,-,-,-,8,31,19,2,243,0.0,24.695934
16796,100,2022-08-31 20:00:00,20.7,0.0,0.4,95,12,57497.84,40035.23,-,-,-,8,31,20,2,243,0.0,23.253851
16797,100,2022-08-31 21:00:00,20.2,0.0,0.4,98,12,57497.84,40035.23,-,-,-,8,31,21,2,243,0.0,22.771886
16798,100,2022-08-31 22:00:00,20.1,0.0,1.1,97,12,57497.84,40035.23,-,-,-,8,31,22,2,243,0.0,22.159549


In [47]:
train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 204000 entries, 0 to 203999
Data columns (total 22 columns):
 #   Column                Non-Null Count   Dtype         
---  ------                --------------   -----         
 0   building_number       204000 non-null  int64         
 1   date_time             204000 non-null  datetime64[ns]
 2   temperature           204000 non-null  float64       
 3   rainfall              43931 non-null   float64       
 4   windspeed             203981 non-null  float64       
 5   humidity              203991 non-null  float64       
 6   sunshine              128818 non-null  float64       
 7   solar_radiation       116087 non-null  float64       
 8   power_consumption     204000 non-null  float64       
 9   building_type         204000 non-null  int64         
 10  total_area            204000 non-null  float64       
 11  cooling_area          204000 non-null  float64       
 12  solar_power_capacity  204000 non-null  object        
 13 

In [48]:
train_x = train.drop(columns=['building_number','date_time', 'rainfall', 'sunshine', 'solar_radiation', 'power_consumption','total_area','solar_power_capacity','ess_capacity','pcs_capacity'])
train_y = train['power_consumption']

In [49]:
train_x = train_x.fillna(0)

In [50]:
train_x

Unnamed: 0,temperature,windspeed,humidity,building_type,cooling_area,month,day,hour,weekday,dayofyear,holiday,sensory_temp
0,18.6,0.9,42.0,1,39570.00,6,1,0,2,152,0.0,20.751584
1,18.0,1.1,45.0,1,39570.00,6,1,1,2,152,0.0,20.008954
2,17.7,1.5,45.0,1,39570.00,6,1,2,2,152,0.0,19.476910
3,16.7,1.4,48.0,1,39570.00,6,1,3,2,152,0.0,18.487958
4,18.4,2.8,43.0,1,39570.00,6,1,4,2,152,0.0,19.751534
...,...,...,...,...,...,...,...,...,...,...,...,...
203995,23.1,0.9,86.0,12,40035.23,8,24,19,2,236,0.0,25.302757
203996,22.4,1.3,86.0,12,40035.23,8,24,20,2,236,0.0,24.446518
203997,21.3,1.0,92.0,12,40035.23,8,24,21,2,236,0.0,23.433400
203998,21.0,0.3,94.0,12,40035.23,8,24,22,2,236,0.0,23.661275


In [51]:
model = RandomForestRegressor()
model.fit(train_x, train_y)

In [58]:
test

Unnamed: 0,building_number,date_time,temperature,rainfall,windspeed,humidity,building_type,total_area,cooling_area,solar_power_capacity,ess_capacity,pcs_capacity,month,day,hour,weekday,dayofyear,holiday,sensory_temp
0,1,2022-08-25 00:00:00,23.5,0.0,2.2,72,1,110634.00,39570.00,-,-,-,8,25,0,3,237,0.0,25.397063
1,1,2022-08-25 01:00:00,23.0,0.0,0.9,72,1,110634.00,39570.00,-,-,-,8,25,1,3,237,0.0,25.201620
2,1,2022-08-25 02:00:00,22.7,0.0,1.5,75,1,110634.00,39570.00,-,-,-,8,25,2,3,237,0.0,24.699788
3,1,2022-08-25 03:00:00,22.1,0.0,1.3,78,1,110634.00,39570.00,-,-,-,8,25,3,3,237,0.0,24.136018
4,1,2022-08-25 04:00:00,21.8,0.0,1.0,77,1,110634.00,39570.00,-,-,-,8,25,4,3,237,0.0,23.942400
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
16795,100,2022-08-31 19:00:00,22.5,0.0,0.9,84,12,57497.84,40035.23,-,-,-,8,31,19,2,243,0.0,24.695934
16796,100,2022-08-31 20:00:00,20.7,0.0,0.4,95,12,57497.84,40035.23,-,-,-,8,31,20,2,243,0.0,23.253851
16797,100,2022-08-31 21:00:00,20.2,0.0,0.4,98,12,57497.84,40035.23,-,-,-,8,31,21,2,243,0.0,22.771886
16798,100,2022-08-31 22:00:00,20.1,0.0,1.1,97,12,57497.84,40035.23,-,-,-,8,31,22,2,243,0.0,22.159549


In [59]:
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 16800 entries, 0 to 16799
Data columns (total 19 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   building_number       16800 non-null  int64         
 1   date_time             16800 non-null  datetime64[ns]
 2   temperature           16800 non-null  float64       
 3   rainfall              16800 non-null  float64       
 4   windspeed             16800 non-null  float64       
 5   humidity              16800 non-null  int64         
 6   building_type         16800 non-null  int64         
 7   total_area            16800 non-null  float64       
 8   cooling_area          16800 non-null  float64       
 9   solar_power_capacity  16800 non-null  object        
 10  ess_capacity          16800 non-null  object        
 11  pcs_capacity          16800 non-null  object        
 12  month                 16800 non-null  int64         
 13  day             

In [62]:
test_le = make_label_map(test[['building_type']])
builtype_df2 = label_encoder(test[['building_type']], test_le)
test['building_type'] = builtype_df2[['building_type']]

{}


In [61]:
test_x = test.drop(columns=['building_number','date_time','rainfall','total_area','solar_power_capacity','ess_capacity','pcs_capacity'])

In [64]:
preds = model.predict(test_x)

In [65]:
submission = pd.read_csv('/content/drive/MyDrive/23energy/23data/sample_submission.csv')
submission

Unnamed: 0,num_date_time,answer
0,1_20220825 00,0
1,1_20220825 01,0
2,1_20220825 02,0
3,1_20220825 03,0
4,1_20220825 04,0
...,...,...
16795,100_20220831 19,0
16796,100_20220831 20,0
16797,100_20220831 21,0
16798,100_20220831 22,0


In [66]:
submission['answer'] = preds
submission

Unnamed: 0,num_date_time,answer
0,1_20220825 00,2049.7968
1,1_20220825 01,2067.5760
2,1_20220825 02,1965.0720
3,1_20220825 03,1991.2896
4,1_20220825 04,1987.5456
...,...,...
16795,100_20220831 19,927.9696
16796,100_20220831 20,860.1192
16797,100_20220831 21,783.0180
16798,100_20220831 22,670.2660


In [67]:
submission.to_csv('./submission_0804.csv', index=False)