In [2]:
import os
import random

import numpy as np
import pandas as pd

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import make_column_transformer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestRegressor

import warnings
warnings.filterwarnings(action='ignore')

In [3]:
def fix_seed(seed):
    random.seed(seed)
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)

fix_seed(42)

In [4]:
DATA_DIR = '/Users/mungeonhui/git/AI_model/dacon/2023전력사용량예측AI경진대회/open'
train_csv = os.path.join(DATA_DIR, 'train.csv')
test_csv = os.path.join(DATA_DIR, 'test.csv')
building_csv = os.path.join(DATA_DIR, 'building_info.csv')

In [5]:
train_set = pd.read_csv(train_csv)
test_set = pd.read_csv(test_csv)
building_info = pd.read_csv(building_csv)

train_df = pd.merge(train_set, building_info, left_on='건물번호', right_on='건물번호')
test_df = pd.merge(test_set, building_info, left_on='건물번호', right_on='건물번호')

In [6]:
# feature, label 나누기
train_label = train_df['전력소비량(kWh)']
train_feature = train_df.drop(columns=['전력소비량(kWh)'])

train_label.shape, train_feature.shape

((204000,), (204000, 15))

In [28]:
class DropField(BaseEstimator, TransformerMixin):
    def __init__(self):
        print("initializing drop field")
        self.cols = ["num_date_time", "건물번호", "일조(hr)", "일사(MJ/m2)"]
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        columns = X.columns
        cols = [col for col in self.cols if col in columns]
        X_ = X.copy()
        X_ = X_.drop(columns=cols)
        return X_


class GetTimeData(BaseEstimator, TransformerMixin):
    def __init__(self):
        print("initializing time transformer")
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        X_ = X.copy()
        X_['month'] = X_['일시'].apply(lambda x : int(x[4:6]))
        X_['day'] = X_['일시'].apply(lambda x : int(x[6:8]))
        X_['time'] = X_['일시'].apply(lambda x : int(x[9:11]))
        X_ = X_.drop(columns=['일시'])
        return X_


class TextImputer(BaseEstimator, TransformerMixin):
    def __init__(self, ):
        print("initialising text transformer")
        self.cols = ["태양광용량(kW)", "ESS저장용량(kWh)", "PCS용량(kW)"]
        
    def fit(self, X, y=None):
        return self
    
    def transform(self, X):
        X_ = X.copy()
        
        for col in self.cols:
            X_[col] = X_[col].replace('-', 0).astype("float64")
        return X_

class MeanImputer(BaseEstimator, TransformerMixin):
    def __init__(self,):
        self.imputer = SimpleImputer()
        self.cols = ['풍속(m/s)', '습도(%)']
        
    def fit(self, X, y=None):
        self.imputer.fit(X[self.cols])
        return self

    def transform(self, X):
        X_ = X.copy()
        X_[self.cols] = self.imputer.transform(X_[self.cols])
        return X_

class ValueImputer(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.cols = ['강수량(mm)']
    
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X_ = X.copy()
        X_[self.cols] = X_[self.cols].fillna(0)
        return X_
    
scale_cols = [
    '풍속(m/s)', '습도(%)',
    '강수량(mm)', '기온(C)',
    '연면적(m2)', '냉방면적(m2)',
    '태양광용량(kW)', 'ESS저장용량(kWh)',
    'PCS용량(kW)', 'month', 'time', 'day'
]
    
column_transformer = make_column_transformer(
    (StandardScaler(), scale_cols),
    (OneHotEncoder(), ['건물유형']), 
    remainder='passthrough'
)

pipeline = Pipeline([
    ('drop_field', DropField()), 
    ('time_spliter', GetTimeData()),
    ('text_imputer', TextImputer()),
    ('mean_imputer', MeanImputer()),
    ('value_imputer', ValueImputer()),
    ('column_transformer', column_transformer),
])

initializing drop field
initializing time transformer
initialising text transformer


In [8]:
test_feature = train_feature.copy()
transformed = pipeline.fit_transform(test_feature)

In [9]:
from pprint import pprint
from sklearn.model_selection import RandomizedSearchCV


# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 100, stop = 200, num = 3)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 100, num = 3)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_depth': max_depth,
               'bootstrap': bootstrap}

pprint(random_grid)

{'bootstrap': [True, False],
 'max_depth': [10, 55, 100, None],
 'n_estimators': [100, 150, 200]}


In [10]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestRegressor()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(
    estimator=rf, 
    param_distributions=random_grid, 
    n_iter=100, 
    cv=3, 
    verbose=2, 
    random_state=42, 
    n_jobs = -1
)
# Fit the random search model
rf_random.fit(transformed, train_label)

Fitting 3 folds for each of 24 candidates, totalling 72 fits
[CV] END .....bootstrap=True, max_depth=10, n_estimators=100; total time=  38.3s
[CV] END .....bootstrap=True, max_depth=10, n_estimators=100; total time=  42.0s
[CV] END .....bootstrap=True, max_depth=10, n_estimators=100; total time=  42.8s
[CV] END .....bootstrap=True, max_depth=10, n_estimators=150; total time=  57.0s
[CV] END .....bootstrap=True, max_depth=10, n_estimators=150; total time= 1.0min
[CV] END .....bootstrap=True, max_depth=10, n_estimators=150; total time= 1.0min
[CV] END .....bootstrap=True, max_depth=10, n_estimators=200; total time= 1.2min
[CV] END .....bootstrap=True, max_depth=10, n_estimators=200; total time= 1.3min
[CV] END .....bootstrap=True, max_depth=10, n_estimators=200; total time= 1.3min
[CV] END .....bootstrap=True, max_depth=55, n_estimators=100; total time= 1.3min
[CV] END .....bootstrap=True, max_depth=55, n_estimators=100; total time= 1.4min
[CV] END .....bootstrap=True, max_depth=55, n_es

In [11]:
dir(rf_random)

['__abstractmethods__',
 '__annotations__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__sklearn_clone__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_impl',
 '_build_request_for_signature',
 '_check_feature_names',
 '_check_n_features',
 '_check_refit_for_multimetric',
 '_estimator_type',
 '_format_results',
 '_get_default_requests',
 '_get_metadata_request',
 '_get_param_names',
 '_get_tags',
 '_more_tags',
 '_parameter_constraints',
 '_repr_html_',
 '_repr_html_inner',
 '_repr_mimebundle_',
 '_required_parameters',
 '_run_search',
 '_select_best_index',
 '_validate_data',
 '_validate_params',
 'best_estimator_',
 'best_index_',
 'best_params_',
 'best_score_',
 '

In [16]:
best_rf = rf_random.best_estimator_

In [18]:
y_pred = best_rf.predict(transformed)
y_pred[:5]

array([923.56377293, 923.56377293, 923.56377293, 923.56377293,
       923.56377293])

In [19]:
train_label[:5]

0    1085.28
1    1047.36
2     974.88
3     953.76
4     986.40
Name: 전력소비량(kWh), dtype: float64

In [20]:
model = RandomForestRegressor(random_state=42)
model.fit(transformed, train_label)

In [22]:
y_pred = model.predict(transformed)

rmse = np.sqrt((train_label - y_pred) ** 2).mean()
rmse

47.505827247908485

In [25]:
test_df

Unnamed: 0,num_date_time,건물번호,일시,기온(C),강수량(mm),풍속(m/s),습도(%),건물유형,연면적(m2),냉방면적(m2),태양광용량(kW),ESS저장용량(kWh),PCS용량(kW)
0,1_20220825 00,1,20220825 00,23.5,0.0,2.2,72,건물기타,110634.00,39570.00,-,-,-
1,1_20220825 01,1,20220825 01,23.0,0.0,0.9,72,건물기타,110634.00,39570.00,-,-,-
2,1_20220825 02,1,20220825 02,22.7,0.0,1.5,75,건물기타,110634.00,39570.00,-,-,-
3,1_20220825 03,1,20220825 03,22.1,0.0,1.3,78,건물기타,110634.00,39570.00,-,-,-
4,1_20220825 04,1,20220825 04,21.8,0.0,1.0,77,건물기타,110634.00,39570.00,-,-,-
...,...,...,...,...,...,...,...,...,...,...,...,...,...
16795,100_20220831 19,100,20220831 19,22.5,0.0,0.9,84,호텔및리조트,57497.84,40035.23,-,-,-
16796,100_20220831 20,100,20220831 20,20.7,0.0,0.4,95,호텔및리조트,57497.84,40035.23,-,-,-
16797,100_20220831 21,100,20220831 21,20.2,0.0,0.4,98,호텔및리조트,57497.84,40035.23,-,-,-
16798,100_20220831 22,100,20220831 22,20.1,0.0,1.1,97,호텔및리조트,57497.84,40035.23,-,-,-


In [29]:
test_feature = pipeline.fit_transform(test_df)
test_feature[:5]

array([[ 0.39985547, -0.32014656, -0.23598246,  0.5287129 , -0.11798037,
        -0.13530886, -0.39776826, -0.20073685, -0.21221431,  0.        ,
        -1.66132477, -1.5       ,  1.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ],
       [-0.8479075 , -0.32014656, -0.23598246,  0.35533801, -0.11798037,
        -0.13530886, -0.39776826, -0.20073685, -0.21221431,  0.        ,
        -1.51686175, -1.5       ,  1.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ],
       [-0.2720169 , -0.13358226, -0.23598246,  0.25131308, -0.11798037,
        -0.13530886, -0.39776826, -0.20073685, -0.21221431,  0.        ,
        -1.37239873, -1.5       ,  1.        ,  0.        ,  0.        ,
         0.        ,  0.        ,  0.        ,  0.        ,  0.        ,
  

In [30]:
preds = model.predict(test_feature)
preds[:5]

array([1651.68  , 1616.0112, 1532.0256, 1514.7096, 1537.3704])

In [32]:
submission = pd.read_csv(os.path.join(DATA_DIR, 'sample_submission.csv'))
submission.head()

Unnamed: 0,num_date_time,answer
0,1_20220825 00,0
1,1_20220825 01,0
2,1_20220825 02,0
3,1_20220825 03,0
4,1_20220825 04,0


In [33]:
submission['answer'] = preds
submission.head()

Unnamed: 0,num_date_time,answer
0,1_20220825 00,1651.68
1,1_20220825 01,1616.0112
2,1_20220825 02,1532.0256
3,1_20220825 03,1514.7096
4,1_20220825 04,1537.3704


In [34]:
submission.to_csv('./rf_submission.csv', index=False)