In [None]:
import numpy as np
import pandas as pd

import lightgbm as lgb
import xgboost as xgb

from sklearn.base import BaseEstimator, RegressorMixin, clone
from sklearn.experimental import enable_hist_gradient_boosting  # This line is required before HistGradientBoostingRegressor below can be used.
from sklearn.ensemble import (
    AdaBoostRegressor,
    BaggingRegressor,
    ExtraTreesRegressor,
    GradientBoostingRegressor,
    RandomForestRegressor,
    StackingRegressor,
    VotingRegressor,
)
from sklearn.ensemble import HistGradientBoostingRegressor
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.model_selection import cross_val_predict, train_test_split
from sklearn.multioutput import MultiOutputRegressor
from sklearn.tree import DecisionTreeRegressor

In [None]:
### INPUT ###
# Read the dataset CSV (path is relative to the working directory) and
# display its (rows, columns) shape as the cell output.
input_data = pd.read_csv(filepath_or_buffer='2023_smartFarm_AI_hackathon_dataset.csv')
input_data.shape

In [None]:
# Drop the 'frmDist' column and order the rows by 'date'.
df = input_data.drop(columns=['frmDist']).sort_values(by='date')

# Separate the dataset into targets (Y) and features (X): the two cumulative-sum
# columns are the targets, every remaining column is a feature.
Y = df[['outtrn_cumsum', 'HeatingEnergyUsage_cumsum']]
X = df.drop(columns=Y.columns)

# Split into training and test sets (80% / 20%) so the model can be evaluated
# on data it was not trained on.
# NOTE(review): train_test_split shuffles rows by default, so the sort by 'date'
# above does not influence which rows land in each split. If a chronological
# hold-out was intended, pass shuffle=False -- confirm with the author.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=42)

In [None]:
class SimpleEnsemble(BaseEstimator, RegressorMixin):
    """Averaging ensemble: fit several regressors and average their predictions.

    Parameters
    ----------
    models : list
        Regressors exposing ``fit(X, y)`` and ``predict(X)``. They are kept
        untouched; ``fit`` trains copies of them.

    Attributes
    ----------
    models_ : list
        Fitted copies of ``models``, in the same order. Set by ``fit``.
    """

    def __init__(self, models):
        self.models = models

    def fit(self, X, y):
        """Fit a fresh copy of every base model on ``(X, y)``.

        Returns
        -------
        self : SimpleEnsemble
            The fitted ensemble.

        Raises
        ------
        ValueError
            If ``models`` is empty (there would be nothing to average).
        """
        if len(self.models) == 0:
            raise ValueError("SimpleEnsemble requires at least one model.")

        fitted_models = []
        for model in self.models:
            # Train a clone so the estimators given to __init__ are never
            # mutated, even when the same objects are shared with other
            # estimators. safe=False falls back to a deep copy for objects
            # that are not scikit-learn estimators.
            fitted_model = clone(model, safe=False)
            fitted_model.fit(X, y)
            fitted_models.append(fitted_model)
        self.models_ = fitted_models
        return self

    def predict(self, X):
        """Return the element-wise mean of the fitted models' predictions for ``X``.

        Raises ``AttributeError`` if ``fit`` has not been called yet.
        """
        # Stack per-model predictions along a new leading axis, then average it away.
        predictions = np.array([model.predict(X) for model in self.models_])
        return np.mean(predictions, axis=0)

# Seed shared by every base learner so a fresh "Restart & Run All" reproduces
# the same fitted models (42 is the seed already used for the train/test split).
RANDOM_STATE = 42

# (name, estimator) pairs, the format StackingRegressor takes for `estimators`.
model_xgb = ('xgb', xgb.XGBRegressor(random_state=RANDOM_STATE))
GBoost = ('gboost', GradientBoostingRegressor(random_state=RANDOM_STATE))
LightGB = ('lgb', lgb.LGBMRegressor(random_state=RANDOM_STATE))
RF = ('rf', RandomForestRegressor(random_state=RANDOM_STATE))
ETR = ('etr', ExtraTreesRegressor(random_state=RANDOM_STATE))
DT = ('dt', DecisionTreeRegressor(random_state=RANDOM_STATE))
HGBR = ('hgbr', HistGradientBoostingRegressor(random_state=RANDOM_STATE))
Bagging = ('bagging', BaggingRegressor(random_state=RANDOM_STATE))

models = [GBoost, LightGB, RF, model_xgb, ETR, DT, HGBR, Bagging]

# Averaging ensemble over the same estimators (names stripped).
ensemble_model = SimpleEnsemble([estimator for _, estimator in models])

In [None]:
#### MODEL ####
# Stacked ensemble: the base regressors in `models` feed their predictions to
# `ensemble_model`, which serves as the final estimator.
stacked_regressor = StackingRegressor(estimators=models, final_estimator=ensemble_model)

# Y holds two target columns; MultiOutputRegressor fits one regressor per target.
model = MultiOutputRegressor(stacked_regressor)

# NOTE(review): this cell previously also fitted an identically configured
# "first level" model, ran cross_val_predict on it, and built a "second level"
# model, but none of those results were used and `model` was overwritten with
# the definition above. That redundant training has been removed. If a genuine
# two-level stack was intended, the second level has to be trained on the
# first level's out-of-fold predictions -- confirm with the author.

In [None]:
# Train on the training split, then predict the test split.
# fit() returns the fitted estimator, so the two calls can be chained.
y_pred = model.fit(X_train, y_train).predict(X_test)

In [None]:
# Compute performance metrics on the test split.
# Both metrics compare the true targets with the predicted values (y_pred),
# not with the estimator object itself.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
r2score = r2_score(y_test, y_pred)

In [None]:
### OUTPUT ###
# Print each evaluation metric on its own line as "<label> <value>".
for label, value in (("RMSE:", rmse), ("R2_score:", r2score)):
    print(label, value)