In [2]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

# 1) Column roles, then load the training data
region_col = "지역코드"  # column later used to group rows when computing per-region means
target = "파워"  # column the model is trained to predict

df = pd.read_csv("공공/훈련데이터셋.csv")









In [3]:
# 2) Train/test split (the data has no date column, so a random split is used)
y = df[target].astype(float)
X = df.drop(target, axis=1)

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=21,
    test_size=0.2,
)


In [4]:
# 3) (Important) Region-level mean power is computed from the training split only,
#    so no test-set target values enter the baseline.
global_mean_train = y_train.mean()

# Pair each training row's region code with its target value (positional pairing
# via .values, exactly as the rows come out of the split).
train_tmp = X_train[[region_col]].assign(**{target: y_train.values})
region_mean_train = train_tmp.groupby(region_col)[target].mean()

def make_deviation(X_region, y_series, region_means=None, fallback=None):
    """Express each target value as its deviation from a per-region baseline.

    Parameters
    ----------
    X_region : pandas.Series
        Region key of every row.
    y_series : pandas.Series
        Target values, paired positionally with ``X_region``.
    region_means : pandas.Series or dict, optional
        Lookup from region key to baseline value. When omitted, the
        notebook-level ``region_mean_train`` is used.
    fallback : float, optional
        Baseline for regions that are absent from ``region_means`` (e.g. a
        region that only appears in the test split). When omitted, the
        notebook-level ``global_mean_train`` is used.

    Returns
    -------
    tuple of numpy.ndarray
        ``(deviation, baseline)`` where ``deviation = y_series - baseline``.
    """
    # Resolve the defaults at call time so the function keeps working exactly as
    # before when called with two arguments, but can also be used (and tested)
    # without relying on notebook globals.
    if region_means is None:
        region_means = region_mean_train
    if fallback is None:
        fallback = global_mean_train

    # Regions without an entry map to NaN and are replaced by the fallback.
    baseline = X_region.map(region_means).fillna(fallback).values
    return y_series.values - baseline, baseline

# Convert both splits to deviation targets using the same training-derived baseline.
(y_train_dev, train_mean), (y_test_dev, test_mean) = (
    make_deviation(split_X[region_col], split_y)
    for split_X, split_y in ((X_train, y_train), (X_test, y_test))
)



In [5]:

# 4) Choose the input features
#    - The deviation target already has the "region effect" subtracted, so it is
#      usually cleaner to leave the region code out (weather only).
feature_cols = X.columns[X.columns != region_col].tolist()
X_train_feat = X_train.loc[:, feature_cols]
X_test_feat = X_test.loc[:, feature_cols]



In [6]:
# 5) Missing-value handling: medians are learned from the training features only
#    and then applied unchanged to both splits.
imp = SimpleImputer(strategy="median").fit(X_train_feat)
X_train_i = imp.transform(X_train_feat)
X_test_i = imp.transform(X_test_feat)



In [7]:
# 6) Fit the model (random forest) on the deviation target
model = RandomForestRegressor(
    random_state=21,     # fixed seed for reproducible trees
    n_jobs=-1,
    n_estimators=300,
    min_samples_leaf=5,
    max_depth=12,
)
model.fit(X_train_i, y_train_dev)

In [8]:
# 7) Evaluate the model on the deviation target
pred_dev = model.predict(X_test_i)

# RMSE is computed as sqrt(MSE). The `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6,
# so taking the square root explicitly keeps this cell working on all versions.
rmse_dev = float(np.sqrt(mean_squared_error(y_test_dev, pred_dev)))
r2_dev = r2_score(y_test_dev, pred_dev)

print(f"[deviation] RMSE: {rmse_dev:.3f}")
print(f"[deviation] R2  : {r2_dev:.6f}")

[deviation] RMSE: 182820.370
[deviation] R2  : 0.025789


In [9]:
# 8) (Optional) Restore the deviation prediction to the original power scale
#    and report the same metrics there.
#    pred_power = pred_dev + (region mean built from the training split)
pred_power = pred_dev + test_mean

# The residual y_test - pred_power equals y_test_dev - pred_dev, so this RMSE is
# identical to the deviation RMSE; only R2 changes, because the variance of the
# raw target also includes the between-region differences.
# RMSE is computed as sqrt(MSE): the `squared=False` argument of
# mean_squared_error was deprecated in scikit-learn 1.4 and removed in 1.6.
rmse_power = float(np.sqrt(mean_squared_error(y_test, pred_power)))
r2_power = r2_score(y_test, pred_power)

print(f"[reconstructed power] RMSE: {rmse_power:.3f}")
print(f"[reconstructed power] R2  : {r2_power:.6f}")

[reconstructed power] RMSE: 182820.370
[reconstructed power] R2  : 0.677430
