In [56]:
import os

import joblib
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.model_selection import train_test_split
from xgboost import XGBRegressor

In [57]:
# Load the processed feature table. The "utf-8-sig" codec skips a leading
# UTF-8 byte-order mark if the file has one.
df_final = pd.read_csv(
    filepath_or_buffer="data/processed_data.csv",
    encoding="utf-8-sig",
)

In [58]:
# Show the loaded frame as the cell output for a quick visual sanity check.
# NOTE(review): consider df_final.head() to keep the rendered output small.
df_final

Unnamed: 0,encoder__지역명_강원도 강릉시,encoder__지역명_강원도 고성군,encoder__지역명_강원도 동해시,encoder__지역명_강원도 삼척시,encoder__지역명_강원도 속초시,encoder__지역명_강원도 양구군,encoder__지역명_강원도 양양군,encoder__지역명_강원도 영월군,encoder__지역명_강원도 원주시,encoder__지역명_강원도 인제군,...,encoder__지역명_경기도 의왕시,encoder__지역명_경기도 의정부시,encoder__지역명_경기도 이천시,encoder__지역명_경기도 파주시,encoder__지역명_경기도 평택시,encoder__숙박유형명_Hotel,encoder__숙박유형명_Motel,encoder__숙박유형명_Pension,remainder__성수기여부,평균판매금액
0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,88667
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,205885
2,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,96455
3,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0,142178
4,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,65000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
481,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,41600
482,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,55000
483,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,55406
484,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0,46667


In [66]:
# Separate the regression target from the feature columns.
# "평균판매금액" is the target column; every other column is used as a feature.
target_column = "평균판매금액"
y = df_final[target_column]
X = df_final.drop(columns=[target_column])

In [67]:
# Hold out 20% of the rows for evaluation; the fixed seed keeps the split
# reproducible across runs.
split_options = {"test_size": 0.2, "random_state": 42}
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [75]:
# Random Forest hyperparameters, gathered in one place so they are easy to tune.
rf_params = {
    "n_estimators": 500,
    "max_depth": 20,
    "min_samples_split": 5,
    "min_samples_leaf": 2,
    "random_state": 42,
}

# Fit on the training split, then predict the held-out test rows.
rf_model = RandomForestRegressor(**rf_params)
rf_model.fit(X_train, y_train)
y_pred_rf = rf_model.predict(X_test)

In [76]:

# Evaluate the Random Forest predictions against the held-out test targets.
r2_rf = r2_score(y_true=y_test, y_pred=y_pred_rf)
mae_rf = mean_absolute_error(y_true=y_test, y_pred=y_pred_rf)
mse_rf = mean_squared_error(y_true=y_test, y_pred=y_pred_rf)
# RMSE is the square root of the MSE, so it is in the same units as the target.
rmse_rf = np.sqrt(mse_rf)

In [77]:
# Persist the fitted Random Forest model to disk.
# joblib.dump does not create missing parent directories, so make sure the
# output folder exists first; otherwise a fresh checkout without "models/"
# fails with FileNotFoundError.
rf_model_path = "models/random_forest.pkl"
os.makedirs(os.path.dirname(rf_model_path), exist_ok=True)
# Keep the dump call as the last expression so the cell still displays the
# list of written file names.
joblib.dump(rf_model, rf_model_path)

['models/random_forest.pkl']

In [78]:
# XGBoost hyperparameters, gathered in one place so they are easy to tune.
# NOTE(review): the seed here is 14, while the split and the Random Forest
# use 42 — confirm this is intentional.
xgb_params = {
    "n_estimators": 500,
    "max_depth": 10,
    "learning_rate": 0.01,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "gamma": 0.1,
    "random_state": 14,
}

# Fit on the training split, then predict the held-out test rows.
xgb_model = XGBRegressor(**xgb_params)
xgb_model.fit(X_train, y_train)
y_pred_xgb = xgb_model.predict(X_test)

In [79]:
# Evaluate the XGBoost predictions against the held-out test targets.
r2_xgb = r2_score(y_true=y_test, y_pred=y_pred_xgb)
mae_xgb = mean_absolute_error(y_true=y_test, y_pred=y_pred_xgb)
mse_xgb = mean_squared_error(y_true=y_test, y_pred=y_pred_xgb)
# RMSE is the square root of the MSE, so it is in the same units as the target.
rmse_xgb = np.sqrt(mse_xgb)

In [80]:
# Persist the fitted XGBoost model to disk.
# joblib.dump does not create missing parent directories, so make sure the
# output folder exists first; otherwise a fresh checkout without "models/"
# fails with FileNotFoundError.
xgb_model_path = "models/xgboost.pkl"
os.makedirs(os.path.dirname(xgb_model_path), exist_ok=True)
# Keep the dump call as the last expression so the cell still displays the
# list of written file names.
joblib.dump(xgb_model, xgb_model_path)



['models/xgboost.pkl']

In [81]:
# Assemble the side-by-side metric report for both models and emit it in one
# write; the text sent to stdout is the same as printing each line separately.
divider = "-" * 40
report_lines = [
    "\n📊 모델 성능 비교 결과:",
    "🔹 Random Forest:",
    f"   MAE: {mae_rf:.2f}",
    f"   RMSE: {rmse_rf:.2f}",
    f"   R² Score: {r2_rf:.2f}",
    divider,
    "🔹 XGBoost:",
    f"   MAE: {mae_xgb:.2f}",
    f"   RMSE: {rmse_xgb:.2f}",
    f"   R² Score: {r2_xgb:.2f}",
    divider,
]
print("\n".join(report_lines))

# Final status line, printed after both models have been trained and saved.
print("✅ 전체 모델 학습 및 저장 완료! 🚀")


📊 모델 성능 비교 결과:
🔹 Random Forest:
   MAE: 20774.38
   RMSE: 32260.62
   R² Score: 0.55
----------------------------------------
🔹 XGBoost:
   MAE: 22778.02
   RMSE: 43946.75
   R² Score: 0.16
----------------------------------------
✅ 전체 모델 학습 및 저장 완료! 🚀
