In [2]:
import numpy as np
import pandas as pd

from lightgbm import LGBMRegressor, early_stopping, LGBMClassifier, log_evaluation
import xgboost as xgb
from xgboost import XGBRegressor

from sklearn.model_selection import KFold, cross_val_score, StratifiedKFold
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import VotingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import GridSearchCV

from bayes_opt import BayesianOptimization

from catboost import CatBoostRegressor

import optuna
import time
from tqdm import tqdm
import time
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import rc

rc('font', family='NanumGothic')  
plt.rcParams['axes.unicode_minus'] = False

import warnings
warnings.filterwarnings('ignore')

In [42]:
train = pd.read_csv("train_2020.csv")
test  = pd.read_csv("test_2020.csv")

In [10]:
# Apply column removal
train = except_cols(train)
test = except_cols(test)

# Separate features and target
X = train.drop(columns=['target'])
y = train['target']

# 범주형 및 카테고리형 컬럼 추출
cat_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()

# 3. 각 범주형 컬럼을 category 타입으로 변환
for col in cat_cols:
    X[col] = X[col].astype('category')
    test[col] = test[col].astype('category')

# unseen 범주형 값은 -1 처리 함수 
def handle_unseen_categories(train_df, test_df, cat_columns):
    for col in cat_columns:
        train_cats = train_df[col].cat.categories
        test_df[col] = test_df[col].cat.set_categories(train_cats)
        test_df[col] = test_df[col].cat.codes  
    return test_df

# test 데이터 unseen 처리
X_test = test.copy()
X_test = handle_unseen_categories(X, X_test, cat_cols)

# train 데이터 범주형도 코드로 변환 (LightGBM 학습시 연속형 사용 권장)
for col in cat_cols:
    X[col] = X[col].cat.codes

In [37]:
# 피처명에 특수문자 제거 또는 변경 (예: 공백, -, /, (, ) 등)
def clean_feature_names(df):
    df = df.rename(columns=lambda x: x.strip()
                                  .replace(' ', '_')
                                  .replace('-', '_')
                                  .replace('/', '_')
                                  .replace('(', '')
                                  .replace(')', '')
                                  .replace(',', '')
                                  .replace('.', '_'))
    return df

# train, test 모두 적용
X = clean_feature_names(X)
X_test = clean_feature_names(X_test)

# 피처명 변경 적용
X = clean_feature_names(X)
X_test = clean_feature_names(X_test)

# 새로 바뀐 컬럼명으로 범주형 컬럼 재생성
cat_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()

# X와 X_test 각각에서 cat_cols에 해당하는 컬럼이 모두 존재하는지 확인
missing_in_test = [col for col in cat_cols if col not in X_test.columns]
if missing_in_test:
    print(f'X_test에 없는 범주형 컬럼: {missing_in_test}')
    # 여기서 해당 컬럼이 없어졌거나 이름이 바뀌었으면 원인 분석 필요

# 범주형 컬럼 변환 시도 (오직 존재하는 컬럼만)
for col in cat_cols:
    X[col] = X[col].astype('category')
    if col in X_test.columns:
        X_test[col] = X_test[col].astype('category')
    else:
        print(f"Warning: {col} is missing in X_test")

In [12]:
X

Unnamed: 0,구,전용면적㎡,계약년월,층,건축년도,회사채금리,매매가격지수,건설공사비지수,버스정류장수,지하철수,...,전용면적구간_target_mean,전용면적구간_target_sum,전용면적구간_target_max,전용면적구간_target_min,전용면적구간_target_count,아파트명_target_mean,아파트명_target_sum,아파트명_target_max,아파트명_target_min,아파트명_target_count
0,0,105.8064,202306,2,2021,4.356,92.7,127.42,86.0,2.0,...,162983.720613,4136037878,1450000,19500,25377,148554.226331,100422657,420000,39600,676
1,0,74.2500,202301,2,1983,4.704,94.6,127.10,86.0,2.0,...,85670.923244,1500097866,438000,13000,17510,69941.177197,594430065,730000,13950,8499
2,0,61.1900,202302,2,1983,4.274,93.6,127.16,86.0,2.0,...,85670.923244,1500097866,438000,13000,17510,69941.177197,594430065,730000,13950,8499
3,0,83.1700,202304,2,1983,4.072,92.6,127.45,86.0,2.0,...,85670.923244,1500097866,438000,13000,17510,69941.177197,594430065,730000,13950,8499
4,0,83.1700,202304,2,1983,4.072,92.6,127.45,86.0,2.0,...,85670.923244,1500097866,438000,13000,17510,69941.177197,594430065,730000,13950,8499
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
155969,22,59.9200,202008,2,2008,2.192,95.8,99.35,26.0,1.0,...,81715.948388,2632969573,300000,14500,32221,115089.953271,147775500,505000,34200,1284
155970,23,52.4600,202001,1,1981,1.955,94.2,99.86,1.0,0.0,...,46328.509833,1665556257,370000,6200,35951,87580.020470,10554881327,1450000,6200,120517
155971,23,52.4600,202001,1,1981,1.955,94.2,99.86,1.0,0.0,...,46328.509833,1665556257,370000,6200,35951,87580.020470,10554881327,1450000,6200,120517
155972,23,52.4600,202003,1,1981,1.840,94.4,99.53,1.0,0.0,...,46328.509833,1665556257,370000,6200,35951,87580.020470,10554881327,1450000,6200,120517


In [34]:
# LGBM 중요도 낮은 변수들
col_drop_zero = [
    '계약년', '전용면적log', '평수', '신축10년_미만', '재건축_연한30년_이상',
    'zone4_강남3', 'zone4_외곽', '대장아파트거리log', '지하철접근성', 'elite_cnt_1_5k',
    '계약년월가중치', '전용면적구간_target_min', '아파트명_target_max', '구','층','아파트명','전용면적구간','zone4','대장아파트거리접근성']

# 데이터프레임에서 해당 컬럼 제거
train = train.drop(columns=col_drop_zero, errors='ignore')
test = test.drop(columns=col_drop_zero, errors='ignore')

In [None]:
# LGBM 중요도 낮은 변수들
col_drop = [
    '계약년', '전용면적log', '평수', '신축10년_미만', '재건축_연한30년_이상',
    'zone4_강남3', 'zone4_외곽', '대장아파트거리log', '지하철접근성', 'elite_cnt_1_5k',
    '계약년월가중치', '전용면적구간_target_min', '아파트명_target_max',
    '전용면적구간_target_mean', '전용면적구간_target_max', 'zone4_내부권', 'zone4_도심',
    '전용면적구간_target_sum', '초등학교거리구분', 'zone4', '계약월', '고등학교진학률', '구_target_count'
    '구','층','아파트명','전용면적구간','zone4','대장아파트거리접근성' ]

# 데이터프레임에서 해당 컬럼 제거
train = train.drop(columns=col_drop, errors='ignore')
test = test.drop(columns=col_drop, errors='ignore')

In [43]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 155974 entries, 0 to 155973
Data columns (total 60 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   구                    155974 non-null  object 
 1   전용면적(㎡)              155974 non-null  float64
 2   계약년월                 155974 non-null  int64  
 3   층                    155974 non-null  object 
 4   건축년도                 155974 non-null  int64  
 5   회사채금리                155974 non-null  float64
 6   매매가격지수               155974 non-null  float64
 7   건설공사비지수              155974 non-null  float64
 8   버스정류장수               155974 non-null  float64
 9   지하철수                 155974 non-null  float64
 10  연식                   155974 non-null  int64  
 11  x좌표                  155974 non-null  float64
 12  y좌표                  155974 non-null  float64
 13  아파트명                 155974 non-null  object 
 14  강남권여부                155974 non-null  int64  
 15  계약년              

In [40]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 155974 entries, 0 to 155973
Data columns (total 46 columns):
 #   Column               Non-Null Count   Dtype  
---  ------               --------------   -----  
 0   전용면적(㎡)              155974 non-null  float64
 1   계약년월                 155974 non-null  int64  
 2   건축년도                 155974 non-null  int64  
 3   회사채금리                155974 non-null  float64
 4   매매가격지수               155974 non-null  float64
 5   건설공사비지수              155974 non-null  float64
 6   버스정류장수               155974 non-null  float64
 7   지하철수                 155974 non-null  float64
 8   연식                   155974 non-null  int64  
 9   x좌표                  155974 non-null  float64
 10  y좌표                  155974 non-null  float64
 11  강남권여부                155974 non-null  int64  
 12  계약월                  155974 non-null  int64  
 13  전용면적(log)            155974 non-null  float64
 14  신축(10년 미만)           155974 non-null  int64  
 15  재건축 연한(30년 이상)   

In [18]:
from lightgbm import LGBMRegressor

model = LGBMRegressor()
model.fit(X_train, y_train)

# sklearn API 방식
print(model.feature_importances_)

# booster 객체에서 gain 기반 중요도 조회
booster = model.booster_
importance_gain = booster.feature_importance(importance_type='gain')
feature_names = booster.feature_name()

for name, imp in zip(feature_names, importance_gain):
    print(f'{name}: {imp}')



[ 73 433 107  60 300  25  85   6 132  60  68 183 197  14   6   0   9   0
  25   0   0   0   6  10   3   0   2   2   0 193   0   1 140 111  40   0
  75 111   1  15 134   0  21   0 137  34  30  21  11   1   3   1   0  19
  47  35   0   9   4]
구: 77779213339648.0
전용면적㎡: 932283168617472.0
계약년월: 77713624414208.0
층: 13996632850432.0
건축년도: 170396241481728.0
회사채금리: 1440858105856.0
매매가격지수: 30649829733376.0
건설공사비지수: 2367607461888.0
버스정류장수: 19037745708032.0
지하철수: 15082968996864.0
연식: 17812918896640.0
x좌표: 39698760421376.0
y좌표: 42224847717376.0
아파트명: 2349971395584.0
강남권여부: 3131113226240.0
계약년: 0.0
계약월: 454081498112.0
전용면적log: 0.0
전용면적구간: 37795379508224.0
평수: 0.0
신축10년_미만: 0.0
재건축_연한30년_이상: 0.0
우수학군: 2204003520512.0
프리미엄아파트: 1463649867776.0
zone4: 284412895232.0
zone4_강남3: 0.0
zone4_내부권: 53935898624.0
zone4_도심: 55793100800.0
zone4_외곽: 0.0
대장아파트거리: 147666205170688.0
대장아파트거리log: 0.0
대장아파트거리접근성: 1540289986560.0
버스거리: 56241254245376.0
지하철거리: 10185567733760.0
1km이내지하철수: 3392926453760.0
지하철접근성: 0.0
1km이내

In [None]:
import lightgbm as lgb


# 2. 모델 학습에 사용할 X, y 분리
X = train.drop(columns=['target'])
y = train['target']

# Train-validation split for monitoring
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Prepare LightGBM dataset
train_data = lgb.Dataset(X_train, label=y_train, categorical_feature=cat_cols, free_raw_data=False)
valid_data = lgb.Dataset(X_val, label=y_val, reference=train_data, categorical_feature=cat_cols, free_raw_data=False)


# LightGBM parameters for RMSE and overfitting prevention
params = {
    'objective': 'regression',
    'metric': 'rmse',
    'verbosity': -1,
    'boosting_type': 'gbdt',
    'seed': 42,
    'learning_rate': 0.05,
    'num_leaves': 31,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'max_depth': -1,
}

kf = KFold(n_splits=5, shuffle=True, random_state=42)
rmse_scores = []
test_preds = np.zeros(len(X_test))  # 테스트 예측값 누적 합산용

for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
    print(f'Fold {fold + 1} 시작')
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    train_data = lgb.Dataset(X_train, label=y_train, categorical_feature=cat_cols, free_raw_data=False)
    valid_data = lgb.Dataset(X_val, label=y_val, reference=train_data, categorical_feature=cat_cols, free_raw_data=False)

    callbacks = [lgb.early_stopping(stopping_rounds=100, verbose=True),
                     lgb.log_evaluation(period=100)]
    
    model = lgb.train(
    params,
    train_data,
    valid_sets=[train_data, valid_data],
    valid_names=['train', 'valid'],
    num_boost_round=1000,
    callbacks=callbacks
    )

    val_preds = model.predict(X_val, num_iteration=model.best_iteration)
    rmse = np.sqrt(mean_squared_error(y_val, val_preds))
    rmse_scores.append(rmse)
    print(f'Fold {fold + 1} RMSE: {rmse:.4f}')

    # 테스트 데이터 예측값 누적 (평균 예측을 위해)
    test_fold_preds = model.predict(X_test, num_iteration=model.best_iteration)
    test_preds += test_fold_preds

print(f'CV 평균 RMSE: {np.mean(rmse_scores):.4f}')

# 테스트 예측값 평균내기
test_preds /= kf.n_splits

# 제출용 DataFrame 생성
submission = pd.DataFrame({"target": test_preds})
if 'id' in test.columns:
    submission['id'] = test['id']

submission['target'] = submission['target'].round().astype(int)
# CSV 파일로 저장
submission.to_csv('submission_lgbm.csv', index=False)
print('제출용 CSV 파일이 저장되었습니다: submission_lgbm_drop_only0_2020.csv')

In [19]:
# 타깃 제외 후 숫자형 피처만 선택
X = train.drop(columns=['target'])
X_numeric = X.select_dtypes(include=[float, int])

# 상수항 추가
import statsmodels.api as sm
X_const = sm.add_constant(X_numeric)

# VIF 계산
from statsmodels.stats.outliers_influence import variance_inflation_factor
vif_data = pd.DataFrame()
vif_data['feature'] = X_const.columns
vif_data['VIF'] = [variance_inflation_factor(X_const.values, i) for i in range(X_const.shape[1])]

print(vif_data)


                feature           VIF
0                 const  0.000000e+00
1               전용면적(㎡)  1.125900e+15
2                  계약년월  4.503600e+15
3                  건축년도  3.880711e+05
4                 회사채금리  5.038940e+00
5                매매가격지수  2.801009e+00
6               건설공사비지수  4.170704e+01
7                버스정류장수  2.619872e+00
8                  지하철수  2.148895e+00
9                    연식  3.822393e+05
10                  x좌표  1.732952e+00
11                  y좌표  3.504575e+00
12                강남권여부  1.628546e+01
13                  계약년  3.002400e+15
14                  계약월  1.085052e+10
15            전용면적(log)  1.233399e+01
16                   평수  1.580210e+14
17           신축(10년 미만)  2.960797e+00
18       재건축 연한(30년 이상)  2.917910e+00
19                 우수학군  4.002878e+00
20              프리미엄아파트  1.501040e+01
21              대장아파트거리  1.916051e+01
22         대장아파트거리(log)  1.530221e+01
23                 버스거리  3.255933e+01
24                지하철거리  5.150147e+00
25          

In [None]:
# 분류용 VotingClassifier
# 데이터 분할
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Random Forest Regressor
print("Training Random Forest...")
rf_model = RandomForestRegressor(
    n_estimators=200,
    max_depth=10,
    min_samples_split=10,
    min_samples_leaf=5,
    random_state=42,
    n_jobs=-1,
    verbose=1  # RF는 verbose 지원 시 출력됨 (scikit-learn 최신버전 기준)
)
rf_model.fit(X_train, y_train)
print("Random Forest training completed.\n")

# XGBoost Regressor
print("Training XGBoost...")
xgb_model = XGBRegressor(
    n_estimators=1000,
    learning_rate=0.03,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    gamma=1,
    reg_alpha=0.1,
    reg_lambda=1,
    random_state=42,
    n_jobs=-1,
    verbosity=1,  
    eval_metric='rmse'
)
xgb_model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    verbose=True  

)
print("XGBoost training completed.\n")

# LightGBM Regressor
print("Training LightGBM...")
lgbm_model = LGBMRegressor(
    n_estimators=1000,
    learning_rate=0.03,
    num_leaves=31,
    max_depth=10,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0.1,
    reg_lambda=1,
    random_state=42,
    n_jobs=-1,
    verbose=1
)
lgbm_model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    callbacks=[early_stopping(stopping_rounds=100, verbose=True),
               log_evaluation(period=100)]
)
print("LightGBM training completed.\n")

# Voting Regressor 앙상블
print("Training Voting Regressor...")
voting_reg = VotingRegressor(estimators=[
    ('rf', rf_model), 
    ('xgb', xgb_model), 
    ('lgbm', lgbm_model)
])
# VotingRegressor는 개별 모델 이미 학습했으므로 fit 필요없음, 단일 학습시만 재호출 가능
# 여기서는 이미 개별 모델 학습 후 조합만 하므로 fit 따로 안함
voting_reg.fit(X_train, y_train)
# 검증 데이터 예측 및 RMSE
y_pred = voting_reg.predict(X_val)
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
print(f"Voting Regressor RMSE: {rmse:.4f}")


# 테스트 데이터 예측
test_preds = voting_reg.predict(X_test)  

# 제출용 DataFrame 생성
submission = pd.DataFrame({'target': test_preds})

# id 컬럼 있으면 추가
if 'id' in test.columns:
    submission['id'] = test['id']

submission['target'] = submission['target'].round().astype(int)

# CSV 파일로 저장
submission.to_csv('submission_voting_regressor.csv', index=False)
print('제출용 CSV 파일이 저장되었습니다: submission_voting_regressor_2020.csv')

In [None]:
dtrain = xgb.DMatrix(X_train, label=y_train)
dval = xgb.DMatrix(X_val, label=y_val)

params = {
    'objective':'reg:squarederror',
    'eval_metric':'rmse',
    'learning_rate':0.03,
    'max_depth':6,
    'subsample':0.8,
    'colsample_bytree':0.8,
    'seed':42
}

evals = [(dtrain, 'train'), (dval, 'valid')]

model = xgb.train(
    params,
    dtrain,
    num_boost_round=1000,
    evals=evals,
    early_stopping_rounds=100,
    verbose_eval=100
)

y_pred = model.predict(xgb.DMatrix(X_val), ntree_limit=model.best_iteration)
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
print(f'XGBoost RMSE: {rmse:.4f}')

# 테스트 데이터 예측
test_preds = voting_reg.predict(X_test)  # X_test 준비 완료 가정

# 제출용 DataFrame 생성
submission = pd.DataFrame({'target': test_preds})

# id 컬럼 있으면 추가
if 'id' in test.columns:
    submission['id'] = test['id']

submission['target'] = submission['target'].round().astype(int)
# CSV 파일로 저장
submission.to_csv('submission_voting_regressor.csv', index=False)
print('제출용 CSV 파일이 저장되었습니다: submission_xgb.csv')

In [21]:
# 다중공선성 높아 제거 권장 컬럼
high_vif_vars = ['전용면적(㎡)', '계약년월', '건축년도', '건설공사비지수', '연식', '강남권여부', '계약년', '계약월', '전용면적(log)', '평수', '프리미엄아파트', '대장아파트거리', '대장아파트거리(log)', '버스거리', '초등학교거리', '계약년월가중치', '구_target_mean', '구_target_sum', '구_target_max', '구_target_count', '전용면적구간_target_mean', '전용면적구간_target_sum', '전용면적구간_target_max', '전용면적구간_target_min', '전용면적구간_target_count', '아파트명_target_sum', '아파트명_target_max', '아파트명_target_count']

medium_vif_vars = ['회사채금리', '지하철거리', '구_target_min', '아파트명_target_mean']

low_vif_vars = ['const', '매매가격지수', '버스정류장수', '지하철수', 'x좌표', 'y좌표', '신축(10년 미만)', '재건축 연한(30년 이상)', '우수학군', '1km이내지하철수', '지하철접근성', '1km이내학교수', '초등학교거리구분', '고등학교진학률', 'elite_min_dist_km', 'elite_cnt_1.5k', 'elite_cnt_2.0k', '아파트명_target_min']



In [None]:
 'construction_cost_index',
# 모델에 사용할 전체 피처 (예시)
feature_cols_all = low_vif_vars + medium_vif_vars

# high VIF 컬럼 제거 후 사용
feature_cols = [c for c in feature_cols_all if c in train.columns and c not in high_vif_vars]

# 독립변수(X), 종속변수(y) 분리
X = train[feature_cols]
y = train['target']

# 학습/검증 데이터 분리
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# 이상치(IQR) 계산 및 NaN 처리 (학습 데이터)
Q1 = X_train.quantile(0.25)
Q3 = X_train.quantile(0.75)
IQR = Q3 - Q1
is_outlier = (X_train < (Q1 - 1.5 * IQR)) | (X_train > (Q3 + 1.5 * IQR))
X_train = X_train.mask(is_outlier, np.nan)
X_train = X_train.fillna(X_train.mean())

# 검증 데이터도 동일 처리
Q1_val = X_val.quantile(0.25)
Q3_val = X_val.quantile(0.75)
IQR_val = Q3_val - Q1_val
is_outlier_val = (X_val < (Q1_val - 1.5 * IQR_val)) | (X_val > (Q3_val + 1.5 * IQR_val))
X_val = X_val.mask(is_outlier_val, np.nan)
X_val = X_val.fillna(X_val.mean())

# 회귀모델 학습
model = LinearRegression()
model.fit(X_train, y_train)

# 검증 예측 및 RMSE 계산
pred_val = model.predict(X_val)
rmse = np.sqrt(mean_squared_error(y_val, pred_val))
print(f"Validation RMSE: {rmse:.4f}")

# 제출용 예측 (전체 train 사용 후 테스트 데이터에 적용 시 가정)
scaler = StandardScaler()
scaler.fit(X_train)          # 학습 데이터로 스케일러 학습
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)

# 예: test 데이터 준비 및 전처리 동일하게 수행 필요
test_X = test[feature_cols]
test_X = test_X.mask((test_X < (Q1 - 1.5 * IQR)) | (test_X > (Q3 + 1.5 * IQR)), np.nan)
test_X = test_X.fillna(test_X.mean())

# 테스트 데이터 스케일링
test_X_scaled = scaler.transform(test_X)

# 테스트 데이터 예측
preds = model.predict(test_X_scaled)

# 임시 예측값으로 submission DataFrame 생성 가정
submission = pd.DataFrame({'target': preds})

# 반올림 후 정수 변환
submission['target'] = submission['target'].round().astype(int)

# # CSV 저장
# submission.to_csv('submission_IQR_regressor_2020.csv', index=False)
# print('제출용 CSV 파일이 저장되었습니다: submission_voting_regressor.csv')


In [None]:
feature_cols_all = low_vif_vars + medium_vif_vars
feature_cols = [c for c in feature_cols_all if c in train.columns and c not in high_vif_vars]

X = train[feature_cols]
y = train['target']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# 결측치 평균 대체
X_train = X_train.fillna(X_train.mean())
X_val = X_val.fillna(X_val.mean())

# 랜덤 포레스트 모델 학습
rf = RandomForestRegressor(random_state=42, n_jobs=-1)

param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['sqrt'],
    'bootstrap': [True, False]
}

grid_search = GridSearchCV(estimator=rf, param_grid=param_grid,
                           cv=3, n_jobs=-1, scoring='neg_root_mean_squared_error', verbose=2)

grid_search.fit(X_train, y_train)

print("최적 하이퍼파라미터:", grid_search.best_params_)

best_rf = grid_search.best_estimator_

pred_val = best_rf.predict(X_val)
rmse = np.sqrt(mean_squared_error(y_val, pred_val))
print(f'Tuned RandomForest Validation RMSE: {rmse:.4f}')

# 테스트 데이터 예측 및 제출용 CSV 저장(반올림 정수 변환 포함)
test_X = test[feature_cols]
test_X = test_X.fillna(test_X.mean())
test_preds = rf_model.predict(test_X)

submission = pd.DataFrame({'target': test_preds})
submission['target'] = submission['target'].round().astype(int)