In [None]:
import time
import warnings

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import optuna
import wandb
from optuna.integration import LightGBMPruningCallback

from sklearn.feature_selection import RFE
from scipy.stats import chi2_contingency, spearmanr
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import StackingRegressor
from lightgbm import LGBMRegressor, early_stopping, log_evaluation
import lightgbm
from catboost import CatBoostRegressor, Pool
import xgboost as xgb
from xgboost import XGBRegressor
from sklearn.metrics import mean_squared_error

warnings.filterwarnings('ignore')

In [None]:
# Korean text rendering: use the NanumGothic font family and disable the
# unicode minus glyph for axis tick labels.
plt.rcParams.update({
    'font.family': 'NanumGothic',
    'axes.unicode_minus': False,
})

In [None]:
# Load the train and test CSV files (read as UTF-8) from the working directory.
train, test = (
    pd.read_csv(csv_path, encoding='utf-8')
    for csv_path in ('train_2016.csv', 'test_2016.csv')
)

### 변수 선택 기법

##### 1. 도메인 기반 필요없는 열 제거
##### 2. 범주형: 카이제곱 기반/수치형: 스피어만 기반 p-value 0.05이하
##### 3. 모델 학습 시 embedded 방식(RandomForest만)

In [None]:
# Univariate significance screen of every column of `train` against the target.
target_col = 'target'
ALPHA = 0.05          # p-value threshold for keeping a column
N_TARGET_BINS = 10    # quantile bins used to discretize the target for the chi-square test

# Split the columns by dtype; the target itself is excluded from the numeric list.
categorical_cols = train.select_dtypes(include=['object', 'category']).columns.tolist()
numeric_cols = train.select_dtypes(include=['int64', 'float64']).columns.tolist()
numeric_cols = [col for col in numeric_cols if col != target_col]

# 1. Categorical columns: chi-square test of independence.
# The test needs two categorical variables. Cross-tabulating against the raw
# target would create one column per distinct target value, so the target is
# binned into quantiles first.
target_bins = pd.qcut(train[target_col], q=N_TARGET_BINS, duplicates='drop')

selected_categorical = []
for col in categorical_cols:
    contingency_table = pd.crosstab(train[col], target_bins)
    if contingency_table.size == 0:
        # Column has no observed values; chi2_contingency would raise on an empty table.
        continue
    chi2, p, dof, expected = chi2_contingency(contingency_table)
    if p <= ALPHA:
        selected_categorical.append(col)

# 2. Numeric columns: Spearman rank correlation p-value.
# nan_policy='omit' drops missing pairs. With the default ('propagate') a single
# NaN makes p NaN, and the column is silently rejected by the comparison below.
selected_numeric = []
for col in numeric_cols:
    corr, p = spearmanr(train[col], train[target_col], nan_policy='omit')
    if p <= ALPHA:
        selected_numeric.append(col)

# Report the columns that passed the screen.
print("p-value <= 0.05인 범주형 변수:", selected_categorical)
print("p-value <= 0.05인 수치형 변수:", selected_numeric)

In [None]:
# Print every column that was neither selected by the screen nor is the target.
kept_cols = set(selected_categorical) | set(selected_numeric) | {'target'}
for col in train.columns:
    if col in kept_cols:
        continue
    print(col)

In [None]:
# 변수 선택 1차
def drop_dup_cols(df):
    """Return ``df`` without the columns excluded in the first selection pass.

    The input frame is left untouched. ``DataFrame.drop`` raises ``KeyError``
    when any of the listed columns is missing from ``df``.
    """
    cols_to_drop = [
        '번지', '본번', '부번', '계약일', '도로명', '중개사소재지',
        'k-전화번호', 'k-팩스번호', 'k-관리방식',
        'k-사용검사일-사용승인일', '경비비관리형태', '세대전기계약방법',
        '청소비관리형태', 'k-수정일자',
        '계약월', '주소',
    ]
    return df.drop(columns=cols_to_drop)

In [None]:
# Apply the same column removal to both splits.
train, test = drop_dup_cols(train), drop_dup_cols(test)

In [None]:
# 데이터 확인하기

In [None]:
# Print the DataFrame.info() summary of the training frame after the column drop.
train.info()

In [None]:
# Print the DataFrame.info() summary of the test frame after the column drop.
test.info()

In [None]:
# Keep copies of both frames as they are before renaming and encoding.
train_origin, test_origin = train.copy(), test.copy()

In [None]:
# 모델에 맞춘 칼럼명 변경
def clean_feature_names(df):
    """
    Return a copy of ``df`` with column names rewritten for use as model feature names.

    - Korean characters are kept as they are
    - parentheses are removed
    - ``㎡`` becomes ``m2``; ``.``, ``~``, ``/``, ``=``, ``-`` and spaces become ``_``
    - runs of ``_`` collapse to a single ``_``; leading/trailing ``_`` are stripped
    - bool columns are cast to int
    """
    char_map = str.maketrans({
        "(": None, ")": None,
        "㎡": "m2",
        ".": "_", "~": "_",
        "/": "_", "=": "_", "-": "_", " ": "_",
    })

    def _clean(name):
        # Dropping the empty pieces of a split on "_" both squeezes repeated
        # underscores and trims them from either end.
        pieces = name.translate(char_map).split("_")
        return "_".join(piece for piece in pieces if piece)

    renamed = df.copy()
    renamed.columns = [_clean(name) for name in renamed.columns]

    # bool -> int
    for bool_col in renamed.select_dtypes(include="bool").columns:
        renamed[bool_col] = renamed[bool_col].astype(int)

    return renamed

In [None]:
# Rename the columns of both splits with the same rules.
train, test = clean_feature_names(train), clean_feature_names(test)

In [None]:
# Label-encode the categorical columns.
# 1. Categorical columns are taken from the training frame.
cat_cols = train.select_dtypes(include=['object', 'category']).columns

# 2. Fit one LabelEncoder per column on the training data only.
#    Values are cast to str first, so a missing value is encoded as the label 'nan'.
le_dict = {}
for col in cat_cols:
    le = LabelEncoder()
    train[col] = le.fit_transform(train[col].astype(str))
    le_dict[col] = le  # kept so other splits can be transformed with the same mapping

# 3. Transform the test frame with the fitted encoders.
#    The test values get the same str cast as train; without it NaN and other
#    non-string values would never match a fitted class and would become -1
#    even when train had encoded them. Labels unseen in train map to -1.
#    A dict lookup replaces the per-row `le.transform` call, which is very slow
#    on large frames.
for col in cat_cols:
    le = le_dict[col]
    code_of = {label: code for code, label in enumerate(le.classes_)}
    test[col] = test[col].astype(str).map(code_of).fillna(-1).astype(int)

In [None]:
# Separate the features from the target; the target is modelled on the log1p scale.
train_y = np.log1p(train['target'])
train_x = train.loc[:, train.columns != 'target']

X_test = test.copy()

In [None]:
# Hold out 20% of the training rows for validation (RMSE checks and ensembling).
X_train, X_val, y_train, y_val = train_test_split(
    train_x,
    train_y,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

print("Train shape:", X_train.shape, y_train.shape)
print("Validation shape:", X_val.shape, y_val.shape)
print("test shape:", X_test.shape)

# Undo the log1p transform on the validation target: from here on y_val is on
# the ORIGINAL scale, while y_train stays on the log1p scale.
y_val = np.expm1(y_val)
print(y_val.head())

### LGBM

In [None]:
# LightGBM single model.
# NOTE(review): `use_cols_lgbm` is not defined in the visible cells — confirm it
# exists before this cell runs on a fresh kernel.
lgbm = LGBMRegressor(
    n_estimators=15000,        # upper bound only; early stopping ends training sooner
    learning_rate=0.05,
    max_depth=10,              # cap on tree depth
    num_leaves=31,
    min_child_samples=50,      # minimum number of rows per leaf
    feature_fraction=0.8,      # fraction of features sampled per tree
    bagging_fraction=0.8,      # fraction of rows sampled
    bagging_freq=5,            # re-sample rows every 5 iterations
    lambda_l1=1.0,             # L1 regularization
    lambda_l2=1.0,             # L2 regularization
    random_state=42,
    n_jobs=-1
)

# The model is trained on log1p targets, but y_val was converted back to the
# original scale after the split. The early-stopping target must be on the same
# scale as y_train, otherwise the monitored RMSE is meaningless.
y_val_log = np.log1p(y_val)

# Record the training start time.
start_time = time.time()

lgbm.fit(
    X_train[use_cols_lgbm], y_train,
    eval_set=[(X_val[use_cols_lgbm], y_val_log)],
    eval_metric="rmse",
    callbacks=[early_stopping(stopping_rounds=50, verbose=True), log_evaluation(period=100)]
)

# Record the training end time.
end_time = time.time()
training_time = end_time - start_time
print(f"LGBM 학습 시간: {training_time:.2f} 초")

# Validation RMSE on the original scale (predictions are mapped back with expm1).
y_val_pred_log = lgbm.predict(X_val[use_cols_lgbm])
y_val_pred = np.expm1(y_val_pred_log)

val_rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))
print(f"Validation RMSE: {val_rmse:.4f}")

# Test predictions for submission; these stay on the log1p scale.
y_test_pred_lgbm = lgbm.predict(X_test[use_cols_lgbm])

In [None]:
# Feature importances of the fitted LightGBM model, largest first.
lgbm_importance_unsorted = pd.DataFrame({
    'feature': X_train[use_cols_lgbm].columns,
    'importance': lgbm.feature_importances_,
})
lgbm_importance = lgbm_importance_unsorted.sort_values(by='importance', ascending=False)

In [None]:
# Horizontal bar chart of the LightGBM importances (all features are plotted).
fig, ax = plt.subplots(figsize=(10, 8))
sns.barplot(x='importance', y='feature', data=lgbm_importance, ax=ax)
ax.set_title("Top Feature Importance (LGBM)")
plt.show()

### RandomForest

In [None]:
# Random forest single model.
# NOTE(review): `rf_cols` is not defined in the visible cells — confirm it exists
# before this cell runs on a fresh kernel.
rf = RandomForestRegressor(
    random_state=42,
    n_jobs=-1,
    n_estimators=2000,
    max_depth=20,
    min_samples_split=5,
    min_samples_leaf=2,
)

# Fit on the log1p target and time the training.
start_time = time.time()
rf.fit(X_train[rf_cols], y_train)
end_time = time.time()

training_time = end_time - start_time
print(f"Random Forest 학습 시간: {training_time:.2f} 초")

# Validation RMSE on the original scale (y_val is already on the original scale).
y_val_pred_log = rf.predict(X_val[rf_cols])
y_val_pred = np.expm1(y_val_pred_log)

val_rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))
print(f"Validation RMSE: {val_rmse:.4f}")

# Test predictions for submission; these stay on the log1p scale.
y_test_pred_rf = rf.predict(X_test[rf_cols])

In [None]:
# Depth actually reached by each fitted tree of the forest.
# (The original note refers to a run with max_depth=None, i.e. no depth limit.)
depths = [fitted_tree.tree_.max_depth for fitted_tree in rf.estimators_]
mean_depth = sum(depths) / len(depths)

print("트리 깊이 요약")
print("최소 깊이:", min(depths))
print("평균 깊이:", mean_depth)
print("최대 깊이:", max(depths))

In [None]:
# Feature importances of the fitted random forest, largest first.
# The labels must come from the column list the forest was fitted on (`rf_cols`).
# A different list would mislabel the importances or fail on a length mismatch.
rf_importance = pd.DataFrame({
    'feature': X_train[rf_cols].columns,
    'importance': rf.feature_importances_
}).sort_values(by='importance', ascending=False)

In [None]:
# Horizontal bar chart of the 40 most important random-forest features.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x='importance', y='feature', data=rf_importance.head(40), ax=ax)
ax.set_title("Top Feature Importance (RandomForest)")
plt.show()

### XGBoost

In [None]:
# XGBoost single model (native API).
# NOTE(review): `xgb_cols` is not defined in the visible cells — confirm it
# exists before this cell runs on a fresh kernel.
#
# y_train is log1p(target) while y_val is on the original scale, so the
# validation label is put back on the log1p scale. Otherwise the RMSE that
# early stopping monitors compares log predictions with raw targets.
dtrain = xgb.DMatrix(X_train[xgb_cols], y_train)
dvalid = xgb.DMatrix(X_val[xgb_cols], np.log1p(y_val))
dtest = xgb.DMatrix(X_test[xgb_cols])

params = {
    "objective": "reg:squarederror",
    "eval_metric": "rmse",
    "learning_rate": 0.05,
    "max_depth": 8,
    "subsample": 0.8,
    "colsample_bytree": 0.8,
    "lambda": 1.0,   # L2
    "alpha": 0.0,    # L1
    "seed": 42
}

# The last entry of `evals` is the one early stopping monitors.
evals = [(dtrain, "train"), (dvalid, "valid")]

num_round = 15000
bst = xgb.train(
    params,
    dtrain,
    num_boost_round=num_round,
    evals=evals,
    early_stopping_rounds=50,
    verbose_eval=100
)

# xgb.train returns the booster from the LAST round, so prediction is pinned to
# the best iteration explicitly instead of relying on version-specific defaults.
best_range = (0, bst.best_iteration + 1)

# Validation RMSE on the original scale.
y_val_pred_log = bst.predict(dvalid, iteration_range=best_range)
y_val_pred = np.expm1(y_val_pred_log)  # undo the log1p transform

rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))
print("Validation RMSE:", rmse)

# Test predictions for submission; these stay on the log1p scale.
y_test_pred_xgb = bst.predict(dtest, iteration_range=best_range)

In [None]:
# Gain-based feature importances of the trained booster, largest first.
gain_by_feature = bst.get_score(importance_type='gain')
xgb_importance = (
    pd.DataFrame(list(gain_by_feature.items()), columns=['feature', 'importance'])
    .sort_values(by='importance', ascending=False)
)

In [None]:
# Horizontal bar chart of the 40 highest-gain XGBoost features.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=xgb_importance.head(40), x='importance', y='feature', ax=ax)
ax.set_title("Top Feature Importance (xgb)")
plt.show()

### 앙상블

In [None]:
# Unweighted mean of the three models' test predictions.
# The inputs are on the log1p scale, so the mean is on the log1p scale too.
log_pred_sum = y_test_pred_rf + y_test_pred_lgbm + y_test_pred_xgb
y_pred_mean = log_pred_sum / 3
y_pred_mean

In [None]:
# Inverse-RMSE weighted ensemble, scored on the validation split.
#
# The validation predictions are recomputed from the fitted single models and
# mapped back to the original scale. (Previously the "validation" variables held
# log-scale TEST predictions, so no validation score could be computed.)
rf_val = np.expm1(rf.predict(X_val[rf_cols]))
lgbm_val = np.expm1(lgbm.predict(X_val[use_cols_lgbm]))
xgb_val = np.expm1(bst.predict(dvalid, iteration_range=(0, bst.best_iteration + 1)))

# Per-model validation RMSE, computed rather than hard-coded so the weights
# follow the models that were actually fitted. y_val is on the original scale.
rmse_rf = np.sqrt(mean_squared_error(y_val, rf_val))
rmse_lgbm = np.sqrt(mean_squared_error(y_val, lgbm_val))
rmse_xgb = np.sqrt(mean_squared_error(y_val, xgb_val))

# Weights proportional to 1/RMSE, normalized to sum to 1.
inv = np.array([1/rmse_rf, 1/rmse_lgbm, 1/rmse_xgb])
weights = inv / inv.sum()
print("가중치:", weights)

# Validation performance of the weighted ensemble.
ensemble_val = (
    weights[0]*rf_val +
    weights[1]*lgbm_val +
    weights[2]*xgb_val
)
rmse_ensemble_val = np.sqrt(mean_squared_error(y_val, ensemble_val))
print(f"Validation RMSE: {rmse_ensemble_val:.4f}")

# The same weights applied to the test predictions, on the original scale.
y_pred_weighted = (
    weights[0]*np.expm1(y_test_pred_rf) +
    weights[1]*np.expm1(y_test_pred_lgbm) +
    weights[2]*np.expm1(y_test_pred_xgb)
)

### KFold

In [None]:
# 0. Per-model feature lists — the same lists the single-model cells train on.
# NOTE(review): these lists are not defined in the visible cells — confirm they
# exist before this cell runs on a fresh kernel.
FEATURES_RF = rf_cols          # RF features
FEATURES_LGB = use_cols_lgbm   # LGBM features
FEATURES_XGB = xgb_cols        # XGB features

# 1. 5-fold split of the training portion (X_train / y_train, log1p targets).
kf = KFold(n_splits=5, shuffle=True, random_state=42)

# Out-of-fold predictions (log1p scale), one slot per training row.
rf_oof = np.zeros(len(X_train))
lgbm_oof = np.zeros(len(X_train))
xgb_oof = np.zeros(len(X_train))

# Test predictions (log1p scale), averaged over the folds.
rf_test_pred = np.zeros(len(X_test))
lgbm_test_pred = np.zeros(len(X_test))
xgb_test_pred = np.zeros(len(X_test))

# 2. Train the three models on every fold.
# Note: LightGBM and XGBoost early-stop on the same fold that produces their
# out-of-fold predictions, so their OOF scores can be slightly optimistic.
for fold, (train_idx, val_idx) in enumerate(kf.split(X_train), 1):
    print(f"\n===== Fold {fold} =====")

    # Fold-specific train/validation frames for each model's feature list.
    X_tr_rf, X_va_rf = X_train.iloc[train_idx][FEATURES_RF], X_train.iloc[val_idx][FEATURES_RF]
    X_tr_lgb, X_va_lgb = X_train.iloc[train_idx][FEATURES_LGB], X_train.iloc[val_idx][FEATURES_LGB]
    X_tr_xgb, X_va_xgb = X_train.iloc[train_idx][FEATURES_XGB], X_train.iloc[val_idx][FEATURES_XGB]

    y_tr, y_va = y_train.iloc[train_idx], y_train.iloc[val_idx]

    # ---------------------------
    # Random Forest
    # ---------------------------
    rf = RandomForestRegressor(
        n_estimators=500,
        max_depth=20,
        min_samples_split=10,
        min_samples_leaf=5,
        max_features='sqrt',
        random_state=42,
        n_jobs=-1
    )
    rf.fit(X_tr_rf, y_tr)
    rf_oof[val_idx] = rf.predict(X_va_rf)
    rf_test_pred += rf.predict(X_test[FEATURES_RF]) / kf.n_splits

    # ---------------------------
    # LightGBM
    # ---------------------------
    lgbm = LGBMRegressor(
        n_estimators=10000,
        learning_rate=0.03,
        num_leaves=31,
        max_depth=10,
        min_data_in_leaf=30,
        feature_fraction=0.8,
        bagging_fraction=0.8,
        bagging_freq=5,
        lambda_l1=0,
        lambda_l2=1,
        random_state=42
    )
    # y_va is on the log1p scale here, matching y_tr.
    lgbm.fit(
        X_tr_lgb, y_tr,
        eval_set=[(X_va_lgb, y_va)],
        eval_metric='rmse',
        callbacks=[early_stopping(stopping_rounds=50), log_evaluation(period=100)]
    )
    lgbm_oof[val_idx] = lgbm.predict(X_va_lgb, num_iteration=lgbm.best_iteration_)
    lgbm_test_pred += lgbm.predict(X_test[FEATURES_LGB], num_iteration=lgbm.best_iteration_) / kf.n_splits

    # ---------------------------
    # XGBoost
    # ---------------------------
    xgb_model = XGBRegressor(
        n_estimators=10000,
        learning_rate=0.03,
        max_depth=8,
        min_child_weight=5,
        subsample=0.8,
        colsample_bytree=0.8,
        reg_alpha=0,
        reg_lambda=1,
        gamma=1,
        tree_method='hist',
        random_state=42,
        n_jobs=-1,
        early_stopping_rounds=50,
        eval_metric='rmse'
    )
    xgb_model.fit(
        X_tr_xgb, y_tr,
        eval_set=[(X_va_xgb, y_va)],
        verbose=False
    )
    xgb_oof[val_idx] = xgb_model.predict(X_va_xgb)
    xgb_test_pred += xgb_model.predict(X_test[FEATURES_XGB]) / kf.n_splits

# 3. Map everything back to the original scale, then score the OOF predictions.
rf_oof_orig = np.expm1(rf_oof)
lgbm_oof_orig = np.expm1(lgbm_oof)
xgb_oof_orig = np.expm1(xgb_oof)
y_train_orig = np.expm1(y_train)

rmse_rf = np.sqrt(mean_squared_error(y_train_orig, rf_oof_orig))
rmse_lgbm = np.sqrt(mean_squared_error(y_train_orig, lgbm_oof_orig))
rmse_xgb = np.sqrt(mean_squared_error(y_train_orig, xgb_oof_orig))

print(f"RF RMSE: {rmse_rf:.2f}")
print(f"LGBM RMSE: {rmse_lgbm:.2f}")
print(f"XGB RMSE: {rmse_xgb:.2f}")

# 4. Ensemble weights proportional to 1/RMSE, normalized to sum to 1.
inv = np.array([1/rmse_rf, 1/rmse_lgbm, 1/rmse_xgb])
weights = inv / inv.sum()
print("앙상블 가중치:", weights)

ensemble_oof = (
    weights[0]*rf_oof_orig +
    weights[1]*lgbm_oof_orig +
    weights[2]*xgb_oof_orig
)
rmse_ensemble = np.sqrt(mean_squared_error(y_train_orig, ensemble_oof))
print("앙상블 RMSE:", rmse_ensemble)

# 5. Final test prediction: weighted blend on the original scale.
rf_test_orig = np.expm1(rf_test_pred)
lgbm_test_orig = np.expm1(lgbm_test_pred)
xgb_test_orig = np.expm1(xgb_test_pred)

y_pred = (
    weights[0]*rf_test_orig +
    weights[1]*lgbm_test_orig +
    weights[2]*xgb_test_orig
)

# The ensemble submission cells read `ensemble_pred`, which no cell defined;
# expose the final blend under that name as well.
ensemble_pred = y_pred

### 단일 모델 제출

In [None]:
# Single-model results: random forest test predictions (log1p scale).
y_test_pred_rf

In [None]:
# LightGBM test predictions (log1p scale).
y_test_pred_lgbm

In [None]:
# XGBoost test predictions (log1p scale).
y_test_pred_xgb

In [None]:
# Map the LightGBM test predictions back to the original scale with expm1
# (the inverse of the log1p applied to the training target).
y_test_pred = np.expm1(y_test_pred_lgbm)
y_test_pred

In [None]:
# Build the submission frame: a single `target` column, rounded and cast to int.
submission = pd.DataFrame(y_test_pred, columns=['target'])
submission['target'] = submission['target'].round().astype('int')

In [None]:
# Write the single-model (LightGBM) submission CSV without the index column.
submission.to_csv('submission_lgbm_250911.csv', index=False)

### 앙상블 모델 제출

In [None]:
# Ensemble prediction used for the ensemble submission.
# NOTE(review): `ensemble_pred` is not assigned in any visible cell — confirm
# which ensemble output it is meant to hold.
ensemble_pred

In [None]:
# Build the submission frame: a single `target` column, rounded and cast to int.
submission = pd.DataFrame(ensemble_pred, columns=['target'])
submission['target'] = submission['target'].round().astype('int')

In [None]:
# Write the ensemble submission CSV without the index column.
submission.to_csv('submission_en_250911.csv', index=False)