In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import optuna
import wandb
from optuna.integration import LightGBMPruningCallback

from sklearn.feature_selection import RFE
from scipy.stats import chi2_contingency, spearmanr
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

from sklearn.model_selection import train_test_split
from lightgbm import LGBMRegressor, early_stopping, log_evaluation
import lightgbm
from sklearn.metrics import mean_squared_error

import time
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Use a Hangul-capable font for plot text and render minus signs as plain
# ASCII hyphens (axes.unicode_minus=False) instead of the Unicode minus glyph.
plt.rcParams.update({
    'font.family': 'NanumGothic',
    'axes.unicode_minus': False,
})

In [None]:
# Load the train and test tables from the working directory (UTF-8 CSV files).
train, test = (
    pd.read_csv(csv_path, encoding='utf-8')
    for csv_path in ('train_2020.csv', 'test_2020.csv')
)

### 변수 선택 기법

##### 1. 도메인 기반 필요없는 열 제거
##### 2. 범주형: 카이제곱 기반/수치형: 스피어만 기반 p-value 0.05이하
##### 3. 모델 학습 시 embedded 방식(RandomForest만)

In [None]:
# Univariate screening of every feature against the dependent variable.
target_col = 'target'

# Split the candidate features by dtype; the target itself is never a candidate.
categorical_cols = list(train.select_dtypes(include=['object', 'category']).columns)
numeric_cols = [
    name
    for name in train.select_dtypes(include=['int64', 'float64']).columns
    if name != target_col
]

# 1. Categorical features: chi-square test of independence, keep p <= 0.05.
# NOTE(review): the contingency table uses the raw target values as its
# columns; if the target is continuous the chi-square assumptions may not
# hold and the table can get very large -- confirm this is intended.
selected_categorical = [
    name
    for name in categorical_cols
    if chi2_contingency(pd.crosstab(train[name], train[target_col]))[1] <= 0.05
]

# 2. Numeric features: Spearman rank correlation, keep p <= 0.05.
# A NaN p-value fails the comparison, so such a column is not selected.
selected_numeric = [
    name
    for name in numeric_cols
    if spearmanr(train[name], train[target_col])[1] <= 0.05
]

# Report what passed the screen.
print("p-value <= 0.05인 범주형 변수:", selected_categorical)
print("p-value <= 0.05인 수치형 변수:", selected_numeric)

In [None]:
# List every column that did NOT pass the p-value screen (the target is exempt).
screened_in = set(selected_categorical) | set(selected_numeric) | {'target'}
for col in train.columns:
    if col in screened_in:
        continue
    print(col)

In [None]:
# Feature selection, first pass
def drop_dup_cols(df):
    """Return a new frame without the five hand-picked columns listed below.

    The input frame is left untouched. pandas raises ``KeyError`` when any of
    the listed columns is absent from ``df``.

    Args:
        df: DataFrame containing all five columns.

    Returns:
        A DataFrame with the remaining columns in their original order.
    """
    # presumably redundant with other features (hence "dup") -- TODO confirm
    redundant_cols = ['재건축 연한(30년 이상)', 'zone4_강남3', 'zone4_내부권', 'zone4_도심', 'zone4_외곽']
    return df.drop(columns=redundant_cols)

In [None]:
# Prune the same columns from both splits so their schemas stay aligned.
# Not re-runnable as-is: a second execution raises KeyError because the
# columns are already gone.
train, test = drop_dup_cols(train), drop_dup_cols(test)

In [None]:
# 데이터 확인하기

In [None]:
# Column names, dtypes and non-null counts of the training frame after pruning.
train.info()

In [None]:
# Same summary for the test frame, to compare its schema against train.
test.info()

In [None]:
# Snapshot both frames before the in-place label encoding that follows,
# so the un-encoded values remain available.
train_origin, test_origin = train.copy(), test.copy()

In [None]:
# Label-encode the categorical columns.
# 1. The columns to encode are chosen from train's dtypes.
cat_cols = train.select_dtypes(include=['object', 'category']).columns

# 2. Fit one LabelEncoder per column on train only and keep it so the same
#    mapping can be reused for validation/test data.
le_dict = {}
for col in cat_cols:
    le = LabelEncoder()
    train[col] = le.fit_transform(train[col].astype(str))  # fit on train only
    le_dict[col] = le

# 3. Encode test with the train mapping; categories unseen in train become -1.
#    Test values are cast to str exactly like train was, so NaN / non-string
#    values resolve to the same class they were given in train. A dict lookup
#    replaces the per-row le.transform([x]) call, which re-ran the encoder
#    once for every single row.
for col in cat_cols:
    le = le_dict[col]
    # LabelEncoder assigns each class its position in the sorted classes_ array.
    code_by_class = {str(label): code for code, label in enumerate(le.classes_)}
    test[col] = test[col].astype(str).map(code_by_class).fillna(-1).astype(int)

In [None]:
# Separate the features from the label column.
feature_cols = train.columns[train.columns != 'target']
train_x = train[feature_cols]

# The model is trained on log1p(target); predictions therefore come out on
# the log scale and need np.expm1 to return to the original scale.
train_y = np.log1p(train['target'])

# Independent copy of the test features used for prediction.
X_test = test.copy()

In [None]:
# Hold out 20% of train as a validation set (for RMSE reporting and ensembling).
X_train, X_val, y_train, y_val = train_test_split(
    train_x,
    train_y,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

print("Train shape:", X_train.shape, y_train.shape)
print("Validation shape:", X_val.shape, y_val.shape)
print("test shape:", X_test.shape)

# Undo the log1p on the validation target (expm1 is the INVERSE transform):
# from here on y_val is on the original scale while y_train stays on the
# log scale.
y_val = np.expm1(y_val)
print(y_val.head())

### LGBM

In [None]:
# Feature selection based on feature importance.
# The columns named in the comments above the dashed line are NOT in
# use_cols_lgbm; the ones below the dashed line are in it. The number in
# parentheses is presumably the validation RMSE recorded for that
# experiment -- TODO confirm.
# '아파트명_target_max', '계약년', '신축(10년 미만)', '전용면적구간_target_min', '전용면적구간_target_mean',(11731.6638),
# '아파트명_target_sum'(12283.6815),
# '아파트명_target_min'(11534.8368)
# '대장아파트거리접근성'(11422.5563)
#-------------------------------------------
# '우수학군'(11522.7932)
# '강남권여부'(11512.8323)
# '프리미엄아파트' (11525.6730)
# '지하철접근성'(11586.5634)
# Columns fed to the LightGBM models, in this order.
use_cols_lgbm = ['구', '전용면적(㎡)', '계약년월', '층', '건축년도', '회사채금리', '매매가격지수', '건설공사비지수',
       '버스정류장수', '지하철수', '연식', 'x좌표', 'y좌표', '아파트명', '계약월',
       '전용면적(log)', '전용면적구간', '평수', 'zone4',
       '대장아파트거리', '대장아파트거리(log)', '버스거리', '지하철거리', '1km이내지하철수',
       '1km이내학교수', '초등학교거리', '초등학교거리구분', '고등학교진학률',
       'elite_min_dist_km', 'elite_cnt_1.5k', 'elite_cnt_2.0k', '계약년월가중치',
       '구_target_mean', '구_target_sum', '구_target_max', '구_target_min',
       '구_target_count', '전용면적구간_target_sum', '지하철접근성',
       '전용면적구간_target_max', '전용면적구간_target_count',
       '아파트명_target_mean', '아파트명_target_count', '우수학군', '강남권여부', '프리미엄아파트']

In [None]:
# Baseline LightGBM regressor, trained on the log1p-transformed target.
lgbm = LGBMRegressor(
    n_estimators=15000,        # deliberately large; early stopping picks the real count
    learning_rate=0.05,        # kept moderate
    max_depth=10,              # cap on tree depth
    num_leaves=31,             # more leaves -> more overfitting, so kept moderate
    min_child_samples=50,      # min samples per leaf; larger -> less overfitting
    feature_fraction=0.8,      # use a subset of the features per tree -> more randomness
    bagging_fraction=0.8,      # row subsampling -> more randomness
    bagging_freq=5,            # re-sample rows every 5 iterations
    lambda_l1=1.0,             # L1 regularisation
    lambda_l2=1.0,             # L2 regularisation
    random_state=42,
    n_jobs=-1
)

# Record when training starts.
start_time = time.time()

# y_val was mapped back to the original scale with expm1 right after the
# split, whereas y_train (and hence the model output) is on the log1p scale.
# Early stopping must compare like with like, so the eval target is put back
# on the log scale here; otherwise the monitored RMSE is meaningless and the
# stopping iteration is arbitrary.
y_val_log = np.log1p(y_val)

lgbm.fit(
    X_train[use_cols_lgbm], y_train,
    eval_set=[(X_val[use_cols_lgbm], y_val_log)],
    eval_metric="rmse",
    callbacks=[early_stopping(stopping_rounds=50, verbose=True), log_evaluation(period=100)]
)

# Record when training ends.
end_time = time.time()
training_time = end_time - start_time
print(f"LGBM 학습 시간: {training_time:.2f} 초")

# Validation RMSE, reported on the original target scale.
y_val_pred_log = lgbm.predict(X_val[use_cols_lgbm])
y_val_pred = np.expm1(y_val_pred_log)

val_rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))
print(f"Validation RMSE: {val_rmse:.4f}")

# Test predictions (for submission). The model outputs log1p(target), so the
# prediction is inverse-transformed before it is rounded and written out.
y_test_pred_lgbm = np.expm1(lgbm.predict(X_test[use_cols_lgbm]))

In [None]:
# Pair every model input column with the baseline model's importance score
# and order the rows from most to least important.
importance_features = X_train[use_cols_lgbm].columns
lgbm_importance = pd.DataFrame(
    {'feature': importance_features, 'importance': lgbm.feature_importances_}
)
lgbm_importance = lgbm_importance.sort_values(by='importance', ascending=False)

In [None]:
# Horizontal bar chart of the importance table (all rows are drawn, not only
# the top few, despite the title).
fig, ax = plt.subplots(figsize=(10, 8))
sns.barplot(data=lgbm_importance, x='importance', y='feature', ax=ax)
ax.set_title("Top Feature Importance (LGBM)")
plt.show()

### LGBM + Optuna + WandB

In [None]:
# Start a Weights & Biases run; the Optuna objective defined in this cell
# logs one record per trial to it, and wandb.finish() closes it at the end.
wandb.init(project="lgbm-optuna-project", name="lgbm_optuna_run")

# Optuna objective function
def objective(trial):
    """Train one LightGBM model with trial-suggested hyper-parameters.

    Reads the notebook globals ``X_train``, ``y_train`` (log1p scale),
    ``X_val``, ``y_val`` (original scale) and ``use_cols_lgbm``, and logs the
    trial's RMSE, training time and parameters to the active W&B run.

    Args:
        trial: the ``optuna.trial.Trial`` used to sample hyper-parameters.

    Returns:
        Validation RMSE on the original target scale (lower is better).
    """
    params = {
        "n_estimators": 10000,
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.1, log=True),
        "max_depth": trial.suggest_int("max_depth", 5, 15),
        "num_leaves": trial.suggest_int("num_leaves", 20, 60),
        "min_child_samples": trial.suggest_int("min_child_samples", 20, 100),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.6, 1.0),
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.6, 1.0),
        "bagging_freq": trial.suggest_int("bagging_freq", 1, 10),
        "lambda_l1": trial.suggest_float("lambda_l1", 0.0, 5.0),
        "lambda_l2": trial.suggest_float("lambda_l2", 0.0, 5.0),
        "random_state": 42,
        "n_jobs": -1,
    }

    model = LGBMRegressor(**params)

    # y_val was mapped back to the original scale with expm1 after the split,
    # while the model is fitted on log1p targets. Early stopping has to be
    # evaluated on the same scale as the training target, so convert it back.
    y_val_log = np.log1p(y_val)

    # Time the fit.
    start_time = time.time()
    model.fit(
        X_train[use_cols_lgbm], y_train,
        eval_set=[(X_val[use_cols_lgbm], y_val_log)],
        eval_metric="rmse",
        callbacks=[
            early_stopping(stopping_rounds=50, verbose=False),
            log_evaluation(period=100)
        ]
    )
    training_time = time.time() - start_time

    # Validation score on the original scale (undo the log transform first).
    y_val_pred_log = model.predict(X_val[use_cols_lgbm])
    y_val_pred = np.expm1(y_val_pred_log)
    val_rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))

    # W&B logging: one record per trial.
    wandb.log({
        "val_rmse": val_rmse,
        "training_time": training_time,
        **params
    })

    return val_rmse

# Run the Optuna search (tqdm shows per-trial progress).
# TPE is Optuna's default sampler; seeding it makes the sequence of suggested
# hyper-parameters repeatable across kernel restarts.
study = optuna.create_study(
    direction="minimize",
    sampler=optuna.samplers.TPESampler(seed=42),
)

try:
    for _ in tqdm(range(30), desc="Optuna Trials"):
        # catch=(Exception,): a trial that raises is recorded as failed and the
        # search carries on with the next one (deliberate best-effort).
        study.optimize(objective, n_trials=1, catch=(Exception,))

    print("Best Trial:")
    # Raises ValueError if every trial failed; the finally-clause still runs.
    print(study.best_trial.params)
finally:
    # Always close the W&B run, even when the search is interrupted or fails.
    wandb.finish()

In [None]:
# Display the hyper-parameters of the best (lowest validation RMSE) trial.
study.best_trial.params

In [None]:
# Train LightGBM with the tuned hyper-parameters.
# The values are hard-coded (presumably copied from an earlier Optuna run --
# they are not read from `study`), so this cell does not depend on the search.
params = {
    'n_estimators' : 15000,
    'learning_rate': 0.03,
    'max_depth': 13,
    'num_leaves': 57,
    'min_child_samples': 96,
    'feature_fraction': 0.8630312552909056,
    'bagging_fraction': 0.8581096366183144,
    'bagging_freq': 1,
    'lambda_l1': 0.11103416306488273,
    'lambda_l2': 3.129785458790951,
    'random_state': 42,
    'n_jobs': -1,
    'verbosity': -1,
    'force_col_wise': True
}

model = LGBMRegressor(**params)

# y_val is on the original scale (expm1 was applied after the split) while
# y_train is log1p-transformed; early stopping needs the eval target on the
# same log scale as the training target.
model.fit(
    X_train[use_cols_lgbm], y_train,
    eval_set=[(X_val[use_cols_lgbm], np.log1p(y_val))],
    eval_metric='rmse',
    callbacks=[early_stopping(100), log_evaluation(0)]
)

# Validation RMSE of THIS model on the original scale. The metric previously
# reused the stale y_val_pred left over from an earlier cell, so it never
# reflected the tuned model.
y_pred = model.predict(X_val[use_cols_lgbm])  # log-scale predictions
y_val_pred = np.expm1(y_pred)
val_rmse = np.sqrt(mean_squared_error(y_val, y_val_pred))
print(f"Validation RMSE: {val_rmse:.4f}")

# Mean residual, available for optional post-processing (bias correction).
residual_mean = (y_val - y_val_pred).mean()
print("val mean residual (y - pred):", residual_mean)

# Test predictions (for submission): use the tuned `model` rather than the
# baseline `lgbm`, and undo the log1p transform so the values are on the
# original target scale before rounding.
y_test_pred_lgbm = np.expm1(model.predict(X_test[use_cols_lgbm]))

### 단일 모델 제출

In [None]:
# Predictions of the single model on the test set
y_test_pred_lgbm

In [None]:
# Shape the predictions for submission: a single integer 'target' column,
# rounded to the nearest whole number.
submission = pd.DataFrame({'target': y_test_pred_lgbm})
submission['target'] = submission['target'].round().astype('int')

In [None]:
# Write the submission CSV to the working directory (no index column).
submission.to_csv('submission_lgbm_optuna_250911.csv', index=False)