In [1]:
import pandas as pd
import lightgbm as lgb
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import mean_squared_error

In [2]:
data_path = './data'
train_path = f'{data_path}/new/new_train_ver7.csv'
test_path  = f'{data_path}/new/new_test_ver7.csv'

train_data = pd.read_csv(train_path)
test_data = pd.read_csv(test_path)
print('Train data shape : ', train_data.shape, 'Test data shape : ', test_data.shape)

Train data shape :  (1118822, 184) Test data shape :  (9272, 183)


In [3]:
train_data = train_data.drop(columns=['아파트명', '도로명', 'k-건설사'])
test_data = test_data.drop(columns=['아파트명', '도로명', 'k-건설사'])

In [4]:
X = train_data.drop(columns=['target'])  # 'target'는 예측하려는 부동산 가격 열 이름
y = train_data['target']

# 훈련 및 검증 데이터 분리
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [5]:
# 하이퍼파라미터 설정
param_dist = {
    'num_leaves': [31, 127],
    'learning_rate': [0.01, 0.05, 0.1],
    'n_estimators': [100, 500, 1000],
    'min_child_samples': [20, 30, 50],
    'subsample': [0.7, 0.8, 0.9, 1.0],
    'colsample_bytree': [0.7, 0.8, 0.9, 1.0],
    'feature_fraction': [0.7, 0.8, 0.9, 1.0],
    'bagging_fraction': [0.7, 0.8, 0.9, 1.0],
    'bagging_freq': [1, 5, 10]
}

# 모델 설정
model = lgb.LGBMRegressor(boosting_type='gbdt', objective='regression', metric='rmse')

# 랜덤 서치 설정
random_search = RandomizedSearchCV(
    estimator=model,
    param_distributions=param_dist,
    n_iter=50,  # n_iter 값을 늘림
    scoring='neg_mean_squared_error',
    cv=3,
    verbose=1,
    random_state=42,
    n_jobs=2  # n_jobs 값을 줄임
)

In [6]:
# 랜덤 서치 실행
random_search.fit(X_train, y_train)

# 최적 모델로 예측
best_model = random_search.best_estimator_
y_pred = best_model.predict(X_val)

# RMSE 계산
rmse = mean_squared_error(y_val, y_pred, squared=False)
print(f'Validation RMSE: {rmse}')

Fitting 3 folds for each of 50 candidates, totalling 150 fits


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.025760 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4559
[LightGBM] [Info] Number of data points in the train set: 596704, number of used features: 171
[LightGBM] [Info] Start training from score 57933.096795
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.027170 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 4549
[LightGBM] [Info] Number of data points in the train set: 596705, number of used features: 172
[LightGBM] [Info] Start training from score 58024.945861
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.028098 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory 

In [7]:
print("Best parameters found: ", random_search.best_params_)

Best parameters found:  {'subsample': 0.8, 'num_leaves': 127, 'n_estimators': 1000, 'min_child_samples': 30, 'learning_rate': 0.1, 'feature_fraction': 0.7, 'colsample_bytree': 0.7, 'bagging_freq': 1, 'bagging_fraction': 0.7}


In [8]:
# 최적 모델로 예측
best_model = random_search.best_estimator_

# 검증 데이터 예측
y_pred = best_model.predict(X_val)

# RMSE 계산
rmse = mean_squared_error(y_val, y_pred, squared=False)
print(f'Validation RMSE: {rmse}')



Validation RMSE: 6141.475001116921


In [9]:
# 테스트 데이터 예측
test_pred = best_model.predict(test_data)

# 예측값을 정수형으로 변환
test_pred = test_pred.round().astype(int)



In [10]:
output_path = './output.csv'
output_df = pd.read_csv(output_path)

# 비교할 예측값 가져오기
output_pred = output_df['target']

# 두 예측값 간의 RMSE 계산
comparison_rmse = mean_squared_error(output_pred, test_pred, squared=False)
print(f'Comparison RMSE: {comparison_rmse}')

Comparison RMSE: 21779.13280478178
