In [3]:
import numpy as np
import pandas as pd

from lightgbm import LGBMRegressor, early_stopping, LGBMClassifier, log_evaluation
import xgboost as xgb
from xgboost import XGBRegressor

from sklearn.model_selection import KFold, cross_val_score, StratifiedKFold
from sklearn.ensemble import RandomForestRegressor, ExtraTreesRegressor
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.ensemble import VotingRegressor
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import TimeSeriesSplit
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import GridSearchCV

from bayes_opt import BayesianOptimization

from catboost import CatBoostRegressor

import optuna
import time
from tqdm import tqdm
import time
import matplotlib.pyplot as plt
import seaborn as sns
from matplotlib import rc

rc('font', family='NanumGothic')  
plt.rcParams['axes.unicode_minus'] = False

import warnings
warnings.filterwarnings('ignore')

Note: You have installed the 'manylinux2014' variant of XGBoost. Certain features such as GPU algorithms or federated learning are not available. To use these features, please upgrade to a recent Linux distro with glibc 2.28+, and install the 'manylinux_2_28' variant.


In [6]:
train = pd.read_csv("train_2016.csv")
test  = pd.read_csv("test_2016.csv")

In [7]:
col_map = {
    '시군구': 'district',
    '번지': 'lot_number',
    '본번': 'main_number',
    '부번': 'sub_number',
    '아파트명': 'apartment_name',
    '전용면적(㎡)': 'exclusive_area_m2',
    '계약년월': 'contract_year_month',
    '계약일': 'contract_day',
    '층': 'floor',
    '건축년도': 'construction_year',
    '도로명': 'road_name',
    '등기신청일자': 'registration_date',
    '거래유형': 'transaction_type',
    '중개사소재지': 'realtor_location',
    'k-단지분류(아파트,주상복합등등)': 'complex_type',
    'k-전화번호': 'contact_number',
    'k-팩스번호': 'fax_number',
    'k-세대타입(분양형태)': 'household_type',
    'k-관리방식': 'management_type',
    'k-복도유형': 'corridor_type',
    'k-난방방식': 'heating_type',
    'k-전체동수': 'total_buildings',
    'k-전체세대수': 'total_households',
    'k-건설사(시공사)': 'constructor',
    'k-시행사': 'developer',
    'k-사용검사일-사용승인일': 'approval_date',
    'k-연면적': 'total_floor_area',
    'k-주거전용면적': 'residential_area',
    'k-관리비부과면적': 'management_fee_area',
    'k-전용면적별세대현황(60㎡이하)': 'households_leq_60m2',
    'k-전용면적별세대현황(60㎡~85㎡이하)': 'households_60_85m2',
    'k-85㎡~135㎡이하': 'households_85_135m2',
    'k-수정일자': 'last_modified_date',
    '경비비관리형태': 'security_management_type',
    '세대전기계약방법': 'electricity_contract_type',
    '청소비관리형태': 'cleaning_management_type',
    '건축면적': 'building_area',
    '주차대수': 'parking_spaces',
    '기타/의무/임대/임의=1/2/3/4': 'allocation_type',
    '단지승인일': 'complex_approval_date',
    '사용허가여부': 'occupancy_permit',
    '관리비 업로드': 'management_fee_upload',
    '단지신청일': 'complex_application_date',
    '계약년': 'contract_year',
    '계약월': 'contract_month',
    '브랜드명': 'brand_name',
    '구': 'district_sub',
    '주소': 'address',
    'x좌표': 'x_coord',
    'y좌표': 'y_coord',
    '전용면적(log)': 'exclusive_area_log',
    '전용면적구간': 'area_range',
    '평수': 'pyung_area',
    '연식': 'building_age',
    '신축(10년 미만)': 'new_building_under10y',
    '재건축 연한(30년 이상)': 'redevelopment_eligible_over30y',
    '강남권여부': 'gangnam_area',
    '우수학군': 'elite_school_zone',
    '프리미엄아파트': 'premium_apartment',
    'zone4': 'zone4',
    'zone4_강남3': 'zone4_gangnam3',
    'zone4_내부권': 'zone4_inner',
    'zone4_도심': 'zone4_cbd',
    'zone4_외곽': 'zone4_outer',
    '대장아파트거리': 'prime_apartment_distance',
    '대장아파트거리(log)': 'prime_apartment_distance_log',
    '대장아파트거리접근성': 'prime_distance_accessibility',
    '회사채금리': 'corporate_bond_rate',
    '매매가격지수': 'price_index',
    '건설공사비지수': 'construction_cost_index',
    '거래량': 'transaction_volume',
    '회사채금리t3': 'corporate_bond_rate_t3',
    '회사채금리t6': 'corporate_bond_rate_t6',
    'delta3': 'delta3',
    'delta6': 'delta6',
    'MA3': 'moving_avg_3',
    'MA6': 'moving_avg_6',
    '버스거리': 'bus_distance',
    '지하철거리': 'subway_distance',
    '1km이내지하철수': 'subway_count_within1k',
    '지하철접근성': 'subway_accessibility',
    '1km이내학교수': 'school_count_within1k',
    '초등학교거리': 'elementary_school_distance',
    '초등학교거리구분': 'elementary_school_distance_group',
    '고등학교진학률': 'highschool_advancement_rate',
    'elite_min_dist_km': 'elite_school_min_dist_km',
    'elite_cnt_1.5k': 'elite_school_count_1_5k',
    'elite_cnt_2.0k': 'elite_school_count_2_0k',
    'target_prev3month': 'target_prev3m',
    '계약년월가중치': 'contract_weight',
    'target_prev6month': 'target_prev6m',
    'target': 'target'
}

# DataFrame 컬럼명 영어로 변경
train = train.rename(columns=col_map)

col_map2 = {
    '시군구': 'district',
    '번지': 'lot_number',
    '본번': 'main_number',
    '부번': 'sub_number',
    '아파트명': 'apartment_name',
    '전용면적(㎡)': 'exclusive_area_m2',
    '계약년월': 'contract_year_month',
    '계약일': 'contract_day',
    '층': 'floor',
    '건축년도': 'construction_year',
    '도로명': 'road_name',
    '등기신청일자': 'registration_date',
    '거래유형': 'transaction_type',
    '중개사소재지': 'realtor_location',
    'k-단지분류(아파트,주상복합등등)': 'complex_type',
    'k-전화번호': 'contact_number',
    'k-팩스번호': 'fax_number',
    'k-세대타입(분양형태)': 'household_type',
    'k-관리방식': 'management_type',
    'k-복도유형': 'corridor_type',
    'k-난방방식': 'heating_type',
    'k-전체동수': 'total_buildings',
    'k-전체세대수': 'total_households',
    'k-건설사(시공사)': 'constructor',
    'k-시행사': 'developer',
    'k-사용검사일-사용승인일': 'approval_date',
    'k-연면적': 'total_floor_area',
    'k-주거전용면적': 'residential_area',
    'k-관리비부과면적': 'management_fee_area',
    'k-전용면적별세대현황(60㎡이하)': 'households_leq_60m2',
    'k-전용면적별세대현황(60㎡~85㎡이하)': 'households_60_85m2',
    'k-85㎡~135㎡이하': 'households_85_135m2',
    'k-수정일자': 'last_modified_date',
    '경비비관리형태': 'security_management_type',
    '세대전기계약방법': 'electricity_contract_type',
    '청소비관리형태': 'cleaning_management_type',
    '건축면적': 'building_area',
    '주차대수': 'parking_spaces',
    '기타/의무/임대/임의=1/2/3/4': 'allocation_type',
    '단지승인일': 'complex_approval_date',
    '사용허가여부': 'occupancy_permit',
    '관리비 업로드': 'management_fee_upload',
    '단지신청일': 'complex_application_date',
    '계약년': 'contract_year',
    '계약월': 'contract_month',
    '브랜드명': 'brand_name',
    '구': 'district_sub',
    '주소': 'address',
    'x좌표': 'x_coord',
    'y좌표': 'y_coord',
    '전용면적(log)': 'exclusive_area_log',
    '전용면적구간': 'area_range',
    '평수': 'pyung_area',
    '연식': 'building_age',
    '신축(10년 미만)': 'new_building_under10y',
    '재건축 연한(30년 이상)': 'redevelopment_eligible_over30y',
    '강남권여부': 'gangnam_area',
    '우수학군': 'elite_school_zone',
    '프리미엄아파트': 'premium_apartment',
    'zone4': 'zone4',
    'zone4_강남3': 'zone4_gangnam3',
    'zone4_내부권': 'zone4_inner',
    'zone4_도심': 'zone4_cbd',
    'zone4_외곽': 'zone4_outer',
    '대장아파트거리': 'prime_apartment_distance',
    '대장아파트거리(log)': 'prime_apartment_distance_log',
    '대장아파트거리접근성': 'prime_distance_accessibility',
    '회사채금리': 'corporate_bond_rate',
    '매매가격지수': 'price_index',
    '건설공사비지수': 'construction_cost_index',
    '거래량': 'transaction_volume',
    '회사채금리t3': 'corporate_bond_rate_t3',
    '회사채금리t6': 'corporate_bond_rate_t6',
    'delta3': 'delta3',
    'delta6': 'delta6',
    'MA3': 'moving_avg_3',
    'MA6': 'moving_avg_6',
    '버스거리': 'bus_distance',
    '지하철거리': 'subway_distance',
    '1km이내지하철수': 'subway_count_within1k',
    '지하철접근성': 'subway_accessibility',
    '1km이내학교수': 'school_count_within1k',
    '초등학교거리': 'elementary_school_distance',
    '초등학교거리구분': 'elementary_school_distance_group',
    '고등학교진학률': 'highschool_advancement_rate',
    'elite_min_dist_km': 'elite_school_min_dist_km',
    'elite_cnt_1.5k': 'elite_school_count_1_5k',
    'elite_cnt_2.0k': 'elite_school_count_2_0k',
    'target_prev3month': 'target_prev3m',
    '계약년월가중치': 'contract_weight',
    'target_prev6month': 'target_prev6m',
}

# DataFrame 컬럼명 영어로 변경
test = test.rename(columns=col_map2)

In [9]:
# train.to_csv("train_2016.csv", index = False)
# test.to_csv("test_2016.csv", index = False)

In [8]:
def except_cols(df):
    df = df.drop([
    'lot_number',
    'main_number',
    'sub_number',
    'contract_day',
    'road_name',
    'realtor_location',
    'contact_number',
    'fax_number',
    'management_type',
    'approval_date',
    'security_management_type',
    'electricity_contract_type',
    'cleaning_management_type',
    'last_modified_date',
    'contract_year',
    'contract_month',
    'address'
], axis=1,  errors='ignore')
    return df

In [10]:
# Apply column removal
train = except_cols(train)
test = except_cols(test)

# Separate features and target
X = train.drop(columns=['target'])
y = train['target']

# 범주형 및 카테고리형 컬럼 추출
cat_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()

# 3. 각 범주형 컬럼을 category 타입으로 변환
for col in cat_cols:
    X[col] = X[col].astype('category')
    test[col] = test[col].astype('category')

# unseen 범주형 값은 -1 처리 함수 
def handle_unseen_categories(train_df, test_df, cat_columns):
    for col in cat_columns:
        train_cats = train_df[col].cat.categories
        test_df[col] = test_df[col].cat.set_categories(train_cats)
        test_df[col] = test_df[col].cat.codes  
    return test_df

# test 데이터 unseen 처리
X_test = test.copy()
X_test = handle_unseen_categories(X, X_test, cat_cols)

# train 데이터 범주형도 코드로 변환 (LightGBM 학습시 연속형 사용 권장)
for col in cat_cols:
    X[col] = X[col].cat.codes

In [14]:
# 피처명에 특수문자 제거 또는 변경 (예: 공백, -, /, (, ) 등)
def clean_feature_names(df):
    df = df.rename(columns=lambda x: x.strip()
                                  .replace(' ', '_')
                                  .replace('-', '_')
                                  .replace('/', '_')
                                  .replace('(', '')
                                  .replace(')', '')
                                  .replace(',', '')
                                  .replace('.', '_'))
    return df

# train, test 모두 적용
X = clean_feature_names(X)
X_test = clean_feature_names(X_test)

# 피처명 변경 적용
X = clean_feature_names(X)
X_test = clean_feature_names(X_test)

# 새로 바뀐 컬럼명으로 범주형 컬럼 재생성
cat_cols = X.select_dtypes(include=['object', 'category']).columns.tolist()

# X와 X_test 각각에서 cat_cols에 해당하는 컬럼이 모두 존재하는지 확인
missing_in_test = [col for col in cat_cols if col not in X_test.columns]
if missing_in_test:
    print(f'X_test에 없는 범주형 컬럼: {missing_in_test}')
    # 여기서 해당 컬럼이 없어졌거나 이름이 바뀌었으면 원인 분석 필요

# 범주형 컬럼 변환 시도 (오직 존재하는 컬럼만)
for col in cat_cols:
    X[col] = X[col].astype('category')
    if col in X_test.columns:
        X_test[col] = X_test[col].astype('category')
    else:
        print(f"Warning: {col} is missing in X_test")

In [16]:
X

Unnamed: 0,district,apartment_name,exclusive_area_m2,contract_year_month,floor,construction_year,registration_date,transaction_type,complex_type,household_type,...,school_count_within1k,elementary_school_distance,elementary_school_distance_group,highschool_advancement_rate,elite_school_min_dist_km,elite_school_count_1_5k,elite_school_count_2_0k,target_prev3m,contract_weight,target_prev6m
0,0,561,79.97,201712,1,1987,181,0,2,1,...,6,0.450174,4,5.0,0.517496,2,2,190333.333333,211,157666.666667
1,0,561,79.97,201712,1,1987,181,0,2,1,...,6,0.450174,4,5.0,0.517496,2,2,150933.333333,211,142800.000000
2,0,561,54.98,201712,1,1987,181,0,2,1,...,6,0.450174,4,5.0,0.517496,2,2,137500.000000,211,142633.333333
3,0,561,79.97,201801,1,1987,181,0,2,1,...,6,0.450174,4,5.5,0.517496,2,2,75133.333333,300,114983.333333
4,0,561,79.97,201801,1,1987,181,0,2,1,...,6,0.450174,4,5.5,0.517496,2,2,146666.666667,300,145333.333333
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
516840,336,3830,59.99,201608,1,2014,181,0,2,0,...,1,0.421493,4,2.8,1.859554,0,1,38033.333333,107,37633.333333
516841,336,3830,84.82,201608,2,2014,181,0,2,0,...,1,0.421493,4,2.8,1.859554,0,1,36133.333333,107,35616.666667
516842,336,3830,59.99,201608,0,2014,181,0,2,0,...,1,0.421493,4,2.8,1.859554,0,1,37833.333333,107,35533.333333
516843,336,3830,84.94,201608,1,2014,181,0,2,0,...,1,0.421493,4,2.8,1.859554,0,1,30000.000000,107,39133.333333


In [25]:
import lightgbm as lgb
# Train-validation split for monitoring
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Prepare LightGBM dataset
train_data = lgb.Dataset(X_train, label=y_train, categorical_feature=cat_cols, free_raw_data=False)
valid_data = lgb.Dataset(X_val, label=y_val, reference=train_data, categorical_feature=cat_cols, free_raw_data=False)

# LightGBM parameters for RMSE and overfitting prevention
params = {
    'objective': 'regression',
    'metric': 'rmse',
    'verbosity': -1,
    'boosting_type': 'gbdt',
    'seed': 42,
    'learning_rate': 0.05,
    'num_leaves': 31,
    'feature_fraction': 0.9,
    'bagging_fraction': 0.8,
    'bagging_freq': 5,
    'max_depth': -1,
}

kf = KFold(n_splits=5, shuffle=True, random_state=42)
rmse_scores = []
test_preds = np.zeros(len(X_test))  # 테스트 예측값 누적 합산용

for fold, (train_idx, val_idx) in enumerate(kf.split(X)):
    print(f'Fold {fold + 1} 시작')
    X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
    y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    train_data = lgb.Dataset(X_train, label=y_train, categorical_feature=cat_cols, free_raw_data=False)
    valid_data = lgb.Dataset(X_val, label=y_val, reference=train_data, categorical_feature=cat_cols, free_raw_data=False)

    callbacks = [lgb.early_stopping(stopping_rounds=100, verbose=True),
                     lgb.log_evaluation(period=100)]
    
    model = lgb.train(
    params,
    train_data,
    valid_sets=[train_data, valid_data],
    valid_names=['train', 'valid'],
    num_boost_round=1000,
    callbacks=callbacks
    )

    val_preds = model.predict(X_val, num_iteration=model.best_iteration)
    rmse = np.sqrt(mean_squared_error(y_val, val_preds))
    rmse_scores.append(rmse)
    print(f'Fold {fold + 1} RMSE: {rmse:.4f}')

    # 테스트 데이터 예측값 누적 (평균 예측을 위해)
    test_fold_preds = model.predict(X_test, num_iteration=model.best_iteration)
    test_preds += test_fold_preds

print(f'CV 평균 RMSE: {np.mean(rmse_scores):.4f}')

# 테스트 예측값 평균내기
test_preds /= kf.n_splits

# 제출용 DataFrame 생성
submission = pd.DataFrame({"target": test_preds})
if 'id' in test.columns:
    submission['id'] = test['id']

submission['target'] = submission['target'].round().astype(int)
# CSV 파일로 저장
submission.to_csv('submission_lgbm.csv', index=False)
print('제출용 CSV 파일이 저장되었습니다: submission_lgbm_2016.csv')

Fold 1 시작
Training until validation scores don't improve for 100 rounds
[100]	train's rmse: 16718.5	valid's rmse: 17128.3
[200]	train's rmse: 13657.3	valid's rmse: 14172
[300]	train's rmse: 12148.6	valid's rmse: 12762.9
[400]	train's rmse: 11201.7	valid's rmse: 11906.2
[500]	train's rmse: 10505.7	valid's rmse: 11303.5
[600]	train's rmse: 9954.58	valid's rmse: 10845.6
[700]	train's rmse: 9521.42	valid's rmse: 10497.3
[800]	train's rmse: 9144.67	valid's rmse: 10199.6
[900]	train's rmse: 8834.29	valid's rmse: 9952.73
[1000]	train's rmse: 8562.59	valid's rmse: 9739.59
Did not meet early stopping. Best iteration is:
[1000]	train's rmse: 8562.59	valid's rmse: 9739.59
Fold 1 RMSE: 9739.5861
Fold 2 시작
Training until validation scores don't improve for 100 rounds
[100]	train's rmse: 16741.2	valid's rmse: 17260.8
[200]	train's rmse: 13656.1	valid's rmse: 14299.9
[300]	train's rmse: 12135.9	valid's rmse: 12884.6
[400]	train's rmse: 11156.5	valid's rmse: 12014.4
[500]	train's rmse: 10442.5	valid's

In [None]:
lgb_importances = lgb.feature_importances_
lgb_feat = pd.Series(lgb_importances, index=X_train_df.columns).sort_values(ascending=False)
print(lgb_feat.tail())

plt.figure(figsize=(8, 6))
lgb_feat.head(20).plot(kind='barh')
plt.title("LightGBM Feature Importances")
plt.gca().invert_yaxis()
plt.tight_layout()
plt.show()

In [21]:
# 분류용 VotingClassifier
# 데이터 분할
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Random Forest Regressor
print("Training Random Forest...")
rf_model = RandomForestRegressor(
    n_estimators=200,
    max_depth=10,
    min_samples_split=10,
    min_samples_leaf=5,
    random_state=42,
    n_jobs=-1,
    verbose=1  # RF는 verbose 지원 시 출력됨 (scikit-learn 최신버전 기준)
)
rf_model.fit(X_train, y_train)
print("Random Forest training completed.\n")

# XGBoost Regressor
print("Training XGBoost...")
xgb_model = XGBRegressor(
    n_estimators=1000,
    learning_rate=0.03,
    max_depth=6,
    subsample=0.8,
    colsample_bytree=0.8,
    gamma=1,
    reg_alpha=0.1,
    reg_lambda=1,
    random_state=42,
    n_jobs=-1,
    verbosity=1,  
    eval_metric='rmse'
)
xgb_model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    verbose=True  

)
print("XGBoost training completed.\n")

# LightGBM Regressor
print("Training LightGBM...")
lgbm_model = LGBMRegressor(
    n_estimators=1000,
    learning_rate=0.03,
    num_leaves=31,
    max_depth=10,
    subsample=0.8,
    colsample_bytree=0.8,
    reg_alpha=0.1,
    reg_lambda=1,
    random_state=42,
    n_jobs=-1,
    verbose=1
)
lgbm_model.fit(
    X_train, y_train,
    eval_set=[(X_val, y_val)],
    callbacks=[early_stopping(stopping_rounds=100, verbose=True),
               log_evaluation(period=100)]
)
print("LightGBM training completed.\n")

# Voting Regressor 앙상블
print("Training Voting Regressor...")
voting_reg = VotingRegressor(estimators=[
    ('rf', rf_model), 
    ('xgb', xgb_model), 
    ('lgbm', lgbm_model)
])
# VotingRegressor는 개별 모델 이미 학습했으므로 fit 필요없음, 단일 학습시만 재호출 가능
# 여기서는 이미 개별 모델 학습 후 조합만 하므로 fit 따로 안함
voting_reg.fit(X_train, y_train)
# 검증 데이터 예측 및 RMSE
y_pred = voting_reg.predict(X_val)
rmse = np.sqrt(mean_squared_error(y_val, y_pred))
print(f"Voting Regressor RMSE: {rmse:.4f}")


# 테스트 데이터 예측
test_preds = voting_reg.predict(X_test)  

# 제출용 DataFrame 생성
submission = pd.DataFrame({'target': test_preds})

# id 컬럼 있으면 추가
if 'id' in test.columns:
    submission['id'] = test['id']

submission['target'] = submission['target'].round().astype(int)

# CSV 파일로 저장
submission.to_csv('submission_voting_regressor.csv', index=False)
print('제출용 CSV 파일이 저장되었습니다: submission_voting_regressor.csv')

Training Random Forest...


[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 10 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 tasks      | elapsed:   21.0s
[Parallel(n_jobs=-1)]: Done 180 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:  2.3min finished


Random Forest training completed.

Training XGBoost...
[0]	validation_0-rmse:54591.29723
[1]	validation_0-rmse:53408.32031
[2]	validation_0-rmse:52283.33758
[3]	validation_0-rmse:51175.44884
[4]	validation_0-rmse:50112.76259
[5]	validation_0-rmse:49079.27590
[6]	validation_0-rmse:48096.26788
[7]	validation_0-rmse:47142.24091
[8]	validation_0-rmse:46226.59476
[9]	validation_0-rmse:45331.04411
[10]	validation_0-rmse:44481.36711
[11]	validation_0-rmse:43649.52050
[12]	validation_0-rmse:42831.29092
[13]	validation_0-rmse:42044.06500
[14]	validation_0-rmse:41293.82850
[15]	validation_0-rmse:40580.43497
[16]	validation_0-rmse:39889.98394
[17]	validation_0-rmse:39195.11598
[18]	validation_0-rmse:38543.26650
[19]	validation_0-rmse:37901.23822
[20]	validation_0-rmse:37294.32328
[21]	validation_0-rmse:36705.98297
[22]	validation_0-rmse:36124.98785
[23]	validation_0-rmse:35569.27556
[24]	validation_0-rmse:35030.33508
[25]	validation_0-rmse:34529.04163
[26]	validation_0-rmse:34021.09965
[27]	valid

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 10 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 tasks      | elapsed:   22.3s
[Parallel(n_jobs=-1)]: Done 180 tasks      | elapsed:  2.1min
[Parallel(n_jobs=-1)]: Done 200 out of 200 | elapsed:  2.4min finished


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.031255 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 8532
[LightGBM] [Info] Number of data points in the train set: 413476, number of used features: 73
[LightGBM] [Info] Start training from score 73389.950043


[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:    0.0s
[Parallel(n_jobs=10)]: Done 180 tasks      | elapsed:    0.2s
[Parallel(n_jobs=10)]: Done 200 out of 200 | elapsed:    0.2s finished


Voting Regressor RMSE: 12452.3384
제출용 CSV 파일이 저장되었습니다: submission_voting_regressor.csv


[Parallel(n_jobs=10)]: Using backend ThreadingBackend with 10 concurrent workers.
[Parallel(n_jobs=10)]: Done  30 tasks      | elapsed:    0.0s
[Parallel(n_jobs=10)]: Done 180 tasks      | elapsed:    0.0s
[Parallel(n_jobs=10)]: Done 200 out of 200 | elapsed:    0.0s finished


In [29]:
print(train.columns.tolist())

['district', 'apartment_name', 'exclusive_area_m2', 'contract_year_month', 'floor', 'construction_year', 'registration_date', 'transaction_type', 'complex_type', 'household_type', 'corridor_type', 'heating_type', 'total_buildings', 'total_households', 'constructor', 'developer', 'total_floor_area', 'residential_area', 'management_fee_area', 'households_leq_60m2', 'households_60_85m2', 'households_85_135m2', 'building_area', 'parking_spaces', 'allocation_type', 'complex_approval_date', 'occupancy_permit', 'management_fee_upload', 'complex_application_date', 'brand_name', 'district_sub', 'x_coord', 'y_coord', 'exclusive_area_log', 'area_range', 'pyung_area', 'building_age', 'new_building_under10y', 'redevelopment_eligible_over30y', 'gangnam_area', 'elite_school_zone', 'premium_apartment', 'zone4', 'zone4_gangnam3', 'zone4_inner', 'zone4_cbd', 'zone4_outer', 'prime_apartment_distance', 'prime_apartment_distance_log', 'prime_distance_accessibility', 'corporate_bond_rate', 'price_index', 'c

In [41]:
print(X_train.isnull().sum())
print(X_val.isnull().sum())
print(test_X.isnull().sum())


total_buildings            0
total_floor_area           0
households_leq_60m2        0
households_60_85m2         0
price_index                0
construction_cost_index    0
subway_distance            0
subway_accessibility       0
building_area              0
parking_spaces             0
gangnam_area               0
elite_school_zone          0
dtype: int64
total_buildings            0
total_floor_area           0
households_leq_60m2        0
households_60_85m2         0
price_index                0
construction_cost_index    0
subway_distance            0
subway_accessibility       0
building_area              0
parking_spaces             0
gangnam_area               0
elite_school_zone          0
dtype: int64
total_buildings               0
total_floor_area              0
households_leq_60m2           0
households_60_85m2            0
price_index                   0
construction_cost_index    9272
subway_distance               0
subway_accessibility          0
building_area         

In [17]:
# 다중공선성 높아 제거 권장 컬럼
drop_high_vif = [
    'construction_year', 'total_households', 'residential_area', 'management_fee_area',
    'building_age', 'moving_avg_3', 'moving_avg_6'
]

# 중간 위험 컬럼
drop_mid_vif = [
    'total_buildings', 'total_floor_area', 'households_leq_60m2', 'households_60_85m2',
    'price_index', 'subway_distance', 'subway_accessibility'
]

# 다중공선성 낮은 컬럼 (유지)
keep_low_vif = [
    'main_number', 'sub_number', 'contract_day', 'building_area', 'parking_spaces',
    'gangnam_area', 'elite_school_zone'
]


In [45]:

# 'construction_cost_index',
# 모델에 사용할 전체 피처 (예시)
feature_cols_all = keep_low_vif

# high VIF 컬럼 제거 후 사용
feature_cols = [c for c in feature_cols_all if c in train.columns and c not in drop_high_vif]

# 독립변수(X), 종속변수(y) 분리
X = train[feature_cols]
y = train['target']

# 학습/검증 데이터 분리
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# 이상치(IQR) 계산 및 NaN 처리 (학습 데이터)
Q1 = X_train.quantile(0.25)
Q3 = X_train.quantile(0.75)
IQR = Q3 - Q1
is_outlier = (X_train < (Q1 - 1.5 * IQR)) | (X_train > (Q3 + 1.5 * IQR))
X_train = X_train.mask(is_outlier, np.nan)
X_train = X_train.fillna(X_train.mean())

# 검증 데이터도 동일 처리
Q1_val = X_val.quantile(0.25)
Q3_val = X_val.quantile(0.75)
IQR_val = Q3_val - Q1_val
is_outlier_val = (X_val < (Q1_val - 1.5 * IQR_val)) | (X_val > (Q3_val + 1.5 * IQR_val))
X_val = X_val.mask(is_outlier_val, np.nan)
X_val = X_val.fillna(X_val.mean())

# 회귀모델 학습
model = LinearRegression()
model.fit(X_train, y_train)

# 검증 예측 및 RMSE 계산
pred_val = model.predict(X_val)
rmse = np.sqrt(mean_squared_error(y_val, pred_val))
print(f"Validation RMSE: {rmse:.4f}")

# 제출용 예측 (전체 train 사용 후 테스트 데이터에 적용 시 가정)
scaler = StandardScaler()
scaler.fit(X_train)          # 학습 데이터로 스케일러 학습
X_train_scaled = scaler.transform(X_train)
X_val_scaled = scaler.transform(X_val)

# 예: test 데이터 준비 및 전처리 동일하게 수행 필요
test_X = test[feature_cols]
test_X = test_X.mask((test_X < (Q1 - 1.5 * IQR)) | (test_X > (Q3 + 1.5 * IQR)), np.nan)
test_X = test_X.fillna(test_X.mean())

# 테스트 데이터 스케일링
test_X_scaled = scaler.transform(test_X)

# 테스트 데이터 예측
preds = model.predict(test_X_scaled)

# 임시 예측값으로 submission DataFrame 생성 가정
submission = pd.DataFrame({'target': preds})

# 반올림 후 정수 변환
submission['target'] = submission['target'].round().astype(int)

# CSV 저장
submission.to_csv('submission_IQR_regressor.csv', index=False)


Validation RMSE: 52833.9483
제출용 CSV 파일이 저장되었습니다: submission_voting_regressor.csv


In [None]:
# 실패 : 출력하는데 너무 오래 걸림

feature_cols_all = drop_mid_vif + keep_low_vif
feature_cols = [c for c in feature_cols_all if c in train.columns and c not in drop_high_vif]

X = train[feature_cols]
y = train['target']

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# 결측치 평균 대체
X_train = X_train.fillna(X_train.mean())
X_val = X_val.fillna(X_val.mean())

# 랜덤 포레스트 모델 학습
rf = RandomForestRegressor(random_state=42, n_jobs=-1)

param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [10, 20, None],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4],
    'max_features': ['auto', 'sqrt'],
    'bootstrap': [True, False]
}

grid_search = GridSearchCV(estimator=rf, param_grid=param_grid,
                           cv=3, n_jobs=-1, scoring='neg_root_mean_squared_error', verbose=2)

grid_search.fit(X_train, y_train)

print("최적 하이퍼파라미터:", grid_search.best_params_)

best_rf = grid_search.best_estimator_

pred_val = best_rf.predict(X_val)
rmse = np.sqrt(mean_squared_error(y_val, pred_val))
print(f'Tuned RandomForest Validation RMSE: {rmse:.4f}')

# 테스트 데이터 예측 및 제출용 CSV 저장(반올림 정수 변환 포함)
test_X = test[feature_cols]
test_X = test_X.fillna(test_X.mean())
test_preds = rf_model.predict(test_X)

submission = pd.DataFrame({'target': test_preds})
submission['target'] = submission['target'].round().astype(int)