In [1]:
# 필요한 라이브러리 로드
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error
import lightgbm as lgb
from sklearn.preprocessing import LabelEncoder, StandardScaler
import numpy as np
import xgboost as xgb
from catboost import CatBoostRegressor
import optuna
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestRegressor

# 데이터 로드
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')
submission = pd.read_csv('data/sample_submission.csv')

# Calculate quantiles for 주행거리(km)
quantiles = train['주행거리(km)'].quantile([0, 0.2, 0.4, 0.6, 0.8, 1.0]).to_dict()

# Define bin edges and labels based on quantiles
bin_edges = [
    quantiles[0],  # Minimum
    quantiles[0.2], # 5th percentile
    quantiles[0.4], # Q1
    quantiles[0.6],  # Median
    quantiles[0.8], # Q3
    quantiles[1.0]   # Maximum
]
bin_labels = ['Very Low', 'Low', 'Medium', 'High', 'Very High']

# Apply binning
train['주행거리_bin'] = pd.cut(train['주행거리(km)'], bins=bin_edges, labels=bin_labels, include_lowest=True)

# Group by 차량상태 and 주행거리_bin to calculate the median of 배터리용량
grouped_median = (
    train.groupby(['차량상태', '주행거리_bin'])['배터리용량']
    .median()
    .reset_index()
    .rename(columns={'배터리용량': '대치값'})
)

# Merge the grouped median back into the train dataframe
train = train.merge(grouped_median, on=['차량상태', '주행거리_bin'], how='left')

# Fill missing values in 배터리용량 using the 대치값
train['배터리용량'] = train['배터리용량'].fillna(train['대치값'])

# Drop the 대치값 column as it's no longer needed
train.drop(columns=['대치값'], inplace=True)

# Apply binning
test['주행거리_bin'] = pd.cut(test['주행거리(km)'], bins=bin_edges, labels=bin_labels, include_lowest=True)

# Group by 차량상태 and 주행거리_bin to calculate the median of 배터리용량
grouped_median = (
    test.groupby(['차량상태', '주행거리_bin'])['배터리용량']
    .median()
    .reset_index()
    .rename(columns={'배터리용량': '대치값'})
)

# Merge the grouped median back into the train dataframe
test = test.merge(grouped_median, on=['차량상태', '주행거리_bin'], how='left')

# Fill missing values in 배터리용량 using the 대치값
test['배터리용량'] = test['배터리용량'].fillna(test['대치값'])

# Drop the 대치값 column as it's no longer needed
test.drop(columns=['대치값'], inplace=True)

# Calculate the median of 배터리용량 grouped by 차량상태
state_median = (
    train.groupby('차량상태')['배터리용량']
    .median()
    .reset_index()
    .rename(columns={'배터리용량': '차량상태_대치값'})
)

# Merge the state-based median into the test dataframe
test = test.merge(state_median, on='차량상태', how='left')

# Fill the remaining missing value in 배터리용량 using 차량상태_대치값
test['배터리용량'] = test['배터리용량'].fillna(test['차량상태_대치값'])

# Drop the 차량상태_대치값 column as it's no longer needed
test.drop(columns=['차량상태_대치값'], inplace=True)


# 사고이력과 차량상태를 더미 변수로 변환
train = pd.get_dummies(train, columns=['제조사', '모델', '구동방식'], drop_first=True)
test = pd.get_dummies(test, columns=['제조사', '모델', '구동방식'], drop_first=True)

# Label Encoding for '차량상태'
label_encoder = LabelEncoder()
train['차량상태'] = label_encoder.fit_transform(train['차량상태'])
test['차량상태'] = label_encoder.transform(test['차량상태'])
train['사고이력'] = label_encoder.fit_transform(train['사고이력'])
test['사고이력'] = label_encoder.transform(test['사고이력'])


# 수치형 변수 이상치 제거 함수
def remove_outliers(df, columns, z_thresh=3):
    for col in columns:
        mean = df[col].mean()
        std = df[col].std()
        z_scores = (df[col] - mean) / std
        df = df[np.abs(z_scores) <= z_thresh]  # z-score 기준 이상치 제거
    return df


# 파생변수 생성
train['배터리효율'] = train['배터리용량'] / (train['주행거리(km)'] + 1)  # 주행거리가 0일 경우를 방지
test['배터리효율'] = test['배터리용량'] / (test['주행거리(km)'] + 1)

train['연간주행거리'] = train['주행거리(km)'] / (train['연식(년)'] + 1)  # 연식이 0일 경우 방지
test['연간주행거리'] = test['주행거리(km)'] / (test['연식(년)'] + 1)




# 수치형 변수
continuous_vars = ['보증기간(년)', '배터리효율', '연간주행거리']

# print(f"train 행 개수: {len(train)}")
# 
# # 이상치 제거
# train = remove_outliers(train, continuous_vars)
# 
# print(f"train 행 개수: {len(train)}")


# Initialize the scaler
scaler = StandardScaler()

# Fit and transform the scaler on train data, and transform the test data
train[continuous_vars] = scaler.fit_transform(train[continuous_vars])
test[continuous_vars] = scaler.transform(test[continuous_vars])


# 타깃 변수와 특성 분리
X = train.drop(columns=['ID', '가격(백만원)', '주행거리_bin'])
y = train['가격(백만원)']
print(X.columns)
test_data = test.drop(columns=['ID', '주행거리_bin'])

print("특성 개수", len(X.columns))
print(X.info())


# LightGBM, XGBoost, CatBoost Optuna 최적화
def objective(trial, model_type):
    if model_type == 'lightgbm':
        params = {
            'objective': 'regression',
            'metric': 'rmse',
            'boosting_type': 'gbdt',
            'learning_rate': trial.suggest_float('learning_rate', 0.001, 0.1),
            'subsample': trial.suggest_float('subsample', 0.7, 1.0),
            'max_depth': trial.suggest_int('max_depth', 7, 30),
            'colsample_bytree': trial.suggest_float('colsample_bytree', 0.7, 1.0),
            'random_state': 42
        }
        lgb_dataset = lgb.Dataset(X, y)
        cv_results = lgb.cv(
            params,
            lgb_dataset,
            num_boost_round=500,
            nfold=5,
            metrics='rmse',
            stratified=False,
            seed=42        )
        return cv_results['valid rmse-mean'][-1]


# 모델별 Optuna 최적화
lgb_study = optuna.create_study(direction='minimize')
lgb_study.optimize(lambda trial: objective(trial, 'lightgbm'), n_trials=20)
lgb_best_params = lgb_study.best_params
lgb_best_rmse = lgb_study.best_value


# 최적 모델 선택
best_model_type = min(
    [('lightgbm', lgb_best_rmse, lgb_best_params)],
    key=lambda x: x[1]
)

print(f"Best Model: {best_model_type[0]} with RMSE: {best_model_type[1]}")

# 최적 모델 학습 및 예측
if best_model_type[0] == 'lightgbm':
    final_model = lgb.train(
        {**best_model_type[2], 'objective': 'regression', 'metric': 'rmse'},
        lgb.Dataset(X, y),
        num_boost_round=500
    )
    final_pred = final_model.predict(test_data)


# 제출 파일 생성
submission['가격(백만원)'] = final_pred
submission.to_csv('submission.csv', index=False)
print("제출 파일이 'submission.csv'로 저장되었습니다.")

  from .autonotebook import tqdm as notebook_tqdm
  train.groupby(['차량상태', '주행거리_bin'])['배터리용량']
  test.groupby(['차량상태', '주행거리_bin'])['배터리용량']
[I 2025-01-13 22:34:32,161] A new study created in memory with name: no-name-88c34afe-17dc-45c8-8464-a8cf9f3bfaa4


Index(['차량상태', '배터리용량', '주행거리(km)', '보증기간(년)', '사고이력', '연식(년)', '제조사_B사',
       '제조사_H사', '제조사_K사', '제조사_P사', '제조사_T사', '제조사_V사', '모델_ID4', '모델_ION5',
       '모델_ION6', '모델_IONIQ', '모델_KNE', '모델_M3', '모델_MS', '모델_MX', '모델_MY',
       '모델_Niro', '모델_Q4eT', '모델_RSeTGT', '모델_Soul', '모델_Tay', '모델_TayCT',
       '모델_TayGTS', '모델_eT', '모델_i3', '모델_i5', '모델_iX', '구동방식_FWD', '구동방식_RWD',
       '배터리효율', '연간주행거리'],
      dtype='object')
특성 개수 36
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7497 entries, 0 to 7496
Data columns (total 36 columns):
 #   Column     Non-Null Count  Dtype  
---  ------     --------------  -----  
 0   차량상태       7497 non-null   int32  
 1   배터리용량      7497 non-null   float64
 2   주행거리(km)   7497 non-null   int64  
 3   보증기간(년)    7497 non-null   float64
 4   사고이력       7497 non-null   int32  
 5   연식(년)      7497 non-null   int64  
 6   제조사_B사     7497 non-null   bool   
 7   제조사_H사     7497 non-null   bool   
 8   제조사_K사     7497 non-null   bool   
 9   제조사_P사 

[I 2025-01-13 22:34:46,549] Trial 0 finished with value: 1.4045717802691509 and parameters: {'learning_rate': 0.06540496198357808, 'subsample': 0.8455769098780449, 'max_depth': 11, 'colsample_bytree': 0.9074181771538984}. Best is trial 0 with value: 1.4045717802691509.


[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000651 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 944
[LightGBM] [Info] Number of data points in the train set: 5996, number of used features: 36
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001146 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 944
[LightGBM] [Info] Number of data points in the train set: 5996, number of used features: 36
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000698 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 944
[LightGBM] [Info] Number of data points in the train set: 5996, number of used features: 36
[LightGBM] 

[I 2025-01-13 22:35:02,743] Trial 1 finished with value: 1.3514855542737705 and parameters: {'learning_rate': 0.021130992407605284, 'subsample': 0.8779721041142028, 'max_depth': 26, 'colsample_bytree': 0.8035594309437508}. Best is trial 1 with value: 1.3514855542737705.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001206 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 944
[LightGBM] [Info] Number of data points in the train set: 5996, number of used features: 36
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000576 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 944
[LightGBM] [Info] Number of data points in the train set: 5996, number of used features: 36
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000780 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 944
[LightGBM] [Info] Number of data points in the train set: 5996, number of used features: 36
[LightGBM] 

[I 2025-01-13 22:35:17,389] Trial 2 finished with value: 1.4088975747582515 and parameters: {'learning_rate': 0.07301619289555257, 'subsample': 0.7741866775420554, 'max_depth': 27, 'colsample_bytree': 0.9704490528835235}. Best is trial 1 with value: 1.3514855542737705.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001081 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 944
[LightGBM] [Info] Number of data points in the train set: 5996, number of used features: 36
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000760 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 944
[LightGBM] [Info] Number of data points in the train set: 5996, number of used features: 36
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001237 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 944
[LightGBM] [Info] Number of data points in the train set: 5996, number of used features: 36
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of t

[I 2025-01-13 22:35:33,407] Trial 3 finished with value: 1.3594932548685719 and parameters: {'learning_rate': 0.03590624628240018, 'subsample': 0.8771389310165039, 'max_depth': 16, 'colsample_bytree': 0.9372325025622986}. Best is trial 1 with value: 1.3514855542737705.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001239 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 944
[LightGBM] [Info] Number of data points in the train set: 5996, number of used features: 36
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000656 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 944
[LightGBM] [Info] Number of data points in the train set: 5996, number of used features: 36
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001129 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 944
[LightGBM] [Info] Number of data points in the train set: 5996, number of used features: 36
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of t

[I 2025-01-13 22:35:50,323] Trial 4 finished with value: 1.7741491332394521 and parameters: {'learning_rate': 0.007360599826337699, 'subsample': 0.9787187529090553, 'max_depth': 17, 'colsample_bytree': 0.8966786357997207}. Best is trial 1 with value: 1.3514855542737705.


[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001004 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 944
[LightGBM] [Info] Number of data points in the train set: 5996, number of used features: 36
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000715 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 944
[LightGBM] [Info] Number of data points in the train set: 5996, number of used features: 36
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001284 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 944
[LightGBM] [Info] Number of data points in the train set: 5996, number of used features: 36
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of t

[W 2025-01-13 22:35:54,756] Trial 5 failed with parameters: {'learning_rate': 0.0702597882838604, 'subsample': 0.7786299858860802, 'max_depth': 28, 'colsample_bytree': 0.8791957656389967} because of the following error: KeyboardInterrupt().
Traceback (most recent call last):
  File "C:\Users\kms10\anaconda3\envs\py3_12\Lib\site-packages\optuna\study\_optimize.py", line 197, in _run_trial
    value_or_values = func(trial)
                      ^^^^^^^^^^^
  File "C:\Users\kms10\AppData\Local\Temp\ipykernel_126336\470754971.py", line 179, in <lambda>
    lgb_study.optimize(lambda trial: objective(trial, 'lightgbm'), n_trials=20)
                                     ^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "C:\Users\kms10\AppData\Local\Temp\ipykernel_126336\470754971.py", line 166, in objective
    cv_results = lgb.cv(
                 ^^^^^^^
  File "C:\Users\kms10\anaconda3\envs\py3_12\Lib\site-packages\lightgbm\engine.py", line 826, in cv
    cvfolds.update(fobj=fobj)  # type: ignore[call-a