In [1]:
import pandas as pd
import numpy as np
import warnings


warnings.filterwarnings('ignore')

from sklearn.preprocessing import OneHotEncoder
from sklearn import preprocessing

from sklearn.model_selection import train_test_split
from sklearn.model_selection import TimeSeriesSplit

from sklearn.model_selection import cross_val_score

from optuna import Trial
import optuna
from optuna.samplers import TPESampler

In [2]:
sample_df = pd.read_csv('train_data_onehot.csv', sep=',')

# 시간 순서대로 데이터 반영하기 위해서 거래년월일 정렬 필요
sample_df = sample_df.sort_values('계약날짜인코딩')
# sample_df = sample_df[sample_df.columns.difference(['동별 공원 갯수', '8학군', '동별지하철역수', '구별 교과학원 갯수'])]
sample_df = sample_df.fillna(0)
sample_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 237987 entries, 64104 to 208092
Data columns (total 18 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   시구동          237987 non-null  object 
 1   전용면적         237987 non-null  float64
 2   거래금액         237987 non-null  int64  
 3   층            237987 non-null  int64  
 4   건축년도         237987 non-null  int64  
 5   평당단가         237987 non-null  float64
 6   구            237987 non-null  object 
 7   단지명브랜드       237987 non-null  object 
 8   계약날짜인코딩      237987 non-null  int64  
 9   구별 공원 갯수     237987 non-null  float64
 10  8학군          237987 non-null  int64  
 11  구별 교과학원 갯수   237987 non-null  int64  
 12  구별 대형마트 수    237987 non-null  float64
 13  구별 백화점 수     237987 non-null  float64
 14  구별 전체 마트 수   237987 non-null  float64
 15  구별 대학병원 병원수  237987 non-null  int64  
 16  구별 종합병원 병원수  237987 non-null  int64  
 17  단지명별 신축거래비율  237987 non-null  float64
dtypes: float64(7), int64

### 1. 무지성 인코딩

In [3]:

'''
    - 카테고리: 데이터 전처리
    - 개요: 무지성 인코딩 => 전체 컬럼 데이터 타입 확인 후 통째로 범주형 인코딩 처리
    - param: df
    - return: encoded columns dataframe
'''
def one_hot(df):
    # 컬럼의 이름 리스트로 뽑아오기
    columns_name_list = list(df.columns)

    # 컬럼마다 for문 반복
    for col_name in columns_name_list:
        # 만약 컬럼의 값 타입이 범주형이면
        if df[col_name].dtype == object or df[col_name].dtype == str:

            # # 컬럼의 유니크한 값을 리스트로 만들어둠
            col_items = df[col_name].unique().tolist()
            # print(col_items)

            onehot = OneHotEncoder(sparse=False)
            # print(col_name)
            onehot_encoded_arr = onehot.fit_transform(df[col_name].values.reshape(-1, 1))
            onehot_encoded_label = onehot.categories_[0]
            onehot_encoded_df = pd.DataFrame(onehot_encoded_arr, columns=onehot_encoded_label)
            # print(onehot_encoded_df)
            df.drop(col_name, axis=1, inplace=True)
            df = pd.concat([df, onehot_encoded_df], axis=1)
        else:
            # 종속(타겟) 컬럼만 인코딩 제외
            if col_name == '평당단가':
                continue
            scaled_label = [col_name]
            x = df[col_name].values.reshape(-1, 1) #returns a numpy array
            min_max_scaler = preprocessing.MinMaxScaler()
            x_scaled = min_max_scaler.fit_transform(x)
            scaled_df = pd.DataFrame(x_scaled, columns=scaled_label)

            df.drop(col_name, axis=1, inplace=True)
            df = pd.concat([df, scaled_df], axis=1)

    return df
'''
    - 카테고리: 데이터 전처리
    - 개요: 무지성 인코딩 => 새로운 데이터 컬럼 추가 시 별도 인코딩 결과만 출력할 때 사용 ㄱㄱ
    - param: df, col_name
    - return: encoded columns dataframe or series
'''
def encode_column(df, col_name):

    # column data type이 object 또는 str 즉 범주형일 경우
    # onehot 인코딩 수행된 데이터프레임 return
    if df[col_name].dtype == object or df[col_name].dtype == str:
        onehot = OneHotEncoder(sparse=False)

        onehot_encoded_arr = onehot.fit_transform(df[col_name].values.reshape(-1, 1))
        onehot_encoded_label = onehot.categories_[0]
        onehot_encoded_df = pd.DataFrame(onehot_encoded_arr, columns=onehot_encoded_label)

        return onehot_encoded_df

    # column data type이 나머지 타입일 경우
    # 해당 컬럼의 series return
    else:
        return df[col_name]

In [4]:
# 1-1 test one_hot function
preprocessed_train_df = one_hot(sample_df)
preprocessed_train_df

Unnamed: 0,평당단가,서울특별시 강남구 개포동,서울특별시 강남구 논현동,서울특별시 강남구 대치동,서울특별시 강남구 도곡동,서울특별시 강남구 삼성동,서울특별시 강남구 세곡동,서울특별시 강남구 수서동,서울특별시 강남구 신사동,서울특별시 강남구 압구정동,...,계약날짜인코딩,구별 공원 갯수,8학군,구별 교과학원 갯수,구별 대형마트 수,구별 백화점 수,구별 전체 마트 수,구별 대학병원 병원수,구별 종합병원 병원수,단지명별 신축거래비율
0,5364.511692,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.001826,0.916667,1.0,1.000000,0.166667,1.0,0.875,0.571429,1.000000,0.110657
1,4828.060523,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.006393,0.916667,1.0,1.000000,0.166667,1.0,0.875,0.571429,1.000000,0.110657
2,5364.511692,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.009132,0.916667,1.0,1.000000,0.166667,1.0,0.875,0.571429,1.000000,0.110657
3,5756.533700,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.070320,0.916667,1.0,1.000000,0.166667,1.0,0.875,0.571429,1.000000,0.110657
4,6452.346308,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,...,0.085845,0.916667,1.0,1.000000,0.166667,1.0,0.875,0.571429,1.000000,0.110657
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
237982,1410.601638,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.730594,0.076389,0.0,0.101749,1.000000,0.0,0.750,0.428571,0.322581,0.110657
237983,1269.794287,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.866667,0.076389,0.0,0.101749,1.000000,0.0,0.750,0.428571,0.322581,0.110657
237984,1513.445904,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.841096,0.076389,0.0,0.101749,1.000000,0.0,0.750,0.428571,0.322581,0.016495
237985,1805.330937,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.974429,0.076389,0.0,0.101749,1.000000,0.0,0.750,0.428571,0.322581,0.016495


In [9]:
# 인코딩된 전체 데이터 셋에서 train, test split하여 분할 시행
# fit 함수에서 사용할 예정 train(모델 훈련 및 도출), test(최종 검증 데이터)

# 인코딩된 데이터에서 data, target 분류
data = preprocessed_train_df[preprocessed_train_df.columns.difference(['평당단가', '거래금액', '전용면적'])]
target = preprocessed_train_df['평당단가'] 

train_x, test_x, train_y, test_y = train_test_split(data, target, test_size=0.4, shuffle=False)

# train_x(data), train_y(target) => cross validation을 이용하여 교차 검증 및 점수 도출할 데이터
# test_x(data), test_y(target) => 향후 train data를 이용하여 생성된 최종 모형 평가하기 위한 테스트 데이터
# 무작위가 아닌 시간순 정렬 반영 위해 shuffle=False

len(train_x), len(train_y)

(142792, 142792)

### 2. 모델링

In [10]:
from sklearn.model_selection import KFold
'''
    - 카테고리: 모델링
    - 개요: 머신러닝 모델링 수행 및 점수 도출
        - 교차 검증 방법으로 TimeSeriesSplit 수행
    - param: 

        1. model_tuple => ex. ('LR', LinearRegression())
        2. X_train, y_train, X_test, y_test

    - return: rmse
'''
def execute_modeling(model_tuple, data, target):

    name = model_tuple[0]
    model = model_tuple[1]

    # TimeSeries Cross validation 
    # tscv = TimeSeriesSplit(n_splits=7)
    kfold = KFold(n_splits=5)

    # 각 모델에 대하여 교차 검증한 결과 점수 확인
    # r^2, rmse 같이 도출 ㄱㄱ
    score_list = []
    score_type = ['r2', 'neg_mean_squared_error']

    for s_type in score_type:

        scores = cross_val_score(model, data, target, cv=kfold, scoring=s_type)

        if s_type == 'neg_mean_squared_error':
            scores = np.sqrt(-scores)

        print(f'{name} average {s_type}: {scores.mean()}, score_list {s_type}: {scores}')

        score_list.append(scores.mean())

    r_square = score_list[0]

    return r_square

In [26]:
from sklearn.linear_model import Ridge, Lasso, LinearRegression, ElasticNet
import tqdm
from sklearn.ensemble import RandomForestRegressor, VotingRegressor

import xgboost as xgb
import lightgbm as lgb

# 2. test execute_modeling function
preprocessed_train_df = one_hot(sample_df)
preprocessed_train_df = preprocessed_train_df.drop_duplicates()

model_list = [
#                 ('RF', RandomForestRegressor(n_estimators=5)),
#                 ('XGB', xgb.XGBRegressor()),
#                 ('LGBM', lgb.LGBMRegressor()),

#                 ('voting', VotingRegressor(estimators=[
#                     ('RF', RandomForestRegressor(n_estimators=5)),
#                     ('XGB', xgb.XGBRegressor()),
#                     ('LGBM', lgb.LGBMRegressor())
#                 ]))
                ('voting', VotingRegressor(estimators=[
                    ('XGB', xgb.XGBRegressor()),
                    ('LGBM', lgb.LGBMRegressor())
                ]))
            ]

for model_tuple in model_list:
    execute_modeling(model_tuple, train_x, train_y)
    pass

voting average r2: 0.7165227293085995, score_list r2: [0.83815711 0.78086846 0.69478342 0.58965144 0.67915322]
voting average neg_mean_squared_error: 797.4463838734443, score_list neg_mean_squared_error: [ 598.8057214   623.51280198 1168.07000529  621.20111403  975.64227667]


### 3. 파라미터 튜닝

In [15]:
# 3. test get_best_param function

# linear regression model object
def linear_object(trial:Trial, data, target):
    params = {
        'fit_intercept': trial.suggest_categorical('fit_intercept', [True, False])
    }
    
    test_model = LinearRegression(**params)
    test_model_score = execute_modeling(('LR', test_model), data, target)

    return test_model_score

# xgbr regression model object
def xgb_object(trial:Trial, data, target):
    params = {
        "n_estimators" : trial.suggest_int('n_estimators', 50, 100),
        'max_depth':trial.suggest_int('max_depth', 5, 10),
        'min_child_weight':trial.suggest_int('min_child_weight', 1, 10)
    }

    test_model = xgb.XGBRegressor(**params)
    test_model_score = execute_modeling(('XGBR', test_model), data, target)

    return test_model_score
    

# light gbm regression model object
def lgb_object(trial:Trial, data, target):
    params = {
        "n_estimators" : trial.suggest_int('n_estimators', 50, 100),
        'max_depth':trial.suggest_int('max_depth', 5, 10),
        'min_child_weight':trial.suggest_int('min_child_weight', 1, 10)
    }
    
    test_model = lgb.LGBMRegressor(**params)
    test_model_score = execute_modeling(('LGBM', test_model), data, target)

    return test_model_score

# randomforest regressor model object
def random_forest_object(trial:Trial, data, target):
    params = {
        "n_estimators" : trial.suggest_int('n_estimators', 50, 100),
        'max_depth':trial.suggest_int('max_depth', 5, 10),
        'min_samples_split':trial.suggest_int('min_samples_split', 2, 10),
        'min_samples_leaf':trial.suggest_int('min_samples_leaf', 1, 10)
    }
    
    test_model = RandomForestRegressor(**params)
    test_model_score = execute_modeling(('rf', test_model), data, target)

    return test_model_score

# voting regressor model object
def voting_regressor(trial:Trial, data, target):
    pass

# 하이퍼 파라미터 결과 도출
# 위에서 분할한  X_train, y_train, X_test, y_test 파라미터 삽입
study = optuna.create_study(direction='minimize', sampler=TPESampler())
study.optimize(lambda trial: xgb_object(trial, train_x, train_y), n_trials=10)

best_score = study.best_value
best_param_dict = study.best_trial.params

print(best_score, best_param_dict)

[32m[I 2021-12-30 17:11:22,514][0m A new study created in memory with name: no-name-56ee262b-75ae-424e-96c5-aa8bbbe0cc5b[0m


XGBR average r2: 0.7095428377769324, score_list r2: [0.82273435 0.76481911 0.70636837 0.58197308 0.67181927]


[32m[I 2021-12-30 17:16:18,458][0m Trial 0 finished with value: 0.7095428377769324 and parameters: {'n_estimators': 96, 'max_depth': 5, 'min_child_weight': 10}. Best is trial 0 with value: 0.7095428377769324.[0m


XGBR average neg_mean_squared_error: 806.4068669299389, score_list neg_mean_squared_error: [ 626.68806941  645.94261656 1145.68764053  626.98606645  986.72994171]
XGBR average r2: 0.707626165868014, score_list r2: [0.81444688 0.76297092 0.70080597 0.57652542 0.68338164]


[32m[I 2021-12-30 17:21:28,475][0m Trial 1 finished with value: 0.707626165868014 and parameters: {'n_estimators': 83, 'max_depth': 5, 'min_child_weight': 1}. Best is trial 1 with value: 0.707626165868014.[0m


XGBR average neg_mean_squared_error: 809.2768931884116, score_list neg_mean_squared_error: [ 641.17009122  648.47575178 1156.48839235  631.05823418  969.19199641]
XGBR average r2: 0.6963666569441289, score_list r2: [0.84528169 0.78836072 0.63742656 0.52681846 0.68394584]


[32m[I 2021-12-30 17:31:29,609][0m Trial 2 finished with value: 0.6963666569441289 and parameters: {'n_estimators': 95, 'max_depth': 9, 'min_child_weight': 6}. Best is trial 2 with value: 0.6963666569441289.[0m


XGBR average neg_mean_squared_error: 821.3468685594971, score_list neg_mean_squared_error: [ 585.47719046  612.76092498 1273.10084151  667.06731585  968.32806999]
XGBR average r2: 0.7102343048542894, score_list r2: [0.83671853 0.78540072 0.71918847 0.50144347 0.70842033]


[32m[I 2021-12-30 17:40:02,876][0m Trial 3 finished with value: 0.7102343048542894 and parameters: {'n_estimators': 88, 'max_depth': 8, 'min_child_weight': 9}. Best is trial 2 with value: 0.6963666569441289.[0m


XGBR average neg_mean_squared_error: 790.7380496010428, score_list neg_mean_squared_error: [ 601.46115466  617.03110832 1120.39788323  684.71993086  930.08017093]
XGBR average r2: 0.695441352778244, score_list r2: [0.82762256 0.78001699 0.6456818  0.54608514 0.67780027]


[32m[I 2021-12-30 17:46:14,906][0m Trial 4 finished with value: 0.695441352778244 and parameters: {'n_estimators': 72, 'max_depth': 7, 'min_child_weight': 4}. Best is trial 4 with value: 0.695441352778244.[0m


XGBR average neg_mean_squared_error: 826.4553757322246, score_list neg_mean_squared_error: [ 617.98701321  624.72299502 1258.52411873  653.345596    977.6971557 ]
XGBR average r2: 0.7078899385400866, score_list r2: [0.80759912 0.76868431 0.71239879 0.58005375 0.67071373]


[32m[I 2021-12-30 17:50:11,593][0m Trial 5 finished with value: 0.7078899385400866 and parameters: {'n_estimators': 65, 'max_depth': 5, 'min_child_weight': 2}. Best is trial 4 with value: 0.695441352778244.[0m


XGBR average neg_mean_squared_error: 808.8365639230277, score_list neg_mean_squared_error: [ 652.89397029  640.61259673 1133.86192015  628.42379677  988.39053568]
XGBR average r2: 0.708397706233662, score_list r2: [0.83659456 0.78234833 0.70180075 0.52569917 0.69554573]


[32m[I 2021-12-30 17:57:39,405][0m Trial 6 finished with value: 0.708397706233662 and parameters: {'n_estimators': 77, 'max_depth': 8, 'min_child_weight': 5}. Best is trial 4 with value: 0.695441352778244.[0m


XGBR average neg_mean_squared_error: 799.1810724261542, score_list neg_mean_squared_error: [ 601.68944189  621.40383657 1154.564209    667.85581139  950.39206329]
XGBR average r2: 0.7089557256327463, score_list r2: [0.82430523 0.77851451 0.71329748 0.53417971 0.69448169]


[32m[I 2021-12-30 18:03:30,796][0m Trial 7 finished with value: 0.7089557256327463 and parameters: {'n_estimators': 78, 'max_depth': 6, 'min_child_weight': 4}. Best is trial 4 with value: 0.695441352778244.[0m


XGBR average neg_mean_squared_error: 799.3513002450329, score_list neg_mean_squared_error: [ 623.90511787  626.85278696 1132.08899225  661.8582196   952.05138455]
XGBR average r2: 0.7043885875996782, score_list r2: [0.83666153 0.78116973 0.65942869 0.55400029 0.69068269]


[32m[I 2021-12-30 18:09:54,556][0m Trial 8 finished with value: 0.7043885875996782 and parameters: {'n_estimators': 91, 'max_depth': 7, 'min_child_weight': 10}. Best is trial 4 with value: 0.695441352778244.[0m


XGBR average neg_mean_squared_error: 812.8190041955274, score_list neg_mean_squared_error: [ 601.56611606  623.08403047 1233.86839704  647.6241788   957.95229861]
XGBR average r2: 0.6965164852503326, score_list r2: [0.82807894 0.78190308 0.64880754 0.54569855 0.67809432]


[32m[I 2021-12-30 18:15:23,712][0m Trial 9 finished with value: 0.6965164852503326 and parameters: {'n_estimators': 79, 'max_depth': 7, 'min_child_weight': 4}. Best is trial 4 with value: 0.695441352778244.[0m


XGBR average neg_mean_squared_error: 824.60855009837, score_list neg_mean_squared_error: [ 617.16838413  622.03911522 1252.96057185  653.6237554   977.25092389]
0.695441352778244 {'n_estimators': 72, 'max_depth': 7, 'min_child_weight': 4}


In [18]:
# 하이퍼 파라미터 결과 도출
# 위에서 분할한  X_train, y_train, X_test, y_test 파라미터 삽입
study = optuna.create_study(direction='minimize', sampler=TPESampler())
study.optimize(lambda trial: lgb_object(trial, train_x, train_y), n_trials=10)

best_score = study.best_value
best_param_dict = study.best_trial.params

print(best_score, best_param_dict)

[32m[I 2021-12-30 18:19:09,138][0m A new study created in memory with name: no-name-788e9fd5-89ac-4f59-bb40-6c685ceff80d[0m


LGBM average r2: 0.6882148312702514, score_list r2: [0.81225661 0.7607092  0.64021953 0.55546157 0.67242724]


[32m[I 2021-12-30 18:19:28,268][0m Trial 0 finished with value: 0.6882148312702514 and parameters: {'n_estimators': 98, 'max_depth': 8, 'min_child_weight': 7}. Best is trial 0 with value: 0.6882148312702514.[0m


LGBM average neg_mean_squared_error: 839.4142501007078, score_list neg_mean_squared_error: [ 644.94317976  651.56226896 1268.18789899  646.56236298  985.8155398 ]
LGBM average r2: 0.6831569915556878, score_list r2: [0.81442416 0.76508661 0.64013389 0.52004824 0.67609205]


[32m[I 2021-12-30 18:19:47,517][0m Trial 1 finished with value: 0.6831569915556878 and parameters: {'n_estimators': 97, 'max_depth': 7, 'min_child_weight': 2}. Best is trial 1 with value: 0.6831569915556878.[0m


LGBM average neg_mean_squared_error: 841.4462671797177, score_list neg_mean_squared_error: [ 641.20934356  645.57515504 1268.33882859  671.82252814  980.28548058]
LGBM average r2: 0.6884369533630366, score_list r2: [0.81706005 0.76517885 0.64746762 0.54435336 0.66812489]


[32m[I 2021-12-30 18:20:06,530][0m Trial 2 finished with value: 0.6884369533630366 and parameters: {'n_estimators': 97, 'max_depth': 10, 'min_child_weight': 4}. Best is trial 1 with value: 0.6831569915556878.[0m


LGBM average neg_mean_squared_error: 836.8590404851636, score_list neg_mean_squared_error: [ 636.63924166  645.44840883 1255.34853279  654.59073506  992.2682841 ]
LGBM average r2: 0.653265246332789, score_list r2: [0.7815434  0.71852796 0.66710581 0.46465676 0.63449231]


[32m[I 2021-12-30 18:20:23,011][0m Trial 3 finished with value: 0.653265246332789 and parameters: {'n_estimators': 55, 'max_depth': 6, 'min_child_weight': 1}. Best is trial 3 with value: 0.653265246332789.[0m


LGBM average neg_mean_squared_error: 874.6215509162851, score_list neg_mean_squared_error: [ 695.69953689  706.66007769 1219.88227696  709.53190572 1041.33395732]
LGBM average r2: 0.6545077228106215, score_list r2: [0.79177795 0.72811086 0.60766401 0.4941613  0.65082449]


[32m[I 2021-12-30 18:20:39,265][0m Trial 4 finished with value: 0.6545077228106215 and parameters: {'n_estimators': 51, 'max_depth': 9, 'min_child_weight': 3}. Best is trial 3 with value: 0.653265246332789.[0m


LGBM average neg_mean_squared_error: 881.1124598479194, score_list neg_mean_squared_error: [ 679.20752478  694.52655947 1324.32289097  689.70248567 1017.80283835]
LGBM average r2: 0.6858235432132634, score_list r2: [0.81614864 0.75325199 0.63278073 0.5479121  0.67902426]


[32m[I 2021-12-30 18:20:57,516][0m Trial 5 finished with value: 0.6858235432132634 and parameters: {'n_estimators': 87, 'max_depth': 9, 'min_child_weight': 10}. Best is trial 3 with value: 0.653265246332789.[0m


LGBM average neg_mean_squared_error: 841.7918401400686, score_list neg_mean_squared_error: [ 638.22314925  661.63695944 1281.23132006  652.02944736  975.83832458]
LGBM average r2: 0.656741933594047, score_list r2: [0.78250141 0.72297109 0.66828333 0.47261747 0.63733636]


[32m[I 2021-12-30 18:21:13,890][0m Trial 6 finished with value: 0.656741933594047 and parameters: {'n_estimators': 58, 'max_depth': 6, 'min_child_weight': 5}. Best is trial 3 with value: 0.653265246332789.[0m


LGBM average neg_mean_squared_error: 870.8934159768618, score_list neg_mean_squared_error: [ 694.17240895  701.06046359 1217.72286188  704.23667215 1037.27467332]
LGBM average r2: 0.6562182177462358, score_list r2: [0.78155523 0.73371556 0.67217641 0.47628911 0.61735478]


[32m[I 2021-12-30 18:21:30,495][0m Trial 7 finished with value: 0.6562182177462358 and parameters: {'n_estimators': 68, 'max_depth': 5, 'min_child_weight': 9}. Best is trial 3 with value: 0.653265246332789.[0m


LGBM average neg_mean_squared_error: 872.1630656124664, score_list neg_mean_squared_error: [ 695.68069579  687.33082387 1210.55607914  701.78094377 1065.46678548]
LGBM average r2: 0.6437010795174194, score_list r2: [0.77640038 0.71950139 0.66391317 0.45355816 0.6051323 ]


[32m[I 2021-12-30 18:21:46,429][0m Trial 8 finished with value: 0.6437010795174194 and parameters: {'n_estimators': 55, 'max_depth': 5, 'min_child_weight': 1}. Best is trial 8 with value: 0.6437010795174194.[0m


LGBM average neg_mean_squared_error: 886.8389905097487, score_list neg_mean_squared_error: [ 703.84116436  705.43707782 1225.71798298  716.84909289 1082.3496345 ]
LGBM average r2: 0.6713346560285504, score_list r2: [0.79030373 0.74775873 0.68278182 0.50588385 0.62994514]


[32m[I 2021-12-30 18:22:03,976][0m Trial 9 finished with value: 0.6713346560285504 and parameters: {'n_estimators': 94, 'max_depth': 5, 'min_child_weight': 2}. Best is trial 8 with value: 0.6437010795174194.[0m


LGBM average neg_mean_squared_error: 854.1676068605254, score_list neg_mean_squared_error: [ 681.6076925   668.96129887 1190.81376554  681.66389473 1047.79138267]
0.6437010795174194 {'n_estimators': 55, 'max_depth': 5, 'min_child_weight': 1}


In [19]:
# 하이퍼 파라미터 결과 도출
# 위에서 분할한  X_train, y_train, X_test, y_test 파라미터 삽입
study = optuna.create_study(direction='minimize', sampler=TPESampler())
study.optimize(lambda trial: random_forest_object(trial, train_x, train_y), n_trials=10)

best_score = study.best_value
best_param_dict = study.best_trial.params

print(best_score, best_param_dict)

[32m[I 2021-12-30 18:22:03,994][0m A new study created in memory with name: no-name-c98f314b-2978-4edd-a3b7-58dd5b0f1bc2[0m


rf average r2: 0.443796791473914, score_list r2: [0.63638586 0.53672585 0.52483931 0.19292764 0.3281053 ]


[32m[I 2021-12-30 18:33:08,181][0m Trial 0 finished with value: 0.443796791473914 and parameters: {'n_estimators': 80, 'max_depth': 5, 'min_samples_split': 7, 'min_samples_leaf': 5}. Best is trial 0 with value: 0.443796791473914.[0m


rf average neg_mean_squared_error: 1110.6366710726518, score_list neg_mean_squared_error: [ 895.70592316  906.87148005 1466.64009331  871.83317727 1412.13268158]
rf average r2: 0.5582271398526268, score_list r2: [0.7296468  0.63065126 0.60092463 0.29508495 0.53482806]


[32m[I 2021-12-30 18:48:08,727][0m Trial 1 finished with value: 0.5582271398526268 and parameters: {'n_estimators': 84, 'max_depth': 7, 'min_samples_split': 10, 'min_samples_leaf': 1}. Best is trial 0 with value: 0.443796791473914.[0m


rf average neg_mean_squared_error: 972.4209792402191, score_list neg_mean_squared_error: [ 769.92756386  813.0072086  1296.54501155  807.54806646 1175.07704573]
rf average r2: 0.4422501283345424, score_list r2: [0.63591794 0.54233966 0.50675793 0.19260091 0.33363421]


[32m[I 2021-12-30 18:58:26,560][0m Trial 2 finished with value: 0.4422501283345424 and parameters: {'n_estimators': 79, 'max_depth': 5, 'min_samples_split': 9, 'min_samples_leaf': 6}. Best is trial 2 with value: 0.4422501283345424.[0m


rf average neg_mean_squared_error: 1107.3188388676358, score_list neg_mean_squared_error: [ 898.51402532  901.94693403 1460.9622924   864.93352984 1410.23741275]
rf average r2: 0.5112906290769983, score_list r2: [0.6784557  0.59682013 0.5745615  0.25484071 0.45177511]


[32m[I 2021-12-30 19:08:32,664][0m Trial 3 finished with value: 0.5112906290769983 and parameters: {'n_estimators': 67, 'max_depth': 6, 'min_samples_split': 7, 'min_samples_leaf': 9}. Best is trial 2 with value: 0.4422501283345424.[0m


rf average neg_mean_squared_error: 1031.3418855702425, score_list neg_mean_squared_error: [ 842.73616588  843.8912215  1358.84051096  834.03733977 1277.20418974]
rf average r2: 0.5661436014993184, score_list r2: [0.7301972  0.63662308 0.62284575 0.3055251  0.53552688]


[32m[I 2021-12-30 19:19:27,304][0m Trial 4 finished with value: 0.5661436014993184 and parameters: {'n_estimators': 63, 'max_depth': 7, 'min_samples_split': 3, 'min_samples_leaf': 10}. Best is trial 2 with value: 0.4422501283345424.[0m


rf average neg_mean_squared_error: 965.367490294552, score_list neg_mean_squared_error: [ 770.24520761  807.60617427 1259.42775148  815.26855744 1174.28976067]
rf average r2: 0.6475823163961918, score_list r2: [0.79365415 0.68908048 0.68599875 0.44767349 0.62150471]


[32m[I 2021-12-30 19:30:40,012][0m Trial 5 finished with value: 0.6475823163961918 and parameters: {'n_estimators': 53, 'max_depth': 9, 'min_samples_split': 9, 'min_samples_leaf': 3}. Best is trial 2 with value: 0.4422501283345424.[0m


rf average neg_mean_squared_error: 880.4496049215617, score_list neg_mean_squared_error: [ 674.88488519  738.63300787 1202.59504879  718.80589581 1067.32918694]
rf average r2: 0.568085248715521, score_list r2: [0.73144141 0.62637731 0.63371965 0.31230186 0.53658601]


[32m[I 2021-12-30 19:39:25,963][0m Trial 6 finished with value: 0.568085248715521 and parameters: {'n_estimators': 51, 'max_depth': 7, 'min_samples_split': 5, 'min_samples_leaf': 9}. Best is trial 2 with value: 0.4422501283345424.[0m


rf average neg_mean_squared_error: 967.0756594673279, score_list neg_mean_squared_error: [ 775.12720636  802.2871592  1279.33831567  804.84267605 1173.78294005]
rf average r2: 0.5681309437147553, score_list r2: [0.73117987 0.62723585 0.6422877  0.30686206 0.53308924]


[32m[I 2021-12-30 19:51:07,474][0m Trial 7 finished with value: 0.5681309437147553 and parameters: {'n_estimators': 68, 'max_depth': 7, 'min_samples_split': 4, 'min_samples_leaf': 9}. Best is trial 2 with value: 0.4422501283345424.[0m


rf average neg_mean_squared_error: 971.368132962684, score_list neg_mean_squared_error: [ 769.26714049  810.81988069 1291.55814196  809.59883533 1175.59666635]
rf average r2: 0.5621916690948343, score_list r2: [0.73270203 0.62873196 0.59181276 0.32153446 0.53617714]


[32m[I 2021-12-30 20:00:45,201][0m Trial 8 finished with value: 0.5621916690948343 and parameters: {'n_estimators': 56, 'max_depth': 7, 'min_samples_split': 2, 'min_samples_leaf': 5}. Best is trial 2 with value: 0.4422501283345424.[0m


rf average neg_mean_squared_error: 972.2636704522984, score_list neg_mean_squared_error: [ 770.69425625  806.63657479 1301.46996918  812.18156699 1170.33598505]
rf average r2: 0.44089997407454373, score_list r2: [0.63704903 0.54034601 0.50824397 0.19116214 0.32769872]


[32m[I 2021-12-30 20:09:37,072][0m Trial 9 finished with value: 0.44089997407454373 and parameters: {'n_estimators': 69, 'max_depth': 5, 'min_samples_split': 8, 'min_samples_leaf': 2}. Best is trial 9 with value: 0.44089997407454373.[0m


rf average neg_mean_squared_error: 1110.9559861784578, score_list neg_mean_squared_error: [ 901.3414605   902.64591852 1463.63417     870.73088312 1416.42749875]
0.44089997407454373 {'n_estimators': 69, 'max_depth': 5, 'min_samples_split': 8, 'min_samples_leaf': 2}


In [27]:
# 하이퍼파라미터별 중요도를 확인할 수 있는 그래프
#순서-xgb, lgbm, random-forest, voting
optuna.visualization.plot_param_importances(study)

[33m[W 2021-12-30 21:40:11,557][0m Study instance does not contain completed trials.[0m


In [28]:
# 하이퍼파라미터 최적화 과정을 확인
optuna.visualization.plot_optimization_history(study)

[33m[W 2021-12-30 21:40:11,589][0m Study instance does not contain trials.[0m


### 최종 모형 검증

In [29]:
# model = xgb.XGBRegressor()

model = VotingRegressor(estimators=[
                    ('model_xgb', xgb.XGB
                     
                     
                     
                     Regressor()),
                    ('model_lgb', lgb.LGBMRegressor())
                ])

# model = xgb.XGBRegressor()

# model = lgb.LGBMRegressor()

model.fit(train_x, train_y)
pred = model.predict(test_x)

# 실제 평당단가 금액
print(test_y)
# 예측 평당단가 금액
print(pred)

from sklearn.metrics import mean_squared_error, r2_score

mse = mean_squared_error(test_y, pred)
rmse = np.sqrt(mse)
r2 = r2_score(test_y, pred)

print('RMSE: ', rmse)
print('R^2: ', r2 )

# execute_modeling(('model_rf', model), test_x, test_y)

# pickle 생성
import pickle
# pickle.dump(model, open('model.pkl', 'wb'))

142792    1259.992338
142793    1165.665842
142794    1251.926497
142795    1485.828167
142796    1630.971993
             ...     
237982    1410.601638
237983    1269.794287
237984    1513.445904
237985    1805.330937
237986    1698.623086
Name: 평당단가, Length: 95195, dtype: float64
[2586.14359096 2671.51274051 2039.19007528 ... 1743.0770064  1731.09912281
 1731.09912281]
RMSE:  969.5677471255754
R^2:  0.6894484397254004


In [30]:
np.sort(model.feature_importances_)

AttributeError: 'VotingRegressor' object has no attribute 'feature_importances_'