In [1]:
import pandas as pd
import numpy as np
import warnings

warnings.filterwarnings('ignore')

from sklearn.preprocessing import OneHotEncoder
from sklearn import preprocessing

from sklearn.model_selection import train_test_split
from sklearn.model_selection import TimeSeriesSplit

from sklearn.model_selection import cross_val_score

from optuna import Trial
import optuna
from optuna.samplers import TPESampler

In [2]:
sample_df = pd.read_csv('train_data_final.csv', sep=',')

# 시간 순서대로 데이터 반영하기 위해서 거래년월일 정렬 필요
sample_df = sample_df.sort_values('계약날짜인코딩')
sample_df = sample_df[sample_df.columns.difference(['동별 공원 갯수', '8학군', '동별지하철역수', '구별 교과학원 갯수'])]
sample_df = sample_df.fillna(0)
sample_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 237987 entries, 64104 to 208092
Data columns (total 16 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   거래금액         237987 non-null  int64  
 1   건축년도         237987 non-null  int64  
 2   계약날짜인코딩      237987 non-null  int64  
 3   구            237987 non-null  object 
 4   구별 공원 갯수     237987 non-null  float64
 5   구별 대학병원 병원수  237987 non-null  int64  
 6   구별 대형마트 수    237987 non-null  float64
 7   구별 백화점 수     237987 non-null  float64
 8   구별 전체 마트 수   237987 non-null  float64
 9   구별 종합병원 병원수  237987 non-null  int64  
 10  단지명별 신축거래비율  237987 non-null  float64
 11  단지명브랜드       237987 non-null  object 
 12  동            237987 non-null  object 
 13  전용면적         237987 non-null  float64
 14  층            237987 non-null  int64  
 15  평당단가         237987 non-null  float64
dtypes: float64(7), int64(6), object(3)
memory usage: 30.9+ MB


In [3]:
# 동 레이블 인코딩
from tqdm import tqdm
dong_price = sample_df.groupby('동')['거래금액'].agg('mean').sort_values(ascending=False)
# 가격기준으로 동을 정렬한 리스트를 바탕으로 dong에 대해 라벨 인코딩 진행 - 477 it.
for i, d in tqdm(enumerate(list(dong_price.index)), total=len(dong_price)):
    sample_df.loc[sample_df['동'] == d, '동'] = i
    # test_dat.loc[test_df['동'] == d, '동'] = i
sample_df.head()

100%|██████████| 330/330 [00:05<00:00, 58.24it/s]


Unnamed: 0,거래금액,건축년도,계약날짜인코딩,구,구별 공원 갯수,구별 대학병원 병원수,구별 대형마트 수,구별 백화점 수,구별 전체 마트 수,구별 종합병원 병원수,단지명별 신축거래비율,단지명브랜드,동,전용면적,층,평당단가
64104,39500,2000,0,송파구,144.0,2,4.0,2.0,6.0,16,0.110657,기타,130,57.88,5,2252.073255
78845,37700,1993,0,중랑구,11.0,3,6.0,0.0,6.0,12,0.12708,두산|두산위브,265,84.94,3,1464.680951
79128,59500,2015,0,중랑구,11.0,3,6.0,0.0,6.0,12,0.673469,하늘채,265,84.7054,11,2318.034033
15567,52000,1996,0,관악구,21.0,2,1.0,1.0,2.0,8,0.021814,삼성,202,84.84,8,2022.630835
1983,180000,2008,0,강남구,132.0,4,1.0,6.0,7.0,33,0.550668,힐스테이트,12,84.236,19,7051.616886


In [4]:
sample_df = sample_df.astype({'동':'int'})
sample_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 237987 entries, 64104 to 208092
Data columns (total 16 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   거래금액         237987 non-null  int64  
 1   건축년도         237987 non-null  int64  
 2   계약날짜인코딩      237987 non-null  int64  
 3   구            237987 non-null  object 
 4   구별 공원 갯수     237987 non-null  float64
 5   구별 대학병원 병원수  237987 non-null  int64  
 6   구별 대형마트 수    237987 non-null  float64
 7   구별 백화점 수     237987 non-null  float64
 8   구별 전체 마트 수   237987 non-null  float64
 9   구별 종합병원 병원수  237987 non-null  int64  
 10  단지명별 신축거래비율  237987 non-null  float64
 11  단지명브랜드       237987 non-null  object 
 12  동            237987 non-null  int32  
 13  전용면적         237987 non-null  float64
 14  층            237987 non-null  int64  
 15  평당단가         237987 non-null  float64
dtypes: float64(7), int32(1), int64(6), object(2)
memory usage: 30.0+ MB


### 1. 무지성 인코딩

In [5]:

'''
    - 카테고리: 데이터 전처리
    - 개요: 무지성 인코딩 => 전체 컬럼 데이터 타입 확인 후 통째로 범주형 인코딩 처리
    - param: df
    - return: encoded columns dataframe
'''
def one_hot(df):
    # 컬럼의 이름 리스트로 뽑아오기
    columns_name_list = list(df.columns)

    # 컬럼마다 for문 반복
    for col_name in columns_name_list:
        # 만약 컬럼의 값 타입이 범주형이면
        if df[col_name].dtype == object or df[col_name].dtype == str:

            # # 컬럼의 유니크한 값을 리스트로 만들어둠
            col_items = df[col_name].unique().tolist()
            # print(col_items)

            onehot = OneHotEncoder(sparse=False)
            # print(col_name)
            onehot_encoded_arr = onehot.fit_transform(df[col_name].values.reshape(-1, 1))
            onehot_encoded_label = onehot.categories_[0]
            onehot_encoded_df = pd.DataFrame(onehot_encoded_arr, columns=onehot_encoded_label)
            # print(onehot_encoded_df)
            df.drop(col_name, axis=1, inplace=True)
            df = pd.concat([df, onehot_encoded_df], axis=1)
        else:
            # 종속(타겟) 컬럼만 인코딩 제외
            if col_name == '평당단가':
                continue
            scaled_label = [col_name]
            x = df[col_name].values.reshape(-1, 1) #returns a numpy array
            min_max_scaler = preprocessing.MinMaxScaler()
            x_scaled = min_max_scaler.fit_transform(x)
            scaled_df = pd.DataFrame(x_scaled, columns=scaled_label)

            df.drop(col_name, axis=1, inplace=True)
            df = pd.concat([df, scaled_df], axis=1)

    return df
'''
    - 카테고리: 데이터 전처리
    - 개요: 무지성 인코딩 => 새로운 데이터 컬럼 추가 시 별도 인코딩 결과만 출력할 때 사용 ㄱㄱ
    - param: df, col_name
    - return: encoded columns dataframe or series
'''
def encode_column(df, col_name):

    # column data type이 object 또는 str 즉 범주형일 경우
    # onehot 인코딩 수행된 데이터프레임 return
    if df[col_name].dtype == object or df[col_name].dtype == str:
        onehot = OneHotEncoder(sparse=False)

        onehot_encoded_arr = onehot.fit_transform(df[col_name].values.reshape(-1, 1))
        onehot_encoded_label = onehot.categories_[0]
        onehot_encoded_df = pd.DataFrame(onehot_encoded_arr, columns=onehot_encoded_label)

        return onehot_encoded_df

    # column data type이 나머지 타입일 경우
    # 해당 컬럼의 series return
    else:
        return df[col_name]

In [6]:
# 1-1 test one_hot function
preprocessed_train_df = one_hot(sample_df)
preprocessed_train_df

Unnamed: 0,평당단가,거래금액,건축년도,계약날짜인코딩,강남구,강동구,강북구,강서구,관악구,광진구,...,플래티넘,하늘채,한라|한라비발디,한화,현대,호반,힐스테이트,동,전용면적,층
0,5364.511692,0.040858,0.433333,0.001826,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.039514,0.254714,0.044118
1,4828.060523,0.038701,0.433333,0.006393,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.039514,0.254714,0.014706
2,5364.511692,0.064821,0.433333,0.009132,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.039514,0.254714,0.000000
3,5756.533700,0.055835,0.433333,0.070320,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.039514,0.254714,0.014706
4,6452.346308,0.209202,0.433333,0.085845,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.039514,0.163322,0.058824
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
237982,1410.601638,0.076564,0.700000,0.730594,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.811550,0.350677,0.073529
237983,1269.794287,0.059430,0.700000,0.866667,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.811550,0.209365,0.014706
237984,1513.445904,0.103762,0.766667,0.841096,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.811550,0.313118,0.073529
237985,1805.330937,0.075006,0.766667,0.974429,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.811550,0.328588,0.058824


In [7]:
# 인코딩된 전체 데이터 셋에서 train, test split하여 분할 시행
# fit 함수에서 사용할 예정 train(모델 훈련 및 도출), test(최종 검증 데이터)

# 인코딩된 데이터에서 data, target 분류
data = preprocessed_train_df[preprocessed_train_df.columns.difference(['평당단가', '거래금액', '전용면적'])]
target = preprocessed_train_df['평당단가'] 

train_x, test_x, train_y, test_y = train_test_split(data, target, test_size=0.4, shuffle=False)

# train_x(data), train_y(target) => cross validation을 이용하여 교차 검증 및 점수 도출할 데이터
# test_x(data), test_y(target) => 향후 train data를 이용하여 생성된 최종 모형 평가하기 위한 테스트 데이터
# 무작위가 아닌 시간순 정렬 반영 위해 shuffle=False

len(train_x), len(train_y)

(142792, 142792)

### 2. 모델링

In [8]:
from sklearn.model_selection import KFold
'''
    - 카테고리: 모델링
    - 개요: 머신러닝 모델링 수행 및 점수 도출
        - 교차 검증 방법으로 TimeSeriesSplit 수행
    - param: 

        1. model_tuple => ex. ('LR', LinearRegression())
        2. X_train, y_train, X_test, y_test

    - return: rmse
'''
def execute_modeling(model_tuple, data, target):

    name = model_tuple[0]
    model = model_tuple[1]

    # TimeSeries Cross validation 
    # tscv = TimeSeriesSplit(n_splits=7)
    kfold = KFold(n_splits=5)

    # 각 모델에 대하여 교차 검증한 결과 점수 확인
    # r^2, rmse 같이 도출 ㄱㄱ
    # scoring parameter option 어캐 줘야 함?
    score_list = []
    score_type = ['r2', 'neg_mean_squared_error']

    for s_type in score_type:

        scores = cross_val_score(model, data, target, cv=kfold, scoring=s_type)

        if s_type == 'neg_mean_squared_error':
            scores = np.sqrt(-scores)

        print(f'{name} average {s_type}: {scores.mean()}, score_list {s_type}: {scores}')

        score_list.append(scores.mean())

    rmse_score = score_list[1]

    return rmse_score

In [10]:
from sklearn.linear_model import LinearRegression 
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import VotingRegressor
import xgboost as xgb
import lightgbm as lgb

# 2. test execute_modeling function
preprocessed_train_df = one_hot(sample_df)
preprocessed_train_df = preprocessed_train_df.drop_duplicates()

model_list = [
                ('RF', RandomForestRegressor(n_estimators=5)),
                ('model_xgb', xgb.XGBRegressor()),
                ('model_lgb', lgb.LGBMRegressor()),

                ('voting', VotingRegressor(estimators=[
                    ('RF', RandomForestRegressor(n_estimators=5)),
                    ('model_xgb', xgb.XGBRegressor()),
                    ('model_lgb', lgb.LGBMRegressor())
                ]))
            ]

for model_tuple in model_list:
    execute_modeling(model_tuple, train_x, train_y)
    pass

RF average r2: 0.7432132282405489, score_list r2: [0.80105098 0.76895105 0.7799826  0.61039648 0.75568504]
RF average neg_mean_squared_error: 785.0082694010437, score_list neg_mean_squared_error: [ 676.67113357  659.76179902 1082.86365226  609.90403498  895.84072717]
model_xgb average r2: 0.8151721753213408, score_list r2: [0.86919548 0.84059245 0.80969602 0.73685049 0.81952645]
model_xgb average neg_mean_squared_error: 644.3302373605477, score_list neg_mean_squared_error: [538.33239143 531.79882228 922.33513876 497.45872069 731.72611365]
model_lgb average r2: 0.7951417147889878, score_list r2: [0.84433004 0.82645016 0.81871912 0.69451397 0.79169529]
model_lgb average neg_mean_squared_error: 672.8948790435604, score_list neg_mean_squared_error: [587.27502136 554.88762484 900.2037445  535.98353299 786.12447152]
voting average r2: 0.8087667086746473, score_list r2: [0.85499324 0.83529057 0.82065795 0.72175489 0.81113689]
voting average neg_mean_squared_error: 645.630733537886, score_list

### 3. 파라미터 튜닝

In [7]:
# 3. test get_best_param function

# linear regression model object
def linear_object(trial:Trial, data, target):
    params = {
        'fit_intercept': trial.suggest_categorical('fit_intercept', [True, False])
    }
    
    test_model = LinearRegression(**params)
    test_model_score = execute_modeling(('LR', test_model), data, target)

    return test_model_score

# xgbr regression model object
def xgbr_object(trial:Trial, data, target):
    params = {
        "n_estimators" : trial.suggest_int('n_estimators', 500, 1000),
        'max_depth':trial.suggest_int('max_depth', 8, 16),
        'min_child_weight':trial.suggest_int('min_child_weight', 1, 100),
        'gamma':trial.suggest_int('gamma', 1, 3),
        'learning_rate': 0.01,
        'colsample_bytree':trial.suggest_discrete_uniform('colsample_bytree',0.5, 1, 0.1),
        'nthread' : -1,
        'tree_method': 'gpu_hist',
        'predictor': 'gpu_predictor',
        'lambda': trial.suggest_loguniform('lambda', 1e-3, 1.0),
        'alpha': trial.suggest_loguniform('alpha', 1e-3, 1.0),
        'subsample': trial.suggest_categorical('subsample', [0.6,0.7,0.8,1.0] ),
        'random_state': 42
    }
    
    test_model = xgb.XGBRegressor(**params)
    test_model_score = execute_modeling(('XGBR', test_model), data, target)

    return test_model_score

# light gbm regression model object
def lgb_object(trial:Trial, data, target):
    params = {
        "n_estimators" : trial.suggest_int('n_estimators', 500, 1000),
        'max_depth':trial.suggest_int('max_depth', 8, 16),
        'min_child_weight':trial.suggest_int('min_child_weight', 1, 100)
    }
    
    test_model = lgb.LGBMRegressor(**params)
    test_model_score = execute_modeling(('LGBM', test_model), data, target)

    return test_model_score

# randomforest regressor model object
def random_forest_object(trial:Trial, data, target):
    params = {
        "n_estimators" : trial.suggest_int('n_estimators', 500, 1000),
        'max_depth':trial.suggest_int('max_depth', 8, 16),
        ''
    }
    
    test_model = RandomForestRegressor(**params)
    test_model_score = execute_modeling(('rf', test_model), data, target)

    return test_model_score

# voting regressor model object
def voting_regressor(trial:Trial, data, target):
    pass

# 하이퍼 파라미터 결과 도출
# 위에서 분할한  X_train, y_train, X_test, y_test 파라미터 삽입
study = optuna.create_study(direction='minimize', sampler=TPESampler())
study.optimize(lambda trial: lgb_object(trial, train_x, train_y), n_trials=3)

best_score = study.best_value
best_param_dict = study.best_trial.params

print(best_score, best_param_dict)

[32m[I 2021-12-28 16:21:42,984][0m A new study created in memory with name: no-name-aa9728d7-5b52-4b7f-a314-9ab0fc9daa4f[0m
[32m[I 2021-12-28 16:22:21,732][0m Trial 0 finished with value: 25895.329099756465 and parameters: {'n_estimators': 790, 'max_depth': 14, 'min_child_weight': 43}. Best is trial 0 with value: 25895.329099756465.[0m


LGBM average rmse_scores: 25895.329099756465, rmse_scores: [19576.06321571 19376.58158036 55598.44322939 56276.78036999
 29394.86794348 12642.15597452 12019.52356509 25045.80398683
 21638.95337171 34075.15963913 14420.76480632 13809.82873753
 21766.47121174 26651.99570745 26136.5431571 ]


[32m[I 2021-12-28 16:23:02,077][0m Trial 1 finished with value: 26214.330928425254 and parameters: {'n_estimators': 750, 'max_depth': 9, 'min_child_weight': 64}. Best is trial 0 with value: 25895.329099756465.[0m


LGBM average rmse_scores: 26214.330928425254, rmse_scores: [19831.40428064 20039.21382514 54863.88110247 56009.72590423
 29900.24350841 13507.73394071 12359.79148079 25557.985213
 22598.22387112 34117.80646005 14865.52649845 13909.8459276
 21980.87217221 27063.63479704 26609.07494452]


[32m[I 2021-12-28 16:23:40,147][0m Trial 2 finished with value: 25739.27317819786 and parameters: {'n_estimators': 727, 'max_depth': 14, 'min_child_weight': 21}. Best is trial 2 with value: 25739.27317819786.[0m


LGBM average rmse_scores: 25739.27317819786, rmse_scores: [19982.4711522  19401.4453833  55313.02189458 56188.8881972
 29589.53745901 12440.62745936 11658.56881514 24586.18736491
 21635.68308552 33538.90012906 14624.16575483 13800.14089515
 21483.4692944  26478.39201484 25367.59877348]
25739.27317819786 {'n_estimators': 727, 'max_depth': 14, 'min_child_weight': 21}


In [8]:
# 하이퍼파라미터별 중요도를 확인할 수 있는 그래프
optuna.visualization.plot_param_importances(study)

In [9]:
# 하이퍼파라미터 최적화 과정을 확인
optuna.visualization.plot_optimization_history(study)

### 최종 모형 검증

In [12]:
# model = RandomForestRegressor(n_estimators=5)

model = VotingRegressor(estimators=[
                    ('RF', RandomForestRegressor(n_estimators=5)),
                    ('model_xgb', xgb.XGBRegressor()),
                    ('model_lgb', lgb.LGBMRegressor())
                ])

model.fit(train_x, train_y)
pred = model.predict(test_x)

# 실제 평당단가 금액
print(test_y.tolist())
# 예측 평당단가 금액
print(pred)

from sklearn.metrics import mean_squared_error, r2_score

mse = mean_squared_error(test_y, pred)
rmse = np.sqrt(mse)
r2 = r2_score(test_y, pred)

print('RMSE: ', rmse)
print('R^2: ', r2 )

# execute_modeling(('model_rf', model), test_x, test_y)

142792    1259.992338
142793    1165.665842
142794    1251.926497
142795    1485.828167
142796    1630.971993
             ...     
237982    1410.601638
237983    1269.794287
237984    1513.445904
237985    1805.330937
237986    1698.623086
Name: 평당단가, Length: 95195, dtype: float64
[1875.81492161 1789.12565536 1571.90492257 ... 2463.80742686 2457.74112793
 2457.74112793]
RMSE:  807.3275740649004
R^2:  0.7846836708162324
