In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [3]:
os.chdir("/content/drive/MyDrive/암빅데이터_경진대회/최종코드")

In [4]:
df_x_tr = pd.read_csv("./data/preprocessed/data_x_tr.csv")
df_y_tr = pd.read_csv("./data/preprocessed/data_y_tr.csv")
df_x_ts = pd.read_csv("./data/preprocessed/data_x_ts.csv")
df_y_ts = pd.read_csv("./data/preprocessed/data_y_ts.csv")

In [5]:
df_x_tr.drop(['Unnamed: 0'],axis=1, inplace=True)
df_y_tr.drop(['Unnamed: 0'],axis=1, inplace=True)
df_x_ts.drop(['Unnamed: 0'],axis=1, inplace=True)
df_y_ts.drop(['Unnamed: 0'],axis=1, inplace=True)

In [6]:
x_train, x_val, y_train, y_val = train_test_split(df_x_tr, df_y_tr, test_size=0.1, shuffle=True, stratify=df_y_tr, random_state=23)

In [7]:
x_train.reset_index(drop=True,inplace=True)
x_val.reset_index(drop=True,inplace=True)
y_train.reset_index(drop=True,inplace=True)
y_val.reset_index(drop=True,inplace=True)

In [13]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from xgboost import XGBRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV

# DecisionTree

In [9]:
dt_reg=DecisionTreeRegressor(random_state=0)

In [10]:
dt_reg.get_params().keys()

dict_keys(['ccp_alpha', 'criterion', 'max_depth', 'max_features', 'max_leaf_nodes', 'min_impurity_decrease', 'min_impurity_split', 'min_samples_leaf', 'min_samples_split', 'min_weight_fraction_leaf', 'presort', 'random_state', 'splitter'])

In [11]:
params = { 
        'max_depth' : [8,11],     
        'min_samples_leaf' : [55,57],
        'min_samples_split' : [2,4],
        'max_features' : [6,7]
          }

grid_cv = GridSearchCV(dt_reg, param_grid = params, cv = 10, n_jobs = -1,scoring = 'neg_mean_absolute_error', return_train_score=True)
grid_cv.fit(x_train, y_train)
mae= -1*grid_cv.best_score_

print('최적 mae: ',mae)
print('최적 하이퍼 파라미터: ', grid_cv.best_params_)

best_dt_cv = grid_cv.cv_results_
dt_result = pd.DataFrame(best_dt_cv)
dt_result.drop(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time', 'params'],axis=1,inplace=True)
dt_df = pd.concat([dt_result.iloc[:,0:4],dt_result.iloc[:,14:17],dt_result.iloc[:,27:29]],axis=1)
dt_df.iloc[:,4] = -1*dt_df.iloc[:,4] 
dt_df.iloc[:,7] = -1*dt_df.iloc[:,7] 
dt_df

최적 mae:  1.1683325822744188
최적 하이퍼 파라미터:  {'max_depth': 8, 'max_features': 6, 'min_samples_leaf': 55, 'min_samples_split': 2}


Unnamed: 0,param_max_depth,param_max_features,param_min_samples_leaf,param_min_samples_split,mean_test_score,std_test_score,rank_test_score,mean_train_score,std_train_score
0,8,6,55,2,1.168333,0.028581,1,1.16232,0.005806
1,8,6,55,4,1.168333,0.028581,1,1.16232,0.005806
2,8,6,57,2,1.16942,0.028374,9,1.162394,0.005643
3,8,6,57,4,1.16942,0.028374,9,1.162394,0.005643
4,8,7,55,2,1.169745,0.030737,11,1.161193,0.005639
5,8,7,55,4,1.169745,0.030737,11,1.161193,0.005639
6,8,7,57,2,1.170071,0.031122,13,1.161577,0.005673
7,8,7,57,4,1.170071,0.031122,13,1.161577,0.005673
8,11,6,55,2,1.170134,0.028579,15,1.161005,0.005983
9,11,6,55,4,1.170134,0.028579,15,1.161005,0.005983


In [14]:
best_df = grid_cv.best_estimator_

# Validation 데이터로 검증
dt_pred = best_df.predict(x_val)
print("Validation MAE :", mean_absolute_error(y_val,dt_pred))

Validation MAE : 1.1707449262432752


In [15]:
# 검증데이터(df_x_ts, df_y_ts)로 External Validation 진행
dt_reg=DecisionTreeRegressor(random_state=0,max_depth=8,min_samples_split=2,min_samples_leaf=55,max_features=6)
dt_reg.fit(x_train,y_train)
dt_pred = dt_reg.predict(df_x_ts)
print("External Validation MAE :", mean_absolute_error(df_y_ts,dt_pred))

External Validation MAE : 0.885081854624215


# RandomForest

In [16]:
rf_reg=RandomForestRegressor(random_state=0)

In [17]:
rf_reg.get_params

<bound method BaseEstimator.get_params of RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=0, verbose=0, warm_start=False)>

In [18]:
params = { 
        'n_estimators':[1,2,3],
        'max_depth' : [None,5],     
        'min_samples_leaf' : [20,30,40],
        'min_samples_split' : [1,2,3],
        'max_features' : [7,8,9]
          }

grid_cv = GridSearchCV(rf_reg, param_grid = params, cv = 10, n_jobs = -1,scoring = 'neg_mean_absolute_error')
grid_cv.fit(x_train, y_train)
mae= -1*grid_cv.best_score_

print('최적 mae: ',mae)
print('최적 하이퍼 파라미터: ', grid_cv.best_params_)

best_rf_cv=grid_cv.cv_results_
rf_result = pd.DataFrame(best_rf_cv)
rf_result.drop(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time', 'params'],axis=1,inplace=True)
rf_df = pd.concat([rf_result.iloc[:,0:4],rf_result.iloc[:,14:18]],axis=1)
rf_df.iloc[:,4] = -1*rf_df.iloc[:,4] 
rf_df

최적 mae:  1.1613454793823632
최적 하이퍼 파라미터:  {'max_depth': None, 'max_features': 8, 'min_samples_leaf': 30, 'min_samples_split': 2, 'n_estimators': 2}


  self.best_estimator_.fit(X, y, **fit_params)


Unnamed: 0,param_max_depth,param_max_features,param_min_samples_leaf,param_min_samples_split,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,,7,20,1,,,,162
1,,7,20,1,,,,110
2,,7,20,1,,,,112
3,,7,20,2,1.184494,-1.167953,0.030911,107
4,,7,20,2,1.175625,-1.163471,0.031909,29
...,...,...,...,...,...,...,...,...
157,5,9,40,2,1.170442,-1.163565,0.030539,31
158,5,9,40,2,1.173784,-1.166275,0.030267,83
159,5,9,40,3,1.178735,-1.166758,0.029769,97
160,5,9,40,3,1.170442,-1.163565,0.030539,31


In [19]:
best_rf = grid_cv.best_estimator_ 

# Validation 데이터로 검증
rf_pred = best_rf.predict(x_val)
print("Validation MAE :", mean_absolute_error(y_val,rf_pred))

Validation MAE : 1.170629168299479


In [20]:
# 검증데이터(df_x_ts, df_y_ts)로 External Validation 진행
best_rf = grid_cv.best_estimator_ 
rf_pred = best_rf.predict(df_x_ts)
print("External Validation MAE :", mean_absolute_error(df_y_ts,rf_pred))

External Validation MAE : 0.8757766093065067


# XGBoost

In [21]:
xgb_reg = XGBRegressor()

In [22]:
xgb_reg.get_params

<bound method XGBModel.get_params of XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=1, gamma=0,
             importance_type='gain', learning_rate=0.1, max_delta_step=0,
             max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
             n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
             reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
             silent=None, subsample=1, verbosity=1)>

In [23]:
params = { 
          'n_estimators':[100,340,350,400],
          'max_depth' : [1,2,3],
          'learning rate' : [0.0001, 0.001, 0.005],
          }

grid_cv = GridSearchCV(xgb_reg, param_grid = params, cv = 10, n_jobs = -1,scoring = 'neg_mean_absolute_error')
grid_cv.fit(x_train, y_train)
mae= -1*grid_cv.best_score_

print('최적 mae: ',mae)
print('최적 하이퍼 파라미터: ', grid_cv.best_params_)

best_xgb_cv=grid_cv.cv_results_
xgb_result = pd.DataFrame(best_xgb_cv)
xgb_result.drop(['mean_fit_time', 'std_fit_time', 'mean_score_time', 'std_score_time', 'params'],axis=1,inplace=True)
xgb_df = pd.concat([xgb_result.iloc[:,0:3],xgb_result.iloc[:,13:16]],axis=1)
xgb_df.iloc[:,3] = -1*xgb_df.iloc[:,3] 
xgb_df

최적 mae:  1.1698160038577186
최적 하이퍼 파라미터:  {'learning rate': 0.0001, 'max_depth': 2, 'n_estimators': 350}


Unnamed: 0,param_learning rate,param_max_depth,param_n_estimators,mean_test_score,std_test_score,rank_test_score
0,0.0001,1,100,1.170826,0.028881,22
1,0.0001,1,340,1.170532,0.029132,19
2,0.0001,1,350,1.170528,0.029134,16
3,0.0001,1,400,1.1705,0.029152,13
4,0.0001,2,100,1.170362,0.029674,10
5,0.0001,2,340,1.169843,0.030105,7
6,0.0001,2,350,1.169816,0.030109,1
7,0.0001,2,400,1.169825,0.030068,4
8,0.0001,3,100,1.171177,0.029538,25
9,0.0001,3,340,1.17185,0.030204,31


In [24]:
best_xgb = grid_cv.best_estimator_ 

# Validation 데이터로 검증
xgb_pred = best_xgb.predict(x_val)
print("Validation MAE :", mean_absolute_error(y_val,xgb_pred))

Validation MAE : 1.1686743136405944


In [25]:
# 검증데이터(df_x_ts, df_y_ts)로 External Validation 진행
best_xgb = grid_cv.best_estimator_ 
xgb_pred = best_xgb.predict(df_x_ts)
print("External Validation MAE :", mean_absolute_error(df_y_ts,xgb_pred))

External Validation MAE : 0.8874446742820741
