In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
import os
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV

from sklearn.ensemble import RandomForestRegressor, VotingRegressor
from xgboost import XGBRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR

from sklearn.metrics import mean_absolute_error
import warnings
warnings.filterwarnings(action='ignore')

In [3]:
os.chdir("/content/drive/MyDrive/암빅데이터_경진대회/최종코드")

## [1] 데이터 로드

In [4]:
# 학습데이터 불러오기 (for Training)
df_x_tr = pd.read_csv("./data/preprocessed/data_x_tr.csv")
df_y_tr = pd.read_csv("./data/preprocessed/data_y_tr.csv")
df_x_tr.drop(['Unnamed: 0'],axis=1, inplace=True)  # (10000,9)
df_y_tr.drop(['Unnamed: 0'],axis=1, inplace=True)  # (10000,1)

In [5]:
# 검증데이터 불러오기 (for Test)
df_x_ts = pd.read_csv("./data/preprocessed/data_x_ts.csv")
df_y_ts = pd.read_csv("./data/preprocessed/data_y_ts.csv")
df_x_ts.drop(['Unnamed: 0'],axis=1, inplace=True)  # (5000,9)
df_y_ts.drop(['Unnamed: 0'],axis=1, inplace=True)  # (5000,1)

In [6]:
# Train, Valid 분할 - train_test_split 사용
x_train, x_val, y_train, y_val = train_test_split(df_x_tr, df_y_tr, test_size=0.1, shuffle=True, stratify=df_y_tr, random_state=23)

In [7]:
 # index 번호 0부터 순서대로 reset.
x_train.reset_index(drop=True, inplace=True)
x_val.reset_index(drop=True, inplace=True)
y_train.reset_index(drop=True, inplace=True)
y_val.reset_index(drop=True, inplace=True)  

In [8]:
print("x_train의 shape:",x_train.shape)
print("x_val의 shape:",x_val.shape)
print("y_train의 shape:",y_train.shape)
print("y_val의 shape:",y_val.shape)

x_train의 shape: (9000, 9)
x_val의 shape: (1000, 9)
y_train의 shape: (9000, 1)
y_val의 shape: (1000, 1)


## [2] Modeling

### Ensemble 방법 1    
VotingRegressor 소스코드 수정:  
모델 예측값 4개를 오름차순 정렬 후, 2번째와 3번째 값의 평균 계산

<img src="https://user-images.githubusercontent.com/78155086/135375498-5984cfa1-0b64-49ce-a349-41311bd59cb5.png" width="50%">

In [41]:
# 약한 학습기 구축 
# elst_reg = ElasticNet()
dt_reg = DecisionTreeRegressor(max_depth=8, max_features=6, min_samples_leaf=55, min_samples_split=2)
rf_reg = RandomForestRegressor(max_depth=None, max_features=8, min_samples_leaf=30, min_samples_split=2, n_estimators=2)
xgb_reg = XGBRegressor(learning_rate=0.0001, max_depth=2, n_estimators=350)
sv_reg = SVR(kernel='rbf', C=1000, gamma=0.01)

# 앙상블 모델 구축
# predict: 약한 분류기 4개의 예측값을 평균냄
voting_model = VotingRegressor(
    estimators=[('dt',dt_reg),('rf',rf_reg),('xgb',xgb_reg),('svr',sv_reg) ] # 4개의 약한 학습기  #('elst',elst_reg)  
    #weights = np.array([1,1,1.1])
)

# 모델 비교
for model in (dt_reg, rf_reg, xgb_reg, sv_reg, voting_model):
    model.fit(x_train, y_train)
    y_pred = model.predict(x_val)
    
    print(model.__class__.__name__," : ",mean_absolute_error(y_val, y_pred))


DecisionTreeRegressor  :  1.1683291239034423
RandomForestRegressor  :  1.1720014115657413
XGBRegressor  :  2.5565753536462785
SVR  :  0.9814829164986304
VotingRegressor  :  1.362140003083679


In [42]:
# Voting model
# Validation 데이터로 검증
voting_pred = voting_model.predict(x_val)
print("Validation MAE :", mean_absolute_error(y_val,voting_pred))

Validation MAE : 1.362140003083679


In [43]:
# 검증데이터(df_x_ts, df_y_ts)로 External Validation 진행
voting_pred = voting_model.predict(df_x_ts)
print("External Validation MAE :", mean_absolute_error(df_y_ts,voting_pred))

External Validation MAE : 1.3139003877672304


### Ensemble 방법 2    
VotingRegressor 소스코드 수정:  
모델 4개 중 성능이 좋은 SVR에 가중치를 크게 두어 평균 계산

<img src="https://user-images.githubusercontent.com/78155086/135375554-293e105d-1afe-43fa-adbf-4f5faf5de6d3.png" width="50%">

In [16]:
# 약한 학습기 구축 
# elst_reg = ElasticNet()
dt_reg = DecisionTreeRegressor(max_depth=8, max_features=6, min_samples_leaf=55, min_samples_split=2)
rf_reg = RandomForestRegressor(max_depth=None, max_features=8, min_samples_leaf=30, min_samples_split=2, n_estimators=2)
xgb_reg = XGBRegressor(learning_rate=0.0001, max_depth=2, n_estimators=350)
sv_reg = SVR(kernel='rbf', C=1000, gamma=0.01)

# 앙상블 모델 구축
# predict: 약한 분류기 4개의 예측값을 평균냄
voting_model = VotingRegressor(
    estimators=[('dt',dt_reg),('rf',rf_reg),('xgb',xgb_reg),('svr',sv_reg) ] # 4개의 약한 학습기  #('elst',elst_reg)  
    #weights = np.array([1,1,1.1])
)

# 모델 비교
for model in (dt_reg, rf_reg, xgb_reg, sv_reg, voting_model):
    model.fit(x_train, y_train)
    y_pred = model.predict(x_val)
    
    print(model.__class__.__name__," : ",mean_absolute_error(y_val, y_pred))


DecisionTreeRegressor  :  1.1692073010039439
RandomForestRegressor  :  1.1621978439445098
XGBRegressor  :  2.5565753536462785
SVR  :  0.9814829164986304
VotingRegressor  :  1.361226637239403


In [17]:
# Voting model
# Validation 데이터로 검증
voting_pred = voting_model.predict(x_val)
print("Validation MAE :", mean_absolute_error(y_val,voting_pred))

Validation MAE : 1.361226637239403


In [18]:
# 검증데이터(df_x_ts, df_y_ts)로 External Validation 진행
voting_pred = voting_model.predict(df_x_ts)
print("External Validation MAE :", mean_absolute_error(df_y_ts,voting_pred))

External Validation MAE : 1.3173459930823126
