#### Ensemble - RandomForest & ExtraTree
- 배깅 방식의 앙상블 ==> 중복 랜덤 샘플 + 동일 모델(DT)
    * 대표 알고리즘 :  RandomForest L:
- 페이스트 방식의 앙상블 ==> 랜덤 샘플 + 동일 모델(DT)
    * 대표 알고리즘 : ExtraTreeC/R

[목표] 와인분류 => 0과 1, 2개 종류 분류

[1] 모듈 로딩 및 데이터 준비

In [1]:
# 모듈로딩
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt


In [2]:
# 데이터
DATA_FILE = '../data/wine.csv'

# CSV >> DataFrame
wineDF=pd.read_csv(DATA_FILE)

In [3]:
wineDF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6497 entries, 0 to 6496
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   alcohol  6497 non-null   float64
 1   sugar    6497 non-null   float64
 2   pH       6497 non-null   float64
 3   class    6497 non-null   float64
dtypes: float64(4)
memory usage: 203.2 KB


In [4]:
wineDF.head()

Unnamed: 0,alcohol,sugar,pH,class
0,9.4,1.9,3.51,0.0
1,9.8,2.6,3.2,0.0
2,9.8,2.3,3.26,0.0
3,9.8,1.9,3.16,0.0
4,9.4,1.9,3.51,0.0


In [5]:
# 타겟/라벨 분류 클래스 분포
wineDF['class'].value_counts()

class
1.0    4898
0.0    1599
Name: count, dtype: int64

In [6]:
wineDF.describe()

Unnamed: 0,alcohol,sugar,pH,class
count,6497.0,6497.0,6497.0,6497.0
mean,10.491801,5.443235,3.218501,0.753886
std,1.192712,4.757804,0.160787,0.430779
min,8.0,0.6,2.72,0.0
25%,9.5,1.8,3.11,1.0
50%,10.3,3.0,3.21,1.0
75%,11.3,8.1,3.32,1.0
max,14.9,65.8,4.01,1.0


[2] 학습 준비

In [7]:
# 학습용 & 테스트용 데이트셋 분할
from sklearn.model_selection import train_test_split

In [8]:
# 피쳐/독립변수와 타겟/라벨/종속변수 분리

featureDF=wineDF[wineDF.columns[:-1]]
targetSR=wineDF[wineDF.columns[-1]]

print(f'featureDF : {featureDF.shape} targetSR : {targetSR.shape}')

featureDF : (6497, 3) targetSR : (6497,)


In [12]:
# 학습용, 테스트용 데이터셋 분리
X_train, X_test, y_train, y_test = train_test_split( featureDF, targetSR,
                                                    test_size=0.2,
                                                    stratify=targetSR,
                                                    random_state=1)

In [13]:
print(f'X_train : {X_train.shape}, y_trian : {y_train.shape}')
print(f'X_tset : {X_test.shape}, y_test  {y_test.shape}')

X_train : (5197, 3), y_trian : (5197,)
X_tset : (1300, 3), y_test  (1300,)


[3] 학습 진행

In [14]:
# 학습방법 : 지도학습 > 분류
# 알고리즘 : 앙상블 > 배깅 - RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier

In [27]:
# 인스턴스 생성 => 100개의 내부 DT 모델에서 사용할 데이터셋 생성
#                 Random_state 매개변수 설정으로 고정된 데이터셋 생성
#                 oob_score 매개변수 : 샘플 데이터셋 추출 후 남은 데이터셋 검증용으로 사용
lf_model = RandomForestClassifier(random_state=7,
                                  oob_score=True)

# 학습
lf_model.fit(X_train,y_train)


In [28]:
#모델 파라미터
print(f'classes_ : {lf_model.classes_}')
print(f'n_classes_ : {lf_model.n_classes_}개')
print()
print(f'feature_names_in : {lf_model.feature_names_in_}')
print(f'n_festures_in_ : {lf_model.n_features_in_}개')
print(f'feature_importances_ : {lf_model.feature_importances_}')

classes_ : [0. 1.]
n_classes_ : 2개

feature_names_in : ['alcohol' 'sugar' 'pH']
n_festures_in_ : 3개
feature_importances_ : [0.23572103 0.49995154 0.26432743]


In [29]:
#모델 파라미터
print(f'lf_model.estimator_ : {lf_model.estimator_}')

for est in lf_model.estimators_: print(est)

lf_model.estimator_ : DecisionTreeClassifier()
DecisionTreeClassifier(max_features='sqrt', random_state=327741615)
DecisionTreeClassifier(max_features='sqrt', random_state=976413892)
DecisionTreeClassifier(max_features='sqrt', random_state=1202242073)
DecisionTreeClassifier(max_features='sqrt', random_state=1369975286)
DecisionTreeClassifier(max_features='sqrt', random_state=1882953283)
DecisionTreeClassifier(max_features='sqrt', random_state=2053951699)
DecisionTreeClassifier(max_features='sqrt', random_state=959775639)
DecisionTreeClassifier(max_features='sqrt', random_state=1956722279)
DecisionTreeClassifier(max_features='sqrt', random_state=2052949340)
DecisionTreeClassifier(max_features='sqrt', random_state=1322904761)
DecisionTreeClassifier(max_features='sqrt', random_state=165338510)
DecisionTreeClassifier(max_features='sqrt', random_state=1133316631)
DecisionTreeClassifier(max_features='sqrt', random_state=4812360)
DecisionTreeClassifier(max_features='sqrt', random_state=372560

[4] 성능평가

In [30]:
train_score=lf_model.score(X_train,y_train)
test_score=lf_model.score(X_test,y_test)

In [31]:
print(f'train_score : {train_score} , test_score : {test_score}')

train_score : 0.9973061381566288 , test_score : 0.9


In [32]:
print(f'oob_score_ : {lf_model.oob_score_}')

oob_score_ : 0.89532422551472


[5] 튜닝
- RandomizedSearchCV 하이퍼파라미터 최적화 클래스
    * 범위가 넓은 하이퍼파라미터 설정에 좋음
    * 지정된 범위에서 지정된 횟수 만큼 하이퍼파라미터를 추출하여 조합 진행

In [33]:
# 모듈 로딩
from sklearn.model_selection import RandomizedSearchCV

In [54]:
# RandomForestCalssifier 하이퍼파라미터 설정
params={'max_depth':range(2,16),
        'min_samples_leaf': range(5,16),
        'criterion':['gini','entropy','log_loss']}

In [62]:
rf_model=RandomForestClassifier(n_estimators=300,random_state=7)

In [63]:
searchCV = RandomizedSearchCV(rf_model,
                              param_distributions=params,
                              verbose=4,
                              n_iter=50)

In [64]:
searchCV.fit(X_train,y_train)

Fitting 5 folds for each of 50 candidates, totalling 250 fits
[CV 1/5] END criterion=log_loss, max_depth=9, min_samples_leaf=15;, score=0.869 total time=   0.6s
[CV 2/5] END criterion=log_loss, max_depth=9, min_samples_leaf=15;, score=0.838 total time=   0.6s
[CV 3/5] END criterion=log_loss, max_depth=9, min_samples_leaf=15;, score=0.877 total time=   0.6s
[CV 4/5] END criterion=log_loss, max_depth=9, min_samples_leaf=15;, score=0.882 total time=   0.6s
[CV 5/5] END criterion=log_loss, max_depth=9, min_samples_leaf=15;, score=0.867 total time=   0.6s
[CV 1/5] END criterion=gini, max_depth=3, min_samples_leaf=15;, score=0.815 total time=   0.3s
[CV 2/5] END criterion=gini, max_depth=3, min_samples_leaf=15;, score=0.820 total time=   0.3s
[CV 3/5] END criterion=gini, max_depth=3, min_samples_leaf=15;, score=0.819 total time=   0.3s
[CV 4/5] END criterion=gini, max_depth=3, min_samples_leaf=15;, score=0.832 total time=   0.3s
[CV 5/5] END criterion=gini, max_depth=3, min_samples_leaf=15;,

In [65]:
# 모델 파라미터
print(f'[searchCV.best_score_] {searchCV.best_score_}')
print(f'[searchCV.best_params_] {searchCV.best_params_}')
print(f'[searchCV.best_estimator_] {searchCV.best_estimator_}')

cv_resultDF=pd.DataFrame(searchCV.cv_results_)
cv_resultDF

[searchCV.best_score_] 0.8776247501295625
[searchCV.best_params_] {'min_samples_leaf': 5, 'max_depth': 15, 'criterion': 'entropy'}
[searchCV.best_estimator_] RandomForestClassifier(criterion='entropy', max_depth=15, min_samples_leaf=5,
                       n_estimators=300, random_state=7)


Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_min_samples_leaf,param_max_depth,param_criterion,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.67767,0.017834,0.038502,0.000487,15,9,log_loss,"{'min_samples_leaf': 15, 'max_depth': 9, 'crit...",0.869231,0.838462,0.876805,0.881617,0.86718,0.866659,0.015024,20
1,0.410321,0.001613,0.026325,0.000509,15,3,gini,"{'min_samples_leaf': 15, 'max_depth': 3, 'crit...",0.815385,0.820192,0.819057,0.831569,0.827719,0.822784,0.005948,42
2,0.633818,0.003583,0.036702,0.000417,15,8,entropy,"{'min_samples_leaf': 15, 'max_depth': 8, 'crit...",0.867308,0.836538,0.875842,0.883542,0.86333,0.865312,0.015994,24
3,0.417327,0.001629,0.026142,0.000515,12,3,log_loss,"{'min_samples_leaf': 12, 'max_depth': 3, 'crit...",0.799038,0.807692,0.809432,0.823869,0.824832,0.812973,0.009939,44
4,0.597035,0.002865,0.036385,0.000507,12,7,gini,"{'min_samples_leaf': 12, 'max_depth': 7, 'crit...",0.869231,0.834615,0.879692,0.87873,0.862368,0.864927,0.016446,27
5,0.63154,0.009014,0.035877,0.001093,8,7,log_loss,"{'min_samples_leaf': 8, 'max_depth': 7, 'crite...",0.868269,0.834615,0.880654,0.879692,0.861405,0.864927,0.016779,25
6,0.651489,0.004268,0.037345,0.000455,12,8,log_loss,"{'min_samples_leaf': 12, 'max_depth': 8, 'crit...",0.870192,0.835577,0.881617,0.880654,0.869105,0.867429,0.01674,16
7,0.658886,0.007144,0.039775,0.00131,14,10,gini,"{'min_samples_leaf': 14, 'max_depth': 10, 'cri...",0.872115,0.831731,0.87873,0.882579,0.871992,0.867429,0.0183,15
8,0.576212,0.00082,0.034165,0.000752,13,6,log_loss,"{'min_samples_leaf': 13, 'max_depth': 6, 'crit...",0.8625,0.832692,0.871992,0.877767,0.857555,0.860501,0.015596,31
9,0.63593,0.00607,0.036566,0.000804,10,7,log_loss,"{'min_samples_leaf': 10, 'max_depth': 7, 'crit...",0.868269,0.838462,0.881617,0.875842,0.86333,0.865504,0.014901,23


In [66]:
searchCV.best_params_

{'min_samples_leaf': 5, 'max_depth': 15, 'criterion': 'entropy'}

In [67]:
searchCV.best_estimator_