## Chapter 7. 앙상블 학습과 랜덤 포레스트

#### 7.1 Voting Classifier

In [1]:
from sklearn.model_selection import train_test_split
from sklearn.datasets import make_moons

# Synthetic two-class "moons" data set: 500 noisy points, seeded for repeatability.
X, y = make_moons(
    n_samples=500,
    noise=0.30,
    random_state=2023,
)

# Hold out a test set. No test_size is passed, so scikit-learn's default split
# proportion applies; the split itself is seeded with the same value as the data.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=2023,
)

In [2]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

# Three heterogeneous base learners that will be combined by majority vote.
log_clf = LogisticRegression()
# The random forest is the stochastic member of the trio (bootstrap samples and
# random feature subsets), so it is seeded with the same value used for the data
# cells; otherwise the accuracies printed below change on every run.
rnd_clf = RandomForestClassifier(random_state=2023)
svm_clf = SVC()

# voting='hard': the ensemble predicts the class chosen by most base learners.
voting_clf = VotingClassifier(
    estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)],
    voting='hard'
)
voting_clf.fit(X_train, y_train)

In [3]:
from sklearn.metrics import accuracy_score
# Fit each individual model and the voting ensemble on the training split, then
# report its test-set accuracy next to its class name.
for clf in [log_clf, rnd_clf, svm_clf, voting_clf]:
    y_pred = clf.fit(X_train, y_train).predict(X_test)
    print(type(clf).__name__, accuracy_score(y_test, y_pred))

LogisticRegression 0.784
RandomForestClassifier 0.84
SVC 0.824
VotingClassifier 0.824


#### 7.2 Bagging and Pasting

In [7]:
# Bagging: an ensemble of 500 decision tree classifiers.
# Each tree is trained on 100 samples drawn at random with replacement.
# To use pasting (sampling without replacement), pass bootstrap=False instead.

from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

bag_clf = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=100,
    bootstrap=True,
    n_jobs=-1,          # -1 asks for all available CPU cores
    random_state=2023,
)
# fit() returns the fitted estimator, so training and prediction can be chained.
y_pred = bag_clf.fit(X_train, y_train).predict(X_test)

In [10]:
# Out-of-bag (OOB) evaluation: the same bagging setup as before, with
# oob_score=True so an accuracy estimate is computed from the training samples
# each tree did not draw.

bag_clf = BaggingClassifier(
    DecisionTreeClassifier(),
    n_estimators=500,
    max_samples=100,
    bootstrap=True,
    oob_score=True,
    n_jobs=-1,
    random_state=2023,
)
y_pred = bag_clf.fit(X_train, y_train).predict(X_test)

# First line: OOB accuracy estimate. Second line: accuracy on the held-out test set.
print(bag_clf.oob_score_, accuracy_score(y_test, y_pred), sep='\n')

0.928
0.832


In [11]:
# OOB class-probability estimates, one row per training sample:
# column 0 is the negative class, column 1 the positive class.
bag_clf.oob_decision_function_
# In the recorded output, the first training sample has an estimated probability
# of about 94% (0.939) of belonging to the positive class.

array([[0.0610687 , 0.9389313 ],
       [0.80851064, 0.19148936],
       [1.        , 0.        ],
       [0.82901554, 0.17098446],
       [0.02997275, 0.97002725],
       [0.0625    , 0.9375    ],
       [1.        , 0.        ],
       [0.91863517, 0.08136483],
       [0.89378238, 0.10621762],
       [0.95949367, 0.04050633],
       [0.02910053, 0.97089947],
       [0.775     , 0.225     ],
       [0.99469496, 0.00530504],
       [0.94736842, 0.05263158],
       [0.71352785, 0.28647215],
       [0.7284264 , 0.2715736 ],
       [0.00524934, 0.99475066],
       [0.08831169, 0.91168831],
       [0.25      , 0.75      ],
       [0.81746032, 0.18253968],
       [0.23848238, 0.76151762],
       [0.99737533, 0.00262467],
       [0.7605985 , 0.2394015 ],
       [0.70284238, 0.29715762],
       [0.0248139 , 0.9751861 ],
       [0.99739583, 0.00260417],
       [0.        , 1.        ],
       [0.03571429, 0.96428571],
       [1.        , 0.        ],
       [0.77173913, 0.22826087],
       [0.

#### 7.4 Random Forest

In [12]:
from sklearn.ensemble import RandomForestClassifier

# Random forest of 500 trees, each limited to at most 16 leaf nodes.
# random_state is fixed (same seed as the data and bagging cells) so that a
# Restart-and-Run-All reproduces the same forest and the same predictions.
rnd_clf = RandomForestClassifier(
    n_estimators=500, max_leaf_nodes=16, n_jobs=-1, random_state=2023
)
rnd_clf.fit(X_train, y_train)

y_pred_rf = rnd_clf.predict(X_test)

In [None]:
# A random-forest-like model built from BaggingClassifier: bagged decision trees
# that each consider only a random subset of features at every split.
# max_features='sqrt' replaces the former 'auto' value, which was deprecated in
# scikit-learn 1.1 and removed in 1.3; for classifiers 'auto' meant sqrt(n_features).
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(max_features='sqrt', max_leaf_nodes=16),
    n_estimators=500, max_samples=1.0, bootstrap=True, n_jobs=-1
)

In [13]:
from sklearn.datasets import load_iris

# Feature importances on the iris data set, as estimated by a 500-tree forest.
iris = load_iris()
# Seeded so the printed importances are the same on every run; an unseeded
# forest yields slightly different values each time the cell is executed.
rnd_clf = RandomForestClassifier(n_estimators=500, n_jobs=-1, random_state=2023)
rnd_clf.fit(iris['data'], iris['target'])
# Pair each feature name with its importance score and print one per line.
for name, score in zip(iris['feature_names'], rnd_clf.feature_importances_):
    print(name, score)

sepal length (cm) 0.10354418589863607
sepal width (cm) 0.025140883421020777
petal length (cm) 0.4393720228398539
petal width (cm) 0.43194290784048933


#### 7.5 Boosting