In [1]:
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split

X, y = make_moons(n_samples=1000, noise=0.4)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

In [2]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

log_clf = LogisticRegression()
rnd_clf = RandomForestClassifier()
svm_clf = SVC()

voting_clf = VotingClassifier(
    estimators=[('lr', log_clf), ('rf', rnd_clf), ('svc', svm_clf)],
    voting='hard'
)
voting_clf.fit(X_train, y_train)

In [3]:
from sklearn.metrics import accuracy_score

for clf in (log_clf, rnd_clf, svm_clf, voting_clf):
    clf.fit(X_train, y_train)
    y_pred = clf.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(clf.__class__.__name__, accuracy)

LogisticRegression 0.82
RandomForestClassifier 0.8366666666666667
SVC 0.8566666666666667
VotingClassifier 0.86


In [4]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

bag_clf = BaggingClassifier(
    DecisionTreeClassifier(), n_estimators=500,
    max_samples=100, bootstrap=True, n_jobs=-1
)
bag_clf.fit(X_train, y_train)
y_pred = bag_clf.predict(X_test)

In [5]:
accuracy_score(y_test, y_pred)

0.8533333333333334

#### Voting
투표를 통해 예측값을 결정함,  
hard voting/soft voting있음. hard voting은 개수를 따지는 것, soft voting은 확률합을 따지는 것.

#### Bagging, Pasting
bagging : bootstrap sampling, 하나의 sample 안에 중복된 데이터가 존재할 수 있음  
pasting : 하나의 sample안에는 무조건 unique한 데이터만 있어야 함.

### oob 평가
out of bag words. 하나의 예측기마다 학습을 위해 샘플링되는 데이터의 비율은 전체 대비 63% 정도. m이 커지면 커질수록, 그 값이 63%에 가까워진다. 남은 37%를 가지고, 앙상블을 구성하는 하나하나의 예측기를 평가한다.

In [6]:
bag_clf = BaggingClassifier(
    DecisionTreeClassifier(), n_estimators=500,
    bootstrap=True, n_jobs=-1, oob_score=True)

bag_clf.fit(X_train, y_train)
bag_clf.oob_score_

0.8371428571428572

In [7]:
from sklearn.metrics import accuracy_score
y_pred = bag_clf.predict(X_test)
accuracy_score(y_test, y_pred)

0.8366666666666667

In [13]:
bag_clf.oob_decision_function_

array([[0.30813953, 0.69186047],
       [0.0060241 , 0.9939759 ],
       [0.06043956, 0.93956044],
       ...,
       [0.73033708, 0.26966292],
       [0.33333333, 0.66666667],
       [0.47368421, 0.52631579]])

### 특성 중요도 (feature importances)
RF의 모든 트리에 걸쳐서, 특성의 분할이 일어났을 때 어떤 특성이 더 불순도를 많이 감소시키냐? 를 측정해서 feature importance를 계산  
불순도는 연관된 샘플의 수에 따라 가중평균됨

In [29]:
from sklearn.datasets import fetch_openml
mnist = fetch_openml('mnist_784', version=1)
mnist.keys()

  warn(


dict_keys(['data', 'target', 'frame', 'categories', 'feature_names', 'target_names', 'DESCR', 'details', 'url'])

In [33]:
X, y = mnist['data'], mnist['target']

In [34]:
from sklearn.model_selection import train_test_split

X_trainVal, X_test, y_trainVal, y_test = train_test_split(X, y, test_size=1/7)
X_train, X_val, y_train, y_val = train_test_split(X_trainVal, y_trainVal, test_size=1/6)

print(X_train.shape)
print(X_val.shape)
print(X_test.shape)

(50000, 784)
(10000, 784)
(10000, 784)


In [35]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC # ?? 이게 맞을까? SVM을 잘 공부하지 못했으니까 고민해보기.....

from sklearn.ensemble import VotingClassifier

from sklearn.metrics import accuracy_score
# 1. Random Forest Classifier
# 2. Extra Tree Classifier
# 3. Logistic Regression
# 4. SVM

# 1. Random Forest Classifier
rf_clf = RandomForestClassifier(n_estimators=100)
et_clf = ExtraTreesClassifier(n_estimators=100)
lr_clf = LogisticRegression()
svc = SVC()
vt_clf_hard = VotingClassifier(
    estimators=[
        ('rf', RandomForestClassifier()), 
        ('et', ExtraTreesClassifier()), 
        ('lr', LogisticRegression()),
        ('svc', SVC())
    ], 
    voting='hard'
)
vt_clf_soft = VotingClassifier(
    estimators=[
        ('rf', RandomForestClassifier()), 
        ('et', ExtraTreesClassifier()), 
        ('lr', LogisticRegression()),
        ('svc', SVC())
    ], 
    voting='soft'
)

rf_clf.fit(X_train, y_train)
print(f'rf_accuracy  : {accuracy_score(rf_clf.predict(X_val), y_val)}')
et_clf.fit(X_train, y_train)
print(f'et_accuracy  : {accuracy_score(et_clf.predict(X_val), y_val)}')
lr_clf.fit(X_train, y_train)
print(f'lr_accuracy  : {accuracy_score(lr_clf.predict(X_val), y_val)}')
svc.fit(X_train, y_train)
print(f'svc_accuracy : {accuracy_score(svc.predict(X_val), y_val)}')
vt_clf_hard.fit(X_train, y_train)
print(f'hard_accuracy: {accuracy_score(vt_clf_hard.predict(X_val), y_val)}')
vt_clf_soft.fit(X_train, y_train)
print(f'soft_accuracy: {accuracy_score(vt_clf_soft.predict(X_val), y_val)}')

rf_accuracy  : 0.9683
et_accuracy  : 0.9689


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


lr_accuracy  : 0.9194
svc_accuracy : 0.9772


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


hard_accuracy: 0.9724


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


AttributeError: predict_proba is not available when  probability=False