## 앙상블(Ensemble) 학습

### 1. Voting 방식
#### 1.1 Hard Voting
- Logistic Regression
- SVM
- KNearestNeighbor

In [7]:
from sklearn.datasets import load_breast_cancer
cancer = load_breast_cancer()

In [8]:
from sklearn.preprocessing import MinMaxScaler
cancer_scaled = MinMaxScaler().fit_transform(cancer.data)

In [9]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
    cancer_scaled, cancer.target, stratify=cancer.target,
    test_size= 0.2, random_state= 2022
)

In [10]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier

In [11]:
lr = LogisticRegression(random_state=2022)
sv = SVC(random_state=2022)
kn = KNeighborsClassifier()

In [12]:
# Ensemble Classifier for Hard Voting
from sklearn.ensemble import VotingClassifier
voc = VotingClassifier(
    estimators=[('LR',lr),('SVC',sv),('KNN',kn)], voting='hard'
)

In [14]:
voc.fit(X_train, y_train)
voc.score(X_test, y_test)

1.0

In [16]:
# Performance by Each Estimator
lr.fit(X_train, y_train)
sv.fit(X_train, y_train)
kn.fit(X_train, y_train)\

lr.score(X_test, y_test), sv.score(X_test, y_test), kn.score(X_test, y_test)

(0.9912280701754386, 1.0, 0.9824561403508771)

#### 1.2 Soft Voting

- Logistic Regression

In [19]:
# 객체의 속성과 메서드 확인
dir(lr)

['C',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_check_feature_names',
 '_check_n_features',
 '_estimator_type',
 '_get_param_names',
 '_get_tags',
 '_more_tags',
 '_predict_proba_lr',
 '_repr_html_',
 '_repr_html_inner',
 '_repr_mimebundle_',
 '_validate_data',
 'class_weight',
 'classes_',
 'coef_',
 'decision_function',
 'densify',
 'dual',
 'fit',
 'fit_intercept',
 'get_params',
 'intercept_',
 'intercept_scaling',
 'l1_ratio',
 'max_iter',
 'multi_class',
 'n_features_in_',
 'n_iter_',
 'n_jobs',
 'penalty',
 'predict',
 'predict_log_proba',
 'predict_proba',
 'random_state',
 'score',
 'set_params',
 'solver',
 'sparsify'

In [20]:
lr.predict(X_test[:5])

array([0, 1, 0, 1, 0])

In [23]:
lr.predict_proba(X_test[:5])

array([[0.99792166, 0.00207834],
       [0.07775117, 0.92224883],
       [0.9774613 , 0.0225387 ],
       [0.05952966, 0.94047034],
       [0.99554778, 0.00445222]])

- Support Vector Machine

In [22]:
svc_2 = SVC(probability=True, random_state=2022)
svc_2.fit(X_train, y_train)
svc_2.predict_proba(X_test[:5])

array([[9.99896299e-01, 1.03701492e-04],
       [3.84470713e-03, 9.96155293e-01],
       [9.99896384e-01, 1.03616009e-04],
       [5.97356113e-03, 9.94026439e-01],
       [9.99311796e-01, 6.88204061e-04]])

- KNearestNeighbor

In [25]:
kn.predict_proba(X_test[-5:])

array([[0.8, 0.2],
       [1. , 0. ],
       [0.8, 0.2],
       [0. , 1. ],
       [0. , 1. ]])

- Soft Voting

In [26]:
voc_2 = VotingClassifier(
    estimators=[('LR',lr),('SVC',svc_2),('KNN',kn)], voting='soft'
)

In [27]:
voc_2.fit(X_train, y_train)
voc_2.score(X_test, y_test)

1.0

In [28]:
voc_2.predict_proba(X_test[:5])

array([[9.99272654e-01, 7.27346212e-04],
       [2.71986265e-02, 9.72801374e-01],
       [9.92452563e-01, 7.54743719e-03],
       [2.18344062e-02, 9.78165594e-01],
       [9.98286525e-01, 1.71347474e-03]])

- GridSearchCV

In [29]:
lr.C, svc_2.C

(1.0, 1.0)

In [34]:
kn.get_params()

{'algorithm': 'auto',
 'leaf_size': 30,
 'metric': 'minkowski',
 'metric_params': None,
 'n_jobs': None,
 'n_neighbors': 5,
 'p': 2,
 'weights': 'uniform'}

In [37]:
# voting class 의 estimator 지정명__파라미터
params = {
    'LR__C': [0.1, 1, 10],
    'SVC__C': [0.1, 1, 10],
}

In [38]:
from sklearn.model_selection import GridSearchCV
gv_voc_2 = GridSearchCV(voc_2, params, scoring='accuracy', cv=5)
gv_voc_2.fit(X_train, y_train)
gv_voc_2.best_params_

{'LR__C': 10, 'SVC__C': 0.1}

In [39]:
params = {
    'LR__C': [5, 10, 30],
    'SVC__C': [0.05, 0.1, 0.3]
}

In [40]:
gv_voc_2 = GridSearchCV(voc_2, params, scoring='accuracy', cv=5)
gv_voc_2.fit(X_train, y_train)
gv_voc_2.best_params_

{'LR__C': 10, 'SVC__C': 0.05}

In [41]:
gv_voc_2.best_estimator_.score(X_test, y_test)

1.0

### 2.Bagging - Random Forest

In [42]:
from sklearn.ensemble import RandomForestClassifier
rf = RandomForestClassifier(random_state=2022)
rf.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 2022,
 'verbose': 0,
 'warm_start': False}

In [43]:
rf.fit(X_train, y_train)
rf.score(X_test, y_test)

1.0