# 앙상블 학습

In [3]:
from sklearn.datasets import load_breast_cancer
cancer = load_breast_cancer()

In [5]:
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()

In [6]:
cancer_scaled = scaler.fit_transform(cancer.data)

In [7]:
from sklearn.model_selection import train_test_split

In [42]:
X_train,X_test,y_train,y_test = train_test_split(
    cancer_scaled,cancer.target,test_size=0.2,stratify=cancer.target,random_state=2021
)

#### 앙상블 학습을 위한 분류기
- 로지스틱 회귀
- 서포트 벡터 머신
- K최근접 이웃

In [43]:
from sklearn.linear_model import LogisticRegression # 로지스틱 회귀
from sklearn.svm import SVC # 학습
from sklearn.neighbors import KNeighborsClassifier ## K최근접 이웃

In [44]:
lrc = LogisticRegression(random_state=2021)
svc = SVC()
knn = KNeighborsClassifier()

In [45]:
from sklearn.ensemble import VotingClassifier ## 앙상블 학습

In [55]:
voc = VotingClassifier(
    estimators=[('LR',LogisticRegression(random_state=2021)),('SVC',SVC()),('KNN',KNeighborsClassifier())], voting='hard'
)

In [57]:
voc.fit(X_train,y_train)
voc.score(X_test,y_test)

0.9824561403508771

- 개별학습의 성능

In [56]:
lrc.fit(X_train,y_train)
svc.fit(X_train,y_train)
knn.fit(X_train,y_train)
lrc.score(X_test,y_test),svc.score(X_test,y_test),knn.score(X_test,y_test)

(0.9824561403508771, 0.9824561403508771, 0.9824561403508771)

- soft 보팅

In [60]:
print(dir(lrc))

['C', '__class__', '__delattr__', '__dict__', '__dir__', '__doc__', '__eq__', '__format__', '__ge__', '__getattribute__', '__getstate__', '__gt__', '__hash__', '__init__', '__init_subclass__', '__le__', '__lt__', '__module__', '__ne__', '__new__', '__reduce__', '__reduce_ex__', '__repr__', '__setattr__', '__setstate__', '__sizeof__', '__str__', '__subclasshook__', '__weakref__', '_check_n_features', '_estimator_type', '_get_param_names', '_get_tags', '_more_tags', '_predict_proba_lr', '_repr_html_', '_repr_html_inner', '_repr_mimebundle_', '_validate_data', 'class_weight', 'classes_', 'coef_', 'decision_function', 'densify', 'dual', 'fit', 'fit_intercept', 'get_params', 'intercept_', 'intercept_scaling', 'l1_ratio', 'max_iter', 'multi_class', 'n_features_in_', 'n_iter_', 'n_jobs', 'penalty', 'predict', 'predict_log_proba', 'predict_proba', 'random_state', 'score', 'set_params', 'solver', 'sparsify', 'tol', 'verbose', 'warm_start']


In [161]:
lrc.predict_proba(X_test[:5])

array([[0.36348222, 0.63651778],
       [0.97162943, 0.02837057],
       [0.1869565 , 0.8130435 ],
       [0.02133462, 0.97866538],
       [0.0548391 , 0.9451609 ]])

In [97]:
svc.predict_proba(X_test[:5])  # probability 를 False값으로 줘서 동작을 안함 

AttributeError: predict_proba is not available when  probability=False

In [160]:
svc2 = SVC(probability=True)
svc2.fit(X_train,y_train)
svc2.predict_proba(X_test[:5])

array([[4.40899148e-01, 5.59100852e-01],
       [9.99762776e-01, 2.37224402e-04],
       [1.56819918e-02, 9.84318008e-01],
       [3.47586314e-03, 9.96524137e-01],
       [4.95698785e-03, 9.95043012e-01]])

In [147]:
voc2 = VotingClassifier(
    estimators=[('LR',LogisticRegression(random_state=2021)),('SVC',svc2),('KNN',KNeighborsClassifier())], voting='soft'
)

In [143]:
voc2.fit(X_train,y_train)
voc2.score(X_test,y_test)


0.9912280701754386

In [144]:
voc2._predict_proba(X_test[:5])

array([[0.41407642, 0.58592358],
       [0.9904916 , 0.0095084 ],
       [0.13435155, 0.86564845],
       [0.00824481, 0.99175519],
       [0.01991446, 0.98008554]])

### Random Forest

In [163]:
from sklearn.ensemble import RandomForestClassifier

In [164]:
rfc = RandomForestClassifier(random_state=2021)

In [169]:
rfc.get_params()

{'bootstrap': True,
 'ccp_alpha': 0.0,
 'class_weight': None,
 'criterion': 'gini',
 'max_depth': None,
 'max_features': 'auto',
 'max_leaf_nodes': None,
 'max_samples': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 2021,
 'verbose': 0,
 'warm_start': False}

In [170]:
rfc. fit(X_train,y_train)

RandomForestClassifier(random_state=2021)

In [171]:
rfc.score(X_test,y_test)

0.9736842105263158

- XGBoost

In [175]:
import xgboost as xgb
from xgboost import XGBClassifier


In [178]:
xgc = XGBClassifier()
xgc.fit(X_train,y_train)




XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=8,
              num_parallel_tree=1, predictor='auto', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [179]:
xgc.score(X_test,y_test)

0.9912280701754386

- KNN