# 앙상블(ensemble)
- 다양한 모델을 결합하여 예측 성능을 향상시키틑 방법
- 투표(voting), 배깅(Bagging), 부스팅(Boosting), 스태킹(stacking) 네 가지로 구분

In [1]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt
import seaborn as sns

### voting
- hard voting : 여러 개의 예측지에 대해 다수결로 결정
- soft voting : 여러 개의 예측 확률을 평균내어 결정

In [5]:
from sklearn.datasets import load_breast_cancer
data= load_breast_cancer()

df = pd.DataFrame(data.data, columns=data.feature_names)
df['target'] = data.target
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 31 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   mean radius              569 non-null    float64
 1   mean texture             569 non-null    float64
 2   mean perimeter           569 non-null    float64
 3   mean area                569 non-null    float64
 4   mean smoothness          569 non-null    float64
 5   mean compactness         569 non-null    float64
 6   mean concavity           569 non-null    float64
 7   mean concave points      569 non-null    float64
 8   mean symmetry            569 non-null    float64
 9   mean fractal dimension   569 non-null    float64
 10  radius error             569 non-null    float64
 11  texture error            569 non-null    float64
 12  perimeter error          569 non-null    float64
 13  area error               569 non-null    float64
 14  smoothness error         5

In [6]:
df['target'].value_counts()

target
1    357
0    212
Name: count, dtype: int64

In [7]:
# 데이터 준비 (분리)
from sklearn.model_selection import train_test_split

X = data.data
y = data.target

X_train,X_test,y_train,y_test = train_test_split(X,y,random_state=0,test_size=0.2)


### hard voting

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score

knn_clf = KNeighborsClassifier()
lr_clf = LogisticRegression()
dt_clf = DecisionTreeClassifier()

voting_clf = VotingClassifier(
    estimators=[
        ('knn_clf', knn_clf),
        ('lr_clf', lr_clf),
        ('dt_clf', dt_clf)
    ],
    voting='hard' # 안쓸시 기본값으로 설정정
)

#앙상블 모델 학습
voting_clf.fit(X_train,y_train)

y_pred_train = voting_clf.predict(X_train)
acc_score_train = accuracy_score(y_train, y_pred_train)
print('학습 점수 : ', acc_score_train)

y_pred_test = voting_clf.predict(X_test)
acc_score_test = accuracy_score(y_test, y_pred_test)
print('테스트  평가 점수 : ' , acc_score_test)

학습 점수 :  0.9714285714285714
테스트  평가 점수 :  0.9649122807017544


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [16]:
# hard votin 작동 원리 == 다수결
start, end = 40,50
voting_clf_pred = voting_clf.predict(X_test[start:end])


for classfier in [knn_clf,lr_clf,dt_clf]:
    #개별 학습 및 예측
    classfier.fit(X_train,y_train)
    pred = classfier.predict(X_test)
    acc_score = accuracy_score(y_test, pred)
    
    class_name = classfier.__class__.__name__ # 클래스의 이름 메타데이터 가져옴옴
    print(f'{class_name} 개별 정확도: {acc_score:.4f}')
    print(f'{class_name} 예측값 : {pred[start:end]}')

KNeighborsClassifier 개별 정확도: 0.9386
KNeighborsClassifier 예측값 : [0 1 0 1 0 0 1 1 1 0]
LogisticRegression 개별 정확도: 0.9561
LogisticRegression 예측값 : [0 1 0 1 0 0 1 1 1 0]
DecisionTreeClassifier 개별 정확도: 0.9035
DecisionTreeClassifier 예측값 : [0 1 0 1 0 0 1 1 1 0]


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [30]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score

knn_clf = KNeighborsClassifier()
lr_clf = LogisticRegression()
dt_clf = DecisionTreeClassifier(random_state=0)

voting_clf = VotingClassifier(
    estimators=[
        ('knn_clf', knn_clf),
        ('lr_clf', lr_clf),
        ('dt_clf', dt_clf)
    ],
    voting='soft' # 안쓸시 기본값으로 설정정
)

#앙상블 모델 학습
voting_clf.fit(X_train,y_train)

y_pred_train = voting_clf.predict(X_train)
acc_score_train = accuracy_score(y_train, y_pred_train)
print('학습 점수 : ', acc_score_train)

y_pred_test = voting_clf.predict(X_test)
acc_score_test = accuracy_score(y_test, y_pred_test)
print('테스트  평가 점수 : ' , acc_score_test)

학습 점수 :  0.9912087912087912
테스트  평가 점수 :  0.9649122807017544


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [31]:
# soft voting 작동 원리 == 각 예측기의 확률값 평균
start, end = 40,50

voting_clf_pred_proba = voting_clf.predict_proba(X_test[start:end])
print('앙상블 예측값:', voting_clf_pred_proba)


앙상블 예측값: [[5.76015572e-01 4.23984428e-01]
 [8.56528685e-04 9.99143471e-01]
 [9.99511890e-01 4.88110466e-04]
 [2.44233373e-04 9.99755767e-01]
 [9.13156643e-01 8.68433574e-02]
 [1.00000000e+00 6.67991763e-14]
 [6.73321806e-05 9.99932668e-01]
 [1.07083474e-02 9.89291653e-01]
 [6.09596883e-04 9.99390403e-01]
 [9.99899506e-01 1.00494433e-04]]


In [36]:
# hard votin 작동 원리 == 다수결
start, end = 40,50
voting_clf_pred_proba = voting_clf.predict_proba(X_test[start:end])

averages = np.full_like(voting_clf_pred_proba, 0)


for classfier in [knn_clf,lr_clf,dt_clf]:
    #개별 학습 및 예측
    classfier.fit(X_train,y_train)
    pred = classfier.predict(X_test)
    acc_score = accuracy_score(y_test, pred)
    pred_proba = classfier.predict_proba(X_test[start:end])
    
    averages += pred_proba
    
    class_name = classfier.__class__.__name__ # 클래스의 이름 메타데이터 가져옴옴
    # print(f'{class_name} 개별 정확도: {acc_score:.4f}')
    # print(f'{class_name} 예측값 : {pred_proba}')
    
calc_averages = averages / 3
print('각 모델별 예측값 평균 : \n',calc_averages)
print(np.array_equal(voting_clf_pred_proba, calc_averages))

각 모델별 예측값 평균 : 
 [[5.76015572e-01 4.23984428e-01]
 [8.56528685e-04 9.99143471e-01]
 [9.99511890e-01 4.88110466e-04]
 [2.44233373e-04 9.99755767e-01]
 [9.13156643e-01 8.68433574e-02]
 [1.00000000e+00 6.67991763e-14]
 [6.73321806e-05 9.99932668e-01]
 [1.07083474e-02 9.89291653e-01]
 [6.09596883e-04 9.99390403e-01]
 [9.99899506e-01 1.00494433e-04]]
True


STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


### Bagging
- Bootstrap Aggregation
- Bootstrap 방식의 샘플링 : 각 estimator 마다 훈련 데이터를 뽑을 때, 중복 값을 허용하는 방식
- 분류 모델의 경우, 각 tree(estimator)의 예측값을 다수결(hard voting)결정
- 회귀 모델의 경우, 각 tree(estimator)의 예측값을 평균내어 결정
- 기본적으로 100개의 tree 사용

In [39]:
from sklearn.ensemble import RandomForestClassifier

rt_clf = RandomForestClassifier(n_estimators=100,max_depth=5, random_state=0)
# 학습
rt_clf.fit(X_train,y_train)

y_pred_train = rt_clf.predict(X_train)
acc_score_train = accuracy_score(y_train, y_pred_train)
print('학습 점수: ', acc_score_train)

y_pred_test = rt_clf.predict(X_test)
acc_score_test = accuracy_score(y_test, y_pred_test)
print('테스트 평가 점수 : ', acc_score_test)



학습 점수:  0.9934065934065934
테스트 평가 점수 :  0.9473684210526315
