Voting -- > 같은샘플, 다른 분류기 (회기 voting, 분류 voting

# VotingRegression (보팅 회귀)
- 여러 회귀 모형 알고리즘을 결합하고 평균 예측값을 반환한다.

## 패키지로딩

In [2]:
from sklearn.datasets import load_boston
from sklearn.linear_model import Lasso, Ridge, LinearRegression
from sklearn.ensemble import VotingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

import numpy as np
import pandas as pd

In [3]:
import warnings
warnings.filterwarnings(action='ignore')

## 데이터 로딩 및 분할

In [11]:
x, y= load_boston(return_X_y=True)
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.3, random_state=0)

## 모델 생성 및 평가

In [12]:
lasso = Lasso(alpha=0.05)
ridge = Ridge(alpha=1)
linear = LinearRegression()

vo_r = VotingRegressor(estimators=[('Lasso',lasso),('Ridge',ridge),('Linear', linear)])

vo_r.fit(x_train,y_train)

VotingRegressor(estimators=[('Lasso', Lasso(alpha=0.05)),
                            ('Ridge', Ridge(alpha=1)),
                            ('Linear', LinearRegression())])

In [13]:
r_squre = vo_r.score(x_test,y_test)
print(f'결정계수: {r_squre:.3f}')

결정계수: 0.667


In [14]:
y_hat = vo_r.predict(x_test)
rmse = np.sqrt(mean_squared_error(y_test, y_hat))
print(f'RSME: {rmse:.3f}')

RSME: 5.270


# VotingClassifier (보팅 분류)

## 패키지 로딩

In [15]:
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score

import numpy as np
import pandas as pd

In [25]:
x, y = load_breast_cancer(return_X_y=True)
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.3, random_state=0)

## 모델 생성 및 검증

In [26]:
logistic = LogisticRegression(max_iter=3000)
knn = KNeighborsClassifier()

# voting regression은 회기 분석이라서 voring 파라메터가 없음
# voting = hard or soft, default =hard
vo_c = VotingClassifier(estimators=[('Logisitic',logistic), ('KNN',knn)], voting='soft')
vo_c.fit(x_train, y_train)

y_hat = vo_c.predict(x_test)
print(f'정확도: {accuracy_score(y_test, y_hat):.3f}')
print(f'AUC: {roc_auc_score(y_test, vo_c.predict_proba(x_test)[:,1]):.3f}')

정확도: 0.965
AUC: 0.994


# Gradient Boossting Classifier

## 패키지 로딩

In [27]:
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score

## 데이터 로딩 및 분할

In [28]:
x, y = load_breast_cancer(return_X_y=True)
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.3, random_state=0)

## 모델 생성 및 평가
- GradientBoostingClassifier
    - 랜덤포레스트와 같이 의사결정 나무 모델을 부스팅 방법으로 활용하는 모델
    - 이전 예측기가 만든 잔여오차(residual error)에 새로운 예측기를 학습시킴

In [30]:
gb_c = GradientBoostingClassifier()
gb_c.fit(x_train, y_train)

y_hat = gb_c.predict(x_test)
print(f'정확도: {accuracy_score(y_test, y_hat):.3f}')
print(f'AUC: {roc_auc_score(y_test, vo_c.predict_proba(x_test)[:,1]):.3f}')

정확도: 0.977
AUC: 0.994


## 하이퍼 파라메터 튜닝

In [None]:
# gradient Boost도 의사결정나무를 이용하기 때문에 같은 파라미터가 있다

In [31]:
from sklearn.model_selection import GridSearchCV
gbc_params = {'n_estimators':[100,200],
            'max_depth':[6,8,10,12],
            'min_samples_leaf':[3,5,7,19],
            'min_samples_split':[3,5,7,10]}

gbc_gs = GridSearchCV(gb_c, gbc_params, scoring= 'accuracy', n_jobs=-1)#전체 코어사용
gbc_gs.fit(x_train, y_train)

GridSearchCV(estimator=GradientBoostingClassifier(), n_jobs=-1,
             param_grid={'max_depth': [6, 8, 10, 12],
                         'min_samples_leaf': [3, 5, 7, 19],
                         'min_samples_split': [3, 5, 7, 10],
                         'n_estimators': [100, 200]},
             scoring='accuracy')

In [32]:
print('최적 파라메터:', gbc_gs.best_params_)
print('예측 정확도:', gbc_gs.best_score_)

최적 파라메터: {'max_depth': 12, 'min_samples_leaf': 7, 'min_samples_split': 3, 'n_estimators': 200}
예측 정확도: 0.9648417721518987


## Gradient Boosting Regressor (부스팅 회귀)

In [33]:
from sklearn.datasets import load_diabetes
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

import numpy as np

## 데이터 로딩 및 분할

In [34]:
x, y = load_diabetes(return_X_y= True)
x_train, x_test, y_train, y_test = train_test_split(x,y,test_size=0.3, random_state=0)

In [35]:
gb_r = GradientBoostingClassifier(n_estimators=120, max_depth=6)
gb_r.fit(x_train, y_train)

GradientBoostingClassifier(max_depth=6, n_estimators=120)

In [37]:
y_hat = gb_r.predict(x_test)
r_square = gb_r.score(x_test, y_test)
rmse = np.sqrt(mean_squared_error(y_test, y_hat))
print(f'결정계수:{r_square:3f}')
print(f'RMSE: {rmse:.3f}')

결정계수:0.000000
RMSE: 81.517
