In [None]:
#################################################
# Voting 코드 summary
#################################################

import pandas as pd

from sklearn.ensemble import VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

import warnings 
warnings.filterwarnings('ignore')

# Load the breast-cancer dataset and wrap the feature matrix in a DataFrame
# whose columns carry the feature names.
cancer = load_breast_cancer()
data_df = pd.DataFrame(cancer.data, columns=cancer.feature_names)
# A bare expression is only rendered when it is the last statement of a cell.
# This preview sits in the middle of the cell, so show it explicitly
# (display is provided as a builtin by the IPython/Jupyter kernel).
display(data_df.head(3))



# Base learners for the ensemble.
# NOTE: the default lbfgs solver typically stops at its iteration cap before
# converging on these unscaled features, and the resulting ConvergenceWarning
# is hidden by the warnings filter set earlier in this cell. liblinear is
# well suited to this small binary problem.
lr_clf = LogisticRegression(solver='liblinear')
knn_clf = KNeighborsClassifier(n_neighbors=8)

# Soft voting: the ensemble predicts from the averaged class probabilities of
# its members. Each member is registered under a short label ('LR', 'KNN').
vo_clf = VotingClassifier(estimators=[('LR', lr_clf), ('KNN', knn_clf)],
                          voting='soft')
X_train, X_test, y_train, y_test = train_test_split(cancer.data, cancer.target, test_size=.2, random_state=156)

# Train the voting ensemble and score it on the held-out 20%.
vo_clf.fit(X_train, y_train)
pred = vo_clf.predict(X_test)
accuracy = accuracy_score(y_test, pred)
print('VotingClassifier의 정확도 {:.4f}'.format(accuracy))

# Fit and score each base learner on its own, for comparison with the ensemble.
classifiers = [lr_clf, knn_clf]
for classifier in classifiers:
    classifier.fit(X_train, y_train)
    pred = classifier.predict(X_test)
    # Label the line with the class name only; formatting the estimator object
    # itself prints its full repr (including constructor parameters).
    class_name = classifier.__class__.__name__
    print('{}의 정확도: {:.4f}'.format(class_name, accuracy_score(y_test, pred)))

In [None]:
#################################################
# RandomForest (Bagging) 코드 summary
#################################################

# 기본 코드
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
import pandas as pd
import warnings 
warnings.filterwarnings('ignore')

# Baseline random forest: 100 trees, depth capped at 8, fixed seed.
rf_clf = RandomForestClassifier(max_depth=8, n_estimators=100, random_state=0)
# fit() returns the fitted estimator, so the prediction is chained onto it.
pred = rf_clf.fit(X_train, y_train).predict(X_test)
accuracy = accuracy_score(y_test, pred)

# Hyper-parameter search with GridSearchCV
from sklearn.model_selection import GridSearchCV

# An integer min_samples_split must be >= 2 (a node needs at least two samples
# to be split), so the value 1 can only belong to min_samples_leaf. With 1 in
# min_samples_split every such candidate fails to fit and is silently scored
# as NaN because warnings are suppressed.
params = {
    'max_depth': [8, 16, 24],
    'min_samples_split': [2, 8, 16],
    'min_samples_leaf': [1, 6, 12],
}
rf_clf = RandomForestClassifier(n_estimators=100, random_state=0, n_jobs=-1)
grid_cv = GridSearchCV(rf_clf, param_grid=params, cv=2, n_jobs=-1)
grid_cv.fit(X_train, y_train)

# Bare expressions in the middle of a cell are not rendered; print the results.
print('최적 하이퍼 파라미터:\n', grid_cv.best_params_)
print('최고 예측 정확도: {0:.4f}'.format(grid_cv.best_score_))

# Retrain a model with the best parameters found by GridSearchCV.
# The parameters are read from grid_cv.best_params_ instead of being copied by
# hand, so this model always matches what the search actually selected.
rf_clf1 = RandomForestClassifier(n_estimators=100, random_state=0,
                                 **grid_cv.best_params_)
rf_clf1.fit(X_train, y_train)
pred = rf_clf1.predict(X_test)
# Store the score under `accuracy` so the earlier baseline value does not
# linger under that name.
accuracy = accuracy_score(y_test, pred)
print(accuracy)

# feature_importances_확인하기
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

ftr_importances_values = rf_clf1.feature_importances_
ftr_importances = pd.Series(feature_importances_values, index=X_train.columns)
ftr_top20 = ftr_importances.sort_values(ascending=False)[:20]

plt.figure(figsize=(8,6))
plt.title('Feature importance Top 20')
sns.barplot(
    x=ftr_top20,
    y=ftr_top20.index
)
plt.show()

In [None]:
#################################################
# GBM 코드 summary
#################################################

from sklearn.ensemble import GradientBoostingClassifier
import time
import warnings
warnings.filterwarnings('ignore')

# Train/test split supplied by get_human_dataset (defined outside this cell).
X_train, X_test, y_train, y_test = get_human_dataset()

# Record the start time so the GBM run can be timed.
start_time = time.time()

gb_clf = GradientBoostingClassifier(random_state=0)
# fit() returns the fitted estimator, so the prediction is chained onto it.
gb_pred = gb_clf.fit(X_train, y_train).predict(X_test)
gb_accuracy = accuracy_score(y_test, gb_pred)

print(f'GBM 정확도: {gb_accuracy:.4f}')
elapsed = time.time() - start_time
print(f"GBM 수행 시간: {elapsed:.1f} 초 ")

# Search for hyper-parameters with GridSearchCV
from sklearn.model_selection import GridSearchCV

# Candidate grid: 2 values of n_estimators x 2 values of learning_rate.
params = dict(n_estimators=[100, 500], learning_rate=[0.05, 0.1])
grid_cv = GridSearchCV(gb_clf, param_grid=params, cv=2, verbose=1)
grid_cv.fit(X_train, y_train)
print('최적 하이퍼 파라미터:\n', grid_cv.best_params_)
print(f'최고 예측 정확도: {grid_cv.best_score_:.4f}')

# Score the best estimator found by the search on the test set.
gb_pred = grid_cv.best_estimator_.predict(X_test)
gb_accuracy = accuracy_score(y_test, gb_pred)
print(f'GBM 정확도: {gb_accuracy:.4f}')