In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

### Ensemble - Voting

In [2]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split 

In [3]:
cancer = load_breast_cancer() 

x_train, x_test, y_train, y_test = train_test_split(cancer['data'],
                                                           cancer['target'],
                                                           stratify=cancer['target'],
                                                           random_state=0)
cancer['target_names']

array(['malignant', 'benign'], dtype='<U9')

In [4]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier

knn1 = KNeighborsClassifier(n_neighbors=5)
knn2 = KNeighborsClassifier(n_neighbors=3)
lr = LogisticRegression(max_iter=10000)
dt3 = DecisionTreeClassifier(max_depth=3)
dt5 = DecisionTreeClassifier(max_depth=5)

In [5]:
from sklearn.ensemble import VotingClassifier
hard = VotingClassifier([('knn1', knn1), ('knn2', knn2), ('lr', lr),
                        ('dt3', dt3), ('dt5', dt5)])
soft = VotingClassifier([('knn1', knn1), ('knn2', knn2), ('lr', lr),
                        ('dt3', dt3), ('dt5', dt5)], voting='soft')

In [6]:
names = ['hard', 'soft', 'knn1', 'knn2', 'lr', 'dt3', 'dt5']
for idx, model in enumerate([hard, soft, knn1, knn2, lr, dt3, dt5]):
    model.fit(x_train, y_train)
    name = names[idx]
    train_score = model.score(x_train, y_train)*100
    test_score = model.score(x_test, y_test)*100
    print(f'{name} Train Accuracy: {train_score:.2f}%')
    print(f'{name} Test Accuracy: {test_score:.2f}%')
    print()

hard Train Accuracy: 98.12%
hard Test Accuracy: 95.10%

soft Train Accuracy: 99.53%
soft Test Accuracy: 95.80%

knn1 Train Accuracy: 94.60%
knn1 Test Accuracy: 91.61%

knn2 Train Accuracy: 95.77%
knn2 Test Accuracy: 91.61%

lr Train Accuracy: 96.71%
lr Test Accuracy: 93.71%

dt3 Train Accuracy: 97.65%
dt3 Test Accuracy: 93.01%

dt5 Train Accuracy: 100.00%
dt5 Test Accuracy: 93.01%



### Ensemble - Bagging

In [7]:
from sklearn.ensemble import RandomForestClassifier
model = RandomForestClassifier(max_depth=5).fit(x_train, y_train)
model.score(x_train,y_train), model.score(x_test,y_test)

(1.0, 0.951048951048951)

### Ensemble - Boosting

In [8]:
from sklearn.ensemble import GradientBoostingClassifier
model = GradientBoostingClassifier().fit(x_train, y_train)
model.score(x_train,y_train), model.score(x_test,y_test)

(1.0, 0.958041958041958)

### Ensemble - Stacking

In [9]:
from sklearn.ensemble import StackingClassifier

estimators = [('rf', RandomForestClassifier()),
               ('gb', GradientBoostingClassifier())]

model = StackingClassifier(estimators=estimators,
                          final_estimator=LogisticRegression())

model.fit(x_train,y_train).score(x_test,y_test)

0.958041958041958

### Grid Search

In [12]:
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import GradientBoostingRegressor
boston = pd.read_csv('../data/boston.csv')
x = boston.iloc[:, :-1]
y = boston['price']

model = GradientBoostingRegressor(random_state=0)

params = {
    'n_estimators': [100, 300, 500],
    'learning_rate' : [0.01, 0.01, 0.1],
    'max_depth' : [3, 4, 5],
}

gs = GridSearchCV(model, params).fit(x, y)
gs

In [13]:
import pandas as pd
report = pd.DataFrame(gs.cv_results_)
report

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_learning_rate,param_max_depth,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.14293,0.011582,0.0,0.0,0.01,3,100,"{'learning_rate': 0.01, 'max_depth': 3, 'n_est...",0.62202,0.688112,0.220861,0.325642,-0.175309,0.336265,0.310029,24
1,0.391113,0.022348,0.000597,0.000796,0.01,3,300,"{'learning_rate': 0.01, 'max_depth': 3, 'n_est...",0.756358,0.865449,0.691776,0.48463,0.314734,0.622589,0.197722,9
2,0.643083,0.020727,0.0,0.0,0.01,3,500,"{'learning_rate': 0.01, 'max_depth': 3, 'n_est...",0.762956,0.867336,0.738485,0.535428,0.38054,0.656949,0.17511,4
3,0.163073,0.006892,0.000397,0.000487,0.01,4,100,"{'learning_rate': 0.01, 'max_depth': 4, 'n_est...",0.633338,0.622736,0.314555,0.374583,-0.250098,0.339023,0.32126,22
4,0.508461,0.021249,0.0,0.0,0.01,4,300,"{'learning_rate': 0.01, 'max_depth': 4, 'n_est...",0.756735,0.804976,0.656688,0.506602,0.123035,0.569607,0.245518,13
5,0.812511,0.014279,0.0,0.0,0.01,4,500,"{'learning_rate': 0.01, 'max_depth': 4, 'n_est...",0.768721,0.805867,0.693576,0.544797,0.22456,0.607504,0.211341,11
6,0.200673,0.016901,0.003748,0.005974,0.01,5,100,"{'learning_rate': 0.01, 'max_depth': 5, 'n_est...",0.634855,0.560458,0.290091,0.377049,-0.234089,0.325673,0.305927,26
7,0.575356,0.01195,0.006251,0.007656,0.01,5,300,"{'learning_rate': 0.01, 'max_depth': 5, 'n_est...",0.741932,0.763737,0.582341,0.47783,0.032308,0.519629,0.265394,20
8,1.03425,0.074271,0.000573,0.001146,0.01,5,500,"{'learning_rate': 0.01, 'max_depth': 5, 'n_est...",0.753964,0.763259,0.622998,0.498162,0.105823,0.548841,0.241819,16
9,0.146147,0.011246,0.000361,0.000723,0.01,3,100,"{'learning_rate': 0.01, 'max_depth': 3, 'n_est...",0.62202,0.688112,0.220861,0.325642,-0.175309,0.336265,0.310029,24


In [14]:
gs.best_params_

{'learning_rate': 0.1, 'max_depth': 3, 'n_estimators': 100}

In [15]:
gs.best_score_

0.6697600256867121

In [16]:
gs.best_estimator_

### Ensemble(Voting)을 활용한 분류 실습

In [17]:
# !pip install xgboost lightgbm 

In [18]:
import xgboost as xgb
from sklearn.metrics import r2_score

model = xgb.XGBClassifier(objective='reg:squarederror')
model.fit(x_train, y_train)

p_train = model.predict(x_train)
p_test = model.predict(x_test)

r2_score(y_train, p_train), r2_score(y_test, p_test)

(1.0, 0.7601677148846959)

In [19]:
import lightgbm as lgb

lgb_train = lgb.Dataset(x_train, y_train)
lgb_eval = lgb.Dataset(x_test, y_test, reference=lgb_train)

params = {
    'objective': 'regression',
}

model = lgb.train(params, lgb_train, valid_sets=lgb_eval)

p_train = model.predict(x_train, num_iteration=model.best_iteration)
p_test = model.predict(x_test, num_iteration=model.best_iteration)

r2_score(y_train, p_train), r2_score(y_test, p_test)

[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.000251 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 4272
[LightGBM] [Info] Number of data points in the train set: 426, number of used features: 30
[LightGBM] [Info] Start training from score 0.626761


(0.9918813367894838, 0.8215397238164611)