## Курс "Машинное обучение"

## Тема занятия: Ансамбли

Загружаем данные

In [None]:
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

In [None]:
# Load scikit-learn's bundled breast-cancer dataset; return_X_y=True makes the
# loader return the (features, labels) pair directly.
X, y = load_breast_cancer(return_X_y=True)

In [None]:
# Hold out 20% of the samples for evaluation; the fixed seed keeps the split
# identical between runs.
trainX, testX, trainY, testY = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)
# Report the shapes of both splits as a quick sanity check.
print(
    'Train: ', trainX.shape, trainY.shape,
    'Test: ', testX.shape, testY.shape,
)

In [None]:
# Show the held-out labels to see how the target classes are encoded.
print('Y labels: ', testY)

## Stacking

Подготовим (разные) классификаторы для ансамбля

In [None]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import Perceptron
from sklearn.svm import LinearSVC
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import StackingClassifier

In [None]:
# Heterogeneous base learners for the stacking ensemble, given as
# (name, estimator) pairs; every model uses the same fixed seed.
clsfrs = [
    ('dtr', DecisionTreeClassifier(random_state=42)),  # decision tree
    ('prc', Perceptron(random_state=42)),  # perceptron
    ('svm', LinearSVC(random_state=42)),  # linear support-vector classifier
]

Обучим стэкинг

In [None]:
# Stack the base learners under a logistic-regression meta-classifier and fit
# on the training split (fit() returns the estimator, so it can be chained).
st = StackingClassifier(
    estimators=clsfrs,
    final_estimator=LogisticRegression(random_state=42),
    n_jobs=-1,
).fit(trainX, trainY)
# Mean accuracy on the held-out split.
print('Stacking (dtr, prc, svm) score:', st.score(testX, testY))

Подготовим (похожие) классификаторы для ансамбля

In [None]:
# Three LinearSVC base learners that differ only in their seeds (421..423),
# named svm1..svm3.
clsfrs = [
    ('svm{}'.format(idx), LinearSVC(random_state=420 + idx))
    for idx in (1, 2, 3)
]
# Same stacking setup as before: logistic regression on top of the base models.
st = StackingClassifier(
    estimators=clsfrs,
    final_estimator=LogisticRegression(random_state=42),
    n_jobs=-1,
).fit(trainX, trainY)
print('Stacking (svm1, svm2, svm3) score:', st.score(testX, testY))

In [None]:
# Three decision trees that differ only in their seeds (421..423),
# named dtr1..dtr3.
clsfrs = [
    ('dtr{}'.format(idx), DecisionTreeClassifier(random_state=420 + idx))
    for idx in (1, 2, 3)
]
# Stack them under a logistic-regression meta-classifier and fit.
st = StackingClassifier(
    estimators=clsfrs,
    final_estimator=LogisticRegression(random_state=42),
    n_jobs=-1,
).fit(trainX, trainY)
print('Stacking (dtr1, dtr2, dtr3) score:', st.score(testX, testY))

In [None]:
# Three perceptrons that differ only in their seeds (421..423),
# named prc1..prc3.
clsfrs = [
    ('prc{}'.format(idx), Perceptron(random_state=420 + idx))
    for idx in (1, 2, 3)
]
# Stack them under a logistic-regression meta-classifier and fit.
st = StackingClassifier(
    estimators=clsfrs,
    final_estimator=LogisticRegression(random_state=42),
    n_jobs=-1,
).fit(trainX, trainY)
print('Stacking (prc1, prc2, prc3) score:', st.score(testX, testY))

## Bagging

Готовим классификаторы (деревья) для бэггинга

In [None]:
from sklearn.ensemble import BaggingClassifier

In [None]:
# Bagging over the default base learner (a decision tree) with 2 estimators.
# The `base_estimator` keyword was renamed to `estimator` in scikit-learn 1.2
# and removed in 1.4; passing None only selected the default anyway, so the
# argument is omitted to keep the cell working on every release.
bg = BaggingClassifier(n_estimators=2, n_jobs=-1, random_state=42)
bg.fit(trainX, trainY)

Смотрим на результат

In [None]:
# Evaluate the fitted bagging ensemble on the held-out split.
out_y = bg.predict(testX)  # predicted class labels
out_score = bg.score(testX, testY)  # classification quality (mean accuracy)

print(out_y)
print('Bagging (2 tree) score:', out_score)

Играемся с числом базовых классификаторов

In [None]:
# Bagging with 10 default (decision-tree) base learners.
# `base_estimator=None` is dropped: the keyword no longer exists in
# scikit-learn >= 1.4 and None was the default in older releases.
bg = BaggingClassifier(n_estimators=10, n_jobs=-1, random_state=42).fit(trainX, trainY)
print('Bagging (10 tree) score:', bg.score(testX, testY))

In [None]:
# Bagging with 100 default (decision-tree) base learners.
# `base_estimator=None` is dropped: the keyword no longer exists in
# scikit-learn >= 1.4 and None was the default in older releases.
bg = BaggingClassifier(n_estimators=100, n_jobs=-1, random_state=42).fit(trainX, trainY)
print('Bagging (100 tree) score:', bg.score(testX, testY))

Поменяем базовые классификаторы на SVM

In [None]:
# LinearSVC is also imported earlier in the notebook; the repeat is harmless.
from sklearn.svm import LinearSVC
# Seeded linear SVM used as the base learner in the bagging cells below.
clf = LinearSVC(random_state=42)

In [None]:
# Bagging over 10 copies of the linear SVM defined in `clf`.
# The base learner is passed positionally: the keyword was `base_estimator`
# before scikit-learn 1.2 and is `estimator` from 1.4 on, but it has always
# been the first positional parameter, so this form works on every release.
bg = BaggingClassifier(clf, n_estimators=10, n_jobs=-1, random_state=42).fit(trainX, trainY)
print('Bagging (10 svm) score:', bg.score(testX, testY))

In [None]:
# Bagging over 100 copies of the linear SVM defined in `clf`.
# The base learner is passed positionally: the keyword was `base_estimator`
# before scikit-learn 1.2 and is `estimator` from 1.4 on, but it has always
# been the first positional parameter, so this form works on every release.
bg = BaggingClassifier(clf, n_estimators=100, n_jobs=-1, random_state=42).fit(trainX, trainY)
print('Bagging (100 svm) score:', bg.score(testX, testY))

## AdaBoost

Поработаем с деревьями в качестве базовых классификаторов для AdaBoost

In [None]:
from sklearn.ensemble import AdaBoostClassifier

In [None]:
# AdaBoost with 10 default base learners (depth-1 decision trees).
# `base_estimator=None` is dropped: the keyword was renamed to `estimator` in
# scikit-learn 1.2 and removed in 1.4, and None was the default anyway.
ada = AdaBoostClassifier(n_estimators=10, random_state=42).fit(trainX, trainY)
print('AdaBoost (10 tree) score:', ada.score(testX, testY))

Посмотрим на историю обучения (как добавление базовых классификаторов улучшает ситуацию)

In [None]:
s_hist = list(ada.staged_score(testX, testY)) # посмотрим на историю обучения
import matplotlib.pyplot as plt 
%matplotlib inline 
plt.plot(s_hist)
plt.xlabel('Boosting iteration')
plt.ylabel('Accuracy')

Увеличим число деревьев

In [None]:
# AdaBoost with 100 default base learners (depth-1 decision trees).
# `base_estimator=None` is dropped: the keyword was renamed to `estimator` in
# scikit-learn 1.2 and removed in 1.4, and None was the default anyway.
ada = AdaBoostClassifier(n_estimators=100, random_state=42).fit(trainX, trainY)
print('AdaBoost (100 tree) score:', ada.score(testX, testY))
# Instructor's remark; it is printed unconditionally, not derived from the scores.
print("Overfit!")

Поменяем базовые классификаторы на логистическую регрессию

In [None]:
# LogisticRegression is also imported earlier in the notebook; the repeat is harmless.
from sklearn.linear_model import LogisticRegression
# Seeded logistic regression (liblinear solver) used as the AdaBoost base
# learner in the cells below.
clf = LogisticRegression(random_state=42, solver='liblinear')

In [None]:
# AdaBoost over 10 logistic-regression base learners (`clf`).
# The base learner is passed positionally: the keyword was `base_estimator`
# before scikit-learn 1.2 and is `estimator` from 1.4 on, but it has always
# been the first positional parameter, so this form works on every release.
ada = AdaBoostClassifier(clf, n_estimators=10, random_state=42).fit(trainX, trainY)
print('AdaBoost (10 logr) score:', ada.score(testX, testY))

In [None]:
# AdaBoost over 100 logistic-regression base learners (`clf`).
# The base learner is passed positionally: the keyword was `base_estimator`
# before scikit-learn 1.2 and is `estimator` from 1.4 on, but it has always
# been the first positional parameter, so this form works on every release.
ada = AdaBoostClassifier(clf, n_estimators=100, random_state=42).fit(trainX, trainY)
print('AdaBoost (100 logr) score:', ada.score(testX, testY))

In [None]:
# AdaBoost over 1000 logistic-regression base learners (`clf`).
# The base learner is passed positionally: the keyword was `base_estimator`
# before scikit-learn 1.2 and is `estimator` from 1.4 on, but it has always
# been the first positional parameter, so this form works on every release.
ada = AdaBoostClassifier(clf, n_estimators=1000, random_state=42).fit(trainX, trainY)
print('AdaBoost (1000 logr) score:', ada.score(testX, testY))

## Gradient Boosting

Градиентный бустинг общего вида (на деревьях)

In [None]:
from sklearn.ensemble import GradientBoostingClassifier

In [None]:
# Gradient boosting with 10 trees and a 0.1 learning rate.
gb = GradientBoostingClassifier(
    n_estimators=10,
    learning_rate=0.1,
    random_state=42,
)
gb.fit(trainX, trainY)
# Mean accuracy on the held-out split.
print('Gradient Boosting (10 tree) score:', gb.score(testX, testY))

In [None]:
# Gradient boosting with 100 trees and a 0.1 learning rate.
gb = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=0.1,
    random_state=42,
)
gb.fit(trainX, trainY)
# Mean accuracy on the held-out split.
print('Gradient Boosting (100 tree) score:', gb.score(testX, testY))

----

А теперь посмотрим на сторонние успешные реализации, которые можно поставить с помощью:
* pip/conda install xgboost
* pip/conda install lightgbm
* pip/conda install catboost

## XGBoost

In [None]:
from xgboost import XGBClassifier

In [None]:
# XGBoost with 10 trees, using all available cores.
xgb = XGBClassifier(
    n_estimators=10,
    learning_rate=0.1,
    n_jobs=-1,
    random_state=42,
)
xgb.fit(trainX, trainY)
print('XGB (10 tree) score:', xgb.score(testX, testY))

In [None]:
# XGBoost with 100 trees, using all available cores.
xgb = XGBClassifier(
    n_estimators=100,
    learning_rate=0.1,
    n_jobs=-1,
    random_state=42,
)
xgb.fit(trainX, trainY)
print('XGB (100 tree) score:', xgb.score(testX, testY))

In [None]:
# XGBoost with 1000 trees, using all available cores.
xgb = XGBClassifier(
    n_estimators=1000,
    learning_rate=0.1,
    n_jobs=-1,
    random_state=42,
)
xgb.fit(trainX, trainY)
print('XGB (1000 tree) score:', xgb.score(testX, testY))

## LightGBM

In [None]:
from lightgbm import LGBMClassifier

In [None]:
# LightGBM with 10 trees, using all available cores.
# The model is bound to `lgb` rather than `xgb` (a leftover from the XGBoost
# section) so the name matches the library being demonstrated.
lgb = LGBMClassifier(learning_rate=0.1, n_estimators=10, n_jobs=-1, random_state=42).fit(trainX, trainY)
print('LGB (10 tree) score:', lgb.score(testX, testY))

In [None]:
# LightGBM with 100 trees, using all available cores.
# The model is bound to `lgb` rather than `xgb` (a leftover from the XGBoost
# section) so the name matches the library being demonstrated.
lgb = LGBMClassifier(learning_rate=0.1, n_estimators=100, n_jobs=-1, random_state=42).fit(trainX, trainY)
print('LGB (100 tree) score:', lgb.score(testX, testY))

In [None]:
# LightGBM with 1000 trees, using all available cores.
# The model is bound to `lgb` rather than `xgb` (a leftover from the XGBoost
# section) so the name matches the library being demonstrated.
lgb = LGBMClassifier(learning_rate=0.1, n_estimators=1000, n_jobs=-1, random_state=42).fit(trainX, trainY)
print('LGB (1000 tree) score:', lgb.score(testX, testY))

In [None]:
# LightGBM with 10000 trees, using all available cores.
# The printed label previously said "2000 tree" although 10000 estimators are
# trained; it now reports the value actually used. The model is also bound to
# `lgb` rather than `xgb` (a leftover from the XGBoost section).
lgb = LGBMClassifier(learning_rate=0.1, n_estimators=10000, n_jobs=-1, random_state=42).fit(trainX, trainY)
print('LGB (10000 tree) score:', lgb.score(testX, testY))

## CatBoost

In [None]:
from catboost import CatBoostClassifier

In [None]:
# CatBoost with 10 trees; thread_count=-1 uses all cores and the 'Silent'
# logging level suppresses the per-iteration training output.
cb = CatBoostClassifier(
    n_estimators=10,
    learning_rate=0.1,
    thread_count=-1,
    random_state=42,
    logging_level='Silent',
)
cb.fit(trainX, trainY)
print('CB (10 tree) score:', cb.score(testX, testY))

In [None]:
# CatBoost with 100 trees; thread_count=-1 uses all cores and the 'Silent'
# logging level suppresses the per-iteration training output.
cb = CatBoostClassifier(
    n_estimators=100,
    learning_rate=0.1,
    thread_count=-1,
    random_state=42,
    logging_level='Silent',
)
cb.fit(trainX, trainY)
print('CB (100 tree) score:', cb.score(testX, testY))