# Семинар 8 - Бустинг

В качестве источника используется:
- [Глава 7](https://github.com/ageron/handson-ml/blob/master/07_ensemble_learning_and_random_forests.ipynb) из книги [Aurélien Géron](https://github.com/ageron)
- материалы [mlcourse.ai](https://mlcourse.ai) от [Юрия Кашницкого](https://yorko.github.io) и [OpenDataScience](https://ods.ai)

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap

from sklearn.model_selection import train_test_split
from sklearn.datasets import make_moons

%matplotlib inline

# Single seed reused across the notebook; also seeds NumPy's global RNG so the
# randomly generated data is reproducible.
seed = 42
np.random.seed(seed)

In [None]:
def plot_decision_boundary(clf, X, y, axes=(-1.5, 2.5, -1, 1.5), alpha=0.5, contour=True):
    """Draw the decision regions of a fitted 2-D classifier over the data points.

    Parameters
    ----------
    clf : object
        Fitted estimator exposing ``predict`` for two-column input.
    X : ndarray
        Samples; columns 0 and 1 are used as the plot coordinates.
    y : ndarray
        Labels; samples labelled 0 and 1 are drawn with different markers.
    axes : sequence of 4 numbers
        Plot limits ``(x1_min, x1_max, x2_min, x2_max)``. The default is an
        immutable tuple so it cannot be shared and mutated between calls.
    alpha : float
        Opacity of the scattered data points.
    contour : bool
        When true, contour lines are drawn on top of the filled regions.
    """
    # evaluate the classifier on a regular 100 x 100 grid covering the plot area
    x1s = np.linspace(axes[0], axes[1], 100)
    x2s = np.linspace(axes[2], axes[3], 100)
    x1, x2 = np.meshgrid(x1s, x2s)
    X_new = np.c_[x1.ravel(), x2.ravel()]
    y_pred = clf.predict(X_new).reshape(x1.shape)

    # filled regions, optionally outlined with darker contour lines
    custom_cmap = ListedColormap(['#fafab0','#9898ff','#a0faa0'])
    plt.contourf(x1, x2, y_pred, alpha=0.3, cmap=custom_cmap)
    if contour:
        custom_cmap2 = ListedColormap(['#7d7d58','#4c4c7f','#507d50'])
        plt.contour(x1, x2, y_pred, cmap=custom_cmap2, alpha=0.8)

    # class 0 as yellow circles, class 1 as blue squares
    plt.plot(X[:, 0][y==0], X[:, 1][y==0], "yo", alpha=alpha)
    plt.plot(X[:, 0][y==1], X[:, 1][y==1], "bs", alpha=alpha)

    plt.axis(axes)
    plt.xlabel(r"$x_1$", fontsize=18)
    plt.ylabel(r"$x_2$", fontsize=18, rotation=0)

 Загрузим синтетический набор данных "Луны"

In [None]:
# Generate 500 noisy "moons" samples and split them into train and test parts
X, y = make_moons(n_samples=500, noise=0.30, random_state=seed)

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=seed)

# AdaBoost

In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.tree import DecisionTreeClassifier

In [None]:
# Boosting over a decision stump (a tree of depth 1)
# NOTE(review): algorithm="SAMME.R" is deprecated in recent scikit-learn
# releases — confirm it is still accepted by the installed version.
ada_clf = AdaBoostClassifier(DecisionTreeClassifier(max_depth=1),
                             n_estimators=200, algorithm="SAMME.R",
                             learning_rate=0.5, random_state=seed)
ada_clf.fit(X_train, y_train)

In [None]:
# Show the AdaBoost decision regions over the full moons dataset
plot_decision_boundary(ada_clf, X, y)
plt.show()

Давайте попробуем посмотреть, что происходит по шагам. В качестве иллюстрации будем использовать SVM.

In [None]:
from sklearn.svm import SVC

In [None]:
# Step-by-step illustration: for each learning rate, five weighted SVMs are
# fitted one after another and all of their boundaries share one subplot.
m = len(X_train)

plt.figure(figsize=(20, 4))

for subplot, learning_rate in ((121, 1), (122, 0.5)):
    # every training sample starts with the same weight
    sample_weights = np.ones(m)
    plt.subplot(subplot)

    for i in range(5):
        svm_clf = SVC(kernel="rbf", C=0.05, gamma="auto", random_state=seed)
        svm_clf.fit(X_train, y_train, sample_weight=sample_weights)

        # up-weight the samples this model misclassified before the next fit
        y_pred = svm_clf.predict(X_train)
        misclassified = y_pred != y_train
        sample_weights[misclassified] *= (1 + learning_rate)

        plot_decision_boundary(svm_clf, X, y, alpha=0.2)
        plt.title(f"learning_rate = {learning_rate}", fontsize=16)

    if subplot == 121:
        # number the five successive boundaries on the left-hand plot
        for text_x, text_y, text in ((-0.7, -0.65, "1"),
                                     (-0.6, -0.10, "2"),
                                     (-0.5,  0.10, "3"),
                                     (-0.4,  0.55, "4"),
                                     (-0.3,  0.90, "5")):
            plt.text(text_x, text_y, text, fontsize=14)

plt.show()

# Gradient Boosting
Для начала посмотрим на игрушечных данных

In [None]:
# Toy 1-D regression task: a parabola 3*x^2 on [-0.5, 0.5) plus Gaussian noise
X = np.random.rand(100, 1) - 0.5
y = 3 * X[:, 0] ** 2 + 0.05 * np.random.randn(100)

plt.scatter(X, y)
plt.show()

Построим решающее дерево

In [None]:
from sklearn.tree import DecisionTreeRegressor

In [None]:
# First tree: fitted directly to the targets
tree_reg1 = DecisionTreeRegressor(max_depth=2, random_state=seed)
tree_reg1.fit(X, y)

In [None]:
# Compute the residuals of the first tree's predictions and fit a second tree to them
y2 = y - tree_reg1.predict(X)
tree_reg2 = DecisionTreeRegressor(max_depth=2, random_state=seed)
tree_reg2.fit(X, y2)

In [None]:
# Compute the residuals left after the second tree and fit a third tree to them
y3 = y2 - tree_reg2.predict(X)
tree_reg3 = DecisionTreeRegressor(max_depth=2, random_state=seed)
tree_reg3.fit(X, y3)

In [None]:
# Define a new point that is not part of the training sample
X_new = np.array([[0.8]])

Ошибка на i-м объекте определяется произведением веса этого объекта (накопленного по ошибкам всех предыдущих моделей ансамбля) на ошибку последней модели ансамбля.

In [None]:
# Ensemble prediction for X_new: the sum of the predictions of the three trees
y_pred = sum(tree.predict(X_new) for tree in (tree_reg1, tree_reg2, tree_reg3))

In [None]:
def plot_predictions(regressors, X, y, axes, label=None,
                     style="r-", data_style="b.", data_label=None):
    """Plot the summed prediction of several regressors over the data points.

    ``regressors`` is an iterable of fitted models exposing ``predict``; their
    predictions on a 500-point grid spanning ``axes[0]``..``axes[1]`` are added
    together. ``X[:, 0]`` against ``y`` is drawn with ``data_style``, the summed
    curve with ``style``. A legend is shown when either label is given, and the
    plot limits are set to ``axes``.
    """
    grid = np.linspace(axes[0], axes[1], 500)
    grid_column = grid.reshape(-1, 1)

    # accumulate from 0 exactly like the built-in sum() would
    total_prediction = 0
    for model in regressors:
        total_prediction = total_prediction + model.predict(grid_column)

    plt.plot(X[:, 0], y, data_style, label=data_label)
    plt.plot(grid, total_prediction, style, linewidth=2, label=label)

    wants_legend = label or data_label
    if wants_legend:
        plt.legend(loc="upper center", fontsize=16)
    plt.axis(axes)

In [None]:
# 3 x 2 grid: the left column shows each individual tree against the data it
# was fitted to, the right column shows the running sum of the trees against y.
plt.figure(figsize=(11,11))

# row 1, left: first tree against the original targets
plt.subplot(321)
plot_predictions([tree_reg1], X, y, axes=[-0.5, 0.5, -0.1, 0.8], 
                 label="$h_1(x_1)$", style="g-", data_label="Training set")
plt.ylabel("$y$", fontsize=16, rotation=0)
plt.title("Residuals and tree predictions", fontsize=16)

# row 1, right: ensemble made of the first tree only
plt.subplot(322)
plot_predictions([tree_reg1], X, y, axes=[-0.5, 0.5, -0.1, 0.8], 
                 label="$h(x_1) = h_1(x_1)$", data_label="Training set")
plt.ylabel("$y$", fontsize=16, rotation=0)
plt.title("Ensemble predictions", fontsize=16)

# row 2, left: second tree against the residuals of the first
plt.subplot(323)
plot_predictions([tree_reg2], X, y2, axes=[-0.5, 0.5, -0.5, 0.5], 
                 label="$h_2(x_1)$", style="g-", data_style="k+", data_label="Residuals")
plt.ylabel("$y - h_1(x_1)$", fontsize=16)

# row 2, right: sum of the first two trees
plt.subplot(324)
plot_predictions([tree_reg1, tree_reg2], X, y, axes=[-0.5, 0.5, -0.1, 0.8], 
                 label="$h(x_1) = h_1(x_1) + h_2(x_1)$")
plt.ylabel("$y$", fontsize=16, rotation=0)

# row 3, left: third tree against the residuals left after two trees
plt.subplot(325)
plot_predictions([tree_reg3], X, y3, axes=[-0.5, 0.5, -0.5, 0.5], 
                 label="$h_3(x_1)$", style="g-", data_style="k+")
plt.ylabel("$y - h_1(x_1) - h_2(x_1)$", fontsize=16)
plt.xlabel("$x_1$", fontsize=16)

# row 3, right: sum of all three trees
plt.subplot(326)
plot_predictions([tree_reg1, tree_reg2, tree_reg3], X, y, axes=[-0.5, 0.5, -0.1, 0.8], 
                 label="$h(x_1) = h_1(x_1) + h_2(x_1) + h_3(x_1)$")
plt.xlabel("$x_1$", fontsize=16)
plt.ylabel("$y$", fontsize=16, rotation=0)

plt.show()

#### Попробуем реализацию градиентного бустинга из пакета sklearn

In [None]:
from sklearn.ensemble import GradientBoostingRegressor

Посмотрим, как изменятся результаты при разном количестве базовых моделей (деревьев)

In [None]:
# Small ensemble: 3 trees of depth 2 with learning_rate=1
gbrt = GradientBoostingRegressor(max_depth=2,
                                 n_estimators=3,
                                 learning_rate=1,
                                 random_state=seed)
gbrt.fit(X, y)

In [None]:
# Same depth and learning rate as above, but with 200 trees
gbrt_slow = GradientBoostingRegressor(max_depth=2, 
                                      n_estimators=200,
                                      learning_rate=1,
                                      random_state=seed)
gbrt_slow.fit(X, y)

In [None]:
# Side-by-side comparison of the two ensembles; each title reports the
# learning rate and number of trees of the model shown.
plt.figure(figsize=(18, 4))

plt.subplot(121)
plot_predictions([gbrt], X, y, axes=[-0.5, 0.5, -0.1, 0.8], 
                 label="Ensemble predictions")
plt.title("learning_rate={}, n_estimators={}".format(gbrt.learning_rate, gbrt.n_estimators), fontsize=14)

plt.subplot(122)
plot_predictions([gbrt_slow], X, y, axes=[-0.5, 0.5, -0.1, 0.8],
                 label="Ensemble predictions")
plt.title("learning_rate={}, n_estimators={}".format(gbrt_slow.learning_rate, gbrt_slow.n_estimators), fontsize=14)

plt.show()

Посмотрим, как изменятся результаты при разной глубине базовых моделей (деревьев)

In [None]:
# Shallow base trees (depth 2), 3 trees
gbrt = GradientBoostingRegressor(max_depth=2, 
                                 n_estimators=3,
                                 learning_rate=1.0,
                                 random_state=seed)
gbrt.fit(X, y)

# Deep base trees (depth 10), same number of trees and learning rate;
# the name gbrt_slow is reused from the previous experiment
gbrt_slow = GradientBoostingRegressor(max_depth=10,
                                      n_estimators=3,
                                      learning_rate=1.0,
                                      random_state=seed)
gbrt_slow.fit(X, y)

In [None]:
# Side-by-side comparison of the two ensembles; each title reports the
# learning rate and maximum tree depth of the model shown.
plt.figure(figsize=(18, 4))

plt.subplot(121)
plot_predictions([gbrt], X, y, axes=[-0.5, 0.5, -0.1, 0.8], 
                 label="Ensemble predictions")
plt.title("learning_rate={}, max_depth={}".format(gbrt.learning_rate, gbrt.max_depth), fontsize=14)

plt.subplot(122)
plot_predictions([gbrt_slow], X, y, axes=[-0.5, 0.5, -0.1, 0.8],
                 label="Ensemble predictions")
plt.title("learning_rate={}, max_depth={}".format(gbrt_slow.learning_rate, gbrt_slow.max_depth), fontsize=14)

plt.show()

Рассмотрим методы борьбы с переобучением

#### Посмотрим, как ведет себя ошибка на каждой итерации

In [None]:
from sklearn.metrics import mean_squared_error

In [None]:
# Hold out a validation part of the toy data
X_train, X_val, y_train, y_val = train_test_split(X, y, random_state=seed)

# Fit a 200-tree ensemble; the shared `seed` is used instead of a literal 42
# so every model in the notebook is controlled by the same setting.
gbrt = GradientBoostingRegressor(max_depth=2, n_estimators=200, random_state=seed)
gbrt.fit(X_train, y_train)

In [None]:
# Validation MSE after every boosting stage: errors[i] is the error of the
# ensemble that uses the first i + 1 trees.
errors = [mean_squared_error(y_val, y_pred)
          for y_pred in gbrt.staged_predict(X_val)]

# Stage with the lowest validation error. np.argmin returns a 0-based index
# into `errors`, i.e. one less than the corresponding number of trees.
best_n_estimators = np.argmin(errors)
print(best_n_estimators)

# Index of the final stage, derived from the data rather than hard-coded as 200 - 1
last_estimator = len(errors) - 1
print(f'Error on {best_n_estimators} model - {errors[best_n_estimators]}')
print(f'Error on {last_estimator} (last) model - {errors[last_estimator]}')
print(f'Error difference - {errors[best_n_estimators] - errors[last_estimator]}')

In [None]:
# Refit the ensemble with the "optimal" number of trees.
# best_n_estimators is the 0-based index of the best stage, so the matching
# tree count is index + 1 (this also avoids n_estimators=0 when stage 0 wins).
gbrt_best = GradientBoostingRegressor(max_depth=2,
                                      n_estimators=int(best_n_estimators) + 1,
                                      random_state=seed)
gbrt_best.fit(X_train, y_train)
min_error = np.min(errors)

In [None]:
plt.figure(figsize=(20, 4))

# The x axis is labelled "Number of trees", so stage index i is drawn at i + 1.
n_stages = len(errors)
tree_counts = np.arange(1, n_stages + 1)
best_tree_count = best_n_estimators + 1

# Left: validation error per ensemble size, with the minimum marked
plt.subplot(121)
plt.plot(tree_counts, errors, "b.-")
plt.plot([best_tree_count, best_tree_count], [0, min_error], "k--")
plt.plot([0, n_stages], [min_error, min_error], "k--")
plt.plot(best_tree_count, min_error, "ko")
plt.text(best_tree_count, min_error * 1.2, "Minimum", ha="center", fontsize=14)
plt.axis([0, n_stages, 0, 0.01])
plt.xlabel("Number of trees")
plt.title("Validation error", fontsize=14)

# Right: predictions of the refitted model; the title reports the number of
# trees that model was actually built with.
plt.subplot(122)
plot_predictions([gbrt_best], X, y, axes=[-0.5, 0.5, -0.1, 0.8])
plt.title("Best model (%d trees)" % gbrt_best.n_estimators, fontsize=14)

plt.show()

## LightGBM

In [None]:
import lightgbm as lgb

In [None]:
# Mount Google Drive under /content/gdrive/; only works inside Google Colab
from google.colab import drive
drive.mount('/content/gdrive/')

In [None]:
# Load the data and preprocess it
data = pd.read_csv('./gdrive/My Drive/Colab Notebooks/week_05_ensembels/data/flight_delays_train.csv')

# Binary target: 1 for 'Y', 0 for anything else
data['dep_delayed_15min'] = (data['dep_delayed_15min'] == 'Y').astype('int64')

# Calendar columns: strip the 'c-' prefix and store the rest as small integers
data['Month'] = data['Month'].str.replace('c-', '').astype('int16')
data['DayofMonth'] = data['DayofMonth'].str.replace('c-', '').astype('int16')
data['DayOfWeek'] = data['DayOfWeek'].str.replace('c-', '').astype('int16')

# Replace the string categories with integer codes
data['UniqueCarrier'] = pd.factorize(data['UniqueCarrier'])[0]
data['Origin'] = pd.factorize(data['Origin'])[0]
data['Dest'] = pd.factorize(data['Dest'])[0]

# Convert DepTime from HHMM to fractional hours. There are 60 minutes in an
# hour, so minutes are divided by 60; dividing by 59 mapped HH:59 onto the
# same value as (HH+1):00.
data['DepTime_real'] = data['DepTime'] // 100 + (data['DepTime'] % 100) / 60
data.drop('DepTime', axis=1, inplace=True)

x = data.drop('dep_delayed_15min', axis=1)
y = data['dep_delayed_15min'].values

data.head()

Разделим выборку на обучающую и тестовую

In [None]:
# Shuffled 70% train split; the remainder becomes the test part
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.7,
                                                    shuffle=True,
                                                    random_state=seed)

Создадим датасеты, как того требует библиотека

In [None]:
# Wrap the train and test parts in LightGBM Dataset objects
trn_data = lgb.Dataset(x_train, label=y_train)
val_data = lgb.Dataset(x_test, label=y_test)

Укажем параметры, для того чтобы бустинг решал задачу классификации

In [None]:
# Baseline LightGBM parameters: gradient-boosted trees, binary objective,
# AUC as the evaluation metric, single thread
param = {
    'boost': 'gbdt',
    'metric':'auc',
    'num_threads': 1,
    'objective': 'binary', 
}

Обучим модель

In [None]:
# `history` is passed as evals_result and indexed by the plotting code below
history = {}
# NOTE(review): verbose_eval / evals_result were replaced by callbacks in newer
# LightGBM releases — confirm these keywords are accepted by the installed version.
clf = lgb.train(param, trn_data, num_boost_round=1000, 
                valid_sets=[trn_data, val_data], 
                verbose_eval=100,  evals_result=history)

In [None]:
# Learning curves: AUC per iteration on the training and the held-out data
plt.figure(figsize=(20, 4))

for eval_name, legend_label in (('training', 'Train'), ('valid_1', 'Test')):
    plt.plot(history[eval_name]['auc'], label=legend_label)

plt.xlabel('Iterations')
plt.ylabel('AUC')
plt.title('История обучения модели')
plt.legend()

plt.show()

В документации LightGBM есть целый [раздел](https://lightgbm.readthedocs.io/en/latest/Parameters-Tuning.html) о том, как тюнить параметры. 
Попробуем улучшить качество. 

In [None]:
# Tuned LightGBM parameters with extra regularisation
c = {
    'boost': 'gbdt',
    'metric': 'auc',
    'num_threads': 12,
    'objective': 'binary',
    'bagging_fraction': 0.7,  # use 70% of the rows per bagging round
    'bagging_freq': 1,  # re-sample every iteration; bagging_fraction has no effect while this is 0
    'feature_fraction': 0.7,  # use 70% of the features per tree
    'learning_rate': 0.5,  # set the learning rate
    'max_depth': 2,  # limit the tree depth
    'min_data_in_leaf': 100,  # minimum number of samples per leaf
    'num_leaves': 2,  # limit the number of leaves (not quite the same as max_depth)
}

In [None]:
history = {}

# Train with the tuned parameter set `c` (the baseline `param` was passed here,
# so the tuned values were never used). The iteration budget is raised and
# early stopping after 100 rounds is enabled.
# NOTE(review): verbose_eval / evals_result / early_stopping_rounds were replaced
# by callbacks in newer LightGBM releases — confirm against the installed version.
clf = lgb.train(c, trn_data, num_boost_round=3000, 
                valid_sets=[trn_data, val_data], 
                verbose_eval=200,  evals_result=history,
                early_stopping_rounds=100)

In [None]:
# Learning curves: AUC per iteration on the training and the held-out data
plt.figure(figsize=(20, 4))

for eval_name, legend_label in (('training', 'Train'), ('valid_1', 'Test')):
    plt.plot(history[eval_name]['auc'], label=legend_label)

plt.xlabel('Iterations')
plt.ylabel('AUC')
plt.title('История обучения модели')
plt.legend()

plt.show()

## XGboost

In [None]:
!pip install xgboost
from xgboost import XGBClassifier, DMatrix, train

Описание [XGBClassifier](https://xgboost.readthedocs.io/en/latest/python/python_api.html?highlight=train#xgboost.XGBClassifier)

In [None]:
# Keyword arguments for XGBClassifier
params = {
    'n_estimators': 450,
    'colsample_bytree': 0.75,
    'reg_alpha': 1,
    'reg_lambda': 2,
    'subsample': 0.55,
    'learning_rate': 0.5,
    'gamma': 0.5,
    'min_child_weight': 0.01,
    'sampling_method': 'uniform',
    'n_jobs': -1,
    # the key was 'gpu_id ' with a trailing space, which is not a parameter name
    # NOTE(review): newer XGBoost releases use `device` instead of `gpu_id` — confirm.
    'gpu_id': 0,
}

xgb = XGBClassifier(**params, random_state=seed)

In [None]:
# NOTE(review): `history` is reset here but nothing in this cell fills it.
history = {}

# Fit with AUC reported on both the training and the held-out split.
# NOTE(review): eval_metric / early_stopping_rounds as fit() arguments are
# deprecated in newer XGBoost releases — confirm against the installed version.
xgb.fit(x_train, y_train,
        eval_metric=['auc'], eval_set=[(x_train, y_train), (x_test, y_test)], 
        verbose=True, early_stopping_rounds=100)

In [None]:
# Predictions of the fitted XGBoost model on the test part, shown as cell output
preds = xgb.predict(x_test)
preds

In [None]:
from sklearn.metrics import roc_auc_score

In [None]:
# ROC AUC ranks samples by score, so it needs the positive-class probability;
# hard 0/1 labels from predict() collapse the curve to a single threshold.
roc_auc_score(y_test, xgb.predict_proba(x_test)[:, 1])

## CatBoost

In [None]:
!pip install catboost
from catboost import CatBoostClassifier, Pool

In [None]:
# CatBoost classifier trained on GPU with early stopping after 100 rounds.
# NOTE(review): devices='0:1' looks like it requests two GPUs (ids 0 and 1) —
# confirm this matches the hardware available in the runtime.
cbc = CatBoostClassifier(random_state=seed, 
                         early_stopping_rounds=100, 
                         task_type="GPU", devices='0:1')

In [None]:
# Reload the data, this time without the manual feature preprocessing
data = pd.read_csv('./gdrive/My Drive/Colab Notebooks/week_05_ensembels/data/flight_delays_train.csv')

# Binary target: 1 for 'Y', 0 for anything else
is_delayed = data['dep_delayed_15min'] == 'Y'
data['dep_delayed_15min'] = is_delayed.astype('int64')

data = data.drop(columns='DepTime')

# features are every column except the target
x = data.drop(columns='dep_delayed_15min')
y = data['dep_delayed_15min'].values

data.head()

In [None]:
# Shuffled 70% train split of the reloaded data, same seed as before
x_train, x_test, y_train, y_test = train_test_split(x, y, train_size=0.7,
                                                    shuffle=True, 
                                                    random_state=seed)

In [None]:
# Preview the first rows of the training features
x_train.head()

In [None]:
# Enable Colab's custom widget manager
# (presumably needed for the interactive plot requested with plot=True below — confirm)
from google.colab import output
output.enable_custom_widget_manager()

In [None]:
valid_set = [x_test, y_test]

# column positions 0..5 are handed to CatBoost as categorical features
categorical_idx = list(range(6))

# evaluation pool built from the test part
eval_dataset = Pool(data=x_test, label=y_test, cat_features=categorical_idx)

cbc.fit(x_train, y_train,
        cat_features=categorical_idx,
        eval_set=eval_dataset,
        verbose=1,
        plot=True)

In [None]:
# Predictions of the fitted CatBoost model on the evaluation pool
preds = cbc.predict(eval_dataset)

In [None]:
# Display the predictions as cell output
preds

In [None]:
# ROC AUC ranks samples by score, so it needs the positive-class probability;
# hard class labels from predict() collapse the curve to a single threshold.
roc_auc_score(y_test, cbc.predict_proba(eval_dataset)[:, 1])