# CatBoost Tutorial

## Установка

In [None]:
!pip install catboost -q

In [None]:
!pip install jupyter_contrib_nbextensions
!jupyter contrib nbextension install --user

In [1]:
# Enable the ipywidgets notebook extension.
# NOTE(review): presumably required for the interactive charts produced by
# plot=True further down — confirm for your Jupyter version.
!jupyter nbextension enable --py widgetsnbextension

Enabling notebook extension jupyter-js-widgets/extension...
      - Validating: ok


## Загрузка данных

In [2]:
from catboost.datasets import titanic
import numpy as np

# Load the Titanic dataset shipped with CatBoost as a (train, test) pair.
train_df, test_df = titanic()

# Preview the first rows of the training data.
train_df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [4]:
# (rows, columns) of the training data.
train_df.shape

(891, 12)

## Обработка данных

In [7]:
# Count missing values per column and show only the columns that have any.
# Run this before the fillna cell below: once the gaps are filled the result
# is an empty Series.
null_value_stats = train_df.isna().sum()
null_value_stats[null_value_stats.ne(0)]

Series([], dtype: int64)

Заполним пропуски в данных некоторым уникальным значением (есть и другие техники, но здесь для простоты используем эту).

In [6]:
# Replace every missing value with the sentinel -999 in both data sets.
train_df = train_df.fillna(-999)
test_df = test_df.fillna(-999)

Разбиваем данные на матрицу объект-признак и вектор с целевой переменной.

In [8]:
# Feature matrix: every column except the target.
X = train_df.drop(columns='Survived')
# Target vector: the survival label.
y = train_df['Survived']

Посмотрим на типы признаков.

In [9]:
# Show the dtype of every feature column.
feature_dtypes = X.dtypes
print(feature_dtypes)

# Every column that is not float-typed is treated as categorical; CatBoost
# receives the positional indices of those columns.
is_categorical = (feature_dtypes != float).to_numpy()
categorical_features_indices = np.flatnonzero(is_categorical)

PassengerId      int64
Pclass           int64
Name            object
Sex             object
Age            float64
SibSp            int64
Parch            int64
Ticket          object
Fare           float64
Cabin           object
Embarked        object
dtype: object


In [13]:
# Display the categorical column positions computed in the previous cell.
categorical_features_indices

array([ 0,  1,  2,  3,  5,  6,  7,  9, 10], dtype=int64)

Разобьём данные на трейн и валидацию.

In [14]:
from sklearn.model_selection import train_test_split

# Keep 75% of the labelled data for training and the remainder for
# validation; the fixed random_state makes the split reproducible.
X_train, X_validation, y_train, y_validation = train_test_split(
    X,
    y,
    train_size=0.75,
    random_state=42,
)

# The test data is used unchanged for inference.
X_test = test_df

## Обучение модели

In [15]:
from catboost import CatBoostClassifier, Pool, metrics, cv
from sklearn.metrics import accuracy_score

In [16]:
# Constructor settings: track Accuracy as an extra metric, fix the seed and
# keep training silent unless a fit() call overrides the logging level.
model_params = {
    'custom_loss': [metrics.Accuracy()],
    'random_seed': 42,
    'logging_level': 'Silent',
}
model = CatBoostClassifier(**model_params)

В Google Colab нет возможности отрисовывать динамические графики (насколько мы знаем), поэтому для отрисовки графиков запускайте ноутбук на локальной машине.

In [17]:
# Train on the training split and evaluate on the validation split after
# every iteration, with a per-iteration text log and the interactive chart.
validation_set = (X_validation, y_validation)
model.fit(
    X_train,
    y_train,
    cat_features=categorical_features_indices,
    eval_set=validation_set,
    logging_level='Verbose',
    plot=True,
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

Learning rate set to 0.028683
0:	learn: 0.6739988	test: 0.6742630	best: 0.6742630 (0)	total: 173ms	remaining: 2m 53s
1:	learn: 0.6589013	test: 0.6592240	best: 0.6592240 (1)	total: 182ms	remaining: 1m 31s
2:	learn: 0.6421502	test: 0.6426778	best: 0.6426778 (2)	total: 198ms	remaining: 1m 5s
3:	learn: 0.6297276	test: 0.6302310	best: 0.6302310 (3)	total: 213ms	remaining: 53.1s
4:	learn: 0.6147184	test: 0.6198228	best: 0.6198228 (4)	total: 231ms	remaining: 46s
5:	learn: 0.6017730	test: 0.6073627	best: 0.6073627 (5)	total: 249ms	remaining: 41.2s
6:	learn: 0.5885309	test: 0.5956000	best: 0.5956000 (6)	total: 265ms	remaining: 37.7s
7:	learn: 0.5783200	test: 0.5858523	best: 0.5858523 (7)	total: 283ms	remaining: 35s
8:	learn: 0.5665895	test: 0.5743842	best: 0.5743842 (8)	total: 299ms	remaining: 32.9s
9:	learn: 0.5575381	test: 0.5662283	best: 0.5662283 (9)	total: 315ms	remaining: 31.2s
10:	learn: 0.5491045	test: 0.5575176	best: 0.5575176 (10)	total: 333ms	remaining: 29.9s
11:	learn: 0.5423887	tes

<catboost.core.CatBoostClassifier at 0x21d7b4b5e50>

## Кросс-валидация

In [18]:
# Reuse the classifier's constructor parameters for cross-validation and set
# the loss function explicitly, since cv() is given a plain parameter dict.
cv_params = {**model.get_params(), 'loss_function': metrics.Logloss()}

# Cross-validate on the full labelled data set (features + target).
full_pool = Pool(X, y, cat_features=categorical_features_indices)
cv_data = cv(full_pool, cv_params, plot=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

In [20]:
# Inspect the parameters currently set on the classifier.
model.get_params()

{'random_seed': 42, 'logging_level': 'Silent', 'custom_loss': ['Accuracy']}

Посмотрим на среднее качество и его разброс по кросс-валидации

In [29]:
# Find the iteration with the highest mean validation accuracy across folds
# and report that accuracy with its standard deviation.
accuracy_mean = cv_data['test-Accuracy-mean']
best_step = np.argmax(accuracy_mean)
best_std = cv_data['test-Accuracy-std'][best_step]

print('Best validation accuracy score: {:.2f}±{:.2f} on step {}'.format(
    np.max(accuracy_mean),
    best_std,
    best_step
))

Best validation accuracy score: 0.83±0.02 on step 355


## Применяем обученную модель

In [30]:
# Predicted class labels and per-class probabilities for the test data.
predictions = model.predict(X_test)
predictions_probs = model.predict_proba(X_test)

# Show the first ten rows of each result.
for preview in (predictions, predictions_probs):
    print(preview[:10])

[0 0 0 0 1 0 1 0 1 0]
[[0.85473931 0.14526069]
 [0.76313031 0.23686969]
 [0.88972889 0.11027111]
 [0.87876173 0.12123827]
 [0.3611047  0.6388953 ]
 [0.90513381 0.09486619]
 [0.33434185 0.66565815]
 [0.78468564 0.21531436]
 [0.39429048 0.60570952]
 [0.94047549 0.05952451]]


## Улучшение предсказаний и другие возможности CatBoost

### Early Stopping

In [32]:
# Refit the same model with early stopping (30 rounds) measured on the
# validation split. Training output is the per-iteration text log; add
# plot=True to the call to get the interactive chart as well.
validation_set = (X_validation, y_validation)
model.fit(
    X_train,
    y_train,
    cat_features=categorical_features_indices,
    eval_set=validation_set,
    early_stopping_rounds=30,
    logging_level='Verbose',
)

Learning rate set to 0.028683
0:	learn: 0.6739988	test: 0.6742630	best: 0.6742630 (0)	total: 15ms	remaining: 15s
1:	learn: 0.6589013	test: 0.6592240	best: 0.6592240 (1)	total: 24.4ms	remaining: 12.2s
2:	learn: 0.6421502	test: 0.6426778	best: 0.6426778 (2)	total: 40.1ms	remaining: 13.3s
3:	learn: 0.6297276	test: 0.6302310	best: 0.6302310 (3)	total: 53.2ms	remaining: 13.2s
4:	learn: 0.6147184	test: 0.6198228	best: 0.6198228 (4)	total: 69.2ms	remaining: 13.8s
5:	learn: 0.6017730	test: 0.6073627	best: 0.6073627 (5)	total: 101ms	remaining: 16.7s
6:	learn: 0.5885309	test: 0.5956000	best: 0.5956000 (6)	total: 117ms	remaining: 16.5s
7:	learn: 0.5783200	test: 0.5858523	best: 0.5858523 (7)	total: 133ms	remaining: 16.4s
8:	learn: 0.5665895	test: 0.5743842	best: 0.5743842 (8)	total: 149ms	remaining: 16.4s
9:	learn: 0.5575381	test: 0.5662283	best: 0.5662283 (9)	total: 166ms	remaining: 16.4s
10:	learn: 0.5491045	test: 0.5575176	best: 0.5575176 (10)	total: 183ms	remaining: 16.4s
11:	learn: 0.5423887	

<catboost.core.CatBoostClassifier at 0x21d7b4b5e50>

In [33]:
# Number of trees kept in the model after the early-stopped fit.
model.tree_count_

284

Получили непереобученную модель, причем не пришлось ждать 1000 итераций!

## Важность признаков

CatBoost поддерживает несколько способов вычисления важности признаков, в том числе широко применяемый сейчас подход SHAP (про него поговорим в следующих модулях).

In [34]:
# Importance score of every feature, in the column order of X_train.
feature_importances = model.get_feature_importance()
feature_names = X_train.columns

# Rank features from most to least important. Sorting (score, name) tuples
# in reverse means ties are broken by name in descending order.
ranked_features = sorted(zip(feature_importances, feature_names), reverse=True)
for importance, feature in ranked_features:
    print(f'{feature}: {importance}')

Sex: 28.377591527551807
Pclass: 17.450379813673287
Parch: 10.276200044515498
Embarked: 8.761954037905873
Cabin: 8.281577549519366
SibSp: 7.950157281933982
Age: 7.842375602284015
Ticket: 5.620556803330714
Fare: 5.439207339285509
PassengerId: 0.0
Name: 0.0


## Сохранение модели

In [35]:
# сохраняем модель
# Save the trained model to disk.
model.save_model('catboost_model.dump')

# Load the saved model back into the same object; the trailing semicolon
# suppresses the cell's output.
model.load_model('catboost_model.dump');