# 1. Data Preparation

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Single, easily re-pointed location of the dataset splits (previously repeated in every path).
DATA_DIR = '/media/Z/NDT/Transformer_Tabular/dataset'

# Load the three pre-made splits: training, hold-out test and validation.
df_train = pd.read_csv('{}/train.csv'.format(DATA_DIR))
df_test = pd.read_csv('{}/test.csv'.format(DATA_DIR))
df_val = pd.read_csv('{}/val.csv'.format(DATA_DIR))

In [3]:
# Show the first five rows of the training split.
df_train.head()

Unnamed: 0,age,workclass,fnlwgt,education,education_num,marital_status,occupation,relationship,race,gender,capital_gain,capital_loss,hours_per_week,native_country,income_bracket
0,21.0,Local-gov,38771.0,Some-college,10.0,Never-married,Adm-clerical,Own-child,White,Male,0.0,0.0,40.0,United-States,0
1,43.0,Private,188436.0,Assoc-voc,11.0,Divorced,Prof-specialty,Not-in-family,White,Male,0.0,0.0,48.0,United-States,0
2,69.0,Private,182862.0,HS-grad,9.0,Never-married,Transport-moving,Not-in-family,White,Male,15831.0,0.0,40.0,United-States,1
3,60.0,Private,121832.0,11th,7.0,Married-civ-spouse,Sales,Husband,White,Male,0.0,0.0,40.0,United-States,1
4,29.0,Private,34292.0,HS-grad,9.0,Never-married,Other-service,Own-child,White,Male,0.0,0.0,38.0,United-States,0


In [4]:
# Separate the `income_bracket` target from the feature columns of each split.
X_train, y_train = df_train.drop(columns='income_bracket'), df_train['income_bracket']
X_val, y_val = df_val.drop(columns='income_bracket'), df_val['income_bracket']
X_test, y_test = df_test.drop(columns='income_bracket'), df_test['income_bracket']

In [5]:
# List the dtype of every feature column.
print(X_train.dtypes)

age               float64
workclass          object
fnlwgt            float64
education          object
education_num     float64
marital_status     object
occupation         object
relationship       object
race               object
gender             object
capital_gain      float64
capital_loss      float64
hours_per_week    float64
native_country     object
dtype: object


In [6]:
# Every column whose dtype is not float is treated as categorical;
# keep the positional indices of those columns.
is_categorical = (X_train.dtypes != float).to_numpy()
categorical_features_indices = np.flatnonzero(is_categorical)
categorical_features_indices

array([ 1,  3,  5,  6,  7,  8,  9, 13])

# 2. CatBoost Basics

In [7]:
from catboost import CatBoostClassifier, Pool, metrics, cv
from sklearn.metrics import accuracy_score, classification_report, f1_score

## 2.1 Model Training

In [19]:
# Metrics passed as `custom_loss`: Accuracy and F1.
tracked_metrics = [metrics.Accuracy(), metrics.F1()]
model = CatBoostClassifier(custom_loss=tracked_metrics, random_seed=42, logging_level='Silent')

In [20]:
# Train on the training split, using the validation split as eval_set,
# with plotting enabled.
fit_options = dict(
    cat_features=categorical_features_indices,
    eval_set=(X_val, y_val),
    plot=True,
)
model.fit(X_train, y_train, **fit_options)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

<catboost.core.CatBoostClassifier at 0x7f6f300ff580>

## 2.2 Model Cross-Validation

In [21]:
# Start from the parameters of the classifier defined above.
cv_params = model.get_params()

In [22]:
# Cross-validate on the training split, setting Logloss explicitly as the loss function.
cv_params['loss_function'] = metrics.Logloss()
cv_pool = Pool(X_train, y_train, cat_features=categorical_features_indices)
cv_data = cv(cv_pool, cv_params, plot=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

In [23]:
# Locate the iteration with the highest mean CV accuracy once and reuse it.
accuracy_mean = cv_data['test-Accuracy-mean']
best_step = np.argmax(accuracy_mean)
print('Best validation accuracy score: {:.2f}±{:.2f} on step {}'.format(
    np.max(accuracy_mean),
    cv_data['test-Accuracy-std'][best_step],
    best_step
))

Best validation accuracy score: 0.87±0.00 on step 947


In [24]:
# Same best mean CV accuracy as above, printed without rounding.
best_cv_accuracy = np.max(cv_data['test-Accuracy-mean'])
print('Precise validation accuracy score: {}'.format(best_cv_accuracy))

Precise validation accuracy score: 0.8719288128767017


## 2.3 Model Applying

In [25]:
# Hard class labels and per-class probabilities for the test split.
predictions = model.predict(X_test)
predictions_probs = model.predict_proba(X_test)
# Show the first ten entries of each.
for head in (predictions[:10], predictions_probs[:10]):
    print(head)

[0 0 0 1 0 0 0 1 0 0]
[[9.99207237e-01 7.92762671e-04]
 [7.55975622e-01 2.44024378e-01]
 [6.40197514e-01 3.59802486e-01]
 [9.22250497e-03 9.90777495e-01]
 [9.99803525e-01 1.96475018e-04]
 [9.98303184e-01 1.69681596e-03]
 [9.98998481e-01 1.00151861e-03]
 [2.93961115e-01 7.06038885e-01]
 [9.99353352e-01 6.46647535e-04]
 [9.65484884e-01 3.45151162e-02]]


In [26]:
# Class predictions for the test split.
# NOTE(review): this repeats the `model.predict(X_test)` call from the previous cell and looks redundant.
predictions = model.predict(X_test)

In [27]:
# Display the array of predicted class labels.
predictions

array([0, 0, 0, ..., 1, 0, 1])

In [31]:
# Classification report comparing the test labels with the predictions.
print(classification_report(y_true=y_test.values, y_pred=predictions))

              precision    recall  f1-score   support

           0       0.90      0.94      0.92     12435
           1       0.77      0.65      0.70      3846

    accuracy                           0.87     16281
   macro avg       0.83      0.79      0.81     16281
weighted avg       0.87      0.87      0.87     16281



# 3. CatBoost Features

In [32]:
# Train without passing `random_seed`, then inspect the seed stored on the model.
model_without_seed = CatBoostClassifier(iterations=10, logging_level='Silent').fit(
    X_train, y_train, cat_features=categorical_features_indices
)

assigned_seed = model_without_seed.random_seed_
print('Random seed assigned for this model: {}'.format(assigned_seed))

Random seed assigned for this model: 0


In [43]:
# Shared training settings reused (and selectively overridden) by the sections below.
params = dict(
    iterations=500,
    learning_rate=0.01,
    eval_metric=metrics.F1(),
    random_seed=42,
    logging_level='Silent',
    use_best_model=False,
)

# Pool: The fastest way to pass the features data to the Pool constructor
train_pool = Pool(data=X_train, label=y_train, cat_features=categorical_features_indices)
validate_pool = Pool(data=X_val, label=y_val, cat_features=categorical_features_indices)

## 3.1 Using the best model

In [44]:
# Reference run with the shared params (use_best_model=False).
model = CatBoostClassifier(**params).fit(train_pool, eval_set=validate_pool)
model

<catboost.core.CatBoostClassifier at 0x7f6f22569f10>

In [45]:
# Same settings as the shared params, but with use_best_model switched on.
best_model_params = dict(params, use_best_model=True)
best_model = CatBoostClassifier(**best_model_params).fit(train_pool, eval_set=validate_pool)

In [46]:
# The value reported is the F1 score on the validation split (computed with
# sklearn's f1_score), so the labels say "F1" rather than "accuracy".
print('Simple model validation F1: {:.4}'.format(
    f1_score(y_val, model.predict(X_val))
))
print('')

print('Best model validation F1: {:.4}'.format(
    f1_score(y_val, best_model.predict(X_val))
))

Simple model validation accuracy: 0.686

Best model validation accuracy: 0.6862


## 3.2 Early Stopping

In [47]:
%%time
# Reference run with the shared params (no od_type / od_wait set), timed for comparison.
model = CatBoostClassifier(**params)
model.fit(train_pool, eval_set=validate_pool)

CPU times: user 1min 2s, sys: 4.35 s, total: 1min 7s
Wall time: 5.57 s


<catboost.core.CatBoostClassifier at 0x7f6f30257b50>

In [50]:
%%time
earlystop_params = params.copy()
earlystop_params.update({
    'od_type': 'Iter',
    'od_wait': 50
})
earlystop_model = CatBoostClassifier(**earlystop_params)
earlystop_model.fit(train_pool, eval_set=validate_pool);

CPU times: user 6.49 s, sys: 467 ms, total: 6.95 s
Wall time: 676 ms


<catboost.core.CatBoostClassifier at 0x7f6f21faa040>

In [51]:
# Compare tree counts and validation F1 (computed with sklearn's f1_score);
# the labels say "F1" because that is the metric actually reported.
print('Simple model tree count: {}'.format(model.tree_count_))
print('Simple model validation F1: {:.4}'.format(
    f1_score(y_val, model.predict(X_val))
))
print('')

print('Early-stopped model tree count: {}'.format(earlystop_model.tree_count_))
print('Early-stopped model validation F1: {:.4}'.format(
    f1_score(y_val, earlystop_model.predict(X_val))
))

Simple model tree count: 500
Simple model validation accuracy: 0.686

Early-stopped model tree count: 53
Early-stopped model validation accuracy: 0.6251


## 3.3 Using Baseline

It is possible to use pre-training results (a baseline) as the starting point for training.

In [52]:
# A short 10-iteration run provides the starting predictions.
current_params = dict(params, iterations=10)
model = CatBoostClassifier(**current_params)
model.fit(X_train, y_train, cat_features=categorical_features_indices)
# Get baseline (only with prediction_type='RawFormulaVal')
baseline = model.predict(X_train, prediction_type='RawFormulaVal')
# Fit new model
# NOTE(review): later predictions made without supplying the baseline presumably
# exclude it — confirm against the CatBoost documentation.
model.fit(X_train, y_train, cat_features=categorical_features_indices, baseline=baseline)

In [53]:
# `model` is now the classifier refit with a baseline, and the reported value is
# the validation F1 (sklearn's f1_score), so the labels name both correctly.
print('Baseline model tree count: {}'.format(model.tree_count_))
print('Baseline model validation F1: {:.4}'.format(
    f1_score(y_val, model.predict(X_val))
))
print('')

# Early-stopped model from the previous section, repeated for comparison.
print('Early-stopped model tree count: {}'.format(earlystop_model.tree_count_))
print('Early-stopped model validation F1: {:.4}'.format(
    f1_score(y_val, earlystop_model.predict(X_val))
))

Simple model tree count: 10
Simple model validation accuracy: 0.6148

Early-stopped model tree count: 53
Early-stopped model validation accuracy: 0.6251


## 3.4 Snapshot Support

CatBoost supports snapshots. You can use them to resume training after an interruption or to start training from previous results.

In [54]:
# Short, verbose first stage for the snapshot demo.
params_with_snapshot = dict(
    params,
    iterations=5,
    learning_rate=0.5,
    logging_level='Verbose',
)

In [55]:
# First stage: 5 iterations, fitted with save_snapshot=True.
model = CatBoostClassifier(**params_with_snapshot)
model.fit(train_pool, eval_set=validate_pool, save_snapshot=True)

# Second stage: raise the iteration budget to 10 and fit again with save_snapshot=True.
params_with_snapshot.update(iterations=10, learning_rate=0.1)
model = CatBoostClassifier(**params_with_snapshot)
model.fit(train_pool, eval_set=validate_pool, save_snapshot=True)

0:	learn: 0.6259126	test: 0.6174161	best: 0.6174161 (0)	total: 10.2ms	remaining: 40.7ms
1:	learn: 0.6395703	test: 0.6257225	best: 0.6257225 (1)	total: 18.1ms	remaining: 27.1ms
2:	learn: 0.6437150	test: 0.6316542	best: 0.6316542 (2)	total: 25ms	remaining: 16.7ms
3:	learn: 0.6443294	test: 0.6330674	best: 0.6330674 (3)	total: 31.2ms	remaining: 7.81ms
4:	learn: 0.6566194	test: 0.6462976	best: 0.6462976 (4)	total: 37.1ms	remaining: 0us

bestTest = 0.6462976276
bestIteration = 4

5:	learn: 0.6576854	test: 0.6484740	best: 0.6484740 (5)	total: 47.9ms	remaining: 43.1ms
6:	learn: 0.6576487	test: 0.6491354	best: 0.6491354 (6)	total: 55.6ms	remaining: 27.6ms
7:	learn: 0.6583727	test: 0.6491354	best: 0.6491354 (6)	total: 62ms	remaining: 16.6ms
8:	learn: 0.6610446	test: 0.6532519	best: 0.6532519 (8)	total: 68.5ms	remaining: 7.85ms
9:	learn: 0.6624989	test: 0.6537356	best: 0.6537356 (9)	total: 73.5ms	remaining: 0us

bestTest = 0.6537356322
bestIteration = 9



## 3.5 User Defined Objective Function

In [56]:
# for performance reasons it is better to install `numba` package for working with user defined functions
!pip install numba

[0m

In [57]:
class LoglossObjective(object):
    """User-defined log-loss objective, passed to CatBoost as ``loss_function``."""

    def calc_ders_range(self, approxes, targets, weights):
        """Return the per-object derivatives of the log-likelihood.

        approxes, targets, weights are indexed containers of floats
        (containers which have only __len__ and __getitem__ defined).
        weights parameter can be None.

        To understand what these parameters mean, assume that there is
        a subset of your dataset that is currently being processed.
        approxes contains current predictions for this subset,
        targets contains target values you provided with the dataset.

        Returns a list of pairs (der1, der2), where der1 is the first
        derivative of the loss function with respect to the predicted
        value, and der2 is the second derivative.

        In our case, logloss is defined by the following formula:
        target * log(sigmoid(approx)) + (1 - target) * log(1 - sigmoid(approx))
        where sigmoid(x) = 1 / (1 + e^(-x)).
        """
        assert len(approxes) == len(targets)
        if weights is not None:
            assert len(weights) == len(approxes)

        result = []
        for index in range(len(targets)):
            approx = approxes[index]
            # Numerically stable sigmoid: only ever exponentiate a non-positive
            # number, so a large |approx| cannot overflow to inf and yield nan.
            if approx >= 0:
                p = 1.0 / (1.0 + np.exp(-approx))
            else:
                e = np.exp(approx)
                p = e / (1.0 + e)

            # Any positive target is treated as the positive class.
            der1 = (1 - p) if targets[index] > 0.0 else -p
            der2 = -p * (1 - p)

            if weights is not None:
                der1 *= weights[index]
                der2 *= weights[index]

            result.append((der1, der2))
        return result

In [58]:
# Train with the user-defined objective; the built-in Logloss is the eval metric.
custom_loss_settings = dict(
    iterations=10,
    random_seed=42,
    loss_function=LoglossObjective(),
    eval_metric=metrics.Logloss(),
)
model = CatBoostClassifier(**custom_loss_settings)
# Fit model
model.fit(train_pool)
# Only prediction_type='RawFormulaVal' is allowed with custom `loss_function`
preds_raw = model.predict(X_test, prediction_type='RawFormulaVal')

0:	learn: 0.6773058	total: 361ms	remaining: 3.24s
1:	learn: 0.6622491	total: 407ms	remaining: 1.63s
2:	learn: 0.6478094	total: 449ms	remaining: 1.05s
3:	learn: 0.6340186	total: 493ms	remaining: 739ms
4:	learn: 0.6209517	total: 530ms	remaining: 530ms
5:	learn: 0.6084052	total: 559ms	remaining: 373ms
6:	learn: 0.5966998	total: 599ms	remaining: 257ms
7:	learn: 0.5854992	total: 633ms	remaining: 158ms
8:	learn: 0.5749714	total: 667ms	remaining: 74.1ms
9:	learn: 0.5650735	total: 707ms	remaining: 0us


## 3.6 User Defined Metric Function

In [60]:
class LoglossMetric(object):
    """User-defined log-loss metric, passed to CatBoost as ``eval_metric``."""

    def get_final_error(self, error, weight):
        """Turn the accumulated (error, weight) pair into the final metric value.

        The tiny constant in the denominator avoids division by zero when the
        total weight is 0.
        """
        return error / (weight + 1e-38)

    def is_max_optimal(self):
        """Return False: this metric is reported as not max-optimal."""
        return False

    def evaluate(self, approxes, target, weight):
        """Accumulate the weighted log-loss over the given objects.

        approxes is a list of indexed containers
        (containers with only __len__ and __getitem__ defined),
        one container per approx dimension.
        Each container contains floats.
        weight is a one dimensional indexed container.
        target is a one dimensional indexed container.

        weight parameter can be None.
        Returns pair (error, weights sum)
        """
        assert len(approxes) == 1
        assert len(target) == len(approxes[0])

        approx = approxes[0]

        error_sum = 0.0
        weight_sum = 0.0

        for i in range(len(approx)):
            w = 1.0 if weight is None else weight[i]
            weight_sum += w
            # logaddexp(0, x) == log(1 + exp(x)) but does not overflow for large x.
            error_sum += -w * (target[i] * approx[i] - np.logaddexp(0.0, approx[i]))

        return error_sum, weight_sum

In [61]:
# Train with the built-in Logloss objective and the user-defined metric as eval_metric.
model = CatBoostClassifier(
    iterations=10,
    random_seed=42, 
    loss_function=metrics.Logloss(),
    eval_metric=LoglossMetric()
)
# Fit model
model.fit(train_pool)
# Raw formula values are requested here.
# NOTE(review): the loss_function in this cell is the built-in Logloss, not a custom one,
# so the "only RawFormulaVal is allowed" restriction presumably does not apply — confirm.
preds_raw = model.predict(X_test, prediction_type='RawFormulaVal')

Learning rate set to 0.5
0:	learn: 0.4897324	total: 238ms	remaining: 2.14s
1:	learn: 0.4079571	total: 247ms	remaining: 990ms
2:	learn: 0.3664814	total: 256ms	remaining: 597ms
3:	learn: 0.3470594	total: 263ms	remaining: 395ms
4:	learn: 0.3286165	total: 270ms	remaining: 270ms
5:	learn: 0.3185360	total: 276ms	remaining: 184ms
6:	learn: 0.3117023	total: 281ms	remaining: 121ms
7:	learn: 0.3067000	total: 287ms	remaining: 71.7ms
8:	learn: 0.3041437	total: 291ms	remaining: 32.4ms
9:	learn: 0.3014551	total: 296ms	remaining: 0us


## 3.7 Staged Predict

In [62]:
# Fit a small model, then iterate over staged probability predictions on the validation pool.
model = CatBoostClassifier(iterations=10, random_seed=42, logging_level='Silent')
model.fit(train_pool)

ntree_start, ntree_end, eval_period = 3, 9, 2
predictions_iterator = model.staged_predict(
    validate_pool,
    prediction_type='Probability',
    ntree_start=ntree_start,
    ntree_end=ntree_end,
    eval_period=eval_period,
)
# Pair each staged prediction with a tree count from the same range.
tree_counts = range(ntree_start, ntree_end, eval_period)
for tree_count, preds in zip(tree_counts, predictions_iterator):
    print('First class probabilities using the first {} trees: {}'.format(tree_count, preds[:5, 1]))

First class probabilities using the first 3 trees: [0.43461295 0.41563261 0.55200067 0.35924986 0.55416344]
First class probabilities using the first 5 trees: [0.58583516 0.30016483 0.6396269  0.25267305 0.72002035]
First class probabilities using the first 7 trees: [0.59755112 0.26438144 0.65578498 0.28981635 0.79925934]


## 3.8 Feature Importances

In [63]:
# Fit a 50-iteration model and print the features by descending importance.
model = CatBoostClassifier(iterations=50, random_seed=42, logging_level='Silent')
model.fit(train_pool)

feature_importances = model.get_feature_importance(train_pool)
feature_names = X_train.columns
ranked_features = sorted(zip(feature_importances, feature_names), reverse=True)
for score, name in ranked_features:
    print('{}: {}'.format(name, score))

relationship: 26.549293791029793
capital_gain: 15.066988164313381
age: 11.348308572955766
education_num: 9.982357499227714
occupation: 9.0779913962102
hours_per_week: 6.445736296920767
capital_loss: 6.178407174515397
marital_status: 5.926828526077566
education: 2.182551824722219
workclass: 2.153529483171403
fnlwgt: 2.11543488138674
gender: 1.9479473307377864
race: 0.6669831096073858
native_country: 0.3576419491238522


## 3.9 Eval Metrics

In [64]:
# Fit a model, then call eval_metrics with AUC on the validation pool.
model = CatBoostClassifier(iterations=50, random_seed=42, logging_level='Silent')
model.fit(train_pool)
requested_metrics = [metrics.AUC()]
eval_metrics = model.eval_metrics(validate_pool, requested_metrics, plot=True)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

In [65]:
# First six entries of the 'AUC' values returned by eval_metrics.
print(eval_metrics['AUC'][:6])

[0.863590547466673, 0.8847177287157673, 0.8960662234367822, 0.9014003210658894, 0.907566502468117, 0.9103099894245582]


## 3.10 Learning Processes Comparison

In [67]:
# Two runs that differ only in tree depth; each uses its own train_dir.
comparison_settings = dict(iterations=100, logging_level='Silent')

model1 = CatBoostClassifier(depth=1, train_dir='model_depth_1/', **comparison_settings)
model1.fit(train_pool, eval_set=validate_pool)

model2 = CatBoostClassifier(depth=5, train_dir='model_depth_5/', **comparison_settings)
model2.fit(train_pool, eval_set=validate_pool)

<catboost.core.CatBoostClassifier at 0x7f6f6bd62fa0>

In [68]:
from catboost import MetricVisualizer
# Open a MetricVisualizer widget over the two training directories used above.
widget = MetricVisualizer(['model_depth_1', 'model_depth_5'])
widget.start()

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

## 3.11 Model Saving

In [69]:
# Round-trip: save a fitted model to disk, then load it into a fresh classifier.
model_path = 'catboost_model.dump'
trained_model = CatBoostClassifier(iterations=10, random_seed=42, logging_level='Silent')
trained_model.fit(train_pool)
trained_model.save_model(model_path)

model = CatBoostClassifier()
model.load_model(model_path)

<catboost.core.CatBoostClassifier at 0x7f6f302040d0>

# 4. Tuning Parameters

In [70]:
!pip install hyperopt

Collecting hyperopt
  Downloading hyperopt-0.2.7-py2.py3-none-any.whl (1.6 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.6/1.6 MB 4.7 MB/s eta 0:00:00
Collecting py4j
  Downloading py4j-0.10.9.7-py2.py3-none-any.whl (200 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 200.5/200.5 kB 16.8 MB/s eta 0:00:00
Installing collected packages: py4j, hyperopt
Successfully installed hyperopt-0.2.7 py4j-0.10.9.7

In [71]:
import hyperopt

def hyperopt_objective(params):
    """Hyperopt objective: one minus the best mean cross-validated accuracy.

    ``params`` is the sampled point; its 'l2_leaf_reg' entry is cast to int
    and its 'learning_rate' entry is used as is. The other CatBoost settings
    are fixed (500 iterations, Logloss objective, seed 42).
    """
    candidate = CatBoostClassifier(
        l2_leaf_reg=int(params['l2_leaf_reg']),
        learning_rate=params['learning_rate'],
        iterations=500,
        eval_metric=metrics.Accuracy(),
        random_seed=42,
        verbose=False,
        loss_function=metrics.Logloss(),
    )

    # Cross-validate on the training split with the candidate's parameters.
    cv_pool = Pool(X_train, y_train, cat_features=categorical_features_indices)
    cv_results = cv(cv_pool, candidate.get_params(), logging_level='Silent')

    # hyperopt minimises, so the accuracy is converted into a loss.
    return 1 - np.max(cv_results['test-Accuracy-mean'])

In [73]:
# Search space: quantised log-uniform l2_leaf_reg and uniform learning_rate.
params_space = dict(
    l2_leaf_reg=hyperopt.hp.qloguniform('l2_leaf_reg', 0, 2, 1),
    learning_rate=hyperopt.hp.uniform('learning_rate', 1e-3, 5e-1),
)

trials = hyperopt.Trials()
# Fixed generator so the search is seeded.
search_rng = np.random.default_rng(123)

best = hyperopt.fmin(
    fn=hyperopt_objective,
    space=params_space,
    algo=hyperopt.tpe.suggest,
    max_evals=50,
    trials=trials,
    rstate=search_rng,
)

print(best)

100%|██████████| 50/50 [27:43<00:00, 33.27s/trial, best loss: 0.12676594583335754]
{'l2_leaf_reg': 2.0, 'learning_rate': 0.0994427748333858}


In [74]:
# Rebuild the classifier with the best parameters found by the search and cross-validate it.
tuned_settings = dict(
    l2_leaf_reg=int(best['l2_leaf_reg']),
    learning_rate=best['learning_rate'],
    iterations=500,
    eval_metric=metrics.Accuracy(),
    random_seed=42,
    verbose=False,
    loss_function=metrics.Logloss(),
)
model = CatBoostClassifier(**tuned_settings)

tuned_cv_pool = Pool(X_train, y_train, cat_features=categorical_features_indices)
cv_data = cv(tuned_cv_pool, model.get_params())

Training on fold [0/3]

bestTest = 0.8741218473
bestIteration = 424

Training on fold [1/3]

bestTest = 0.8727398365
bestIteration = 358

Training on fold [2/3]

bestTest = 0.8742225294
bestIteration = 232



In [75]:
# Best mean CV accuracy of the tuned configuration, printed without rounding.
tuned_cv_accuracy = np.max(cv_data['test-Accuracy-mean'])
print('Precise validation accuracy score: {}'.format(tuned_cv_accuracy))

Precise validation accuracy score: 0.8732340541666425


## Make Submission

In [76]:
# Fit the tuned classifier on the training split.
model.fit(X_train, y_train, cat_features=categorical_features_indices)

<catboost.core.CatBoostClassifier at 0x7f6f8fd4bc10>

In [77]:
# Classification report of the tuned model's predictions on the test split.
predictions = model.predict(X_test)
test_report = classification_report(y_true=y_test.values, y_pred=predictions)
print(test_report)

              precision    recall  f1-score   support

           0       0.90      0.94      0.92     12435
           1       0.77      0.65      0.71      3846

    accuracy                           0.87     16281
   macro avg       0.83      0.80      0.81     16281
weighted avg       0.87      0.87      0.87     16281

