# End to End CatBoost

©️2025 MetaSnake

## https://github.com/mattharrison/odsc_east_2025

`@__mharrison__`

In [None]:
!pip install -U catboost

In [None]:
import catboost
catboost.__version__

## Datasets

I'll be demoing with Kaggle 2018 survey data

In [None]:
import pandas as pd
import os
import urllib.request
import zipfile

# Download the survey archive once and cache it in the working directory.
local = 'kaggle-survey-2018.zip'
if not os.path.exists(local):
    url = 'https://github.com/mattharrison/datasets/raw/master/data/kaggle-survey-2018.zip'
    # The context manager closes the HTTP response even if the read fails.
    # The payload is read completely before the local file is created, so a
    # failed download does not leave a partial file that later passes the
    # os.path.exists check.
    with urllib.request.urlopen(url) as fin:
        data = fin.read()
    with open(local, mode='wb') as fout:
        fout.write(data)
with zipfile.ZipFile(local) as z:
    print(z.namelist())
    # Close the archive member explicitly once pandas has consumed it.
    with z.open('multipleChoiceResponses.csv') as csv_file:
        kag = pd.read_csv(csv_file)
    # The first row is kept separately as `kag_questions`; the remaining
    # rows are the responses used throughout the notebook.
    kag_questions = kag.iloc[0]
    raw = kag.iloc[1:]

In [None]:
raw

In [None]:
raw.Q4.value_counts().index

In [None]:
raw.Q6.value_counts()

In [None]:
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, FunctionTransformer
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer

from sklearn import set_config

set_config(transform_output='pandas')

class TopNEncoder(BaseEstimator, TransformerMixin):
    """Keep the ``n`` most frequent values and replace all others with ``default``.

    The frequent values are learned in ``fit``; when a DataFrame is passed,
    only its first column is used to count frequencies.
    """

    def __init__(self, n=5, default='other'):
        self.n = n
        self.default = default
        self.top_n_categories = None

    def fit(self, X, y=None):
        """Record the ``n`` most common values of ``X`` (``y`` is ignored)."""
        column = X.iloc[:, 0] if isinstance(X, pd.DataFrame) else X
        most_common = column.value_counts().index[:self.n]
        self.top_n_categories = set(most_common)
        return self

    def transform(self, X):
        """Return ``X`` with values outside the learned set swapped for ``default``."""
        keep = X.isin(self.top_n_categories)
        return X.where(keep, self.default)

class CustomOneHotEncoder(BaseEstimator, TransformerMixin):
    """Thin wrapper around ``OneHotEncoder`` that produces dense output.

    ``prefix`` is stored on the instance but is not used by ``fit`` or
    ``transform``.
    """

    def __init__(self, drop='first', prefix=None):
        self.drop = drop
        self.prefix = prefix
        self.encoder = None

    def fit(self, X: pd.DataFrame, y=None):
        """Create a fresh ``OneHotEncoder`` and fit it on ``X`` (``y`` is ignored)."""
        encoder_options = {'drop': self.drop, 'sparse_output': False}
        self.encoder = OneHotEncoder(**encoder_options)
        self.encoder.fit(X)
        return self

    def transform(self, X):
        """Encode ``X`` with the encoder built in ``fit``."""
        return self.encoder.transform(X)

class AgeExtractor(BaseEstimator, TransformerMixin):
    """Convert string columns to integers using their first two characters."""

    def fit(self, X, y=None):
        """Stateless transformer; nothing is learned."""
        return self

    def transform(self, X):
        """Return ``X`` with every column replaced by ``int(value[:2])``."""
        leading_digits = {
            name: X[name].str[:2].astype(int)
            for name in X.columns
        }
        return X.assign(**leading_digits)

class EducationEncoder(BaseEstimator, TransformerMixin):
    """Map education-level answers to numbers.

    The numeric codes (12, 13, 16, ...) look like approximate years of
    schooling. Answers missing from the mapping, and 'I prefer not to
    answer', become missing values.
    """

    def __init__(self):
        self.mapping = {
            'I prefer not to answer': None,
            'No formal education past high school': 12,
            'Some college/university study without earning a bachelor’s degree': 13,
            'Bachelor’s degree': 16,
            'Master’s degree': 18,
            'Professional degree': 19,
            'Doctoral degree': 20,
        }

    def fit(self, X, y=None):
        """Stateless transformer; nothing is learned."""
        return self

    def transform(self, X):
        """Return a one-column DataFrame with the mapped education codes."""
        answers = X.iloc[:, 0] if isinstance(X, pd.DataFrame) else X
        return answers.map(self.mapping).to_frame()

class ExperienceExtractor(BaseEstimator, TransformerMixin):
    """Extract the lower bound of an experience range as a float.

    The first column of the input is cleaned of '+' signs, split on '-',
    and the part before the first '-' is converted to ``float``.
    """

    def fit(self, X, y=None):
        """Stateless transformer; nothing is learned."""
        return self

    def transform(self, X):
        """Return a DataFrame with a single ``experience`` column."""
        frame = pd.DataFrame(X)
        lower_bound = (
            frame.iloc[:, 0]
            .str.replace('+', '', regex=False)
            .str.split('-', expand=True)
            .iloc[:, 0]
            .astype(float)
        )
        return frame.assign(experience=lower_bound).loc[:, ['experience']]


class CompensationExtractor(BaseEstimator, TransformerMixin):
    """Convert compensation-range answers to an integer amount.

    The lower bound of each range (the text before the first '-') is taken
    and multiplied by 1000. Missing answers and the "do not wish to
    disclose" answer become 0.
    """

    def fit(self, X, y=None):
        """Stateless transformer; nothing is learned."""
        return self

    def transform(self, X):
        """Return a DataFrame with a single integer ``compensation`` column."""
        frame = pd.DataFrame(X)
        cleaned = (
            frame.iloc[:, 0]
            .str.replace('+', '', regex=False)
            .str.replace(',', '', regex=False)
            # the open-ended top bucket is rescaled so it is in thousands like the others
            .str.replace('500000', '500', regex=False)
            .str.replace('I do not wish to disclose my approximate yearly compensation', '0', regex=False)
        )
        lower_bound = cleaned.str.split('-', expand=True).iloc[:, 0]
        compensation = lower_bound.fillna(0).astype(int).mul(1000)
        return frame.assign(compensation=compensation).loc[:, ['compensation']]

class LanguageEncoder(BaseEstimator, TransformerMixin):
    """Flag whether a survey column holds the given language.

    Parameters
    ----------
    language : str
        Value that marks the language as selected. It is also used as the
        name of the output column.
    """

    def __init__(self, language):
        self.language = language

    def fit(self, X, y=None):
        """Stateless transformer; nothing is learned."""
        return self

    def transform(self, X):
        """Return a one-column DataFrame of 1 (language selected) or 0 (otherwise)."""
        if isinstance(X, pd.DataFrame):
            X = X.iloc[:, 0]
        # Compare directly rather than using ``map({language: 1})``:
        # Series.map yields NaN for every value missing from the dict, which
        # leaves the column as 1/NaN, and a downstream mean imputer would then
        # fill every NaN with 1, making the feature constant.
        return X.eq(self.language).astype(int).rename(self.language).to_frame()

def create_pipeline():
    """Assemble the preprocessing pipeline for the survey columns.

    Returns
    -------
    Pipeline
        A ``ColumnTransformer`` over columns Q1-Q5, Q8, Q9 and Q16_Part_1-3,
        followed by mean imputation.
    """
    major_labels = {
        'Computer science (software engineering, etc.)': 'cs',
        'Engineering (non-computer focused)': 'eng',
        'Mathematics or statistics': 'stat'
    }
    major_encoder = Pipeline([
        ('topn', TopNEncoder(n=3)),
        # values absent from major_labels are mapped to NaN by Series.map
        ('replace', FunctionTransformer(
            lambda X: X.iloc[:, 0].map(major_labels).to_frame())),
        ('onehot', CustomOneHotEncoder(drop='first', prefix='major')),
    ])

    # (step name, transformer, source columns)
    column_steps = [
        ('gender', CustomOneHotEncoder(drop='first', prefix='gender'), ['Q1']),
        ('age', AgeExtractor(), ['Q2']),
        ('country', CustomOneHotEncoder(drop='first', prefix='country'), ['Q3']),
        ('education', EducationEncoder(), ['Q4']),
        ('major', major_encoder, ['Q5']),
        ('years_exp', ExperienceExtractor(), ['Q8']),
        ('compensation', CompensationExtractor(), ['Q9']),
        ('python', LanguageEncoder('Python'), ['Q16_Part_1']),
        ('r', LanguageEncoder('R'), ['Q16_Part_2']),
        ('sql', LanguageEncoder('SQL'), ['Q16_Part_3']),
    ]

    return Pipeline([
        ('preprocessor', ColumnTransformer(column_steps)),
        ('imputer', SimpleImputer(strategy='mean')),
    ])

# Usage: fit the preprocessing on respondents from three countries
pipeline = create_pipeline()
in_selected_countries = raw.Q3.isin(["United States of America", "China", "India"])
kag_X_transformed = pipeline.fit_transform(raw[in_selected_countries])

# Target: the Q6 answer, restricted to the two roles being compared
kag_y = (
    raw
    .loc[kag_X_transformed.index]
    .query('Q6 == "Data Scientist" or Q6 == "Software Engineer"')
    .loc[:, 'Q6']
)
# Keep only the feature rows that have a target
kag_X = kag_X_transformed.loc[kag_y.index]

# Split the data (stratified on the target, fixed seed)
from sklearn.model_selection import train_test_split
kag_X_train, kag_X_test, kag_y_train, kag_y_test = train_test_split(
    kag_X,
    kag_y,
    stratify=kag_y,
    random_state=42,
)

In [None]:
kag_X

In [None]:
kag_y

In [None]:
# check for missing values
kag_y.isna().pipe(lambda s: s[s > 0])

## Use PCA to Visualize

In [None]:
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Standardize the features, then project onto 3 principal components
pca_steps = [
    ('std', StandardScaler()),
    ('pca', PCA(n_components=3)),
]
pca_pipeline = Pipeline(pca_steps)
X_pca = pca_pipeline.fit_transform(kag_X)
# Keep a handle on the fitted PCA step to inspect its components
pca = pca_pipeline.named_steps['pca']

In [None]:
X_pca

In [None]:
# components: weight of each original feature in each principal component
component_names = [f'pca{i}' for i in range(pca.n_components_)]
loadings = pd.DataFrame(pca.components_, columns=kag_X.columns,
                        index=component_names)
# keep only the features whose absolute weight exceeds 0.1 in some component
notable = (loadings.abs() > .1).any(axis='index')
loadings.loc[:, notable].plot.bar().legend(bbox_to_anchor=(1,1))

In [None]:
import plotly.express as px

# Put the PCA coordinates and the original features in one frame so the
# features are available for coloring and hover text.
data = X_pca.assign(**kag_X)
# Other columns worth coloring by: 'age__Q2', 'years_exp__experience', 'education__Q4'
fig = px.scatter_3d(
    data,
    x='pca0',
    y='pca1',
    z='pca2',
    color='compensation__compensation',
    hover_data=data.columns,
    color_continuous_scale='viridis',
)
fig.update_layout(width=800, height=600, title='3D PCA Scatter Plot')
fig.update_traces(marker={'size': 3})

fig.show()

In [None]:
# color by job title
import plotly.express as px

# PCA coordinates plus the original features (used for hover text)
data = X_pca.assign(**kag_X)
fig = px.scatter_3d(
    data,
    x='pca0',
    y='pca1',
    z='pca2',
    color=kag_y,
    hover_data=data.columns,
    color_continuous_scale='viridis',
)
fig.update_layout(width=800, height=600, title='3D PCA Scatter Plot')
fig.update_traces(marker={'size': 3})

fig.show()

## Stumps, Trees, and Forests

In [None]:
# Survey question -> readable feature name (order defines the column order)
survey_columns = {
    'Q1': 'gender', 'Q2': 'age', 'Q3': 'country', 'Q4': 'education',
    'Q5': 'major', 'Q8': 'years_exp', 'Q9': 'compensation',
    'Q16_Part_1': 'python', 'Q16_Part_2': 'r', 'Q16_Part_3': 'sql',
}
# Respondents from the three countries who hold one of the two job titles
is_selected = (
    raw.Q3.isin(["United States of America", "China", "India"])
    & raw.Q6.isin(["Data Scientist", "Software Engineer"])
)
X = (
    raw
    .loc[is_selected, list(survey_columns)]
    .rename(columns=survey_columns)
    .fillna('NA')  # categories can't have missing values
    # Converting to category is not strictly required but generally saves memory:
    #.pipe(lambda df: df.assign(**df.select_dtypes('object').astype('category')))
)
# Target: the job title for the same rows
y = raw.loc[X.index, 'Q6']

In [None]:
X

In [None]:
y

In [None]:
y.value_counts()

In [None]:
# Split the untransformed features; stratify on y and fix the seed so the split is repeatable
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)

In [None]:
X_train.columns

In [None]:
# A "stump": a single tree (iterations=1) with a single level (depth=1).
# Every column of X_train is declared as a categorical feature.
stump = catboost.CatBoostClassifier(iterations=1, depth=1, 
                                    cat_features=list(X_train.columns))
stump.fit(X_train, y_train)


In [None]:
stump.score(X_test, y_test)

In [None]:
# on an underfit model, we generally see similar (bad) performance on the training set
stump.score(X_train, y_train)

In [None]:
stump.classes_

In [None]:
# This fails without a CatBoost "Pool"
stump.plot_tree(tree_idx=0)

In [None]:
# On codespaces run:
# sudo apt-get update
# sudo apt-get install graphviz
# Wrap the training data in a Pool (with the same categorical columns) and
# pass it to plot_tree.
pool = catboost.Pool(X_train, y_train, cat_features=list(X_train.columns))
res = stump.plot_tree(tree_idx=0, pool=pool)
res

## Hyperparameters

In [None]:
# .get_all_params() returns all of the parameters;
# rebuild the dict in key order so that the display is consistent
unsorted_dict = stump.get_all_params()
sorted_dict = {name: unsorted_dict[name] for name in sorted(unsorted_dict)}
sorted_dict

## Underfit

A stump is too simple. It has too much *bias*.

Solutions:

- Add better features
- Use a more complex model

If we let the tree grow, it will do both of these things.

## Overfit

A model that is too complex has too much *variance*.

Solutions:

- Simplify the model (or constrain/regularize it)
- Use more data

If a tree is too complex, we can prune it so that the leaf nodes are not too specific.

In [None]:
# Settings used for the single-tree "high variance" experiments.
# Both `var_settings` and `catboost_config` refer to this same dict.
# `depth` and `iterations` are commented out here because the models below
# pass them explicitly alongside **var_settings.
# Trailing comments record how a value differs from an earlier setting.
var_settings = catboost_config = {
    'auto_class_weights': 'None',
    #'bayesian_matrix_reg': 0,  # Reduced from 0.10000000149011612
    'best_model_min_trees': 1,
    'boost_from_average': False,
    'boosting_type': 'Plain',
    #'bootstrap_type': 'No',  # Changed from 'MVS'
    'border_count': 1024, #254,
    #'class_names': ['Data Scientist', 'Software Engineer'],
    'classes_count': 0,
    'combinations_ctr': ['Borders:CtrBorderCount=15:CtrBorderType=Uniform:TargetBorderCount=1:TargetBorderType=MinEntropy:Prior=0/1:Prior=0.5/1:Prior=1/1',
     'Counter:CtrBorderCount=15:CtrBorderType=Uniform:Prior=0/1'],
    'counter_calc_method': 'SkipTest',
    'ctr_leaf_count_limit': 18446744073709551615,
    'ctr_target_border_count': 1,
    #'depth': None,  # Changed from 1; passed explicitly per model instead
    'eval_fraction': 0,
    'eval_metric': 'Logloss',
    'feature_border_type': 'GreedyLogSum',
    'fold_permutation_block': 0,
    #'force_unit_auto_pair_weights': False,
    'grow_policy': 'SymmetricTree',
    'has_time': False,
    #'iterations': 1000,  # Increased from 1
    'l2_leaf_reg': 0,  # Reduced from 3
    'leaf_estimation_backtracking': 'No',  # Changed from 'AnyImprovement'
    'leaf_estimation_iterations': 10,  # Increased from 1
    'leaf_estimation_method': 'Newton',
    'learning_rate': 1,  # Increased from 0.5
    'loss_function': 'Logloss',
    'max_ctr_complexity': 8,  # Increased from 1
    'max_leaves': None, #31,  # Increased from 2
    'min_data_in_leaf': 1,
    'model_shrink_mode': 'Constant',
    'model_shrink_rate': 0,
    'model_size_reg': 0,  # Reduced from 0.5
    'nan_mode': 'Min',
    'one_hot_max_size': 2,
    'penalties_coefficient': 1,
    #'permutation_count': 1,  # Reduced from 4
    #'pool_metainfo_options': {'tags': {}},
    'posterior_sampling': False,
    'random_score_type': 'NormalWithModelSizeDecrease',
    'random_seed': 0,
    'random_strength': 0,  # Reduced from 1
    'rsm': 1,
    'sampling_frequency': 'PerTree',
    'score_function': 'Cosine',
    'simple_ctr': ['Borders:CtrBorderCount=15:CtrBorderType=Uniform:TargetBorderCount=1:TargetBorderType=MinEntropy:Prior=0/1:Prior=0.5/1:Prior=1/1',
     'Counter:CtrBorderCount=15:CtrBorderType=Uniform:Prior=0/1'],
    'sparse_features_conflict_fraction': 0,
    'store_all_simple_ctr': False,
    'subsample': 1,  # Increased from 0.800000011920929
    'task_type': 'CPU',
    'use_best_model': False
}


In [None]:
# Intentional failure: depth=100 is requested to show the error.
# fails because CatBoost has a depth limit of 16
hi_variance = catboost.CatBoostClassifier(iterations=1, depth=100, **var_settings,
                                            cat_features=list(X_train.columns))
hi_variance.fit(X_train, y_train)

In [None]:
# A single deep tree built with the low-regularization settings in var_settings.
# plotting the tree fails if the depth > 13 (on my mac)
hi_variance = catboost.CatBoostClassifier(iterations=1, depth=13, **var_settings,
                        cat_features=list(X_train.columns))
hi_variance.fit(X_train, y_train)


In [None]:
hi_variance.score(X_test, y_test)

In [None]:
hi_variance.score(X_train, y_train)

In [None]:
print(f'Res={hi_variance._object._is_oblivious()}')

In [None]:
# Note that the tree is "oblivious" (symmetric): every node on a given level
# of the tree uses the same split.
# - These work better with categorical data
# - They are also faster to train
# - Easier to regularize
hi_variance.plot_tree(tree_idx=0, pool=pool)

## Goldilocks

We want a model that is just right.

In [None]:
import matplotlib.pyplot as plt

# Fit a single-tree model at each depth and record accuracy on both splits
depths = range(1, 16)
train_scores = []
test_scores = []
for depth in depths:
    goldilocks = catboost.CatBoostClassifier(
        iterations=1,
        depth=depth,
        cat_features=list(X_train.columns),
        **var_settings,
    )
    goldilocks.fit(X_train, y_train)
    train_scores.append(goldilocks.score(X_train, y_train))
    test_scores.append(goldilocks.score(X_test, y_test))

# Train vs. test accuracy as a function of depth
plt.plot(depths, train_scores, label='train')
plt.plot(depths, test_scores, label='test')
plt.xlabel('depth')
plt.ylabel('accuracy')
plt.legend()

In [None]:
# Single tree at depth 5 with the same settings; the cell displays its test accuracy
gold5 = catboost.CatBoostClassifier(iterations=1, depth=5, **var_settings,
                                    cat_features=list(X_train.columns))
gold5.fit(X_train, y_train)
gold5.score(X_test, y_test)

In [None]:
gold5.score(X_train, y_train)

## CatBoost

CatBoost uses *boosting* to train a series of symmetric (oblivious) decision trees, where each tree tries to correct the mistakes made by the previous one. For classification tasks, this process refines the prediction into a probability.

Imagine it like golfing: after each shot (tree), you adjust your next move to get closer to the hole (correct prediction). In contrast, a decision tree is like hitting one ball and stopping. A random forest would be like taking multiple tee shots and averaging them to find the best one.

- *Automatic Handling of Categorical Features*: CatBoost natively supports categorical features, encoding them in a highly efficient way without the need for preprocessing.
- *Missing Value Support*: CatBoost handles missing data for numeric data automatically. (It does not support missing categorical data.)
- *Overfitting Detection*: To prevent overfitting, CatBoost can stop training early when it detects that the model is no longer improving.
- *Embedding Features*: You can specify that a group of columns should be treated as an embedding feature, which can improve the model's performance.


In [None]:
# This is the out-of-the-box model: only the categorical columns and
# verbose=False are set. Note that it performs much better than
# goldilocks
cat1 = catboost.CatBoostClassifier(cat_features=list(X_train.columns), verbose=False)
cat1.fit(X_train, y_train)
cat1.score(X_test, y_test)

In [None]:
cat1.score(X_train, y_train)

In [None]:
# plot tree 1
cat1.plot_tree(tree_idx=0, pool=pool)


In [None]:
# plot tree 2
cat1.plot_tree(tree_idx=1, pool=pool)


In [None]:
X_test.iloc[0]

In [None]:
print([x for x in dir(cat1) if x.endswith('_')])

## Tree Hyperparameters

## **Model Architecture and Tree Structure**

- **`boosting_type='Plain'`**: Boosting scheme. `'Plain'` is the classic gradient boosting scheme; the alternative, `'Ordered'`, uses CatBoost's ordered boosting.
  
- **`grow_policy='SymmetricTree'`**: Strategy for growing trees. Symmetric trees split all branches at a given depth on the same feature, ensuring a balanced structure.

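- **`depth`** (left out of the settings): Depth of the trees. The key is commented out of `var_settings` so that `depth` can be passed explicitly for each model. Depth is not unlimited: CatBoost rejects values above its limit of 16. *Changed from 1*.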

- **`max_leaves=None`**: The maximum number of leaves in a tree. *Increased from 2 to None* (no limit).

---

## **Feature Sampling and Processing**

- **`rsm=1`**: Random subspace method. Controls the fraction of features considered for splits. Set to 1, meaning all features are used for splitting.

- **`border_count=1024`**: Number of splits for numeric features. A high value (1024) allows more precise splits. *Increased from 254*.

- **`max_ctr_complexity=8`**: Maximum complexity of combinatorial feature transformations for categorical features. *Increased from 1*.

- **`feature_border_type='GreedyLogSum'`**: Strategy for selecting borders (thresholds) when binning numeric features. `'GreedyLogSum'` uses a greedy logarithmic sum approach.

- **`one_hot_max_size=2`**: Maximum number of unique categorical values for which one-hot encoding is used. For categories larger than this, other methods are applied.

---

## **Categorical Feature Handling**

- **`combinations_ctr=['Borders:CtrBorderCount=15:CtrBorderType=Uniform:TargetBorderCount=1:TargetBorderType=MinEntropy:Prior=0/1:Prior=0.5/1:Prior=1/1', 'Counter:CtrBorderCount=15:CtrBorderType=Uniform:Prior=0/1']`**: Control how categorical features are processed using combinatorial counters and borders.
  
- **`ctr_leaf_count_limit=18446744073709551615`**: Limits the maximum number of leaves used for categorical feature transformation. The large value effectively disables this limit.

- **`ctr_target_border_count=1`**: Sets the number of target borders used in categorical target statistics. Helps to limit the complexity of categorical features.

---

## **Regularization and Overfitting Prevention**

- **`l2_leaf_reg=0`**: L2 regularization coefficient applied to leaf values. A lower value like 0 reduces regularization, allowing more variance. *Reduced from 3*.

- **`random_strength=0`**: Strength of random noise added to splits' scores. Zero means no randomization, making the model more deterministic. *Reduced from 1*.

- **`model_shrink_rate=0`**: Shrinkage rate for model weights during training. Zero means no shrinkage. *Reduced from 0.5*.

- **`model_size_reg=0`**: Regularization coefficient for model size. A lower value like 0 disables this regularization. *Reduced from 0.5*.

---

## **Tree Estimation and Leaf Calculations**

- **`leaf_estimation_iterations=10`**: Number of gradient steps used to estimate the leaf values. *Increased from 1 to 10*.

- **`leaf_estimation_method='Newton'`**: Method for leaf value estimation. Uses Newton-Raphson method for faster convergence.

- **`leaf_estimation_backtracking='No'`**: Specifies backtracking behavior during leaf value estimation. `'No'` disables any backtracking. *Changed from 'AnyImprovement'*.

---

## **Learning Rate and Convergence**

- **`learning_rate=1`**: Step size used in gradient boosting. A higher learning rate speeds up training but risks overfitting. *Increased from 0.5*.

---

## **Loss Function and Metrics**

- **`loss_function='Logloss'`**: Loss function used for binary classification, which minimizes the log loss.

- **`eval_metric='Logloss'`**: Metric to evaluate during model training, matching the loss function used.

---

## **Data Handling**

- **`auto_class_weights='None'`**: No automatic class weight balancing. This assumes that class distribution in the data is balanced.

- **`eval_fraction=0`**: Fraction of data used for evaluation. A value of `0` indicates no separate evaluation set.

- **`subsample=1`**: Fraction of data used for training each tree. A value of 1 means no subsampling. *Increased from 0.8*.

---

## **Cross-Validation and Permutations**

- **`fold_permutation_block=0`**: Specifies block size for folding data during cross-validation. A value of 0 disables folding by blocks.

- **`counter_calc_method='SkipTest'`**: Method used for calculating counter features. `'SkipTest'` means counters are calculated only on training data.

- **`posterior_sampling=False`**: Disables posterior sampling for Bayesian-like updates of leaf values.

---

## **Scoring and Bootstrapping**

- **`score_function='Cosine'`**: The scoring function used to evaluate splits. `'Cosine'` computes the cosine similarity between vectors.

- **`sampling_frequency='PerTree'`**: Frequency for resampling data. `PerTree` means resampling occurs once per tree, which adds randomness.

---

## **Bootstrapping and Shrinkage**

- **`boost_from_average=False`**: Whether to initialize leaf values from the mean of the target variable. Setting it to `False` can help with imbalanced datasets.

- **`best_model_min_trees=1`**: Minimum number of trees required to determine the best model. *Set to 1*.

---

## **Randomness and Reproducibility**

- **`random_seed=0`**: Seed for random number generation to ensure reproducibility.

- **`random_score_type='NormalWithModelSizeDecrease'`**: Type of randomness to introduce in scoring during splits. This type introduces noise proportional to model size.

---

## **Additional Configurations**

- **`penalties_coefficient=1`**: Penalty coefficient applied to regularization terms, set to 1 for no additional penalties.

- **`task_type='CPU'`**: Specifies the type of processor to use for training (CPU-based training).

- **`use_best_model=False`**: Disables the automatic selection of the best model during training.

- **`nan_mode='Min'`**: Specifies how to handle missing values (NaN). `'Min'` treats missing values as the smallest possible values.

---

## Which Parameters to tune?

| Parameter | Description | CB1 | CB2 | CB_tut | AWS | Forecastegy | Optuna |
| --- | --- | --- | --- | --- | --- | --- | --- |
| `learning_rate` | Step size used in gradient boosting | X | X | X | X | X | |
| `random_strength` | Strength of random noise added to splits' scores | X | X |  | X | | |
| `one_hot_max_size` | Maximum number of unique categorical values for which one-hot encoding is used | X |  |  |  | | |
| `l2_leaf_reg` | L2 regularization coefficient applied to leaf values | X | X | X | X | | |
| `bagging_temperature` | Controls the intensity of sampling | X | X |  |  |  | |
| `iterations` | Number of trees to build | X | X |  |  | X | |
| `use_best_model` | Use the best model found during training |  | X |  |  | | |
| `eval_metric` | Metric to evaluate during model training |  | X |  |  | | X |
| `od_type` | Type of overfitting detector to use |  | X |  |  | | |
| `od_pval` | Threshold for the overfitting detector |  | X |  |  | | |
| `od_wait` | Number of iterations to continue training after overfitting is detected |  | X |  |  | | |
| `depth` | Depth of the trees |  | X | X | X | X | X |
| `border_count` | Number of splits for numeric features |  | X |  |  | | |
| `has_time` | Use the order of the rows in the input data (the data has a time order) instead of random permutations |  | X |  |  | | |
| `grow_policy` | Strategy for growing trees |  | X |  |  | | |
| `min_data_in_leaf` | Minimum number of training samples in a leaf |  | X |  |  | X | |
| `max_leaves` | Maximum number of leaves in a tree |  | X |  |  | | |
| `per_float_feature_quantization` | Quantization settings (e.g. border count) for specific numerical features |  | X |  |  | | |
| `max_ctr_complexity` | Maximum complexity of combinatorial feature transformations for categorical features |  |  | X |  | |  |  
| `boosting_type` | Type of boosting algorithm |  |  | X |  | |  X |
| `subsample` | Fraction of data used for training each tree |  |  |  |  | X | |
| `col_sample_bylevel` | Fraction of features to consider for each level |  |  |  |  | X | X |
| `bootstrap_type` | Sampling method for bagging |  |  |  |  | |  X |
| `used_ram_limit` | Maximum amount of RAM to use for training |  |  |  |  | |  X |
| `objective` | Objective function to optimize |  |  |  |  | | X |


Taken from:
- CB1 - CatBoost PDF 
- CB2 - https://catboost.ai/en/docs/concepts/parameter-tuning
- CB_tut - https://github.com/catboost/tutorials/blob/master/hyperparameters_tuning/hyperparameters_tuning_using_optuna_and_hyperopt.ipynb
- AWS - https://docs.aws.amazon.com/sagemaker/latest/dg/catboost-tuning.html
- Forecastegy - https://forecastegy.com/posts/catboost-hyperparameter-tuning-guide-with-optuna/
- Optuna - https://github.com/optuna/optuna-examples/blob/main/catboost/catboost_pruning.py

In [None]:
# takes about 45 seconds to run
import matplotlib.pyplot as plt
from yellowbrick import model_selection as ms
from yellowbrick.utils import types, helpers
from yellowbrick import base

# Replace yellowbrick's get_model_name so every model is reported as 'CatBoost'.
# NOTE(review): presumably needed because yellowbrick cannot name a CatBoost estimator itself — confirm.
base.get_model_name = lambda model: 'CatBoost'
# Disabled debugging aid: make yellowbrick treat any model as an estimator.
#helpers.is_estimator = lambda model: print(f'calling {model=}') or True

fig, ax = plt.subplots(figsize=(8,4))
# Validation curve for depth 1-5 on the full X, y.
# `ax` is not passed; NOTE(review): presumably the plot lands on the axes just created — confirm.
ms.validation_curve(catboost.CatBoostClassifier(cat_features=list(X_train.columns), verbose=False),
                    X, y, param_name='depth', param_range=range(1,6))
            

In [None]:
# takes about 90 seconds to run
import matplotlib.pyplot as plt
from yellowbrick import model_selection as ms
from yellowbrick.utils import types, helpers
from yellowbrick import base

# Replace yellowbrick's get_model_name so every model is reported as 'CatBoost'.
base.get_model_name = lambda model: 'CatBoost'
# Disabled debugging aid: make yellowbrick treat any model as an estimator.
#helpers.is_estimator = lambda model: print(f'calling {model=}') or True

fig, ax = plt.subplots(figsize=(8,4))
# Validation curve for l2_leaf_reg over a wide range (0 to 100) on the full X, y.
ms.validation_curve(catboost.CatBoostClassifier(cat_features=list(X_train.columns), verbose=False),
                    X, y, param_name='l2_leaf_reg', param_range=[0, 1, 3, 5, 10, 100])
            

In [None]:
# takes about 90 seconds to run
import matplotlib.pyplot as plt
from yellowbrick import model_selection as ms
from yellowbrick.utils import types, helpers
from yellowbrick import base

# Replace yellowbrick's get_model_name so every model is reported as 'CatBoost'.
base.get_model_name = lambda model: 'CatBoost'
# Disabled debugging aid: make yellowbrick treat any model as an estimator.
#helpers.is_estimator = lambda model: print(f'calling {model=}') or True

fig, ax = plt.subplots(figsize=(8,4))
# Same curve for l2_leaf_reg, zoomed in on the 8-20 range.
ms.validation_curve(catboost.CatBoostClassifier(cat_features=list(X_train.columns), verbose=False),
                    X, y, param_name='l2_leaf_reg', param_range=[8, 10, 12, 20])
            

In [None]:
# takes about 90 seconds to run
import matplotlib.pyplot as plt
from yellowbrick import model_selection as ms
from yellowbrick.utils import types, helpers
from yellowbrick import base

# Replace yellowbrick's get_model_name so every model is reported as 'CatBoost'.
base.get_model_name = lambda model: 'CatBoost'
# Disabled debugging aid: make yellowbrick treat any model as an estimator.
#helpers.is_estimator = lambda model: print(f'calling {model=}') or True

fig, ax = plt.subplots(figsize=(8,4))
# Validation curve for min_data_in_leaf on the full X, y.
ms.validation_curve(catboost.CatBoostClassifier(cat_features=list(X_train.columns), verbose=False),
                    X, y, param_name='min_data_in_leaf', param_range=[1, 2, 5, 10, 20])
            

In [None]:
%%time
from sklearn import model_selection
# this takes a while to run (about 2 minutes)
# The grid below has 2*3*3*3*2 = 108 combinations, each fit with 3-fold CV.
# `scoring` can be set in GridSearchCV to
# recall, precision, f1, accuracy
params = {
          'learning_rate': [.1, .3], # step size of each boosting iteration
          'random_strength': [.5, 1, 2],
          'one_hot_max_size': [1, 32, 64],
          'l2_leaf_reg': [0, 1, 2],
          
          'bagging_temperature': [0, 1],
          #'early_stopping_rounds':[10],
          'n_estimators': [200]}
cb3 = catboost.CatBoostClassifier(cat_features=list(X_train.columns), verbose=False)
# NOTE(review): the search is fit on the full X, y, which includes the rows
# later used as X_test; consider fitting on X_train, y_train to avoid
# selecting hyperparameters with test data.
cv = (model_selection.GridSearchCV(cb3, params, cv=3)#, n_jobs=-1)
    .fit(X, y)
    # Optional fit arguments (disabled): an eval_set with early stopping, e.g.
    #     eval_set=[(X_test, y_test)], early_stopping_rounds=5, verbose=10
     )

In [None]:
# Best hyperparameter combination found by the GridSearchCV stored in `cv`
# (the name `bcv` is never defined, so it raised NameError).
cv.best_params_

In [None]:
# Hard-coded hyperparameters (NOTE(review): presumably copied from the grid
# search's best_params_ — confirm), so this cell can run without repeating the search.
params = {'bagging_temperature': 0,
 'l2_leaf_reg': 2,
 'learning_rate': 0.1,
 'n_estimators': 200,
 'one_hot_max_size': 1,
 'random_strength': 0.5}
# Fit on the training split only and display accuracy on the held-out split
cb4 = catboost.CatBoostClassifier(**params, cat_features=list(X_train.columns), verbose=False)
cb4.fit(X_train, y_train)
cb4.score(X_test, y_test)

In [None]:
# Model with default hyperparameters for comparison; verbose=True prints the
# per-iteration training log
cb_def = catboost.CatBoostClassifier( cat_features=list(X_train.columns), verbose=True)
cb_def.fit(X_train, y_train)
cb_def.score(X_test, y_test)

342:	learn: 0.4791788	total: 787ms	remaining: 1.51s
343:	learn: 0.4790781	total: 789ms	remaining: 1.5s
344:	learn: 0.4790209	total: 792ms	remaining: 1.5s
345:	learn: 0.4787917	total: 795ms	remaining: 1.5s
346:	learn: 0.4787917	total: 795ms	remaining: 1.5s
347:	learn: 0.4786865	total: 799ms	remaining: 1.5s
348:	learn: 0.4786023	total: 808ms	remaining: 1.51s
349:	learn: 0.4783452	total: 810ms	remaining: 1.5s
350:	learn: 0.4780642	total: 812ms	remaining: 1.5s
351:	learn: 0.4778878	total: 815ms	remaining: 1.5s
352:	learn: 0.4777294	total: 820ms	remaining: 1.5s
353:	learn: 0.4776565	total: 826ms	remaining: 1.51s
354:	learn: 0.4774580	total: 832ms	remaining: 1.51s
355:	learn: 0.4773928	total: 841ms	remaining: 1.52s
356:	learn: 0.4772339	total: 912ms	remaining: 1.64s
357:	learn: 0.4771537	total: 923ms	remaining: 1.65s
358:	learn: 0.4768508	total: 944ms	remaining: 1.68s
359:	learn: 0.4767956	total: 947ms	remaining: 1.68s
360:	learn: 0.4765704	total: 950ms	remaining: 1.68s
361:	learn: 0.4765070

527:	learn: 0.4539446	total: 1.37s	remaining: 1.23s
528:	learn: 0.4539335	total: 1.37s	remaining: 1.22s
529:	learn: 0.4536430	total: 1.38s	remaining: 1.22s
530:	learn: 0.4533376	total: 1.38s	remaining: 1.22s
531:	learn: 0.4531387	total: 1.38s	remaining: 1.22s
532:	learn: 0.4530995	total: 1.38s	remaining: 1.21s
533:	learn: 0.4530374	total: 1.39s	remaining: 1.21s
534:	learn: 0.4528198	total: 1.39s	remaining: 1.21s
535:	learn: 0.4527700	total: 1.39s	remaining: 1.2s
536:	learn: 0.4525953	total: 1.39s	remaining: 1.2s
537:	learn: 0.4524333	total: 1.4s	remaining: 1.2s
538:	learn: 0.4523671	total: 1.4s	remaining: 1.2s
539:	learn: 0.4523319	total: 1.4s	remaining: 1.19s
540:	learn: 0.4522350	total: 1.4s	remaining: 1.19s
541:	learn: 0.4521543	total: 1.41s	remaining: 1.19s
542:	learn: 0.4520414	total: 1.41s	remaining: 1.19s
543:	learn: 0.4519647	total: 1.41s	remaining: 1.18s
544:	learn: 0.4518761	total: 1.41s	remaining: 1.18s
545:	learn: 0.4517504	total: 1.42s	remaining: 1.18s
546:	learn: 0.451652

690:	learn: 0.4326579	total: 1.77s	remaining: 791ms
691:	learn: 0.4325782	total: 1.77s	remaining: 788ms
692:	learn: 0.4325767	total: 1.77s	remaining: 785ms
693:	learn: 0.4325277	total: 1.77s	remaining: 783ms
694:	learn: 0.4324034	total: 1.78s	remaining: 780ms
695:	learn: 0.4322107	total: 1.78s	remaining: 778ms
696:	learn: 0.4321532	total: 1.78s	remaining: 775ms
697:	learn: 0.4320413	total: 1.78s	remaining: 772ms
698:	learn: 0.4318441	total: 1.79s	remaining: 770ms
699:	learn: 0.4318009	total: 1.79s	remaining: 767ms
700:	learn: 0.4317142	total: 1.79s	remaining: 764ms
701:	learn: 0.4316562	total: 1.79s	remaining: 762ms
702:	learn: 0.4314328	total: 1.8s	remaining: 759ms
703:	learn: 0.4312737	total: 1.8s	remaining: 757ms
704:	learn: 0.4311999	total: 1.8s	remaining: 754ms
705:	learn: 0.4311410	total: 1.8s	remaining: 751ms
706:	learn: 0.4310753	total: 1.81s	remaining: 749ms
707:	learn: 0.4309722	total: 1.81s	remaining: 746ms
708:	learn: 0.4309316	total: 1.81s	remaining: 744ms
709:	learn: 0.43

914:	learn: 0.4072654	total: 2.36s	remaining: 219ms
915:	learn: 0.4072357	total: 2.36s	remaining: 217ms
916:	learn: 0.4070926	total: 2.36s	remaining: 214ms
917:	learn: 0.4070209	total: 2.37s	remaining: 211ms
918:	learn: 0.4069820	total: 2.37s	remaining: 209ms
919:	learn: 0.4068667	total: 2.37s	remaining: 206ms
920:	learn: 0.4068289	total: 2.38s	remaining: 204ms
921:	learn: 0.4067341	total: 2.38s	remaining: 201ms
922:	learn: 0.4065894	total: 2.38s	remaining: 199ms
923:	learn: 0.4064286	total: 2.39s	remaining: 196ms
924:	learn: 0.4064278	total: 2.39s	remaining: 194ms
925:	learn: 0.4063342	total: 2.39s	remaining: 191ms
926:	learn: 0.4062606	total: 2.4s	remaining: 189ms
927:	learn: 0.4061754	total: 2.4s	remaining: 186ms
928:	learn: 0.4060492	total: 2.4s	remaining: 184ms
929:	learn: 0.4060090	total: 2.41s	remaining: 181ms
930:	learn: 0.4059671	total: 2.41s	remaining: 179ms
931:	learn: 0.4058540	total: 2.41s	remaining: 176ms
932:	learn: 0.4055985	total: 2.41s	remaining: 173ms
933:	learn: 0.4

0.7559681697612732

In [None]:
# note with tuned XGBoost model, I get 0.7253814147018031

## Model Evaluation
Now that we've tuned our model, let's look at how it performs

In [None]:
# Accuracy of cb4's predictions on the test split.
from sklearn import metrics
metrics.accuracy_score(y_test, cb4.predict(X_test))

In [None]:
# NOTE(review): no pos_label is given here, while the cells below pass
# pos_label='Software Engineer'. With string class labels this call
# presumably raises, because the default positive label does not match any
# class -- confirm this is the intended demonstration.
metrics.precision_score(y_test, cb4.predict(X_test))

In [None]:
cb4.classes_

In [None]:
# Precision on the test split, treating 'Software Engineer' as the positive class.
metrics.precision_score(y_test, cb4.predict(X_test), pos_label='Software Engineer')

In [None]:
# Recall on the test split, treating 'Software Engineer' as the positive class.
metrics.recall_score(y_test, cb4.predict(X_test), pos_label='Software Engineer')

In [None]:
# F1 score on the test split, treating 'Software Engineer' as the positive class.
metrics.f1_score(y_test, cb4.predict(X_test), pos_label='Software Engineer')

In [None]:
# Confusion matrix for cb4 on the test split, drawn on the axes created here.
fig, ax = plt.subplots(figsize=(8,4))
metrics.ConfusionMatrixDisplay.from_estimator(cb4,
                       X_test, y_test,ax=ax)

In [None]:
# ROC curve for cb4 on the test split only.
fig, ax = plt.subplots(figsize=(8,4))
metrics.RocCurveDisplay.from_estimator(cb4,
                       X_test, y_test,ax=ax)


In [None]:
# Overlay the test and train ROC curves for cb4 on one axes.
# Each curve is named explicitly: without `name`, both legend entries carry
# the estimator's class name and can only be told apart by their AUC value.
fig, ax = plt.subplots(figsize=(8,4))
metrics.RocCurveDisplay.from_estimator(cb4,
                       X_test, y_test, ax=ax, name='cb4 (test)')
metrics.RocCurveDisplay.from_estimator(cb4,
                       X_train, y_train, ax=ax, name='cb4 (train)')

In [None]:
# Overlay the test and train ROC curves for the `stump` model on one axes.
# Each curve is named explicitly: without `name`, both legend entries carry
# the estimator's class name and can only be told apart by their AUC value.
fig, ax = plt.subplots(figsize=(8,4))
metrics.RocCurveDisplay.from_estimator(stump,
                       X_test, y_test, ax=ax, name='stump (test)')
metrics.RocCurveDisplay.from_estimator(stump,
                       X_train, y_train, ax=ax, name='stump (train)')

In [None]:
# Overlay the test and train ROC curves for the `hi_variance` model on one axes.
# Each curve is named explicitly: without `name`, both legend entries carry
# the estimator's class name and can only be told apart by their AUC value.
fig, ax = plt.subplots(figsize=(8,4))
metrics.RocCurveDisplay.from_estimator(hi_variance,
                       X_test, y_test, ax=ax, name='hi_variance (test)')
metrics.RocCurveDisplay.from_estimator(hi_variance,
                       X_train, y_train, ax=ax, name='hi_variance (train)')

In [None]:
# Overlay the test and train precision-recall curves for cb4 on one axes.
# Each curve is named explicitly: without `name`, both legend entries carry
# the estimator's class name, so the two splits are hard to tell apart.
fig, ax = plt.subplots(figsize=(8,4))
metrics.PrecisionRecallDisplay.from_estimator(cb4,
                       X_test, y_test, ax=ax, name='cb4 (test)')
metrics.PrecisionRecallDisplay.from_estimator(cb4,
                       X_train, y_train, ax=ax, name='cb4 (train)')

## Tuning

Tune for different metrics

In [None]:
# takes about 45 seconds to run
import matplotlib.pyplot as plt
from yellowbrick import model_selection as ms
from yellowbrick.utils import types, helpers
from yellowbrick import base

# Replace yellowbrick's model-name lookup so it always reports 'CatBoost'
# (presumably the stock lookup does not cope with CatBoost estimators -- TODO confirm).
base.get_model_name = lambda model: 'CatBoost'
#helpers.is_esitmator = lambda model: print(f'calling {model=}') or True

# Validation curve over depth 1..5 on the full X, y, with every column
# declared categorical. `ax` is not passed to validation_curve; presumably
# yellowbrick draws on the current axes created here -- confirm.
fig, ax = plt.subplots(figsize=(8,4))
ms.validation_curve(catboost.CatBoostClassifier(cat_features=list(X_train.columns), verbose=False),
                    X, y, param_name='depth', param_range=range(1,6))
            

In [None]:
# Validation curve over depth 1..5, scored by recall.
# NOTE(review): `pos_label` is handed to validation_curve here rather than to
# the scorer; presumably scoring='recall' still uses its default positive
# label, which would explain why the following cells binarize y instead -- confirm.
fig, ax = plt.subplots(figsize=(8,4))
ms.validation_curve(catboost.CatBoostClassifier(cat_features=list(X_train.columns), verbose=False),
                    X, y, param_name='depth', param_range=range(1,6),
                    scoring='recall', pos_label='Software Engineer')
            

In [None]:
# Validation curve over depth 1..5, scored by recall.
# The target is binarized (True where y is 'Software Engineer') so the
# string-named 'recall' scorer can be used without a pos_label.
fig, ax = plt.subplots(figsize=(8,4))
ms.validation_curve(catboost.CatBoostClassifier(cat_features=list(X_train.columns), verbose=False),
                    X, y=='Software Engineer', param_name='depth', param_range=range(1,6),
                    scoring='recall')
            

In [None]:
# Validation curve over depth 1..5, scored by precision.
# The target is binarized (True where y is 'Software Engineer') so the
# string-named 'precision' scorer can be used without a pos_label.
fig, ax = plt.subplots(figsize=(8,4))
ms.validation_curve(catboost.CatBoostClassifier(cat_features=list(X_train.columns), verbose=False),
                    X, y=='Software Engineer', param_name='depth', param_range=range(1,6),
                    scoring='precision')
            

## Model Interpretation

In [None]:
# Refit cb4 with a fixed set of hyperparameters (all columns treated as
# categorical), then report its score on the test split.
params = dict(
    bagging_temperature=0,
    l2_leaf_reg=2,
    learning_rate=0.1,
    n_estimators=200,
    one_hot_max_size=1,
    random_strength=0.5,
)
cb4 = catboost.CatBoostClassifier(
    cat_features=list(X_train.columns),
    verbose=False,
    **params,
)
cb4.fit(X_train, y_train)
cb4.score(X_test, y_test)

In [None]:
cb4.feature_importances_

In [None]:
pd.Series(cb4.feature_importances_, index=X_train.columns)

In [None]:
# Horizontal bar chart of cb4's feature importances, labelled by column
# name and sorted ascending.
(pd.Series(cb4.feature_importances_, index=X_train.columns)
 .sort_values()
 .plot.barh()
)

In [None]:
def add_ranks(df_):
    """Return a copy of ``df_`` with a ``<column>_rank`` column per column.

    Ranks are computed per column in descending order, so the largest
    value in a column gets rank 1. The input frame is not modified.
    """
    rank_columns = {}
    for name in df_.columns:
        rank_columns[f'{name}_rank'] = df_[name].rank(ascending=False)
    return df_.assign(**rank_columns)
    
# Compare several CatBoost importance types side by side, as ranks.
# The Pool bundles the training data with every column declared categorical.
pool = catboost.Pool(X_train, y_train, cat_features=list(X_train.columns))
# One column per importance type, one row per feature.
(pd.DataFrame({typ:cb4.get_feature_importance(type=typ, data=pool)
             for typ in ['PredictionValuesChange',
                        'LossFunctionChange',
                'FeatureImportance', 
                         #'ShapValues',
        #'ShapInteractionValues',
        #'Interaction',
        #'PredictionDiff',
        #'SageValues'
                        ]},
            index=X_train.columns)
 .pipe(add_ranks)  # append a `<type>_rank` column for each importance type
 .sort_values(by='FeatureImportance_rank')
 .loc[:, 'PredictionValuesChange_rank':]  # keep only the appended rank columns
 .plot.barh()
)

## SHAP (SHapley Additive exPlanations)
Should be *globally* consistent and accurate

 Shapley value (SHAP).
 
 From game theory, indicates how to distribute attribution of label



In [None]:
import shap
# make sure you initialize the js side
shap.initjs()

# Build a tree explainer for cb4 and compute SHAP values for the test split.
shap_ex = shap.TreeExplainer(cb4)
vals = shap_ex(X_test)

In [None]:
vals

In [None]:
shap.plots.beeswarm(vals, alpha=.5)

In [None]:
# specify interaction
# NOTE(review): the interaction colouring and `ax=ax` are commented out
# below, so the axes created here is not passed to shap; presumably shap
# draws on its own figure -- confirm.
fig, ax = plt.subplots(figsize=(8,3))
shap.plots.scatter(vals[:,'major'])#, color=vals[:, 'education'], ax=ax, x_jitter=.5, alpha=.5)  

In [None]:
# shap doesn't like categories 😭
import seaborn as sns

# Build one frame holding each feature's SHAP value (as `<col>_shap`) next to
# the original feature value, then plot the `major` SHAP values per category.
(pd.DataFrame(vals.values, columns=X_test.columns, index=X_test.index)
 .rename(columns=lambda col: f'{col}_shap')
 .assign(base_values=vals.base_values, **X_test)  # add the raw feature columns back
 .pipe(lambda df_:
       sns.catplot(y='major', x='major_shap', data=df_, 
                   aspect=2, height=10,
                   #hue='education'
                  )
      )
)

In [None]:
# shap doesn't like categories 😭
import seaborn as sns

# Build one frame holding each feature's SHAP value (as `<col>_shap`) next to
# the original feature value, then plot the `r` SHAP values per category.
(pd.DataFrame(vals.values, columns=X_test.columns, index=X_test.index)
 .rename(columns=lambda col: f'{col}_shap')
 .assign(base_values=vals.base_values, **X_test)  # add the raw feature columns back
 .pipe(lambda df_:
       sns.catplot(y='r', x='r_shap', data=df_, 
                   aspect=2, height=10,
                   #hue='education'
                  )
      )
)

In [None]:
# shap doesn't like categories 😭
import seaborn as sns

# Build one frame holding each feature's SHAP value (as `<col>_shap`) next to
# the original feature value, then plot the `education` SHAP values per
# category, coloured by the value of `r`.
(pd.DataFrame(vals.values, columns=X_test.columns, index=X_test.index)
 .rename(columns=lambda col: f'{col}_shap')
 .assign(base_values=vals.base_values, **X_test)  # add the raw feature columns back
 .pipe(lambda df_:
       sns.catplot(y='education', x='education_shap', data=df_, 
                   aspect=2, height=10,
                   hue='r'
                  )
      )
)

In [None]:
# blue - DS (presumably Data Scientist)
# red - SE (presumably Software Engineer)

# Waterfall plot of the SHAP contributions for the first row of X_test.
shap.initjs()
shap.plots.waterfall(vals[0])

In [None]:
cb4.predict(X_test.iloc[[0]])

In [None]:
X_test.iloc[[0]]

# Summary

CatBoost is very powerful.

Explore your data and your results.

Suggestions:

* Pandas skills come in useful for manipulating data
* Make sure you discuss business value with stakeholders


Questions?


Connect on LinkedIn or Twitter `@__mharrison__`

In [None]:
import random
# Draw a random integer from 0 to 11 inclusive.
random.randrange(0,12)

In [None]:
# Draw a random integer from 0 to 3 inclusive.
random.randrange(0,4)