# **Catboost Regression**

* Developed by the Russian company Yandex and released in 2017
* An algorithm with a built-in ability to handle categorical features with ease (which makes certain feature engineering tasks, like one-hot encoding, unnecessary)
* [Nice resource](https://towardsdatascience.com/catboost-regression-in-6-minutes-3487f3e5b329) that I have been using to learn CatBoost
* November 2021 update: [been using this too](https://towardsdatascience.com/how-do-you-use-categorical-features-directly-with-catboost-947b211c2923)

Noah Rubin - July 2021

In [1]:
# Python files
import data_prep
import helper_funcs

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from catboost import CatBoostRegressor, Pool
from sklearn.impute import KNNImputer

# Personal display settings
#===========================

# Suppress scientific notation
np.set_printoptions(suppress=True)

# Get dataset values showing only 2dp
pd.options.display.float_format = '{:.2f}'.format
pd.set_option('display.max_colwidth', None)

plt.style.use('seaborn-whitegrid') 
%matplotlib inline

%load_ext autoreload
%autoreload 2

In [2]:
# Load the train/test splits from disk
train = pd.read_csv('../datasets/train_updated.csv')
test = pd.read_csv('../datasets/test_updated.csv')

# Target is life expectancy; 'HDI' is removed from the predictors along with it
to_drop = ['HDI', 'Life_exp']

y_train = train['Life_exp']
y_test = test['Life_exp']

X_train = train.drop(columns=to_drop)
X_test = test.drop(columns=to_drop)

## Build Model

In [4]:
# Columns CatBoost should treat as categorical (handled natively, no manual encoding)
cat_cols = ['Country', 'Status']

# Wrap the frames in CatBoost Pools; the test pool carries features only
training_set = Pool(data=X_train, label=y_train, cat_features=cat_cols)
test_set = Pool(data=X_test, cat_features=cat_cols)

# Hyperparameters for the regressor, gathered in one place
hyperparams = dict(
    n_estimators=200,
    loss_function='RMSE',
    learning_rate=0.4,
    depth=3,
    task_type='CPU',
    random_state=1,
    verbose=False,
)

model = CatBoostRegressor(**hyperparams)
model.fit(training_set)

## Evaluation Metrics

In [8]:
# Predict on the held-out features, then let the helper report the metrics
# and hand back the four scores
test_predictions = model.predict(X_test)
r2, mse, rmse, mae = helper_funcs.display_regression_metrics(y_test, test_predictions)

All metrics are in terms of the unseen test set

R^2 = 0.9847196109507116
Mean Squared Error = 1.2379560848566173
Root Mean Squared Error = 1.1126347490783386
Mean Absolute Error = 0.8012319974786861


## Save Model

In [6]:
# Display the raw test-set predictions as the cell output
test_preds = model.predict(X_test)
test_preds

array([70.81158089, 78.29404813, 80.78959462, 80.26962486, 73.93476922,
       63.67415154, 71.8466496 , 71.25985343, 51.64176329, 74.45610898,
       71.99516364, 82.84468601, 56.15023769, 70.8573371 , 52.04080594,
       71.78157601, 75.0725807 , 70.59789084, 61.74532514, 61.3776564 ,
       60.11336112, 60.24011773, 73.377715  , 82.92160437, 69.38679405,
       72.73183656, 68.26970494, 74.60009078, 64.29687521, 82.311614  ,
       73.07588034, 71.30855086, 74.13910464, 53.54568786, 51.5608803 ,
       61.48513198, 65.51810014, 75.15695572, 80.58313146, 64.27630586,
       60.06306425, 74.99869713, 82.7745047 , 79.24267167, 63.32046295,
       52.65314703, 72.0465031 , 59.3139944 , 51.72530343, 76.15601089,
       65.43205607, 70.0660721 , 69.93864103, 68.23898865, 66.91501802,
       69.15391381, 48.39810639, 74.52332973, 69.25436714, 74.58316082,
       69.53330914, 77.43558769, 78.96463684, 77.33169897, 79.51108751,
       61.31700323, 83.00141044, 74.42489789, 69.62944097, 69.10

In [None]:
# Persist the fitted CatBoost regressor. The estimator in this notebook is bound to
# `model` (there is no `final_model`), and the file is named after the algorithm so it
# cannot be mistaken for, or overwrite, a saved XGBoost model.
# NOTE(review): assumes ./saved_models already exists — confirm before running.
joblib.dump(model, './saved_models/CatBoost.joblib')

In [None]:
# NOTE(review): this cell appears to be scratch code from a tutorial. `X` and `y` are not
# defined anywhere in the visible notebook, and the categorical columns below
# ('cut', 'color', 'clarity') are not the ones used for the life-expectancy model —
# confirm the intended dataset before running, or delete the cell.
# CatBoostRegressor, Pool and np come from the notebook's import cell.
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

demo_cat_features = ['cut', 'color', 'clarity']

cb = CatBoostRegressor(n_estimators=200,
                       loss_function='RMSE',
                       learning_rate=0.4,
                       depth=3, task_type='CPU',
                       random_state=1,
                       verbose=False)

# Demo-specific names so the notebook's own X_train / X_test / y_train / y_test
# (used by the main model and by later cells) are not silently overwritten.
X_demo_train, X_demo_test, y_demo_train, y_demo_test = train_test_split(
    X, y, test_size=0.20, random_state=1)

pool_train = Pool(X_demo_train, y_demo_train, cat_features=demo_cat_features)
pool_test = Pool(X_demo_test, cat_features=demo_cat_features)

cb.fit(pool_train)
y_pred = cb.predict(pool_test)

# The metric function keeps its full name so it does not rebind the `mse` value
# returned by the evaluation cell; the RMSE is already a scalar, so no averaging is needed.
cb_rmse = np.sqrt(mean_squared_error(y_demo_test, y_pred))
print("RMSE in y units:", cb_rmse)

In [None]:
# Fit a CatBoost regressor with default hyperparameters, keeping the best iteration
# as measured on the eval set.
# cat_features is passed explicitly because raw frames (not Pools) are given here; the
# columns match the ones declared categorical for the first model.
# NOTE(review): eval_set is the held-out test split, so use_best_model selects the
# iteration using test data — scores for this model on that split will be optimistic.
# A separate validation split would avoid this. This also rebinds `model`.
model = CatBoostRegressor().fit(
    X_train, y_train,
    cat_features=['Country', 'Status'],
    eval_set=(X_test, y_test),
    use_best_model=True,
)

In [None]:

          
# Candidate CatBoost settings: many iterations at a small learning rate, with the
# iteration-based overfitting detector enabled. Each key appears exactly once
# (the original literal listed 'use_best_model' twice).
params = {
    'use_best_model': True,   # NOTE(review): needs an eval set at fit time — confirm usage
    'iterations': 5000,
    'learning_rate': 0.001,
    'depth': 3,
    'loss_function': 'RMSE',
    'eval_metric': 'RMSE',
    'random_seed': 55,
    'metric_period': 50,      # calculate metrics once per 50 iterations
    'od_type': "Iter",        # overfitting detector type
    'od_wait': 20,            # iterations to continue after the best one before stopping
    'verbose': True,
}
