In [1]:
import pandas as pd
import numpy as np
from catboost import cv, CatBoost,Pool

import optuna

class CatboostTuner():
    def __init__(self,loss_function='RMSE',eval_metric='RMSE',time_budget=3600,feature_selection=True):
        self.loss_function=loss_function
        self.eval_metric=eval_metric
        self.time_budget=time_budget #for parameter tuning, additional time needed to fit final model
        self.feature_selection=feature_selection
        self.tuned_model=None
        self.final_cv=None
        self.selected_features=None
     
    
    ##tuning plan:
    ##First, tune sequentially
    ##next, remove features by eliminating them on basis of feature importance using optuna
    ##retune within neighborhood of sequential solution
    ##tune learning rate and number of trees
    ##calc final cv statistic
    ##fit final model

    def tune_rsm(self,X,y,w=None,rsm_lb=.1,rsm_ub=1,n_fold=3,learning_rate=.2,subsample=.8,random_strength=1,
        max_depth=4,l2_leaf_reg=3,grow_policy='SymmetricTree',n_trials=10,time_budget=600):
        early_rounds=round(1/learning_rate+5) #heuristic that seems to work well
        pool_data=Pool(data=X,label=y,weight=w)
        def catboost_cv_result(trial):
            params={
            'loss_function':self.loss_function,
            'eval_metric':self.eval_metric,
            'learning_rate':learning_rate,
            'bootstrap_type':'Bernoulli',
            'subsample':subsample,
            'random_strength':random_strength,
            'rsm':trial.suggest_uniform('rsm', rsm_lb,rsm_ub),
            'max_depth':max_depth,
            'grow_policy':grow_policy,
            'l2_leaf_reg':l2_leaf_reg

            }
            cv_out=cv(params=params,pool=pool_data,nfold=n_fold,early_stopping_rounds=early_rounds,
                    partition_random_seed=2021,verbose=False)
            #need to generalize this code
            out=np.min(cv_out["test-RMSE-mean"])
            return(out)
        study = optuna.create_study(direction='minimize')
        study.optimize(catboost_cv_result, n_trials=n_trials,timeout=time_budget)
        return([study.best_params['rsm'],study.best_value])

    def tune_depth(self,X,y,w=None,n_fold=3,learning_rate=.2,rsm=.8,subsample=.8,random_strength=1,
        max_depth_lb=2,max_depth_ub=7,l2_leaf_reg=3,grow_policy='SymmetricTree',n_trials=10,time_budget=600):
        early_rounds=round(1/learning_rate+5) #heuristic that seems to work well
        pool_data=Pool(data=X,label=y,weight=w)
        def catboost_cv_result(trial):
            params={
            'loss_function':self.loss_function,
            'eval_metric':self.eval_metric,
            'learning_rate':learning_rate,
            'bootstrap_type':'Bernoulli',
            'subsample':subsample,
            'random_strength':random_strength,
            'rsm':rsm,
            'max_depth':trial.suggest_int('max_depth',max_depth_lb,max_depth_ub),
            'grow_policy':grow_policy,
            'l2_leaf_reg':l2_leaf_reg

            }
            cv_out=cv(params=params,pool=pool_data,nfold=n_fold,early_stopping_rounds=early_rounds,
                    partition_random_seed=2021,verbose=False)
            #need to generalize this code
            out=np.min(cv_out["test-RMSE-mean"])
            return(out)
        study = optuna.create_study(direction='minimize')
        study.optimize(catboost_cv_result, n_trials=n_trials,timeout=time_budget)
        return([study.best_params['max_depth'],study.best_value])

    def tune_subsample(self,X,y,w=None,n_fold=3,learning_rate=.2,rsm=.8,subsample_lb=.1,subsample_ub=1,random_strength=1,
        max_depth=4,l2_leaf_reg=3,grow_policy='SymmetricTree',n_trials=10,time_budget=600):
        early_rounds=round(1/learning_rate+5) #heuristic that seems to work well
        pool_data=Pool(data=X,label=y,weight=w)
        def catboost_cv_result(trial):
            params={
            'loss_function':self.loss_function,
            'eval_metric':self.eval_metric,
            'learning_rate':learning_rate,
            'bootstrap_type':'Bernoulli',
            'subsample':trial.suggest_uniform('subsample',subsample_lb,subsample_ub),
            'random_strength':random_strength,
            'rsm':rsm,
            'max_depth':max_depth,
            'grow_policy':grow_policy,
            'l2_leaf_reg':l2_leaf_reg

            }
            cv_out=cv(params=params,pool=pool_data,nfold=n_fold,early_stopping_rounds=early_rounds,
                    partition_random_seed=2021,verbose=False)
            #need to generalize this code
            out=np.min(cv_out["test-RMSE-mean"])
            return(out)
        study = optuna.create_study(direction='minimize')
        study.optimize(catboost_cv_result, n_trials=n_trials,timeout=time_budget)
        return([study.best_params['subsample'],study.best_value])

    def tune_regularization(self,X,y,w=None,n_fold=3,learning_rate=.2,rsm=.8,subsample=.8,random_strength_lb=.05,random_strength_ub=3,
        max_depth=4,l2_leaf_reg_lb=.05,l2_leaf_reg_ub=5,grow_policy='SymmetricTree',n_trials=10,time_budget=600):
        early_rounds=round(1/learning_rate+5) #heuristic that seems to work well
        pool_data=Pool(data=X,label=y,weight=w)
        def catboost_cv_result(trial):
            params={
            'loss_function':self.loss_function,
            'eval_metric':self.eval_metric,
            'learning_rate':learning_rate,
            'bootstrap_type':'Bernoulli',
            'subsample':subsample,
            'random_strength':trial.suggest_uniform('random_strength',random_strength_lb,random_strength_ub),
            'rsm':rsm,
            'max_depth':max_depth,
            'grow_policy':grow_policy,
            'l2_leaf_reg':trial.suggest_uniform('l2_leaf_reg',l2_leaf_reg_lb,l2_leaf_reg_ub)

            }
            cv_out=cv(params=params,pool=pool_data,nfold=n_fold,early_stopping_rounds=early_rounds,
                    partition_random_seed=2021,verbose=False)
            #need to generalize this code
            out=np.min(cv_out["test-RMSE-mean"])
            return(out)
        study = optuna.create_study(direction='minimize')
        study.optimize(catboost_cv_result, n_trials=n_trials,timeout=time_budget)
        return([study.best_params['random_strength'],study.best_params['l2_leaf_reg'],study.best_value])
    
    def tune_grow_policy(self,X,y,w=None,n_fold=3,learning_rate=.2,rsm=.8,subsample=.8,random_strength=1,
        max_depth=4,l2_leaf_reg=3,n_trials=4,time_budget=600):
        early_rounds=round(1/learning_rate+5) #heuristic that seems to work well
        pool_data=Pool(data=X,label=y,weight=w)
        def catboost_cv_result(trial):
            params={
            'loss_function':self.loss_function,
            'eval_metric':self.eval_metric,
            'learning_rate':learning_rate,
            'bootstrap_type':'Bernoulli',
            'subsample':subsample,
            'random_strength':random_strength,
            'rsm':rsm,
            'max_depth':max_depth,
            'grow_policy':trial.suggest_categorical('grow_policy',['SymmetricTree','Lossguide','Depthwise']),
            'l2_leaf_reg':l2_leaf_reg

            }
            cv_out=cv(params=params,pool=pool_data,nfold=n_fold,early_stopping_rounds=early_rounds,
                    partition_random_seed=2021,verbose=False)
            #need to generalize this code
            out=np.min(cv_out["test-RMSE-mean"])
            return(out)
        study = optuna.create_study(direction='minimize')
        study.optimize(catboost_cv_result, n_trials=n_trials,timeout=time_budget)
        return([study.best_params['grow_policy'],study.best_value])

    def tune_iterations(self,X,y,w=None,n_fold=3,learning_rate=.25,rsm=.8,subsample=.8,random_strength=1,       grow_policy="SymmetricTree",
        max_depth=4,l2_leaf_reg=3,time_budget=600):
        early_rounds=round(1/learning_rate+5) #heuristic that seems to work well
        pool_data=Pool(data=X,label=y,weight=w)
        params={
            'loss_function':self.loss_function,
            'eval_metric':self.eval_metric,
            'learning_rate':learning_rate,
            'bootstrap_type':'Bernoulli',
            'subsample':subsample,
            'random_strength':random_strength,
            'rsm':rsm,
            'max_depth':max_depth,
            'grow_policy':grow_policy,
            'l2_leaf_reg':l2_leaf_reg,
            'iterations':4000

            }
        cv_out=cv(params=params,pool=pool_data,nfold=n_fold,early_stopping_rounds=early_rounds,
                    partition_random_seed=2021,verbose=False)

        min_score=np.min(cv_out["test-RMSE-mean"])
        iterations=np.max(cv_out["iterations"])
        return(iterations,min_score)

    def fine_tune_all(self,X,y,w=None,n_fold=3,learning_rate=.2,rsm=.8,subsample=.8,random_strength=1,grow_policy="SymmetricTree",
        max_depth=4,l2_leaf_reg=3,n_trials=30,time_budget=600):
        early_rounds=round(1/learning_rate+5) #heuristic that seems to work well
        pool_data=Pool(data=X,label=y,weight=w)

        rsm_lb=rsm-.1
        if rsm_lb<=0:
            rsm_lb=.01
        rsm_ub=rsm+.1
        if rsm_ub>1:
            rsm=1
        
        subsample_lb=subsample-.1
        if subsample_lb<=0:
            subsample=.01
        subsample_ub=subsample+.1
        if subsample_ub >1:
            subsample_ub=1
        
        random_strength_lb=random_strength-.5
        random_strength_ub=random_strength+.5
        if random_strength_lb <=0:
            random_strength_lb=.01
        
        max_depth_lb=max_depth-1
        max_depth_ub=max_depth+1

        if max_depth_lb <2:
            max_depth_lb=2
        
        l2_leaf_reg_lb=l2_leaf_reg-.5
        l2_leaf_reg_ub=l2_leaf_reg+.5

        if l2_leaf_reg_lb<=0:
            l2_leaf_reg_lb=0.01
        
        def catboost_cv_result(trial):
            params={
            'loss_function':self.loss_function,
            'eval_metric':self.eval_metric,
            'learning_rate':learning_rate,
            'bootstrap_type':'Bernoulli',
            'subsample':trial.suggest_uniform('subsample',subsample_lb,subsample_ub),
            'random_strength':trial.suggest_uniform('random_strength',random_strength_lb,random_strength_ub),
            'rsm':trial.suggest_uniform('rsm',rsm_lb,rsm_ub),
            'max_depth':trial.suggest_int('max_depth',max_depth_lb,max_depth_ub),
            'grow_policy':grow_policy,
            'l2_leaf_reg':trial.suggest_uniform('l2_leaf_reg',l2_leaf_reg_lb,l2_leaf_reg_ub)

            }
            cv_out=cv(params=params,pool=pool_data,nfold=n_fold,early_stopping_rounds=early_rounds,
                    partition_random_seed=2021,verbose=False)
            #need to generalize this code
            out=np.min(cv_out["test-RMSE-mean"])
            return(out)

        study = optuna.create_study(direction='minimize')
        study.optimize(catboost_cv_result, n_trials=n_trials,timeout=time_budget)
        return([study.best_params,study.best_value])
        
    def tune_feature_selection(self,X,y, model,w=None,n_fold=3,learning_rate=.2,rsm=.8,subsample=.8,random_strength=1,
              grow_policy="SymmetricTree", max_depth=4,l2_leaf_reg=3,n_trials=10,time_budget=600):
        ##use importance scores for feature selection
        early_rounds=round(1/learning_rate+5) #heuristic that seems to work well
        def catboost_cv_result(trial):
            params={
            'loss_function':self.loss_function,
            'eval_metric':self.eval_metric,
            'learning_rate':learning_rate,
            'bootstrap_type':'Bernoulli',
            'subsample':subsample,
            'random_strength':random_strength,
            'rsm':rsm,
            'max_depth':max_depth,
            'grow_policy':grow_policy,
            'l2_leaf_reg':l2_leaf_reg

            }

            threshold=trial.suggest_uniform('feat_import_threshold',0,10)
            
            X_subset=self.feature_selection_subsetter(X,model,threshold)
            
            pool_data=Pool(data=X_subset,label=y,weight=w)
            cv_out=cv(params=params,pool=pool_data,nfold=n_fold,early_stopping_rounds=early_rounds,
                    partition_random_seed=2021,verbose=False)
            #need to generalize this code
            out=np.min(cv_out["test-RMSE-mean"])
            return(out)
        study = optuna.create_study(direction='minimize')
        study.optimize(catboost_cv_result, n_trials=n_trials,timeout=time_budget)
        return([study.best_params['feat_import_threshold'],study.best_value])

    def feature_selection_subsetter(self,X,model,threshold):
        imp=model.get_feature_importance()
        var_list=[]
        for i in range(len(imp)):
            if imp[i]> threshold:
                var_list.append(X.columns[i])

        X_out=X[var_list]
        return(X_out)

    def predict(self,X):
        X=X[self.selected_features]
        pred=self.tuned_model.predict(X)
        return(pred)

    def run(self,X,y,w=None,final_learning_rate=.03,tuning_learning_rate=.15,n_fold=3):
        result1=self.tune_rsm(X,y,w,n_fold=n_fold,learning_rate=tuning_learning_rate)
        rsm=result1[0]

        result2= self.tune_grow_policy(X,y,w,rsm=rsm,n_fold=n_fold,learning_rate=tuning_learning_rate)
        grow_policy=result2[0]

        result3= self.tune_depth(X,y,w,rsm=rsm,grow_policy=grow_policy,n_fold=n_fold,learning_rate=tuning_learning_rate)
        max_depth=result3[0]

        result4=self.tune_iterations(X,y,w,learning_rate=final_learning_rate,rsm=rsm,max_depth=max_depth,grow_policy=grow_policy,
            n_fold=n_fold)
        iterations=result4[0]

        params1={
            'loss_function':self.loss_function,
            'eval_metric':self.eval_metric,
            'iterations':iterations,
            'learning_rate':final_learning_rate,
            'bootstrap_type':'Bernoulli',
            'rsm':rsm,
            'grow_policy':grow_policy,
            'max_depth':max_depth,
            'subsample':0.8,
            'random_strength':1,
            'l2_leaf_reg':3
        }
        model1=CatBoost(params=params1)
        model1.fit(X,y,w)

        

        result5=self.tune_feature_selection(X,y,model1,w,rsm=rsm,grow_policy=grow_policy,max_depth=max_depth,n_fold=3,learning_rate=tuning_learning_rate)

        feat_import_threshold=result5[0]

        X=self.feature_selection_subsetter(X,model1,feat_import_threshold)

        #retune first 3

        result1=self.tune_rsm(X,y,w,n_fold=n_fold,learning_rate=tuning_learning_rate)
        rsm=result1[0]

        result2= self.tune_grow_policy(X,y,w,rsm=rsm,n_fold=n_fold,learning_rate=tuning_learning_rate)
        grow_policy=result2[0]

        result3= self.tune_depth(X,y,w,rsm=rsm,grow_policy=grow_policy,n_fold=n_fold,learning_rate=tuning_learning_rate)
        max_depth=result3[0]

        result4=self.tune_subsample(X,y,w,rsm=rsm,max_depth=max_depth,grow_policy=grow_policy,n_fold=n_fold,learning_rate=tuning_learning_rate)
        subsample=result4[0]

        result5=self.tune_regularization(X,y,w,rsm=rsm,max_depth=max_depth,grow_policy=grow_policy,
                subsample=subsample,n_fold=n_fold,learning_rate=tuning_learning_rate)
        random_strength=result5[0]
        l2_leaf_reg=result5[1]

        result6=self.fine_tune_all(X,y,w,rsm=rsm,max_depth=max_depth,grow_policy=grow_policy,
                subsample=subsample,random_strength=random_strength,l2_leaf_reg=l2_leaf_reg,n_fold=n_fold,learning_rate=tuning_learning_rate)

        if result6[1]<result5[1]:
            rsm=result6[0]['rsm']
            max_depth=result6[0]['max_depth']
            subsample=result6[0]['subsample']
            grow_policy=result6[0]['grow_policy']
            random_strength=result6[0]['random_strength']
            l2_leaf_reg=result6[0]['l2_leaf_reg']


        result7=self.tune_iterations(X,y,w,learning_rate=final_learning_rate,rsm=rsm,max_depth=max_depth,grow_policy=grow_policy,
                subsample=subsample,random_strength=random_strength,l2_leaf_reg=l2_leaf_reg,n_fold=n_fold)
        iterations=result7[0]
        self.final_cv=result7[1]

        params={
            'loss_function':self.loss_function,
            'eval_metric':self.eval_metric,
            'iterations':iterations,
            'learning_rate':final_learning_rate,
            'bootstrap_type':'Bernoulli',
            'rsm':rsm,
            'grow_policy':grow_policy,
            'max_depth':max_depth,
            'subsample':subsample,
            'random_strength':random_strength,
            'l2_leaf_reg':l2_leaf_reg
        }
            
        self.selected_features=X.columns
        self.tuned_model=CatBoost(params=params)
        self.tuned_model.fit(X,y,w)
        
    


In [2]:
from sklearn import datasets

d_housing=datasets.load_boston()

X=pd.DataFrame(d_housing['data'])
y=d_housing['target']

In [3]:
tuned_catboost=CatboostTuner()
tuned_catboost.run(X=X,y=y,n_fold=5)

n: 1.6197076	total: 255ms	remaining: 78.5ms
881:	learn: 1.6197076	total: 255ms	remaining: 78.2ms
882:	learn: 1.6184281	total: 256ms	remaining: 77.9ms
883:	learn: 1.6171778	total: 256ms	remaining: 77.6ms
884:	learn: 1.6153482	total: 256ms	remaining: 77.3ms
885:	learn: 1.6145149	total: 257ms	remaining: 77.1ms
886:	learn: 1.6136427	total: 257ms	remaining: 76.8ms
887:	learn: 1.6136427	total: 257ms	remaining: 76.4ms
888:	learn: 1.6130829	total: 257ms	remaining: 76.1ms
889:	learn: 1.6123910	total: 258ms	remaining: 75.8ms
890:	learn: 1.6115628	total: 258ms	remaining: 75.5ms
891:	learn: 1.6110926	total: 258ms	remaining: 75.2ms
892:	learn: 1.6109907	total: 258ms	remaining: 74.9ms
893:	learn: 1.6094972	total: 258ms	remaining: 74.6ms
894:	learn: 1.6087805	total: 259ms	remaining: 74.3ms
895:	learn: 1.6077124	total: 259ms	remaining: 74ms
896:	learn: 1.6077124	total: 259ms	remaining: 73.7ms
897:	learn: 1.6077124	total: 259ms	remaining: 73.3ms
898:	learn: 1.6060469	total: 259ms	remaining: 73ms
899:	l

In [4]:
1-(tuned_catboost.final_cv**2/np.var(y))

0.8867787217756278

In [5]:
tuned_catboost.tuned_model.get_all_params()

{'nan_mode': 'Min',
 'eval_metric': 'RMSE',
 'iterations': 600,
 'sampling_frequency': 'PerTree',
 'leaf_estimation_method': 'Newton',
 'grow_policy': 'Depthwise',
 'penalties_coefficient': 1,
 'boosting_type': 'Plain',
 'model_shrink_mode': 'Constant',
 'feature_border_type': 'GreedyLogSum',
 'bayesian_matrix_reg': 0.10000000149011612,
 'l2_leaf_reg': 2.697526693344116,
 'random_strength': 2.2563984394073486,
 'rsm': 0.8434796333312988,
 'boost_from_average': True,
 'model_size_reg': 0.5,
 'pool_metainfo_options': {'tags': {}},
 'subsample': 0.9419885873794556,
 'use_best_model': False,
 'random_seed': 0,
 'depth': 5,
 'posterior_sampling': False,
 'border_count': 254,
 'classes_count': 0,
 'auto_class_weights': 'None',
 'sparse_features_conflict_fraction': 0,
 'leaf_estimation_backtracking': 'AnyImprovement',
 'best_model_min_trees': 1,
 'model_shrink_rate': 0,
 'min_data_in_leaf': 1,
 'loss_function': 'RMSE',
 'learning_rate': 0.029999999329447743,
 'score_function': 'Cosine',
 'tas

In [3]:
params1={
            'loss_function':'RMSE',
            'eval_metric':'RMSE',
            'iterations':500,
            'learning_rate':.2
        }
catmod_test=CatBoost(params1)
catmod_test.fit(X,y)

4ms	remaining: 568ms
117:	learn: 0.9102977	total: 174ms	remaining: 565ms
118:	learn: 0.9051737	total: 175ms	remaining: 562ms
119:	learn: 0.8970092	total: 177ms	remaining: 559ms
120:	learn: 0.8892976	total: 178ms	remaining: 556ms
121:	learn: 0.8804787	total: 179ms	remaining: 553ms
122:	learn: 0.8749897	total: 180ms	remaining: 550ms
123:	learn: 0.8710618	total: 181ms	remaining: 548ms
124:	learn: 0.8630404	total: 182ms	remaining: 545ms
125:	learn: 0.8567009	total: 183ms	remaining: 542ms
126:	learn: 0.8492526	total: 183ms	remaining: 539ms
127:	learn: 0.8427014	total: 184ms	remaining: 535ms
128:	learn: 0.8348195	total: 185ms	remaining: 532ms
129:	learn: 0.8303350	total: 186ms	remaining: 529ms
130:	learn: 0.8288563	total: 187ms	remaining: 526ms
131:	learn: 0.8196702	total: 188ms	remaining: 523ms
132:	learn: 0.8110906	total: 188ms	remaining: 520ms
133:	learn: 0.8045963	total: 189ms	remaining: 517ms
134:	learn: 0.8035185	total: 190ms	remaining: 515ms
135:	learn: 0.7968725	total: 192ms	remainin

<catboost.core.CatBoost at 0x7f8e822be970>

In [4]:
imp=catmod_test.get_feature_importance()

In [8]:
imp

KeyError: 0

In [7]:
np.std(y)

9.188011545278203

In [5]:
pool_data=Pool(data=X,label=y)
cv(pool_data)

CatBoostError: params should be set.