In [1]:
import gc
import os
import sys
import json

import catboost as cb
from sklearn.datasets import *

%load_ext autoreload
%autoreload 2

In [2]:
cb_clf_booster_params = dict(
                         iterations=None,
                         learning_rate=None,
                         depth=None,
                         l2_leaf_reg=None,
                         model_size_reg=None,
                         rsm=None,
                         loss_function='Logloss',
                         border_count=None,
                         feature_border_type=None,
                         old_permutation_block_size=None,
                         od_pval=None,
                         od_wait=None,
                         od_type=None,
                         nan_mode=None,
                         counter_calc_method=None,
                         leaf_estimation_iterations=None,
                         leaf_estimation_method=None,
                         thread_count=None,
                         random_seed=None,
                         use_best_model=None,
                         verbose=None,
                         logging_level=None,
                         metric_period=None,
                         ctr_leaf_count_limit=None,
                         store_all_simple_ctr=None,
                         max_ctr_complexity=None,
                         has_time=None,
                         allow_const_label=None,
                         classes_count=None,
                         class_weights=None,
                         one_hot_max_size=None,
                         random_strength=None,
                         name=None,
                         ignored_features=None,
                         train_dir=None,
                         custom_loss=None,
                         custom_metric=None,
                         eval_metric=None,
                         bagging_temperature=None,
                         save_snapshot=None,
                         snapshot_file=None,
                         snapshot_interval=None,
                         fold_len_multiplier=None,
                         used_ram_limit=None,
                         gpu_ram_part=None,
                         allow_writing_files=None,
                         final_ctr_computation_mode=None,
                         approx_on_full_history=None,
                         boosting_type=None,
                         simple_ctr=None,
                         combinations_ctr=None,
                         per_feature_ctr=None,
                         task_type=None,
                         device_config=None,
                         devices=None,
                         bootstrap_type=None,
                         subsample=None,
                         max_depth=None,
                         n_estimators=None,
                         num_boost_round=None,
                         num_trees=None,
                         colsample_bylevel=None,
                         random_state=None,
                         reg_lambda=None,
                         objective=None,
                         eta=None,
                         max_bin=None,
                         scale_pos_weight=None,
                         gpu_cat_features_storage=None,
                         data_partition=None,
                         metadata=None, 
                         early_stopping_rounds=None,
                         cat_features=None)

In [3]:
cb_reg_booster_params = dict(
                        iterations=None,
                        learning_rate=None,
                        depth=None,
                        l2_leaf_reg=None,
                        model_size_reg=None,
                        rsm=None,
                        loss_function='RMSE',
                        border_count=None,
                        feature_border_type=None,
                        fold_permutation_block_size=None,
                        od_pval=None,
                        od_wait=None,
                        od_type=None,
                        nan_mode=None,
                        counter_calc_method=None,
                        leaf_estimation_iterations=None,
                        leaf_estimation_method=None,
                        thread_count=None,
                        random_seed=None,
                        use_best_model=None,
                        verbose=None,
                        logging_level=None,
                        metric_period=None,
                        ctr_leaf_count_limit=None,
                        store_all_simple_ctr=None,
                        max_ctr_complexity=None,
                        has_time=None,
                        allow_const_label=None, 
                        one_hot_max_size=None,
                        random_strength=None,
                        name=None,
                        ignored_features=None,
                        train_dir=None,
                        custom_metric=None,
                        eval_metric=None,
                        bagging_temperature=None,
                        save_snapshot=None,
                        snapshot_file=None,
                        snapshot_interval=None,
                        fold_len_multiplier=None,
                        used_ram_limit=None,
                        gpu_ram_part=None,
                        allow_writing_files=None,
                        final_ctr_computation_mode=None,
                        approx_on_full_history=None,
                        boosting_type=None,
                        simple_ctr=None,
                        combinations_ctr=None,
                        per_feature_ctr=None,
                        task_type=None,
                        device_config=None,
                        devices=None,
                        bootstrap_type=None,
                        subsample=None,
                        max_depth=None,
                        n_estimators=None,
                        num_boost_round=None,
                        num_trees=None,
                        colsample_bylevel=None,
                        random_state=None,
                        reg_lambda=None,
                        objective=None,
                        eta=None,
                        max_bin=None,
                        gpu_cat_features_storage=None,
                        data_partition=None,
                        metadata=None,
                        early_stopping_rounds=None,
                        cat_features=None) 

In [5]:
cb_default_booster_params = {
    'regression':cb_reg_booster_params,
    'classification':cb_clf_booster_params
}

In [8]:
with open('./config/catboost_default_booster_params.json', 'a') as file:
    json.dump(cb_default_booster_params, file)

In [None]:
cb_default_eval_metrics = [
    'RMSE',
    'Logloss',
    'MAE',
    'CrossEntropy',
    'Quantile',
    'LogLinQuantile',
    'Lq',
    'MultiClass',
    'MultiClassOneVsAll',
    'MAPE',
    'Poisson',
    'PairLogit',
    'PairLogitPairwise',
    'QueryRMSE',
    'QuerySoftMax',
    'SMAPE',
    'Recall',
    'Precision',
    'F1',
    'TotalF1',
    'Accuracy',
    'BalancedAccuracy',
    'BalancedErrorRate',
    'Kappa',
    'WKappa',
    'LogLikelihoodOfPrediction',
    'AUC',
    'R2',
    'NumErrors',
    'MCC',
    'BrierScore',
    'HingeLoss',
    'HammingLoss',
    'ZeroOneLoss',
    'MSLE',
    'MedianAbsoluteError',
    'PairAccuracy',
    'AverageGain',
    'PFound',
    'NDCG',
    'PrecisionAt',
    'RecallAt',
    'MAP'
]

In [15]:
cb_loss_functions_dict = {
    'regression':['RMSE', 'MAE', 'MAPE', 'Quantile','QueryRMSE','Poisson',] , 
    'classification':['Logloss', 'CrossEntropy', 'MultiClass','MultiClassOneVsAll',],
    'others':['LogLinQuantile','Lq','PairLogit','PairLogitPairwise','QuerySoftMax','YetiRank','YetiRankPairwise']
}

In [5]:
def find_common_keys(dict1, dict2):
    dict1_keys = dict1.keys()
    dict2_keys = dict2.keys()
    
    common_keys = []
    if len(dict1_keys) > len(dict2_keys):
        smaller = list(dict2_keys)
        larger = list(dict1_keys)
    else:
        smaller = list(dict1_keys)
        larger = list(dict2_keys)
        
    for _key in larger:
        if _key in smaller:
            common_keys.append(_key)
    smaller_unique = [ele for ele in smaller if ele not in common_keys]
    larger_unique = [ele for ele in larger if ele not in common_keys]
            
    return common_keys, larger_unique, smaller_unique

In [6]:
%time

common_keys, larger, smaller = find_common_keys(cb_clf_booster_params, cb_reg_booster_params)
print('CLF keys: ', len(cb_clf_booster_params), 'REG keys: ', len(cb_reg_booster_params), 'Common keys:', len(common_keys))

Wall time: 0 ns
CLF keys:  73 REG keys:  69 Common keys: 68


In [7]:
print(larger)

['old_permutation_block_size', 'class_weights', 'classes_count', 'scale_pos_weight', 'custom_loss']


In [8]:
print(smaller)

['fold_permutation_block_size']


In [9]:
cb_booster_params = {
    'general_params':common_keys,
    'clf':larger,
    'reg':smaller,
}

print(cb_booster_params)

{'reg': ['fold_permutation_block_size'], 'clf': ['old_permutation_block_size', 'class_weights', 'classes_count', 'scale_pos_weight', 'custom_loss'], 'general_params': ['eval_metric', 'eta', 'gpu_cat_features_storage', 'has_time', 'max_bin', 'num_boost_round', 'combinations_ctr', 'metadata', 'fold_len_multiplier', 'gpu_ram_part', 'simple_ctr', 'metric_period', 'snapshot_file', 'approx_on_full_history', 'l2_leaf_reg', 'border_count', 'bootstrap_type', 'depth', 'ignored_features', 'use_best_model', 'logging_level', 'objective', 'bagging_temperature', 'store_all_simple_ctr', 'devices', 'od_wait', 'snapshot_interval', 'task_type', 'leaf_estimation_iterations', 'ctr_leaf_count_limit', 'custom_metric', 'one_hot_max_size', 'device_config', 'max_ctr_complexity', 'used_ram_limit', 'model_size_reg', 'nan_mode', 'od_pval', 'data_partition', 'subsample', 'loss_function', 'counter_calc_method', 'allow_const_label', 'leaf_estimation_method', 'train_dir', 'boosting_type', 'feature_border_type', 'max_d

In [20]:
cb_loss_functions_dict

{'classification': ['Logloss',
  'CrossEntropy',
  'MultiClass',
  'MultiClassOneVsAll'],
 'others': ['LogLinQuantile',
  'Lq',
  'PairLogit',
  'PairLogitPairwise',
  'QuerySoftMax',
  'YetiRank',
  'YetiRankPairwise'],
 'regression': ['RMSE', 'MAE', 'MAPE', 'Quantile', 'QueryRMSE', 'Poisson']}

TypeError: 'list' object is not a mapping

In [22]:
cb_booster_params

{'clf': ['old_permutation_block_size',
  'class_weights',
  'classes_count',
  'scale_pos_weight',
  'custom_loss'],
 'general_params': ['eval_metric',
  'eta',
  'gpu_cat_features_storage',
  'has_time',
  'max_bin',
  'num_boost_round',
  'combinations_ctr',
  'metadata',
  'fold_len_multiplier',
  'gpu_ram_part',
  'simple_ctr',
  'metric_period',
  'snapshot_file',
  'approx_on_full_history',
  'l2_leaf_reg',
  'border_count',
  'bootstrap_type',
  'depth',
  'ignored_features',
  'use_best_model',
  'logging_level',
  'objective',
  'bagging_temperature',
  'store_all_simple_ctr',
  'devices',
  'od_wait',
  'snapshot_interval',
  'task_type',
  'leaf_estimation_iterations',
  'ctr_leaf_count_limit',
  'custom_metric',
  'one_hot_max_size',
  'device_config',
  'max_ctr_complexity',
  'used_ram_limit',
  'model_size_reg',
  'nan_mode',
  'od_pval',
  'data_partition',
  'subsample',
  'loss_function',
  'counter_calc_method',
  'allow_const_label',
  'leaf_estimation_method',
  't

In [29]:
def merge_two_dict(dict1, dict2):
    output_dict = dict1.copy()
    output_dict.update(dict2)
    
    return output_dict

In [31]:
p = {'a':1}
q = {'b':2}

print(merge_two_dict(p, q), p, q)

{'b': 2, 'a': 1} {'a': 1} {'b': 2}
