## StructureBoost Quick Config - Regression
### Predicting temperature in California counties

This is intended to show the quickest way to configure and run StructureBoost for a regression problem

## Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split
import copy

import structureboost as stb

pd.set_option("display.max_rows",999)
pd.set_option("display.max_columns",999)

## Load and Process Data

In [2]:
df_ca_TEMP = pd.read_csv('data/CA_County_TEMP.csv')

In [3]:
df_ca_TEMP.sample(5)

Unnamed: 0,county,month,DATE,STATION,TMAX,TMIN
402334,Yuba,9,2018-09-18,USR0000CPIK,75.0,56.0
259509,San_Diego,6,2012-06-23,USC00046657,75.0,55.0
98597,Inyo,1,2017-01-22,USR0000COWV,35.0,31.0
63715,Fresno,3,2000-03-12,USC00044176,58.0,25.0
5699,Alameda,8,2015-08-09,USC00046144,74.0,59.0


In [4]:
## We'll use just two features, county and month
## Target is maximum temperature
X = df_ca_TEMP.loc[:,['county','month']]
y = df_ca_TEMP.TMAX.values

In [5]:
X_trva, X_test, y_trva, y_test = train_test_split(X,y, test_size = 70000)
X_train_big, X_valid, y_train_big, y_valid = train_test_split(X_trva, y_trva, test_size = 10000)

In [6]:
num_train_pts = 1000
X_train = X_train_big.iloc[:num_train_pts,:]
y_train = y_train_big[:num_train_pts]

In [7]:
default_configs = stb.default_config_dict()
default_configs

{'default_categorical_method': 'span_tree',
 'default_num_span_trees': 1,
 'default_contraction_size': 9,
 'default_contraction_max_splits_to_search': 25,
 'default_numerical_max_splits_to_search': 25}

In [8]:
feature_configs_default = stb.get_basic_config(X_train, default_configs)
feature_configs_default

{'county': {'feature_type': 'categorical_str',
  'graph': <graphs.graph_undirected at 0x13ae61128>,
  'split_method': 'span_tree',
  'num_span_trees': 1},
 'month': {'feature_type': 'numerical', 'max_splits_to_search': 25}}

In [9]:
stboost_CA_def = stb.StructureBoost(num_trees = 1500,
                                    learning_rate=.02,
                                    feature_configs=feature_configs_default, 
                                    max_depth=2,
                                    mode='regression')

In [10]:
stboost_CA_def.fit(X_train, y_train, 
                   eval_set = ((X_valid, y_valid)), 
                   early_stop_past_steps=1, 
                   eval_freq=20)

i=0, eval_set_mse = 291.9643824
i=20, eval_set_mse = 221.36102314889334
i=40, eval_set_mse = 187.21059135488235
i=60, eval_set_mse = 167.8191487064826
i=80, eval_set_mse = 157.44493691577173
i=100, eval_set_mse = 150.01541813114002
i=120, eval_set_mse = 144.5611042452211
i=140, eval_set_mse = 140.56406351436848
i=160, eval_set_mse = 137.59020465039617
i=180, eval_set_mse = 134.49005639783906
i=200, eval_set_mse = 132.14363351430234
i=220, eval_set_mse = 129.76196444124523
i=240, eval_set_mse = 127.67767570604296
i=260, eval_set_mse = 125.88280020775667
i=280, eval_set_mse = 124.46554322786962
i=300, eval_set_mse = 123.24169600007977
i=320, eval_set_mse = 122.2097928249228
i=340, eval_set_mse = 121.3868512308369
i=360, eval_set_mse = 120.61841917704751
i=380, eval_set_mse = 119.73485692328005
i=400, eval_set_mse = 119.03682463822274
i=420, eval_set_mse = 118.50644001670842
i=440, eval_set_mse = 118.02703783973125
i=460, eval_set_mse = 117.4535599590854
i=480, eval_set_mse = 116.88555014

In [11]:
pred_temps_stb_def = stboost_CA_def.predict(X_test)

In [12]:
stb_default_loss = mean_squared_error(y_test, pred_temps_stb_def)
stb_default_loss

117.00744247497322

## Incorporate structure of "County" feature
Right now, the county feature is using a *complete* graph, where every vertex is adjacent to every other vertex.  This represents the situation where there is essentially no structure in the different values.


In [13]:
# 1653 edges means that every vertex is adjacent to every other one
default_graph = feature_configs_default['county']['graph']
len(default_graph.vertices), len(default_graph.edges), 58*57/2

(57, 1596, 1653.0)

In [14]:
# If you look at the edges, you will see there is an edge for every pair of counties
# whether they are adjacent or not
default_graph.edges

{frozenset({'Mono', 'Orange'}),
 frozenset({'Alameda', 'Marin'}),
 frozenset({'Calaveras', 'Sacramento'}),
 frozenset({'Inyo', 'Marin'}),
 frozenset({'Kings', 'Modoc'}),
 frozenset({'Colusa', 'Del_Norte'}),
 frozenset({'Alpine', 'Colusa'}),
 frozenset({'Yolo', 'Yuba'}),
 frozenset({'Glenn', 'Mariposa'}),
 frozenset({'Kern', 'San_Bernardino'}),
 frozenset({'Lassen', 'Sonoma'}),
 frozenset({'Del_Norte', 'Santa_Cruz'}),
 frozenset({'Kings', 'Sierra'}),
 frozenset({'Contra_Costa', 'Riverside'}),
 frozenset({'Napa', 'San_Francisco'}),
 frozenset({'Kings', 'Madera'}),
 frozenset({'Contra_Costa', 'Humboldt'}),
 frozenset({'San_Bernardino', 'Santa_Clara'}),
 frozenset({'Glenn', 'San_Joaquin'}),
 frozenset({'Stanislaus', 'Trinity'}),
 frozenset({'Santa_Clara', 'Siskiyou'}),
 frozenset({'Amador', 'Fresno'}),
 frozenset({'Contra_Costa', 'Kern'}),
 frozenset({'El_Dorado', 'Los_Angeles'}),
 frozenset({'Amador', 'Modoc'}),
 frozenset({'San_Francisco', 'San_Joaquin'}),
 frozenset({'Lake', 'San_Luis_O

Let's make a new config that uses the actual CA County adjacency graph. This graph is included in the package, so we don't have to build it from scratch

In [15]:
county_graph = stb.graphs.CA_county_graph()
len(county_graph.vertices), len(county_graph.edges)

(58, 133)

In [16]:
# This graph only has edges between adjacent counties
county_graph.edges

{frozenset({'San_Joaquin', 'Stanislaus'}),
 frozenset({'Monterey', 'Santa_Clara'}),
 frozenset({'Del_Norte', 'Humboldt'}),
 frozenset({'Amador', 'El_Dorado'}),
 frozenset({'Humboldt', 'Trinity'}),
 frozenset({'Santa_Clara', 'Stanislaus'}),
 frozenset({'Lassen', 'Modoc'}),
 frozenset({'Alpine', 'El_Dorado'}),
 frozenset({'Inyo', 'Tulare'}),
 frozenset({'Modoc', 'Siskiyou'}),
 frozenset({'Amador', 'San_Joaquin'}),
 frozenset({'Shasta', 'Trinity'}),
 frozenset({'Alameda', 'Contra_Costa'}),
 frozenset({'Merced', 'San_Benito'}),
 frozenset({'Monterey', 'San_Benito'}),
 frozenset({'Madera', 'Mono'}),
 frozenset({'Madera', 'Merced'}),
 frozenset({'Lake', 'Mendocino'}),
 frozenset({'Colusa', 'Lake'}),
 frozenset({'Mendocino', 'Sonoma'}),
 frozenset({'Alameda', 'Santa_Clara'}),
 frozenset({'Butte', 'Colusa'}),
 frozenset({'San_Francisco', 'San_Mateo'}),
 frozenset({'Kern', 'San_Bernardino'}),
 frozenset({'Kern', 'Ventura'}),
 frozenset({'Placer', 'Sutter'}),
 frozenset({'Merced', 'Stanislaus'})

In [17]:
# Copy the default configuration, and replace 
# the complete graph (created by the 'get_default_config')
# with the actual CA county graph
feature_configs_1 = copy.deepcopy(feature_configs_default)
feature_configs_1['county']['graph'] = county_graph

In [18]:
feature_configs_1

{'county': {'feature_type': 'categorical_str',
  'graph': <graphs.graph_undirected at 0x13c040630>,
  'split_method': 'span_tree',
  'num_span_trees': 1},
 'month': {'feature_type': 'numerical', 'max_splits_to_search': 25}}

### Run the model with the new config

In [19]:
stboost_CA_1 = stb.StructureBoost(num_trees = 1500,
                                  learning_rate=.02,
                                  feature_configs=feature_configs_1, 
                                  max_depth=2,
                                  mode='regression')

In [20]:
stboost_CA_1.fit(X_train, y_train, 
                   eval_set = ((X_valid, y_valid)), 
                   early_stop_past_steps=1, 
                   eval_freq=20)

i=0, eval_set_mse = 291.9643824
i=20, eval_set_mse = 220.15898109776307
i=40, eval_set_mse = 184.67862275668693
i=60, eval_set_mse = 165.30616854069547
i=80, eval_set_mse = 154.08689787243054
i=100, eval_set_mse = 145.48027990094818
i=120, eval_set_mse = 139.27737134666012
i=140, eval_set_mse = 134.17118524365796
i=160, eval_set_mse = 130.63247299119612
i=180, eval_set_mse = 127.32135415698991
i=200, eval_set_mse = 125.10763185275837
i=220, eval_set_mse = 122.84328726067389
i=240, eval_set_mse = 121.03363545467323
i=260, eval_set_mse = 119.83515401251707
i=280, eval_set_mse = 118.94070940079591
i=300, eval_set_mse = 117.96560569835839
i=320, eval_set_mse = 117.12838805264649
i=340, eval_set_mse = 116.6082858473403
i=360, eval_set_mse = 115.89002451213835
i=380, eval_set_mse = 115.34403095331646
i=400, eval_set_mse = 114.92860510833172
i=420, eval_set_mse = 114.36597957827026
i=440, eval_set_mse = 113.92185373847111
i=460, eval_set_mse = 113.35474018501353
i=480, eval_set_mse = 112.9530

In [21]:
pred_temps_1 = stboost_CA_1.predict(X_test)

In [22]:
stb_1_loss = mean_squared_error(y_test, pred_temps_1)
stb_1_loss

114.44594010796209

### Incorporating structure in the 'month' feature
Previously we treated the month as a numerical variable.  However, doing so ignores
the fact that December is "adjacent" to January in the same way that "July" is
adjacent to "August".  To more accurately model the structure, we will represent the months
by a "cycle" graph

In [23]:
month_graph = stb.graphs.cycle_int_graph(1,12)
month_graph.vertices, month_graph.edges

({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12},
 {frozenset({3, 4}),
  frozenset({2, 3}),
  frozenset({11, 12}),
  frozenset({9, 10}),
  frozenset({1, 2}),
  frozenset({4, 5}),
  frozenset({6, 7}),
  frozenset({8, 9}),
  frozenset({7, 8}),
  frozenset({1, 12}),
  frozenset({5, 6}),
  frozenset({10, 11})})

In [24]:
# Copy the previous config, and change the settings for month
feature_configs_2 = copy.deepcopy(feature_configs_1)
feature_configs_2['month']['feature_type'] = 'categorical_int'
feature_configs_2['month']['graph'] = month_graph
feature_configs_2

{'county': {'feature_type': 'categorical_str',
  'graph': <graphs.graph_undirected at 0x13e5986a0>,
  'split_method': 'span_tree',
  'num_span_trees': 1},
 'month': {'feature_type': 'categorical_int',
  'max_splits_to_search': 25,
  'graph': <graphs.graph_undirected at 0x13e57cb70>}}

In [25]:
default_configs

{'default_categorical_method': 'span_tree',
 'default_num_span_trees': 1,
 'default_contraction_size': 9,
 'default_contraction_max_splits_to_search': 25,
 'default_numerical_max_splits_to_search': 25}

In [26]:
feature_configs_2 = stb.apply_defaults(feature_configs_2, default_configs)
feature_configs_2
    

{'county': {'feature_type': 'categorical_str',
  'graph': <graphs.graph_undirected at 0x13e5986a0>,
  'split_method': 'span_tree',
  'num_span_trees': 1},
 'month': {'feature_type': 'categorical_int',
  'max_splits_to_search': 25,
  'graph': <graphs.graph_undirected at 0x13e57cb70>,
  'split_method': 'span_tree',
  'num_span_trees': 1}}

In [27]:
stboost_CA_2 = stb.StructureBoost(num_trees = 3000,
                                  learning_rate=.02,
                                  feature_configs=feature_configs_2, 
                                  max_depth=2,
                                  mode='regression')

In [28]:
stboost_CA_2.fit(X_train, y_train, 
                   eval_set = ((X_valid, y_valid)), 
                   early_stop_past_steps=1, 
                   eval_freq=20)

i=0, eval_set_mse = 291.9643824
i=20, eval_set_mse = 215.0554324972405
i=40, eval_set_mse = 176.80553454561908
i=60, eval_set_mse = 155.55344430264452
i=80, eval_set_mse = 142.90027984302165
i=100, eval_set_mse = 135.30335601072437
i=120, eval_set_mse = 129.71609560027892
i=140, eval_set_mse = 125.53395105836577
i=160, eval_set_mse = 122.68806784680714
i=180, eval_set_mse = 120.48878336727103
i=200, eval_set_mse = 118.46600196887914
i=220, eval_set_mse = 117.05366995912703
i=240, eval_set_mse = 115.98756408288789
i=260, eval_set_mse = 114.89135857519722
i=280, eval_set_mse = 114.12126403001677
i=300, eval_set_mse = 113.34698432870901
i=320, eval_set_mse = 112.75050972874793
i=340, eval_set_mse = 112.1272256475911
i=360, eval_set_mse = 111.7052243785307
i=380, eval_set_mse = 111.2036189505226
i=400, eval_set_mse = 110.83189274535503
i=420, eval_set_mse = 110.51194700224207
i=440, eval_set_mse = 110.25751409735511
i=460, eval_set_mse = 109.94870382107071
i=480, eval_set_mse = 109.6044805

In [29]:
pred_temps_2 = stboost_CA_2.predict(X_test)

In [30]:
stb_2_loss = mean_squared_error(y_test, pred_temps_2)
stb_2_loss

111.40926185145774

In [31]:
import catboost as cb

In [32]:
cat_features = ['county']
cb1 = cb.CatBoostRegressor(iterations=1500, early_stopping_rounds=20, max_depth=2,
                             learning_rate=.02, cat_features=cat_features)

In [33]:
cb1.fit(X_train, y_train, cat_features, eval_set=(X_valid, y_valid), verbose=25)

0:	learn: 17.0817608	test: 16.9547919	best: 16.9547919 (0)	total: 62.2ms	remaining: 1m 33s
25:	learn: 14.6130140	test: 14.5918007	best: 14.5918007 (25)	total: 85ms	remaining: 4.82s
50:	learn: 13.4022028	test: 13.4441136	best: 13.4441136 (50)	total: 100ms	remaining: 2.85s
75:	learn: 12.8505275	test: 12.8785999	best: 12.8785999 (75)	total: 123ms	remaining: 2.31s
100:	learn: 12.4971583	test: 12.4718042	best: 12.4718042 (100)	total: 145ms	remaining: 2s
125:	learn: 12.2689888	test: 12.1945630	best: 12.1945630 (125)	total: 160ms	remaining: 1.74s
150:	learn: 12.0827959	test: 12.0024993	best: 12.0024993 (150)	total: 173ms	remaining: 1.54s
175:	learn: 11.9424591	test: 11.8394598	best: 11.8394598 (175)	total: 186ms	remaining: 1.4s
200:	learn: 11.8266657	test: 11.7045706	best: 11.7045706 (200)	total: 199ms	remaining: 1.29s
225:	learn: 11.7351143	test: 11.5877438	best: 11.5877438 (225)	total: 212ms	remaining: 1.19s
250:	learn: 11.6703263	test: 11.5176730	best: 11.5176730 (250)	total: 224ms	remaini

<catboost.core.CatBoostRegressor at 0x13c040860>

In [34]:
pred_temps_cb1 = cb1.predict(X_test)

In [35]:
cb1_loss = mean_squared_error(y_test, pred_temps_cb1)
cb1_loss

123.57992707907233

In [36]:
cat_features_2 = ['county', 'month']
cb2 = cb.CatBoostRegressor(iterations=1500, early_stopping_rounds=20, max_depth=2,
                             learning_rate=.02, cat_features=cat_features_2)

In [37]:
cb2.fit(X_train, y_train, cat_features_2, eval_set=(X_valid, y_valid), verbose=25)

0:	learn: 17.0701475	test: 16.9393865	best: 16.9393865 (0)	total: 3.32ms	remaining: 4.97s
25:	learn: 14.5092421	test: 14.3271130	best: 14.3271130 (25)	total: 25.7ms	remaining: 1.46s
50:	learn: 13.3047552	test: 13.0894651	best: 13.0894651 (50)	total: 39.4ms	remaining: 1.12s
75:	learn: 12.7074200	test: 12.4671838	best: 12.4671838 (75)	total: 52.1ms	remaining: 976ms
100:	learn: 12.3903027	test: 12.1257301	best: 12.1257301 (100)	total: 67.2ms	remaining: 930ms
125:	learn: 12.1948230	test: 11.8940529	best: 11.8940529 (125)	total: 95.1ms	remaining: 1.04s
150:	learn: 12.0650839	test: 11.7097287	best: 11.7097287 (150)	total: 113ms	remaining: 1.01s
175:	learn: 11.9620604	test: 11.5780592	best: 11.5780592 (175)	total: 126ms	remaining: 947ms
200:	learn: 11.8963707	test: 11.4916042	best: 11.4916042 (200)	total: 138ms	remaining: 893ms
225:	learn: 11.8511644	test: 11.4305501	best: 11.4305501 (225)	total: 151ms	remaining: 851ms
250:	learn: 11.7978057	test: 11.3774813	best: 11.3774813 (250)	total: 164m

<catboost.core.CatBoostRegressor at 0x142c440b8>

In [38]:
pred_temps_cb2 = cb2.predict(X_test)

In [39]:
cb2_loss = mean_squared_error(y_test, pred_temps_cb2)
cb2_loss

125.79487866126954

### Summary of results

In [40]:
print('StructureBoost (w no Structure):             loss = {}'.format(np.round(stb_default_loss, 5)))
print('StructureBoost w County Structure:           loss = {}'.format(np.round(stb_1_loss,5)))
print('StructureBoost w County+Month Structure:     loss = {}'.format(np.round(stb_2_loss,5)))
print('CatBoost:  (County categorical)              loss = {}'.format(np.round(cb1_loss,5)))
print('CatBoost:  (County, Month categorical)       loss = {}'.format(np.round(cb2_loss,5)))

StructureBoost (w no Structure):             loss = 117.00744
StructureBoost w County Structure:           loss = 114.44594
StructureBoost w County+Month Structure:     loss = 111.40926
CatBoost:  (County categorical)              loss = 123.57993
CatBoost:  (County, Month categorical)       loss = 125.79488


## Repeat multiple trials to show results aren't noise
In case you think the above results are just noise, let's run multiple trials and compare.

In [41]:
num_train_pts = 1000
num_trials = 10
stb_def_loss_vec = np.zeros(num_trials)
stb_1_loss_vec = np.zeros(num_trials)
stb_2_loss_vec = np.zeros(num_trials)
cb1_loss_vec = np.zeros(num_trials)
cb2_loss_vec = np.zeros(num_trials)
for i in range(num_trials):
    X_trva, X_test, y_trva, y_test = train_test_split(X,y, test_size = 70000, random_state=i)
    X_train_big, X_valid, y_train_big, y_valid = train_test_split(X_trva, y_trva, 
                                                                  test_size = 10000, random_state=i)
    X_train = X_train_big.iloc[:num_train_pts,:]
    y_train = y_train_big[:num_train_pts]
    

    stboost_CA_def = stb.StructureBoost(num_trees = 2500,
                                        learning_rate=.02,
                                        feature_configs=feature_configs_default, 
                                        max_depth=2,
                                        mode='regression',
                                        random_seed=i)
    stboost_CA_def.fit(X_train, y_train, 
                       eval_set = ((X_valid, y_valid)), 
                       early_stop_past_steps=1, 
                       eval_freq=20)

    pred_temps_def = stboost_CA_def.predict(X_test)
    stb_def_loss_vec[i] = mean_squared_error(y_test, pred_temps_def)

    stboost_CA_1 = stb.StructureBoost(num_trees = 2500,
                                      learning_rate=.02,
                                      feature_configs=feature_configs_1, 
                                      max_depth=2,
                                      mode='regression',
                                        random_seed=i)
    stboost_CA_1.fit(X_train, y_train, 
                       eval_set = ((X_valid, y_valid)), 
                       early_stop_past_steps=1, 
                       eval_freq=20)
    pred_temps_1 = stboost_CA_1.predict(X_test)
    stb_1_loss_vec[i] = mean_squared_error(y_test, pred_temps_1)
        

    stboost_CA_2 = stb.StructureBoost(num_trees = 2500,
                                      learning_rate=.02,
                                      feature_configs=feature_configs_2, 
                                      max_depth=2,
                                      mode='regression',
                                        random_seed=i)
    stboost_CA_2.fit(X_train, y_train, 
                       eval_set = ((X_valid, y_valid)), 
                       early_stop_past_steps=1, 
                       eval_freq=20)
    pred_temps_2 = stboost_CA_2.predict(X_test)
    stb_2_loss_vec[i] = mean_squared_error(y_test, pred_temps_2)

    
    cat_features_1 = ['county']
    cb1 = cb.CatBoostRegressor(iterations=2500, early_stopping_rounds=20,
                                max_depth=2,learning_rate=.02,
                                cat_features=cat_features_1, random_seed=i)
    cb1.fit(X_train, y_train, cat_features_1, eval_set=(X_valid, y_valid), verbose=25)
    pred_temps_cb1 = cb1.predict(X_test)
    cb1_loss_vec = mean_squared_error(y_test, pred_temps_cb1)
    
    
    cat_features_2 = ['county', 'month']
    cb2 = cb.CatBoostRegressor(iterations=2500, early_stopping_rounds=20,
                                max_depth=2,learning_rate=.02,
                                cat_features=cat_features_2, random_seed=i)

    cb2.fit(X_train, y_train, cat_features_2, eval_set=(X_valid, y_valid), verbose=25)
    pred_temps_cb2 = cb2.predict(X_test)
    cb2_loss_vec[i] = mean_squared_error(y_test, pred_temps_cb2)
    

i=0, eval_set_mse = 287.5330526
i=20, eval_set_mse = 221.36355309372655
i=40, eval_set_mse = 187.15124246730008
i=60, eval_set_mse = 168.70139904081816
i=80, eval_set_mse = 158.13729689864496
i=100, eval_set_mse = 151.73155900708042
i=120, eval_set_mse = 146.11520494920347
i=140, eval_set_mse = 142.7596311263404
i=160, eval_set_mse = 139.7020594934463
i=180, eval_set_mse = 137.29698832615915
i=200, eval_set_mse = 134.97572794952603
i=220, eval_set_mse = 132.90427455646326
i=240, eval_set_mse = 131.0483522424845
i=260, eval_set_mse = 129.72632039405877
i=280, eval_set_mse = 128.36498732056234
i=300, eval_set_mse = 127.34410542839761
i=320, eval_set_mse = 126.60119451947244
i=340, eval_set_mse = 125.84256673880445
i=360, eval_set_mse = 125.05767219956273
i=380, eval_set_mse = 124.415584409826
i=400, eval_set_mse = 124.19087978563323
i=420, eval_set_mse = 123.57438634867817
i=440, eval_set_mse = 123.44686373133122
i=460, eval_set_mse = 123.25099983576902
i=480, eval_set_mse = 122.90073776

### Summary of results (multiple trials)

In [42]:
print('StructureBoost w County+Month Structure:     avg_loss = {}'.format(np.round(np.mean(stb_2_loss_vec),5)))
print('StructureBoost w County Structure:           avg_loss = {}'.format(np.round(np.mean(stb_1_loss_vec),5)))
print('StructureBoost (w no Structure):             avg_loss = {}'.format(np.round(np.mean(stb_def_loss_vec), 5)))
print('CatBoost:  (County categorical)              avg_loss = {}'.format(np.round(np.mean(cb1_loss_vec),5)))
print('CatBoost:  (County, Month categorical)       avg_loss = {}'.format(np.round(np.mean(cb2_loss_vec),5)))

StructureBoost w County+Month Structure:     avg_loss = 113.4804
StructureBoost w County Structure:           avg_loss = 116.5741
StructureBoost (w no Structure):             avg_loss = 118.76429
CatBoost:  (County categorical)              avg_loss = 131.20469
CatBoost:  (County, Month categorical)       avg_loss = 130.62965


In [43]:
## difference in loss fn for each different split
diff_vec = cb1_loss_vec-stb_2_loss_vec
diff_vec

array([14.62587543, 14.59959526, 16.37199249, 16.94429479, 18.42428842,
       19.14314381, 21.47389483, 17.48870133, 19.95067789, 18.22045268])

In [44]:
## mean and std deviation of loss_vec
np.mean(diff_vec), np.std(diff_vec)

(17.724291693310686, 2.088591724220677)

In [45]:
# min and max of discrepancy
np.min(diff_vec), np.max(diff_vec)

(14.599595256698422, 21.473894833245055)