## StructureBoost Quick Config
### Predicting rain and temperature in California counties

This is intended to show the quickest way to configure and run StructureBoost

## Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.metrics import log_loss, roc_auc_score
from sklearn.model_selection import train_test_split
import copy

import structureboost as stb

pd.set_option("display.max_rows",999)
pd.set_option("display.max_columns",999)

## Load and Process Data

In [2]:
df_ca_PRCP = pd.read_csv('data/CA_County_PRCP.csv')

In [3]:
df_ca_PRCP.sample(5)

Unnamed: 0,county,month,DATE,STATION,PRCP,rained,county_DATE
344842,Stanislaus,5,2000-05-24,USC00049073,0.0,0,Stanislaus___2000-05-24
311304,Shasta,2,2006-02-17,USW00004222,0.02,1,Shasta___2006-02-17
404956,Yolo,9,2008-09-10,US1CAYL0002,0.0,0,Yolo___2008-09-10
384602,Tulare,1,2012-01-04,US1CATL0011,0.0,0,Tulare___2012-01-04
151494,Mariposa,12,2006-12-06,USC00048380,0.0,0,Mariposa___2006-12-06


In [4]:
## We'll use just two features, county and month
## Target is whether or not it rained
X = df_ca_PRCP.loc[:,['county','month']]
y = df_ca_PRCP.rained.values

In [5]:
X_trva, X_test, y_trva, y_test = train_test_split(X,y, test_size = 70000)
X_train_big, X_valid, y_train_big, y_valid = train_test_split(X_trva, y_trva, test_size = 10000)

In [6]:
num_train_pts = 1000
X_train = X_train_big.iloc[:num_train_pts,:]
y_train = y_train_big[:num_train_pts]

In [7]:
default_configs = stb.default_config_dict()
default_configs

{'default_categorical_method': 'span_tree',
 'default_num_span_trees': 1,
 'default_contraction_size': 9,
 'default_contract_enum_max_splits_to_search': 25,
 'default_numerical_max_splits_to_search': 25}

In [8]:
feature_configs_default = stb.get_basic_config(X_train, default_configs)
feature_configs_default

{'county': {'feature_type': 'categorical_str',
  'graph': <graphs.graph_undirected at 0x110809668>,
  'split_method': 'span_tree',
  'num_span_trees': 1},
 'month': {'feature_type': 'numerical', 'max_splits_to_search': 25}}

In [9]:
stboost_CA_def = stb.StructureBoost(num_trees = 2500,
                                    learning_rate=.02,
                                    feature_configs=feature_configs_default, 
                                    max_depth=2,
                                    mode='classification')

In [10]:
stboost_CA_def.fit(X_train, y_train, 
                   eval_set = ((X_valid, y_valid)), 
                   early_stop_past_steps=1, 
                   eval_freq=20)

i=0, eval_set_log_loss = 0.32590229327757214
i=20, eval_set_log_loss = 0.30894817878093545
i=40, eval_set_log_loss = 0.3003506215425933
i=60, eval_set_log_loss = 0.29546991367391523
i=80, eval_set_log_loss = 0.290440894958872
i=100, eval_set_log_loss = 0.2904352925049023
i=120, eval_set_log_loss = 0.2893880464788341
i=140, eval_set_log_loss = 0.28883602948015
i=160, eval_set_log_loss = 0.2870946282797986
i=180, eval_set_log_loss = 0.28645427580616395
i=200, eval_set_log_loss = 0.2847343449218809
i=220, eval_set_log_loss = 0.2853781049141028
Stopping early: curr_loss of 0.2853781049141028
                                        exceeds compare_loss of 0.2847343449218809


In [11]:
pred_probs = stboost_CA_def.predict(X_test)

In [12]:
stb_default_loss = log_loss(y_test, pred_probs)
stb_default_loss

0.419451697586669

## Incorporate structure of "County" feature
Right now, the county feature is using a *complete* graph, where every vertex is adjacent to every other vertex.  This represents the situation where there is essentially no structure in the different values.


In [13]:
# 1653 edges means that every vertex is adjacent to every other one
default_graph = feature_configs_default['county']['graph']
len(default_graph.vertices), len(default_graph.edges), 58*57/2

(58, 1653, 1653.0)

In [14]:
# If you look at the edges, you will see there is an edge for every pair of counties
# whether they are adjacent or not
default_graph.edges

{frozenset({'Calaveras', 'Riverside'}),
 frozenset({'Alpine', 'Glenn'}),
 frozenset({'Kern', 'Yolo'}),
 frozenset({'Marin', 'Santa_Cruz'}),
 frozenset({'Del_Norte', 'Trinity'}),
 frozenset({'Kern', 'Ventura'}),
 frozenset({'Marin', 'Nevada'}),
 frozenset({'San_Luis_Obispo', 'Solano'}),
 frozenset({'Del_Norte', 'Santa_Cruz'}),
 frozenset({'Imperial', 'Yolo'}),
 frozenset({'San_Francisco', 'Sierra'}),
 frozenset({'Imperial', 'Santa_Barbara'}),
 frozenset({'Kings', 'Sutter'}),
 frozenset({'Trinity', 'Yolo'}),
 frozenset({'Nevada', 'Ventura'}),
 frozenset({'Calaveras', 'Siskiyou'}),
 frozenset({'Lake', 'San_Francisco'}),
 frozenset({'Santa_Cruz', 'Ventura'}),
 frozenset({'Merced', 'Santa_Barbara'}),
 frozenset({'Ventura', 'Yuba'}),
 frozenset({'Santa_Cruz', 'Yolo'}),
 frozenset({'San_Bernardino', 'San_Francisco'}),
 frozenset({'Kings', 'Sacramento'}),
 frozenset({'Lake', 'Yolo'}),
 frozenset({'Calaveras', 'Kern'}),
 frozenset({'Alameda', 'San_Bernardino'}),
 frozenset({'Kings', 'Napa'}),
 

Let's make a new config that uses the actual CA County adjacency graph. This graph is included in the package, so we don't have to build it from scratch

In [15]:
county_graph = stb.graphs.CA_county_graph()
len(county_graph.vertices), len(county_graph.edges)

(58, 133)

In [16]:
# This graph only has edges between adjacent counties
county_graph.edges

{frozenset({'Napa', 'Solano'}),
 frozenset({'Madera', 'Mono'}),
 frozenset({'Glenn', 'Lake'}),
 frozenset({'Mendocino', 'Sonoma'}),
 frozenset({'Santa_Barbara', 'Ventura'}),
 frozenset({'Kern', 'Ventura'}),
 frozenset({'Humboldt', 'Trinity'}),
 frozenset({'Butte', 'Glenn'}),
 frozenset({'Mendocino', 'Trinity'}),
 frozenset({'San_Mateo', 'Santa_Cruz'}),
 frozenset({'Monterey', 'San_Luis_Obispo'}),
 frozenset({'Alameda', 'Contra_Costa'}),
 frozenset({'Sacramento', 'Sutter'}),
 frozenset({'Fresno', 'Madera'}),
 frozenset({'Riverside', 'San_Bernardino'}),
 frozenset({'Butte', 'Yuba'}),
 frozenset({'Kern', 'San_Luis_Obispo'}),
 frozenset({'San_Joaquin', 'Stanislaus'}),
 frozenset({'Nevada', 'Yuba'}),
 frozenset({'Nevada', 'Placer'}),
 frozenset({'Lassen', 'Shasta'}),
 frozenset({'Lake', 'Yolo'}),
 frozenset({'Contra_Costa', 'Solano'}),
 frozenset({'Fresno', 'San_Benito'}),
 frozenset({'Fresno', 'Inyo'}),
 frozenset({'Fresno', 'Monterey'}),
 frozenset({'Santa_Clara', 'Santa_Cruz'}),
 frozens

In [17]:
# Copy the default configuration, and replace 
# the complete graph (created by the 'get_default_config')
# with the actual CA county graph
feature_configs_1 = copy.deepcopy(feature_configs_default)
feature_configs_1['county']['graph'] = county_graph

In [18]:
feature_configs_1

{'county': {'feature_type': 'categorical_str',
  'graph': <graphs.graph_undirected at 0x140c70a20>,
  'split_method': 'span_tree',
  'num_span_trees': 1},
 'month': {'feature_type': 'numerical', 'max_splits_to_search': 25}}

### Run the model with the new config

In [19]:
stboost_CA_1 = stb.StructureBoost(num_trees = 2500,
                                  learning_rate=.02,
                                  feature_configs=feature_configs_1, 
                                  max_depth=2,
                                  mode='classification')

In [20]:
stboost_CA_1.fit(X_train, y_train, 
                   eval_set = ((X_valid, y_valid)), 
                   early_stop_past_steps=1, 
                   eval_freq=20)

i=0, eval_set_log_loss = 0.32590229327757214
i=20, eval_set_log_loss = 0.3084853794774657
i=40, eval_set_log_loss = 0.2977134389958118
i=60, eval_set_log_loss = 0.2921539470460491
i=80, eval_set_log_loss = 0.2863091353247218
i=100, eval_set_log_loss = 0.2858277907767698
i=120, eval_set_log_loss = 0.28575681583463397
i=140, eval_set_log_loss = 0.2846861139212959
i=160, eval_set_log_loss = 0.2834378171878932
i=180, eval_set_log_loss = 0.28438558163641464
Stopping early: curr_loss of 0.28438558163641464
                                        exceeds compare_loss of 0.2834378171878932


In [21]:
pred_probs_1 = stboost_CA_1.predict(X_test)

In [22]:
stb_1_loss = log_loss(y_test, pred_probs_1)
stb_1_loss

0.4165023724440902

### Incorporating structure in the 'month' feature
Previously we treated the month as a numerical variable.  However, doing so ignores
the fact that December is "adjacent" to January in the same way that "July" is
adjacent to "August".  To more accurately model the structure, we will represent the months
by a "cycle" graph

In [23]:
month_graph = stb.graphs.cycle_int_graph(1,12)
month_graph.vertices, month_graph.edges

({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12},
 {frozenset({3, 4}),
  frozenset({2, 3}),
  frozenset({11, 12}),
  frozenset({9, 10}),
  frozenset({1, 2}),
  frozenset({4, 5}),
  frozenset({6, 7}),
  frozenset({8, 9}),
  frozenset({7, 8}),
  frozenset({1, 12}),
  frozenset({5, 6}),
  frozenset({10, 11})})

In [24]:
# Copy the previous config, and change the settings for month
feature_configs_2 = copy.deepcopy(feature_configs_1)
feature_configs_2['month']['feature_type'] = 'categorical_int'
feature_configs_2['month']['graph'] = month_graph
feature_configs_2

{'county': {'feature_type': 'categorical_str',
  'graph': <graphs.graph_undirected at 0x14141a588>,
  'split_method': 'span_tree',
  'num_span_trees': 1},
 'month': {'feature_type': 'categorical_int',
  'max_splits_to_search': 25,
  'graph': <graphs.graph_undirected at 0x14141a198>}}

In [25]:
default_configs

{'default_categorical_method': 'span_tree',
 'default_num_span_trees': 1,
 'default_contraction_size': 9,
 'default_contract_enum_max_splits_to_search': 25,
 'default_numerical_max_splits_to_search': 25}

In [26]:
feature_configs_2 = stb.apply_defaults(feature_configs_2, default_configs)
feature_configs_2
    

{'county': {'feature_type': 'categorical_str',
  'graph': <graphs.graph_undirected at 0x14141a588>,
  'split_method': 'span_tree',
  'num_span_trees': 1},
 'month': {'feature_type': 'categorical_int',
  'max_splits_to_search': 25,
  'graph': <graphs.graph_undirected at 0x14141a198>,
  'split_method': 'span_tree',
  'num_span_trees': 1}}

In [27]:
stboost_CA_2 = stb.StructureBoost(num_trees = 2500,
                                  learning_rate=.02,
                                  feature_configs=feature_configs_2, 
                                  max_depth=2,
                                  mode='classification',
                                  loss_fn='entropy')

In [28]:
stboost_CA_2.fit(X_train, y_train, 
                   eval_set = ((X_valid, y_valid)), 
                   early_stop_past_steps=1, 
                   eval_freq=20)

i=0, eval_set_log_loss = 0.32590229327757214
i=20, eval_set_log_loss = 0.3049229832330311
i=40, eval_set_log_loss = 0.29327660454952215
i=60, eval_set_log_loss = 0.2872463567037838
i=80, eval_set_log_loss = 0.28171685890567505
i=100, eval_set_log_loss = 0.2812647870996864
i=120, eval_set_log_loss = 0.28109108424748547
i=140, eval_set_log_loss = 0.28176265338525297
Stopping early: curr_loss of 0.28176265338525297
                                        exceeds compare_loss of 0.28109108424748547


In [29]:
pred_probs_2 = stboost_CA_2.predict(X_test)

In [30]:
stb_2_loss = log_loss(y_test, pred_probs_2)
stb_2_loss

0.4144562438524898

In [31]:
import catboost as cb

In [32]:
cat_features = ['county']
cb1 = cb.CatBoostClassifier(iterations=2500, early_stopping_rounds=20, max_depth=2,
                             learning_rate=.02, cat_features=cat_features)

In [33]:
cb1.fit(X_train, y_train, cat_features, eval_set=(X_valid, y_valid), verbose=25)

0:	learn: 0.6802545	test: 0.6810051	best: 0.6810051 (0)	total: 62.4ms	remaining: 2m 35s
25:	learn: 0.4867946	test: 0.5010124	best: 0.5010124 (25)	total: 121ms	remaining: 11.5s
50:	learn: 0.4275286	test: 0.4485353	best: 0.4485353 (50)	total: 160ms	remaining: 7.69s
75:	learn: 0.4077797	test: 0.4333481	best: 0.4333481 (75)	total: 195ms	remaining: 6.22s
100:	learn: 0.3978765	test: 0.4272911	best: 0.4272911 (100)	total: 229ms	remaining: 5.43s
125:	learn: 0.3918622	test: 0.4238112	best: 0.4238112 (125)	total: 263ms	remaining: 4.96s
150:	learn: 0.3878487	test: 0.4219185	best: 0.4218764 (148)	total: 315ms	remaining: 4.9s
175:	learn: 0.3852963	test: 0.4206484	best: 0.4206484 (175)	total: 354ms	remaining: 4.67s
200:	learn: 0.3829957	test: 0.4193014	best: 0.4193014 (200)	total: 390ms	remaining: 4.46s
225:	learn: 0.3813407	test: 0.4186958	best: 0.4186958 (225)	total: 430ms	remaining: 4.32s
250:	learn: 0.3800150	test: 0.4181878	best: 0.4181462 (242)	total: 476ms	remaining: 4.27s
275:	learn: 0.37870

<catboost.core.CatBoostClassifier at 0x143bab128>

In [34]:
pred_probs_cb1 = cb1.predict_proba(X_test)

In [35]:
cb1_loss = log_loss(y_test, pred_probs_cb1)
cb1_loss

0.421860049836407

In [36]:
cat_features = ['county', 'month']
cb2 = cb.CatBoostClassifier(iterations=1500, early_stopping_rounds=20, max_depth=2,
                             learning_rate=.02, cat_features=cat_features)

In [37]:
cb2.fit(X_train, y_train, cat_features, eval_set=(X_valid, y_valid), verbose=25)

0:	learn: 0.6804801	test: 0.6807121	best: 0.6807121 (0)	total: 5.06ms	remaining: 7.58s
25:	learn: 0.4910984	test: 0.5018932	best: 0.5018932 (25)	total: 58.9ms	remaining: 3.34s
50:	learn: 0.4327486	test: 0.4507187	best: 0.4507187 (50)	total: 106ms	remaining: 3.02s
75:	learn: 0.4116585	test: 0.4339647	best: 0.4339647 (75)	total: 146ms	remaining: 2.74s
100:	learn: 0.4024418	test: 0.4274307	best: 0.4274307 (100)	total: 190ms	remaining: 2.63s
125:	learn: 0.3973322	test: 0.4244698	best: 0.4244698 (125)	total: 244ms	remaining: 2.66s
150:	learn: 0.3937632	test: 0.4222099	best: 0.4221696 (148)	total: 288ms	remaining: 2.58s
175:	learn: 0.3918812	test: 0.4213257	best: 0.4213257 (175)	total: 329ms	remaining: 2.47s
200:	learn: 0.3905579	test: 0.4209208	best: 0.4209208 (200)	total: 370ms	remaining: 2.39s
225:	learn: 0.3891087	test: 0.4202722	best: 0.4202722 (225)	total: 411ms	remaining: 2.31s
250:	learn: 0.3879287	test: 0.4199273	best: 0.4199273 (250)	total: 451ms	remaining: 2.24s
275:	learn: 0.3865

<catboost.core.CatBoostClassifier at 0x143bb8470>

In [38]:
pred_probs_cb2 = cb2.predict_proba(X_test)

In [39]:
cb2_loss = log_loss(y_test, pred_probs_cb2)
cb2_loss

0.42601201309805

### Summary of results

In [40]:
print('StructureBoost (w no Structure):             loss = {}'.format(np.round(stb_default_loss, 5)))
print('StructureBoost w County Structure:           loss = {}'.format(np.round(stb_1_loss,5)))
print('StructureBoost w County+Month Structure:     loss = {}'.format(np.round(stb_2_loss,5)))
print('CatBoost:  (County categorical)              loss = {}'.format(np.round(cb1_loss,5)))
print('CatBoost:  (County, Month categorical)       loss = {}'.format(np.round(cb2_loss,5)))

StructureBoost (w no Structure):             loss = 0.41945
StructureBoost w County Structure:           loss = 0.4165
StructureBoost w County+Month Structure:     loss = 0.41446
CatBoost:  (County categorical)              loss = 0.42186
CatBoost:  (County, Month categorical)       loss = 0.42601


## Repeat multiple trials to show results aren't noise
In case you think the above results are just noise, let's run multiple trials and compare.

In [41]:
num_train_pts = 1000
num_trials = 10
stb_def_loss_vec = np.zeros(num_trials)
stb_1_loss_vec = np.zeros(num_trials)
stb_2_loss_vec = np.zeros(num_trials)
cb1_loss_vec = np.zeros(num_trials)
cb2_loss_vec = np.zeros(num_trials)
stb_def_AUC_vec = np.zeros(num_trials)
stb_1_AUC_vec = np.zeros(num_trials)
stb_2_AUC_vec = np.zeros(num_trials)
cb1_AUC_vec = np.zeros(num_trials)
cb2_AUC_vec = np.zeros(num_trials)
for i in range(num_trials):
    X_trva, X_test, y_trva, y_test = train_test_split(X,y, test_size = 70000, random_state=i)
    X_train_big, X_valid, y_train_big, y_valid = train_test_split(X_trva, y_trva, 
                                                                  test_size = 10000, random_state=i)
    X_train = X_train_big.iloc[:num_train_pts,:]
    y_train = y_train_big[:num_train_pts]
    

    stboost_CA_def = stb.StructureBoost(num_trees = 2500,
                                        learning_rate=.02,
                                        feature_configs=feature_configs_default, 
                                        max_depth=2,
                                        mode='classification',
                                        random_seed=i)
    stboost_CA_def.fit(X_train, y_train, 
                       eval_set = ((X_valid, y_valid)), 
                       early_stop_past_steps=1, 
                       eval_freq=20)

    pred_probs_def = stboost_CA_def.predict(X_test)
    stb_def_loss_vec[i] = log_loss(y_test, pred_probs_def)
    stb_def_AUC_vec[i] = roc_auc_score(y_test, pred_probs_def)

    stboost_CA_1 = stb.StructureBoost(num_trees = 2500,
                                      learning_rate=.02,
                                      feature_configs=feature_configs_1, 
                                      max_depth=2,
                                      mode='classification',
                                      random_seed=i)
    stboost_CA_1.fit(X_train, y_train, 
                       eval_set = ((X_valid, y_valid)), 
                       early_stop_past_steps=1, 
                       eval_freq=20)
    pred_probs_1 = stboost_CA_1.predict(X_test)
    stb_1_loss_vec[i] = log_loss(y_test, pred_probs_1)
    stb_1_AUC_vec[i] = roc_auc_score(y_test, pred_probs_1)
        

    stboost_CA_2 = stb.StructureBoost(num_trees = 2500,
                                      learning_rate=.02,
                                      feature_configs=feature_configs_2, 
                                      max_depth=2,
                                      mode='classification',
                                      random_seed=i)
    stboost_CA_2.fit(X_train, y_train, 
                       eval_set = ((X_valid, y_valid)), 
                       early_stop_past_steps=1, 
                       eval_freq=20)
    pred_probs_2 = stboost_CA_2.predict(X_test)
    stb_2_loss_vec[i] = log_loss(y_test, pred_probs_2)
    stb_2_AUC_vec[i] = roc_auc_score(y_test, pred_probs_2)

    
    cat_features_1 = ['county']
    cb1 = cb.CatBoostClassifier(iterations=2500, early_stopping_rounds=20,
                                max_depth=2,learning_rate=.02,
                                cat_features=cat_features_1, random_seed=i)
    cb1.fit(X_train, y_train, cat_features_1, eval_set=(X_valid, y_valid), verbose=25)
    pred_probs_cb1 = cb1.predict_proba(X_test)
    cb1_loss_vec = log_loss(y_test, pred_probs_cb1)
    cb1_AUC_vec = roc_auc_score(y_test, pred_probs_cb1[:,1])
    
    
    cat_features_2 = ['county', 'month']
    cb2 = cb.CatBoostClassifier(iterations=2500, early_stopping_rounds=20,
                                max_depth=2,learning_rate=.02,
                                cat_features=cat_features_2, random_seed=i)

    cb2.fit(X_train, y_train, cat_features_2, eval_set=(X_valid, y_valid), verbose=25)
    pred_probs_cb2 = cb2.predict_proba(X_test)
    cb2_loss_vec[i] = log_loss(y_test, pred_probs_cb2)
    cb2_AUC_vec[i] = roc_auc_score(y_test, pred_probs_cb2[:,1])
    

i=0, eval_set_log_loss = 0.32913163088919645
i=20, eval_set_log_loss = 0.30614917384146645
i=40, eval_set_log_loss = 0.29789810656868615
i=60, eval_set_log_loss = 0.2940673567900231
i=80, eval_set_log_loss = 0.29116487597991136
i=100, eval_set_log_loss = 0.2884212481088492
i=120, eval_set_log_loss = 0.2876242980571692
i=140, eval_set_log_loss = 0.2878666315978835
Stopping early: curr_loss of 0.2878666315978835
                                        exceeds compare_loss of 0.2876242980571692
i=0, eval_set_log_loss = 0.32913163088919645
i=20, eval_set_log_loss = 0.30463609723402774
i=40, eval_set_log_loss = 0.2942140347564769
i=60, eval_set_log_loss = 0.29038723373804515
i=80, eval_set_log_loss = 0.2861844193934331
i=100, eval_set_log_loss = 0.2839441516459698
i=120, eval_set_log_loss = 0.28289733395288497
i=140, eval_set_log_loss = 0.28380628632775307
Stopping early: curr_loss of 0.28380628632775307
                                        exceeds compare_loss of 0.28289733395288497
i=0

### Summary of results (multiple trials)

In [42]:
print('Average log_loss Scores:')
print('StructureBoost w County+Month Structure:     avg_loss = {}'.format(np.round(np.mean(stb_2_loss_vec),5)))
print('StructureBoost w County Structure:           avg_loss = {}'.format(np.round(np.mean(stb_1_loss_vec),5)))
print('StructureBoost (w no Structure):             avg_loss = {}'.format(np.round(np.mean(stb_def_loss_vec), 5)))
print('CatBoost:  (County categorical)              avg_loss = {}'.format(np.round(np.mean(cb1_loss_vec),5)))
print('CatBoost:  (County, Month categorical)       avg_loss = {}'.format(np.round(np.mean(cb2_loss_vec),5)))
print('\nAverage ROC AUC Scores:')
print('StructureBoost w County+Month Structure:     avg_auc = {}'.format(np.round(np.mean(stb_2_AUC_vec),5)))
print('StructureBoost w County Structure:           avg_auc = {}'.format(np.round(np.mean(stb_1_AUC_vec),5)))
print('StructureBoost (w no Structure):             avg_auc = {}'.format(np.round(np.mean(stb_def_AUC_vec), 5)))
print('CatBoost:  (County categorical)              avg_auc = {}'.format(np.round(np.mean(cb1_AUC_vec),5)))
print('CatBoost:  (County, Month categorical)       avg_auc = {}'.format(np.round(np.mean(cb2_AUC_vec),5)))

Average log_loss Scores:
StructureBoost w County+Month Structure:     avg_loss = 0.41114
StructureBoost w County Structure:           avg_loss = 0.41426
StructureBoost (w no Structure):             avg_loss = 0.4213
CatBoost:  (County categorical)              avg_loss = 0.42216
CatBoost:  (County, Month categorical)       avg_loss = 0.42129

Average ROC AUC Scores:
StructureBoost w County+Month Structure:     avg_auc = 0.75135
StructureBoost w County Structure:           avg_auc = 0.74762
StructureBoost (w no Structure):             avg_auc = 0.7327
CatBoost:  (County categorical)              avg_auc = 0.73303
CatBoost:  (County, Month categorical)       avg_auc = 0.72498


In [43]:
log_loss_diff_vec = cb1_loss_vec-stb_2_loss_vec
log_loss_diff_vec

array([0.01614603, 0.01004504, 0.00954294, 0.01229329, 0.01048549,
       0.01264241, 0.013971  , 0.0148619 , 0.00550564, 0.00477185])

In [44]:
## mean and std deviation of log_loss difference
## between StructureBoost and CatBoost
np.mean(log_loss_diff_vec), np.std(log_loss_diff_vec)

(0.011026557709573342, 0.003561787934779056)

In [45]:
# min and max of discrepancy
np.min(log_loss_diff_vec), np.max(log_loss_diff_vec)

(0.0047718490817256765, 0.01614603065233311)

In [46]:
roc_auc_score(y_test,pred_probs_2), roc_auc_score(y_test,pred_probs_cb1[:,1])

(0.7478541381400026, 0.7330267813829326)

In [47]:
auc_diff_vec = stb_2_AUC_vec - cb1_AUC_vec
auc_diff_vec

array([0.02731177, 0.01961536, 0.01997082, 0.01266592, 0.01747888,
       0.01871308, 0.01471183, 0.01905561, 0.01890042, 0.01482736])

In [48]:
## mean and std deviation of AUC difference
## between StructureBoost and CatBoost
np.mean(auc_diff_vec), np.std(auc_diff_vec)

(0.018325103539361166, 0.0037971147873122747)

In [49]:
# min and max of discrepancy
np.min(auc_diff_vec), np.max(auc_diff_vec)

(0.012665920290745314, 0.027311765113495534)