## StructureBoost Quick Config
### Predicting rain and temperature in California counties

This is intended to show the quickest way to configure and run StructureBoost

## Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.metrics import log_loss, roc_auc_score
from sklearn.model_selection import train_test_split
import copy

import structureboost as stb

pd.set_option("display.max_rows",999)
pd.set_option("display.max_columns",999)

## Load and Process Data

In [2]:
df_ca_PRCP = pd.read_csv('data/CA_County_PRCP.csv')

In [3]:
df_ca_PRCP.sample(5)

Unnamed: 0,county,month,DATE,STATION,PRCP,rained,county_DATE
35231,Calaveras,2,2019-02-09,USC00041277,0.22,1,Calaveras___2019-02-09
93040,Inyo,9,2003-09-14,USC00040820,0.0,0,Inyo___2003-09-14
338232,Sonoma,9,2001-09-01,USC00043191,0.0,0,Sonoma___2001-09-01
52339,Del_Norte,3,2010-03-22,USW00024286,0.01,1,Del_Norte___2010-03-22
259786,San_Diego,2,2015-02-05,US1CASD0090,0.0,0,San_Diego___2015-02-05


In [4]:
## We'll use just two features, county and month
## Target is whether or not it rained
X = df_ca_PRCP.loc[:,['county','month']]
y = df_ca_PRCP.rained.values

In [5]:
X_trva, X_test, y_trva, y_test = train_test_split(X,y, test_size = 70000)
X_train_big, X_valid, y_train_big, y_valid = train_test_split(X_trva, y_trva, test_size = 10000)

In [6]:
num_train_pts = 1000
X_train = X_train_big.iloc[:num_train_pts,:]
y_train = y_train_big[:num_train_pts]

In [7]:
default_configs = stb.default_config_dict()
default_configs

{'default_categorical_method': 'span_tree',
 'default_num_span_trees': 1,
 'default_contraction_size': 9,
 'default_contract_enum_max_splits_to_search': 25,
 'default_numerical_max_splits_to_search': 25}

In [8]:
feature_configs_default = stb.get_basic_config(X_train, default_configs)
feature_configs_default

{'county': {'feature_type': 'categorical_str',
  'graph': <graphs.graph_undirected at 0x1107146d8>,
  'split_method': 'span_tree',
  'num_span_trees': 1},
 'month': {'feature_type': 'numerical', 'max_splits_to_search': 25}}

In [9]:
stboost_CA_def = stb.StructureBoost(num_trees = 2500,
                                    learning_rate=.02,
                                    feature_configs=feature_configs_default, 
                                    max_depth=2,
                                    mode='classification')

In [10]:
stboost_CA_def.fit(X_train, y_train, 
                   eval_set = ((X_valid, y_valid)), 
                   early_stop_past_steps=1, 
                   eval_freq=20)

i=0, eval_set_log_loss = 0.30470751384716727
i=20, eval_set_log_loss = 0.2825339871660573
i=40, eval_set_log_loss = 0.27464135285139607
i=60, eval_set_log_loss = 0.2691100875442132
i=80, eval_set_log_loss = 0.2681711265306045
i=100, eval_set_log_loss = 0.26794133719429203
i=120, eval_set_log_loss = 0.26653040329200894
i=140, eval_set_log_loss = 0.2660143717750938
i=160, eval_set_log_loss = 0.26296295568504885
i=180, eval_set_log_loss = 0.2625737662964804
i=200, eval_set_log_loss = 0.2635148381333408
Stopping early: curr_loss of 0.2635148381333408
                                        exceeds compare_loss of 0.2625737662964804


In [11]:
pred_probs = stboost_CA_def.predict(X_test)

In [12]:
stb_default_loss = log_loss(y_test, pred_probs)
stb_default_loss

0.4207399947148069

## Incorporate structure of "County" feature
Right now, the county feature is using a *complete* graph, where every vertex is adjacent to every other vertex.  This represents the situation where there is essentially no structure in the different values.


In [13]:
# 1653 edges means that every vertex is adjacent to every other one
default_graph = feature_configs_default['county']['graph']
len(default_graph.vertices), len(default_graph.edges), 58*57/2

(58, 1653, 1653.0)

In [14]:
# If you look at the edges, you will see there is an edge for every pair of counties
# whether they are adjacent or not
default_graph.edges;

Let's make a new config that uses the actual CA County adjacency graph. This graph is included in the package, so we don't have to build it from scratch

In [15]:
county_graph = stb.graphs.CA_county_graph()
len(county_graph.vertices), len(county_graph.edges)

(58, 133)

In [16]:
# This graph only has edges between adjacent counties
county_graph.edges;

In [17]:
# Copy the default configuration, and replace 
# the complete graph (created by the 'get_default_config')
# with the actual CA county graph
feature_configs_1 = copy.deepcopy(feature_configs_default)
feature_configs_1['county']['graph'] = county_graph

In [18]:
feature_configs_1

{'county': {'feature_type': 'categorical_str',
  'graph': <graphs.graph_undirected at 0x13cf2d080>,
  'split_method': 'span_tree',
  'num_span_trees': 1},
 'month': {'feature_type': 'numerical', 'max_splits_to_search': 25}}

### Run the model with the new config

In [19]:
stboost_CA_1 = stb.StructureBoost(num_trees = 2500,
                                  learning_rate=.02,
                                  feature_configs=feature_configs_1, 
                                  max_depth=2,
                                  mode='classification')

In [20]:
stboost_CA_1.fit(X_train, y_train, 
                   eval_set = ((X_valid, y_valid)), 
                   early_stop_past_steps=1, 
                   eval_freq=20)

i=0, eval_set_log_loss = 0.30470751384716727
i=20, eval_set_log_loss = 0.2820092883920109
i=40, eval_set_log_loss = 0.2729456848445308
i=60, eval_set_log_loss = 0.2672527721504525
i=80, eval_set_log_loss = 0.2653239438357137
i=100, eval_set_log_loss = 0.264194013124146
i=120, eval_set_log_loss = 0.26167533105143165
i=140, eval_set_log_loss = 0.26117118748327706
i=160, eval_set_log_loss = 0.25840507138617064
i=180, eval_set_log_loss = 0.257723073703359
i=200, eval_set_log_loss = 0.2581142055624295
Stopping early: curr_loss of 0.2581142055624295
                                        exceeds compare_loss of 0.257723073703359


In [21]:
pred_probs_1 = stboost_CA_1.predict(X_test)

In [22]:
stb_1_loss = log_loss(y_test, pred_probs_1)
stb_1_loss

0.41488352662292005

### Incorporating structure in the 'month' feature
Previously we treated the month as a numerical variable.  However, doing so ignores
the fact that December is "adjacent" to January in the same way that "July" is
adjacent to "August".  To more accurately model the structure, we will represent the months
by a "cycle" graph

In [23]:
month_graph = stb.graphs.cycle_int_graph(1,12)
month_graph.vertices, month_graph.edges

({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12},
 {frozenset({3, 4}),
  frozenset({2, 3}),
  frozenset({11, 12}),
  frozenset({9, 10}),
  frozenset({1, 2}),
  frozenset({4, 5}),
  frozenset({6, 7}),
  frozenset({8, 9}),
  frozenset({7, 8}),
  frozenset({1, 12}),
  frozenset({5, 6}),
  frozenset({10, 11})})

In [24]:
# Copy the previous config, and change the settings for month
feature_configs_2 = copy.deepcopy(feature_configs_1)
feature_configs_2['month']['feature_type'] = 'categorical_int'
feature_configs_2['month']['graph'] = month_graph
feature_configs_2

{'county': {'feature_type': 'categorical_str',
  'graph': <graphs.graph_undirected at 0x13cf2df28>,
  'split_method': 'span_tree',
  'num_span_trees': 1},
 'month': {'feature_type': 'categorical_int',
  'max_splits_to_search': 25,
  'graph': <graphs.graph_undirected at 0x13d003438>}}

In [25]:
default_configs

{'default_categorical_method': 'span_tree',
 'default_num_span_trees': 1,
 'default_contraction_size': 9,
 'default_contract_enum_max_splits_to_search': 25,
 'default_numerical_max_splits_to_search': 25}

In [26]:
feature_configs_2 = stb.apply_defaults(feature_configs_2, default_configs)
feature_configs_2
    

{'county': {'feature_type': 'categorical_str',
  'graph': <graphs.graph_undirected at 0x13cf2df28>,
  'split_method': 'span_tree',
  'num_span_trees': 1},
 'month': {'feature_type': 'categorical_int',
  'max_splits_to_search': 25,
  'graph': <graphs.graph_undirected at 0x13d003438>,
  'split_method': 'span_tree',
  'num_span_trees': 1}}

In [27]:
stboost_CA_2 = stb.StructureBoost(num_trees = 2500,
                                  learning_rate=.02,
                                  feature_configs=feature_configs_2, 
                                  max_depth=2,
                                  mode='classification',
                                  loss_fn='entropy')

In [28]:
stboost_CA_2.fit(X_train, y_train, 
                   eval_set = ((X_valid, y_valid)), 
                   early_stop_past_steps=1, 
                   eval_freq=20)

i=0, eval_set_log_loss = 0.30470751384716727
i=20, eval_set_log_loss = 0.28073725937933014
i=40, eval_set_log_loss = 0.2705290790572338
i=60, eval_set_log_loss = 0.2635102217535028
i=80, eval_set_log_loss = 0.26082585474221887
i=100, eval_set_log_loss = 0.26030615427055237
i=120, eval_set_log_loss = 0.2585064760940734
i=140, eval_set_log_loss = 0.25816707212182555
i=160, eval_set_log_loss = 0.2556342081843798
i=180, eval_set_log_loss = 0.255506337493032
i=200, eval_set_log_loss = 0.2563616988557239
Stopping early: curr_loss of 0.2563616988557239
                                        exceeds compare_loss of 0.255506337493032


In [29]:
pred_probs_2 = stboost_CA_2.predict(X_test)

In [30]:
stb_2_loss = log_loss(y_test, pred_probs_2)
stb_2_loss

0.41221413014323555

In [31]:
import catboost as cb

In [32]:
cat_features = ['county']
cb1 = cb.CatBoostClassifier(iterations=2500, early_stopping_rounds=20, max_depth=2,
                             learning_rate=.02, cat_features=cat_features)

In [33]:
cb1.fit(X_train, y_train, cat_features, eval_set=(X_valid, y_valid), verbose=25)

0:	learn: 0.6819753	test: 0.6820015	best: 0.6820015 (0)	total: 56.7ms	remaining: 2m 21s
25:	learn: 0.5110601	test: 0.5112570	best: 0.5112570 (25)	total: 100ms	remaining: 9.56s
50:	learn: 0.4563865	test: 0.4584775	best: 0.4584775 (50)	total: 131ms	remaining: 6.27s
75:	learn: 0.4358038	test: 0.4402527	best: 0.4402527 (75)	total: 159ms	remaining: 5.07s
100:	learn: 0.4262112	test: 0.4321280	best: 0.4321280 (100)	total: 187ms	remaining: 4.44s
125:	learn: 0.4169196	test: 0.4251842	best: 0.4251842 (125)	total: 214ms	remaining: 4.04s
150:	learn: 0.4128215	test: 0.4227207	best: 0.4227207 (150)	total: 241ms	remaining: 3.75s
175:	learn: 0.4094387	test: 0.4213565	best: 0.4213565 (175)	total: 269ms	remaining: 3.56s
200:	learn: 0.4073417	test: 0.4201174	best: 0.4201174 (200)	total: 299ms	remaining: 3.42s
225:	learn: 0.4052567	test: 0.4194354	best: 0.4194354 (225)	total: 327ms	remaining: 3.29s
250:	learn: 0.4031150	test: 0.4184901	best: 0.4184901 (250)	total: 355ms	remaining: 3.18s
275:	learn: 0.4016

<catboost.core.CatBoostClassifier at 0x143077780>

In [34]:
pred_probs_cb1 = cb1.predict_proba(X_test)

In [35]:
cb1_loss = log_loss(y_test, pred_probs_cb1)
cb1_loss

0.42137835844026195

In [36]:
cat_features = ['county', 'month']
cb2 = cb.CatBoostClassifier(iterations=1500, early_stopping_rounds=20, max_depth=2,
                             learning_rate=.02, cat_features=cat_features)

In [37]:
cb2.fit(X_train, y_train, cat_features, eval_set=(X_valid, y_valid), verbose=25)

0:	learn: 0.6799462	test: 0.6801343	best: 0.6801343 (0)	total: 2.6ms	remaining: 3.9s
25:	learn: 0.5083886	test: 0.5039575	best: 0.5039575 (25)	total: 35.4ms	remaining: 2.01s
50:	learn: 0.4550643	test: 0.4495641	best: 0.4495641 (50)	total: 71.5ms	remaining: 2.03s
75:	learn: 0.4324459	test: 0.4281425	best: 0.4281425 (75)	total: 101ms	remaining: 1.9s
100:	learn: 0.4248116	test: 0.4223347	best: 0.4223347 (100)	total: 133ms	remaining: 1.84s
125:	learn: 0.4191064	test: 0.4193370	best: 0.4193370 (125)	total: 162ms	remaining: 1.77s
150:	learn: 0.4150074	test: 0.4181670	best: 0.4181670 (150)	total: 195ms	remaining: 1.74s
175:	learn: 0.4126938	test: 0.4177484	best: 0.4177484 (175)	total: 225ms	remaining: 1.69s
200:	learn: 0.4110198	test: 0.4174588	best: 0.4174588 (200)	total: 255ms	remaining: 1.65s
225:	learn: 0.4099088	test: 0.4173291	best: 0.4173254 (223)	total: 285ms	remaining: 1.6s
250:	learn: 0.4087219	test: 0.4173688	best: 0.4172857 (230)	total: 314ms	remaining: 1.56s
Stopped by overfittin

<catboost.core.CatBoostClassifier at 0x1406c37b8>

In [38]:
pred_probs_cb2 = cb2.predict_proba(X_test)

In [39]:
cb2_loss = log_loss(y_test, pred_probs_cb2)
cb2_loss

0.42065301005902733

### Summary of results

In [40]:
print('StructureBoost (w no Structure):             loss = {}'.format(np.round(stb_default_loss, 5)))
print('StructureBoost w County Structure:           loss = {}'.format(np.round(stb_1_loss,5)))
print('StructureBoost w County+Month Structure:     loss = {}'.format(np.round(stb_2_loss,5)))
print('CatBoost:  (County categorical)              loss = {}'.format(np.round(cb1_loss,5)))
print('CatBoost:  (County, Month categorical)       loss = {}'.format(np.round(cb2_loss,5)))

StructureBoost (w no Structure):             loss = 0.42074
StructureBoost w County Structure:           loss = 0.41488
StructureBoost w County+Month Structure:     loss = 0.41221
CatBoost:  (County categorical)              loss = 0.42138
CatBoost:  (County, Month categorical)       loss = 0.42065


## Repeat multiple trials to show results aren't noise
In case you think the above results are just noise, let's run multiple trials and compare.

In [41]:
num_train_pts = 1000
num_trials = 10
stb_def_loss_vec = np.zeros(num_trials)
stb_1_loss_vec = np.zeros(num_trials)
stb_2_loss_vec = np.zeros(num_trials)
cb1_loss_vec = np.zeros(num_trials)
cb2_loss_vec = np.zeros(num_trials)
stb_def_AUC_vec = np.zeros(num_trials)
stb_1_AUC_vec = np.zeros(num_trials)
stb_2_AUC_vec = np.zeros(num_trials)
cb1_AUC_vec = np.zeros(num_trials)
cb2_AUC_vec = np.zeros(num_trials)
for i in range(num_trials):
    X_trva, X_test, y_trva, y_test = train_test_split(X,y, test_size = 70000, random_state=i)
    X_train_big, X_valid, y_train_big, y_valid = train_test_split(X_trva, y_trva, 
                                                                  test_size = 10000, random_state=i)
    X_train = X_train_big.iloc[:num_train_pts,:]
    y_train = y_train_big[:num_train_pts]
    

    stboost_CA_def = stb.StructureBoost(num_trees = 2500,
                                        learning_rate=.02,
                                        feature_configs=feature_configs_default, 
                                        max_depth=2,
                                        mode='classification',
                                        random_seed=i)
    stboost_CA_def.fit(X_train, y_train, 
                       eval_set = ((X_valid, y_valid)), 
                       early_stop_past_steps=1, 
                       eval_freq=20)

    pred_probs_def = stboost_CA_def.predict(X_test)
    stb_def_loss_vec[i] = log_loss(y_test, pred_probs_def)
    stb_def_AUC_vec[i] = roc_auc_score(y_test, pred_probs_def)

    stboost_CA_1 = stb.StructureBoost(num_trees = 2500,
                                      learning_rate=.02,
                                      feature_configs=feature_configs_1, 
                                      max_depth=2,
                                      mode='classification',
                                      random_seed=i)
    stboost_CA_1.fit(X_train, y_train, 
                       eval_set = ((X_valid, y_valid)), 
                       early_stop_past_steps=1, 
                       eval_freq=20)
    pred_probs_1 = stboost_CA_1.predict(X_test)
    stb_1_loss_vec[i] = log_loss(y_test, pred_probs_1)
    stb_1_AUC_vec[i] = roc_auc_score(y_test, pred_probs_1)
        

    stboost_CA_2 = stb.StructureBoost(num_trees = 2500,
                                      learning_rate=.02,
                                      feature_configs=feature_configs_2, 
                                      max_depth=2,
                                      mode='classification',
                                      random_seed=i)
    stboost_CA_2.fit(X_train, y_train, 
                       eval_set = ((X_valid, y_valid)), 
                       early_stop_past_steps=1, 
                       eval_freq=20)
    pred_probs_2 = stboost_CA_2.predict(X_test)
    stb_2_loss_vec[i] = log_loss(y_test, pred_probs_2)
    stb_2_AUC_vec[i] = roc_auc_score(y_test, pred_probs_2)

    
    cat_features_1 = ['county']
    cb1 = cb.CatBoostClassifier(iterations=2500, early_stopping_rounds=20,
                                max_depth=2,learning_rate=.02,
                                cat_features=cat_features_1, random_seed=i)
    cb1.fit(X_train, y_train, cat_features_1, eval_set=(X_valid, y_valid), verbose=25)
    pred_probs_cb1 = cb1.predict_proba(X_test)
    cb1_loss_vec = log_loss(y_test, pred_probs_cb1)
    cb1_AUC_vec = roc_auc_score(y_test, pred_probs_cb1[:,1])
    
    
    cat_features_2 = ['county', 'month']
    cb2 = cb.CatBoostClassifier(iterations=2500, early_stopping_rounds=20,
                                max_depth=2,learning_rate=.02,
                                cat_features=cat_features_2, random_seed=i)

    cb2.fit(X_train, y_train, cat_features_2, eval_set=(X_valid, y_valid), verbose=25)
    pred_probs_cb2 = cb2.predict_proba(X_test)
    cb2_loss_vec[i] = log_loss(y_test, pred_probs_cb2)
    cb2_AUC_vec[i] = roc_auc_score(y_test, pred_probs_cb2[:,1])
    

i=0, eval_set_log_loss = 0.32913163088919645
i=20, eval_set_log_loss = 0.30568408321873275
i=40, eval_set_log_loss = 0.2976570441967077
i=60, eval_set_log_loss = 0.29477411287661714
i=80, eval_set_log_loss = 0.291514853059641
i=100, eval_set_log_loss = 0.28849017963294954
i=120, eval_set_log_loss = 0.2871746936030187
i=140, eval_set_log_loss = 0.28776707993936307
Stopping early: curr_loss of 0.28776707993936307
                                        exceeds compare_loss of 0.2871746936030187
i=0, eval_set_log_loss = 0.32913163088919645
i=20, eval_set_log_loss = 0.3046327359529405
i=40, eval_set_log_loss = 0.2936356391202663
i=60, eval_set_log_loss = 0.28949039583502656
i=80, eval_set_log_loss = 0.2858321939889164
i=100, eval_set_log_loss = 0.2832541483413318
i=120, eval_set_log_loss = 0.2816698698017459
i=140, eval_set_log_loss = 0.2828234106386253
Stopping early: curr_loss of 0.2828234106386253
                                        exceeds compare_loss of 0.2816698698017459
i=0, ev

### Summary of results (multiple trials)

In [42]:
print('Average log_loss Scores:')
print('StructureBoost w County+Month Structure:     avg_loss = {}'.format(np.round(np.mean(stb_2_loss_vec),5)))
print('StructureBoost w County Structure:           avg_loss = {}'.format(np.round(np.mean(stb_1_loss_vec),5)))
print('StructureBoost (w no Structure):             avg_loss = {}'.format(np.round(np.mean(stb_def_loss_vec), 5)))
print('CatBoost:  (County categorical)              avg_loss = {}'.format(np.round(np.mean(cb1_loss_vec),5)))
print('CatBoost:  (County, Month categorical)       avg_loss = {}'.format(np.round(np.mean(cb2_loss_vec),5)))
print('\nAverage ROC AUC Scores:')
print('StructureBoost w County+Month Structure:     avg_auc = {}'.format(np.round(np.mean(stb_2_AUC_vec),5)))
print('StructureBoost w County Structure:           avg_auc = {}'.format(np.round(np.mean(stb_1_AUC_vec),5)))
print('StructureBoost (w no Structure):             avg_auc = {}'.format(np.round(np.mean(stb_def_AUC_vec), 5)))
print('CatBoost:  (County categorical)              avg_auc = {}'.format(np.round(np.mean(cb1_AUC_vec),5)))
print('CatBoost:  (County, Month categorical)       avg_auc = {}'.format(np.round(np.mean(cb2_AUC_vec),5)))

Average log_loss Scores:
StructureBoost w County+Month Structure:     avg_loss = 0.4105
StructureBoost w County Structure:           avg_loss = 0.41495
StructureBoost (w no Structure):             avg_loss = 0.42002
CatBoost:  (County categorical)              avg_loss = 0.42216
CatBoost:  (County, Month categorical)       avg_loss = 0.42129

Average ROC AUC Scores:
StructureBoost w County+Month Structure:     avg_auc = 0.75148
StructureBoost w County Structure:           avg_auc = 0.74744
StructureBoost (w no Structure):             avg_auc = 0.73501
CatBoost:  (County categorical)              avg_auc = 0.73303
CatBoost:  (County, Month categorical)       avg_auc = 0.72498


In [43]:
log_loss_diff_vec = cb1_loss_vec-stb_2_loss_vec
log_loss_diff_vec

array([0.01579697, 0.01061359, 0.01067496, 0.01261173, 0.01131626,
       0.01287495, 0.01398749, 0.01436585, 0.0053724 , 0.00898121])

In [44]:
## mean and std deviation of log_loss difference
## between StructureBoost and CatBoost
np.mean(log_loss_diff_vec), np.std(log_loss_diff_vec)

(0.011659540488329345, 0.0028483070398436614)

In [45]:
# min and max of discrepancy
np.min(log_loss_diff_vec), np.max(log_loss_diff_vec)

(0.005372401296449725, 0.01579697357119242)

In [46]:
roc_auc_score(y_test,pred_probs_2), roc_auc_score(y_test,pred_probs_cb1[:,1])

(0.7490373908348027, 0.7330267813829326)

In [47]:
auc_diff_vec = stb_2_AUC_vec - cb1_AUC_vec
auc_diff_vec

array([0.02671785, 0.02033665, 0.02035566, 0.01356166, 0.01831135,
       0.01951565, 0.01429184, 0.01791736, 0.01751857, 0.01601061])

In [48]:
## mean and std deviation of AUC difference
## between StructureBoost and CatBoost
np.mean(auc_diff_vec), np.std(auc_diff_vec)

(0.01845371970478523, 0.0035354061933001303)

In [49]:
# min and max of discrepancy
np.min(auc_diff_vec), np.max(auc_diff_vec)

(0.013561658639675955, 0.02671784619986084)