## StructureBoost Quick Config
### Predicting rain and temperature in California counties

This is intended to show the quickest way to configure and run StructureBoost

## Imports

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

from sklearn.metrics import log_loss, roc_auc_score
from sklearn.model_selection import train_test_split
import copy

import structureboost as stb

pd.set_option("display.max_rows",999)
pd.set_option("display.max_columns",999)

## Load and Process Data

In [2]:
df_ca_PRCP = pd.read_csv('data/CA_County_PRCP.csv')

In [3]:
df_ca_PRCP.sample(5)

Unnamed: 0,county,month,DATE,STATION,PRCP,rained,county_DATE
260157,San_Diego,2,2016-02-11,US1CASD0004,0.0,0,San_Diego___2016-02-11
397243,Ventura,4,2007-04-20,USC00046399,0.15,1,Ventura___2007-04-20
106486,Kings,3,2001-03-13,USW00023110,0.0,0,Kings___2001-03-13
298389,Santa_Clara,7,2006-07-06,USC00043417,0.0,0,Santa_Clara___2006-07-06
99494,Kern,1,2002-01-01,USC00041244,0.0,0,Kern___2002-01-01


In [4]:
## We'll use just two features, county and month
## Target is whether or not it rained
X = df_ca_PRCP.loc[:,['county','month']]
y = df_ca_PRCP.rained.values

In [5]:
X_trva, X_test, y_trva, y_test = train_test_split(X,y, test_size = 70000)
X_train_big, X_valid, y_train_big, y_valid = train_test_split(X_trva, y_trva, test_size = 10000)

In [6]:
num_train_pts = 1000
X_train = X_train_big.iloc[:num_train_pts,:]
y_train = y_train_big[:num_train_pts]

In [7]:
default_configs = stb.default_config_dict()
default_configs

{'default_categorical_method': 'span_tree',
 'default_num_span_trees': 1,
 'default_contraction_size': 9,
 'default_contraction_max_splits_to_search': 25,
 'default_numerical_max_splits_to_search': 25}

In [8]:
feature_configs_default = stb.get_basic_config(X_train, default_configs)
feature_configs_default

{'county': {'feature_type': 'categorical_str',
  'graph': <graphs.graph_undirected at 0x10ea75860>,
  'split_method': 'span_tree',
  'num_span_trees': 1},
 'month': {'feature_type': 'numerical', 'max_splits_to_search': 25}}

In [9]:
stboost_CA_def = stb.StructureBoost(num_trees = 2500,
                                    learning_rate=.02,
                                    feature_configs=feature_configs_default, 
                                    max_depth=2,
                                    mode='classification')

In [10]:
stboost_CA_def.fit(X_train, y_train, 
                   eval_set = ((X_valid, y_valid)), 
                   early_stop_past_steps=1, 
                   eval_freq=20)

i=0, eval_set_log_loss = 0.2950068658367074
i=20, eval_set_log_loss = 0.273869668591157
i=40, eval_set_log_loss = 0.26352277525089984
i=60, eval_set_log_loss = 0.2585443496552743
i=80, eval_set_log_loss = 0.2551305891278421
i=100, eval_set_log_loss = 0.2530101200913495
i=120, eval_set_log_loss = 0.25203265136796865
i=140, eval_set_log_loss = 0.2512958219084071
i=160, eval_set_log_loss = 0.2498105951257714
i=180, eval_set_log_loss = 0.2486438565654821
i=200, eval_set_log_loss = 0.24936482648871333
Stopping early: curr_loss of 0.24936482648871333
                                        exceeds compare_loss of 0.2486438565654821


In [11]:
pred_probs = stboost_CA_def.predict(X_test)

In [12]:
stb_default_loss = log_loss(y_test, pred_probs)
stb_default_loss

0.41478261748957146

## Incorporate structure of "County" feature
Right now, the county feature is using a *complete* graph, where every vertex is adjacent to every other vertex.  This represents the situation where there is essentially no structure in the different values.


In [13]:
# 1653 edges means that every vertex is adjacent to every other one
default_graph = feature_configs_default['county']['graph']
len(default_graph.vertices), len(default_graph.edges), 58*57/2

(58, 1653, 1653.0)

In [14]:
# If you look at the edges, you will see there is an edge for every pair of counties
# whether they are adjacent or not
default_graph.edges;

Let's make a new config that uses the actual CA County adjacency graph. This graph is included in the package, so we don't have to build it from scratch

In [15]:
county_graph = stb.graphs.CA_county_graph()
len(county_graph.vertices), len(county_graph.edges)

(58, 133)

In [16]:
# This graph only has edges between adjacent counties
county_graph.edges;

In [17]:
# Copy the default configuration, and replace 
# the complete graph (created by the 'get_default_config')
# with the actual CA county graph
feature_configs_1 = copy.deepcopy(feature_configs_default)
feature_configs_1['county']['graph'] = county_graph

In [18]:
feature_configs_1

{'county': {'feature_type': 'categorical_str',
  'graph': <graphs.graph_undirected at 0x13ee96630>,
  'split_method': 'span_tree',
  'num_span_trees': 1},
 'month': {'feature_type': 'numerical', 'max_splits_to_search': 25}}

### Run the model with the new config

In [19]:
stboost_CA_1 = stb.StructureBoost(num_trees = 2500,
                                  learning_rate=.02,
                                  feature_configs=feature_configs_1, 
                                  max_depth=2,
                                  mode='classification')

In [20]:
stboost_CA_1.fit(X_train, y_train, 
                   eval_set = ((X_valid, y_valid)), 
                   early_stop_past_steps=1, 
                   eval_freq=20)

i=0, eval_set_log_loss = 0.2950068658367074
i=20, eval_set_log_loss = 0.2726518808818964
i=40, eval_set_log_loss = 0.26203654198882814
i=60, eval_set_log_loss = 0.2567204706213276
i=80, eval_set_log_loss = 0.25174241901218275
i=100, eval_set_log_loss = 0.2498603534452938
i=120, eval_set_log_loss = 0.24849547010548903
i=140, eval_set_log_loss = 0.24771869922786433
i=160, eval_set_log_loss = 0.24634152485548652
i=180, eval_set_log_loss = 0.24440095976315515
i=200, eval_set_log_loss = 0.2452928763886461
Stopping early: curr_loss of 0.2452928763886461
                                        exceeds compare_loss of 0.24440095976315515


In [21]:
pred_probs_1 = stboost_CA_1.predict(X_test)

In [22]:
stb_1_loss = log_loss(y_test, pred_probs_1)
stb_1_loss

0.4085955364220997

### Incorporating structure in the 'month' feature
Previously we treated the month as a numerical variable.  However, doing so ignores
the fact that December is "adjacent" to January in the same way that "July" is
adjacent to "August".  To more accurately model the structure, we will represent the months
by a "cycle" graph

In [23]:
month_graph = stb.graphs.cycle_int_graph(1,12)
month_graph.vertices, month_graph.edges

({1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12},
 {frozenset({3, 4}),
  frozenset({2, 3}),
  frozenset({11, 12}),
  frozenset({9, 10}),
  frozenset({1, 2}),
  frozenset({4, 5}),
  frozenset({6, 7}),
  frozenset({8, 9}),
  frozenset({7, 8}),
  frozenset({1, 12}),
  frozenset({5, 6}),
  frozenset({10, 11})})

In [24]:
# Copy the previous config, and change the settings for month
feature_configs_2 = copy.deepcopy(feature_configs_1)
feature_configs_2['month']['feature_type'] = 'categorical_int'
feature_configs_2['month']['graph'] = month_graph
feature_configs_2

{'county': {'feature_type': 'categorical_str',
  'graph': <graphs.graph_undirected at 0x13f707780>,
  'split_method': 'span_tree',
  'num_span_trees': 1},
 'month': {'feature_type': 'categorical_int',
  'max_splits_to_search': 25,
  'graph': <graphs.graph_undirected at 0x13f6d8e48>}}

In [25]:
default_configs

{'default_categorical_method': 'span_tree',
 'default_num_span_trees': 1,
 'default_contraction_size': 9,
 'default_contraction_max_splits_to_search': 25,
 'default_numerical_max_splits_to_search': 25}

In [26]:
feature_configs_2 = stb.apply_defaults(feature_configs_2, default_configs)
feature_configs_2
    

{'county': {'feature_type': 'categorical_str',
  'graph': <graphs.graph_undirected at 0x13f707780>,
  'split_method': 'span_tree',
  'num_span_trees': 1},
 'month': {'feature_type': 'categorical_int',
  'max_splits_to_search': 25,
  'graph': <graphs.graph_undirected at 0x13f6d8e48>,
  'split_method': 'span_tree',
  'num_span_trees': 1}}

In [27]:
stboost_CA_2 = stb.StructureBoost(num_trees = 2500,
                                  learning_rate=.02,
                                  feature_configs=feature_configs_2, 
                                  max_depth=2,
                                  mode='classification',
                                  loss_fn='entropy')

In [28]:
stboost_CA_2.fit(X_train, y_train, 
                   eval_set = ((X_valid, y_valid)), 
                   early_stop_past_steps=1, 
                   eval_freq=20)

i=0, eval_set_log_loss = 0.2950068658367074
i=20, eval_set_log_loss = 0.27173331227289244
i=40, eval_set_log_loss = 0.25946034559825787
i=60, eval_set_log_loss = 0.2531984667569163
i=80, eval_set_log_loss = 0.24854775379834618
i=100, eval_set_log_loss = 0.24558035849497245
i=120, eval_set_log_loss = 0.24455806150149828
i=140, eval_set_log_loss = 0.2442784147599842
i=160, eval_set_log_loss = 0.24292634070762797
i=180, eval_set_log_loss = 0.24164818020222237
i=200, eval_set_log_loss = 0.2424998660306479
Stopping early: curr_loss of 0.2424998660306479
                                        exceeds compare_loss of 0.24164818020222237


In [29]:
pred_probs_2 = stboost_CA_2.predict(X_test)

In [30]:
stb_2_loss = log_loss(y_test, pred_probs_2)
stb_2_loss

0.4057312252707137

In [31]:
import catboost as cb

In [32]:
cat_features = ['county']
cb1 = cb.CatBoostClassifier(iterations=2500, early_stopping_rounds=20, max_depth=2,
                             learning_rate=.02, cat_features=cat_features)

In [33]:
cb1.fit(X_train, y_train, cat_features, eval_set=(X_valid, y_valid), verbose=25)

0:	learn: 0.6827221	test: 0.6825414	best: 0.6825414 (0)	total: 57.7ms	remaining: 2m 24s
25:	learn: 0.5222256	test: 0.5195576	best: 0.5195576 (25)	total: 99.6ms	remaining: 9.48s
50:	learn: 0.4694625	test: 0.4649570	best: 0.4649570 (50)	total: 137ms	remaining: 6.6s
75:	learn: 0.4504730	test: 0.4456358	best: 0.4456358 (75)	total: 173ms	remaining: 5.52s
100:	learn: 0.4395716	test: 0.4344969	best: 0.4344969 (100)	total: 206ms	remaining: 4.9s
125:	learn: 0.4333829	test: 0.4286156	best: 0.4286156 (125)	total: 247ms	remaining: 4.66s
150:	learn: 0.4284970	test: 0.4244657	best: 0.4244657 (150)	total: 293ms	remaining: 4.55s
175:	learn: 0.4241794	test: 0.4212565	best: 0.4212565 (175)	total: 333ms	remaining: 4.39s
200:	learn: 0.4215343	test: 0.4196578	best: 0.4196578 (200)	total: 377ms	remaining: 4.31s
225:	learn: 0.4198845	test: 0.4188936	best: 0.4188466 (219)	total: 423ms	remaining: 4.26s
250:	learn: 0.4177629	test: 0.4179504	best: 0.4179504 (250)	total: 459ms	remaining: 4.12s
275:	learn: 0.41580

<catboost.core.CatBoostClassifier at 0x142146a58>

In [34]:
pred_probs_cb1 = cb1.predict_proba(X_test)

In [35]:
cb1_loss = log_loss(y_test, pred_probs_cb1)
cb1_loss

0.41509696026702203

In [36]:
cat_features = ['county', 'month']
cb2 = cb.CatBoostClassifier(iterations=1500, early_stopping_rounds=20, max_depth=2,
                             learning_rate=.02, cat_features=cat_features)

In [37]:
cb2.fit(X_train, y_train, cat_features, eval_set=(X_valid, y_valid), verbose=25)

0:	learn: 0.6814046	test: 0.6809097	best: 0.6809097 (0)	total: 3.85ms	remaining: 5.78s
25:	learn: 0.5163533	test: 0.5075877	best: 0.5075877 (25)	total: 50.1ms	remaining: 2.84s
50:	learn: 0.4669458	test: 0.4561073	best: 0.4561073 (50)	total: 97.9ms	remaining: 2.78s
75:	learn: 0.4469424	test: 0.4344445	best: 0.4344445 (75)	total: 135ms	remaining: 2.53s
100:	learn: 0.4397294	test: 0.4272769	best: 0.4272769 (100)	total: 169ms	remaining: 2.34s
125:	learn: 0.4346002	test: 0.4220242	best: 0.4220242 (125)	total: 202ms	remaining: 2.21s
150:	learn: 0.4313632	test: 0.4192101	best: 0.4192101 (150)	total: 239ms	remaining: 2.14s
175:	learn: 0.4290511	test: 0.4185213	best: 0.4185213 (175)	total: 280ms	remaining: 2.11s
200:	learn: 0.4271111	test: 0.4176377	best: 0.4176258 (199)	total: 315ms	remaining: 2.03s
225:	learn: 0.4260005	test: 0.4172890	best: 0.4172366 (223)	total: 349ms	remaining: 1.97s
250:	learn: 0.4249126	test: 0.4169510	best: 0.4169510 (250)	total: 389ms	remaining: 1.93s
275:	learn: 0.423

<catboost.core.CatBoostClassifier at 0x142156e10>

In [38]:
pred_probs_cb2 = cb2.predict_proba(X_test)

In [39]:
cb2_loss = log_loss(y_test, pred_probs_cb2)
cb2_loss

0.41556872064906875

### Summary of results

In [40]:
print('StructureBoost (w no Structure):             loss = {}'.format(np.round(stb_default_loss, 5)))
print('StructureBoost w County Structure:           loss = {}'.format(np.round(stb_1_loss,5)))
print('StructureBoost w County+Month Structure:     loss = {}'.format(np.round(stb_2_loss,5)))
print('CatBoost:  (County categorical)              loss = {}'.format(np.round(cb1_loss,5)))
print('CatBoost:  (County, Month categorical)       loss = {}'.format(np.round(cb2_loss,5)))

StructureBoost (w no Structure):             loss = 0.41478
StructureBoost w County Structure:           loss = 0.4086
StructureBoost w County+Month Structure:     loss = 0.40573
CatBoost:  (County categorical)              loss = 0.4151
CatBoost:  (County, Month categorical)       loss = 0.41557


## Repeat multiple trials to show results aren't noise
In case you think the above results are just noise, let's run multiple trials and compare.

In [41]:
num_train_pts = 1000
num_trials = 10
stb_def_loss_vec = np.zeros(num_trials)
stb_1_loss_vec = np.zeros(num_trials)
stb_2_loss_vec = np.zeros(num_trials)
cb1_loss_vec = np.zeros(num_trials)
cb2_loss_vec = np.zeros(num_trials)
stb_def_AUC_vec = np.zeros(num_trials)
stb_1_AUC_vec = np.zeros(num_trials)
stb_2_AUC_vec = np.zeros(num_trials)
cb1_AUC_vec = np.zeros(num_trials)
cb2_AUC_vec = np.zeros(num_trials)
for i in range(num_trials):
    X_trva, X_test, y_trva, y_test = train_test_split(X,y, test_size = 70000, random_state=i)
    X_train_big, X_valid, y_train_big, y_valid = train_test_split(X_trva, y_trva, 
                                                                  test_size = 10000, random_state=i)
    X_train = X_train_big.iloc[:num_train_pts,:]
    y_train = y_train_big[:num_train_pts]
    

    stboost_CA_def = stb.StructureBoost(num_trees = 2500,
                                        learning_rate=.02,
                                        feature_configs=feature_configs_default, 
                                        max_depth=2,
                                        mode='classification',
                                        random_seed=i)
    stboost_CA_def.fit(X_train, y_train, 
                       eval_set = ((X_valid, y_valid)), 
                       early_stop_past_steps=1, 
                       eval_freq=20)

    pred_probs_def = stboost_CA_def.predict(X_test)
    stb_def_loss_vec[i] = log_loss(y_test, pred_probs_def)
    stb_def_AUC_vec[i] = roc_auc_score(y_test, pred_probs_def)

    stboost_CA_1 = stb.StructureBoost(num_trees = 2500,
                                      learning_rate=.02,
                                      feature_configs=feature_configs_1, 
                                      max_depth=2,
                                      mode='classification',
                                      random_seed=i)
    stboost_CA_1.fit(X_train, y_train, 
                       eval_set = ((X_valid, y_valid)), 
                       early_stop_past_steps=1, 
                       eval_freq=20)
    pred_probs_1 = stboost_CA_1.predict(X_test)
    stb_1_loss_vec[i] = log_loss(y_test, pred_probs_1)
    stb_1_AUC_vec[i] = roc_auc_score(y_test, pred_probs_1)
        

    stboost_CA_2 = stb.StructureBoost(num_trees = 2500,
                                      learning_rate=.02,
                                      feature_configs=feature_configs_2, 
                                      max_depth=2,
                                      mode='classification',
                                      random_seed=i)
    stboost_CA_2.fit(X_train, y_train, 
                       eval_set = ((X_valid, y_valid)), 
                       early_stop_past_steps=1, 
                       eval_freq=20)
    pred_probs_2 = stboost_CA_2.predict(X_test)
    stb_2_loss_vec[i] = log_loss(y_test, pred_probs_2)
    stb_2_AUC_vec[i] = roc_auc_score(y_test, pred_probs_2)

    
    cat_features_1 = ['county']
    cb1 = cb.CatBoostClassifier(iterations=2500, early_stopping_rounds=20,
                                max_depth=2,learning_rate=.02,
                                cat_features=cat_features_1, random_seed=i)
    cb1.fit(X_train, y_train, cat_features_1, eval_set=(X_valid, y_valid), verbose=25)
    pred_probs_cb1 = cb1.predict_proba(X_test)
    cb1_loss_vec = log_loss(y_test, pred_probs_cb1)
    cb1_AUC_vec = roc_auc_score(y_test, pred_probs_cb1[:,1])
    
    
    cat_features_2 = ['county', 'month']
    cb2 = cb.CatBoostClassifier(iterations=2500, early_stopping_rounds=20,
                                max_depth=2,learning_rate=.02,
                                cat_features=cat_features_2, random_seed=i)

    cb2.fit(X_train, y_train, cat_features_2, eval_set=(X_valid, y_valid), verbose=25)
    pred_probs_cb2 = cb2.predict_proba(X_test)
    cb2_loss_vec[i] = log_loss(y_test, pred_probs_cb2)
    cb2_AUC_vec[i] = roc_auc_score(y_test, pred_probs_cb2[:,1])
    

i=0, eval_set_log_loss = 0.32913163088919645
i=20, eval_set_log_loss = 0.30565889343150027
i=40, eval_set_log_loss = 0.2973471834599709
i=60, eval_set_log_loss = 0.29487934439256525
i=80, eval_set_log_loss = 0.2920862718225693
i=100, eval_set_log_loss = 0.28941300606022996
i=120, eval_set_log_loss = 0.28820175092697253
i=140, eval_set_log_loss = 0.28881783420227114
Stopping early: curr_loss of 0.28881783420227114
                                        exceeds compare_loss of 0.28820175092697253
i=0, eval_set_log_loss = 0.32913163088919645
i=20, eval_set_log_loss = 0.3038959520588518
i=40, eval_set_log_loss = 0.2941514358886911
i=60, eval_set_log_loss = 0.2904796261874991
i=80, eval_set_log_loss = 0.28647153530181063
i=100, eval_set_log_loss = 0.2837046401803958
i=120, eval_set_log_loss = 0.2828454761060959
i=140, eval_set_log_loss = 0.28337100279975597
Stopping early: curr_loss of 0.28337100279975597
                                        exceeds compare_loss of 0.2828454761060959
i=

### Summary of results (multiple trials)

In [42]:
print('Average log_loss Scores:')
print('StructureBoost w County+Month Structure:     avg_loss = {}'.format(np.round(np.mean(stb_2_loss_vec),5)))
print('StructureBoost w County Structure:           avg_loss = {}'.format(np.round(np.mean(stb_1_loss_vec),5)))
print('StructureBoost (w no Structure):             avg_loss = {}'.format(np.round(np.mean(stb_def_loss_vec), 5)))
print('CatBoost:  (County categorical)              avg_loss = {}'.format(np.round(np.mean(cb1_loss_vec),5)))
print('CatBoost:  (County, Month categorical)       avg_loss = {}'.format(np.round(np.mean(cb2_loss_vec),5)))
print('\nAverage ROC AUC Scores:')
print('StructureBoost w County+Month Structure:     avg_auc = {}'.format(np.round(np.mean(stb_2_AUC_vec),5)))
print('StructureBoost w County Structure:           avg_auc = {}'.format(np.round(np.mean(stb_1_AUC_vec),5)))
print('StructureBoost (w no Structure):             avg_auc = {}'.format(np.round(np.mean(stb_def_AUC_vec), 5)))
print('CatBoost:  (County categorical)              avg_auc = {}'.format(np.round(np.mean(cb1_AUC_vec),5)))
print('CatBoost:  (County, Month categorical)       avg_auc = {}'.format(np.round(np.mean(cb2_AUC_vec),5)))

Average log_loss Scores:
StructureBoost w County+Month Structure:     avg_loss = 0.41081
StructureBoost w County Structure:           avg_loss = 0.41523
StructureBoost (w no Structure):             avg_loss = 0.42056
CatBoost:  (County categorical)              avg_loss = 0.42216
CatBoost:  (County, Month categorical)       avg_loss = 0.42129

Average ROC AUC Scores:
StructureBoost w County+Month Structure:     avg_auc = 0.75111
StructureBoost w County Structure:           avg_auc = 0.74739
StructureBoost (w no Structure):             avg_auc = 0.7341
CatBoost:  (County categorical)              avg_auc = 0.73303
CatBoost:  (County, Month categorical)       avg_auc = 0.72498


In [43]:
log_loss_diff_vec = cb1_loss_vec-stb_2_loss_vec
log_loss_diff_vec

array([0.01519417, 0.00933463, 0.01346197, 0.01246671, 0.0104533 ,
       0.01257437, 0.01436055, 0.01492446, 0.00498313, 0.00571704])

In [44]:
## mean and std deviation of log_loss difference
## between StructureBoost and CatBoost
np.mean(log_loss_diff_vec), np.std(log_loss_diff_vec)

(0.011347032400036993, 0.0034778912969132725)

In [45]:
# min and max of discrepancy
np.min(log_loss_diff_vec), np.max(log_loss_diff_vec)

(0.004983129855378243, 0.015194170935822593)

In [46]:
roc_auc_score(y_test,pred_probs_2), roc_auc_score(y_test,pred_probs_cb1[:,1])

(0.7492064642677905, 0.7330267813829326)

In [47]:
auc_diff_vec = stb_2_AUC_vec - cb1_AUC_vec
auc_diff_vec

array([0.02449478, 0.0187097 , 0.01982976, 0.01269632, 0.01728712,
       0.01883773, 0.01567058, 0.01849225, 0.01861538, 0.01617968])

In [48]:
## mean and std deviation of AUC difference
## between StructureBoost and CatBoost
np.mean(auc_diff_vec), np.std(auc_diff_vec)

(0.018081332172357932, 0.002909769332852116)

In [49]:
# min and max of discrepancy
np.min(auc_diff_vec), np.max(auc_diff_vec)

(0.01269631555146844, 0.024494783560344735)