In [1]:
%load_ext autoreload
%autoreload 2

## Auditing XGBoost model

In [8]:
from pprint import pprint
import numpy as np

In [2]:
from hyperdt.toy_data import wrapped_normal_mixture

# Synthetic 4-class dataset of 1,000 points.
# NOTE(review): the tree dump further down splits on f0..f2, so X appears to have
# num_dims + 1 = 3 columns -- confirm against hyperdt.toy_data.
# NOTE(review): no seed is passed here, so the data presumably differ on every run -- confirm.
X, y = wrapped_normal_mixture(num_points=1_000, num_classes=4, num_dims=2)

In [3]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the points for evaluation. No random_state is passed, so the
# split (and therefore the accuracies printed below) changes from run to run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [4]:
from xgboost import XGBClassifier
from hyperdt import HyperbolicRandomForestClassifier
from sklearn.metrics import accuracy_score

# We use 25 boosting rounds for XGBoost and 100 trees for RF: multiclass XGBoost fits one tree
# per class per round, so 25 rounds * 4 classes = 100 total trees, i.e. the ensembles are the same size
clf = XGBClassifier(n_estimators=25, max_depth=3, learning_rate=0.1, validate_parameters=True)
clf.fit(X_train, y_train)
print(accuracy_score(y_test, clf.predict(X_test)))


# Hyperbolic random forest with a matching tree count and depth, scored on the same held-out split
rf = HyperbolicRandomForestClassifier(n_estimators=100, max_depth=3)
rf.fit(X_train, y_train)
print(accuracy_score(y_test, rf.predict(X_test)))

0.99
0.995


In [5]:
# Inspect the structure of the xgboost model.
# The result is a list with one text dump per tree; each line is a node, either a split
# (`[feature<threshold]` followed by the yes/no/missing child ids) or a `leaf=` value.

clf.get_booster().get_dump()

['0:[f2<-0.632465422] yes=1,no=2,missing=2\n\t1:[f1<0.672248602] yes=3,no=4,missing=4\n\t\t3:[f1<-0.0802244172] yes=7,no=8,missing=8\n\t\t\t7:leaf=0.184147462\n\t\t\t8:leaf=0.140609428\n\t\t4:[f1<3.04906774] yes=9,no=10,missing=10\n\t\t\t9:leaf=0.0736994222\n\t\t\t10:leaf=-0.0232121926\n\t2:[f0<1.21838093] yes=5,no=6,missing=6\n\t\t5:[f2<-0.0294386204] yes=11,no=12,missing=12\n\t\t\t11:leaf=0.0747331008\n\t\t\t12:leaf=-0.047826089\n\t\t6:[f2<-0.436357737] yes=13,no=14,missing=14\n\t\t\t13:leaf=-0.0149779739\n\t\t\t14:leaf=-0.0655422136\n',
 '0:[f2<-0.750716686] yes=1,no=2,missing=2\n\t1:[f1<11.7420912] yes=3,no=4,missing=4\n\t\t3:[f1<2.76864147] yes=7,no=8,missing=8\n\t\t\t7:leaf=-0.0630697682\n\t\t\t8:leaf=-0.00403889315\n\t\t4:[f2<-10.0307398] yes=9,no=10,missing=10\n\t\t\t9:leaf=0.0245161299\n\t\t\t10:leaf=0.190857157\n\t2:[f1<0.234930784] yes=5,no=6,missing=6\n\t\t5:[f1<-0.159134358] yes=11,no=12,missing=12\n\t\t\t11:leaf=-0.0556823872\n\t\t\t12:leaf=0.0319597982\n\t\t6:[f1<0.67224

In [49]:
# Keep a handle on the fitted classifier's underlying booster; the following cells
# call trees_to_dataframe() and save_model() on it.
booster = clf.get_booster()

In [50]:
# Tabular view of the ensemble: one row per node, keyed by the Tree and Node columns,
# with split rows naming a Feature/Split threshold and leaf rows marked "Leaf".
booster.trees_to_dataframe()

Unnamed: 0,Tree,Node,ID,Feature,Split,Yes,No,Missing,Gain,Cover,Category
0,0,0,0-0,f1,-0.533402,0-1,0-2,0-2,29.307251,3000.000000,
1,0,1,0-1,f2,-1.471681,0-3,0-4,0-4,24.694458,1264.875000,
2,0,2,0-2,f2,-0.370468,0-5,0-6,0-6,2.032593,1735.125000,
3,0,3,0-3,f2,-3.169234,0-7,0-8,0-8,3.050278,470.625000,
4,0,4,0-4,f2,0.535078,0-9,0-10,0-10,24.587402,794.250000,
...,...,...,...,...,...,...,...,...,...,...,...
1489,99,10,99-10,Leaf,,,,,-0.026594,35.408577,
1490,99,11,99-11,Leaf,,,,,-0.058119,24.533184,
1491,99,12,99-12,Leaf,,,,,0.049587,7.177583,
1492,99,13,99-13,Leaf,,,,,0.095849,70.808792,


## Editing model as JSON

In [51]:
# Write the booster out as JSON so the next cells can load it as a plain dict and edit it.
booster.save_model("xgb.json")

In [9]:
# Load the saved model JSON as a plain dict so its thresholds can be tweaked and reloaded later
import json
from pprint import pprint

with open("xgb.json", "r") as model_file:
    data = json.load(model_file)

# Descend one nesting level at a time, printing the keys available at each level
level = data
pprint(level.keys())
for key in ("learner", "gradient_booster", "model"):
    level = level[key]
    pprint(level.keys())

# The trees are stored under "gradient_booster" -> "model" -> "trees";
# show which fields a single tree carries
pprint(level["trees"][0].keys())

dict_keys(['learner', 'version'])
dict_keys(['attributes', 'feature_names', 'feature_types', 'gradient_booster', 'learner_model_param', 'objective'])
dict_keys(['model', 'name'])
dict_keys(['gbtree_model_param', 'iteration_indptr', 'tree_info', 'trees'])
dict_keys(['base_weights', 'categories', 'categories_nodes', 'categories_segments', 'categories_sizes', 'default_left', 'id', 'left_children', 'loss_changes', 'parents', 'right_children', 'split_conditions', 'split_indices', 'split_type', 'sum_hessian', 'tree_param'])


In [65]:
# Full contents of the first tree: parallel per-node arrays (children, split conditions, ...)
# where a child index of -1 marks a node without children.
pprint(data["learner"]["gradient_booster"]["model"]["trees"][0])

{'base_weights': [-0.51316226,
                  -0.39676112,
                  -0.5977392,
                  -0.57831967,
                  -0.2885885,
                  -0.64993507,
                  -0.57121295,
                  -0.06465899,
                  -0.047406435,
                  -0.014033209,
                  -0.04977238,
                  -0.051063832,
                  -0.06546601,
                  -0.051215805,
                  -0.064742476],
 'categories': [],
 'categories_nodes': [],
 'categories_segments': [],
 'categories_sizes': [],
 'default_left': [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
 'id': 0,
 'left_children': [1, 3, 5, 7, 9, 11, 13, -1, -1, -1, -1, -1, -1, -1, -1],
 'loss_changes': [29.307251,
                  24.694458,
                  2.0325928,
                  3.0502777,
                  24.587402,
                  0.045959473,
                  4.886383,
                  0.0,
                  0.0,
                  0.0,
            

In [74]:
# Let's try to make an edited version of the JSON file and save it, then load it again
import copy

# Deep copy on purpose: dict.copy() is shallow, so the nested tree dicts would be shared
# with `data`. Editing them would silently rewrite the original model dict and make this
# cell non-idempotent (a second run would flip every sign back).
data_modified = copy.deepcopy(data)

# Negate every entry of each tree's split_conditions array.
# NOTE(review): in XGBoost's JSON format split_conditions appears to hold the leaf value
# for leaf nodes as well as the threshold for split nodes. If so, this also flips every
# leaf weight, which would explain the 0.0 accuracy -- confirm, and restrict the edit to
# nodes whose left_children entry is not -1 if only thresholds should change.
for tree in data_modified["learner"]["gradient_booster"]["model"]["trees"]:
    tree["split_conditions"] = [-t for t in tree["split_conditions"]]

with open("xgb_modified.json", "w") as f:
    json.dump(data_modified, f)

# Load the edited model into a fresh classifier and score it on the same held-out split
clf2 = XGBClassifier(n_estimators=25, max_depth=3, learning_rate=0.1, validate_parameters=True)
clf2.load_model("xgb_modified.json")

print(accuracy_score(y_test, clf2.predict(X_test)))

0.0


In [2]:
%load_ext autoreload
%autoreload 2

from sklearn.model_selection import train_test_split
from hyperdt.toy_data import wrapped_normal_mixture
from hyperdt.xgboost import HyperbolicXGBoostClassifier
from sklearn.metrics import accuracy_score

# Fresh synthetic dataset. This rebinds X and y, so results from here on are not
# comparable with the accuracies printed earlier in the notebook.
X, y = wrapped_normal_mixture(num_points=1_000, num_classes=4, num_dims=2)

# New 80/20 split of the regenerated data (again without a fixed random_state)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

# Same hyperparameters as the plain XGBClassifier run earlier; note this rebinds `clf`
clf = HyperbolicXGBoostClassifier(n_estimators=25, max_depth=3, learning_rate=0.1, validate_parameters=True)
clf.fit(X_train, y_train)
print(accuracy_score(y_test, clf.predict(X_test)))


0.775


In [7]:
from sklearn.ensemble import RandomForestClassifier

# Euclidean baseline: scikit-learn random forest on the same split, with 25 trees of depth 3.
# Note `rf` was bound to a HyperbolicRandomForestClassifier earlier in the notebook.
rf = RandomForestClassifier(n_estimators=25, max_depth=3)
rf.fit(X_train, y_train)
print(accuracy_score(y_test, rf.predict(X_test)))

0.765


In [8]:
from hyperdt.ensemble import HyperbolicRandomForestClassifier

# Hyperbolic counterpart of the baseline above, with the same tree count and depth.
# NOTE(review): this cell imports the class from hyperdt.ensemble, while an earlier cell
# imported it from the top-level hyperdt package -- confirm both resolve to the same class.
hrf = HyperbolicRandomForestClassifier(n_estimators=25, max_depth=3)
hrf.fit(X_train, y_train)
print(accuracy_score(y_test, hrf.predict(X_test)))

0.785


In [9]:
# Index arrays recorded by the wrapped estimator, presumably the training rows drawn for
# each tree -- confirm. The output contains repeated indices (e.g. 622, 149), which is
# consistent with sampling with replacement.
hrf.estimator_.estimators_samples_

[array([231, 666, 211, 451, 253, 202, 188, 622,  93, 640, 216, 350, 258,
        698, 109, 508,  16, 661, 622, 285, 317,  92,   3, 640, 141, 284,
        584, 149, 149,  80, 362, 761, 735,  99, 714, 589, 503, 380, 790,
        661, 568, 265,  25, 338, 440, 690, 191, 561, 646, 129, 241, 287,
        319, 768, 676, 124, 411,  12, 588, 678, 258,  94, 257, 547, 734,
          2, 168, 645, 685, 480, 241, 651, 162, 657, 534, 234, 508, 420,
        655, 651, 528, 670,  72, 299, 101, 715, 649, 708,  12, 672,  63,
        511, 794, 685,  59, 404, 605, 314, 380, 781, 112, 133, 344, 391,
        534, 574, 631,  23, 756, 293, 514,  54, 442,   2, 294, 577, 243,
        178,  16,  77, 528, 241, 493, 307, 230, 266, 341, 196, 321, 229,
        540, 332, 773, 205, 527, 222, 128, 225,  88, 572, 162, 720, 734,
        676,  34, 712, 608, 149, 261, 296, 529, 626, 467, 479,   6, 464,
        635, 293, 684, 777, 677, 173, 234, 459, 245, 596, 275, 397, 451,
        269, 253, 631, 685,  62, 708, 459, 648, 779

## Auditing indices

TODO: XGBoost makes it really difficult to recover which training-row indices were drawn for each tree's subsample (the random forest exposes them via `estimators_samples_`, as shown above).