In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from sklearn import datasets
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
#Import Tree Models from scratch functions
import sys
import os

module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path+"/TreeModelsFromScratch")

from DecisionTree import DecisionTree
from RandomForest import RandomForest
from SmoothShap import verify_shap_model, smooth_shap

# Reestimate node values based on oob/ test set

## Regression

In [319]:
# Load data
X, y = datasets.load_diabetes(return_X_y=True, as_frame=True)

# Train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [320]:
reg_tree = DecisionTree(max_depth=2, treetype="regression", random_state=42)
reg_tree.fit(X_train, y_train)

In [321]:
[node.value for node in reg_tree.node_list]

[153.73654390934846,
 118.04306220095694,
 100.5592105263158,
 164.66666666666666,
 205.54166666666666,
 191.10169491525423,
 271.0769230769231]

In [322]:
traversed_nodes = reg_tree.explain_decision_path(X_train)[:,0].copy()

In [323]:
y_vals_array = np.full((reg_tree.n_nodes, X_train.shape[0]), np.nan)

In [324]:
for i, (idxs, y) in enumerate(zip(traversed_nodes, y_train.values)):
    y_vals_array[list(idxs),[i]] = y

In [325]:
y_vals_array

array([[144., 150., 280., ..., 148.,  64., 302.],
       [ nan, 150.,  nan, ..., 148.,  64.,  nan],
       [ nan, 150.,  nan, ..., 148.,  64.,  nan],
       ...,
       [144.,  nan, 280., ...,  nan,  nan, 302.],
       [144.,  nan, 280., ...,  nan,  nan, 302.],
       [ nan,  nan,  nan, ...,  nan,  nan,  nan]])

In [326]:
# Check if one of the nodes only contains nan values -> no observation reached this node
np.isnan(y_vals_array.all(axis=1))

array([False, False, False, False, False, False, False])

In [327]:
#calculate mean value per node 
np.nanmean(y_vals_array, axis=1)

array([153.73654391, 118.0430622 , 100.55921053, 164.66666667,
       205.54166667, 191.10169492, 271.07692308])

In [328]:
#Check if node values train are equal to newly calculated node values (should be the case since we are only passing training data)
assert np.round([node.value for node in reg_tree.node_list],8).all() == np.nanmean(y_vals_array, axis=1).all()

Lets calculate the node values based on the test set

In [329]:
traversed_nodes_test = reg_tree.explain_decision_path(X_test)[:,0].copy()

In [330]:
y_vals_array_test = np.full((reg_tree.n_nodes, X_test.shape[0]), np.nan)

In [331]:
for i, (idxs, y) in enumerate(zip(traversed_nodes_test, y_test.values)):
    y_vals_array_test[list(idxs),[i]] = y

In [332]:
# Check if one of the nodes only contains nan values -> no observation reached this node
np.isnan(y_vals_array_test.all(axis=1))

array([False, False, False, False, False, False, False])

In [333]:
#calculate mean value per node 
node_vals_test = np.nanmean(y_vals_array_test, axis=1)
node_vals_test

array([145.7752809 , 123.07142857,  98.97222222, 166.45      ,
       184.3030303 , 179.4137931 , 219.75      ])

In [334]:
orig_node_vals = np.round([node.value for node in reg_tree.node_list],8)

In [335]:
# Difference between original node values and node values based on test set
orig_node_vals -node_vals_test

array([ 7.96126301, -5.02836637,  1.58698831, -1.78333333, 21.23863637,
       11.68790182, 51.32692308])

In [336]:
n_samples = np.count_nonzero(~np.isnan(y_vals_array_test), axis=1)

In [337]:
#store regression results in dict 
res = {}
node_vals = np.nanmean(y_vals_array_test, axis=1)
n_samples = np.count_nonzero(~np.isnan(y_vals_array_test), axis=1)

for i in range(y_vals_array_test.shape[0]):
    
    res[i]={"samples": n_samples[i],
            "value": node_vals[i]
           }

In [338]:
res

{0: {'samples': 89, 'value': 145.77528089887642},
 1: {'samples': 56, 'value': 123.07142857142857},
 2: {'samples': 36, 'value': 98.97222222222223},
 3: {'samples': 20, 'value': 166.45},
 4: {'samples': 33, 'value': 184.3030303030303},
 5: {'samples': 29, 'value': 179.41379310344828},
 6: {'samples': 4, 'value': 219.75}}

### Test implemented class function

In [339]:
node_vals_train, result = reg_tree._reestimate_node_values(X_train, y_train)

In [340]:
node_vals_train

array([153.73654391, 118.0430622 , 100.55921053, 164.66666667,
       205.54166667, 191.10169492, 271.07692308])

In [342]:
orig_node_vals = np.array([node.value for node in reg_tree.node_list])

In [354]:
np.array_equal(node_vals_train, orig_node_vals)

True

Works :) 

In [344]:
node_vals_test, result_test = reg_tree._reestimate_node_values(X_test, y_test)

In [346]:
node_vals_test

array([145.7752809 , 123.07142857,  98.97222222, 166.45      ,
       184.3030303 , 179.4137931 , 219.75      ])

In [347]:
orig_node_vals

array([153.73654391, 118.0430622 , 100.55921053, 164.66666667,
       205.54166667, 191.10169492, 271.07692308])

In [353]:
# This should not be equal
np.array_equal(node_vals_test, orig_node_vals)

False

In [355]:
# show difference
orig_node_vals-node_vals_test

array([ 7.96126301, -5.02836637,  1.5869883 , -1.78333333, 21.23863636,
       11.68790181, 51.32692308])

## Classification

In [263]:
# Load data
X, y = datasets.load_breast_cancer(return_X_y=True, as_frame=True)

# Train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [264]:
clf_tree = DecisionTree(max_depth=2, treetype="classification", random_state=42)
clf_tree.fit(X_train, y_train)

In [265]:
[node.value for node in clf_tree.node_list]

[1, 1, 1, 0, 0, 1, 0]

In [266]:
traversed_nodes = clf_tree.explain_decision_path(X_train)[:,0].copy()

In [267]:
y_vals_array = np.full((clf_tree.n_nodes, X_train.shape[0]), np.nan)

In [268]:
for i, (idxs, y) in enumerate(zip(traversed_nodes, y_train.values)):
    y_vals_array[list(idxs),[i]] = y

In [269]:
y_vals_array

array([[ 1.,  0.,  1., ...,  1.,  0.,  1.],
       [ 1., nan,  1., ...,  1., nan,  1.],
       [ 1., nan,  1., ...,  1., nan,  1.],
       ...,
       [nan,  0., nan, ..., nan,  0., nan],
       [nan, nan, nan, ..., nan, nan, nan],
       [nan,  0., nan, ..., nan,  0., nan]])

In [270]:
#store classification results in dict 
res = {}

for i in range(y_vals_array.shape[0]):
    
    val, cnts = np.unique(y_vals_array[i,:], return_counts=True)
    counts = {k: v for k, v in zip(val, cnts)}
    
    clf_value_dis = [counts.get(0) or 0, counts.get(1) or 0]
    n_samples = np.sum(clf_value_dis)
    
    clf_prob_dis = (np.array(clf_value_dis) / n_samples)
    leaf_value = np.argmax(clf_prob_dis)
    
    res[i]={"samples": n_samples,
            "value": leaf_value,
            "value_distribution": clf_value_dis,
            "prob_distribution": clf_prob_dis
           }

In [271]:
res

{0: {'samples': 455,
  'value': 1,
  'value_distribution': [169, 286],
  'prob_distribution': array([0.37142857, 0.62857143])},
 1: {'samples': 282,
  'value': 1,
  'value_distribution': [16, 266],
  'prob_distribution': array([0.05673759, 0.94326241])},
 2: {'samples': 263,
  'value': 1,
  'value_distribution': [5, 258],
  'prob_distribution': array([0.01901141, 0.98098859])},
 3: {'samples': 19,
  'value': 0,
  'value_distribution': [11, 8],
  'prob_distribution': array([0.57894737, 0.42105263])},
 4: {'samples': 173,
  'value': 0,
  'value_distribution': [153, 20],
  'prob_distribution': array([0.88439306, 0.11560694])},
 5: {'samples': 35,
  'value': 1,
  'value_distribution': [17, 18],
  'prob_distribution': array([0.48571429, 0.51428571])},
 6: {'samples': 138,
  'value': 0,
  'value_distribution': [136, 2],
  'prob_distribution': array([0.98550725, 0.01449275])}}

In [272]:
# Check if one of the nodes only contains nan values -> no observation reached this node
np.isnan(y_vals_array.all(axis=1))

array([False, False, False, False, False, False, False])

In [273]:
#calculate mean value per node 
np.nanmean(y_vals_array, axis=1)

array([0.62857143, 0.94326241, 0.98098859, 0.42105263, 0.11560694,
       0.51428571, 0.01449275])

In [274]:
# get probabilities for each class p. node (binary classification)
[(1-val, val) for val in np.nanmean(y_vals_array, axis=1)]

[(0.37142857142857144, 0.6285714285714286),
 (0.05673758865248224, 0.9432624113475178),
 (0.019011406844106515, 0.9809885931558935),
 (0.5789473684210527, 0.42105263157894735),
 (0.8843930635838151, 0.11560693641618497),
 (0.48571428571428577, 0.5142857142857142),
 (0.9855072463768116, 0.014492753623188406)]

In [286]:
#Check if node prob distribution are equal to newly calculated node values (should be the case since we are only passing training data)
assert np.round([node.clf_prob_dis for node in clf_tree.node_list],8).all() == np.array([(1-val, val) for val in np.nanmean(y_vals_array, axis=1)]).all()

Lets calculate the node values based on the test set

In [293]:
traversed_nodes_test = clf_tree.explain_decision_path(X_test)[:,0].copy()

In [294]:
y_vals_array_test = np.full((clf_tree.n_nodes, X_test.shape[0]), np.nan)

In [295]:
for i, (idxs, y) in enumerate(zip(traversed_nodes_test, y_test.values)):
    y_vals_array_test[list(idxs),[i]] = y

In [296]:
# Check if one of the nodes only contains nan values -> no observation reached this node
np.isnan(y_vals_array_test.all(axis=1))

array([False, False, False, False, False, False, False])

In [297]:
#calculate mean value per node 
node_vals_test = np.nanmean(y_vals_array_test, axis=1)
node_vals_test

array([0.62280702, 0.94029851, 0.96923077, 0.        , 0.17021277,
       0.6       , 0.05405405])

In [299]:
# probabilities for class 1
orig_node_vals = np.round([node.clf_prob_dis[1] for node in clf_tree.node_list],8)

In [300]:
# Difference between original node values and node values based on test set
orig_node_vals -node_vals_test

array([ 0.00576441,  0.0029639 ,  0.01175782,  0.42105263, -0.05460583,
       -0.08571429, -0.0395613 ])

### Test implemented class function

In [310]:
node_probs_train, result = clf_tree._reestimate_node_values(X_train, y_train)

"is not" with a literal. Did you mean "!="?


In [311]:
node_probs_train

array([[0.37142857, 0.62857143],
       [0.05673759, 0.94326241],
       [0.01901141, 0.98098859],
       [0.57894737, 0.42105263],
       [0.88439306, 0.11560694],
       [0.48571429, 0.51428571],
       [0.98550725, 0.01449275]])

In [306]:
orig_node_probs = np.array([node.clf_prob_dis for node in clf_tree.node_list])

In [315]:
assert node_probs_train.all() == orig_node_probs.all()

Works :) 

In [316]:
node_probs_test, result_test = clf_tree._reestimate_node_values(X_test, y_test)

In [317]:
# This should not be equal
assert node_probs_test.all() == orig_node_probs.all()

AssertionError: 

In [318]:
# show difference
orig_node_probs-node_probs_test

array([[-0.00576441,  0.00576441],
       [-0.0029639 ,  0.0029639 ],
       [-0.01175782,  0.01175782],
       [-0.42105263,  0.42105263],
       [ 0.05460583, -0.05460583],
       [ 0.08571429, -0.08571429],
       [ 0.0395613 , -0.0395613 ]])