Skip to content

Commit

Permalink
Got basics of new ctree_feature_space() that accepts a features arg. (#…
Browse files Browse the repository at this point in the history
…253)

* Got basics of new ctree_feature_space() that accepts a features arg.

Signed-off-by: Terence Parr <parrt@antlr.org>

* univar working for new ctree_feature_space() that accepts a features arg.

Signed-off-by: Terence Parr <parrt@antlr.org>

* got regression univar working

Signed-off-by: Terence Parr <parrt@antlr.org>

* got regression bivar working

Signed-off-by: Terence Parr <parrt@antlr.org>

* got regression bivar 3D working; update notebook

Signed-off-by: Terence Parr <parrt@antlr.org>

* Clean up the notebook

Signed-off-by: Terence Parr <parrt@antlr.org>

* Clean up the notebook

Signed-off-by: Terence Parr <parrt@antlr.org>

* Show 'Count' as y-axis for barstacked

Signed-off-by: Terence Parr <parrt@antlr.org>

* Handle the case where there are no features specified by passing it on the number of model features.

Signed-off-by: Terence Parr <parrt@antlr.org>

* Handle the case where there are no features specified by passing it on the number of model features.

Signed-off-by: Terence Parr <parrt@antlr.org>

* Handle the case where there are no features specified by passing it on the number of model features.

Signed-off-by: Terence Parr <parrt@antlr.org>

* Handle the case where there are no features specified by passing it on the number of model features.

Signed-off-by: Terence Parr <parrt@antlr.org>

* cleaned up comment

Signed-off-by: Terence Parr <parrt@antlr.org>
Co-authored-by: Matthew Epland <matthewepland@gmail.com>
  • Loading branch information
parrt and mepland committed Jan 22, 2023
1 parent fc8307b commit 8608ba3
Show file tree
Hide file tree
Showing 5 changed files with 18,815 additions and 18,816 deletions.
8 changes: 4 additions & 4 deletions dtreeviz/compatibility.py
Original file line number Diff line number Diff line change
Expand Up @@ -152,10 +152,10 @@ def ctreeviz_bivar(tree_model,
tree_index)
model = DTreeVizAPI(shadow_tree)
model.ctree_feature_space(fontsize=fontsize,
fontname=fontname,
show=show,
colors=colors,
ax=ax)
fontname=fontname,
show=show,
colors=colors,
ax=ax)

def dtreeviz(tree_model,
X_train: (pd.DataFrame, np.ndarray) = None,
Expand Down
31 changes: 0 additions & 31 deletions dtreeviz/models/shadow_decision_tree.py
Original file line number Diff line number Diff line change
Expand Up @@ -334,37 +334,6 @@ def walk(t, x, path):
walk(self.root, x, path)
return path

def tessellation(self):
"""
Walk tree and return list of tuples containing a leaf node and bounding box list of(x1, y1, x2, y2) coordinates.
"""
bboxes = []

def walk(t, bbox):
if t is None:
return None
# print(f"Node {t.id} bbox {bbox} {' LEAF' if t.isleaf() else ''}")
if t.isleaf():
bboxes.append((t, bbox))
return t
# shrink bbox for left, right and recurse
s = t.split()
if t.feature() == 0:
walk(t.left, (bbox[0], bbox[1], s, bbox[3]))
walk(t.right, (s, bbox[1], bbox[2], bbox[3]))
else:
walk(t.left, (bbox[0], bbox[1], bbox[2], s))
walk(t.right, (bbox[0], s, bbox[2], bbox[3]))

# create bounding box in feature space (not zeroed)
f1_values = self.X_train[:, 0]
f2_values = self.X_train[:, 1]
overall_bbox = (np.min(f1_values), np.min(f2_values), # x,y of lower left edge
np.max(f1_values), np.max(f2_values)) # x,y of upper right edge
walk(self.root, overall_bbox)

return bboxes

def get_leaf_sample_counts(self, min_samples=0, max_samples=None):
"""
Get the number of samples for each leaf.
Expand Down
135 changes: 87 additions & 48 deletions dtreeviz/trees.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
from dtreeviz.interpretation import explain_prediction_plain_english, explain_prediction_sklearn_default
from dtreeviz.models.shadow_decision_tree import ShadowDecTree
from dtreeviz.models.shadow_decision_tree import ShadowDecTreeNode
from dtreeviz.utils import myround, DTreeVizRender, add_classifier_legend, _format_axes, _draw_wedge, _set_wedge_ticks
from dtreeviz.utils import myround, DTreeVizRender, add_classifier_legend, _format_axes, _draw_wedge, _set_wedge_ticks, tessellate

# How many bins should we have based upon number of classes
NUM_BINS = [
Expand Down Expand Up @@ -886,27 +886,35 @@ def ctree_feature_space(self,
gtype='strip',
show={'title', 'legend', 'splits'},
colors=None,
features=None,
figsize=None,
ax=None):
# TODO: check if we can find some common functionality between univar and bivar visualisations and refactor
# to a single method.
if len(self.shadow_tree.feature_names) == 1: # univar example
_ctreeviz_univar(self.shadow_tree, fontsize, ticks_fontsize, fontname, nbins, gtype, show, colors, figsize, ax)
elif len(self.shadow_tree.feature_names) == 2: # bivar example
_ctreeviz_bivar(self.shadow_tree, fontsize, ticks_fontsize, fontname, show, colors, figsize, ax)
if features is None:
n_features = len(self.shadow_tree.feature_names)
features = self.shadow_tree.feature_names[0:min(n_features,2)] # pick first one/two features if none given
if len(features) == 1: # univar example
_ctreeviz_univar(self.shadow_tree, fontsize, ticks_fontsize, fontname, nbins, gtype, show, colors, features[0], figsize, ax)
elif len(features) == 2: # bivar example
_ctreeviz_bivar(self.shadow_tree, fontsize, ticks_fontsize, fontname, show, colors, features, figsize, ax)
else:
raise ValueError(f"ctree_feature_space supports a dataset with only one or two features."
f" You provided a dataset with {len(self.shadow_tree.feature_names)} features {self.shadow_tree.feature_names}.")

def rtree_feature_space(self, fontsize: int = 10, ticks_fontsize=8, show={'title', 'splits'}, split_linewidth=.5,
mean_linewidth=2, markersize=15, colors=None, fontname="Arial",
n_colors_in_map=100, figsize=None, ax=None):
if len(self.shadow_tree.feature_names) == 1: # univar example
n_colors_in_map=100, features=None,
figsize=None, ax=None):
if features is None:
n_features = len(self.shadow_tree.feature_names)
features = self.shadow_tree.feature_names[0:min(n_features,2)] # pick first one/two features if none given
if len(features) == 1: # univar example
_rtreeviz_univar(self.shadow_tree, fontsize, ticks_fontsize, fontname, show, split_linewidth, mean_linewidth, markersize, colors,
figsize, ax)
elif len(self.shadow_tree.feature_names) == 2: # bivar example
features[0], figsize, ax)
elif len(features) == 2: # bivar example
_rtreeviz_bivar_heatmap(self.shadow_tree, fontsize, ticks_fontsize, fontname, show, n_colors_in_map, colors,
markersize, figsize, ax)
markersize, features, figsize, ax)
else:
raise ValueError(f"rtree_feature_space() supports a dataset with only one or two features."
f" You provided a dataset with {len(self.shadow_tree.feature_names)} features {self.shadow_tree.feature_names}.")
Expand All @@ -915,14 +923,28 @@ def rtree_feature_space3D(self,
fontsize=10, ticks_fontsize=8, fontname="Arial",
azim=0, elev=0, dist=7,
show={'title'}, colors=None, markersize=15,
n_colors_in_map=100, figsize=None, ax=None):
n_colors_in_map=100,
features=None,
figsize=None, ax=None):
"""
Show 3D feature space for bivariate regression tree. X_train should have
just the 2 variables used for training.
"""
_rtreeviz_bivar_3D(self.shadow_tree, fontsize, ticks_fontsize, fontname, azim, elev, dist, show, colors,
markersize,
n_colors_in_map, figsize, ax)
if features is None:
n_features = len(self.shadow_tree.feature_names)
if n_features >= 2:
features = self.shadow_tree.feature_names[0:2] # pick first two features if none given
else:
raise ValueError(f"rtree_feature_space3D() requires at least two features; but the model has {n_features}")
elif len(features)!=2:
raise ValueError(f"rtree_feature_space3D() requires exactly two features; found {len(features)}")

_rtreeviz_bivar_3D(self.shadow_tree, fontsize, ticks_fontsize, fontname,
azim, elev, dist,
show, colors, markersize,
n_colors_in_map,
features,
figsize, ax)


def _class_split_viz(node: ShadowDecTreeNode,
Expand Down Expand Up @@ -1313,6 +1335,7 @@ def _ctreeviz_univar(shadow_tree,
gtype,
show,
colors,
feature,
figsize,
ax):
if ax is None:
Expand All @@ -1321,24 +1344,26 @@ def _ctreeviz_univar(shadow_tree,
else:
fig, ax = plt.subplots()

X_train = shadow_tree.X_train.reshape(-1, )
featidx = shadow_tree.feature_names.index(feature)
X_train = shadow_tree.X_train
y_train = shadow_tree.y_train
colors = adjust_colors(colors)
n_classes = shadow_tree.nclasses()
overall_feature_range = (np.min(X_train), np.max(X_train))
overall_feature_range = (np.min(X_train[:,featidx]), np.max(X_train[:,featidx]))
class_values = shadow_tree.classes()
color_values = colors['classes'][n_classes]
color_map = {v: color_values[i] for i, v in enumerate(class_values)}
X_colors = [color_map[cl] for cl in class_values]

_format_axes(ax, shadow_tree.feature_names[0], None, colors, fontsize, fontname, ticks_fontsize=ticks_fontsize, grid=False)
ax.yaxis.set_visible(False)
ax.spines['left'].set_visible(False)

X_hist = [X_train[y_train == cl] for cl in class_values]
_format_axes(ax, shadow_tree.feature_names[featidx], 'Count' if gtype=='barstacked' else None,
colors, fontsize, fontname, ticks_fontsize=ticks_fontsize, grid=False)

X_hist = [X_train[y_train == cl,featidx] for cl in class_values]

if gtype == 'barstacked':
bins = np.linspace(start=overall_feature_range[0], stop=overall_feature_range[1], num=nbins, endpoint=True)
bins = np.linspace(start=overall_feature_range[0], stop=overall_feature_range[1],
num=nbins, endpoint=True)
hist, bins, barcontainers = ax.hist(X_hist,
color=X_colors,
align='mid',
Expand All @@ -1355,6 +1380,8 @@ def _ctreeviz_univar(shadow_tree,
ax.set_yticks([0, max([max(h) for h in hist])])
elif gtype == 'strip':
# user should pass in short and wide fig
ax.spines['left'].set_visible(False)
ax.yaxis.set_visible(False)
sigma = .013
mu = .08
class_step = .08
Expand All @@ -1367,7 +1394,7 @@ def _ctreeviz_univar(shadow_tree,
else:
raise ValueError(f'Unrecognized gtype = {gtype}!')

splits = [node.split() for node in shadow_tree.internal]
splits = [node.split() for node in shadow_tree.internal if node.feature()==featidx]
splits = sorted(splits)

if 'preds' in show: # this gets the horiz bars showing prediction region
Expand Down Expand Up @@ -1401,9 +1428,10 @@ def _ctreeviz_univar(shadow_tree,


def _ctreeviz_bivar(shadow_tree, fontsize, ticks_fontsize, fontname, show,
colors=None,
figsize=None,
ax=None):
colors,
features,
figsize,
ax):
"""
Show tesselated 2D feature space for bivariate classification tree. X_train can
have lots of features but features lists indexes of 2 features to train tree with.
Expand All @@ -1414,10 +1442,11 @@ def _ctreeviz_bivar(shadow_tree, fontsize, ticks_fontsize, fontname, show,
else:
fig, ax = plt.subplots()

featidx = [shadow_tree.feature_names.index(f) for f in features]
X_train = shadow_tree.X_train
y_train = shadow_tree.y_train
colors = adjust_colors(colors)
tessellation = shadow_tree.tessellation()
tessellation = tessellate(shadow_tree.root, X_train, featidx)
n_classes = shadow_tree.nclasses()
class_values = shadow_tree.classes()
color_values = colors['classes'][n_classes]
Expand All @@ -1436,10 +1465,11 @@ def _ctreeviz_bivar(shadow_tree, fontsize, ticks_fontsize, fontname, show,
dot_w = 25
X_hist = [X_train[y_train == cl] for cl in class_values]
for i, h in enumerate(X_hist):
ax.scatter(h[:, 0], h[:, 1], marker='o', s=dot_w, c=color_map[i],
ax.scatter(h[:, featidx[0]], h[:, featidx[1]], marker='o', s=dot_w, c=color_map[i],
edgecolors=colors['scatter_edge'], lw=.3)

_format_axes(ax, shadow_tree.feature_names[0], shadow_tree.feature_names[1], colors, fontsize, fontname, ticks_fontsize=ticks_fontsize, grid=False)
_format_axes(ax, shadow_tree.feature_names[featidx[0]], shadow_tree.feature_names[featidx[1]],
colors, fontsize, fontname, ticks_fontsize=ticks_fontsize, grid=False)

if 'legend' in show:
add_classifier_legend(ax, shadow_tree.class_names, class_values, color_map, shadow_tree.target_name, colors,
Expand All @@ -1455,38 +1485,40 @@ def _ctreeviz_bivar(shadow_tree, fontsize, ticks_fontsize, fontname, show,

def _rtreeviz_univar(shadow_tree, fontsize, ticks_fontsize, fontname, show,
split_linewidth, mean_linewidth, markersize, colors,
feature,
figsize, ax):
X_train = shadow_tree.X_train.reshape(-1, )
featidx = shadow_tree.feature_names.index(feature)
X_train = shadow_tree.X_train
y_train = shadow_tree.y_train
if X_train is None or y_train is None:
raise ValueError(f"X_train and y_train must not be none")

if ax is None:
if figsize:
fig, ax = plt.subplots(figsize=figsize)
else:
fig, ax = plt.subplots()

if X_train is None or y_train is None:
raise ValueError(f"x_train and y_train must not be none")

colors = adjust_colors(colors)

y_range = (min(y_train), max(y_train)) # same y axis for all
overall_feature_range = (np.min(X_train), np.max(X_train))
overall_feature_range = (np.min(X_train[:,featidx]), np.max(X_train[:,featidx]))

splits = []
for node in shadow_tree.internal:
splits.append(node.split())
if node.feature()==featidx:
splits.append(node.split())
splits = sorted(splits)
bins = [overall_feature_range[0]] + splits + [overall_feature_range[1]]

means = []
for i in range(len(bins) - 1):
left = bins[i]
right = bins[i + 1]
inrange = y_train[(X_train >= left) & (X_train <= right)]
inrange = y_train[(X_train[:,featidx] >= left) & (X_train[:,featidx] <= right)]
means.append(np.mean(inrange))

ax.scatter(X_train, y_train, marker='o', alpha=colors['scatter_marker_alpha'], c=colors['scatter_marker'],
ax.scatter(X_train[:,featidx], y_train, marker='o', alpha=colors['scatter_marker_alpha'], c=colors['scatter_marker'],
s=markersize,
edgecolor=colors['scatter_edge'], lw=.3)

Expand All @@ -1502,7 +1534,7 @@ def _rtreeviz_univar(shadow_tree, fontsize, ticks_fontsize, fontname, show,
ax.plot([prevX, split], [m, m], '-', color=colors['mean_line'], linewidth=mean_linewidth)
prevX = split

_format_axes(ax, shadow_tree.feature_names[0], shadow_tree.target_name, colors, fontsize, fontname, ticks_fontsize=ticks_fontsize, grid=False)
_format_axes(ax, shadow_tree.feature_names[featidx], shadow_tree.target_name, colors, fontsize, fontname, ticks_fontsize=ticks_fontsize, grid=False)

if 'title' in show:
title = f"Regression Tree Depth {shadow_tree.get_max_depth()}, Samples per Leaf {shadow_tree.get_min_samples_leaf()},\nTraining $R^2$={shadow_tree.get_score()}"
Expand All @@ -1514,8 +1546,9 @@ def _rtreeviz_bivar_heatmap(shadow_tree, fontsize, ticks_fontsize, fontname,
n_colors_in_map,
colors,
markersize,
features,
figsize,
ax) -> tree.DecisionTreeClassifier:
ax):
"""
Show tesselated 2D feature space for bivariate regression tree. X_train can
have lots of features but features lists indexes of 2 features to train tree with.
Expand All @@ -1536,7 +1569,9 @@ def _rtreeviz_bivar_heatmap(shadow_tree, fontsize, ticks_fontsize, fontname,
color_map = [rgb2hex(c.rgb, force_long=True) for c in
Color(colors['color_map_min']).range_to(Color(colors['color_map_max']),
n_colors_in_map)]
tessellation = shadow_tree.tessellation()
featidx = [shadow_tree.feature_names.index(f) for f in features]

tessellation = tessellate(shadow_tree.root, X_train, featidx)

for node, bbox in tessellation:
pred = node.prediction()
Expand All @@ -1550,23 +1585,24 @@ def _rtreeviz_bivar_heatmap(shadow_tree, fontsize, ticks_fontsize, fontname,
ax.add_patch(rect)

color_map = [color_map[int(((y - y_lim[0]) / y_range) * (n_colors_in_map - 1))] for y in y_train]
x, y, z = X_train[:, 0], X_train[:, 1], y_train
ax.scatter(x, y, marker='o', c=color_map, edgecolor=colors['scatter_edge'], lw=.3, s=markersize)
ax.scatter(X_train[:, featidx[0]], X_train[:, featidx[1]], marker='o', c=color_map,
edgecolor=colors['scatter_edge'], lw=.3, s=markersize)

_format_axes(ax, shadow_tree.feature_names[0], shadow_tree.feature_names[1], colors, fontsize, fontname, ticks_fontsize=ticks_fontsize, grid=False)
_format_axes(ax, shadow_tree.feature_names[featidx[0]], shadow_tree.feature_names[featidx[1]],
colors, fontsize, fontname, ticks_fontsize=ticks_fontsize, grid=False)

if 'title' in show:
accur = shadow_tree.get_score()
title = f"Regression Tree Depth {shadow_tree.get_max_depth()}, Training $R^2$={accur:.3f}"
ax.set_title(title, fontsize=fontsize, color=colors['title'])

return None


def _rtreeviz_bivar_3D(shadow_tree, fontsize, ticks_fontsize, fontname,
azim, elev, dist,
show, colors, markersize,
n_colors_in_map, figsize, ax):
n_colors_in_map,
features,
figsize, ax):
X_train = shadow_tree.X_train
y_train = shadow_tree.y_train

Expand Down Expand Up @@ -1597,16 +1633,19 @@ def plane(node, bbox, color_spectrum):
color_spectrum = Color(colors['color_map_min']).range_to(Color(colors['color_map_max']), n_colors_in_map)
color_spectrum = [rgb2hex(c.rgb, force_long=True) for c in color_spectrum]
y_colors = [color_spectrum[y_to_color_index(y)] for y in y_train]
tessellation = shadow_tree.tessellation()

featidx = [shadow_tree.feature_names.index(f) for f in features]
x, y, z = X_train[:, featidx[0]], X_train[:, featidx[1]], y_train

tessellation = tessellate(shadow_tree.root, X_train, featidx)

for node, bbox in tessellation:
plane(node, bbox, color_spectrum)

x, y, z = X_train[:, 0], X_train[:, 1], y_train
ax.scatter(x, y, z, marker='o', alpha=colors['scatter_marker_alpha'], edgecolor=colors['scatter_edge'],
lw=.3, c=y_colors, s=markersize)

_format_axes(ax, shadow_tree.feature_names[0], shadow_tree.feature_names[1], colors, fontsize, fontname, ticks_fontsize=ticks_fontsize, grid=False)
_format_axes(ax, shadow_tree.feature_names[featidx[0]], shadow_tree.feature_names[featidx[1]], colors, fontsize, fontname, ticks_fontsize=ticks_fontsize, grid=False)
ax.set_zlabel(f"{shadow_tree.target_name}", fontsize=fontsize, fontname=fontname, color=colors['axis_label'])

if 'title' in show:
Expand Down
Loading

0 comments on commit 8608ba3

Please sign in to comment.