Got basics of new ctree_feature_space() that accepts a features arg. (#…

…253) * Got basics of new ctree_feature_space() that accepts a features arg. Signed-off-by: Terence Parr <parrt@antlr.org> * univar working for new ctree_feature_space() that accepts a features arg. Signed-off-by: Terence Parr <parrt@antlr.org> * got regression univar working Signed-off-by: Terence Parr <parrt@antlr.org> * got regression bivar working Signed-off-by: Terence Parr <parrt@antlr.org> * got regression bivar 3D working; update notebook Signed-off-by: Terence Parr <parrt@antlr.org> * Clean up the notebook Signed-off-by: Terence Parr <parrt@antlr.org> * Clean up the notebook Signed-off-by: Terence Parr <parrt@antlr.org> * Show 'Count' as y-axis for barstacked Signed-off-by: Terence Parr <parrt@antlr.org> * Handle the case where there are no features specified by passing it on the number of model features. Signed-off-by: Terence Parr <parrt@antlr.org> * Handle the case where there are no features specified by passing it on the number of model features. Signed-off-by: Terence Parr <parrt@antlr.org> * Handle the case where there are no features specified by passing it on the number of model features. Signed-off-by: Terence Parr <parrt@antlr.org> * Handle the case where there are no features specified by passing it on the number of model features. Signed-off-by: Terence Parr <parrt@antlr.org> * cleaned up comment Signed-off-by: Terence Parr <parrt@antlr.org> Co-authored-by: Matthew Epland <matthewepland@gmail.com>
parrt · Jan 22, 2023 · 8608ba3 · 8608ba3
1 parent fc8307b
commit 8608ba3
Show file tree

Hide file tree

Showing 5 changed files with 18,815 additions and 18,816 deletions.
diff --git a/dtreeviz/compatibility.py b/dtreeviz/compatibility.py
@@ -152,10 +152,10 @@ def ctreeviz_bivar(tree_model,
                                                 tree_index)
     model = DTreeVizAPI(shadow_tree)
     model.ctree_feature_space(fontsize=fontsize,
-                            fontname=fontname,
-                            show=show,
-                            colors=colors,
-                            ax=ax)
+                              fontname=fontname,
+                              show=show,
+                              colors=colors,
+                              ax=ax)
 
 def dtreeviz(tree_model,
              X_train: (pd.DataFrame, np.ndarray) = None,

diff --git a/dtreeviz/models/shadow_decision_tree.py b/dtreeviz/models/shadow_decision_tree.py
@@ -334,37 +334,6 @@ def walk(t, x, path):
         walk(self.root, x, path)
         return path
 
-    def tessellation(self):
-        """
-        Walk tree and return list of tuples containing a leaf node and bounding box list of(x1, y1, x2, y2) coordinates.
-        """
-        bboxes = []
-
-        def walk(t, bbox):
-            if t is None:
-                return None
-            # print(f"Node {t.id} bbox {bbox} {'   LEAF' if t.isleaf() else ''}")
-            if t.isleaf():
-                bboxes.append((t, bbox))
-                return t
-            # shrink bbox for left, right and recurse
-            s = t.split()
-            if t.feature() == 0:
-                walk(t.left, (bbox[0], bbox[1], s, bbox[3]))
-                walk(t.right, (s, bbox[1], bbox[2], bbox[3]))
-            else:
-                walk(t.left, (bbox[0], bbox[1], bbox[2], s))
-                walk(t.right, (bbox[0], s, bbox[2], bbox[3]))
-
-        # create bounding box in feature space (not zeroed)
-        f1_values = self.X_train[:, 0]
-        f2_values = self.X_train[:, 1]
-        overall_bbox = (np.min(f1_values), np.min(f2_values),  # x,y of lower left edge
-                        np.max(f1_values), np.max(f2_values))  # x,y of upper right edge
-        walk(self.root, overall_bbox)
-
-        return bboxes
-
     def get_leaf_sample_counts(self, min_samples=0, max_samples=None):
         """
         Get the number of samples for each leaf.

diff --git a/dtreeviz/trees.py b/dtreeviz/trees.py
@@ -13,7 +13,7 @@
 from dtreeviz.interpretation import explain_prediction_plain_english, explain_prediction_sklearn_default
 from dtreeviz.models.shadow_decision_tree import ShadowDecTree
 from dtreeviz.models.shadow_decision_tree import ShadowDecTreeNode
-from dtreeviz.utils import myround, DTreeVizRender, add_classifier_legend, _format_axes, _draw_wedge, _set_wedge_ticks
+from dtreeviz.utils import myround, DTreeVizRender, add_classifier_legend, _format_axes, _draw_wedge, _set_wedge_ticks, tessellate
 
 # How many bins should we have based upon number of classes
 NUM_BINS = [
@@ -886,27 +886,35 @@ def ctree_feature_space(self,
                             gtype='strip',
                             show={'title', 'legend', 'splits'},
                             colors=None,
+                            features=None,
                             figsize=None,
                             ax=None):
         # TODO: check if we can find some common functionality between univar and bivar visualisations and refactor
         #  to a single method.
-        if len(self.shadow_tree.feature_names) == 1:     # univar example
-            _ctreeviz_univar(self.shadow_tree, fontsize, ticks_fontsize, fontname, nbins, gtype, show, colors, figsize, ax)
-        elif len(self.shadow_tree.feature_names) == 2:   # bivar example
-            _ctreeviz_bivar(self.shadow_tree, fontsize, ticks_fontsize, fontname, show, colors, figsize, ax)
+        if features is None:
+            n_features = len(self.shadow_tree.feature_names)
+            features = self.shadow_tree.feature_names[0:min(n_features,2)] # pick first one/two features if none given
+        if len(features) == 1:     # univar example
+            _ctreeviz_univar(self.shadow_tree, fontsize, ticks_fontsize, fontname, nbins, gtype, show, colors, features[0], figsize, ax)
+        elif len(features) == 2:   # bivar example
+            _ctreeviz_bivar(self.shadow_tree, fontsize, ticks_fontsize, fontname, show, colors, features, figsize, ax)
         else:
             raise ValueError(f"ctree_feature_space supports a dataset with only one or two features."
                              f" You provided a dataset with {len(self.shadow_tree.feature_names)} features {self.shadow_tree.feature_names}.")
 
     def rtree_feature_space(self, fontsize: int = 10, ticks_fontsize=8, show={'title', 'splits'}, split_linewidth=.5,
                             mean_linewidth=2, markersize=15, colors=None, fontname="Arial",
-                            n_colors_in_map=100, figsize=None, ax=None):
-        if len(self.shadow_tree.feature_names) == 1:  # univar example
+                            n_colors_in_map=100, features=None,
+                            figsize=None, ax=None):
+        if features is None:
+            n_features = len(self.shadow_tree.feature_names)
+            features = self.shadow_tree.feature_names[0:min(n_features,2)] # pick first one/two features if none given
+        if len(features) == 1:  # univar example
             _rtreeviz_univar(self.shadow_tree, fontsize, ticks_fontsize, fontname, show, split_linewidth, mean_linewidth, markersize, colors,
-                             figsize, ax)
-        elif len(self.shadow_tree.feature_names) == 2:  # bivar example
+                             features[0], figsize, ax)
+        elif len(features) == 2:  # bivar example
             _rtreeviz_bivar_heatmap(self.shadow_tree, fontsize, ticks_fontsize, fontname, show, n_colors_in_map, colors,
-                                    markersize, figsize, ax)
+                                    markersize, features, figsize, ax)
         else:
             raise ValueError(f"rtree_feature_space() supports a dataset with only one or two features."
                              f" You provided a dataset with {len(self.shadow_tree.feature_names)} features {self.shadow_tree.feature_names}.")
@@ -915,14 +923,28 @@ def rtree_feature_space3D(self,
                               fontsize=10, ticks_fontsize=8, fontname="Arial",
                               azim=0, elev=0, dist=7,
                               show={'title'}, colors=None, markersize=15,
-                              n_colors_in_map=100, figsize=None, ax=None):
+                              n_colors_in_map=100,
+                              features=None,
+                              figsize=None, ax=None):
         """
         Show 3D feature space for bivariate regression tree. X_train should have
         just the 2 variables used for training.
         """
-        _rtreeviz_bivar_3D(self.shadow_tree, fontsize, ticks_fontsize, fontname, azim, elev, dist, show, colors,
-                           markersize,
-                           n_colors_in_map, figsize, ax)
+        if features is None:
+            n_features = len(self.shadow_tree.feature_names)
+            if n_features >= 2:
+                features = self.shadow_tree.feature_names[0:2] # pick first two features if none given
+            else:
+                raise ValueError(f"rtree_feature_space3D() requires at least two features; but the model has {n_features}")
+        elif len(features)!=2:
+            raise ValueError(f"rtree_feature_space3D() requires exactly two features; found {len(features)}")
+
+        _rtreeviz_bivar_3D(self.shadow_tree, fontsize, ticks_fontsize, fontname,
+                           azim, elev, dist,
+                           show, colors, markersize,
+                           n_colors_in_map,
+                           features,
+                           figsize, ax)
 
 
 def _class_split_viz(node: ShadowDecTreeNode,
@@ -1313,6 +1335,7 @@ def _ctreeviz_univar(shadow_tree,
                      gtype,
                      show,
                      colors,
+                     feature,
                      figsize,
                      ax):
     if ax is None:
@@ -1321,24 +1344,26 @@ def _ctreeviz_univar(shadow_tree,
         else:
             fig, ax = plt.subplots()
 
-    X_train = shadow_tree.X_train.reshape(-1, )
+    featidx = shadow_tree.feature_names.index(feature)
+    X_train = shadow_tree.X_train
     y_train = shadow_tree.y_train
     colors = adjust_colors(colors)
     n_classes = shadow_tree.nclasses()
-    overall_feature_range = (np.min(X_train), np.max(X_train))
+    overall_feature_range = (np.min(X_train[:,featidx]), np.max(X_train[:,featidx]))
     class_values = shadow_tree.classes()
     color_values = colors['classes'][n_classes]
     color_map = {v: color_values[i] for i, v in enumerate(class_values)}
     X_colors = [color_map[cl] for cl in class_values]
 
-    _format_axes(ax, shadow_tree.feature_names[0], None, colors, fontsize, fontname, ticks_fontsize=ticks_fontsize, grid=False)
-    ax.yaxis.set_visible(False)
-    ax.spines['left'].set_visible(False)
 
-    X_hist = [X_train[y_train == cl] for cl in class_values]
+    _format_axes(ax, shadow_tree.feature_names[featidx], 'Count' if gtype=='barstacked' else None,
+                 colors, fontsize, fontname, ticks_fontsize=ticks_fontsize, grid=False)
+
+    X_hist = [X_train[y_train == cl,featidx] for cl in class_values]
 
     if gtype == 'barstacked':
-        bins = np.linspace(start=overall_feature_range[0], stop=overall_feature_range[1], num=nbins, endpoint=True)
+        bins = np.linspace(start=overall_feature_range[0], stop=overall_feature_range[1],
+                           num=nbins, endpoint=True)
         hist, bins, barcontainers = ax.hist(X_hist,
                                             color=X_colors,
                                             align='mid',
@@ -1355,6 +1380,8 @@ def _ctreeviz_univar(shadow_tree,
         ax.set_yticks([0, max([max(h) for h in hist])])
     elif gtype == 'strip':
         # user should pass in short and wide fig
+        ax.spines['left'].set_visible(False)
+        ax.yaxis.set_visible(False)
         sigma = .013
         mu = .08
         class_step = .08
@@ -1367,7 +1394,7 @@ def _ctreeviz_univar(shadow_tree,
     else:
         raise ValueError(f'Unrecognized gtype = {gtype}!')
 
-    splits = [node.split() for node in shadow_tree.internal]
+    splits = [node.split() for node in shadow_tree.internal if node.feature()==featidx]
     splits = sorted(splits)
 
     if 'preds' in show:  # this gets the horiz bars showing prediction region
@@ -1401,9 +1428,10 @@ def _ctreeviz_univar(shadow_tree,
 
 
 def _ctreeviz_bivar(shadow_tree, fontsize, ticks_fontsize, fontname, show,
-                    colors=None,
-                    figsize=None,
-                    ax=None):
+                    colors,
+                    features,
+                    figsize,
+                    ax):
     """
     Show tesselated 2D feature space for bivariate classification tree. X_train can
     have lots of features but features lists indexes of 2 features to train tree with.
@@ -1414,10 +1442,11 @@ def _ctreeviz_bivar(shadow_tree, fontsize, ticks_fontsize, fontname, show,
         else:
             fig, ax = plt.subplots()
 
+    featidx = [shadow_tree.feature_names.index(f) for f in features]
     X_train = shadow_tree.X_train
     y_train = shadow_tree.y_train
     colors = adjust_colors(colors)
-    tessellation = shadow_tree.tessellation()
+    tessellation = tessellate(shadow_tree.root, X_train, featidx)
     n_classes = shadow_tree.nclasses()
     class_values = shadow_tree.classes()
     color_values = colors['classes'][n_classes]
@@ -1436,10 +1465,11 @@ def _ctreeviz_bivar(shadow_tree, fontsize, ticks_fontsize, fontname, show,
     dot_w = 25
     X_hist = [X_train[y_train == cl] for cl in class_values]
     for i, h in enumerate(X_hist):
-        ax.scatter(h[:, 0], h[:, 1], marker='o', s=dot_w, c=color_map[i],
+        ax.scatter(h[:, featidx[0]], h[:, featidx[1]], marker='o', s=dot_w, c=color_map[i],
                    edgecolors=colors['scatter_edge'], lw=.3)
 
-    _format_axes(ax, shadow_tree.feature_names[0], shadow_tree.feature_names[1], colors, fontsize, fontname, ticks_fontsize=ticks_fontsize, grid=False)
+    _format_axes(ax, shadow_tree.feature_names[featidx[0]], shadow_tree.feature_names[featidx[1]],
+                 colors, fontsize, fontname, ticks_fontsize=ticks_fontsize, grid=False)
 
     if 'legend' in show:
         add_classifier_legend(ax, shadow_tree.class_names, class_values, color_map, shadow_tree.target_name, colors,
@@ -1455,38 +1485,40 @@ def _ctreeviz_bivar(shadow_tree, fontsize, ticks_fontsize, fontname, show,
 
 def _rtreeviz_univar(shadow_tree, fontsize, ticks_fontsize, fontname, show,
                      split_linewidth, mean_linewidth, markersize, colors,
+                     feature,
                      figsize, ax):
-    X_train = shadow_tree.X_train.reshape(-1, )
+    featidx = shadow_tree.feature_names.index(feature)
+    X_train = shadow_tree.X_train
     y_train = shadow_tree.y_train
+    if X_train is None or y_train is None:
+        raise ValueError(f"X_train and y_train must not be none")
 
     if ax is None:
         if figsize:
             fig, ax = plt.subplots(figsize=figsize)
         else:
             fig, ax = plt.subplots()
 
-    if X_train is None or y_train is None:
-        raise ValueError(f"x_train and y_train must not be none")
-
     colors = adjust_colors(colors)
 
     y_range = (min(y_train), max(y_train))  # same y axis for all
-    overall_feature_range = (np.min(X_train), np.max(X_train))
+    overall_feature_range = (np.min(X_train[:,featidx]), np.max(X_train[:,featidx]))
 
     splits = []
     for node in shadow_tree.internal:
-        splits.append(node.split())
+        if node.feature()==featidx:
+            splits.append(node.split())
     splits = sorted(splits)
     bins = [overall_feature_range[0]] + splits + [overall_feature_range[1]]
 
     means = []
     for i in range(len(bins) - 1):
         left = bins[i]
         right = bins[i + 1]
-        inrange = y_train[(X_train >= left) & (X_train <= right)]
+        inrange = y_train[(X_train[:,featidx] >= left) & (X_train[:,featidx] <= right)]
         means.append(np.mean(inrange))
 
-    ax.scatter(X_train, y_train, marker='o', alpha=colors['scatter_marker_alpha'], c=colors['scatter_marker'],
+    ax.scatter(X_train[:,featidx], y_train, marker='o', alpha=colors['scatter_marker_alpha'], c=colors['scatter_marker'],
                s=markersize,
                edgecolor=colors['scatter_edge'], lw=.3)
 
@@ -1502,7 +1534,7 @@ def _rtreeviz_univar(shadow_tree, fontsize, ticks_fontsize, fontname, show,
             ax.plot([prevX, split], [m, m], '-', color=colors['mean_line'], linewidth=mean_linewidth)
             prevX = split
 
-    _format_axes(ax, shadow_tree.feature_names[0], shadow_tree.target_name, colors, fontsize, fontname, ticks_fontsize=ticks_fontsize, grid=False)
+    _format_axes(ax, shadow_tree.feature_names[featidx], shadow_tree.target_name, colors, fontsize, fontname, ticks_fontsize=ticks_fontsize, grid=False)
 
     if 'title' in show:
         title = f"Regression Tree Depth {shadow_tree.get_max_depth()}, Samples per Leaf {shadow_tree.get_min_samples_leaf()},\nTraining $R^2$={shadow_tree.get_score()}"
@@ -1514,8 +1546,9 @@ def _rtreeviz_bivar_heatmap(shadow_tree, fontsize, ticks_fontsize, fontname,
                             n_colors_in_map,
                             colors,
                             markersize,
+                            features,
                             figsize,
-                            ax) -> tree.DecisionTreeClassifier:
+                            ax):
     """
     Show tesselated 2D feature space for bivariate regression tree. X_train can
     have lots of features but features lists indexes of 2 features to train tree with.
@@ -1536,7 +1569,9 @@ def _rtreeviz_bivar_heatmap(shadow_tree, fontsize, ticks_fontsize, fontname,
     color_map = [rgb2hex(c.rgb, force_long=True) for c in
                  Color(colors['color_map_min']).range_to(Color(colors['color_map_max']),
                                                          n_colors_in_map)]
-    tessellation = shadow_tree.tessellation()
+    featidx = [shadow_tree.feature_names.index(f) for f in features]
+
+    tessellation = tessellate(shadow_tree.root, X_train, featidx)
 
     for node, bbox in tessellation:
         pred = node.prediction()
@@ -1550,23 +1585,24 @@ def _rtreeviz_bivar_heatmap(shadow_tree, fontsize, ticks_fontsize, fontname,
         ax.add_patch(rect)
 
     color_map = [color_map[int(((y - y_lim[0]) / y_range) * (n_colors_in_map - 1))] for y in y_train]
-    x, y, z = X_train[:, 0], X_train[:, 1], y_train
-    ax.scatter(x, y, marker='o', c=color_map, edgecolor=colors['scatter_edge'], lw=.3, s=markersize)
+    ax.scatter(X_train[:, featidx[0]], X_train[:, featidx[1]], marker='o', c=color_map,
+               edgecolor=colors['scatter_edge'], lw=.3, s=markersize)
 
-    _format_axes(ax, shadow_tree.feature_names[0], shadow_tree.feature_names[1], colors, fontsize, fontname, ticks_fontsize=ticks_fontsize, grid=False)
+    _format_axes(ax, shadow_tree.feature_names[featidx[0]], shadow_tree.feature_names[featidx[1]],
+                 colors, fontsize, fontname, ticks_fontsize=ticks_fontsize, grid=False)
 
     if 'title' in show:
         accur = shadow_tree.get_score()
         title = f"Regression Tree Depth {shadow_tree.get_max_depth()}, Training $R^2$={accur:.3f}"
         ax.set_title(title, fontsize=fontsize, color=colors['title'])
 
-    return None
-
 
 def _rtreeviz_bivar_3D(shadow_tree, fontsize, ticks_fontsize, fontname,
                        azim, elev, dist,
                        show, colors, markersize,
-                       n_colors_in_map, figsize, ax):
+                       n_colors_in_map,
+                       features,
+                       figsize, ax):
     X_train = shadow_tree.X_train
     y_train = shadow_tree.y_train
 
@@ -1597,16 +1633,19 @@ def plane(node, bbox, color_spectrum):
     color_spectrum = Color(colors['color_map_min']).range_to(Color(colors['color_map_max']), n_colors_in_map)
     color_spectrum = [rgb2hex(c.rgb, force_long=True) for c in color_spectrum]
     y_colors = [color_spectrum[y_to_color_index(y)] for y in y_train]
-    tessellation = shadow_tree.tessellation()
+
+    featidx = [shadow_tree.feature_names.index(f) for f in features]
+    x, y, z = X_train[:, featidx[0]], X_train[:, featidx[1]], y_train
+
+    tessellation = tessellate(shadow_tree.root, X_train, featidx)
 
     for node, bbox in tessellation:
         plane(node, bbox, color_spectrum)
 
-    x, y, z = X_train[:, 0], X_train[:, 1], y_train
     ax.scatter(x, y, z, marker='o', alpha=colors['scatter_marker_alpha'], edgecolor=colors['scatter_edge'],
                lw=.3, c=y_colors, s=markersize)
 
-    _format_axes(ax, shadow_tree.feature_names[0], shadow_tree.feature_names[1], colors, fontsize, fontname, ticks_fontsize=ticks_fontsize, grid=False)
+    _format_axes(ax, shadow_tree.feature_names[featidx[0]], shadow_tree.feature_names[featidx[1]], colors, fontsize, fontname, ticks_fontsize=ticks_fontsize, grid=False)
     ax.set_zlabel(f"{shadow_tree.target_name}", fontsize=fontsize, fontname=fontname, color=colors['axis_label'])
 
     if 'title' in show: