In [1]:
# Reload imported modules automatically before each cell executes, so edits to
# the library source are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

# Render matplotlib figures inline in the notebook output.
%matplotlib inline
import sys
import os
# Make the parent of the current working directory importable (the library
# module is expected to live there). The path is appended, not inserted, so a
# copy found earlier on sys.path (e.g. an installed one) takes precedence.
sys.path.append(f"{os.getcwd()}/../")

In [2]:
# Same magic as in the first cell; repeating it is harmless.
%matplotlib inline

In [3]:
import sklearn
import graphviz
import pandas as pd

import dtreeviz

# Set up the dataset

In [4]:
random_state = 1234  # seed intended for reproducible trees

dataset_url = "https://raw.githubusercontent.com/parrt/dtreeviz/master/data/titanic/titanic.csv"
dataset = pd.read_csv(dataset_url)

# Replace missing ages with the mean age of the loaded data.
dataset = dataset.fillna({"Age": dataset["Age"].mean()})

# Integer-encode the categorical columns as new `*_label` columns.
# pandas assigns the code -1 to missing values.
dataset = dataset.assign(
    Sex_label=dataset["Sex"].astype("category").cat.codes,
    Cabin_label=dataset["Cabin"].astype("category").cat.codes,
    Embarked_label=dataset["Embarked"].astype("category").cat.codes,
)

# Set up the pipeline

In [5]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_selection import VarianceThreshold
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import PolynomialFeatures

In [14]:
# Predictor columns: three original columns plus the three integer-encoded
# `*_label` columns created during dataset setup.
features = [
    "Pclass",
    "Age",
    "Fare",
    "Sex_label",
    "Cabin_label",
    "Embarked_label",
]
# Column holding the class label to predict.
target = "Survived"

In [15]:
# Preprocessing + classifier pipeline:
#   1. VarianceThreshold(0.5) filters out low-variance features,
#   2. PolynomialFeatures expands the rest with degree-2 interaction-only terms,
#   3. a decision tree limited to depth 4 is fit on the expanded feature space.
# The tree is seeded with `random_state` (set in the dataset-setup cell): without
# a fixed seed scikit-learn may break ties between equally good splits
# differently from run to run, which would change node ids used later on.
model = make_pipeline(
    VarianceThreshold(0.5),
    PolynomialFeatures(degree=2, interaction_only=True),
    DecisionTreeClassifier(max_depth=4, random_state=random_state))

In [13]:
# Fit every pipeline step on the selected feature columns and the target column.
model.fit(dataset[features], dataset[target])

### Extract relevant parameters for `dtreeviz` from the pipeline


As the model used has several preprocessing steps prior to the decision tree, its feature space is different from `dataset[features]`. Hence, we first have to extract the relevant parameters from the pipeline before passing them to `dtreeviz`. Here, we use the following helper function from the `dtreeviz.utils` module:

In [None]:
from dtreeviz.utils import extract_params_from_pipeline

In [None]:
# Unpack the fitted pipeline into the three things dtreeviz needs, in the order
# the helper returns them: the tree estimator, the feature data as seen by the
# tree (after preprocessing), and the feature names matching that data.
tree_classifier, x_data, features_model = extract_params_from_pipeline(
    pipeline=model,
    x_data=dataset[features],
    feature_names=features)

In [None]:
# Target column, taken directly from the dataset.
y_data = dataset[target]

In [None]:
# Show the feature names returned for the transformed feature space.
features_model

## Initialize dtreeviz model (adaptor)


In [None]:
# Wrap the fitted tree and its training data in a dtreeviz model adaptor.
# `class_names` is ordered by class value: "Survived" is 0 for passengers who
# perished and 1 for those who survived, so "perish" has to come first.
# NOTE(review): released dtreeviz 2.x documents the data keywords as
# X_train/y_train — confirm `x_data`/`y_data` against the installed version.
viz_model = dtreeviz.model(tree_classifier,
                           x_data=x_data, y_data=y_data,
                           feature_names=features_model,
                           target_name=target, class_names=["perish", "survive"])

## Tree structure visualizations

To show the decision tree structure using the default visualization, call `view()`:

In [None]:
# No arguments: every view() option keeps its default value.
viz_model.view()

To change the visualization, you can pass parameters, such as changing the orientation to left-to-right:

In [None]:
# "LR" lays the tree out left-to-right.
viz_model.view(orientation="LR")

To visualize larger trees, you can reduce the amount of detail by turning off the fancy view:

In [None]:
# fancy=False switches to the less detailed rendering, which keeps big trees readable.
viz_model.view(fancy=False)

Another way to reduce the visualization size is to specify the tree depths of interest:

In [None]:
# Restrict the rendering to the depth range (1, 2), which leaves out the root level.
viz_model.view(depth_range_to_display=(1, 2)) # root is level 0

## Prediction path explanations

For interpretation purposes, we often want to understand how a tree behaves for a specific instance. Let's pick a specific instance:

In [None]:
# Pick row 10 of the transformed feature data as the instance to explain.
# scikit-learn transformers return NumPy arrays by default, in which case
# `.iloc` does not exist; fall back to plain positional indexing then.
x = x_data.iloc[10] if hasattr(x_data, "iloc") else x_data[10]
x

and then display the path through the tree structure:

In [None]:
# Render the tree together with the path that instance x takes through it.
viz_model.view(x=x)

In [None]:
# Same instance with show_just_path=True; judging by the flag name this draws
# only the nodes along the instance's path — confirm in the dtreeviz docs.
viz_model.view(x=x, show_just_path=True)

You can also get a string representation explaining the comparisons made as an instance is run down the tree:

In [None]:
# The explanation is returned as a string, so print it instead of showing its repr.
print(viz_model.explain_prediction_path(x))

If you'd like the feature importance for a specific instance, as calculated by the underlying decision tree library, use `instance_feature_importance()`:

In [None]:
# Feature importance computed for the single instance x picked earlier.
viz_model.instance_feature_importance(x)

## Leaf info

There are a number of functions to get information about the leaves of the tree.

In [None]:
# Size of each leaf (presumably the number of training samples that reach it — confirm).
viz_model.leaf_sizes()

In [None]:
# Distribution of target classes within each leaf; the "ctree" prefix suggests
# this call is meant for classifier trees — confirm in the dtreeviz docs.
viz_model.ctree_leaf_distributions()

In [None]:
# Statistics for one node. The id 14 is hard-coded: which node carries it (and
# whether it exists) depends on the fitted tree, so re-check after refitting.
viz_model.node_stats(node_id=14)

In [None]:
# Purity of each leaf (the exact purity measure is defined by dtreeviz — see its docs).
viz_model.leaf_purity()