Skip to content

Commit

Permalink
Browse files Browse the repository at this point in the history
… into master
  • Loading branch information
oegedijk committed Jan 9, 2021
2 parents 93a9bc5 + deeeb5f commit 080597a
Show file tree
Hide file tree
Showing 19 changed files with 528 additions and 332 deletions.
2 changes: 2 additions & 0 deletions TODO.md
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@


## Explainers:
- pass n_jobs to pdp_isolate
- autodetect xgboost booster or catboost.core and suggest XGBClassifier, etc
- make X_cats with categorical encoding .astype("category")
- add ExtraTrees and GradientBoostingClassifier to tree visualizers
- add plain language explanations
Expand Down
2 changes: 1 addition & 1 deletion docs/source/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -59,7 +59,7 @@

autodoc_mock_imports = ['matplotlib', 'np', 'dash', 'dash_bootstrap_components',
'dash_html_components', 'dash_table', 'dash_core_components',
'dtreeviz', 'numpy', 'pandas', 'pd', 'pdpbox',
'dtreeviz', 'numpy', 'pandas', 'pd',
'sklearn', 'shap', 'plotly', 'shortuuid',
'joblib', 'dash_auth', 'jupyter_dash', 'oyaml', 'click',
'flask', 'flask_simplelogin', 'werkzeug']
Expand Down
10 changes: 9 additions & 1 deletion explainerdashboard/dashboards.py
Original file line number Diff line number Diff line change
Expand Up @@ -416,6 +416,14 @@ def __init__(self, explainer=None, tabs=None,
("explainer should be an instance of BaseExplainer, such as "
"ClassifierExplainer or RegressionExplainer!")

if self.explainer.cats_only:
print("Note: explainer contains a model and data that deal with "
"categorical features directly. Not all elements of the "
"ExplainerDashboard are compatible with such models, and "
"so setting the following **kwargs: "
"cats=True, hide_cats=True, shap_interaction=False", flush=True)
kwargs.update(dict(
cats=True, hide_cats=True, shap_interaction=False))
if kwargs:
print("**kwargs: Passing the following keyword arguments to all the dashboard"
f" ExplainerComponents: {', '.join([f'{k}={v}' for k,v in kwargs.items()])}...")
Expand All @@ -426,7 +434,7 @@ def __init__(self, explainer=None, tabs=None,
print("No y labels were passed to the Explainer, so setting"
" model_summary=False...", flush=True)
model_summary = False
if shap_interaction and not explainer.interactions_should_work:
if shap_interaction and (not explainer.interactions_should_work or self.explainer.cats_only):
print("For this type of model and model_output interactions don't "
"work, so setting shap_interaction=False...", flush=True)
shap_interaction = False
Expand Down
114 changes: 76 additions & 38 deletions explainerdashboard/explainer_methods.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,11 @@
from functools import partial
import re
from collections import Counter
from typing import List, Union

import numpy as np
import pandas as pd
from pandas.api.types import is_numeric_dtype

from dtreeviz.trees import ShadowDecTree

Expand All @@ -15,6 +17,18 @@

from joblib import Parallel, delayed

def safe_is_instance(obj, *instance_str):
    """Tell whether the type of obj matches any of the given type names.

    The check is purely textual: str(type(obj)) (for example
    "<class 'collections.OrderedDict'>") is compared against each candidate
    string, so only strings are needed, never the class objects themselves.

    Args:
        obj: object whose type is inspected.
        *instance_str (str): candidate type names. A candidate that itself
            ends in "'>" must be a suffix of the full type string; any other
            candidate must be a suffix of the type string with its last
            two characters removed.

    Returns:
        bool: True if at least one candidate matches, otherwise False
            (also False when no candidates are given).
    """
    type_repr = str(type(obj))
    # same string without its final two characters (normally the closing "'>")
    bare_repr = type_repr[:-2]
    return any(
        (type_repr if candidate.endswith("'>") else bare_repr).endswith(candidate)
        for candidate in instance_str
    )

def guess_shap(model):
"""guesses which SHAP explainer to use for a particular model, based
Expand Down Expand Up @@ -105,44 +119,6 @@ def parse_cats(X, cats, sep:str="_"):
return cats_list, cats_dict


# Could be removed:

# def get_feature_dict(cols, cats=None, sep="_"):
# """helper function to get a dictionary with onehot-encoded columns
# grouped per category.

# Example:
# get_features_dict(["Age", "Sex_Male", "Sex_Female"], cats=["Sex"])
# will return {"Age": ["Age"], "Sex": ["Sex_Male", "Sex_Female"]}

# Args:
# cols (list of str): list of column names
# cats (list of str, optional): list of categorically encoded columns.
# All columns names starting with such a column name will be grouped together.
# Defaults to None.
# sep (str), seperator used between the category and encoding. Defaults to '_'

# Returns:
# dict
# """
# feature_dict = {}

# if cats is None:
# return {col: [col] for col in cols}

# for col in cats:
# cat_cols = [c for c in cols if c.startswith(col + sep)]
# if len(cat_cols) > 1:
# feature_dict[col] = cat_cols

# # add all the individual features
# other_cols = list(set(cols) - set([item for sublist in list(feature_dict.values())
# for item in sublist]))

# for col in other_cols:
# feature_dict[col] = [col]
# return feature_dict


def split_pipeline(pipeline, X, verbose=1):
"""Returns an X_transformed dataframe and model from a fitted
Expand Down Expand Up @@ -535,6 +511,68 @@ def mean_absolute_shap_values(columns, shap_values, cats_dict=None):
return shap_df


def get_pdp_df(model, X_sample:pd.DataFrame, feature:Union[str, List], pos_label=1,
               n_grid_points=10, min_percentage=0, max_percentage=100):
    """Returns a dataframe with partial dependence for every row in X_sample
    for a number of feature values.

    For every grid value a copy of X_sample is made in which the feature is
    set to that value for all rows; the model predictions on that copy become
    one column of the result.

    Args:
        model (): sklearn compatible model to generate pdp for. If it has a
            predict_proba method, column pos_label of its output is used,
            otherwise the output of predict.
        X_sample (pd.DataFrame): X to generate pdp for. Not modified.
        feature (Union[str, List]): Feature to generate pdp for. Either the
            name of a column in X_sample, or a list of onehot-encoded columns.
        pos_label (int, optional): for classifier model, which class to use
            as the positive class. Defaults to 1.
        n_grid_points (int, optional): For numeric features: number of grid points
            to divide the x axis by. Percentiles that coincide are only
            evaluated once, so fewer columns may be returned. Defaults to 10.
        min_percentage (int, optional): For numeric features: minimum percentage of
            samples to start x axis by. If larger than 0 a form of winsorizing the
            x axis. Defaults to 0.
        max_percentage (int, optional): For numeric features: maximum percentage of
            samples to end x axis by. If smaller than 100 a form of winsorizing the
            x axis. Defaults to 100.

    Returns:
        pd.DataFrame: one row per row of X_sample and one column per grid
            value: percentile values for a numeric feature, sorted unique
            values for a non-numeric feature, or the column names themselves
            for a list of onehot-encoded columns.

    Raises:
        ValueError: if feature is neither a str nor a list.
        AssertionError: if feature is a list and one of its columns contains
            values other than 0 and 1.
    """
    def get_grid_points(array, n_grid_points=10, min_percentage=0, max_percentage=100):
        if not is_numeric_dtype(array):
            raise ValueError("array should be a numeric dtype!")
        if isinstance(array, pd.Series):
            array = array.values
        percentile_grids = np.linspace(start=min_percentage, stop=max_percentage, num=n_grid_points)
        # nanpercentile: with plain np.percentile a single missing value
        # turns every grid point into NaN
        return np.nanpercentile(array, percentile_grids)

    is_onehot = isinstance(feature, list)
    is_numeric = False

    if isinstance(feature, str):
        is_numeric = is_numeric_dtype(X_sample[feature])
        if is_numeric:
            percentile_values = get_grid_points(X_sample[feature],
                                    n_grid_points=n_grid_points,
                                    min_percentage=min_percentage,
                                    max_percentage=max_percentage).tolist()
            # Coinciding percentiles would each trigger a full model
            # prediction only to overwrite the same output column, so keep
            # the first occurrence of every value (order preserved).
            grid_values = list(dict.fromkeys(percentile_values))
        else:
            grid_values = sorted(X_sample[feature].unique().tolist())
    elif is_onehot:
        grid_values = feature
        # Validate once, before any prediction is made. Raised explicitly
        # (instead of with an assert statement) so the check is not stripped
        # when Python runs with -O; the exception type stays AssertionError.
        for col in grid_values:
            if not set(X_sample[col].unique()).issubset({0, 1}):
                raise AssertionError(
                    f"{grid_values} When passing a list of features these have to be onehotencoded!"
                    f"But X_sample['{col}'].unique()=={list(set(X_sample[col].unique()))}")
    else:
        raise ValueError("feature should either be a column name (str), "
                "or a list of onehot-encoded columns!")

    pdp_df = pd.DataFrame()
    for grid_value in grid_values:
        dtemp = X_sample.copy()
        if is_onehot:
            # switch on the current column and switch off all the others
            dtemp.loc[:, grid_values] = [1 if g==grid_value else 0 for g in grid_values]
        elif is_numeric:
            # Replace the column rather than writing into it with .loc:
            # grid values are floats, and setting a non-integral float into an
            # integer column in place is a deprecated upcast in recent pandas.
            dtemp[feature] = grid_value
        else:
            # in-place assignment keeps the dtype of the column
            dtemp.loc[:, feature] = grid_value
        if hasattr(model, "predict_proba"):
            preds = model.predict_proba(dtemp)[:, pos_label]
        else:
            preds = model.predict(dtemp)
        pdp_df[grid_value] = preds

    return pdp_df


def get_precision_df(pred_probas, y_true, bin_size=None, quantiles=None,
round=3, pos_label=1):
"""
Expand Down
79 changes: 34 additions & 45 deletions explainerdashboard/explainer_plots.py
Original file line number Diff line number Diff line change
Expand Up @@ -1000,14 +1000,14 @@ def plotly_shap_violin_plot(X, shap_values, col_name, color_col=None, points=Fal
return fig


def plotly_pdp(pdp_result,
def plotly_pdp(pdp_df,
display_index=None, index_feature_value=None, index_prediction=None,
absolute=True, plot_lines=True, num_grid_lines=100, feature_name=None,
round=2, target="", units="", index_name="index"):
"""Display partial-dependence plot (pdp)
Args:
pdp_result (pdp_result): Generated from pdp.pdp_result()
pdp_df (pd.DataFrame): Generated from get_pdp_df()
display_index (int, str, optional): Index to highligh in plot.
Defaults to None.
index_feature_value (str, float, optional): value of feature for index.
Expand All @@ -1020,8 +1020,7 @@ def plotly_pdp(pdp_result,
Defaults to True.
num_grid_lines (int, optional): Number of sample gridlines to display.
Defaults to 100.
feature_name (str, optional): Name of the feature that the pdp_result
was generated for. Defaults to None.
feature_name (str, optional): Name of the feature. Defaults to None.
round (int, optional): Rounding to apply to floats. Defaults to 2.
target (str, optional): Name of target variables. Defaults to "".
units (str, optional): Units of target variable. Defaults to "".
Expand All @@ -1030,56 +1029,52 @@ def plotly_pdp(pdp_result,
Returns:
Plotly fig
"""

if feature_name is None: feature_name = pdp_result.feature

if absolute:
pdp_mean = pdp_df.mean().round(round).values
else:
pdp_mean = pdp_df.mean().round(round).values - pdp_df.mean().round(round).values[0]

trace0 = go.Scatter(
x = pdp_result.feature_grids,
y = pdp_result.pdp.round(round) if absolute else (
pdp_result.pdp - pdp_result.pdp[0]).round(round),
x = pdp_df.columns.values,
y = pdp_mean,
mode = 'lines+markers',
line = dict(color='grey', width = 4),
name = f'average prediction <br>for different values of <br>{pdp_result.feature}'
name = f'average prediction <br>for different values of <br>{feature_name}'
)
data = [trace0]

if display_index is not None:
trace1 = go.Scatter(
x = pdp_result.feature_grids,
y = pdp_result.ice_lines.iloc[display_index].round(round).values if absolute else \
pdp_result.ice_lines.iloc[display_index].round(round).values - pdp_result.ice_lines.iloc[display_index].round(round).values[0],
x = pdp_df.columns.values,
y = pdp_df.iloc[[display_index]].round(round).values[0] if absolute else \
pdp_df.iloc[[display_index]].round(round).values[0] - pdp_df.iloc[[display_index]].values[0,0],
mode = 'lines+markers',
line = dict(color='blue', width = 4),
name = f'prediction for {index_name} {display_index} <br>for different values of <br>{pdp_result.feature}'
name = f'prediction for {index_name} {display_index} <br>for different values of <br>{feature_name}'
)
data.append(trace1)

if plot_lines:
x = pdp_result.feature_grids
ice_lines = pdp_result.ice_lines.sample(num_grid_lines)
ice_lines = ice_lines.values if absolute else\
ice_lines.values - np.expand_dims(ice_lines.iloc[:, 0].transpose().values, axis=1)

for y in ice_lines:
data.append(
go.Scatter(
x = pdp_df.columns.values
pdp_sample = pdp_df.sample(min(num_grid_lines, len(pdp_df)))
ice_lines = pdp_sample.values if absolute else\
pdp_sample.values - np.expand_dims(pdp_sample.iloc[:, 0].values, axis=1)

for row in pdp_sample.itertuples(index=False):
data.append(go.Scatter(
x = x,
y = y,
y = tuple(row),
mode='lines',
hoverinfo='skip',
line=dict(color='grey'),
opacity=0.1,
showlegend=False
)
)
showlegend=False))

layout = go.Layout(title = f'pdp plot for {feature_name}',
plot_bgcolor = '#fff',
yaxis=dict(title=f"Predicted {target}{f' ({units})' if units else ''}"),
xaxis=dict(title=feature_name))

fig = go.Figure(data=data, layout=layout)

shapes = []
annotations = []

Expand All @@ -1094,10 +1089,8 @@ def plotly_pdp(pdp_result,
yref='y',
x0=index_feature_value,
x1=index_feature_value,
y0=np.min(ice_lines) if plot_lines else \
np.min(pdp_result.pdp),
y1=np.max(ice_lines) if plot_lines \
else np.max(pdp_result.pdp),
y0=pdp_sample.min().min() if plot_lines else pdp_mean.min(),
y1=pdp_sample.max().max() if plot_lines else pdp_mean.max(),
line=dict(
color="MediumPurple",
width=4,
Expand All @@ -1106,8 +1099,7 @@ def plotly_pdp(pdp_result,
))
annotations.append(
go.layout.Annotation(x=index_feature_value,
y=np.min(ice_lines) if plot_lines else \
np.min(pdp_result.pdp),
y=pdp_sample.min().min() if plot_lines else pdp_mean.min(),
text=f"baseline value = {index_feature_value}"))

if index_prediction is not None:
Expand All @@ -1116,22 +1108,19 @@ def plotly_pdp(pdp_result,
type='line',
xref='x',
yref='y',
x0=pdp_result.feature_grids[0],
x1=pdp_result.feature_grids[-1],
x0=pdp_df.columns.values[0],
x1=pdp_df.columns.values[-1],
y0=index_prediction,
y1=index_prediction,
line=dict(
color="MediumPurple",
width=4,
dash="dot",
),
))
annotations.append(
go.layout.Annotation(
x=pdp_result.feature_grids[
int(0.5*len(pdp_result.feature_grids))],
y=index_prediction,
text=f"baseline pred = {np.round(index_prediction,2)}"))
)
)
)

annotations.append(go.layout.Annotation(x=pdp_df.columns[int(0.5*len(pdp_df.columns))], y=index_prediction, text=f"baseline pred = {np.round(index_prediction,2)}"))

fig.update_layout(annotations=annotations)
fig.update_layout(shapes=shapes)
Expand Down
Loading

0 comments on commit 080597a

Please sign in to comment.