Merge branch 'master' of https://github.com/oegedijk/explainerdashboard into master
oegedijk · Jan 9, 2021 · 080597a · 080597a
2 parents 93a9bc5 + deeeb5f
commit 080597a
Show file tree

Hide file tree

Showing 19 changed files with 528 additions and 332 deletions.
diff --git a/TODO.md b/TODO.md
@@ -32,6 +32,8 @@
 
 
 ## Explainers:
+- pass n_jobs to pdp_isolate
+- autodetect xgboost booster or catboost.core and suggest XGBClassifier, etc
 - make X_cats with categorical encoding .astype("category")
 - add ExtraTrees and GradientBoostingClassifier to tree visualizers
 - add plain language explanations

diff --git a/docs/source/conf.py b/docs/source/conf.py
@@ -59,7 +59,7 @@
 
 autodoc_mock_imports = ['matplotlib', 'np', 'dash', 'dash_bootstrap_components',
                     'dash_html_components', 'dash_table', 'dash_core_components',
-                    'dtreeviz', 'numpy', 'pandas', 'pd', 'pdpbox', 
+                    'dtreeviz', 'numpy', 'pandas', 'pd', 
                     'sklearn', 'shap',  'plotly', 'shortuuid', 
                     'joblib', 'dash_auth', 'jupyter_dash', 'oyaml', 'click',
                     'flask', 'flask_simplelogin', 'werkzeug']

diff --git a/explainerdashboard/dashboards.py b/explainerdashboard/dashboards.py
@@ -416,6 +416,14 @@ def __init__(self, explainer=None, tabs=None,
             ("explainer should be an instance of BaseExplainer, such as "
             "ClassifierExplainer or RegressionExplainer!")
 
+        if self.explainer.cats_only:
+            print("Note: explainer contains a model and data that deal with "
+                    "categorical features directly. Not all elements of the "
+                    "ExplainerDashboard are compatible with such models, and "
+                    "so setting the following **kwargs: "
+                    "cats=True, hide_cats=True, shap_interaction=False", flush=True)
+            kwargs.update(dict(
+                cats=True, hide_cats=True, shap_interaction=False))
         if kwargs: 
             print("**kwargs: Passing the following keyword arguments to all the dashboard"
                 f" ExplainerComponents: {', '.join([f'{k}={v}' for k,v in kwargs.items()])}...")
@@ -426,7 +434,7 @@ def __init__(self, explainer=None, tabs=None,
                 print("No y labels were passed to the Explainer, so setting"
                         " model_summary=False...", flush=True)
                 model_summary = False
-            if shap_interaction and not explainer.interactions_should_work:
+            if shap_interaction and (not explainer.interactions_should_work or self.explainer.cats_only):
                 print("For this type of model and model_output interactions don't "
                           "work, so setting shap_interaction=False...", flush=True)
                 shap_interaction = False

diff --git a/explainerdashboard/explainer_methods.py b/explainerdashboard/explainer_methods.py
@@ -2,9 +2,11 @@
 from functools import partial
 import re
 from collections import Counter
+from typing import List, Union
 
 import numpy as np
 import pandas as pd
+from pandas.api.types import is_numeric_dtype
 
 from dtreeviz.trees import ShadowDecTree
 
@@ -15,6 +17,18 @@
 
 from joblib import Parallel, delayed
 
+def safe_is_instance(obj, *instance_str):
+    """Name-based isinstance check that works on the string form of a type.
+
+    The text of ``str(type(obj))`` is compared against every entry of
+    instance_str using suffix matching, so only the class name (optionally
+    with its module path) is needed, not the class object itself.
+
+    Args:
+        obj: object whose type is inspected.
+        *instance_str (str): one or more candidate names. A candidate that 
+            itself ends with "'>" is matched against the full type string; 
+            any other candidate is matched against the type string with 
+            its last two characters removed.
+
+    Returns:
+        bool: True as soon as one candidate matches, False otherwise 
+            (also when no candidates are passed).
+    """
+    type_repr = str(type(obj))
+    # str(type(x)) normally looks like "<class 'pkg.mod.Name'>"; dropping the
+    # last two characters leaves a string that ends in the bare class name.
+    trimmed_repr = type_repr[:-2]
+    return any(
+        type_repr.endswith(candidate) if candidate.endswith("'>")
+        else trimmed_repr.endswith(candidate)
+        for candidate in instance_str
+    )
 
 def guess_shap(model):
     """guesses which SHAP explainer to use for a particular model, based
@@ -105,44 +119,6 @@ def parse_cats(X, cats, sep:str="_"):
     return cats_list, cats_dict
 
 
-# Could be removed:
-
-# def get_feature_dict(cols, cats=None, sep="_"):
-#     """helper function to get a dictionary with onehot-encoded columns
-#     grouped per category. 
-
-#     Example:
-#         get_features_dict(["Age", "Sex_Male", "Sex_Female"], cats=["Sex"])
-#         will return {"Age": ["Age"], "Sex": ["Sex_Male", "Sex_Female"]}
-
-#     Args:
-#         cols (list of str): list of column names
-#         cats (list of str, optional): list of categorically encoded columns. 
-#             All columns names starting with such a column name will be grouped together. 
-#             Defaults to None.
-#         sep (str), seperator used between the category and encoding. Defaults to '_' 
-
-#     Returns:
-#         dict
-#     """
-#     feature_dict = {}
-
-#     if cats is None:
-#         return {col: [col] for col in cols}
-
-#     for col in cats:
-#         cat_cols = [c for c in cols if c.startswith(col + sep)]
-#         if len(cat_cols) > 1:
-#             feature_dict[col] = cat_cols
-
-#     # add all the individual features
-#     other_cols = list(set(cols) - set([item for sublist in list(feature_dict.values())
-#                                                 for item in sublist]))
-
-#     for col in other_cols:
-#         feature_dict[col] = [col]
-#     return feature_dict
-
 
 def split_pipeline(pipeline, X, verbose=1):
     """Returns an X_transformed dataframe and model from a fitted 
@@ -535,6 +511,68 @@ def mean_absolute_shap_values(columns, shap_values, cats_dict=None):
     return shap_df
 
 
+def get_pdp_df(model, X_sample:pd.DataFrame, feature:Union[str, List], pos_label=1,
+                  n_grid_points=10, min_percentage=0, max_percentage=100):
+    """Returns a dataframe with model predictions for every row in X_sample 
+    for a number of values of feature.
+
+    For each grid value a copy of X_sample is made in which feature is set 
+    to that value for all rows; the predictions of the model on that copy 
+    become one column of the result. 
+
+    Args:
+        model (): sklearn compatible model to generate pdp for. When it has a
+            predict_proba method, column pos_label of its output is used, 
+            otherwise the output of predict.
+        X_sample (pd.DataFrame): X to generate pdp for
+        feature (Union[str, List]): Feature to generate pdp for. Either the 
+            name of a column in X_sample, or a list of onehot-encoded columns.  
+        pos_label (int, optional): for classifier model, which class to use
+            as the positive class. Defaults to 1.
+        n_grid_points (int, optional): For numeric features: number of grid points
+            to divide the x axis by. Defaults to 10.
+        min_percentage (int, optional): For numeric features: minimum percentage of
+            samples to start x axis by. If larger than 0 a form of winsorizing the 
+            x axis. Defaults to 0.
+        max_percentage (int, optional): For numeric features: maximum percentage of
+            samples to end x axis by. If smaller than 100 a form of winsorizing the 
+            x axis. Defaults to 100.
+
+    Returns:
+        pd.DataFrame: one column per grid value (for a list of onehot columns:
+            one column per onehot column name), holding the predictions 
+            returned by the model for that grid value.
+
+    Raises:
+        ValueError: if feature is neither a str nor a list, or if a column in 
+            a list of features contains values other than 0 and 1.
+    """
+    def get_grid_points(array, n_grid_points=10, min_percentage=0, max_percentage=100):
+        if not is_numeric_dtype(array):
+            raise ValueError("array should be a numeric dtype!")
+        if isinstance(array, pd.Series):
+            array = array.values
+        percentile_grids = np.linspace(start=min_percentage, stop=max_percentage, num=n_grid_points)
+        # nanpercentile instead of percentile: with plain percentile a single
+        # missing value in the column turns every grid point into NaN.
+        value_grids = np.nanpercentile(array, percentile_grids)
+        return value_grids
+
+    if isinstance(feature, str):
+        if not is_numeric_dtype(X_sample[feature]):
+            grid_values = sorted(X_sample[feature].unique().tolist())
+        else:
+            grid_values = get_grid_points(X_sample[feature], 
+                                          n_grid_points=n_grid_points, 
+                                          min_percentage=min_percentage, 
+                                          max_percentage=max_percentage).tolist()
+            # Percentiles of a skewed or low-cardinality column repeat. A
+            # repeated grid value would only overwrite its own column, so
+            # drop repeats (keeping first-seen order) to skip the redundant
+            # model predictions.
+            grid_values = list(dict.fromkeys(grid_values))
+    elif isinstance(feature, list):
+        # Validate once, up front, with a real exception: an assert would be
+        # stripped when python runs with -O.
+        for col in feature:
+            unique_values = set(X_sample[col].unique())
+            if not unique_values.issubset({0, 1}):
+                raise ValueError(
+                    "When passing a list of features these have to be onehotencoded! "
+                    f"But X_sample['{col}'].unique()=={list(unique_values)}")
+        grid_values = feature
+    else:
+        raise ValueError("feature should either be a column name (str), "
+                         "or a list of onehot-encoded columns!")
+
+    # does not change between grid values, so look it up only once
+    use_predict_proba = hasattr(model, "predict_proba")
+
+    pdp_df = pd.DataFrame()
+    for grid_value in grid_values:
+        dtemp = X_sample.copy()
+        if isinstance(feature, list):
+            # switch on the current onehot column and switch off all others
+            dtemp.loc[:, grid_values] = [1 if g==grid_value else 0 for g in grid_values]
+        else:
+            dtemp.loc[:, feature] = grid_value
+        if use_predict_proba:
+            preds = model.predict_proba(dtemp)[:, pos_label]
+        else:
+            preds = model.predict(dtemp)  
+        pdp_df[grid_value] = preds
+
+    return pdp_df
+
+
 def get_precision_df(pred_probas, y_true, bin_size=None, quantiles=None, 
                         round=3, pos_label=1):
     """

diff --git a/explainerdashboard/explainer_plots.py b/explainerdashboard/explainer_plots.py
@@ -1000,14 +1000,14 @@ def plotly_shap_violin_plot(X, shap_values, col_name, color_col=None, points=Fal
     return fig
 
 
-def plotly_pdp(pdp_result, 
+def plotly_pdp(pdp_df, 
                display_index=None, index_feature_value=None, index_prediction=None,
                absolute=True, plot_lines=True, num_grid_lines=100, feature_name=None,
                round=2, target="", units="", index_name="index"):
     """Display partial-dependence plot (pdp)
 
     Args:
-        pdp_result (pdp_result): Generated from pdp.pdp_result()
+        pdp_df (pd.DataFrame): Generated from get_pdp_df()
         display_index (int, str, optional): Index to highligh in plot. 
             Defaults to None.
         index_feature_value (str, float, optional): value of feature for index. 
@@ -1020,8 +1020,7 @@ def plotly_pdp(pdp_result,
             Defaults to True.
         num_grid_lines (int, optional): Number of sample gridlines to display. 
             Defaults to 100.
-        feature_name (str, optional): Name of the feature that the pdp_result
-            was generated for. Defaults to None.
+        feature_name (str, optional): Name of the feature. Defaults to None.
         round (int, optional): Rounding to apply to floats. Defaults to 2.
         target (str, optional): Name of target variables. Defaults to "".
         units (str, optional): Units of target variable. Defaults to "".
@@ -1030,56 +1029,52 @@ def plotly_pdp(pdp_result,
     Returns:
         Plotly fig
     """
-
-    if feature_name is None: feature_name = pdp_result.feature
-
+    if absolute:
+        pdp_mean = pdp_df.mean().round(round).values
+    else:
+        pdp_mean = pdp_df.mean().round(round).values - pdp_df.mean().round(round).values[0]
+
     trace0 = go.Scatter(
-            x = pdp_result.feature_grids,
-            y = pdp_result.pdp.round(round) if absolute else (
-                    pdp_result.pdp - pdp_result.pdp[0]).round(round),
+            x = pdp_df.columns.values,
+            y = pdp_mean,
             mode = 'lines+markers',
             line = dict(color='grey', width = 4),
-            name = f'average prediction <br>for different values of <br>{pdp_result.feature}'
+            name = f'average prediction <br>for different values of <br>{feature_name}'
         )
     data = [trace0]
 
     if display_index is not None:
         trace1 = go.Scatter(
-            x = pdp_result.feature_grids,
-            y = pdp_result.ice_lines.iloc[display_index].round(round).values if absolute else \
-                pdp_result.ice_lines.iloc[display_index].round(round).values - pdp_result.ice_lines.iloc[display_index].round(round).values[0],
+            x = pdp_df.columns.values,
+            y = pdp_df.iloc[[display_index]].round(round).values[0] if absolute else \
+                pdp_df.iloc[[display_index]].round(round).values[0] - pdp_df.iloc[[display_index]].values[0,0],
             mode = 'lines+markers',
             line = dict(color='blue', width = 4),
-            name = f'prediction for {index_name} {display_index} <br>for different values of <br>{pdp_result.feature}'
+            name = f'prediction for {index_name} {display_index} <br>for different values of <br>{feature_name}'
         )
         data.append(trace1)
-
     if plot_lines:
-        x = pdp_result.feature_grids
-        ice_lines = pdp_result.ice_lines.sample(num_grid_lines)
-        ice_lines = ice_lines.values if absolute else\
-                    ice_lines.values - np.expand_dims(ice_lines.iloc[:, 0].transpose().values, axis=1)
-
-        for y in ice_lines:
-            data.append(
-                go.Scatter(
+        x = pdp_df.columns.values
+        pdp_sample = pdp_df.sample(min(num_grid_lines, len(pdp_df)))
+        ice_lines = pdp_sample.values if absolute else\
+                    pdp_sample.values - np.expand_dims(pdp_sample.iloc[:, 0].values, axis=1)
+
+        for row in pdp_sample.itertuples(index=False):
+            data.append(go.Scatter(
                     x = x,
-                    y = y,
+                    y = tuple(row),
                     mode='lines',
                     hoverinfo='skip',
                     line=dict(color='grey'),
                     opacity=0.1,
-                    showlegend=False             
-                )
-            )
+                    showlegend=False))
 
     layout = go.Layout(title = f'pdp plot for {feature_name}',
                         plot_bgcolor = '#fff',
                         yaxis=dict(title=f"Predicted {target}{f' ({units})' if units else ''}"),
                         xaxis=dict(title=feature_name))
 
     fig = go.Figure(data=data, layout=layout)
-
     shapes = []
     annotations = []
 
@@ -1094,10 +1089,8 @@ def plotly_pdp(pdp_result,
                         yref='y',
                         x0=index_feature_value,
                         x1=index_feature_value,
-                        y0=np.min(ice_lines) if plot_lines else \
-                            np.min(pdp_result.pdp),
-                        y1=np.max(ice_lines) if plot_lines \
-                            else np.max(pdp_result.pdp),
+                        y0=pdp_sample.min().min() if plot_lines else pdp_mean.min(),
+                        y1=pdp_sample.max().max() if plot_lines else pdp_mean.max(),
                         line=dict(
                             color="MediumPurple",
                             width=4,
@@ -1106,8 +1099,7 @@ def plotly_pdp(pdp_result,
                          ))
         annotations.append(
             go.layout.Annotation(x=index_feature_value, 
-                                 y=np.min(ice_lines) if plot_lines else \
-                                    np.min(pdp_result.pdp),
+                                 y=pdp_sample.min().min() if plot_lines else pdp_mean.min(),
                                  text=f"baseline value = {index_feature_value}"))
 
     if index_prediction is not None:
@@ -1116,22 +1108,19 @@ def plotly_pdp(pdp_result,
                         type='line',
                         xref='x',
                         yref='y',
-                        x0=pdp_result.feature_grids[0],
-                        x1=pdp_result.feature_grids[-1],
+                        x0=pdp_df.columns.values[0],
+                        x1=pdp_df.columns.values[-1],
                         y0=index_prediction,
                         y1=index_prediction,
                         line=dict(
                             color="MediumPurple",
                             width=4,
                             dash="dot",
-                        ),
-                         ))
-        annotations.append(
-            go.layout.Annotation(
-                x=pdp_result.feature_grids[
-                            int(0.5*len(pdp_result.feature_grids))], 
-                y=index_prediction, 
-                text=f"baseline pred = {np.round(index_prediction,2)}"))
+                        )
+                    )
+        )
+
+        annotations.append(go.layout.Annotation(x=pdp_df.columns[int(0.5*len(pdp_df.columns))], y=index_prediction, text=f"baseline pred = {np.round(index_prediction,2)}"))
 
     fig.update_layout(annotations=annotations)
     fig.update_layout(shapes=shapes)