diff --git a/explainerdashboard/explainer_methods.py b/explainerdashboard/explainer_methods.py
index ccf82ba..04a53a3 100644
--- a/explainerdashboard/explainer_methods.py
+++ b/explainerdashboard/explainer_methods.py
@@ -919,7 +919,7 @@ def get_pdp_df(
         else:
             first_row = X_sample.iloc[[0]]
         warnings.filterwarnings("ignore", category=UserWarning)
-        n_labels = model.predict_proba(first_row).shape[1]
+        n_labels = model.predict_proba(first_row.copy()).shape[1]
         warnings.filterwarnings("default", category=UserWarning)
         if multiclass:
             pdp_dfs = [pd.DataFrame() for i in range(n_labels)]
diff --git a/explainerdashboard/explainer_plots.py b/explainerdashboard/explainer_plots.py
index 0045641..fa34b95 100644
--- a/explainerdashboard/explainer_plots.py
+++ b/explainerdashboard/explainer_plots.py
@@ -2815,7 +2815,7 @@ def plotly_rf_trees(
             "model": range(len(model.estimators_)),
             "prediction": [
                 np.round(
-                    100 * m.predict_proba(observation)[0, pos_label], round
+                    100 * m.predict_proba(observation.copy())[0, pos_label], round
                 )
                 for m in model.estimators_
             ],
diff --git a/explainerdashboard/explainers.py b/explainerdashboard/explainers.py
index 093f560..db36ec9 100644
--- a/explainerdashboard/explainers.py
+++ b/explainerdashboard/explainers.py
@@ -240,8 +240,12 @@ def __init__(
             col for col in self.regular_cols if not is_numeric_dtype(self.X[col])
         ]
         self.categorical_dict = {
-            col: sorted(self.X[col].unique().tolist()) for col in self.categorical_cols
+            col: sorted(self.X[col].dropna().unique().tolist()) for col in self.categorical_cols
         }
+        # Add 'NaN' to the category list, since missing values are a valid option for encoders
+        for col in self.categorical_cols:
+            if self.X[col].isnull().values.any():
+                self.categorical_dict[col].append('NaN')
         self.cat_cols = self.onehot_cols + self.categorical_cols
         self.original_cols = self.X.columns
         self.merged_cols = pd.Index(self.regular_cols + self.onehot_cols)
@@ -757,6 +761,11 @@ def get_row_from_input(
             df_merged = pd.DataFrame(dict(zip(cols, inputs)), index=[0]).fillna(
                 self.na_fill
             )[self.merged_cols]
+            # Restore proper NaN values for categorical columns instead of self.na_fill
+            for col, values in self.categorical_dict.items():
+                if 'NaN' in values:
+                    df_merged[col] = df_merged[col].replace(self.na_fill, np.nan)  # missing values from the existing data were filled with self.na_fill above
+                    df_merged[col] = df_merged[col].replace('NaN', np.nan)  # values set to NaN in the frontend arrive as the string 'NaN'
             if return_merged:
                 return df_merged
             else:
@@ -765,6 +774,11 @@
         elif len(inputs) == len(self.columns):
             cols = self.columns
             df = pd.DataFrame(dict(zip(cols, inputs)), index=[0]).fillna(self.na_fill)
+            # TODO: verify that the same NaN handling is correct here for categorical columns
+            for col, values in self.categorical_dict.items():
+                if 'NaN' in values:
+                    df[col] = df[col].replace(self.na_fill, np.nan)  # missing values from the existing data were filled with self.na_fill above
+                    df[col] = df[col].replace('NaN', np.nan)  # values set to NaN in the frontend arrive as the string 'NaN'
             if return_merged:
                 return merge_categorical_columns(df, self.onehot_dict, self.merged_cols)
             else:
@@ -2561,12 +2575,12 @@ def pred_probas_raw(self):
             self.model, "predict_proba"
         ), "model does not have a predict_proba method!"
         if self.shap == "skorch":
-            self._pred_probas = self.model.predict_proba(self.X.values).astype(
+            self._pred_probas = self.model.predict_proba(self.X.copy().values).astype(
                 self.precision
             )
         else:
             warnings.filterwarnings("ignore", category=UserWarning)
-            self._pred_probas = self.model.predict_proba(self.X).astype(
+            self._pred_probas = self.model.predict_proba(self.X.copy()).astype(
                 self.precision
             )
             warnings.filterwarnings("default", category=UserWarning)
@@ -2768,7 +2782,7 @@ def shap_explainer(self):
 
         def model_predict(data_asarray):
             data_asframe = pd.DataFrame(data_asarray, columns=self.columns)
-            return self.model.predict_proba(data_asframe)
+            return self.model.predict_proba(data_asframe.copy())
 
         self._shap_explainer = shap.KernelExplainer(
             model_predict,
@@ -3251,7 +3265,7 @@ def get_cv_metrics(n_splits):
            ):
                X_train, X_test = self.X.iloc[train_index], self.X.iloc[test_index]
                y_train, y_test = self.y.iloc[train_index], self.y.iloc[test_index]
-                preds = clone(self.model).fit(X_train, y_train).predict_proba(X_test)
+                preds = clone(self.model).fit(X_train, y_train).predict_proba(X_test.copy())
                for label in range(len(self.labels)):
                    for cut in np.linspace(1, 99, 99, dtype=int):
                        y_true = np.where(y_test == label, 1, 0)
@@ -3484,7 +3498,7 @@ def prediction_result_df(
             X_row = X_cats_to_X(X_row, self.onehot_dict, self.X.columns)
         if self.shap == "skorch":
             X_row = X_row.values.astype("float32")
-        pred_probas = self.model.predict_proba(X_row)[0, :].squeeze()
+        pred_probas = self.model.predict_proba(X_row.copy())[0, :].squeeze()
 
         preds_df = pd.DataFrame(dict(label=self.labels, probability=pred_probas))
         if logodds and all(preds_df.probability < 1 - np.finfo(np.float64).eps):
diff --git a/tests/test_datasets.py b/tests/test_datasets.py
new file mode 100644
index 0000000..1c5c81d
--- /dev/null
+++ b/tests/test_datasets.py
@@ -0,0 +1,83 @@
+from sklearn.ensemble import RandomForestClassifier
+import pandas as pd
+from explainerdashboard import ClassifierExplainer, ExplainerDashboard
+from sklearn.preprocessing import LabelEncoder
+from sklearn.model_selection import train_test_split
+import os
+import numpy as np
+
+class CategoricalModelWrapper:
+    def __init__(self, model, categorical_label_test) -> None:
+        self._model = model
+        self._categorical_label_test = categorical_label_test
+        pass
+
+    def _perform_label_encoding(self, y):
+        label_enc = LabelEncoder()
+        label_enc.fit(["Survived", "Not Survived"])
+        return pd.Series(label_enc.transform(y.values), name=y.name, index=y.index)
+
+    def _perform_label_decoding(self, y):
+        label_enc = LabelEncoder()
+        label_enc.fit(["Survived", "Not Survived"])
+        return pd.Series(label_enc.inverse_transform(y), name=getattr(y, "name", None))
+
+    def _preprocessor(self, X):
+        return X.drop(["Name"], axis=1)
+
+    def _postprocessor(self, y):
+        if self._categorical_label_test:
+            y = self._perform_label_decoding(y)
+        return y
+
+    def predict(self, X):
+        X = self._preprocessor(X)
+        y = self._model.predict(X)
+        return self._postprocessor(y)
+
+    def predict_proba(self, X):
+        X = self._preprocessor(X)
+        probabilities_raw = self._model.predict_proba(X)
+        return probabilities_raw
+
+def generate_categorical_dataset_model_wrapper(categorical_label_test=False):
+    model = RandomForestClassifier(n_estimators=5, max_depth=2)
+    wrapper = CategoricalModelWrapper(model, categorical_label_test)
+    df = pd.read_csv(os.path.join(os.getcwd(), "tests", "test_assets", "data.csv"))
+    if categorical_label_test:
+        # Test with a categorical label: convert the binary numeric titanic label to "Survived"/"Not Survived"
["Survived"],["Not Survived"] + df["Survival"] = wrapper._perform_label_decoding(df["Survival"]) + else: + #We only test NaN in categorical features and numerical target + df["Name"][0] = np.nan + df["Name"][10] = np.nan + df["Name"][20] = np.nan + df["Name"][30] = np.nan + df["Name"][40] = np.nan + df["Name"][50] = np.nan + df["Name"][60] = np.nan + df["Name"][70] = np.nan + df["Name"][80] = np.nan + X_train, X_test, y_train, y_test = train_test_split(df.drop(["Survival"], axis=1), df["Survival"], test_size=0.2, random_state=42) + + X_train = wrapper._preprocessor(X_train) + + if categorical_label_test == True: + y_train = wrapper._perform_label_encoding(y_train) + + model.fit(X_train, y_train) + return CategoricalModelWrapper(model, categorical_label_test), X_test, y_test + +def test_NaN_containing_categorical_dataset(): + _wrapper, _test_X, _test_y = generate_categorical_dataset_model_wrapper() + explainer = ClassifierExplainer( + _wrapper, _test_X, _test_y) + dashboard = ExplainerDashboard(explainer) + assert "NaN" in explainer.categorical_dict["Name"] + +def test_categorical_label(): + _wrapper, _test_X, _test_y = generate_categorical_dataset_model_wrapper(True) + explainer = ClassifierExplainer( + _wrapper, _test_X, _test_y) + dashboard = ExplainerDashboard(explainer) + assert "Survived" in explainer.labels