From 2541e27f4b9d81f4245a05d7822725d55f0e9fd2 Mon Sep 17 00:00:00 2001
From: Alexander Zender
Date: Fri, 14 Jul 2023 13:08:00 +0200
Subject: [PATCH 1/8] added dropna to avoid crash on NaN values

added copy to every predict or predict_proba call so that only definitive
copies are passed
---
 explainerdashboard/explainer_methods.py | 12 ++++++------
 explainerdashboard/explainer_plots.py   |  4 ++--
 explainerdashboard/explainers.py        | 26 ++++++++++++-------------
 3 files changed, 21 insertions(+), 21 deletions(-)

diff --git a/explainerdashboard/explainer_methods.py b/explainerdashboard/explainer_methods.py
index f3920b9..1fe6dca 100644
--- a/explainerdashboard/explainer_methods.py
+++ b/explainerdashboard/explainer_methods.py
@@ -572,7 +572,7 @@ def one_vs_all_metric(metric, pos_label, y_true, y_pred):
     sign = 1 if greater_is_better else -1

     def _scorer(clf, X, y):
-        y_pred = clf.predict_proba(X)
+        y_pred = clf.predict_proba(X.copy())
         score = sign * partial_metric(y, y_pred)
         return score

@@ -915,7 +915,7 @@ def get_pdp_df(
         first_row = X_sample.iloc[[0]].values.astype("float32")
     else:
         first_row = X_sample.iloc[[0]]
-    n_labels = model.predict_proba(first_row).shape[1]
+    n_labels = model.predict_proba(first_row.copy()).shape[1]
     if multiclass:
         pdp_dfs = [pd.DataFrame() for i in range(n_labels)]
     else:
@@ -1732,21 +1732,21 @@ def get_xgboost_preds_df(xgbmodel, X_row, pos_label=1):
         if pos_label == 1:
             preds = [
                 xgbmodel.predict(
-                    X_row, iteration_range=(0, i + 1), output_margin=True
+                    X_row.copy(), iteration_range=(0, i + 1), output_margin=True
                 )[0]
                 for i in range(n_trees)
             ]
         elif pos_label == 0:
             preds = [
                 -xgbmodel.predict(
-                    X_row, iteration_range=(0, i + 1), output_margin=True
+                    X_row.copy(), iteration_range=(0, i + 1), output_margin=True
                 )[0]
                 for i in range(n_trees)
             ]
         pred_probas = (np.exp(preds) / (1 + np.exp(preds))).tolist()
     else:
         margins = [
-            xgbmodel.predict(X_row, iteration_range=(0, i + 1), output_margin=True)[
+            xgbmodel.predict(X_row.copy(), iteration_range=(0, i + 1), output_margin=True)[
                 0
             ]
             for i in range(n_trees)
@@ -1758,7 +1758,7 @@

     else:
         preds = [
-            xgbmodel.predict(X_row, iteration_range=(0, i + 1), output_margin=True)[0]
+            xgbmodel.predict(X_row.copy(), iteration_range=(0, i + 1), output_margin=True)[0]
             for i in range(n_trees)
         ]

diff --git a/explainerdashboard/explainer_plots.py b/explainerdashboard/explainer_plots.py
index 31e0fab..0f60b35 100644
--- a/explainerdashboard/explainer_plots.py
+++ b/explainerdashboard/explainer_plots.py
@@ -2813,7 +2813,7 @@ def plotly_rf_trees(
             "model": range(len(model.estimators_)),
             "prediction": [
                 np.round(
-                    100 * m.predict_proba(observation)[0, pos_label], round
+                    100 * m.predict_proba(observation.copy())[0, pos_label], round
                 )
                 for m in model.estimators_
             ],
@@ -2829,7 +2829,7 @@
         {
             "model": range(len(model.estimators_)),
             "prediction": [
-                np.round(m.predict(observation)[0], round)
+                np.round(m.predict(observation.copy())[0], round)
                 for m in model.estimators_
             ],
             "color": colors,
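A note on why the .copy() calls above are being added: they guard against models or wrapper pipelines whose preprocessing mutates the DataFrame handed to predict/predict_proba in place, which would otherwise silently corrupt the explainer's cached X. A minimal sketch of the failure mode (the MutatingModel class is hypothetical, purely for illustration):

    import pandas as pd

    class MutatingModel:
        """Hypothetical model whose preprocessing mutates its input in place."""
        def predict(self, X):
            X["age"] = X["age"].fillna(0)  # in-place mutation of the caller's frame
            return X["age"].values

    X = pd.DataFrame({"age": [20.0, None]})
    MutatingModel().predict(X)         # caller's X is silently changed
    print(X)                           # the NaN is gone

    X = pd.DataFrame({"age": [20.0, None]})
    MutatingModel().predict(X.copy())  # with a copy, the original X is untouched
    print(X)                           # the NaN is preserved
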
diff --git a/explainerdashboard/explainers.py b/explainerdashboard/explainers.py
index c630a6f..d859b5e 100644
--- a/explainerdashboard/explainers.py
+++ b/explainerdashboard/explainers.py
@@ -240,7 +240,7 @@ def __init__(
             col for col in self.regular_cols if not is_numeric_dtype(self.X[col])
         ]
         self.categorical_dict = {
-            col: sorted(self.X[col].unique().tolist()) for col in self.categorical_cols
+            col: sorted(self.X[col].dropna().unique().tolist()) for col in self.categorical_cols
         }
         self.cat_cols = self.onehot_cols + self.categorical_cols
         self.original_cols = self.X.columns
@@ -837,11 +837,11 @@ def get_col_value_plus_prediction(
             if self.is_classifier:
                 if pos_label is None:
                     pos_label = self.pos_label
-                prediction = self.model.predict_proba(X_row)[0][pos_label].squeeze()
+                prediction = self.model.predict_proba(X_row.copy())[0][pos_label].squeeze()
                 if self.model_output == "probability":
                     prediction = 100 * prediction
             elif self.is_regression:
-                prediction = self.model.predict(X_row)[0].squeeze()
+                prediction = self.model.predict(X_row.copy())[0].squeeze()
             return col_value, prediction
         else:
             raise ValueError("You need to pass either index or X_row!")
@@ -968,11 +968,11 @@ def preds(self):
             print("Calculating predictions...", flush=True)
             if self.shap == "skorch":  # skorch model.predict need np.array
                 self._preds = (
-                    self.model.predict(self.X.values).squeeze().astype(self.precision)
+                    self.model.predict(self.X.copy().values).squeeze().astype(self.precision)
                 )
             else:  # Pipelines.predict need pd.DataFrame:
                 self._preds = (
-                    self.model.predict(self.X).squeeze().astype(self.precision)
+                    self.model.predict(self.X.copy()).squeeze().astype(self.precision)
                 )
         return self._preds
@@ -1107,7 +1107,7 @@ def shap_explainer(self):
             def model_predict(data_asarray):
                 data_asframe = pd.DataFrame(data_asarray, columns=self.columns)
-                preds = self.model.predict(data_asframe)
+                preds = self.model.predict(data_asframe.copy())
                 return preds.reshape(len(preds))

             self._shap_explainer = shap.KernelExplainer(
@@ -2561,11 +2561,11 @@ def pred_probas_raw(self):
                 self.model, "predict_proba"
             ), "model does not have a predict_proba method!"
             if self.shap == "skorch":
-                self._pred_probas = self.model.predict_proba(self.X.values).astype(
+                self._pred_probas = self.model.predict_proba(self.X.copy().values).astype(
                     self.precision
                 )
             else:
-                self._pred_probas = self.model.predict_proba(self.X).astype(
+                self._pred_probas = self.model.predict_proba(self.X.copy()).astype(
                     self.precision
                 )
         return self._pred_probas
@@ -2766,7 +2766,7 @@ def shap_explainer(self):
             def model_predict(data_asarray):
                 data_asframe = pd.DataFrame(data_asarray, columns=self.columns)
-                return self.model.predict_proba(data_asframe)
+                return self.model.predict_proba(data_asframe.copy())

             self._shap_explainer = shap.KernelExplainer(
                 model_predict,
@@ -3249,7 +3249,7 @@ def get_cv_metrics(n_splits):
             ):
                 X_train, X_test = self.X.iloc[train_index], self.X.iloc[test_index]
                 y_train, y_test = self.y.iloc[train_index], self.y.iloc[test_index]
-                preds = clone(self.model).fit(X_train, y_train).predict_proba(X_test)
+                preds = clone(self.model).fit(X_train, y_train).predict_proba(X_test.copy())
                 for label in range(len(self.labels)):
                     for cut in np.linspace(1, 99, 99, dtype=int):
                         y_true = np.where(y_test == label, 1, 0)
@@ -3482,7 +3482,7 @@ def prediction_result_df(
             X_row = X_cats_to_X(X_row, self.onehot_dict, self.X.columns)
         if self.shap == "skorch":
             X_row = X_row.values.astype("float32")
-        pred_probas = self.model.predict_proba(X_row)[0, :].squeeze()
+        pred_probas = self.model.predict_proba(X_row.copy())[0, :].squeeze()

         preds_df = pd.DataFrame(dict(label=self.labels, probability=pred_probas))
         if logodds and all(preds_df.probability < 1 - np.finfo(np.float64).eps):
@@ -4145,7 +4145,7 @@ def prediction_result_df(self, index=None, X_row=None, round=3):
             X_row = X_cats_to_X(X_row, self.onehot_dict, self.X.columns)
         if self.shap == "skorch":
             X_row = X_row.values.astype("float32")
-        pred = self.model.predict(X_row).item()
+        pred = self.model.predict(X_row.copy()).item()

         preds_df = pd.DataFrame(columns=["", self.target])
         preds_df = append_dict_to_df(
             preds_df, {"": "Predicted", self.target: f"{pred:.{round}f} {self.units}"}
         )
@@ -4203,7 +4203,7 @@ def metrics(self, show_metrics: List[str] = None):
             ):
                 X_train, X_test = self.X.iloc[train_index], self.X.iloc[test_index]
                 y_train, y_test = self.y.iloc[train_index], self.y.iloc[test_index]
-                preds = clone(self.model).fit(X_train, y_train).predict(X_test)
+                preds = clone(self.model).fit(X_train, y_train).predict(X_test.copy())
                 metrics_dict["mean-squared-error"].append(
                     mean_squared_error(y_test, preds)
                 )
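Why the dropna() in the categorical_dict change above matters: unique() on an object column keeps missing values as a float NaN, and Python's sorted() cannot compare a float to the surrounding strings — this is the crash the patch subject refers to. A minimal reproduction (column values illustrative only):

    import numpy as np
    import pandas as pd

    col = pd.Series(["vhigh", np.nan, "low"])

    try:
        sorted(col.unique().tolist())        # a float nan sits next to str values
    except TypeError as err:
        print(err)                           # '<' not supported between 'float' and 'str'

    print(sorted(col.dropna().unique().tolist()))  # ['low', 'vhigh']
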
From 723b4b15e84659a85b338cf5a2d8ef7029fec2ce Mon Sep 17 00:00:00 2001
From: Alexander Zender
Date: Fri, 14 Jul 2023 15:37:39 +0200
Subject: [PATCH 2/8] added NaN value to categorical features

---
 explainerdashboard/explainers.py | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/explainerdashboard/explainers.py b/explainerdashboard/explainers.py
index d859b5e..5dd253a 100644
--- a/explainerdashboard/explainers.py
+++ b/explainerdashboard/explainers.py
@@ -242,6 +242,10 @@ def __init__(
         self.categorical_dict = {
             col: sorted(self.X[col].dropna().unique().tolist()) for col in self.categorical_cols
         }
+        # Add NaN to the list, as it is a valid option for encoders
+        for col in self.categorical_cols:
+            if self.X[col].isnull().values.any():
+                self.categorical_dict[col].append('NaN')
         self.cat_cols = self.onehot_cols + self.categorical_cols
         self.original_cols = self.X.columns
         self.merged_cols = pd.Index(self.regular_cols + self.onehot_cols)
@@ -757,6 +761,10 @@ def get_row_from_input(
             df_merged = pd.DataFrame(dict(zip(cols, inputs)), index=[0]).fillna(
                 self.na_fill
             )[self.merged_cols]
+            # Adjust categorical columns back to a proper NaN value instead of self.na_fill
+            for col, values in self.categorical_dict.items():
+                if 'NaN' in values:
+                    df_merged[col] = df_merged[col].replace(self.na_fill, np.nan)
             if return_merged:
                 return df_merged
             else:
@@ -765,6 +773,10 @@ def get_row_from_input(
         elif len(inputs) == len(self.columns):
             cols = self.columns
             df = pd.DataFrame(dict(zip(cols, inputs)), index=[0]).fillna(self.na_fill)
+            # Unclear whether this is also correct here for user-defined categorical values
+            for col, values in self.categorical_dict.items():
+                if 'NaN' in values:
+                    df[col] = df[col].replace(self.na_fill, np.nan)
             if return_merged:
                 return merge_categorical_columns(df, self.onehot_dict, self.merged_cols)
             else:
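A standalone sketch of what this patch adds (the column name is hypothetical): missing values are first dropped so that sorting works, then re-surfaced as an explicit 'NaN' entry so the dashboard's dropdowns can offer missingness as a selectable category:

    import numpy as np
    import pandas as pd

    X = pd.DataFrame({"buying": ["vhigh", np.nan, "low", "med"]})
    categorical_cols = ["buying"]

    categorical_dict = {
        col: sorted(X[col].dropna().unique().tolist()) for col in categorical_cols
    }
    # NaN is a valid option for downstream encoders, so append it where it occurs
    for col in categorical_cols:
        if X[col].isnull().values.any():
            categorical_dict[col].append('NaN')

    print(categorical_dict)  # {'buying': ['low', 'med', 'vhigh', 'NaN']}
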
From be6cfc08ea981358755fc004fc381beefcfd0342 Mon Sep 17 00:00:00 2001
From: Alexander Zender
Date: Fri, 14 Jul 2023 18:14:35 +0200
Subject: [PATCH 3/8] added conversion for string NaN from the frontend

---
 explainerdashboard/explainers.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/explainerdashboard/explainers.py b/explainerdashboard/explainers.py
index 5dd253a..b2ad582 100644
--- a/explainerdashboard/explainers.py
+++ b/explainerdashboard/explainers.py
@@ -764,7 +764,8 @@ def get_row_from_input(
             # Adjust categorical columns back to a proper NaN value instead of self.na_fill
             for col, values in self.categorical_dict.items():
                 if 'NaN' in values:
-                    df_merged[col] = df_merged[col].replace(self.na_fill, np.nan)
+                    df_merged[col] = df_merged[col].replace(self.na_fill, np.nan)  # a value coming from the existing data is a real NaN
+                    df_merged[col] = df_merged[col].replace('NaN', np.nan)  # a value changed to NaN in the frontend arrives as a string
             if return_merged:
                 return df_merged
             else:
@@ -776,7 +777,8 @@ def get_row_from_input(
             # Unclear whether this is also correct here for user-defined categorical values
             for col, values in self.categorical_dict.items():
                 if 'NaN' in values:
-                    df[col] = df[col].replace(self.na_fill, np.nan)
+                    df[col] = df[col].replace(self.na_fill, np.nan)  # a value coming from the existing data is a real NaN
+                    df[col] = df[col].replace('NaN', np.nan)  # a value changed to NaN in the frontend arrives as a string
             if return_merged:
                 return merge_categorical_columns(df, self.onehot_dict, self.merged_cols)
             else:
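Patches 2 and 3 together normalize two different placeholders back to a real np.nan before a user-edited row reaches the model: the numeric na_fill used when the row is built from existing data, and the literal string 'NaN' a user can pick in the frontend. A small sketch of that round trip (the na_fill value is hypothetical and stands in for self.na_fill):

    import numpy as np
    import pandas as pd

    na_fill = -999  # hypothetical fill value, stands in for self.na_fill
    categorical_dict = {"buying": ["low", "med", "vhigh", "NaN"]}

    row = pd.DataFrame({"buying": ["NaN"]})  # value picked in the frontend dropdown
    for col, values in categorical_dict.items():
        if "NaN" in values:
            row[col] = row[col].replace(na_fill, np.nan)  # placeholder from existing data
            row[col] = row[col].replace("NaN", np.nan)    # string coming from the frontend

    print(row["buying"].isna().tolist())  # [True]
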
From b318e045ca6aca28e06fbaffea53eb63af26ceb2 Mon Sep 17 00:00:00 2001
From: Alexander Zender
Date: Tue, 1 Aug 2023 15:45:15 +0200
Subject: [PATCH 4/8] added test for NaN categorical

removed copy from predict function calls, added test for categorical labels
---
 .gitignore                              |   1 +
 explainerdashboard/explainer_methods.py |   8 +-
 explainerdashboard/explainer_plots.py   |   2 +-
 explainerdashboard/explainers.py        |  14 +--
 tests/test_assets/car.csv               | 111 ++++++++++++++++++++++++
 tests/test_datasets.py                  |  79 +++++++++++++++++
 6 files changed, 203 insertions(+), 12 deletions(-)
 create mode 100644 tests/test_assets/car.csv
 create mode 100644 tests/test_datasets.py

diff --git a/.gitignore b/.gitignore
index 8691c26..91db970 100644
--- a/.gitignore
+++ b/.gitignore
@@ -159,6 +159,7 @@ tests/cli_assets/*
 tests/test_assets/*
 !tests/test_assets/data.csv
 !tests/test_assets/pipeline_data.csv
+!tests/test_assets/car.csv

 db_test.py
 .DS_Store
diff --git a/explainerdashboard/explainer_methods.py b/explainerdashboard/explainer_methods.py
index 1fe6dca..1c1bcad 100644
--- a/explainerdashboard/explainer_methods.py
+++ b/explainerdashboard/explainer_methods.py
@@ -1732,21 +1732,21 @@ def get_xgboost_preds_df(xgbmodel, X_row, pos_label=1):
         if pos_label == 1:
             preds = [
                 xgbmodel.predict(
-                    X_row.copy(), iteration_range=(0, i + 1), output_margin=True
+                    X_row, iteration_range=(0, i + 1), output_margin=True
                 )[0]
                 for i in range(n_trees)
             ]
         elif pos_label == 0:
             preds = [
                 -xgbmodel.predict(
-                    X_row.copy(), iteration_range=(0, i + 1), output_margin=True
+                    X_row, iteration_range=(0, i + 1), output_margin=True
                 )[0]
                 for i in range(n_trees)
             ]
         pred_probas = (np.exp(preds) / (1 + np.exp(preds))).tolist()
     else:
         margins = [
-            xgbmodel.predict(X_row.copy(), iteration_range=(0, i + 1), output_margin=True)[
+            xgbmodel.predict(X_row, iteration_range=(0, i + 1), output_margin=True)[
                 0
             ]
             for i in range(n_trees)
@@ -1758,7 +1758,7 @@

     else:
         preds = [
-            xgbmodel.predict(X_row.copy(), iteration_range=(0, i + 1), output_margin=True)[0]
+            xgbmodel.predict(X_row, iteration_range=(0, i + 1), output_margin=True)[0]
             for i in range(n_trees)
         ]

diff --git a/explainerdashboard/explainer_plots.py b/explainerdashboard/explainer_plots.py
index 0f60b35..887b561 100644
--- a/explainerdashboard/explainer_plots.py
+++ b/explainerdashboard/explainer_plots.py
@@ -2829,7 +2829,7 @@
         {
             "model": range(len(model.estimators_)),
             "prediction": [
-                np.round(m.predict(observation.copy())[0], round)
+                np.round(m.predict(observation)[0], round)
                 for m in model.estimators_
             ],
             "color": colors,
diff --git a/explainerdashboard/explainers.py b/explainerdashboard/explainers.py
index b2ad582..ec47037 100644
--- a/explainerdashboard/explainers.py
+++ b/explainerdashboard/explainers.py
@@ -851,11 +851,11 @@ def get_col_value_plus_prediction(
             if self.is_classifier:
                 if pos_label is None:
                     pos_label = self.pos_label
-                prediction = self.model.predict_proba(X_row.copy())[0][pos_label].squeeze()
+                prediction = self.model.predict_proba(X_row)[0][pos_label].squeeze()
                 if self.model_output == "probability":
                     prediction = 100 * prediction
             elif self.is_regression:
-                prediction = self.model.predict(X_row.copy())[0].squeeze()
+                prediction = self.model.predict(X_row)[0].squeeze()
             return col_value, prediction
         else:
             raise ValueError("You need to pass either index or X_row!")
@@ -982,11 +982,11 @@ def preds(self):
             print("Calculating predictions...", flush=True)
             if self.shap == "skorch":  # skorch model.predict need np.array
                 self._preds = (
-                    self.model.predict(self.X.copy().values).squeeze().astype(self.precision)
+                    self.model.predict(self.X.values).squeeze().astype(self.precision)
                 )
             else:  # Pipelines.predict need pd.DataFrame:
                 self._preds = (
-                    self.model.predict(self.X.copy()).squeeze().astype(self.precision)
+                    self.model.predict(self.X).squeeze().astype(self.precision)
                 )
         return self._preds
@@ -1121,7 +1121,7 @@ def shap_explainer(self):
             def model_predict(data_asarray):
                 data_asframe = pd.DataFrame(data_asarray, columns=self.columns)
-                preds = self.model.predict(data_asframe.copy())
+                preds = self.model.predict(data_asframe)
                 return preds.reshape(len(preds))

             self._shap_explainer = shap.KernelExplainer(
@@ -4159,7 +4159,7 @@ def prediction_result_df(self, index=None, X_row=None, round=3):
             X_row = X_cats_to_X(X_row, self.onehot_dict, self.X.columns)
         if self.shap == "skorch":
             X_row = X_row.values.astype("float32")
-        pred = self.model.predict(X_row.copy()).item()
+        pred = self.model.predict(X_row).item()

         preds_df = pd.DataFrame(columns=["", self.target])
         preds_df = append_dict_to_df(
             preds_df, {"": "Predicted", self.target: f"{pred:.{round}f} {self.units}"}
         )
@@ -4217,7 +4217,7 @@ def metrics(self, show_metrics: List[str] = None):
             ):
                 X_train, X_test = self.X.iloc[train_index], self.X.iloc[test_index]
                 y_train, y_test = self.y.iloc[train_index], self.y.iloc[test_index]
-                preds = clone(self.model).fit(X_train, y_train).predict(X_test.copy())
+                preds = clone(self.model).fit(X_train, y_train).predict(X_test)
                 metrics_dict["mean-squared-error"].append(
                     mean_squared_error(y_test, preds)
                 )
diff --git a/tests/test_assets/car.csv b/tests/test_assets/car.csv
new file mode 100644
index 0000000..32091e2
--- /dev/null
+++ b/tests/test_assets/car.csv
@@ -0,0 +1,111 @@
+buying,maint,doors,persons,lug_boot,safety,class
+,"vhigh","2","2","small","low","unacc"
+"vhigh","vhigh","2","2","small","med","unacc"
+"vhigh","vhigh","2","2","small","high","unacc"
+"vhigh","vhigh","2","2","med","low","unacc"
+"vhigh","vhigh","2","2","med","med","unacc"
+,"vhigh","2","2","med","high","unacc"
+"vhigh","vhigh","2","2","big","low","unacc"
+"vhigh","vhigh","2","2","big","med","unacc"
+"vhigh","vhigh","2","2","big","high","unacc"
+"vhigh","vhigh","2","4","small","low","unacc"
+"vhigh","vhigh","2","4","small","med","unacc"
+"vhigh","vhigh","2","4","small","high","unacc"
+"vhigh","vhigh","2","4","med","low","unacc"
+,"vhigh","2","4","med","med","unacc"
+"vhigh","vhigh","2","4","med","high","unacc"
+"vhigh","vhigh","2","4","big","low","unacc"
+"vhigh","vhigh","2","4","big","med","unacc"
+"vhigh","vhigh","2","4","big","high","unacc"
+,"vhigh","2","more","small","low","unacc"
+"vhigh","vhigh","2","more","small","med","unacc"
+"vhigh","vhigh","2","more","small","high","unacc"
+"vhigh","vhigh","2","more","med","low","unacc"
+"vhigh","vhigh","2","more","med","med","unacc"
+"vhigh","vhigh","2","more","med","high","unacc"
+,"vhigh","2","more","big","low","unacc"
+"vhigh","vhigh","2","more","big","med","unacc"
+"vhigh","vhigh","2","more","big","high","unacc"
+"vhigh","vhigh","3","2","small","low","unacc"
+"vhigh","vhigh","3","2","small","med","unacc"
+"vhigh","vhigh","3","2","small","high","unacc"
+"vhigh","vhigh","3","2","med","low","unacc"
+"vhigh","vhigh","3","2","med","med","unacc"
+"vhigh","vhigh","3","2","med","high","unacc"
+"vhigh","vhigh","3","2","big","low","unacc"
+"vhigh","vhigh","3","2","big","med","unacc"
+,"vhigh","3","2","big","high","unacc"
+"vhigh","vhigh","3","4","small","low","unacc"
+"vhigh","vhigh","3","4","small","med","unacc"
+"vhigh","vhigh","3","4","small","high","unacc"
+"vhigh","vhigh","3","4","med","low","unacc"
+"vhigh","vhigh","3","4","med","med","unacc"
+"vhigh","vhigh","3","4","med","high","unacc"
+"vhigh","vhigh","3","4","big","low","unacc"
+"vhigh","vhigh","3","4","big","med","unacc"
+"vhigh","vhigh","3","4","big","high","unacc"
+"vhigh","vhigh","3","more","small","low","unacc"
+"vhigh","vhigh","3","more","small","med","unacc"
+"vhigh","vhigh","3","more","small","high","unacc"
+"vhigh","vhigh","3","more","med","low","unacc"
+"vhigh","vhigh","3","more","med","med","unacc"
+"vhigh","vhigh","3","more","med","high","unacc"
+"vhigh","vhigh","3","more","big","low","unacc"
+"vhigh","vhigh","3","more","big","med","unacc"
+"vhigh","vhigh","3","more","big","high","unacc"
+"vhigh","vhigh","4","2","small","low","unacc"
+"vhigh","vhigh","4","2","small","med","unacc"
+"vhigh","vhigh","4","2","small","high","unacc"
+"vhigh","vhigh","4","2","med","low","unacc"
+"vhigh","vhigh","4","2","med","med","unacc"
+"vhigh","vhigh","4","2","med","high","unacc"
+"vhigh","vhigh","4","2","big","low","unacc"
+"vhigh","vhigh","4","2","big","med","unacc"
+"vhigh","vhigh","4","2","big","high","unacc"
+"vhigh","vhigh","4","4","small","low","unacc"
+"vhigh","vhigh","4","4","small","med","unacc"
+"vhigh","vhigh","4","4","small","high","unacc"
+"vhigh","vhigh","4","4","med","low","unacc"
+"vhigh","vhigh","4","4","med","med","unacc"
+"vhigh","vhigh","4","4","med","high","unacc"
+"vhigh","vhigh","4","4","big","low","unacc"
+"vhigh","vhigh","4","4","big","med","unacc"
+"vhigh","vhigh","4","4","big","high","unacc"
+"vhigh","vhigh","4","more","small","low","unacc"
+"vhigh","vhigh","4","more","small","med","unacc"
+"vhigh","vhigh","4","more","small","high","unacc"
+"vhigh","vhigh","4","more","med","low","unacc"
+"vhigh","vhigh","4","more","med","med","unacc"
+"vhigh","vhigh","4","more","med","high","unacc"
+"vhigh","vhigh","4","more","big","low","unacc"
+"low","low","4","more","med","high","vgood"
+"low","low","4","more","big","low","unacc"
+"low","low","4","more","big","med","good"
+"low","low","4","more","big","high","vgood"
+"low","low","5more","2","small","low","unacc"
+"low","low","5more","2","small","med","unacc"
+"low","low","5more","2","small","high","acc"
+"low","low","5more","2","med","low","unacc"
+"low","low","5more","2","med","med","acc"
+"low","low","5more","2","med","high","unacc"
+"low","low","5more","2","big","low","unacc"
+"low","low","5more","2","big","med","acc"
+"low","low","5more","2","big","high","unacc"
+"low","low","5more","4","small","low","unacc"
+"low","low","5more","4","small","med","acc"
+"low","low","5more","4","small","high","good"
+"low","low","5more","4","med","low","unacc"
+"low","low","5more","4","med","med","good"
+"low","low","5more","4","med","high","vgood"
+"low","low","5more","4","big","low","unacc"
+"low","low","5more","4","big","med","good"
+"low","low","5more","4","big","high","vgood"
+"low","low","5more","more","small","low","unacc"
+"low","low","5more","more","small","med","acc"
+"low","low","5more","more","small","high","good"
+"low","low","5more","more","med","low","unacc"
+"low","low","5more","more","med","med","good"
+"low","low","5more","more","med","high","vgood"
+"low","low","5more","more","big","low","unacc"
+"low","low","5more","more","big","med","good"
+"low","low","5more","more","big","high","vgood"
\ No newline at end of file
diff --git a/tests/test_datasets.py b/tests/test_datasets.py
new file mode 100644
index 0000000..4620ee1
--- /dev/null
+++ b/tests/test_datasets.py
@@ -0,0 +1,79 @@
+import unittest
+
+from sklearn.ensemble import RandomForestClassifier
+import pandas as pd
+from explainerdashboard import ClassifierExplainer, ExplainerDashboard
+from explainerdashboard.custom import ShapDependenceComposite
+from sklearn.preprocessing import OneHotEncoder
+from sklearn.preprocessing import LabelEncoder
+from sklearn.model_selection import train_test_split
+import os
+
+class CategoricalModelWrapper:
+    def __init__(self, model) -> None:
+        self._model = model
+        pass
+
+    def _perform_one_hot_encoding(self, X, feature, values):
+        one_hot_enc = OneHotEncoder(dtype='int64', sparse_output=False, handle_unknown="ignore").set_output(transform="pandas")
+        one_hot_enc.fit(values)
+        result = one_hot_enc.transform(X[[feature]])
+        for col in result.columns:
+            result = result.rename(columns={ col : col.replace("x0", feature)})
+        return pd.concat([X, result], axis=1).drop(columns=[feature])
+
+    def _perform_label_encoding(self, y):
+        label_enc = LabelEncoder()
+        label_enc.fit([["unacc"],["acc"],["good"],["vgood"]])
+        return pd.Series(label_enc.transform(y.values), name=y.name, index=y.index)
+
+    def _perform_label_decoding(self, y):
+        label_enc = LabelEncoder()
+        label_enc.fit([["unacc"],["acc"],["good"],["vgood"]])
+        return pd.Series(label_enc.inverse_transform(y), name=y.name)
+
+    def _preprocessor(self, X):
+        # Emulate a manual pipeline, e.g. what AutoML solutions can produce
+        # preprocess the categorical features
+        X = self._perform_one_hot_encoding(X, "buying", [["vhigh"],["high"],["med"],["low"]])
+        X = self._perform_one_hot_encoding(X, "maint", [["vhigh"],["high"],["med"],["low"]])
+        X = self._perform_one_hot_encoding(X, "doors", [["2"],["3"],["4"],["5more"]])
+        X = self._perform_one_hot_encoding(X, "persons", [["2"],["4"],["more"]])
+        X = self._perform_one_hot_encoding(X, "lug_boot", [["small"],["med"],["big"]])
+        X = self._perform_one_hot_encoding(X, "safety", [["low"],["med"],["high"]])
+        return X
+
+    def _postprocessor(self, y):
+        return self._perform_label_decoding(y)
+
+    def predict_proba(self, X):
+        X = self._preprocessor(X)
+        probabilities_raw = self._model.predict_proba(X)
+        return probabilities_raw
+
+def generate_categorical_dataset_model_wrapper(categorical_label=False):
+    df = pd.read_csv(os.path.join(os.getcwd(), "tests\\test_assets\\car.csv"))
+    X_train, X_test, y_train, y_test = train_test_split(df.drop(["class"], axis=1), df["class"], test_size=0.2, random_state=42)
+
+    model = RandomForestClassifier(n_estimators=5, max_depth=2)
+    wrapper = CategoricalModelWrapper(model)
+    X_train = wrapper._preprocessor(X_train)
+    y_train = wrapper._perform_label_encoding(y_train)
+    if categorical_label == False:
+        # We only test categorical features and a numerical target
+        y_test = wrapper._perform_label_encoding(y_test)
+    model.fit(X_train, y_train)
+    return CategoricalModelWrapper(model), X_test, y_test
+
+def test_NaN_containing_categorical_dataset():
+    _wrapper, _test_X, _test_y = generate_categorical_dataset_model_wrapper()
+    explainer = ClassifierExplainer(
+        _wrapper, _test_X, _test_y)
+    assert "NaN" in explainer.categorical_dict["buying"]
+
+def test_categorical_label():
+    _wrapper, _test_X, _test_y = generate_categorical_dataset_model_wrapper(True)
+    explainer = ClassifierExplainer(
+        _wrapper, _test_X, _test_y)
+    dashboard = ExplainerDashboard(explainer)
+    assert "unacc" in explainer.labels
From 6d052b7151caa39d5a1cd0cfe30e909e8d781144 Mon Sep 17 00:00:00 2001
From: Alexander Zender
Date: Tue, 1 Aug 2023 16:23:32 +0200
Subject: [PATCH 5/8] added more acc classes to the dataset and dashboard
 generation to the NaN categorical test

---
 tests/test_assets/car.csv | 12 ++++++------
 tests/test_datasets.py    |  6 ++++++
 2 files changed, 12 insertions(+), 6 deletions(-)

diff --git a/tests/test_assets/car.csv b/tests/test_assets/car.csv
index 32091e2..6636705 100644
--- a/tests/test_assets/car.csv
+++ b/tests/test_assets/car.csv
@@ -54,17 +54,17 @@ buying,maint,doors,persons,lug_boot,safety,class
 "vhigh","vhigh","3","more","big","med","unacc"
 "vhigh","vhigh","3","more","big","high","unacc"
 "vhigh","vhigh","4","2","small","low","unacc"
-"vhigh","vhigh","4","2","small","med","unacc"
-"vhigh","vhigh","4","2","small","high","unacc"
+"vhigh","vhigh","4","2","small","med","acc"
+"vhigh","vhigh","4","2","small","high","acc"
 "vhigh","vhigh","4","2","med","low","unacc"
 "vhigh","vhigh","4","2","med","med","unacc"
-"vhigh","vhigh","4","2","med","high","unacc"
+"vhigh","vhigh","4","2","med","high","acc"
 "vhigh","vhigh","4","2","big","low","unacc"
-"vhigh","vhigh","4","2","big","med","unacc"
+"vhigh","vhigh","4","2","big","med","acc"
 "vhigh","vhigh","4","2","big","high","unacc"
-"vhigh","vhigh","4","4","small","low","unacc"
+"vhigh","vhigh","4","4","small","low","acc"
 "vhigh","vhigh","4","4","small","med","unacc"
-"vhigh","vhigh","4","4","small","high","unacc"
+"vhigh","vhigh","4","4","small","high","acc"
 "vhigh","vhigh","4","4","med","low","unacc"
 "vhigh","vhigh","4","4","med","med","unacc"
 "vhigh","vhigh","4","4","med","high","unacc"
diff --git a/tests/test_datasets.py b/tests/test_datasets.py
index 4620ee1..305d65b 100644
--- a/tests/test_datasets.py
+++ b/tests/test_datasets.py
@@ -46,6 +46,11 @@ def _preprocessor(self, X):
     def _postprocessor(self, y):
         return self._perform_label_decoding(y)

+    def predict(self, X):
+        X = self._preprocessor(X)
+        y = self._model.predict(X)
+        return self._postprocessor(y)
+
     def predict_proba(self, X):
         X = self._preprocessor(X)
         probabilities_raw = self._model.predict_proba(X)
         return probabilities_raw
@@ -69,6 +74,7 @@ def test_NaN_containing_categorical_dataset():
     _wrapper, _test_X, _test_y = generate_categorical_dataset_model_wrapper()
     explainer = ClassifierExplainer(
         _wrapper, _test_X, _test_y)
+    dashboard = ExplainerDashboard(explainer)
     assert "NaN" in explainer.categorical_dict["buying"]

 def test_categorical_label():
-"vhigh","vhigh","3","4","small","med","unacc" -"vhigh","vhigh","3","4","small","high","unacc" -"vhigh","vhigh","3","4","med","low","unacc" -"vhigh","vhigh","3","4","med","med","unacc" -"vhigh","vhigh","3","4","med","high","unacc" -"vhigh","vhigh","3","4","big","low","unacc" -"vhigh","vhigh","3","4","big","med","unacc" -"vhigh","vhigh","3","4","big","high","unacc" -"vhigh","vhigh","3","more","small","low","unacc" -"vhigh","vhigh","3","more","small","med","unacc" -"vhigh","vhigh","3","more","small","high","unacc" -"vhigh","vhigh","3","more","med","low","unacc" -"vhigh","vhigh","3","more","med","med","unacc" -"vhigh","vhigh","3","more","med","high","unacc" -"vhigh","vhigh","3","more","big","low","unacc" -"vhigh","vhigh","3","more","big","med","unacc" -"vhigh","vhigh","3","more","big","high","unacc" -"vhigh","vhigh","4","2","small","low","unacc" -"vhigh","vhigh","4","2","small","med","acc" -"vhigh","vhigh","4","2","small","high","acc" -"vhigh","vhigh","4","2","med","low","unacc" -"vhigh","vhigh","4","2","med","med","unacc" -"vhigh","vhigh","4","2","med","high","acc" -"vhigh","vhigh","4","2","big","low","unacc" -"vhigh","vhigh","4","2","big","med","acc" -"vhigh","vhigh","4","2","big","high","unacc" -"vhigh","vhigh","4","4","small","low","acc" -"vhigh","vhigh","4","4","small","med","unacc" -"vhigh","vhigh","4","4","small","high","acc" -"vhigh","vhigh","4","4","med","low","unacc" -"vhigh","vhigh","4","4","med","med","unacc" -"vhigh","vhigh","4","4","med","high","unacc" -"vhigh","vhigh","4","4","big","low","unacc" -"vhigh","vhigh","4","4","big","med","unacc" -"vhigh","vhigh","4","4","big","high","unacc" -"vhigh","vhigh","4","more","small","low","unacc" -"vhigh","vhigh","4","more","small","med","unacc" -"vhigh","vhigh","4","more","small","high","unacc" -"vhigh","vhigh","4","more","med","low","unacc" -"vhigh","vhigh","4","more","med","med","unacc" -"vhigh","vhigh","4","more","med","high","unacc" -"vhigh","vhigh","4","more","big","low","unacc" -"low","low","4","more","med","high","vgood" -"low","low","4","more","big","low","unacc" -"low","low","4","more","big","med","good" -"low","low","4","more","big","high","vgood" -"low","low","5more","2","small","low","unacc" -"low","low","5more","2","small","med","unacc" -"low","low","5more","2","small","high","acc" -"low","low","5more","2","med","low","unacc" -"low","low","5more","2","med","med","acc" -"low","low","5more","2","med","high","unacc" -"low","low","5more","2","big","low","unacc" -"low","low","5more","2","big","med","acc" -"low","low","5more","2","big","high","unacc" -"low","low","5more","4","small","low","unacc" -"low","low","5more","4","small","med","acc" -"low","low","5more","4","small","high","good" -"low","low","5more","4","med","low","unacc" -"low","low","5more","4","med","med","good" -"low","low","5more","4","med","high","vgood" -"low","low","5more","4","big","low","unacc" -"low","low","5more","4","big","med","good" -"low","low","5more","4","big","high","vgood" -"low","low","5more","more","small","low","unacc" -"low","low","5more","more","small","med","acc" -"low","low","5more","more","small","high","good" -"low","low","5more","more","med","low","unacc" -"low","low","5more","more","med","med","good" -"low","low","5more","more","med","high","vgood" -"low","low","5more","more","big","low","unacc" -"low","low","5more","more","big","med","good" -"low","low","5more","more","big","high","vgood" \ No newline at end of file diff --git a/tests/test_datasets.py b/tests/test_datasets.py index 305d65b..c1ec53b 100644 --- a/tests/test_datasets.py +++ 
b/tests/test_datasets.py @@ -1,17 +1,16 @@ -import unittest - from sklearn.ensemble import RandomForestClassifier import pandas as pd from explainerdashboard import ClassifierExplainer, ExplainerDashboard -from explainerdashboard.custom import ShapDependenceComposite from sklearn.preprocessing import OneHotEncoder from sklearn.preprocessing import LabelEncoder from sklearn.model_selection import train_test_split import os +import numpy as np class CategoricalModelWrapper: - def __init__(self, model) -> None: + def __init__(self, model, categorical_label_test) -> None: self._model = model + self._categorical_label_test = categorical_label_test pass def _perform_one_hot_encoding(self, X, feature, values): @@ -24,27 +23,21 @@ def _perform_one_hot_encoding(self, X, feature, values): def _perform_label_encoding(self, y): label_enc = LabelEncoder() - label_enc.fit([["unacc"],["acc"],["good"],["vgood"]]) + label_enc.fit([["Survived"],["Not Survived"]]) return pd.Series(label_enc.transform(y.values), name=y.name, index=y.index) def _perform_label_decoding(self, y): label_enc = LabelEncoder() - label_enc.fit([["unacc"],["acc"],["good"],["vgood"]]) + label_enc.fit([["Survived"],["Not Survived"]]) return pd.Series(label_enc.inverse_transform(y), name=y.name) def _preprocessor(self, X): - #Emulate a manual pipeline, e.g. what AutoML solutions can produce - #preprocess buying - X = self._perform_one_hot_encoding(X, "buying", [["vhigh"],["high"],["med"],["low"]]) - X = self._perform_one_hot_encoding(X, "maint", [["vhigh"],["high"],["med"],["low"]]) - X = self._perform_one_hot_encoding(X, "doors", [["2"],["3"],["4"],["5more"]]) - X = self._perform_one_hot_encoding(X, "persons", [["2"],["4"],["more"]]) - X = self._perform_one_hot_encoding(X, "lug_boot", [["small"],["med"],["big"]]) - X = self._perform_one_hot_encoding(X, "safety", [["low"],["med"],["high"]]) - return X + return X.drop(["Name"], axis=1) def _postprocessor(self, y): - return self._perform_label_decoding(y) + if self._categorical_label_test == True: + y = self._perform_label_decoding(y) + return y def predict(self, X): X = self._preprocessor(X) @@ -56,30 +49,44 @@ def predict_proba(self, X): probabilities_raw = self._model.predict_proba(X) return probabilities_raw -def generate_categorical_dataset_model_wrapper(categorical_label=False): - df = pd.read_csv(os.path.join(os.getcwd(), "tests\\test_assets\\car.csv")) - X_train, X_test, y_train, y_test = train_test_split(df.drop(["class"], axis=1), df["class"], test_size=0.2, random_state=42) - +def generate_categorical_dataset_model_wrapper(categorical_label_test=False): model = RandomForestClassifier(n_estimators=5, max_depth=2) - wrapper = CategoricalModelWrapper(model) + wrapper = CategoricalModelWrapper(model, categorical_label_test) + df = pd.read_csv(os.path.join(os.getcwd(), "tests\\test_assets\\data.csv")) + if categorical_label_test == True: + #Test for categorical label, convert titanic binary numeric label to categorical ["Survived"],["Not Survived"] + df["Survival"] = wrapper._perform_label_decoding(df["Survival"]) + else: + #We only test NaN in categorical features and numerical target + df["Name"][0] = np.nan + df["Name"][10] = np.nan + df["Name"][20] = np.nan + df["Name"][30] = np.nan + df["Name"][40] = np.nan + df["Name"][50] = np.nan + df["Name"][60] = np.nan + df["Name"][70] = np.nan + df["Name"][80] = np.nan + X_train, X_test, y_train, y_test = train_test_split(df.drop(["Survival"], axis=1), df["Survival"], test_size=0.2, random_state=42) + X_train = 
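With the switch to the bundled titanic data, missingness has to be injected by hand (the Name column NaNs above) so that test_NaN_containing_categorical_dataset still has something to assert on. The chained-assignment style df["Name"][0] = np.nan works here, but pandas recommends .loc indexing to avoid the SettingWithCopyWarning; a compact equivalent (synthetic frame, for illustration):

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"Name": [f"passenger {i}" for i in range(100)]})
    df.loc[range(0, 90, 10), "Name"] = np.nan  # every 10th row, same rows as the test
    print(df["Name"].isna().sum())  # 9
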
From f98eae50f2cbbf0e15857a48d736dc1677ecbca8 Mon Sep 17 00:00:00 2001
From: Alexander Zender
Date: Thu, 3 Aug 2023 09:20:41 +0200
Subject: [PATCH 7/8] removed one-hot encoder

---
 tests/test_datasets.py | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/tests/test_datasets.py b/tests/test_datasets.py
index c1ec53b..1c5c81d 100644
--- a/tests/test_datasets.py
+++ b/tests/test_datasets.py
@@ -1,7 +1,6 @@
 from sklearn.ensemble import RandomForestClassifier
 import pandas as pd
 from explainerdashboard import ClassifierExplainer, ExplainerDashboard
-from sklearn.preprocessing import OneHotEncoder
 from sklearn.preprocessing import LabelEncoder
 from sklearn.model_selection import train_test_split
 import os
@@ -12,14 +11,6 @@ def __init__(self, model, categorical_label_test) -> None:
         self._model = model
         self._categorical_label_test = categorical_label_test
         pass
-
-    def _perform_one_hot_encoding(self, X, feature, values):
-        one_hot_enc = OneHotEncoder(dtype='int64', sparse_output=False, handle_unknown="ignore").set_output(transform="pandas")
-        one_hot_enc.fit(values)
-        result = one_hot_enc.transform(X[[feature]])
-        for col in result.columns:
-            result = result.rename(columns={ col : col.replace("x0", feature)})
-        return pd.concat([X, result], axis=1).drop(columns=[feature])

     def _perform_label_encoding(self, y):
         label_enc = LabelEncoder()

From 1f459a78043ce323a87f38702e55e18c544eae4c Mon Sep 17 00:00:00 2001
From: Alexander Zender
Date: Thu, 10 Aug 2023 13:29:54 +0200
Subject: [PATCH 8/8] removed unnecessary copy

---
 explainerdashboard/explainer_methods.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/explainerdashboard/explainer_methods.py b/explainerdashboard/explainer_methods.py
index 1c1bcad..579ac6b 100644
--- a/explainerdashboard/explainer_methods.py
+++ b/explainerdashboard/explainer_methods.py
@@ -572,7 +572,7 @@ def one_vs_all_metric(metric, pos_label, y_true, y_pred):
     sign = 1 if greater_is_better else -1

     def _scorer(clf, X, y):
-        y_pred = clf.predict_proba(X.copy())
+        y_pred = clf.predict_proba(X)
         score = sign * partial_metric(y, y_pred)
         return score
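One last portability note on the test code in this series: it builds the dataset path with the Windows-style literal "tests\\test_assets\\data.csv". A platform-neutral spelling (an assumption about intent, not part of the patches) would let os.path.join pick the separator:

    import os

    # portable alternative to os.path.join(os.getcwd(), "tests\\test_assets\\data.csv"):
    # passing the path segments separately works on both Windows and POSIX systems
    csv_path = os.path.join(os.getcwd(), "tests", "test_assets", "data.csv")
    print(csv_path)
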