From 2541e27f4b9d81f4245a05d7822725d55f0e9fd2 Mon Sep 17 00:00:00 2001
From: Alexander Zender
Date: Fri, 14 Jul 2023 13:08:00 +0200
Subject: [PATCH 1/8] added dropna to avoid crash on NaN values

added copy to every predict or predict_proba call so that only definitive
copies are passed
---
 explainerdashboard/explainer_methods.py | 12 ++++++------
 explainerdashboard/explainer_plots.py   |  4 ++--
 explainerdashboard/explainers.py        | 26 ++++++++++++-------------
 3 files changed, 21 insertions(+), 21 deletions(-)

diff --git a/explainerdashboard/explainer_methods.py b/explainerdashboard/explainer_methods.py
index f3920b9..1fe6dca 100644
--- a/explainerdashboard/explainer_methods.py
+++ b/explainerdashboard/explainer_methods.py
@@ -572,7 +572,7 @@ def one_vs_all_metric(metric, pos_label, y_true, y_pred):
     sign = 1 if greater_is_better else -1

     def _scorer(clf, X, y):
-        y_pred = clf.predict_proba(X)
+        y_pred = clf.predict_proba(X.copy())
         score = sign * partial_metric(y, y_pred)
         return score

@@ -915,7 +915,7 @@ def get_pdp_df(
         first_row = X_sample.iloc[[0]].values.astype("float32")
     else:
         first_row = X_sample.iloc[[0]]
-    n_labels = model.predict_proba(first_row).shape[1]
+    n_labels = model.predict_proba(first_row.copy()).shape[1]
     if multiclass:
         pdp_dfs = [pd.DataFrame() for i in range(n_labels)]
     else:
@@ -1732,21 +1732,21 @@ def get_xgboost_preds_df(xgbmodel, X_row, pos_label=1):
         if pos_label == 1:
             preds = [
                 xgbmodel.predict(
-                    X_row, iteration_range=(0, i + 1), output_margin=True
+                    X_row.copy(), iteration_range=(0, i + 1), output_margin=True
                 )[0]
                 for i in range(n_trees)
             ]
         elif pos_label == 0:
             preds = [
                 -xgbmodel.predict(
-                    X_row, iteration_range=(0, i + 1), output_margin=True
+                    X_row.copy(), iteration_range=(0, i + 1), output_margin=True
                 )[0]
                 for i in range(n_trees)
             ]
         pred_probas = (np.exp(preds) / (1 + np.exp(preds))).tolist()
     else:
         margins = [
-            xgbmodel.predict(X_row, iteration_range=(0, i + 1), output_margin=True)[
+            xgbmodel.predict(X_row.copy(), iteration_range=(0, i + 1), output_margin=True)[
                 0
             ]
             for i in range(n_trees)
@@ -1758,7 +1758,7 @@

     else:
         preds = [
-            xgbmodel.predict(X_row, iteration_range=(0, i + 1), output_margin=True)[0]
+            xgbmodel.predict(X_row.copy(), iteration_range=(0, i + 1), output_margin=True)[0]
             for i in range(n_trees)
         ]

diff --git a/explainerdashboard/explainer_plots.py b/explainerdashboard/explainer_plots.py
index 31e0fab..0f60b35 100644
--- a/explainerdashboard/explainer_plots.py
+++ b/explainerdashboard/explainer_plots.py
@@ -2813,7 +2813,7 @@ def plotly_rf_trees(
             "model": range(len(model.estimators_)),
             "prediction": [
                 np.round(
-                    100 * m.predict_proba(observation)[0, pos_label], round
+                    100 * m.predict_proba(observation.copy())[0, pos_label], round
                 )
                 for m in model.estimators_
             ],
@@ -2829,7 +2829,7 @@
         {
             "model": range(len(model.estimators_)),
             "prediction": [
-                np.round(m.predict(observation)[0], round)
+                np.round(m.predict(observation.copy())[0], round)
                 for m in model.estimators_
             ],
             "color": colors,
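A note on why the .copy() calls above are being added: they guard against models or wrapper pipelines whose preprocessing mutates the DataFrame handed to predict/predict_proba in place, which would otherwise silently corrupt the explainer's cached X. A minimal sketch of the failure mode (the MutatingModel class is hypothetical, purely for illustration):

    import pandas as pd

    class MutatingModel:
        """Hypothetical model whose preprocessing mutates its input in place."""
        def predict(self, X):
            X["age"] = X["age"].fillna(0)  # in-place mutation of the caller's frame
            return X["age"].values

    X = pd.DataFrame({"age": [20.0, None]})
    MutatingModel().predict(X)         # caller's X is silently changed
    print(X)                           # the NaN is gone

    X = pd.DataFrame({"age": [20.0, None]})
    MutatingModel().predict(X.copy())  # with a copy, the original X is untouched
    print(X)                           # the NaN is preserved
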
diff --git a/explainerdashboard/explainers.py b/explainerdashboard/explainers.py
index c630a6f..d859b5e 100644
--- a/explainerdashboard/explainers.py
+++ b/explainerdashboard/explainers.py
@@ -240,7 +240,7 @@ def __init__(
             col for col in self.regular_cols if not is_numeric_dtype(self.X[col])
         ]
         self.categorical_dict = {
-            col: sorted(self.X[col].unique().tolist()) for col in self.categorical_cols
+            col: sorted(self.X[col].dropna().unique().tolist()) for col in self.categorical_cols
         }
         self.cat_cols = self.onehot_cols + self.categorical_cols
         self.original_cols = self.X.columns
@@ -837,11 +837,11 @@ def get_col_value_plus_prediction(
             if self.is_classifier:
                 if pos_label is None:
                     pos_label = self.pos_label
-                prediction = self.model.predict_proba(X_row)[0][pos_label].squeeze()
+                prediction = self.model.predict_proba(X_row.copy())[0][pos_label].squeeze()
                 if self.model_output == "probability":
                     prediction = 100 * prediction
             elif self.is_regression:
-                prediction = self.model.predict(X_row)[0].squeeze()
+                prediction = self.model.predict(X_row.copy())[0].squeeze()
             return col_value, prediction
         else:
             raise ValueError("You need to pass either index or X_row!")
@@ -968,11 +968,11 @@ def preds(self):
             print("Calculating predictions...", flush=True)
             if self.shap == "skorch":  # skorch model.predict need np.array
                 self._preds = (
-                    self.model.predict(self.X.values).squeeze().astype(self.precision)
+                    self.model.predict(self.X.copy().values).squeeze().astype(self.precision)
                 )
             else:  # Pipelines.predict need pd.DataFrame:
                 self._preds = (
-                    self.model.predict(self.X).squeeze().astype(self.precision)
+                    self.model.predict(self.X.copy()).squeeze().astype(self.precision)
                 )
         return self._preds
@@ -1107,7 +1107,7 @@ def shap_explainer(self):
             def model_predict(data_asarray):
                 data_asframe = pd.DataFrame(data_asarray, columns=self.columns)
-                preds = self.model.predict(data_asframe)
+                preds = self.model.predict(data_asframe.copy())
                 return preds.reshape(len(preds))

             self._shap_explainer = shap.KernelExplainer(
@@ -2561,11 +2561,11 @@ def pred_probas_raw(self):
                 self.model, "predict_proba"
             ), "model does not have a predict_proba method!"
             if self.shap == "skorch":
-                self._pred_probas = self.model.predict_proba(self.X.values).astype(
+                self._pred_probas = self.model.predict_proba(self.X.copy().values).astype(
                     self.precision
                 )
             else:
-                self._pred_probas = self.model.predict_proba(self.X).astype(
+                self._pred_probas = self.model.predict_proba(self.X.copy()).astype(
                     self.precision
                 )
         return self._pred_probas
@@ -2766,7 +2766,7 @@ def shap_explainer(self):
             def model_predict(data_asarray):
                 data_asframe = pd.DataFrame(data_asarray, columns=self.columns)
-                return self.model.predict_proba(data_asframe)
+                return self.model.predict_proba(data_asframe.copy())

             self._shap_explainer = shap.KernelExplainer(
                 model_predict,
@@ -3249,7 +3249,7 @@ def get_cv_metrics(n_splits):
             ):
                 X_train, X_test = self.X.iloc[train_index], self.X.iloc[test_index]
                 y_train, y_test = self.y.iloc[train_index], self.y.iloc[test_index]
-                preds = clone(self.model).fit(X_train, y_train).predict_proba(X_test)
+                preds = clone(self.model).fit(X_train, y_train).predict_proba(X_test.copy())
                 for label in range(len(self.labels)):
                     for cut in np.linspace(1, 99, 99, dtype=int):
                         y_true = np.where(y_test == label, 1, 0)
@@ -3482,7 +3482,7 @@ def prediction_result_df(
             X_row = X_cats_to_X(X_row, self.onehot_dict, self.X.columns)
         if self.shap == "skorch":
             X_row = X_row.values.astype("float32")
-        pred_probas = self.model.predict_proba(X_row)[0, :].squeeze()
+        pred_probas = self.model.predict_proba(X_row.copy())[0, :].squeeze()

         preds_df = pd.DataFrame(dict(label=self.labels, probability=pred_probas))
         if logodds and all(preds_df.probability < 1 - np.finfo(np.float64).eps):
@@ -4145,7 +4145,7 @@ def prediction_result_df(self, index=None, X_row=None, round=3):
             X_row = X_cats_to_X(X_row, self.onehot_dict, self.X.columns)
         if self.shap == "skorch":
             X_row = X_row.values.astype("float32")
-        pred = self.model.predict(X_row).item()
+        pred = self.model.predict(X_row.copy()).item()

         preds_df = pd.DataFrame(columns=["", self.target])
         preds_df = append_dict_to_df(
             preds_df, {"": "Predicted", self.target: f"{pred:.{round}f} {self.units}"}
         )
@@ -4203,7 +4203,7 @@ def metrics(self, show_metrics: List[str] = None):
             ):
                 X_train, X_test = self.X.iloc[train_index], self.X.iloc[test_index]
                 y_train, y_test = self.y.iloc[train_index], self.y.iloc[test_index]
-                preds = clone(self.model).fit(X_train, y_train).predict(X_test)
+                preds = clone(self.model).fit(X_train, y_train).predict(X_test.copy())
                 metrics_dict["mean-squared-error"].append(
                     mean_squared_error(y_test, preds)
                 )
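Why the dropna() in the categorical_dict change above matters: unique() on an object column keeps missing values as a float NaN, and Python's sorted() cannot compare a float to the surrounding strings — this is the crash the patch subject refers to. A minimal reproduction (column values illustrative only):

    import numpy as np
    import pandas as pd

    col = pd.Series(["vhigh", np.nan, "low"])

    try:
        sorted(col.unique().tolist())        # a float nan sits next to str values
    except TypeError as err:
        print(err)                           # '<' not supported between 'float' and 'str'

    print(sorted(col.dropna().unique().tolist()))  # ['low', 'vhigh']
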
From 723b4b15e84659a85b338cf5a2d8ef7029fec2ce Mon Sep 17 00:00:00 2001
From: Alexander Zender
Date: Fri, 14 Jul 2023 15:37:39 +0200
Subject: [PATCH 2/8] added NaN value to categorical features

---
 explainerdashboard/explainers.py | 12 ++++++++++++
 1 file changed, 12 insertions(+)

diff --git a/explainerdashboard/explainers.py b/explainerdashboard/explainers.py
index d859b5e..5dd253a 100644
--- a/explainerdashboard/explainers.py
+++ b/explainerdashboard/explainers.py
@@ -242,6 +242,10 @@ def __init__(
         self.categorical_dict = {
             col: sorted(self.X[col].dropna().unique().tolist()) for col in self.categorical_cols
         }
+        # Add NaN to the list, as it is a valid option for encoders
+        for col in self.categorical_cols:
+            if self.X[col].isnull().values.any():
+                self.categorical_dict[col].append('NaN')
         self.cat_cols = self.onehot_cols + self.categorical_cols
         self.original_cols = self.X.columns
         self.merged_cols = pd.Index(self.regular_cols + self.onehot_cols)
@@ -757,6 +761,10 @@ def get_row_from_input(
             df_merged = pd.DataFrame(dict(zip(cols, inputs)), index=[0]).fillna(
                 self.na_fill
             )[self.merged_cols]
+            # Adjust categorical columns back to a proper NaN value instead of self.na_fill
+            for col, values in self.categorical_dict.items():
+                if 'NaN' in values:
+                    df_merged[col] = df_merged[col].replace(self.na_fill, np.nan)
             if return_merged:
                 return df_merged
             else:
@@ -765,6 +773,10 @@ def get_row_from_input(
         elif len(inputs) == len(self.columns):
             cols = self.columns
             df = pd.DataFrame(dict(zip(cols, inputs)), index=[0]).fillna(self.na_fill)
+            # Unclear whether this is also correct here for user-defined categorical values
+            for col, values in self.categorical_dict.items():
+                if 'NaN' in values:
+                    df[col] = df[col].replace(self.na_fill, np.nan)
             if return_merged:
                 return merge_categorical_columns(df, self.onehot_dict, self.merged_cols)
             else:
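A standalone sketch of what this patch adds (the column name is hypothetical): missing values are first dropped so that sorting works, then re-surfaced as an explicit 'NaN' entry so the dashboard's dropdowns can offer missingness as a selectable category:

    import numpy as np
    import pandas as pd

    X = pd.DataFrame({"buying": ["vhigh", np.nan, "low", "med"]})
    categorical_cols = ["buying"]

    categorical_dict = {
        col: sorted(X[col].dropna().unique().tolist()) for col in categorical_cols
    }
    # NaN is a valid option for downstream encoders, so append it where it occurs
    for col in categorical_cols:
        if X[col].isnull().values.any():
            categorical_dict[col].append('NaN')

    print(categorical_dict)  # {'buying': ['low', 'med', 'vhigh', 'NaN']}
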
From be6cfc08ea981358755fc004fc381beefcfd0342 Mon Sep 17 00:00:00 2001
From: Alexander Zender
Date: Fri, 14 Jul 2023 18:14:35 +0200
Subject: [PATCH 3/8] added conversion for string NaN from the frontend

---
 explainerdashboard/explainers.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/explainerdashboard/explainers.py b/explainerdashboard/explainers.py
index 5dd253a..b2ad582 100644
--- a/explainerdashboard/explainers.py
+++ b/explainerdashboard/explainers.py
@@ -764,7 +764,8 @@ def get_row_from_input(
             # Adjust categorical columns back to a proper NaN value instead of self.na_fill
             for col, values in self.categorical_dict.items():
                 if 'NaN' in values:
-                    df_merged[col] = df_merged[col].replace(self.na_fill, np.nan)
+                    df_merged[col] = df_merged[col].replace(self.na_fill, np.nan)  # a value coming from the existing data is a real NaN
+                    df_merged[col] = df_merged[col].replace('NaN', np.nan)  # a value changed to NaN in the frontend arrives as a string
             if return_merged:
                 return df_merged
             else:
@@ -776,7 +777,8 @@ def get_row_from_input(
             # Unclear whether this is also correct here for user-defined categorical values
             for col, values in self.categorical_dict.items():
                 if 'NaN' in values:
-                    df[col] = df[col].replace(self.na_fill, np.nan)
+                    df[col] = df[col].replace(self.na_fill, np.nan)  # a value coming from the existing data is a real NaN
+                    df[col] = df[col].replace('NaN', np.nan)  # a value changed to NaN in the frontend arrives as a string
             if return_merged:
                 return merge_categorical_columns(df, self.onehot_dict, self.merged_cols)
             else:
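Patches 2 and 3 together normalize two different placeholders back to a real np.nan before a user-edited row reaches the model: the numeric na_fill used when the row is built from existing data, and the literal string 'NaN' a user can pick in the frontend. A small sketch of that round trip (the na_fill value is hypothetical and stands in for self.na_fill):

    import numpy as np
    import pandas as pd

    na_fill = -999  # hypothetical fill value, stands in for self.na_fill
    categorical_dict = {"buying": ["low", "med", "vhigh", "NaN"]}

    row = pd.DataFrame({"buying": ["NaN"]})  # value picked in the frontend dropdown
    for col, values in categorical_dict.items():
        if "NaN" in values:
            row[col] = row[col].replace(na_fill, np.nan)  # placeholder from existing data
            row[col] = row[col].replace("NaN", np.nan)    # string coming from the frontend

    print(row["buying"].isna().tolist())  # [True]
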
From b318e045ca6aca28e06fbaffea53eb63af26ceb2 Mon Sep 17 00:00:00 2001
From: Alexander Zender
Date: Tue, 1 Aug 2023 15:45:15 +0200
Subject: [PATCH 4/8] added test for NaN categorical

removed copy from predict function calls, added test for categorical labels
---
 .gitignore                              |   1 +
 explainerdashboard/explainer_methods.py |   8 +-
 explainerdashboard/explainer_plots.py   |   2 +-
 explainerdashboard/explainers.py        |  14 +--
 tests/test_assets/car.csv               | 111 ++++++++++++++++++++++++
 tests/test_datasets.py                  |  79 +++++++++++++++++
 6 files changed, 203 insertions(+), 12 deletions(-)
 create mode 100644 tests/test_assets/car.csv
 create mode 100644 tests/test_datasets.py

diff --git a/.gitignore b/.gitignore
index 8691c26..91db970 100644
--- a/.gitignore
+++ b/.gitignore
@@ -159,6 +159,7 @@ tests/cli_assets/*
 tests/test_assets/*
 !tests/test_assets/data.csv
 !tests/test_assets/pipeline_data.csv
+!tests/test_assets/car.csv

 db_test.py
 .DS_Store
diff --git a/explainerdashboard/explainer_methods.py b/explainerdashboard/explainer_methods.py
index 1fe6dca..1c1bcad 100644
--- a/explainerdashboard/explainer_methods.py
+++ b/explainerdashboard/explainer_methods.py
@@ -1732,21 +1732,21 @@ def get_xgboost_preds_df(xgbmodel, X_row, pos_label=1):
         if pos_label == 1:
             preds = [
                 xgbmodel.predict(
-                    X_row.copy(), iteration_range=(0, i + 1), output_margin=True
+                    X_row, iteration_range=(0, i + 1), output_margin=True
                 )[0]
                 for i in range(n_trees)
             ]
         elif pos_label == 0:
             preds = [
                 -xgbmodel.predict(
-                    X_row.copy(), iteration_range=(0, i + 1), output_margin=True
+                    X_row, iteration_range=(0, i + 1), output_margin=True
                 )[0]
                 for i in range(n_trees)
             ]
         pred_probas = (np.exp(preds) / (1 + np.exp(preds))).tolist()
     else:
         margins = [
-            xgbmodel.predict(X_row.copy(), iteration_range=(0, i + 1), output_margin=True)[
+            xgbmodel.predict(X_row, iteration_range=(0, i + 1), output_margin=True)[
                 0
             ]
             for i in range(n_trees)
@@ -1758,7 +1758,7 @@

     else:
         preds = [
-            xgbmodel.predict(X_row.copy(), iteration_range=(0, i + 1), output_margin=True)[0]
+            xgbmodel.predict(X_row, iteration_range=(0, i + 1), output_margin=True)[0]
             for i in range(n_trees)
         ]

diff --git a/explainerdashboard/explainer_plots.py b/explainerdashboard/explainer_plots.py
index 0f60b35..887b561 100644
--- a/explainerdashboard/explainer_plots.py
+++ b/explainerdashboard/explainer_plots.py
@@ -2829,7 +2829,7 @@
         {
             "model": range(len(model.estimators_)),
             "prediction": [
-                np.round(m.predict(observation.copy())[0], round)
+                np.round(m.predict(observation)[0], round)
                 for m in model.estimators_
             ],
             "color": colors,
diff --git a/explainerdashboard/explainers.py b/explainerdashboard/explainers.py
index b2ad582..ec47037 100644
--- a/explainerdashboard/explainers.py
+++ b/explainerdashboard/explainers.py
@@ -851,11 +851,11 @@ def get_col_value_plus_prediction(
             if self.is_classifier:
                 if pos_label is None:
                     pos_label = self.pos_label
-                prediction = self.model.predict_proba(X_row.copy())[0][pos_label].squeeze()
+                prediction = self.model.predict_proba(X_row)[0][pos_label].squeeze()
                 if self.model_output == "probability":
                     prediction = 100 * prediction
             elif self.is_regression:
-                prediction = self.model.predict(X_row.copy())[0].squeeze()
+                prediction = self.model.predict(X_row)[0].squeeze()
             return col_value, prediction
         else:
             raise ValueError("You need to pass either index or X_row!")
@@ -982,11 +982,11 @@ def preds(self):
             print("Calculating predictions...", flush=True)
             if self.shap == "skorch":  # skorch model.predict need np.array
                 self._preds = (
-                    self.model.predict(self.X.copy().values).squeeze().astype(self.precision)
+                    self.model.predict(self.X.values).squeeze().astype(self.precision)
                 )
             else:  # Pipelines.predict need pd.DataFrame:
                 self._preds = (
-                    self.model.predict(self.X.copy()).squeeze().astype(self.precision)
+                    self.model.predict(self.X).squeeze().astype(self.precision)
                 )
         return self._preds
@@ -1121,7 +1121,7 @@ def shap_explainer(self):
             def model_predict(data_asarray):
                 data_asframe = pd.DataFrame(data_asarray, columns=self.columns)
-                preds = self.model.predict(data_asframe.copy())
+                preds = self.model.predict(data_asframe)
                 return preds.reshape(len(preds))

             self._shap_explainer = shap.KernelExplainer(
@@ -4159,7 +4159,7 @@ def prediction_result_df(self, index=None, X_row=None, round=3):
             X_row = X_cats_to_X(X_row, self.onehot_dict, self.X.columns)
         if self.shap == "skorch":
             X_row = X_row.values.astype("float32")
-        pred = self.model.predict(X_row.copy()).item()
+        pred = self.model.predict(X_row).item()

         preds_df = pd.DataFrame(columns=["", self.target])
         preds_df = append_dict_to_df(
             preds_df, {"": "Predicted", self.target: f"{pred:.{round}f} {self.units}"}
         )
@@ -4217,7 +4217,7 @@ def metrics(self, show_metrics: List[str] = None):
             ):
                 X_train, X_test = self.X.iloc[train_index], self.X.iloc[test_index]
                 y_train, y_test = self.y.iloc[train_index], self.y.iloc[test_index]
-                preds = clone(self.model).fit(X_train, y_train).predict(X_test.copy())
+                preds = clone(self.model).fit(X_train, y_train).predict(X_test)
                 metrics_dict["mean-squared-error"].append(
                     mean_squared_error(y_test, preds)
                 )
diff --git a/tests/test_assets/car.csv b/tests/test_assets/car.csv
new file mode 100644
index 0000000..32091e2
--- /dev/null
+++ b/tests/test_assets/car.csv
@@ -0,0 +1,111 @@
+buying,maint,doors,persons,lug_boot,safety,class
+,"vhigh","2","2","small","low","unacc"
+"vhigh","vhigh","2","2","small","med","unacc"
+"vhigh","vhigh","2","2","small","high","unacc"
+"vhigh","vhigh","2","2","med","low","unacc"
+"vhigh","vhigh","2","2","med","med","unacc"
+,"vhigh","2","2","med","high","unacc"
+"vhigh","vhigh","2","2","big","low","unacc"
+"vhigh","vhigh","2","2","big","med","unacc"
+"vhigh","vhigh","2","2","big","high","unacc"
+"vhigh","vhigh","2","4","small","low","unacc"
+"vhigh","vhigh","2","4","small","med","unacc"
+"vhigh","vhigh","2","4","small","high","unacc"
+"vhigh","vhigh","2","4","med","low","unacc"
+,"vhigh","2","4","med","med","unacc"
+"vhigh","vhigh","2","4","med","high","unacc"
+"vhigh","vhigh","2","4","big","low","unacc"
+"vhigh","vhigh","2","4","big","med","unacc"
+"vhigh","vhigh","2","4","big","high","unacc"
+,"vhigh","2","more","small","low","unacc"
+"vhigh","vhigh","2","more","small","med","unacc"
+"vhigh","vhigh","2","more","small","high","unacc"
+"vhigh","vhigh","2","more","med","low","unacc"
+"vhigh","vhigh","2","more","med","med","unacc"
+"vhigh","vhigh","2","more","med","high","unacc"
+,"vhigh","2","more","big","low","unacc"
+"vhigh","vhigh","2","more","big","med","unacc"
+"vhigh","vhigh","2","more","big","high","unacc"
+"vhigh","vhigh","3","2","small","low","unacc"
+"vhigh","vhigh","3","2","small","med","unacc"
+"vhigh","vhigh","3","2","small","high","unacc"
+"vhigh","vhigh","3","2","med","low","unacc"
+"vhigh","vhigh","3","2","med","med","unacc"
+"vhigh","vhigh","3","2","med","high","unacc"
+"vhigh","vhigh","3","2","big","low","unacc"
+"vhigh","vhigh","3","2","big","med","unacc"
+,"vhigh","3","2","big","high","unacc"
+"vhigh","vhigh","3","4","small","low","unacc"
+"vhigh","vhigh","3","4","small","med","unacc"
+"vhigh","vhigh","3","4","small","high","unacc"
+"vhigh","vhigh","3","4","med","low","unacc"
+"vhigh","vhigh","3","4","med","med","unacc"
+"vhigh","vhigh","3","4","med","high","unacc"
+"vhigh","vhigh","3","4","big","low","unacc"
+"vhigh","vhigh","3","4","big","med","unacc"
+"vhigh","vhigh","3","4","big","high","unacc"
+"vhigh","vhigh","3","more","small","low","unacc"
+"vhigh","vhigh","3","more","small","med","unacc"
+"vhigh","vhigh","3","more","small","high","unacc"
+"vhigh","vhigh","3","more","med","low","unacc"
+"vhigh","vhigh","3","more","med","med","unacc"
+"vhigh","vhigh","3","more","med","high","unacc"
+"vhigh","vhigh","3","more","big","low","unacc"
+"vhigh","vhigh","3","more","big","med","unacc"
+"vhigh","vhigh","3","more","big","high","unacc"
+"vhigh","vhigh","4","2","small","low","unacc"
+"vhigh","vhigh","4","2","small","med","unacc"
+"vhigh","vhigh","4","2","small","high","unacc"
+"vhigh","vhigh","4","2","med","low","unacc"
+"vhigh","vhigh","4","2","med","med","unacc"
+"vhigh","vhigh","4","2","med","high","unacc"
+"vhigh","vhigh","4","2","big","low","unacc"
+"vhigh","vhigh","4","2","big","med","unacc"
+"vhigh","vhigh","4","2","big","high","unacc"
+"vhigh","vhigh","4","4","small","low","unacc"
+"vhigh","vhigh","4","4","small","med","unacc"
+"vhigh","vhigh","4","4","small","high","unacc"
+"vhigh","vhigh","4","4","med","low","unacc"
+"vhigh","vhigh","4","4","med","med","unacc"
+"vhigh","vhigh","4","4","med","high","unacc"
+"vhigh","vhigh","4","4","big","low","unacc"
+"vhigh","vhigh","4","4","big","med","unacc"
+"vhigh","vhigh","4","4","big","high","unacc"
+"vhigh","vhigh","4","more","small","low","unacc"
+"vhigh","vhigh","4","more","small","med","unacc"
+"vhigh","vhigh","4","more","small","high","unacc"
+"vhigh","vhigh","4","more","med","low","unacc"
+"vhigh","vhigh","4","more","med","med","unacc"
+"vhigh","vhigh","4","more","med","high","unacc"
+"vhigh","vhigh","4","more","big","low","unacc"
+"low","low","4","more","med","high","vgood"
+"low","low","4","more","big","low","unacc"
+"low","low","4","more","big","med","good"
+"low","low","4","more","big","high","vgood"
+"low","low","5more","2","small","low","unacc"
+"low","low","5more","2","small","med","unacc"
+"low","low","5more","2","small","high","acc"
+"low","low","5more","2","med","low","unacc"
+"low","low","5more","2","med","med","acc"
+"low","low","5more","2","med","high","unacc"
+"low","low","5more","2","big","low","unacc"
+"low","low","5more","2","big","med","acc"
+"low","low","5more","2","big","high","unacc"
+"low","low","5more","4","small","low","unacc"
+"low","low","5more","4","small","med","acc"
+"low","low","5more","4","small","high","good"
+"low","low","5more","4","med","low","unacc"
+"low","low","5more","4","med","med","good"
+"low","low","5more","4","med","high","vgood"
+"low","low","5more","4","big","low","unacc"
+"low","low","5more","4","big","med","good"
+"low","low","5more","4","big","high","vgood"
+"low","low","5more","more","small","low","unacc"
+"low","low","5more","more","small","med","acc"
+"low","low","5more","more","small","high","good"
+"low","low","5more","more","med","low","unacc"
+"low","low","5more","more","med","med","good"
+"low","low","5more","more","med","high","vgood"
+"low","low","5more","more","big","low","unacc"
+"low","low","5more","more","big","med","good"
+"low","low","5more","more","big","high","vgood"
\ No newline at end of file
diff --git a/tests/test_datasets.py b/tests/test_datasets.py
new file mode 100644
index 0000000..4620ee1
--- /dev/null
+++ b/tests/test_datasets.py
@@ -0,0 +1,79 @@
+import unittest
+
+from sklearn.ensemble import RandomForestClassifier
+import pandas as pd
+from explainerdashboard import ClassifierExplainer, ExplainerDashboard
+from explainerdashboard.custom import ShapDependenceComposite
+from sklearn.preprocessing import OneHotEncoder
+from sklearn.preprocessing import LabelEncoder
+from sklearn.model_selection import train_test_split
+import os
+
+class CategoricalModelWrapper:
+    def __init__(self, model) -> None:
+        self._model = model
+        pass
+
+    def _perform_one_hot_encoding(self, X, feature, values):
+        one_hot_enc = OneHotEncoder(dtype='int64', sparse_output=False, handle_unknown="ignore").set_output(transform="pandas")
+        one_hot_enc.fit(values)
+        result = one_hot_enc.transform(X[[feature]])
+        for col in result.columns:
+            result = result.rename(columns={ col : col.replace("x0", feature)})
+        return pd.concat([X, result], axis=1).drop(columns=[feature])
+
+    def _perform_label_encoding(self, y):
+        label_enc = LabelEncoder()
+        label_enc.fit([["unacc"],["acc"],["good"],["vgood"]])
+        return pd.Series(label_enc.transform(y.values), name=y.name, index=y.index)
+
+    def _perform_label_decoding(self, y):
+        label_enc = LabelEncoder()
+        label_enc.fit([["unacc"],["acc"],["good"],["vgood"]])
+        return pd.Series(label_enc.inverse_transform(y), name=y.name)
+
+    def _preprocessor(self, X):
+        # Emulate a manual pipeline, e.g. what AutoML solutions can produce
+        # preprocess the categorical features
+        X = self._perform_one_hot_encoding(X, "buying", [["vhigh"],["high"],["med"],["low"]])
+        X = self._perform_one_hot_encoding(X, "maint", [["vhigh"],["high"],["med"],["low"]])
+        X = self._perform_one_hot_encoding(X, "doors", [["2"],["3"],["4"],["5more"]])
+        X = self._perform_one_hot_encoding(X, "persons", [["2"],["4"],["more"]])
+        X = self._perform_one_hot_encoding(X, "lug_boot", [["small"],["med"],["big"]])
+        X = self._perform_one_hot_encoding(X, "safety", [["low"],["med"],["high"]])
+        return X
+
+    def _postprocessor(self, y):
+        return self._perform_label_decoding(y)
+
+    def predict_proba(self, X):
+        X = self._preprocessor(X)
+        probabilities_raw = self._model.predict_proba(X)
+        return probabilities_raw
+
+def generate_categorical_dataset_model_wrapper(categorical_label=False):
+    df = pd.read_csv(os.path.join(os.getcwd(), "tests\\test_assets\\car.csv"))
+    X_train, X_test, y_train, y_test = train_test_split(df.drop(["class"], axis=1), df["class"], test_size=0.2, random_state=42)
+
+    model = RandomForestClassifier(n_estimators=5, max_depth=2)
+    wrapper = CategoricalModelWrapper(model)
+    X_train = wrapper._preprocessor(X_train)
+    y_train = wrapper._perform_label_encoding(y_train)
+    if categorical_label == False:
+        # We only test categorical features and a numerical target
+        y_test = wrapper._perform_label_encoding(y_test)
+    model.fit(X_train, y_train)
+    return CategoricalModelWrapper(model), X_test, y_test
+
+def test_NaN_containing_categorical_dataset():
+    _wrapper, _test_X, _test_y = generate_categorical_dataset_model_wrapper()
+    explainer = ClassifierExplainer(
+        _wrapper, _test_X, _test_y)
+    assert "NaN" in explainer.categorical_dict["buying"]
+
+def test_categorical_label():
+    _wrapper, _test_X, _test_y = generate_categorical_dataset_model_wrapper(True)
+    explainer = ClassifierExplainer(
+        _wrapper, _test_X, _test_y)
+    dashboard = ExplainerDashboard(explainer)
+    assert "unacc" in explainer.labels
From 6d052b7151caa39d5a1cd0cfe30e909e8d781144 Mon Sep 17 00:00:00 2001
From: Alexander Zender
Date: Tue, 1 Aug 2023 16:23:32 +0200
Subject: [PATCH 5/8] added more acc classes to the dataset and dashboard
 generation to the NaN categorical test

---
 tests/test_assets/car.csv | 12 ++++++------
 tests/test_datasets.py    |  6 ++++++
 2 files changed, 12 insertions(+), 6 deletions(-)

diff --git a/tests/test_assets/car.csv b/tests/test_assets/car.csv
index 32091e2..6636705 100644
--- a/tests/test_assets/car.csv
+++ b/tests/test_assets/car.csv
@@ -54,17 +54,17 @@ buying,maint,doors,persons,lug_boot,safety,class
 "vhigh","vhigh","3","more","big","med","unacc"
 "vhigh","vhigh","3","more","big","high","unacc"
 "vhigh","vhigh","4","2","small","low","unacc"
-"vhigh","vhigh","4","2","small","med","unacc"
-"vhigh","vhigh","4","2","small","high","unacc"
+"vhigh","vhigh","4","2","small","med","acc"
+"vhigh","vhigh","4","2","small","high","acc"
 "vhigh","vhigh","4","2","med","low","unacc"
 "vhigh","vhigh","4","2","med","med","unacc"
-"vhigh","vhigh","4","2","med","high","unacc"
+"vhigh","vhigh","4","2","med","high","acc"
 "vhigh","vhigh","4","2","big","low","unacc"
-"vhigh","vhigh","4","2","big","med","unacc"
+"vhigh","vhigh","4","2","big","med","acc"
 "vhigh","vhigh","4","2","big","high","unacc"
-"vhigh","vhigh","4","4","small","low","unacc"
+"vhigh","vhigh","4","4","small","low","acc"
 "vhigh","vhigh","4","4","small","med","unacc"
-"vhigh","vhigh","4","4","small","high","unacc"
+"vhigh","vhigh","4","4","small","high","acc"
 "vhigh","vhigh","4","4","med","low","unacc"
 "vhigh","vhigh","4","4","med","med","unacc"
 "vhigh","vhigh","4","4","med","high","unacc"
diff --git a/tests/test_datasets.py b/tests/test_datasets.py
index 4620ee1..305d65b 100644
--- a/tests/test_datasets.py
+++ b/tests/test_datasets.py
@@ -46,6 +46,11 @@ def _preprocessor(self, X):
     def _postprocessor(self, y):
         return self._perform_label_decoding(y)

+    def predict(self, X):
+        X = self._preprocessor(X)
+        y = self._model.predict(X)
+        return self._postprocessor(y)
+
     def predict_proba(self, X):
         X = self._preprocessor(X)
         probabilities_raw = self._model.predict_proba(X)
         return probabilities_raw
@@ -69,6 +74,7 @@ def test_NaN_containing_categorical_dataset():
     _wrapper, _test_X, _test_y = generate_categorical_dataset_model_wrapper()
     explainer = ClassifierExplainer(
         _wrapper, _test_X, _test_y)
+    dashboard = ExplainerDashboard(explainer)
     assert "NaN" in explainer.categorical_dict["buying"]

 def test_categorical_label():
-"vhigh","vhigh","3","4","small","med","unacc" -"vhigh","vhigh","3","4","small","high","unacc" -"vhigh","vhigh","3","4","med","low","unacc" -"vhigh","vhigh","3","4","med","med","unacc" -"vhigh","vhigh","3","4","med","high","unacc" -"vhigh","vhigh","3","4","big","low","unacc" -"vhigh","vhigh","3","4","big","med","unacc" -"vhigh","vhigh","3","4","big","high","unacc" -"vhigh","vhigh","3","more","small","low","unacc" -"vhigh","vhigh","3","more","small","med","unacc" -"vhigh","vhigh","3","more","small","high","unacc" -"vhigh","vhigh","3","more","med","low","unacc" -"vhigh","vhigh","3","more","med","med","unacc" -"vhigh","vhigh","3","more","med","high","unacc" -"vhigh","vhigh","3","more","big","low","unacc" -"vhigh","vhigh","3","more","big","med","unacc" -"vhigh","vhigh","3","more","big","high","unacc" -"vhigh","vhigh","4","2","small","low","unacc" -"vhigh","vhigh","4","2","small","med","acc" -"vhigh","vhigh","4","2","small","high","acc" -"vhigh","vhigh","4","2","med","low","unacc" -"vhigh","vhigh","4","2","med","med","unacc" -"vhigh","vhigh","4","2","med","high","acc" -"vhigh","vhigh","4","2","big","low","unacc" -"vhigh","vhigh","4","2","big","med","acc" -"vhigh","vhigh","4","2","big","high","unacc" -"vhigh","vhigh","4","4","small","low","acc" -"vhigh","vhigh","4","4","small","med","unacc" -"vhigh","vhigh","4","4","small","high","acc" -"vhigh","vhigh","4","4","med","low","unacc" -"vhigh","vhigh","4","4","med","med","unacc" -"vhigh","vhigh","4","4","med","high","unacc" -"vhigh","vhigh","4","4","big","low","unacc" -"vhigh","vhigh","4","4","big","med","unacc" -"vhigh","vhigh","4","4","big","high","unacc" -"vhigh","vhigh","4","more","small","low","unacc" -"vhigh","vhigh","4","more","small","med","unacc" -"vhigh","vhigh","4","more","small","high","unacc" -"vhigh","vhigh","4","more","med","low","unacc" -"vhigh","vhigh","4","more","med","med","unacc" -"vhigh","vhigh","4","more","med","high","unacc" -"vhigh","vhigh","4","more","big","low","unacc" -"low","low","4","more","med","high","vgood" -"low","low","4","more","big","low","unacc" -"low","low","4","more","big","med","good" -"low","low","4","more","big","high","vgood" -"low","low","5more","2","small","low","unacc" -"low","low","5more","2","small","med","unacc" -"low","low","5more","2","small","high","acc" -"low","low","5more","2","med","low","unacc" -"low","low","5more","2","med","med","acc" -"low","low","5more","2","med","high","unacc" -"low","low","5more","2","big","low","unacc" -"low","low","5more","2","big","med","acc" -"low","low","5more","2","big","high","unacc" -"low","low","5more","4","small","low","unacc" -"low","low","5more","4","small","med","acc" -"low","low","5more","4","small","high","good" -"low","low","5more","4","med","low","unacc" -"low","low","5more","4","med","med","good" -"low","low","5more","4","med","high","vgood" -"low","low","5more","4","big","low","unacc" -"low","low","5more","4","big","med","good" -"low","low","5more","4","big","high","vgood" -"low","low","5more","more","small","low","unacc" -"low","low","5more","more","small","med","acc" -"low","low","5more","more","small","high","good" -"low","low","5more","more","med","low","unacc" -"low","low","5more","more","med","med","good" -"low","low","5more","more","med","high","vgood" -"low","low","5more","more","big","low","unacc" -"low","low","5more","more","big","med","good" -"low","low","5more","more","big","high","vgood" \ No newline at end of file diff --git a/tests/test_datasets.py b/tests/test_datasets.py index 305d65b..c1ec53b 100644 --- a/tests/test_datasets.py +++ 
b/tests/test_datasets.py @@ -1,17 +1,16 @@ -import unittest - from sklearn.ensemble import RandomForestClassifier import pandas as pd from explainerdashboard import ClassifierExplainer, ExplainerDashboard -from explainerdashboard.custom import ShapDependenceComposite from sklearn.preprocessing import OneHotEncoder from sklearn.preprocessing import LabelEncoder from sklearn.model_selection import train_test_split import os +import numpy as np class CategoricalModelWrapper: - def __init__(self, model) -> None: + def __init__(self, model, categorical_label_test) -> None: self._model = model + self._categorical_label_test = categorical_label_test pass def _perform_one_hot_encoding(self, X, feature, values): @@ -24,27 +23,21 @@ def _perform_one_hot_encoding(self, X, feature, values): def _perform_label_encoding(self, y): label_enc = LabelEncoder() - label_enc.fit([["unacc"],["acc"],["good"],["vgood"]]) + label_enc.fit([["Survived"],["Not Survived"]]) return pd.Series(label_enc.transform(y.values), name=y.name, index=y.index) def _perform_label_decoding(self, y): label_enc = LabelEncoder() - label_enc.fit([["unacc"],["acc"],["good"],["vgood"]]) + label_enc.fit([["Survived"],["Not Survived"]]) return pd.Series(label_enc.inverse_transform(y), name=y.name) def _preprocessor(self, X): - #Emulate a manual pipeline, e.g. what AutoML solutions can produce - #preprocess buying - X = self._perform_one_hot_encoding(X, "buying", [["vhigh"],["high"],["med"],["low"]]) - X = self._perform_one_hot_encoding(X, "maint", [["vhigh"],["high"],["med"],["low"]]) - X = self._perform_one_hot_encoding(X, "doors", [["2"],["3"],["4"],["5more"]]) - X = self._perform_one_hot_encoding(X, "persons", [["2"],["4"],["more"]]) - X = self._perform_one_hot_encoding(X, "lug_boot", [["small"],["med"],["big"]]) - X = self._perform_one_hot_encoding(X, "safety", [["low"],["med"],["high"]]) - return X + return X.drop(["Name"], axis=1) def _postprocessor(self, y): - return self._perform_label_decoding(y) + if self._categorical_label_test == True: + y = self._perform_label_decoding(y) + return y def predict(self, X): X = self._preprocessor(X) @@ -56,30 +49,44 @@ def predict_proba(self, X): probabilities_raw = self._model.predict_proba(X) return probabilities_raw -def generate_categorical_dataset_model_wrapper(categorical_label=False): - df = pd.read_csv(os.path.join(os.getcwd(), "tests\\test_assets\\car.csv")) - X_train, X_test, y_train, y_test = train_test_split(df.drop(["class"], axis=1), df["class"], test_size=0.2, random_state=42) - +def generate_categorical_dataset_model_wrapper(categorical_label_test=False): model = RandomForestClassifier(n_estimators=5, max_depth=2) - wrapper = CategoricalModelWrapper(model) + wrapper = CategoricalModelWrapper(model, categorical_label_test) + df = pd.read_csv(os.path.join(os.getcwd(), "tests\\test_assets\\data.csv")) + if categorical_label_test == True: + #Test for categorical label, convert titanic binary numeric label to categorical ["Survived"],["Not Survived"] + df["Survival"] = wrapper._perform_label_decoding(df["Survival"]) + else: + #We only test NaN in categorical features and numerical target + df["Name"][0] = np.nan + df["Name"][10] = np.nan + df["Name"][20] = np.nan + df["Name"][30] = np.nan + df["Name"][40] = np.nan + df["Name"][50] = np.nan + df["Name"][60] = np.nan + df["Name"][70] = np.nan + df["Name"][80] = np.nan + X_train, X_test, y_train, y_test = train_test_split(df.drop(["Survival"], axis=1), df["Survival"], test_size=0.2, random_state=42) + X_train = 
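With the switch to the bundled titanic data, missingness has to be injected by hand (the Name column NaNs above) so that test_NaN_containing_categorical_dataset still has something to assert on. The chained-assignment style df["Name"][0] = np.nan works here, but pandas recommends .loc indexing to avoid the SettingWithCopyWarning; a compact equivalent (synthetic frame, for illustration):

    import numpy as np
    import pandas as pd

    df = pd.DataFrame({"Name": [f"passenger {i}" for i in range(100)]})
    df.loc[range(0, 90, 10), "Name"] = np.nan  # every 10th row, same rows as the test
    print(df["Name"].isna().sum())  # 9
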
From f98eae50f2cbbf0e15857a48d736dc1677ecbca8 Mon Sep 17 00:00:00 2001
From: Alexander Zender
Date: Thu, 3 Aug 2023 09:20:41 +0200
Subject: [PATCH 7/8] removed one-hot encoder

---
 tests/test_datasets.py | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/tests/test_datasets.py b/tests/test_datasets.py
index c1ec53b..1c5c81d 100644
--- a/tests/test_datasets.py
+++ b/tests/test_datasets.py
@@ -1,7 +1,6 @@
 from sklearn.ensemble import RandomForestClassifier
 import pandas as pd
 from explainerdashboard import ClassifierExplainer, ExplainerDashboard
-from sklearn.preprocessing import OneHotEncoder
 from sklearn.preprocessing import LabelEncoder
 from sklearn.model_selection import train_test_split
 import os
@@ -12,14 +11,6 @@ def __init__(self, model, categorical_label_test) -> None:
         self._model = model
         self._categorical_label_test = categorical_label_test
         pass
-
-    def _perform_one_hot_encoding(self, X, feature, values):
-        one_hot_enc = OneHotEncoder(dtype='int64', sparse_output=False, handle_unknown="ignore").set_output(transform="pandas")
-        one_hot_enc.fit(values)
-        result = one_hot_enc.transform(X[[feature]])
-        for col in result.columns:
-            result = result.rename(columns={ col : col.replace("x0", feature)})
-        return pd.concat([X, result], axis=1).drop(columns=[feature])

     def _perform_label_encoding(self, y):
         label_enc = LabelEncoder()

From 1f459a78043ce323a87f38702e55e18c544eae4c Mon Sep 17 00:00:00 2001
From: Alexander Zender
Date: Thu, 10 Aug 2023 13:29:54 +0200
Subject: [PATCH 8/8] removed unnecessary copy

---
 explainerdashboard/explainer_methods.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/explainerdashboard/explainer_methods.py b/explainerdashboard/explainer_methods.py
index 1c1bcad..579ac6b 100644
--- a/explainerdashboard/explainer_methods.py
+++ b/explainerdashboard/explainer_methods.py
@@ -572,7 +572,7 @@ def one_vs_all_metric(metric, pos_label, y_true, y_pred):
     sign = 1 if greater_is_better else -1

     def _scorer(clf, X, y):
-        y_pred = clf.predict_proba(X.copy())
+        y_pred = clf.predict_proba(X)
         score = sign * partial_metric(y, y_pred)
         return score
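One last portability note on the test code in this series: it builds the dataset path with the Windows-style literal "tests\\test_assets\\data.csv". A platform-neutral spelling (an assumption about intent, not part of the patches) would let os.path.join pick the separator:

    import os

    # portable alternative to os.path.join(os.getcwd(), "tests\\test_assets\\data.csv"):
    # passing the path segments separately works on both Windows and POSIX systems
    csv_path = os.path.join(os.getcwd(), "tests", "test_assets", "data.csv")
    print(csv_path)
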