Merge branch 'main' into zhb/Causal
zhb000 committed Feb 28, 2022
2 parents 4c8cda1 + 274543c commit a4e2948
Showing 6 changed files with 135 additions and 37 deletions.
@@ -79,7 +79,7 @@
"id": "clinical-henry",
"metadata": {},
"source": [
"First, load the census dataset and specify the different types of features. Then, clean the target feature values to include only 0 and 1."
"First, load the census dataset and specify the different types of features. Compose a pipeline which contains a preprocessor and estimator."
]
},
{
@@ -99,7 +99,7 @@
" y = dataset[[target_feature]]\n",
" return X, y\n",
"\n",
"def clean_data(X, y, target_feature):\n",
"def create_classification_pipeline(X, y, target_feature):\n",
" features = X.columns.values.tolist()\n",
" classes = y[target_feature].unique().tolist()\n",
" pipe_cfg = {\n",
@@ -118,9 +118,13 @@
" ('num_pipe', num_pipe, pipe_cfg['num_cols']),\n",
" ('cat_pipe', cat_pipe, pipe_cfg['cat_cols'])\n",
" ])\n",
" X = feat_pipe.fit_transform(X)\n",
" print(pipe_cfg['cat_cols'])\n",
" return X, feat_pipe, features, classes\n",
"\n",
" # Append classifier to preprocessing pipeline.\n",
" # Now we have a full prediction pipeline.\n",
" pipeline = Pipeline(steps=[('preprocessor', feat_pipe),\n",
" ('model', LGBMClassifier())])\n",
"\n",
" return pipeline\n",
"\n",
"outdirname = 'responsibleai.12.28.21'\n",
"try:\n",
@@ -140,30 +144,25 @@
"train_data = pd.read_csv('adult-train.csv')\n",
"test_data = pd.read_csv('adult-test.csv')\n",
"\n",
"\n",
"X_train_original, y_train = split_label(train_data, target_feature)\n",
"X_test_original, y_test = split_label(test_data, target_feature)\n",
"\n",
"pipeline = create_classification_pipeline(X_train_original, y_train, target_feature)\n",
"\n",
"X_train, feat_pipe, features, classes = clean_data(X_train_original, y_train, target_feature)\n",
"y_train = y_train[target_feature].to_numpy()\n",
"\n",
"X_test = feat_pipe.transform(X_test_original)\n",
"y_test = y_test[target_feature].to_numpy()\n",
"\n",
"train_data[target_feature] = y_train\n",
"test_data[target_feature] = y_test\n",
"\n",
"test_data_sample = test_data.sample(n=500, random_state=5)\n",
"train_data_sample = train_data.sample(n=8000, random_state=5)"
"# Take 500 samples from the test data\n",
"test_data_sample = test_data.sample(n=500, random_state=5)"
]
},
{
"cell_type": "markdown",
"id": "potential-proportion",
"metadata": {},
"source": [
"Train a LightGBM classifier on the training data."
"Train the classification pipeline composed in the previous cell on the training data."
]
},
{
@@ -173,8 +172,7 @@
"metadata": {},
"outputs": [],
"source": [
"clf = LGBMClassifier()\n",
"model = clf.fit(X_train, y_train)"
"model = pipeline.fit(X_train_original, y_train)"
]
},
{
@@ -213,10 +211,8 @@
"metadata": {},
"outputs": [],
"source": [
"dashboard_pipeline = Pipeline(steps=[('preprocess', feat_pipe), ('model', model)])\n",
"\n",
"rai_insights = RAIInsights(dashboard_pipeline, train_data_sample, test_data_sample, target_feature, 'classification',\n",
" categorical_features=categorical_features)"
"rai_insights = RAIInsights(model, train_data, test_data_sample, target_feature, 'classification',\n",
" categorical_features=categorical_features)"
]
},
{
@@ -519,7 +515,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.13"
"version": "3.7.11"
}
},
"nbformat": 4,
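For orientation, the net effect of the notebook changes above can be summarized in a short sketch: the preprocessing ColumnTransformer and the LGBMClassifier are now composed into one sklearn Pipeline, that pipeline is fit on the raw training frame, and the fitted pipeline plus the untransformed data frames are handed to RAIInsights. This is a hedged illustration only; the feature lists, target name, and imputation choices below are placeholders, not the notebook's exact values.

# Minimal sketch of the pipeline-based flow introduced in this notebook diff.
# Placeholder feature lists -- the real notebook derives them from the census CSVs.
import pandas as pd
from lightgbm import LGBMClassifier
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

from responsibleai import RAIInsights

target_feature = 'income'
categorical_features = ['workclass', 'occupation']   # illustrative subset
numeric_features = ['age', 'hours-per-week']          # illustrative subset

num_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler())])
cat_pipe = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
    ('encoder', OneHotEncoder(handle_unknown='ignore'))])
preprocessor = ColumnTransformer(transformers=[
    ('num_pipe', num_pipe, numeric_features),
    ('cat_pipe', cat_pipe, categorical_features)])

# Preprocessing and the LightGBM estimator live in a single pipeline, so the
# raw (untransformed) train/test frames can be passed straight to RAIInsights.
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('model', LGBMClassifier())])

train_data = pd.read_csv('adult-train.csv')
test_data = pd.read_csv('adult-test.csv')
test_data_sample = test_data.sample(n=500, random_state=5)

X_train = train_data.drop(columns=[target_feature])
y_train = train_data[target_feature]
model = pipeline.fit(X_train, y_train)

rai_insights = RAIInsights(model, train_data, test_data_sample,
                           target_feature, 'classification',
                           categorical_features=categorical_features)

Because the preprocessor is inside the model object, the dashboard no longer needs the separate dashboard_pipeline the old notebook assembled, which is why that cell is removed in the hunk above.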
7 changes: 5 additions & 2 deletions raiwidgets/raiwidgets/responsibleai_dashboard_input.py
@@ -41,8 +41,11 @@ def __init__(
self.dashboard_input = analysis.get_data()

self._validate_cohort_list(cohort_list)
# Add cohort_list to dashboard_input
self.dashboard_input.cohortData = cohort_list
if cohort_list is not None:
# Add cohort_list to dashboard_input
self.dashboard_input.cohortData = cohort_list
else:
self.dashboard_input.cohortData = []

self._feature_length = len(self.dashboard_input.dataset.feature_names)
self._row_length = len(self.dashboard_input.dataset.features)
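The guard above makes cohortData default to an empty list when no cohorts are supplied, instead of storing None. A hedged sketch of the caller-side pattern, modelled on the tests added later in this commit; the module path for the cohort classes is an assumption, while the filter values mirror the test code.

# Sketch only: mirrors the cohort usage in the tests added by this commit.
# The import path for Cohort/CohortFilter/CohortFilterMethods is assumed.
from raiwidgets import ResponsibleAIDashboard
from raiwidgets.cohort import Cohort, CohortFilter, CohortFilterMethods


def build_dashboard(rai_insights, with_cohorts=True):
    if not with_cohorts:
        # cohort_list omitted: dashboard_input.cohortData is now [] rather
        # than None, which is what the updated length checks rely on.
        return ResponsibleAIDashboard(rai_insights)

    cohort = Cohort(name='Cohort Continuous')
    cohort.add_cohort_filter(CohortFilter(
        method=CohortFilterMethods.METHOD_LESS,
        arg=[30.5],
        column='HouseAge'))
    return ResponsibleAIDashboard(rai_insights, cohort_list=[cohort])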
34 changes: 33 additions & 1 deletion raiwidgets/tests/conftest.py
@@ -1,16 +1,19 @@
# Copyright (c) Microsoft Corporation
# Licensed under the MIT License.

import pandas as pd
import pytest
import shap
import sklearn
from sklearn.datasets import fetch_california_housing
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

from responsibleai import RAIInsights


@pytest.fixture(scope='session')
def create_rai_insights_object():
def create_rai_insights_object_classification():
X, y = shap.datasets.adult()
y = [1 if r else 0 for r in y]

@@ -41,3 +44,32 @@ def create_rai_insights_object():
skip_cat_limit_checks=True)
ri.compute()
return ri


@pytest.fixture(scope='session')
def create_rai_insights_object_regression():
housing = fetch_california_housing()
X_train, X_test, y_train, y_test = train_test_split(housing.data,
housing.target,
test_size=0.005,
random_state=7)
X_train = pd.DataFrame(X_train, columns=housing.feature_names)
X_test = pd.DataFrame(X_test, columns=housing.feature_names)

rfc = RandomForestRegressor(n_estimators=10, max_depth=4,
random_state=777)
model = rfc.fit(X_train, y_train)

X_train['target'] = y_train
X_test['target'] = y_test

ri = RAIInsights(model, X_train, X_test, 'target', 'regression')
ri.explainer.add()
ri.counterfactual.add(10, desired_range=[5, 10])
ri.error_analysis.add()
ri.causal.add(treatment_features=['AveRooms'],
heterogeneity_features=None,
upper_bound_on_cat_expansion=42,
skip_cat_limit_checks=True)
ri.compute()
return ri
5 changes: 3 additions & 2 deletions raiwidgets/tests/test_model_analysis_dashboard.py
@@ -27,8 +27,9 @@ def validate_rai_dashboard_data(self, rai_widget):
rai_widget.input.dashboard_input.counterfactualData[0],
CounterfactualData)

def test_model_analysis_adult(self, tmpdir, create_rai_insights_object):
ri = create_rai_insights_object
def test_model_analysis_adult(self, tmpdir,
create_rai_insights_object_classification):
ri = create_rai_insights_object_classification
with pytest.warns(
DeprecationWarning,
match="MODULE-DEPRECATION-WARNING: "
83 changes: 74 additions & 9 deletions raiwidgets/tests/test_responsibleai_dashboard.py
@@ -32,16 +32,17 @@ def validate_rai_dashboard_data(self, rai_widget):
rai_widget.input.dashboard_input.counterfactualData[0],
CounterfactualData)

if rai_widget.input.dashboard_input.cohortData is not None:
if len(rai_widget.input.dashboard_input.cohortData) != 0:
assert isinstance(rai_widget.input.dashboard_input.cohortData[0],
Cohort)

# Make sure the dashboard input can be serialized
json.dumps(rai_widget.input.dashboard_input,
default=serialize_json_safe)

def test_responsibleai_adult(self, tmpdir, create_rai_insights_object):
ri = create_rai_insights_object
def test_responsibleai_adult_save_and_load(
self, tmpdir, create_rai_insights_object_classification):
ri = create_rai_insights_object_classification

widget = ResponsibleAIDashboard(ri)
self.validate_rai_dashboard_data(widget)
@@ -53,9 +54,73 @@ def test_responsibleai_adult(self, tmpdir, create_rai_insights_object):
widget_copy = ResponsibleAIDashboard(ri_copy)
self.validate_rai_dashboard_data(widget_copy)

def test_responsibleai_housing_save_and_load(
self, tmpdir, create_rai_insights_object_regression):
ri = create_rai_insights_object_regression

widget = ResponsibleAIDashboard(ri)
self.validate_rai_dashboard_data(widget)

save_dir = tmpdir.mkdir('save-dir')
ri.save(save_dir)
ri_copy = ri.load(save_dir)

widget_copy = ResponsibleAIDashboard(ri_copy)
self.validate_rai_dashboard_data(widget_copy)

def test_responsibleai_housing_with_pre_defined_cohorts(
self, create_rai_insights_object_regression):
ri = create_rai_insights_object_regression

cohort_filter_continuous_1 = CohortFilter(
method=CohortFilterMethods.METHOD_LESS,
arg=[30.5],
column='HouseAge')
cohort_filter_continuous_2 = CohortFilter(
method=CohortFilterMethods.METHOD_GREATER,
arg=[3.0],
column='AveRooms')

user_cohort_continuous = Cohort(name='Cohort Continuous')
user_cohort_continuous.add_cohort_filter(cohort_filter_continuous_1)
user_cohort_continuous.add_cohort_filter(cohort_filter_continuous_2)

cohort_filter_index = CohortFilter(
method=CohortFilterMethods.METHOD_LESS,
arg=[20],
column='Index')

user_cohort_index = Cohort(name='Cohort Index')
user_cohort_index.add_cohort_filter(cohort_filter_index)

cohort_filter_predicted_y = CohortFilter(
method=CohortFilterMethods.METHOD_LESS,
arg=[5.0],
column='Predicted Y')

user_cohort_predicted_y = Cohort(name='Cohort Predicted Y')
user_cohort_predicted_y.add_cohort_filter(cohort_filter_predicted_y)

cohort_filter_true_y = CohortFilter(
method=CohortFilterMethods.METHOD_GREATER,
arg=[1.0],
column='True Y')

user_cohort_true_y = Cohort(name='Cohort True Y')
user_cohort_true_y.add_cohort_filter(cohort_filter_true_y)

widget = ResponsibleAIDashboard(
ri,
cohort_list=[user_cohort_continuous,
user_cohort_index,
user_cohort_predicted_y,
user_cohort_true_y])

self.validate_rai_dashboard_data(widget)

def test_responsibleai_adult_with_pre_defined_cohorts(
self, create_rai_insights_object):
ri = create_rai_insights_object
self, create_rai_insights_object_classification):
ri = create_rai_insights_object_classification

cohort_filter_continuous_1 = CohortFilter(
method=CohortFilterMethods.METHOD_LESS,
@@ -95,8 +160,8 @@ def test_responsibleai_adult_with_pre_defined_cohorts(
self.validate_rai_dashboard_data(widget)

def test_responsibleai_adult_with_ill_defined_cohorts(
self, create_rai_insights_object):
ri = create_rai_insights_object
self, create_rai_insights_object_classification):
ri = create_rai_insights_object_classification

cohort_filter_continuous_1 = CohortFilter(
method=CohortFilterMethods.METHOD_LESS,
@@ -123,8 +188,8 @@ def test_responsibleai_adult_with_ill_defined_cohorts(
ri, cohort_list=[user_cohort_continuous, {}])

def test_responsibleai_adult_duplicate_cohort_names(
self, create_rai_insights_object):
ri = create_rai_insights_object
self, create_rai_insights_object_classification):
ri = create_rai_insights_object_classification

cohort_filter_continuous_1 = CohortFilter(
method=CohortFilterMethods.METHOD_LESS,
5 changes: 3 additions & 2 deletions raiwidgets/tests/test_responsibleai_dashboard_input.py
@@ -8,8 +8,9 @@


class TestResponsibleAIDashboardInput:
def test_model_analysis_adult(self, create_rai_insights_object):
ri = create_rai_insights_object
def test_model_analysis_adult(
self, create_rai_insights_object_classification):
ri = create_rai_insights_object_classification
knn = ri.model
test_data = ri.test

