Commit 89e5d76
Merge branch 'main' into zhb/BugFix
zhb000 committed Apr 8, 2022
2 parents f90cb6e + 417364e commit 89e5d76
Showing 7 changed files with 238 additions and 34 deletions.
21 changes: 11 additions & 10 deletions erroranalysis/erroranalysis/_internal/cohort_filter.py
@@ -4,12 +4,13 @@
 import numpy as np
 import pandas as pd
 
-from erroranalysis._internal.constants import (METHOD, PRED_Y, ROW_INDEX,
-                                               TRUE_Y, CohortFilterMethods,
-                                               ModelTask)
+from erroranalysis._internal.constants import (ARG, COLUMN, COMPOSITE_FILTERS,
+                                               METHOD, OPERATION, PRED_Y,
+                                               ROW_INDEX, TRUE_Y,
+                                               CohortFilterMethods,
+                                               CohortFilterOps, ModelTask)
 from erroranalysis._internal.metrics import get_ordered_classes
 
-COLUMN = 'column'
 MODEL = 'model'
 CLASSIFICATION_OUTCOME = 'Classification outcome'
 
@@ -168,7 +169,7 @@ def build_query(filters, categorical_features, categories):
     for filter in filters:
         if METHOD in filter:
             method = filter[METHOD]
-            arg0 = str(filter['arg'][0])
+            arg0 = str(filter[ARG][0])
             colname = filter[COLUMN]
             if method == CohortFilterMethods.METHOD_GREATER:
                 queries.append("`" + colname + "` > " + arg0)
@@ -179,7 +180,7 @@ def build_query(filters, categorical_features, categories):
             elif method == CohortFilterMethods.METHOD_GREATER_AND_EQUAL:
                 queries.append("`" + colname + "` >= " + arg0)
             elif method == CohortFilterMethods.METHOD_RANGE:
-                arg1 = str(filter['arg'][1])
+                arg1 = str(filter[ARG][1])
                 queries.append("`" + colname + "` >= " + arg0 +
                                ' & `' + colname + "` <= " + arg1)
             elif method == CohortFilterMethods.METHOD_INCLUDES or \
@@ -194,7 +195,7 @@ def build_query(filters, categorical_features, categories):
                 is_categorical = colname in categorical_features
                 if is_categorical:
                     cat_idx = categorical_features.index(colname)
-                    arg0i = filter['arg'][0]
+                    arg0i = filter[ARG][0]
                     arg_cat = categories[cat_idx][arg0i]
                     if isinstance(arg_cat, str):
                         queries.append("`{}` == '{}'".format(colname, arg_cat))
@@ -207,11 +208,11 @@ def build_query(filters, categorical_features, categories):
                     "Unsupported method type: {}".format(method))
         else:
             cqueries = []
-            for composite_filter in filter['compositeFilters']:
+            for composite_filter in filter[COMPOSITE_FILTERS]:
                 cqueries.append(build_query([composite_filter],
                                             categorical_features,
                                             categories))
-            if filter['operation'] == 'and':
+            if filter[OPERATION] == CohortFilterOps.AND:
                 queries.append('(' + ') & ('.join(cqueries) + ')')
             else:
                 queries.append('(' + ') | ('.join(cqueries) + ')')
@@ -245,7 +246,7 @@ def build_bounds_query(filter, colname, method,
     is_categorical = False
    if categorical_features:
         is_categorical = colname in categorical_features
-    for arg in filter['arg']:
+    for arg in filter[ARG]:
         if is_categorical:
             cat_idx = categorical_features.index(colname)
             if isinstance(categories[cat_idx][arg], str):
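For context, build_query compiles each filter dictionary into a pandas query string: column names are backtick-quoted and clauses are joined with & and |. A minimal sketch of the round trip, using a made-up DataFrame and bounds (not part of this commit):

import pandas as pd

# A METHOD_RANGE filter such as {ARG: [20.45, 22.27], COLUMN: 'mean radius',
# METHOD: CohortFilterMethods.METHOD_RANGE} compiles to the query below.
df = pd.DataFrame({'mean radius': [14.1, 21.3, 17.9]})
query = "`mean radius` >= 20.45 & `mean radius` <= 22.27"
print(df.query(query))  # keeps only the 21.3 row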
22 changes: 17 additions & 5 deletions erroranalysis/erroranalysis/_internal/constants.py
@@ -3,14 +3,18 @@
 
 from enum import Enum
 
-PRED_Y = 'pred_y'
-ROW_INDEX = 'Index'
-TRUE_Y = 'true_y'
+ARG = 'arg'
+COLUMN = 'column'
+COMPOSITE_FILTERS = 'compositeFilters'
 DIFF = 'diff'
-SPLIT_INDEX = 'split_index'
-SPLIT_FEATURE = 'split_feature'
 LEAF_INDEX = 'leaf_index'
 METHOD = 'method'
+OPERATION = 'operation'
+PRED_Y = 'pred_y'
+ROW_INDEX = 'Index'
+SPLIT_FEATURE = 'split_feature'
+SPLIT_INDEX = 'split_index'
+TRUE_Y = 'true_y'
 
 
 class CohortFilterMethods:
@@ -27,6 +31,14 @@ class CohortFilterMethods:
     METHOD_RANGE = 'in the range of'
 
 
+class CohortFilterOps:
+    """Cohort filter operations.
+    """
+
+    AND = 'and'
+    OR = 'or'
+
+
 class ModelTask(str, Enum):
     """Provide model task constants.
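Taken together, the new module-level constants and CohortFilterOps replace the raw string keys ('arg', 'column', 'compositeFilters', 'operation') previously hard-coded in cohort_filter.py. A sketch of a composite filter assembled from them, mirroring the shape used in the new tests (the column name and bounds are made up):

from erroranalysis._internal.constants import (ARG, COLUMN, COMPOSITE_FILTERS,
                                               METHOD, OPERATION,
                                               CohortFilterMethods,
                                               CohortFilterOps)

# (age > 30) AND (age < 50), wrapped in a single composite filter.
composite_filters = [{COMPOSITE_FILTERS:
                      [{ARG: [30], COLUMN: 'age',
                        METHOD: CohortFilterMethods.METHOD_GREATER},
                       {ARG: [50], COLUMN: 'age',
                        METHOD: CohortFilterMethods.METHOD_LESS}],
                      OPERATION: CohortFilterOps.AND}]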
62 changes: 59 additions & 3 deletions erroranalysis/erroranalysis/_internal/surrogate_error_tree.py
@@ -124,6 +124,8 @@ def compute_error_tree(analyzer,
     filtered_df = filter_from_cohort(analyzer,
                                      filters,
                                      composite_filters)
+    if filtered_df.shape[0] == 0:
+        return create_empty_node(analyzer.metric)
     row_index = filtered_df[ROW_INDEX]
     true_y = filtered_df[TRUE_Y]
     dropped_cols = [TRUE_Y, ROW_INDEX]
@@ -548,7 +550,62 @@ def node_to_dict(df, tree, nodeid, categories, json,
         node_name = feature_names[tree[SPLIT_FEATURE]]
     else:
         node_name = None
-    json.append({
+    json.append(get_json_node(arg, condition, error, nodeid, method,
+                              node_name, parentid, p_node_name,
+                              total, success, metric_name,
+                              metric_value, is_error_metric))
+    return json, df
+
+
+def create_empty_node(metric):
+    """Create an empty node for the tree.
+    :param metric: The metric to use for the node.
+    :type metric: str
+    :return: The empty node.
+    :rtype: dict
+    """
+    metric_name = metric_to_display_name[metric]
+    is_error_metric = metric in error_metrics
+    return [get_json_node(None, None, 0, 0, None, None, None,
+                          None, 0, 0, metric_name, 0, is_error_metric)]
+
+
+def get_json_node(arg, condition, error, nodeid, method, node_name,
+                  parentid, p_node_name, total, success, metric_name,
+                  metric_value, is_error_metric):
+    """Get the json node for the tree.
+    :param arg: The arg for the node.
+    :type arg: str
+    :param condition: The condition for the node.
+    :type condition: str
+    :param error: The error for the node.
+    :type error: int
+    :param nodeid: The node id for the node.
+    :type nodeid: int
+    :param method: The method for the node.
+    :type method: str
+    :param node_name: The node name for the node.
+    :type node_name: str
+    :param parentid: The parent id for the node.
+    :type parentid: int
+    :param p_node_name: The parent node name for the node.
+    :type p_node_name: str
+    :param total: The total number of instances in the node.
+    :type total: int
+    :param success: The total number of success instances for the node.
+    :type success: int
+    :param metric_name: The metric name for the node.
+    :type metric_name: str
+    :param metric_value: The metric value for the node.
+    :type metric_value: float
+    :param is_error_metric: Whether the metric is an error metric.
+    :type is_error_metric: bool
+    :return: The json node.
+    :rtype: dict
+    """
+    return {
         "arg": arg,
         "badFeaturesRowCount": 0,  # Note: remove this eventually
         "condition": condition,
@@ -566,8 +623,7 @@ def node_to_dict(df, tree, nodeid, categories, json,
         "metricName": metric_name,
         "metricValue": float(metric_value),
         "isErrorMetric": is_error_metric
-    })
-    return json, df
+    }
 
 
 def get_regression_metric_data(df):
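A quick sketch of the new empty-cohort behavior: when every row is filtered out, compute_error_tree returns the single placeholder node built by create_empty_node instead of fitting a surrogate tree on an empty frame. This assumes Metrics.ERROR_RATE is a valid metric key in this package; only keys visible in the diff are checked:

from erroranalysis._internal.constants import Metrics
from erroranalysis._internal.surrogate_error_tree import create_empty_node

node = create_empty_node(Metrics.ERROR_RATE)[0]
assert node['arg'] is None          # arg=None, per create_empty_node above
assert node['metricValue'] == 0.0   # float(0) from get_json_node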
4 changes: 2 additions & 2 deletions erroranalysis/erroranalysis/version.py
@@ -3,6 +3,6 @@
 
 name = 'erroranalysis'
 _major = '0'
-_minor = '1'
-_patch = '31'
+_minor = '2'
+_patch = '0'
 version = '{}.{}.{}'.format(_major, _minor, _patch)
77 changes: 63 additions & 14 deletions erroranalysis/tests/test_surrogate_error_tree.py
@@ -3,17 +3,23 @@
 
 import time
 
+import pandas as pd
 import pytest
 from common_utils import (create_adult_census_data,
                           create_binary_classification_dataset,
                           create_cancer_data, create_diabetes_data,
                           create_iris_data, create_kneighbors_classifier,
                           create_models_classification,
                           create_sklearn_random_forest_regressor,
                           replicate_dataset)
 
-from erroranalysis._internal.constants import (DIFF, LEAF_INDEX, PRED_Y,
-                                               TRUE_Y, Metrics)
+from erroranalysis._internal.cohort_filter import filter_from_cohort
+from erroranalysis._internal.constants import (ARG, COLUMN, COMPOSITE_FILTERS,
+                                               DIFF, LEAF_INDEX, METHOD,
+                                               OPERATION, PRED_Y, ROW_INDEX,
+                                               SPLIT_FEATURE, SPLIT_INDEX,
+                                               TRUE_Y, CohortFilterMethods,
+                                               CohortFilterOps, Metrics)
 from erroranalysis._internal.error_analyzer import ModelAnalyzer
 from erroranalysis._internal.surrogate_error_tree import (
     TreeSide, cache_subtree_features, create_surrogate_model,
@@ -33,9 +39,7 @@ def test_surrogate_error_tree_iris(self):
         models = create_models_classification(X_train, y_train)
 
         for model in models:
-            categorical_features = []
-            run_error_analyzer(model, X_test, y_test, feature_names,
-                               categorical_features)
+            run_error_analyzer(model, X_test, y_test, feature_names)
 
     def test_surrogate_error_tree_int_categorical(self):
         X_train, X_test, y_train, y_test, categorical_features = \
@@ -57,8 +61,7 @@ def test_large_data_surrogate_error_tree(self):
         t0 = time.time()
         categorical_features = []
         model_analyzer = ModelAnalyzer(model, X_test, y_test,
-                                       feature_names,
-                                       categorical_features)
+                                       feature_names, categorical_features)
         max_depth = 3
         num_leaves = 31
         min_child_samples = 20
@@ -173,42 +176,88 @@ def test_parameters(self, metric, min_child_samples,
         X_train, X_test, y_train, y_test, feature_names, _ = create_iris_data()
 
         model = create_kneighbors_classifier(X_train, y_train)
-        categorical_features = []
         run_error_analyzer(model, X_test, y_test, feature_names,
-                           categorical_features,
                            max_depth=max_depth,
                            num_leaves=num_leaves,
                            min_child_samples=min_child_samples,
                            metric=metric)
 
+    def test_empty_cohort_cancer_classification(self):
+        X_train, X_test, y_train, y_test, feature_names, _ = \
+            create_cancer_data()
+
+        model = create_kneighbors_classifier(X_train, y_train)
+
+        composite_filters = [{COMPOSITE_FILTERS:
+                              [{COMPOSITE_FILTERS:
+                                [{ARG: [20.45, 22.27],
+                                  COLUMN: 'mean radius',
+                                  METHOD: CohortFilterMethods.METHOD_RANGE},
+                                 {ARG: [10.88, 14.46],
+                                  COLUMN: 'mean texture',
+                                  METHOD: CohortFilterMethods.METHOD_RANGE}],
+                                OPERATION: CohortFilterOps.AND}],
+                              OPERATION: CohortFilterOps.OR}]
+        run_error_analyzer(model, X_test, y_test, feature_names,
+                           composite_filters=composite_filters)
+
+    def test_empty_cohort_diabetes_regression(self):
+        X_train, X_test, y_train, y_test, feature_names = \
+            create_diabetes_data()
+
+        model = create_kneighbors_classifier(X_train, y_train)
+
+        composite_filters = [{COMPOSITE_FILTERS:
+                              [{COMPOSITE_FILTERS:
+                                [{ARG: [0.06],
+                                  COLUMN: 's1',
+                                  METHOD: CohortFilterMethods.METHOD_GREATER},
+                                 {ARG: [-0.01],
+                                  COLUMN: 's2',
+                                  METHOD: CohortFilterMethods.METHOD_LESS}],
+                                OPERATION: CohortFilterOps.AND}],
+                              OPERATION: CohortFilterOps.OR}]
+        run_error_analyzer(model, X_test, y_test, feature_names,
+                           composite_filters=composite_filters)
+
 
 def run_error_analyzer(model, X_test, y_test, feature_names,
-                       categorical_features, tree_features=None,
+                       categorical_features=None, tree_features=None,
                        max_depth=3, num_leaves=31,
                        min_child_samples=20,
+                       filters=None,
+                       composite_filters=None,
                        metric=None):
     error_analyzer = ModelAnalyzer(model, X_test, y_test,
                                    feature_names,
                                    categorical_features,
                                    metric=metric)
     if tree_features is None:
         tree_features = feature_names
-    filters = None
-    composite_filters = None
     tree = error_analyzer.compute_error_tree(
         tree_features, filters, composite_filters,
         max_depth=max_depth, num_leaves=num_leaves,
         min_child_samples=min_child_samples)
+    validation_data = X_test
+    if filters is not None or composite_filters is not None:
+        validation_data = filter_from_cohort(error_analyzer,
+                                             filters,
+                                             composite_filters)
+        y_test = validation_data[TRUE_Y]
+        validation_data = validation_data.drop(columns=[TRUE_Y, ROW_INDEX])
+        if not isinstance(X_test, pd.DataFrame):
+            validation_data = validation_data.values
+    validation_data_len = len(validation_data)
     assert tree is not None
     assert len(tree) > 0
     assert ERROR in tree[0]
     assert ID in tree[0]
     assert PARENTID in tree[0]
     assert tree[0][PARENTID] is None
     assert SIZE in tree[0]
-    assert tree[0][SIZE] == len(X_test)
+    assert tree[0][SIZE] == validation_data_len
     for node in tree:
-        assert node[SIZE] >= min_child_samples
+        assert node[SIZE] >= min(min_child_samples, validation_data_len)
 
 
 def validate_traversed_tree(tree, tree_dict, max_split_index,
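As a cross-check against build_query, the cancer composite filter above compiles to a nested pandas query whose two ranges no row satisfies simultaneously, which is what drives the new empty-cohort path. A sketch using the first three rows of the sklearn breast-cancer data:

import pandas as pd

query = ("((`mean radius` >= 20.45 & `mean radius` <= 22.27) & "
         "(`mean texture` >= 10.88 & `mean texture` <= 14.46))")
df = pd.DataFrame({'mean radius': [17.99, 20.57, 19.69],
                   'mean texture': [10.38, 17.77, 21.25]})
print(len(df.query(query)))  # 0 -> run_error_analyzer hits create_empty_node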
30 changes: 30 additions & 0 deletions responsibleai/tests/causal/conftest.py
@@ -6,6 +6,8 @@
 import numpy as np
 import pandas as pd
 import pytest
+import shap
+from sklearn.model_selection import train_test_split
 
 from ..common_utils import create_adult_income_dataset, create_housing_data
 
@@ -103,3 +105,31 @@ def parks_data() -> Tuple[pd.DataFrame, pd.DataFrame, str]:
 
     target_feature = 'area'
     return train_df, test_df, target_feature
+
+
+@pytest.fixture(scope='session')
+def get_adult_shap_dataset():
+    X, y = shap.datasets.adult()
+
+    target_feature = "income"
+    y = [1 if y_i else 0 for y_i in y]
+
+    full_data = X.copy()
+    full_data[target_feature] = y
+
+    data_train, data_test = train_test_split(
+        full_data, test_size=1000, random_state=96132,
+        stratify=full_data[target_feature]
+    )
+
+    data_train.reset_index(drop=True, inplace=True)
+    data_test.reset_index(drop=True, inplace=True)
+
+    treatment_features = ["Age", "Sex"]
+    heterogeneity_features = ["Marital Status"]
+
+    cat_cols = ["Race", "Sex", "Workclass", "Marital Status",
+                "Country", "Occupation"]
+
+    return data_train, data_test, treatment_features, \
+        heterogeneity_features, cat_cols, target_feature
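A hypothetical consumer of the new session-scoped fixture (the test body is illustrative only and not part of this commit):

def test_adult_shap_dataset_shapes(get_adult_shap_dataset):
    (data_train, data_test, treatment_features,
     heterogeneity_features, cat_cols, target_feature) = get_adult_shap_dataset
    assert target_feature == "income"
    assert len(data_test) == 1000  # test_size=1000 in the fixture
    assert set(treatment_features) <= set(data_train.columns)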
