Commit 89e5d76
Merge branch 'main' into zhb/BugFix
zhb000 committed Apr 8, 2022
2 parents f90cb6e + 417364e commit 89e5d76
Showing 7 changed files with 238 additions and 34 deletions.
21 changes: 11 additions & 10 deletions erroranalysis/erroranalysis/_internal/cohort_filter.py
@@ -4,12 +4,13 @@
 import numpy as np
 import pandas as pd
 
-from erroranalysis._internal.constants import (METHOD, PRED_Y, ROW_INDEX,
-                                               TRUE_Y, CohortFilterMethods,
-                                               ModelTask)
+from erroranalysis._internal.constants import (ARG, COLUMN, COMPOSITE_FILTERS,
+                                               METHOD, OPERATION, PRED_Y,
+                                               ROW_INDEX, TRUE_Y,
+                                               CohortFilterMethods,
+                                               CohortFilterOps, ModelTask)
 from erroranalysis._internal.metrics import get_ordered_classes
 
-COLUMN = 'column'
 MODEL = 'model'
 CLASSIFICATION_OUTCOME = 'Classification outcome'
 
@@ -168,7 +169,7 @@ def build_query(filters, categorical_features, categories):
     for filter in filters:
         if METHOD in filter:
             method = filter[METHOD]
-            arg0 = str(filter['arg'][0])
+            arg0 = str(filter[ARG][0])
             colname = filter[COLUMN]
             if method == CohortFilterMethods.METHOD_GREATER:
                 queries.append("`" + colname + "` > " + arg0)
@@ -179,7 +180,7 @@ def build_query(filters, categorical_features, categories):
             elif method == CohortFilterMethods.METHOD_GREATER_AND_EQUAL:
                 queries.append("`" + colname + "` >= " + arg0)
             elif method == CohortFilterMethods.METHOD_RANGE:
-                arg1 = str(filter['arg'][1])
+                arg1 = str(filter[ARG][1])
                 queries.append("`" + colname + "` >= " + arg0 +
                                ' & `' + colname + "` <= " + arg1)
             elif method == CohortFilterMethods.METHOD_INCLUDES or \
@@ -194,7 +195,7 @@ def build_query(filters, categorical_features, categories):
                 is_categorical = colname in categorical_features
                 if is_categorical:
                     cat_idx = categorical_features.index(colname)
-                    arg0i = filter['arg'][0]
+                    arg0i = filter[ARG][0]
                     arg_cat = categories[cat_idx][arg0i]
                     if isinstance(arg_cat, str):
                         queries.append("`{}` == '{}'".format(colname, arg_cat))
@@ -207,11 +208,11 @@ def build_query(filters, categorical_features, categories):
                     "Unsupported method type: {}".format(method))
         else:
             cqueries = []
-            for composite_filter in filter['compositeFilters']:
+            for composite_filter in filter[COMPOSITE_FILTERS]:
                 cqueries.append(build_query([composite_filter],
                                             categorical_features,
                                             categories))
-            if filter['operation'] == 'and':
+            if filter[OPERATION] == CohortFilterOps.AND:
                 queries.append('(' + ') & ('.join(cqueries) + ')')
             else:
                 queries.append('(' + ') | ('.join(cqueries) + ')')
@@ -245,7 +246,7 @@ def build_bounds_query(filter, colname, method,
     is_categorical = False
    if categorical_features:
         is_categorical = colname in categorical_features
-    for arg in filter['arg']:
+    for arg in filter[ARG]:
         if is_categorical:
             cat_idx = categorical_features.index(colname)
             if isinstance(categories[cat_idx][arg], str):
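For context, build_query compiles each filter dictionary into a pandas query string: column names are backtick-quoted and clauses are joined with & and |. A minimal sketch of the round trip, using a made-up DataFrame and bounds (not part of this commit):

import pandas as pd

# A METHOD_RANGE filter such as {ARG: [20.45, 22.27], COLUMN: 'mean radius',
# METHOD: CohortFilterMethods.METHOD_RANGE} compiles to the query below.
df = pd.DataFrame({'mean radius': [14.1, 21.3, 17.9]})
query = "`mean radius` >= 20.45 & `mean radius` <= 22.27"
print(df.query(query))  # keeps only the 21.3 row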
22 changes: 17 additions & 5 deletions erroranalysis/erroranalysis/_internal/constants.py
@@ -3,14 +3,18 @@
 
 from enum import Enum
 
-PRED_Y = 'pred_y'
-ROW_INDEX = 'Index'
-TRUE_Y = 'true_y'
+ARG = 'arg'
+COLUMN = 'column'
+COMPOSITE_FILTERS = 'compositeFilters'
 DIFF = 'diff'
-SPLIT_INDEX = 'split_index'
-SPLIT_FEATURE = 'split_feature'
 LEAF_INDEX = 'leaf_index'
 METHOD = 'method'
+OPERATION = 'operation'
+PRED_Y = 'pred_y'
+ROW_INDEX = 'Index'
+SPLIT_FEATURE = 'split_feature'
+SPLIT_INDEX = 'split_index'
+TRUE_Y = 'true_y'
 
 
 class CohortFilterMethods:
@@ -27,6 +31,14 @@ class CohortFilterMethods:
     METHOD_RANGE = 'in the range of'
 
 
+class CohortFilterOps:
+    """Cohort filter operations.
+    """
+
+    AND = 'and'
+    OR = 'or'
+
+
 class ModelTask(str, Enum):
     """Provide model task constants.
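Taken together, the new module-level constants and CohortFilterOps replace the raw string keys ('arg', 'column', 'compositeFilters', 'operation') previously hard-coded in cohort_filter.py. A sketch of a composite filter assembled from them, mirroring the shape used in the new tests (the column name and bounds are made up):

from erroranalysis._internal.constants import (ARG, COLUMN, COMPOSITE_FILTERS,
                                               METHOD, OPERATION,
                                               CohortFilterMethods,
                                               CohortFilterOps)

# (age > 30) AND (age < 50), wrapped in a single composite filter.
composite_filters = [{COMPOSITE_FILTERS:
                      [{ARG: [30], COLUMN: 'age',
                        METHOD: CohortFilterMethods.METHOD_GREATER},
                       {ARG: [50], COLUMN: 'age',
                        METHOD: CohortFilterMethods.METHOD_LESS}],
                      OPERATION: CohortFilterOps.AND}]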
62 changes: 59 additions & 3 deletions erroranalysis/erroranalysis/_internal/surrogate_error_tree.py
@@ -124,6 +124,8 @@ def compute_error_tree(analyzer,
     filtered_df = filter_from_cohort(analyzer,
                                      filters,
                                      composite_filters)
+    if filtered_df.shape[0] == 0:
+        return create_empty_node(analyzer.metric)
     row_index = filtered_df[ROW_INDEX]
     true_y = filtered_df[TRUE_Y]
     dropped_cols = [TRUE_Y, ROW_INDEX]
@@ -548,7 +550,62 @@ def node_to_dict(df, tree, nodeid, categories, json,
         node_name = feature_names[tree[SPLIT_FEATURE]]
     else:
         node_name = None
-    json.append({
+    json.append(get_json_node(arg, condition, error, nodeid, method,
+                              node_name, parentid, p_node_name,
+                              total, success, metric_name,
+                              metric_value, is_error_metric))
+    return json, df
+
+
+def create_empty_node(metric):
+    """Create an empty node for the tree.
+    :param metric: The metric to use for the node.
+    :type metric: str
+    :return: The empty node.
+    :rtype: dict
+    """
+    metric_name = metric_to_display_name[metric]
+    is_error_metric = metric in error_metrics
+    return [get_json_node(None, None, 0, 0, None, None, None,
+                          None, 0, 0, metric_name, 0, is_error_metric)]
+
+
+def get_json_node(arg, condition, error, nodeid, method, node_name,
+                  parentid, p_node_name, total, success, metric_name,
+                  metric_value, is_error_metric):
+    """Get the json node for the tree.
+    :param arg: The arg for the node.
+    :type arg: str
+    :param condition: The condition for the node.
+    :type condition: str
+    :param error: The error for the node.
+    :type error: int
+    :param nodeid: The node id for the node.
+    :type nodeid: int
+    :param method: The method for the node.
+    :type method: str
+    :param node_name: The node name for the node.
+    :type node_name: str
+    :param parentid: The parent id for the node.
+    :type parentid: int
+    :param p_node_name: The parent node name for the node.
+    :type p_node_name: str
+    :param total: The total number of instances in the node.
+    :type total: int
+    :param success: The total number of success instances for the node.
+    :type success: int
+    :param metric_name: The metric name for the node.
+    :type metric_name: str
+    :param metric_value: The metric value for the node.
+    :type metric_value: float
+    :param is_error_metric: Whether the metric is an error metric.
+    :type is_error_metric: bool
+    :return: The json node.
+    :rtype: dict
+    """
+    return {
         "arg": arg,
         "badFeaturesRowCount": 0,  # Note: remove this eventually
         "condition": condition,
@@ -566,8 +623,7 @@ def node_to_dict(df, tree, nodeid, categories, json,
         "metricName": metric_name,
         "metricValue": float(metric_value),
         "isErrorMetric": is_error_metric
-    })
-    return json, df
+    }
 
 
 def get_regression_metric_data(df):
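A quick sketch of the new empty-cohort behavior: when every row is filtered out, compute_error_tree returns the single placeholder node built by create_empty_node instead of fitting a surrogate tree on an empty frame. This assumes Metrics.ERROR_RATE is a valid metric key in this package; only keys visible in the diff are checked:

from erroranalysis._internal.constants import Metrics
from erroranalysis._internal.surrogate_error_tree import create_empty_node

node = create_empty_node(Metrics.ERROR_RATE)[0]
assert node['arg'] is None          # arg=None, per create_empty_node above
assert node['metricValue'] == 0.0   # float(0) from get_json_node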
4 changes: 2 additions & 2 deletions erroranalysis/erroranalysis/version.py
@@ -3,6 +3,6 @@
 
 name = 'erroranalysis'
 _major = '0'
-_minor = '1'
-_patch = '31'
+_minor = '2'
+_patch = '0'
 version = '{}.{}.{}'.format(_major, _minor, _patch)
77 changes: 63 additions & 14 deletions erroranalysis/tests/test_surrogate_error_tree.py
@@ -3,17 +3,23 @@
 
 import time
 
+import pandas as pd
 import pytest
 from common_utils import (create_adult_census_data,
                           create_binary_classification_dataset,
                           create_cancer_data, create_diabetes_data,
                           create_iris_data, create_kneighbors_classifier,
                           create_models_classification,
                           create_sklearn_random_forest_regressor,
                           replicate_dataset)
 
-from erroranalysis._internal.constants import (DIFF, LEAF_INDEX, PRED_Y,
-                                               TRUE_Y, Metrics)
+from erroranalysis._internal.cohort_filter import filter_from_cohort
+from erroranalysis._internal.constants import (ARG, COLUMN, COMPOSITE_FILTERS,
+                                               DIFF, LEAF_INDEX, METHOD,
+                                               OPERATION, PRED_Y, ROW_INDEX,
+                                               SPLIT_FEATURE, SPLIT_INDEX,
+                                               TRUE_Y, CohortFilterMethods,
+                                               CohortFilterOps, Metrics)
 from erroranalysis._internal.error_analyzer import ModelAnalyzer
 from erroranalysis._internal.surrogate_error_tree import (
     TreeSide, cache_subtree_features, create_surrogate_model,
@@ -33,9 +39,7 @@ def test_surrogate_error_tree_iris(self):
         models = create_models_classification(X_train, y_train)
 
         for model in models:
-            categorical_features = []
-            run_error_analyzer(model, X_test, y_test, feature_names,
-                               categorical_features)
+            run_error_analyzer(model, X_test, y_test, feature_names)
 
     def test_surrogate_error_tree_int_categorical(self):
         X_train, X_test, y_train, y_test, categorical_features = \
@@ -57,8 +61,7 @@ def test_large_data_surrogate_error_tree(self):
         t0 = time.time()
         categorical_features = []
         model_analyzer = ModelAnalyzer(model, X_test, y_test,
-                                       feature_names,
-                                       categorical_features)
+                                       feature_names, categorical_features)
         max_depth = 3
         num_leaves = 31
         min_child_samples = 20
@@ -173,42 +176,88 @@ def test_parameters(self, metric, min_child_samples,
         X_train, X_test, y_train, y_test, feature_names, _ = create_iris_data()
 
         model = create_kneighbors_classifier(X_train, y_train)
-        categorical_features = []
         run_error_analyzer(model, X_test, y_test, feature_names,
-                           categorical_features,
                            max_depth=max_depth,
                            num_leaves=num_leaves,
                            min_child_samples=min_child_samples,
                            metric=metric)
 
+    def test_empty_cohort_cancer_classification(self):
+        X_train, X_test, y_train, y_test, feature_names, _ = \
+            create_cancer_data()
+
+        model = create_kneighbors_classifier(X_train, y_train)
+
+        composite_filters = [{COMPOSITE_FILTERS:
+                              [{COMPOSITE_FILTERS:
+                                [{ARG: [20.45, 22.27],
+                                  COLUMN: 'mean radius',
+                                  METHOD: CohortFilterMethods.METHOD_RANGE},
+                                 {ARG: [10.88, 14.46],
+                                  COLUMN: 'mean texture',
+                                  METHOD: CohortFilterMethods.METHOD_RANGE}],
+                                OPERATION: CohortFilterOps.AND}],
+                              OPERATION: CohortFilterOps.OR}]
+        run_error_analyzer(model, X_test, y_test, feature_names,
+                           composite_filters=composite_filters)
+
+    def test_empty_cohort_diabetes_regression(self):
+        X_train, X_test, y_train, y_test, feature_names = \
+            create_diabetes_data()
+
+        model = create_kneighbors_classifier(X_train, y_train)
+
+        composite_filters = [{COMPOSITE_FILTERS:
+                              [{COMPOSITE_FILTERS:
+                                [{ARG: [0.06],
+                                  COLUMN: 's1',
+                                  METHOD: CohortFilterMethods.METHOD_GREATER},
+                                 {ARG: [-0.01],
+                                  COLUMN: 's2',
+                                  METHOD: CohortFilterMethods.METHOD_LESS}],
+                                OPERATION: CohortFilterOps.AND}],
+                              OPERATION: CohortFilterOps.OR}]
+        run_error_analyzer(model, X_test, y_test, feature_names,
+                           composite_filters=composite_filters)
+
 
 def run_error_analyzer(model, X_test, y_test, feature_names,
-                       categorical_features, tree_features=None,
+                       categorical_features=None, tree_features=None,
                        max_depth=3, num_leaves=31,
                        min_child_samples=20,
+                       filters=None,
+                       composite_filters=None,
                        metric=None):
     error_analyzer = ModelAnalyzer(model, X_test, y_test,
                                    feature_names,
                                    categorical_features,
                                    metric=metric)
     if tree_features is None:
         tree_features = feature_names
-    filters = None
-    composite_filters = None
     tree = error_analyzer.compute_error_tree(
         tree_features, filters, composite_filters,
         max_depth=max_depth, num_leaves=num_leaves,
         min_child_samples=min_child_samples)
+    validation_data = X_test
+    if filters is not None or composite_filters is not None:
+        validation_data = filter_from_cohort(error_analyzer,
+                                             filters,
+                                             composite_filters)
+        y_test = validation_data[TRUE_Y]
+        validation_data = validation_data.drop(columns=[TRUE_Y, ROW_INDEX])
+        if not isinstance(X_test, pd.DataFrame):
+            validation_data = validation_data.values
+    validation_data_len = len(validation_data)
     assert tree is not None
     assert len(tree) > 0
     assert ERROR in tree[0]
     assert ID in tree[0]
     assert PARENTID in tree[0]
     assert tree[0][PARENTID] is None
     assert SIZE in tree[0]
-    assert tree[0][SIZE] == len(X_test)
+    assert tree[0][SIZE] == validation_data_len
     for node in tree:
-        assert node[SIZE] >= min_child_samples
+        assert node[SIZE] >= min(min_child_samples, validation_data_len)
 
 
 def validate_traversed_tree(tree, tree_dict, max_split_index,
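As a cross-check against build_query, the cancer composite filter above compiles to a nested pandas query whose two ranges no row satisfies simultaneously, which is what drives the new empty-cohort path. A sketch using the first three rows of the sklearn breast-cancer data:

import pandas as pd

query = ("((`mean radius` >= 20.45 & `mean radius` <= 22.27) & "
         "(`mean texture` >= 10.88 & `mean texture` <= 14.46))")
df = pd.DataFrame({'mean radius': [17.99, 20.57, 19.69],
                   'mean texture': [10.38, 17.77, 21.25]})
print(len(df.query(query)))  # 0 -> run_error_analyzer hits create_empty_node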
30 changes: 30 additions & 0 deletions responsibleai/tests/causal/conftest.py
@@ -6,6 +6,8 @@
 import numpy as np
 import pandas as pd
 import pytest
+import shap
+from sklearn.model_selection import train_test_split
 
 from ..common_utils import create_adult_income_dataset, create_housing_data
 
@@ -103,3 +105,31 @@ def parks_data() -> Tuple[pd.DataFrame, pd.DataFrame, str]:
 
     target_feature = 'area'
     return train_df, test_df, target_feature
+
+
+@pytest.fixture(scope='session')
+def get_adult_shap_dataset():
+    X, y = shap.datasets.adult()
+
+    target_feature = "income"
+    y = [1 if y_i else 0 for y_i in y]
+
+    full_data = X.copy()
+    full_data[target_feature] = y
+
+    data_train, data_test = train_test_split(
+        full_data, test_size=1000, random_state=96132,
+        stratify=full_data[target_feature]
+    )
+
+    data_train.reset_index(drop=True, inplace=True)
+    data_test.reset_index(drop=True, inplace=True)
+
+    treatment_features = ["Age", "Sex"]
+    heterogeneity_features = ["Marital Status"]
+
+    cat_cols = ["Race", "Sex", "Workclass", "Marital Status",
+                "Country", "Occupation"]
+
+    return data_train, data_test, treatment_features, \
+        heterogeneity_features, cat_cols, target_feature
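A hypothetical consumer of the new session-scoped fixture (the test body is illustrative only and not part of this commit):

def test_adult_shap_dataset_shapes(get_adult_shap_dataset):
    (data_train, data_test, treatment_features,
     heterogeneity_features, cat_cols, target_feature) = get_adult_shap_dataset
    assert target_feature == "income"
    assert len(data_test) == 1000  # test_size=1000 in the fixture
    assert set(treatment_features) <= set(data_train.columns)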
