Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update AutoML to check for imbalanced binary or category output features #2052

Merged
merged 6 commits into from
May 25, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 10 additions & 2 deletions ludwig/automl/automl.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
_ray_init,
get_available_resources,
get_model_type,
has_imbalanced_output,
set_output_feature_metric,
)
from ludwig.constants import (
Expand Down Expand Up @@ -147,8 +148,10 @@ def create_auto_config(
# Return
:return: (dict) selected model configuration
"""
default_configs = _create_default_config(dataset, target, time_limit_s, random_seed)
model_config, model_category, row_count = _model_select(dataset, default_configs, user_config, use_reference_config)
default_configs, features_metadata = _create_default_config(dataset, target, time_limit_s, random_seed)
model_config, model_category, row_count = _model_select(
dataset, default_configs, features_metadata, user_config, use_reference_config
)
if tune_for_memory:
if ray.is_initialized():
resources = get_available_resources() # check if cluster has GPUS
Expand Down Expand Up @@ -218,6 +221,7 @@ def train_with_config(
def _model_select(
dataset: Union[str, pd.DataFrame, dd.core.DataFrame, DatasetInfo],
default_configs,
features_metadata,
user_config,
use_reference_config: bool,
):
Expand Down Expand Up @@ -271,6 +275,10 @@ def _model_select(
if param in user_config[config_section]:
del base_config["hyperopt"]["parameters"][hyperopt_params]

# check if any binary or category output feature has highly imbalanced minority vs majority values
# note: check is done after any relevant user_config has been applied
has_imbalanced_output(base_config, features_metadata)

# if single output feature, set relevant metric and goal if not already set
base_config = set_output_feature_metric(base_config)

Expand Down
12 changes: 8 additions & 4 deletions ludwig/automl/base_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -131,7 +131,7 @@ def _create_default_config(
if not isinstance(dataset, DatasetInfo):
dataset_info = get_dataset_info(dataset)

input_and_output_feature_config = get_features_config(
input_and_output_feature_config, features_metadata = get_features_config(
dataset_info.fields, dataset_info.row_count, resources, target_name
)
# create set of all feature types appearing in the dataset
Expand Down Expand Up @@ -165,7 +165,7 @@ def _create_default_config(
combiner_config = load_yaml(default_config)
model_configs[COMBINER][combiner_type] = combiner_config

return model_configs
return model_configs, features_metadata


# Read in the score and configuration of a reference model trained by Ludwig for each dataset in a list.
Expand Down Expand Up @@ -194,7 +194,9 @@ def get_dataset_info_from_source(source: DataSource) -> DatasetInfo:
fields = []
for field in source.columns:
dtype = source.get_dtype(field)
num_distinct_values, distinct_values = source.get_distinct_values(field, MAX_DISTINCT_VALUES_TO_RETURN)
num_distinct_values, distinct_values, distinct_values_balance = source.get_distinct_values(
field, MAX_DISTINCT_VALUES_TO_RETURN
)
nonnull_values = source.get_nonnull_values(field)
image_values = source.get_image_values(field)
audio_values = source.get_audio_values(field)
Expand All @@ -207,6 +209,7 @@ def get_dataset_info_from_source(source: DataSource) -> DatasetInfo:
dtype=dtype,
distinct_values=distinct_values,
num_distinct_values=num_distinct_values,
distinct_values_balance=distinct_values_balance,
nonnull_values=nonnull_values,
image_values=image_values,
audio_values=audio_values,
Expand Down Expand Up @@ -241,7 +244,7 @@ def get_features_config(
targets = set(targets)

metadata = get_field_metadata(fields, row_count, resources, targets)
return get_config_from_metadata(metadata, targets)
return get_config_from_metadata(metadata, targets), metadata


def get_config_from_metadata(metadata: List[FieldMetadata], targets: Set[str] = None) -> dict:
Expand Down Expand Up @@ -297,6 +300,7 @@ def get_field_metadata(
excluded=should_exclude(idx, field, dtype, row_count, targets),
mode=infer_mode(field, targets),
missing_values=missing_value_percent,
imbalance_ratio=field.distinct_values_balance,
)
)

Expand Down
6 changes: 5 additions & 1 deletion ludwig/automl/data_source.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,7 +51,11 @@ def get_dtype(self, column: str) -> str:
def get_distinct_values(self, column, max_values_to_return: int) -> Tuple[int, List[str], float]:
    """Summarize the distinct non-null values of ``column``.

    :param column: name of the column in ``self.df`` to inspect
    :param max_values_to_return: upper bound on how many distinct values are returned
    :return: a 3-tuple of
        - the number of distinct non-null values,
        - at most ``max_values_to_return`` of those values, in order of first appearance,
        - the minority/majority balance, i.e. the count of the least frequent value divided
          by the count of the most frequent value (1.0 when the column has no non-null values).
    """
    series = self.df[column]
    unique_values = series.dropna().unique()
    num_unique_values = len(unique_values)

    # value_counts() excludes nulls by default, so an empty or all-null column produces an
    # empty result; idxmax()/idxmin() would raise on it, hence the explicit guard.
    unique_values_counts = series.value_counts()
    if len(unique_values_counts) > 0:
        unique_values_balance = unique_values_counts.min() / unique_values_counts.max()
    else:
        # Nothing to compare: report a perfectly balanced ratio rather than failing.
        unique_values_balance = 1.0

    return num_unique_values, unique_values[:max_values_to_return], unique_values_balance

def get_nonnull_values(self, column: str) -> int:
    """Return the number of non-null entries in ``column`` of ``self.df``.

    :param column: name of the column to inspect
    :return: count of entries that are not null/NaN
    """
    # Series.count() tallies only non-null entries; taking len() of the notnull() mask
    # would give the total row count regardless of how many values are missing.
    return int(self.df[column].count())
Expand Down
34 changes: 33 additions & 1 deletion ludwig/automl/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,20 @@
from numpy import nan_to_num
from pandas import Series

from ludwig.constants import COMBINER, CONFIG, HYPEROPT, NAME, NUMBER, PARAMETERS, SEARCH_ALG, TRAINER, TYPE
from ludwig.constants import (
BINARY,
CATEGORY,
COMBINER,
CONFIG,
HYPEROPT,
IMBALANCE_DETECTION_RATIO,
NAME,
NUMBER,
PARAMETERS,
SEARCH_ALG,
TRAINER,
TYPE,
)
from ludwig.features.feature_registries import output_type_registry
from ludwig.modules.metric_registry import metric_registry
from ludwig.utils.defaults import default_combiner_type
Expand All @@ -29,6 +42,7 @@ class FieldInfo:
dtype: str
key: str = None
distinct_values: List = None
distinct_values_balance: float = 1.0
num_distinct_values: int = 0
nonnull_values: int = 0
image_values: int = 0
Expand All @@ -52,6 +66,7 @@ class FieldMetadata:
excluded: bool
mode: str
missing_values: float
imbalance_ratio: float


def avg_num_tokens(field: Series) -> int:
Expand Down Expand Up @@ -178,3 +193,20 @@ def set_output_feature_metric(base_config):
base_config[HYPEROPT]["metric"] = output_metric
base_config[HYPEROPT]["goal"] = output_goal
return base_config


def has_imbalanced_output(base_config, features_metadata) -> bool:
    """Report whether any binary or category output feature is imbalanced.

    An output feature counts as imbalanced when the metadata entry carrying its name has an
    ``imbalance_ratio`` (minority/majority instance count) below ``IMBALANCE_DETECTION_RATIO``.
    Every imbalanced output feature is logged, so the scan does not stop at the first hit.

    :param base_config: model config whose ``"output_features"`` entries are checked
    :param features_metadata: iterable of per-field metadata exposing ``name`` and ``imbalance_ratio``
    :return: True if at least one binary/category output feature is imbalanced, else False
    """
    found_imbalance = False
    for target in base_config["output_features"]:
        # Only binary and category outputs are subject to the imbalance check.
        if target[TYPE] not in (BINARY, CATEGORY):
            continue
        target_name = target[NAME]
        for metadata in features_metadata:
            if metadata.name != target_name:
                continue
            if metadata.imbalance_ratio < IMBALANCE_DETECTION_RATIO:
                logging.info(f"Imbalance in {target_name}: minority/majority={metadata.imbalance_ratio}")
                found_imbalance = True
                # This feature is flagged; move on to the next output feature.
                break
    return found_imbalance
Comment on lines +210 to +212
Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would it be simpler to return True directly here and then return False at the end of the function?

Copy link
Collaborator Author

@amholler amholler May 24, 2022

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The function is written to handle if there is more than one output feature
(the break is only out of the inner loop) and to log info for each imbalanced feature.
The function returns True if any of the output_features is imbalanced.

Copy link
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I see. If there's value in logging each imbalanced output feature, then I agree that we shouldn't short-circuit.

1 change: 1 addition & 0 deletions ludwig/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -151,6 +151,7 @@
COMBINER = "combiner"

BALANCE_PERCENTAGE_TOLERANCE = 0.03
IMBALANCE_DETECTION_RATIO = 0.05

TABULAR = "tabular"
AUTOML_DEFAULT_TABULAR_MODEL = "tabnet"
Expand Down