Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Reduce number of distributed tests. #2270

Merged
merged 8 commits into from
Jul 13, 2022
1 change: 0 additions & 1 deletion ludwig/automl/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +0,0 @@
from ludwig.automl.automl import auto_train, cli_init_config, create_auto_config, train_with_config # noqa
2 changes: 1 addition & 1 deletion ludwig/automl/auto_tune_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
try:
import GPUtil
except ImportError:
raise ImportError(" ray is not installed. " "In order to use auto_train please run " "pip install ludwig[ray]")
raise ImportError(" ray is not installed. In order to use auto_train please run pip install ludwig[ray]")

from ludwig.api import LudwigModel
from ludwig.automl.utils import get_model_type
Expand Down
10 changes: 2 additions & 8 deletions ludwig/automl/automl.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,14 +20,8 @@
from ludwig.api import LudwigModel
from ludwig.automl.auto_tune_config import memory_tune_config
from ludwig.automl.base_config import _create_default_config, _get_reference_configs, DatasetInfo, get_dataset_info
from ludwig.automl.utils import (
_add_transfer_config,
_ray_init,
get_available_resources,
get_model_type,
has_imbalanced_output,
set_output_feature_metric,
)
from ludwig.automl.ray_utils import _ray_init, get_available_resources
from ludwig.automl.utils import _add_transfer_config, get_model_type, has_imbalanced_output, set_output_feature_metric
from ludwig.constants import (
AUTOML_DEFAULT_IMAGE_ENCODER,
AUTOML_DEFAULT_TABULAR_MODEL,
Expand Down
92 changes: 4 additions & 88 deletions ludwig/automl/base_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,23 +21,10 @@
from dataclasses_json import dataclass_json, LetterCase

from ludwig.automl.data_source import DataframeSource, DataSource
from ludwig.automl.utils import _ray_init, FieldConfig, FieldInfo, FieldMetadata, get_available_resources
from ludwig.constants import (
AUDIO,
BINARY,
CATEGORY,
COMBINER,
DATE,
EXECUTOR,
HYPEROPT,
IMAGE,
NUMBER,
SCHEDULER,
SEARCH_ALG,
TEXT,
TYPE,
)
from ludwig.utils import strings_utils
from ludwig.automl.field_info import FieldConfig, FieldInfo, FieldMetadata
from ludwig.automl.ray_utils import _ray_init, get_available_resources
from ludwig.automl.type_inference import infer_type, should_exclude
from ludwig.constants import COMBINER, EXECUTOR, HYPEROPT, SCHEDULER, SEARCH_ALG, TEXT, TYPE
from ludwig.utils.data_utils import load_dataset, load_yaml
from ludwig.utils.defaults import default_random_seed

Expand All @@ -55,10 +42,6 @@

encoder_defaults = {"text": {"bert": os.path.join(CONFIG_DIR, "text/bert_config.yaml")}}

# For a given feature, the highest percentage of distinct values out of the total number of rows that we might still
# assign the CATEGORY type.
CATEGORY_TYPE_DISTINCT_VALUE_PERCENTAGE_CUTOFF = 0.5

# Cap for number of distinct values to return.
MAX_DISTINCT_VALUES_TO_RETURN = 10

Expand Down Expand Up @@ -341,73 +324,6 @@ def get_field_metadata(
return metadata


def infer_type(field: FieldInfo, missing_value_percent: float, row_count: int) -> str:
"""Perform type inference on field.

# Inputs
:param field: (FieldInfo) object describing field
:param missing_value_percent: (float) percent of missing values in the column
:param row_count: (int) total number of entries in original dataset

# Return
:return: (str) feature type
"""
if field.dtype == DATE:
return DATE

num_distinct_values = field.num_distinct_values
if num_distinct_values == 0:
return CATEGORY
distinct_values = field.distinct_values
if num_distinct_values <= 2 and missing_value_percent == 0:
# Check that all distinct values are conventional bools.
if strings_utils.are_conventional_bools(distinct_values):
return BINARY

if field.image_values >= 3:
return IMAGE

if field.audio_values >= 3:
return AUDIO

# Use CATEGORY if:
# - The number of distinct values is significantly less than the total number of examples.
# - The distinct values are not all numbers.
# - The distinct values are all numbers but comprise of a perfectly sequential list of integers that suggests the
# values represent categories.
if num_distinct_values < row_count * CATEGORY_TYPE_DISTINCT_VALUE_PERCENTAGE_CUTOFF and (
(not strings_utils.are_all_numbers(distinct_values)) or strings_utils.are_sequential_integers(distinct_values)
):
return CATEGORY

# Use NUMBER if all of the distinct values are numbers.
if strings_utils.are_all_numbers(distinct_values):
return NUMBER

# TODO (ASN): add other modalities (image, etc. )
# Fallback to TEXT.
return TEXT


def should_exclude(idx: int, field: FieldInfo, dtype: str, row_count: int, targets: Set[str]) -> bool:
if field.key == "PRI":
return True

if field.name in targets:
return False

if field.num_distinct_values == 0:
return True

distinct_value_percent = float(field.num_distinct_values) / row_count
if distinct_value_percent == 1.0:
upper_name = field.name.upper()
if (idx == 0 and dtype == NUMBER) or upper_name.endswith("ID") or upper_name.startswith("ID"):
return True

return False


def infer_mode(field: FieldInfo, targets: Set[str] = None) -> str:
if field.name in targets:
return "output"
Expand Down
38 changes: 38 additions & 0 deletions ludwig/automl/field_info.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
from dataclasses import dataclass
from typing import List

from dataclasses_json import dataclass_json, LetterCase


@dataclass_json(letter_case=LetterCase.CAMEL)
@dataclass
class FieldInfo:
    """Per-column summary statistics gathered from a dataset, consumed by type inference.

    Serialized to/from camelCase JSON via dataclass_json.
    """

    name: str  # column name
    dtype: str  # raw dtype label for the column (compared against DATE in infer_type)
    key: str = None  # key flag; "PRI" marks a primary key (presumably from a SQL schema) and forces exclusion
    distinct_values: List = None  # sample of the column's distinct values
    distinct_values_balance: float = 1.0  # NOTE(review): presumably a balance ratio over distinct values — confirm against producer
    num_distinct_values: int = 0  # count of distinct values; 0 means the column is effectively empty
    nonnull_values: int = 0  # count of non-null entries — TODO confirm semantics at the call site
    image_values: int = 0  # number of sampled values that look like images (>= 3 triggers IMAGE type)
    audio_values: int = 0  # number of sampled values that look like audio (>= 3 triggers AUDIO type)
    avg_words: int = None  # average token count for text columns; None when not applicable


@dataclass_json(letter_case=LetterCase.CAMEL)
@dataclass
class FieldConfig:
    """Minimal Ludwig feature config entry derived for a single field.

    Serialized to/from camelCase JSON via dataclass_json.
    """

    name: str  # feature name
    column: str  # source dataset column the feature reads from
    type: str  # inferred Ludwig feature type (e.g. the value returned by infer_type)


@dataclass_json(letter_case=LetterCase.CAMEL)
@dataclass
class FieldMetadata:
    """Inference results and quality signals for one field of the dataset.

    Serialized to/from camelCase JSON via dataclass_json.
    """

    name: str  # field name
    config: FieldConfig  # the generated feature config for this field
    excluded: bool  # True when the field should be dropped (see should_exclude)
    mode: str  # "input" or "output" role — presumably set by infer_mode; confirm against caller
    missing_values: float  # fraction (or percent) of missing values — TODO confirm units at the producer
    imbalance_ratio: float  # NOTE(review): presumably majority/minority class ratio — confirm against producer
30 changes: 30 additions & 0 deletions ludwig/automl/ray_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
import logging
import os

try:
    import ray
except ImportError:
    # Single literal instead of implicit concatenation of three fragments, which
    # produced awkward internal spacing; matches the message format this change
    # set uses in auto_tune_config.py. Ray is an optional dependency, so we fail
    # with an actionable install hint.
    raise ImportError(" ray is not installed. In order to use auto_train please run pip install ludwig[ray]")


def get_available_resources() -> dict:
    """Return the Ray cluster's total GPU and CPU counts.

    # Return
    :return: (dict) {"gpu": <total gpus>, "cpu": <total cpus>}, 0 when absent
    """
    cluster_totals = ray.cluster_resources()
    return {
        "gpu": cluster_totals.get("GPU", 0),
        "cpu": cluster_totals.get("CPU", 0),
    }


def _ray_init():
    """Ensure a Ray runtime is available, preferring an existing cluster.

    No-op when Ray is already initialized. Otherwise tries to attach to a
    running cluster ("auto"); on ConnectionError falls back to starting a
    local one.
    """
    if ray.is_initialized():
        return

    # Forcibly terminate trial requested to stop after this amount of time passes
    # (setdefault: respects a value already set in the environment).
    os.environ.setdefault("TUNE_FORCE_TRIAL_CLEANUP_S", "120")

    try:
        # "auto" attaches to an already-running cluster and raises
        # ConnectionError when none is reachable.
        ray.init("auto", ignore_reinit_error=True)
    except ConnectionError:
        logging.info("Initializing new Ray cluster...")
        ray.init()
76 changes: 76 additions & 0 deletions ludwig/automl/type_inference.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
from typing import Set

from ludwig.automl.field_info import FieldInfo
from ludwig.constants import AUDIO, BINARY, CATEGORY, DATE, IMAGE, NUMBER, TEXT
from ludwig.utils import strings_utils

# For a given feature, the highest percentage of distinct values out of the total number of rows that we might still
# assign the CATEGORY type.
CATEGORY_TYPE_DISTINCT_VALUE_PERCENTAGE_CUTOFF = 0.5


def infer_type(field: FieldInfo, missing_value_percent: float, row_count: int) -> str:
    """Perform type inference on field.

    # Inputs
    :param field: (FieldInfo) object describing field
    :param missing_value_percent: (float) percent of missing values in the column
    :param row_count: (int) total number of entries in original dataset

    # Return
    :return: (str) feature type
    """
    if field.dtype == DATE:
        return DATE

    distinct_count = field.num_distinct_values
    if distinct_count == 0:
        # No distinct values at all: default to CATEGORY.
        return CATEGORY

    values = field.distinct_values

    # BINARY: at most two distinct values, nothing missing, and every value is
    # a conventional boolean spelling.
    if (
        distinct_count <= 2
        and missing_value_percent == 0
        and strings_utils.are_conventional_bools(values)
    ):
        return BINARY

    if field.image_values >= 3:
        return IMAGE
    if field.audio_values >= 3:
        return AUDIO

    all_numeric = strings_utils.are_all_numbers(values)

    # CATEGORY when the number of distinct values is significantly less than the
    # total number of examples, and the values are either not all numbers or are
    # a perfectly sequential list of integers (which suggests category codes).
    below_cutoff = distinct_count < row_count * CATEGORY_TYPE_DISTINCT_VALUE_PERCENTAGE_CUTOFF
    if below_cutoff and (not all_numeric or strings_utils.are_sequential_integers(values)):
        return CATEGORY

    # NUMBER if every distinct value is numeric.
    if all_numeric:
        return NUMBER

    # TODO (ASN): add other modalities (image, etc. )
    # Fallback to TEXT.
    return TEXT


def should_exclude(idx: int, field: FieldInfo, dtype: str, row_count: int, targets: Set[str]) -> bool:
    """Decide whether a field should be excluded from the generated config.

    # Inputs
    :param idx: (int) positional index of the field in the dataset
    :param field: (FieldInfo) object describing field
    :param dtype: (str) inferred feature type for the field
    :param row_count: (int) total number of rows in the dataset
    :param targets: (Set[str]) names of the requested output features

    # Return
    :return: (bool) True when the field should be dropped
    """
    # "PRI" key flag (presumably a SQL primary key) — always exclude.
    if field.key == "PRI":
        return True

    # Never drop a field the user selected as an output.
    if field.name in targets:
        return False

    # A field with no distinct values carries no signal.
    if field.num_distinct_values == 0:
        return True

    # Every row unique: exclude identifier-like columns — the leading column
    # when numeric, or names that start/end with "ID".
    if float(field.num_distinct_values) / row_count == 1.0:
        name_upper = field.name.upper()
        leading_numeric = idx == 0 and dtype == NUMBER
        if leading_numeric or name_upper.endswith("ID") or name_upper.startswith("ID"):
            return True

    return False
72 changes: 2 additions & 70 deletions ludwig/automl/utils.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,7 @@
import bisect
import logging
import os
from dataclasses import dataclass
from typing import Dict, List
from typing import Dict

from dataclasses_json import dataclass_json, LetterCase
from numpy import nan_to_num
from pandas import Series

Expand All @@ -26,48 +23,6 @@
from ludwig.modules.metric_registry import metric_registry
from ludwig.utils.defaults import default_combiner_type

try:
import ray
except ImportError:
raise ImportError(" ray is not installed. " "In order to use auto_train please run " "pip install ludwig[ray]")


logger = logging.getLogger(__name__)


@dataclass_json(letter_case=LetterCase.CAMEL)
@dataclass
class FieldInfo:
name: str
dtype: str
key: str = None
distinct_values: List = None
distinct_values_balance: float = 1.0
num_distinct_values: int = 0
nonnull_values: int = 0
image_values: int = 0
audio_values: int = 0
avg_words: int = None


@dataclass_json(letter_case=LetterCase.CAMEL)
@dataclass
class FieldConfig:
name: str
column: str
type: str


@dataclass_json(letter_case=LetterCase.CAMEL)
@dataclass
class FieldMetadata:
name: str
config: FieldConfig
excluded: bool
mode: str
missing_values: float
imbalance_ratio: float


def avg_num_tokens(field: Series) -> int:
# sample a subset if dataframe is large
Expand All @@ -78,15 +33,6 @@ def avg_num_tokens(field: Series) -> int:
return avg_words


def get_available_resources() -> dict:
# returns total number of gpus and cpus
resources = ray.cluster_resources()
gpus = resources.get("GPU", 0)
cpus = resources.get("CPU", 0)
resources = {"gpu": gpus, "cpu": cpus}
return resources


def get_model_type(config: dict) -> str:
if (
"input_features" in config
Expand All @@ -102,20 +48,6 @@ def get_model_type(config: dict) -> str:
return model_type


def _ray_init():
if ray.is_initialized():
return

# Forcibly terminate trial requested to stop after this amount of time passes
os.environ.setdefault("TUNE_FORCE_TRIAL_CLEANUP_S", "120")

try:
ray.init("auto", ignore_reinit_error=True)
except ConnectionError:
logger.info("Initializing new Ray cluster...")
ray.init()


# ref_configs comes from a file storing the config for a high-performing model per reference dataset.
# If the automl model type matches that of any reference models, set the initial point_to_evaluate
# in the automl hyperparameter search to the config of the reference model with the closest-matching
Expand All @@ -136,7 +68,7 @@ def _add_transfer_config(base_config: Dict, ref_configs: Dict) -> Dict:
min_dataset = dataset

if min_dataset is not None:
logger.info("Transfer config from dataset {}".format(min_dataset["name"]))
logging.info("Transfer config from dataset {}".format(min_dataset["name"]))
min_dataset_config = min_dataset[CONFIG]
hyperopt_params = base_config[HYPEROPT][PARAMETERS]
point_to_evaluate = {}
Expand Down
Loading