Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Reduce number of distributed tests. #2270

Merged
merged 8 commits into from
Jul 13, 2022
1 change: 0 additions & 1 deletion ludwig/automl/__init__.py
Original file line number Diff line number Diff line change
@@ -1 +0,0 @@
from ludwig.automl.automl import auto_train, cli_init_config, create_auto_config, train_with_config # noqa
2 changes: 1 addition & 1 deletion ludwig/automl/auto_tune_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,7 @@
try:
import GPUtil
except ImportError:
raise ImportError(" ray is not installed. " "In order to use auto_train please run " "pip install ludwig[ray]")
raise ImportError(" ray is not installed. In order to use auto_train please run pip install ludwig[ray]")

from ludwig.api import LudwigModel
from ludwig.automl.utils import get_model_type
Expand Down
10 changes: 2 additions & 8 deletions ludwig/automl/automl.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,14 +20,8 @@
from ludwig.api import LudwigModel
from ludwig.automl.auto_tune_config import memory_tune_config
from ludwig.automl.base_config import _create_default_config, _get_reference_configs, DatasetInfo, get_dataset_info
from ludwig.automl.utils import (
_add_transfer_config,
_ray_init,
get_available_resources,
get_model_type,
has_imbalanced_output,
set_output_feature_metric,
)
from ludwig.automl.ray_utils import _ray_init, get_available_resources
from ludwig.automl.utils import _add_transfer_config, get_model_type, has_imbalanced_output, set_output_feature_metric
from ludwig.constants import (
AUTOML_DEFAULT_IMAGE_ENCODER,
AUTOML_DEFAULT_TABULAR_MODEL,
Expand Down
92 changes: 4 additions & 88 deletions ludwig/automl/base_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,23 +21,10 @@
from dataclasses_json import dataclass_json, LetterCase

from ludwig.automl.data_source import DataframeSource, DataSource
from ludwig.automl.utils import _ray_init, FieldConfig, FieldInfo, FieldMetadata, get_available_resources
from ludwig.constants import (
AUDIO,
BINARY,
CATEGORY,
COMBINER,
DATE,
EXECUTOR,
HYPEROPT,
IMAGE,
NUMBER,
SCHEDULER,
SEARCH_ALG,
TEXT,
TYPE,
)
from ludwig.utils import strings_utils
from ludwig.automl.field_info import FieldConfig, FieldInfo, FieldMetadata
from ludwig.automl.ray_utils import _ray_init, get_available_resources
from ludwig.automl.type_inference import infer_type, should_exclude
from ludwig.constants import COMBINER, EXECUTOR, HYPEROPT, SCHEDULER, SEARCH_ALG, TEXT, TYPE
from ludwig.utils.data_utils import load_dataset, load_yaml
from ludwig.utils.defaults import default_random_seed

Expand All @@ -55,10 +42,6 @@

encoder_defaults = {"text": {"bert": os.path.join(CONFIG_DIR, "text/bert_config.yaml")}}

# For a given feature, the highest percentage of distinct values out of the total number of rows that we might still
# assign the CATEGORY type.
CATEGORY_TYPE_DISTINCT_VALUE_PERCENTAGE_CUTOFF = 0.5

# Cap for number of distinct values to return.
MAX_DISTINCT_VALUES_TO_RETURN = 10

Expand Down Expand Up @@ -341,73 +324,6 @@ def get_field_metadata(
return metadata


def infer_type(field: FieldInfo, missing_value_percent: float, row_count: int) -> str:
"""Perform type inference on field.

# Inputs
:param field: (FieldInfo) object describing field
:param missing_value_percent: (float) percent of missing values in the column
:param row_count: (int) total number of entries in original dataset

# Return
:return: (str) feature type
"""
if field.dtype == DATE:
return DATE

num_distinct_values = field.num_distinct_values
if num_distinct_values == 0:
return CATEGORY
distinct_values = field.distinct_values
if num_distinct_values <= 2 and missing_value_percent == 0:
# Check that all distinct values are conventional bools.
if strings_utils.are_conventional_bools(distinct_values):
return BINARY

if field.image_values >= 3:
return IMAGE

if field.audio_values >= 3:
return AUDIO

# Use CATEGORY if:
# - The number of distinct values is significantly less than the total number of examples.
# - The distinct values are not all numbers.
# - The distinct values are all numbers but comprise of a perfectly sequential list of integers that suggests the
# values represent categories.
if num_distinct_values < row_count * CATEGORY_TYPE_DISTINCT_VALUE_PERCENTAGE_CUTOFF and (
(not strings_utils.are_all_numbers(distinct_values)) or strings_utils.are_sequential_integers(distinct_values)
):
return CATEGORY

# Use NUMBER if all of the distinct values are numbers.
if strings_utils.are_all_numbers(distinct_values):
return NUMBER

# TODO (ASN): add other modalities (image, etc. )
# Fallback to TEXT.
return TEXT


def should_exclude(idx: int, field: FieldInfo, dtype: str, row_count: int, targets: Set[str]) -> bool:
if field.key == "PRI":
return True

if field.name in targets:
return False

if field.num_distinct_values == 0:
return True

distinct_value_percent = float(field.num_distinct_values) / row_count
if distinct_value_percent == 1.0:
upper_name = field.name.upper()
if (idx == 0 and dtype == NUMBER) or upper_name.endswith("ID") or upper_name.startswith("ID"):
return True

return False


def infer_mode(field: FieldInfo, targets: Set[str] = None) -> str:
if field.name in targets:
return "output"
Expand Down
38 changes: 38 additions & 0 deletions ludwig/automl/field_info.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,38 @@
from dataclasses import dataclass
from typing import List

from dataclasses_json import dataclass_json, LetterCase


@dataclass_json(letter_case=LetterCase.CAMEL)
@dataclass
class FieldInfo:
    """Per-column summary statistics gathered from a dataset, consumed by type inference.

    Serialized to/from camelCase JSON via dataclass_json.
    """

    name: str  # column name
    dtype: str  # raw dtype label for the column (compared against DATE in infer_type)
    key: str = None  # key flag; "PRI" marks a primary key (presumably from a SQL schema) and forces exclusion
    distinct_values: List = None  # sample of the column's distinct values
    distinct_values_balance: float = 1.0  # NOTE(review): presumably a balance ratio over distinct values — confirm against producer
    num_distinct_values: int = 0  # count of distinct values; 0 means the column is effectively empty
    nonnull_values: int = 0  # count of non-null entries — TODO confirm semantics at the call site
    image_values: int = 0  # number of sampled values that look like images (>= 3 triggers IMAGE type)
    audio_values: int = 0  # number of sampled values that look like audio (>= 3 triggers AUDIO type)
    avg_words: int = None  # average token count for text columns; None when not applicable


@dataclass_json(letter_case=LetterCase.CAMEL)
@dataclass
class FieldConfig:
    """Minimal Ludwig feature config entry derived for a single field.

    Serialized to/from camelCase JSON via dataclass_json.
    """

    name: str  # feature name
    column: str  # source dataset column the feature reads from
    type: str  # inferred Ludwig feature type (e.g. the value returned by infer_type)


@dataclass_json(letter_case=LetterCase.CAMEL)
@dataclass
class FieldMetadata:
    """Inference results and quality signals for one field of the dataset.

    Serialized to/from camelCase JSON via dataclass_json.
    """

    name: str  # field name
    config: FieldConfig  # the generated feature config for this field
    excluded: bool  # True when the field should be dropped (see should_exclude)
    mode: str  # "input" or "output" role — presumably set by infer_mode; confirm against caller
    missing_values: float  # fraction (or percent) of missing values — TODO confirm units at the producer
    imbalance_ratio: float  # NOTE(review): presumably majority/minority class ratio — confirm against producer
30 changes: 30 additions & 0 deletions ludwig/automl/ray_utils.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,30 @@
import logging
import os

try:
    import ray
except ImportError:
    # Single literal instead of implicit concatenation of three fragments, which
    # produced awkward internal spacing; matches the message format this change
    # set uses in auto_tune_config.py. Ray is an optional dependency, so we fail
    # with an actionable install hint.
    raise ImportError(" ray is not installed. In order to use auto_train please run pip install ludwig[ray]")


def get_available_resources() -> dict:
    """Return the Ray cluster's total GPU and CPU counts.

    # Return
    :return: (dict) {"gpu": <total gpus>, "cpu": <total cpus>}, 0 when absent
    """
    cluster_totals = ray.cluster_resources()
    return {
        "gpu": cluster_totals.get("GPU", 0),
        "cpu": cluster_totals.get("CPU", 0),
    }


def _ray_init():
    """Ensure a Ray runtime is available, preferring an existing cluster.

    No-op when Ray is already initialized. Otherwise tries to attach to a
    running cluster ("auto"); on ConnectionError falls back to starting a
    local one.
    """
    if ray.is_initialized():
        return

    # Forcibly terminate trial requested to stop after this amount of time passes
    # (setdefault: respects a value already set in the environment).
    os.environ.setdefault("TUNE_FORCE_TRIAL_CLEANUP_S", "120")

    try:
        # "auto" attaches to an already-running cluster and raises
        # ConnectionError when none is reachable.
        ray.init("auto", ignore_reinit_error=True)
    except ConnectionError:
        logging.info("Initializing new Ray cluster...")
        ray.init()
76 changes: 76 additions & 0 deletions ludwig/automl/type_inference.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
from typing import Set

from ludwig.automl.field_info import FieldInfo
from ludwig.constants import AUDIO, BINARY, CATEGORY, DATE, IMAGE, NUMBER, TEXT
from ludwig.utils import strings_utils

# For a given feature, the highest percentage of distinct values out of the total number of rows that we might still
# assign the CATEGORY type.
CATEGORY_TYPE_DISTINCT_VALUE_PERCENTAGE_CUTOFF = 0.5


def infer_type(field: FieldInfo, missing_value_percent: float, row_count: int) -> str:
    """Perform type inference on field.

    # Inputs
    :param field: (FieldInfo) object describing field
    :param missing_value_percent: (float) percent of missing values in the column
    :param row_count: (int) total number of entries in original dataset

    # Return
    :return: (str) feature type
    """
    if field.dtype == DATE:
        return DATE

    distinct_count = field.num_distinct_values
    if distinct_count == 0:
        # No distinct values at all: default to CATEGORY.
        return CATEGORY

    values = field.distinct_values

    # BINARY: at most two distinct values, nothing missing, and every value is
    # a conventional boolean spelling.
    if (
        distinct_count <= 2
        and missing_value_percent == 0
        and strings_utils.are_conventional_bools(values)
    ):
        return BINARY

    if field.image_values >= 3:
        return IMAGE
    if field.audio_values >= 3:
        return AUDIO

    all_numeric = strings_utils.are_all_numbers(values)

    # CATEGORY when the number of distinct values is significantly less than the
    # total number of examples, and the values are either not all numbers or are
    # a perfectly sequential list of integers (which suggests category codes).
    below_cutoff = distinct_count < row_count * CATEGORY_TYPE_DISTINCT_VALUE_PERCENTAGE_CUTOFF
    if below_cutoff and (not all_numeric or strings_utils.are_sequential_integers(values)):
        return CATEGORY

    # NUMBER if every distinct value is numeric.
    if all_numeric:
        return NUMBER

    # TODO (ASN): add other modalities (image, etc. )
    # Fallback to TEXT.
    return TEXT


def should_exclude(idx: int, field: FieldInfo, dtype: str, row_count: int, targets: Set[str]) -> bool:
    """Decide whether a field should be excluded from the generated config.

    # Inputs
    :param idx: (int) positional index of the field in the dataset
    :param field: (FieldInfo) object describing field
    :param dtype: (str) inferred feature type for the field
    :param row_count: (int) total number of rows in the dataset
    :param targets: (Set[str]) names of the requested output features

    # Return
    :return: (bool) True when the field should be dropped
    """
    # "PRI" key flag (presumably a SQL primary key) — always exclude.
    if field.key == "PRI":
        return True

    # Never drop a field the user selected as an output.
    if field.name in targets:
        return False

    # A field with no distinct values carries no signal.
    if field.num_distinct_values == 0:
        return True

    # Every row unique: exclude identifier-like columns — the leading column
    # when numeric, or names that start/end with "ID".
    if float(field.num_distinct_values) / row_count == 1.0:
        name_upper = field.name.upper()
        leading_numeric = idx == 0 and dtype == NUMBER
        if leading_numeric or name_upper.endswith("ID") or name_upper.startswith("ID"):
            return True

    return False
72 changes: 2 additions & 70 deletions ludwig/automl/utils.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,7 @@
import bisect
import logging
import os
from dataclasses import dataclass
from typing import Dict, List
from typing import Dict

from dataclasses_json import dataclass_json, LetterCase
from numpy import nan_to_num
from pandas import Series

Expand All @@ -26,48 +23,6 @@
from ludwig.modules.metric_registry import metric_registry
from ludwig.utils.defaults import default_combiner_type

try:
import ray
except ImportError:
raise ImportError(" ray is not installed. " "In order to use auto_train please run " "pip install ludwig[ray]")


logger = logging.getLogger(__name__)


@dataclass_json(letter_case=LetterCase.CAMEL)
@dataclass
class FieldInfo:
name: str
dtype: str
key: str = None
distinct_values: List = None
distinct_values_balance: float = 1.0
num_distinct_values: int = 0
nonnull_values: int = 0
image_values: int = 0
audio_values: int = 0
avg_words: int = None


@dataclass_json(letter_case=LetterCase.CAMEL)
@dataclass
class FieldConfig:
name: str
column: str
type: str


@dataclass_json(letter_case=LetterCase.CAMEL)
@dataclass
class FieldMetadata:
name: str
config: FieldConfig
excluded: bool
mode: str
missing_values: float
imbalance_ratio: float


def avg_num_tokens(field: Series) -> int:
# sample a subset if dataframe is large
Expand All @@ -78,15 +33,6 @@ def avg_num_tokens(field: Series) -> int:
return avg_words


def get_available_resources() -> dict:
# returns total number of gpus and cpus
resources = ray.cluster_resources()
gpus = resources.get("GPU", 0)
cpus = resources.get("CPU", 0)
resources = {"gpu": gpus, "cpu": cpus}
return resources


def get_model_type(config: dict) -> str:
if (
"input_features" in config
Expand All @@ -102,20 +48,6 @@ def get_model_type(config: dict) -> str:
return model_type


def _ray_init():
if ray.is_initialized():
return

# Forcibly terminate trial requested to stop after this amount of time passes
os.environ.setdefault("TUNE_FORCE_TRIAL_CLEANUP_S", "120")

try:
ray.init("auto", ignore_reinit_error=True)
except ConnectionError:
logger.info("Initializing new Ray cluster...")
ray.init()


# ref_configs comes from a file storing the config for a high-performing model per reference dataset.
# If the automl model type matches that of any reference models, set the initial point_to_evaluate
# in the automl hyperparameter search to the config of the reference model with the closest-matching
Expand All @@ -136,7 +68,7 @@ def _add_transfer_config(base_config: Dict, ref_configs: Dict) -> Dict:
min_dataset = dataset

if min_dataset is not None:
logger.info("Transfer config from dataset {}".format(min_dataset["name"]))
logging.info("Transfer config from dataset {}".format(min_dataset["name"]))
min_dataset_config = min_dataset[CONFIG]
hyperopt_params = base_config[HYPEROPT][PARAMETERS]
point_to_evaluate = {}
Expand Down
Loading