-
Notifications
You must be signed in to change notification settings - Fork 1.2k
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Initial implementation of the end-to-end autotrain module #1219
Changes from 7 commits
26a9663
ff115c8
d8543f4
7419fc8
aeba080
fa28c82
ee130f7
cc81e27
03ba456
1da9aae
79e66e2
aa5e174
efcca6e
bfa0794
725f688
32fa44b
a887f04
df96d21
cfda49f
45a9af3
dc995b9
eb116e0
336b17e
e783e60
f93289a
85e7b56
5bb9312
01aa523
cb2b171
1767247
72cb96a
ba1d952
aa79c17
49a334e
59190b5
5da3865
74715c9
d23d1de
1eeddd5
9d663e8
cf922e6
56170bb
e08e758
4926ff1
e16b3a7
415acd2
faae740
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -8,15 +8,29 @@ | |
(2) Tunes config based on resource constraints | ||
(3) Runs hyperparameter optimization experiment | ||
""" | ||
from logging import raiseExceptions | ||
import logging | ||
import sys | ||
from typing import Dict, Union | ||
|
||
import dask.dataframe as dd | ||
import numpy as np | ||
import pandas as pd | ||
from ludwig.automl.base_config import create_default_config | ||
from ludwig.hyperopt.run import hyperopt | ||
|
||
logger = logging.getLogger(__name__) | ||
|
||
|
||
try: | ||
import dask.dataframe as dd | ||
import ray | ||
except ImportError: | ||
logger.error( | ||
' ray is not installed. ' | ||
'In order to use auto_train please run ' | ||
'pip install ludwig[ray]' | ||
) | ||
sys.exit(-1) | ||
|
||
OUTPUT_DIR = "." | ||
|
||
|
||
|
@@ -26,14 +40,15 @@ def model_select(default_configs): | |
Note: Current implementation returns tabnet by default. This will be | ||
improved in subsequent iterations | ||
""" | ||
return default_configs['tabnet'], 'tabnet' | ||
return default_configs['tabnet'] | ||
|
||
|
||
def auto_train( | ||
tgaddair marked this conversation as resolved.
Show resolved
Hide resolved
|
||
dataset: Union[str, pd.DataFrame, dd.core.DataFrame], | ||
target: str, | ||
time_limit_s: Union[int, float], | ||
output_dir: str = OUTPUT_DIR | ||
output_dir: str = OUTPUT_DIR, | ||
config=None, | ||
): | ||
""" | ||
Main auto train API that first builds configs for each model type | ||
|
@@ -51,13 +66,12 @@ def auto_train( | |
# Returns | ||
:return: (str) path to best trained model | ||
""" | ||
|
||
default_configs = create_default_config(dataset, target, time_limit_s) | ||
model_config, model_name = model_select(default_configs) | ||
hyperopt_results = _train(model_config, dataset, | ||
if config is None: | ||
config = _create_auto_config(dataset, target, time_limit_s) | ||
model_name = config['combiner']['type'] | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. COMBINER and TYPE should be constants |
||
hyperopt_results = _train(config, dataset, | ||
output_dir, model_name=model_name) | ||
experiment_analysis = hyperopt_results.experiment_analysis | ||
|
||
# catch edge case where metric_score is nan | ||
# TODO (ASN): Decide how we want to proceed if at least one trial has | ||
# completed | ||
|
@@ -76,6 +90,12 @@ def auto_train( | |
return autotrain_results | ||
|
||
|
||
def _create_auto_config(dataset, target, time_limit_s) -> dict: | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Let's make this public by removing the underscore. But There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. @tgaddair Totally agree with making There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The idea would be if the user wants to inspect the auto config and modify it before training, e.g.:
Does that seem reasonable to you? There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Right right. This makes total sense! |
||
default_configs = create_default_config(dataset, target, time_limit_s) | ||
model_config = model_select(default_configs) | ||
return model_config | ||
|
||
|
||
def _train( | ||
config: Dict, | ||
dataset: Union[str, pd.DataFrame, dd.core.DataFrame], | ||
|
Original file line number | Diff line number | Diff line change |
---|---|---|
|
@@ -14,15 +14,29 @@ | |
(base implementation -- # CPU, # GPU) | ||
""" | ||
|
||
import logging | ||
import os | ||
from typing import Dict, List, Union | ||
import sys | ||
from typing import List, Union | ||
|
||
import pandas as pd | ||
import dask.dataframe as dd | ||
from ludwig.automl.utils import (FieldInfo, get_available_resources, | ||
avg_num_tokens) | ||
from ludwig.automl.utils import (FieldInfo, avg_num_tokens, | ||
get_available_resources) | ||
from ludwig.constants import BINARY, CATEGORY, CONFIG, NUMERICAL, TEXT, TYPE | ||
from ludwig.utils.data_utils import load_yaml | ||
|
||
logger = logging.getLogger(__name__) | ||
try: | ||
import dask.dataframe as dd | ||
import ray | ||
except ImportError: | ||
logger.error( | ||
' ray is not installed. ' | ||
'In order to use auto_train please run ' | ||
'pip install ludwig[ray]' | ||
) | ||
sys.exit(-1) | ||
|
||
PATH_HERE = os.path.abspath(os.path.dirname(__file__)) | ||
CONFIG_DIR = os.path.join(PATH_HERE, 'defaults') | ||
|
||
|
@@ -33,16 +47,16 @@ | |
} | ||
|
||
|
||
def allocate_experiment_resources(resources: Dict) -> Dict: | ||
def allocate_experiment_resources(resources: dict) -> dict: | ||
""" | ||
Allocates ray trial resources based on available resources | ||
|
||
# Inputs | ||
:param resources (Dict) specifies all available GPUs, CPUs and associated | ||
:param resources (dict) specifies all available GPUs, CPUs and associated | ||
metadata of the machines (i.e. memory) | ||
|
||
# Return | ||
:return: (Dict) gpu and cpu resources per trial | ||
:return: (dict) gpu and cpu resources per trial | ||
""" | ||
# TODO (ASN): | ||
# (1) expand logic to support multiple GPUs per trial (multi-gpu training) | ||
|
@@ -51,18 +65,23 @@ def allocate_experiment_resources(resources: Dict) -> Dict: | |
experiment_resources = { | ||
'cpu_resources_per_trial': 1 | ||
} | ||
if resources['gpu'] > 0: | ||
gpu_count, cpu_count = resources['gpu'], resources['cpu'] | ||
if gpu_count > 0: | ||
experiment_resources.update({ | ||
tgaddair marked this conversation as resolved.
Show resolved
Hide resolved
|
||
'gpu_resources_per_trial': 1 | ||
}) | ||
if cpu_count > 1: | ||
cpus_per_trial = int(cpu_count/gpu_count) | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Maybe |
||
experiment_resources['cpu_resources_per_trial'] = cpus_per_trial | ||
|
||
return experiment_resources | ||
|
||
|
||
def create_default_config( | ||
dataset: Union[str, dd.core.DataFrame, pd.DataFrame], | ||
target_name: str = None, | ||
time_limit_s: Union[int, float] = None | ||
) -> Dict: | ||
) -> dict: | ||
""" | ||
Returns auto_train configs for three available combiner models. | ||
Coordinates the following tasks: | ||
|
@@ -81,7 +100,7 @@ def create_default_config( | |
as the stopping parameter | ||
|
||
# Return | ||
:return: (Dict) dictionaries contain auto train config files for all available | ||
:return: (dict) dictionaries contain auto train config files for all available | ||
combiner types | ||
|
||
""" | ||
|
@@ -105,7 +124,7 @@ def create_default_config( | |
|
||
def get_field_info(dataset: str): | ||
""" | ||
Constructs FeildInfo objects for each feature in dataset. These objects | ||
Constructs FieldInfo objects for each feature in dataset. These objects | ||
are used for downstream type inference | ||
|
||
# Inputs | ||
|
@@ -138,9 +157,9 @@ def get_field_info(dataset: str): | |
def get_features_config( | ||
fields: List[FieldInfo], | ||
row_count: int, | ||
resources: Dict, | ||
resources: dict, | ||
target_name: str = None, | ||
) -> Dict: | ||
) -> dict: | ||
""" | ||
Constructs FeildInfo objects for each feature in dataset. These objects | ||
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Nit: typo in "FieldInfo" |
||
are used for downstream type inference | ||
|
@@ -151,7 +170,7 @@ def get_features_config( | |
:param target_name (str) name of target feature | ||
|
||
# Return | ||
:return: (Dict) section of auto_train config for input_features and output_features | ||
:return: (dict) section of auto_train config for input_features and output_features | ||
""" | ||
metadata = get_field_metadata(fields, row_count, resources, target_name) | ||
return get_config_from_metadata(metadata, target_name) | ||
|
@@ -163,11 +182,11 @@ def get_config_from_metadata(metadata: list, target_name: str = None) -> dict: | |
metadata | ||
|
||
# Inputs | ||
:param metadata: (List[Dict]) field descriptions | ||
:param metadata: (List[dict]) field descriptions | ||
:param target_name (str) name of target feature | ||
|
||
# Return | ||
:return: (Dict) section of auto_train config for input_features and output_features | ||
:return: (dict) section of auto_train config for input_features and output_features | ||
""" | ||
config = { | ||
"input_features": [], | ||
|
@@ -176,15 +195,15 @@ def get_config_from_metadata(metadata: list, target_name: str = None) -> dict: | |
|
||
for field_meta in metadata: | ||
if field_meta["name"] == target_name: | ||
config["output_features"].append(field_meta["config"]) | ||
config["output_features"].append(field_meta[CONFIG]) | ||
elif not field_meta["excluded"] and field_meta["mode"] == "input": | ||
config["input_features"].append(field_meta["config"]) | ||
config["input_features"].append(field_meta[CONFIG]) | ||
|
||
return config | ||
|
||
|
||
def get_field_metadata( | ||
fields: List[FieldInfo], row_count: int, resources: Dict, target_name: str = None | ||
fields: List[FieldInfo], row_count: int, resources: dict, target_name: str = None | ||
) -> list: | ||
""" | ||
Computes metadata for each field in dataset | ||
|
@@ -221,7 +240,7 @@ def get_field_metadata( | |
sum( | ||
not meta["excluded"] | ||
and meta["mode"] == "input" | ||
and meta["config"]["type"] != "text" | ||
and meta[CONFIG][TYPE] != TEXT | ||
for meta in metadata | ||
) | ||
- 1 | ||
|
@@ -230,7 +249,7 @@ def get_field_metadata( | |
# Exclude text fields if no GPUs are available | ||
if resources['gpu'] == 0: | ||
for meta in metadata: | ||
if input_count > 2 and meta["config"]["type"] == "text": | ||
if input_count > 2 and meta[CONFIG][TYPE] == TEXT: | ||
# By default, exclude text inputs when there are other candidate inputs | ||
meta["excluded"] = True | ||
|
||
|
@@ -255,20 +274,20 @@ def infer_type( | |
if distinct_values == 2 and ( | ||
missing_value_percent == 0 or field.name == target_name | ||
): | ||
return "binary" | ||
return BINARY | ||
|
||
if distinct_values < 20: | ||
# TODO (tgaddair): come up with something better than this, maybe attempt to fit to Gaussian | ||
# NOTE (ASN): edge case -- there are less than 20 samples in dataset | ||
return "category" | ||
return CATEGORY | ||
|
||
# add criteria for number of spaces | ||
if field.avg_words and field.avg_words > 2: | ||
return "text" | ||
return TEXT | ||
|
||
# TODO (ASN): add other modalities (image, etc. ) | ||
|
||
return "numerical" | ||
return NUMERICAL | ||
|
||
|
||
def should_exclude(field: FieldInfo, row_count: int, target_name: str) -> bool: | ||
|
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
I know we do this in a few other places in Ludwig, but for programatic usage, we should probably avoid calling
sys.exit
in case the user doesn't want their notebook to crash. Maybe raise an exception? There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Good point