Initial implementation of the end-to-end autotrain module #1219

Merged: 47 commits, merged Jul 14, 2021

Changes from 7 commits

Commits (47)
26a9663
first pass @ e2e autotrain
ANarayan Jun 25, 2021
ff115c8
first pass @ auto batch scaling
ANarayan Jun 29, 2021
d8543f4
add additional parameter for pbt scheduler and supports passing time …
ANarayan Jun 29, 2021
7419fc8
add default hyperparameter search space + tune_batch_size parameter
ANarayan Jun 29, 2021
aeba080
add comments and delete tune_config.py
ANarayan Jun 29, 2021
fa28c82
fix bug in assignment of pbt scheduler parameter
ANarayan Jun 29, 2021
ee130f7
fix bug to support pbt scheduler
ANarayan Jun 29, 2021
cc81e27
fix bug and cpu/gpu resource specification in config
ANarayan Jun 29, 2021
03ba456
fix pbt scheduler params and validation metric bug in config files
ANarayan Jun 29, 2021
1da9aae
add max_trials to auto tune function
ANarayan Jun 29, 2021
79e66e2
change search space encoding to only json encode lists which do not c…
ANarayan Jun 30, 2021
aa5e174
add function to support training for tune_batch_size and tune_learnin…
ANarayan Jun 30, 2021
efcca6e
change default scheduler to async_hyperband
ANarayan Jun 30, 2021
bfa0794
sort imports
ANarayan Jun 30, 2021
725f688
makes train an internal func. & adds output_dir param to auto_train
ANarayan Jun 30, 2021
32fa44b
minor naming changes
ANarayan Jul 1, 2021
a887f04
add a first pass @ an auto learning rate tuner
ANarayan Jul 1, 2021
df96d21
minor naming change
ANarayan Jul 1, 2021
cfda49f
replace GPUtil/psutil with ray cluster resources
ANarayan Jul 1, 2021
45a9af3
fix bugs in tune_learning_rate
ANarayan Jul 1, 2021
dc995b9
fix bugs in function imports
ANarayan Jul 1, 2021
eb116e0
add missing type to concat config
ANarayan Jul 1, 2021
336b17e
add support for dask df inputs and add return dict from auto_train api
ANarayan Jul 2, 2021
e783e60
only exclude text features if there are no available GPUs
ANarayan Jul 2, 2021
f93289a
add float to TrialResults dataclass to handle nans produced when auto…
ANarayan Jul 2, 2021
85e7b56
add support for auto keyword for batch_size and learning_rate
ANarayan Jul 2, 2021
5bb9312
add limit on tune batch size halving capacity
ANarayan Jul 2, 2021
01aa523
fix bug in tune batch size
ANarayan Jul 2, 2021
cb2b171
fixed bug in halving logic and added limit on batch_size bound
ANarayan Jul 6, 2021
1767247
add eager mode execution to tune_batch_size
ANarayan Jul 9, 2021
72cb96a
catch failed trials
ANarayan Jul 10, 2021
ba1d952
handles edge case where a trial never starts
ANarayan Jul 10, 2021
aa79c17
fix variable passing bug
ANarayan Jul 12, 2021
49a334e
format value error message
ANarayan Jul 12, 2021
59190b5
add constants BATCH_SIZE, LEARNING_RATE, AUTO
ANarayan Jul 13, 2021
5da3865
add more constants
ANarayan Jul 13, 2021
74715c9
add ray import exception
ANarayan Jul 13, 2021
d23d1de
add try/finally catch to ensure eager execution mode is properly reset
ANarayan Jul 13, 2021
1eeddd5
add ray import exception to utils.py
ANarayan Jul 13, 2021
9d663e8
remove accidental batch_size import
ANarayan Jul 13, 2021
cf922e6
add CONFIG to constants
ANarayan Jul 13, 2021
56170bb
minor change
ANarayan Jul 13, 2021
e08e758
add COMBINER to constants
ANarayan Jul 14, 2021
4926ff1
fix ray import exception and function signatures
ANarayan Jul 14, 2021
e16b3a7
remove unused import
ANarayan Jul 14, 2021
415acd2
change nan exception catch to warning
ANarayan Jul 14, 2021
faae740
Merge branch 'master' into automl
tgaddair Jul 14, 2021
10 changes: 5 additions & 5 deletions ludwig/api.py
@@ -36,7 +36,7 @@

from ludwig.backend import Backend, initialize_backend
from ludwig.callbacks import Callback
from ludwig.constants import FULL, PREPROCESSING, TEST, TRAINING, VALIDATION
from ludwig.constants import FULL, PREPROCESSING, TEST, TRAINING, VALIDATION, LEARNING_RATE, BATCH_SIZE, AUTO
from ludwig.data.dataset.base import Dataset
from ludwig.data.postprocessing import convert_predictions, postprocess
from ludwig.data.preprocessing import (load_metadata,
@@ -481,24 +481,24 @@ def train(
)

# auto tune batch size
if self.config[TRAINING]["batch_size"] == "auto":
if self.config[TRAINING][BATCH_SIZE] == AUTO:
# TODO (ASN): add support for substitute_with_max parameter
tuned_batch_size = trainer.tune_batch_size(
self.config,
training_set,
random_seed=random_seed
)
self.config[TRAINING]['batch_size'] = tuned_batch_size
self.config[TRAINING][BATCH_SIZE] = tuned_batch_size

# auto tune learning rate
if self.config[TRAINING]["learning_rate"] == "auto":
if self.config[TRAINING][LEARNING_RATE] == AUTO:
new_learning_rate = trainer.tune_learning_rate(
self.config,
LudwigModel.create_model(self.config, random_seed),
training_set,
random_seed=random_seed
)
self.config[TRAINING]['learning_rate'] = new_learning_rate
self.config[TRAINING][LEARNING_RATE] = new_learning_rate

# train model
if self.backend.is_coordinator():
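For context, a config that exercises the new keyword might look like the sketch below; the feature names are invented for illustration, and when LudwigModel.train() sees these values it swaps in the tuned numbers before training starts.

# Hypothetical Ludwig config using the new "auto" keyword: train() replaces
# these values with the results of trainer.tune_batch_size and
# trainer.tune_learning_rate before training begins.
config = {
    'input_features': [{'name': 'age', 'type': 'numerical'}],
    'output_features': [{'name': 'income', 'type': 'binary'}],
    'training': {
        'batch_size': 'auto',        # resolved by trainer.tune_batch_size
        'learning_rate': 'auto',     # resolved by trainer.tune_learning_rate
    },
}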
38 changes: 29 additions & 9 deletions ludwig/automl/automl.py
@@ -8,15 +8,29 @@
(2) Tunes config based on resource constraints
(3) Runs hyperparameter optimization experiment
"""
from logging import raiseExceptions
import logging
import sys
from typing import Dict, Union

import dask.dataframe as dd
import numpy as np
import pandas as pd
from ludwig.automl.base_config import create_default_config
from ludwig.hyperopt.run import hyperopt

logger = logging.getLogger(__name__)


try:
import dask.dataframe as dd
import ray
except ImportError:
logger.error(
' ray is not installed. '
'In order to use auto_train please run '
'pip install ludwig[ray]'
)
sys.exit(-1)
Collaborator:
I know we do this in a few other places in Ludwig, but for programmatic usage, we should probably avoid calling sys.exit in case the user doesn't want their notebook to crash. Maybe raise an exception?

Collaborator:
Good point.
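A minimal sketch of the reviewer's suggestion, raising instead of exiting; the exception type and wording here are placeholders rather than the merged code:

try:
    import dask.dataframe as dd
    import ray
except ImportError as e:
    # Raise instead of sys.exit(-1) so a notebook or library caller can
    # catch the failure without the whole process terminating.
    raise ImportError(
        'ray is not installed. '
        'In order to use auto_train please run '
        'pip install ludwig[ray]'
    ) from e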


OUTPUT_DIR = "."


@@ -26,14 +40,15 @@ def model_select(default_configs):
Note: Current implementation returns tabnet by default. This will be
improved in subsequent iterations
"""
return default_configs['tabnet'], 'tabnet'
return default_configs['tabnet']


def auto_train(
dataset: Union[str, pd.DataFrame, dd.core.DataFrame],
target: str,
time_limit_s: Union[int, float],
output_dir: str = OUTPUT_DIR
output_dir: str = OUTPUT_DIR,
config=None,
):
"""
Main auto train API that first builds configs for each model type
@@ -51,13 +66,12 @@
# Returns
:return: (str) path to best trained model
"""

default_configs = create_default_config(dataset, target, time_limit_s)
model_config, model_name = model_select(default_configs)
hyperopt_results = _train(model_config, dataset,
if config is None:
config = _create_auto_config(dataset, target, time_limit_s)
model_name = config['combiner']['type']
Collaborator:
COMBINER and TYPE should be constants

hyperopt_results = _train(config, dataset,
output_dir, model_name=model_name)
experiment_analysis = hyperopt_results.experiment_analysis

# catch edge case where metric_score is nan
# TODO (ASN): Decide how we want to proceed if at least one trial has
# completed
@@ -76,6 +90,12 @@
return autotrain_results


def _create_auto_config(dataset, target, time_limit_s) -> dict:
Collaborator:
Let's make this public by removing the underscore. But for create_default_config and model_select it may make sense to keep them private.

Collaborator Author:
@tgaddair Totally agree with making create_default_config and model_select private. What's the reasoning for making create_auto_config public?

Collaborator:
The idea would be if the user wants to inspect the auto config and modify it before training, e.g.:

    config = create_auto_config()
    config['training']['learning_rate'] = 1
    auto_train(..., config=config)

Does that seem reasonable to you?

Collaborator Author:
Right, this makes total sense!

default_configs = create_default_config(dataset, target, time_limit_s)
model_config = model_select(default_configs)
return model_config


def _train(
config: Dict,
dataset: Union[str, pd.DataFrame, dd.core.DataFrame],
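Putting the pieces of this module together, a usage sketch of the API as it stands in this revision; the dataset path, target column, and override value are illustrative, and the import path simply mirrors the file location in this diff:

import pandas as pd
from ludwig.automl.automl import auto_train, _create_auto_config

df = pd.read_csv('adult_census.csv')   # hypothetical dataset
target = 'income'                      # hypothetical target column

# Let auto_train build the config, run hyperopt, and return the results dict.
results = auto_train(dataset=df, target=target, time_limit_s=3600)

# Or, as discussed in the review thread above, build the config first,
# tweak it, and pass it in explicitly.
config = _create_auto_config(df, target, time_limit_s=3600)
config['training']['learning_rate'] = 0.001   # illustrative override
results = auto_train(dataset=df, target=target, time_limit_s=3600, config=config)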
69 changes: 44 additions & 25 deletions ludwig/automl/base_config.py
@@ -14,15 +14,29 @@
(base implementation -- # CPU, # GPU)
"""

import logging
import os
from typing import Dict, List, Union
import sys
from typing import List, Union

import pandas as pd
import dask.dataframe as dd
from ludwig.automl.utils import (FieldInfo, get_available_resources,
avg_num_tokens)
from ludwig.automl.utils import (FieldInfo, avg_num_tokens,
get_available_resources)
from ludwig.constants import BINARY, CATEGORY, CONFIG, NUMERICAL, TEXT, TYPE
from ludwig.utils.data_utils import load_yaml

logger = logging.getLogger(__name__)
try:
import dask.dataframe as dd
import ray
except ImportError:
logger.error(
' ray is not installed. '
'In order to use auto_train please run '
'pip install ludwig[ray]'
)
sys.exit(-1)

PATH_HERE = os.path.abspath(os.path.dirname(__file__))
CONFIG_DIR = os.path.join(PATH_HERE, 'defaults')

@@ -33,16 +47,16 @@
}


def allocate_experiment_resources(resources: Dict) -> Dict:
def allocate_experiment_resources(resources: dict) -> dict:
"""
Allocates ray trial resources based on available resources

# Inputs
:param resources (Dict) specifies all available GPUs, CPUs and associated
:param resources (dict) specifies all available GPUs, CPUs and associated
metadata of the machines (i.e. memory)

# Return
:return: (Dict) gpu and cpu resources per trial
:return: (dict) gpu and cpu resources per trial
"""
# TODO (ASN):
# (1) expand logic to support multiple GPUs per trial (multi-gpu training)
@@ -51,18 +65,23 @@ def allocate_experiment_resources(resources: Dict) -> Dict:
experiment_resources = {
'cpu_resources_per_trial': 1
}
if resources['gpu'] > 0:
gpu_count, cpu_count = resources['gpu'], resources['cpu']
if gpu_count > 0:
experiment_resources.update({
'gpu_resources_per_trial': 1
})
if cpu_count > 1:
cpus_per_trial = int(cpu_count/gpu_count)
Collaborator:
Maybe max(int(cpu_count / gpu_count), 1)

experiment_resources['cpu_resources_per_trial'] = cpus_per_trial

return experiment_resources
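Folding the reviewer's max(..., 1) guard into the function gives a sketch like the following; this reflects the suggestion, not necessarily the exact code that was merged:

def allocate_experiment_resources(resources: dict) -> dict:
    # One CPU per trial by default; when GPUs are present, give each trial
    # one GPU and split the CPUs evenly across trials, never below one.
    experiment_resources = {'cpu_resources_per_trial': 1}
    gpu_count, cpu_count = resources['gpu'], resources['cpu']
    if gpu_count > 0:
        experiment_resources['gpu_resources_per_trial'] = 1
        if cpu_count > 1:
            experiment_resources['cpu_resources_per_trial'] = max(
                int(cpu_count / gpu_count), 1)
    return experiment_resources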


def create_default_config(
dataset: Union[str, dd.core.DataFrame, pd.DataFrame],
target_name: str = None,
time_limit_s: Union[int, float] = None
) -> Dict:
) -> dict:
"""
Returns auto_train configs for three available combiner models.
Coordinates the following tasks:
@@ -81,7 +100,7 @@ def create_default_config(
as the stopping parameter

# Return
:return: (Dict) dictionaries contain auto train config files for all available
:return: (dict) dictionaries contain auto train config files for all available
combiner types

"""
@@ -105,7 +124,7 @@

def get_field_info(dataset: str):
"""
Constructs FeildInfo objects for each feature in dataset. These objects
Constructs FieldInfo objects for each feature in dataset. These objects
are used for downstream type inference

# Inputs
@@ -138,9 +157,9 @@ def get_field_info(dataset: str):
def get_features_config(
fields: List[FieldInfo],
row_count: int,
resources: Dict,
resources: dict,
target_name: str = None,
) -> Dict:
) -> dict:
"""
Constructs FeildInfo objects for each feature in dataset. These objects
Collaborator:
Nit: typo in "FieldInfo"

are used for downstream type inference
@@ -151,7 +170,7 @@
:param target_name (str) name of target feature

# Return
:return: (Dict) section of auto_train config for input_features and output_features
:return: (dict) section of auto_train config for input_features and output_features
"""
metadata = get_field_metadata(fields, row_count, resources, target_name)
return get_config_from_metadata(metadata, target_name)
@@ -163,11 +182,11 @@ def get_config_from_metadata(metadata: list, target_name: str = None) -> dict:
metadata

# Inputs
:param metadata: (List[Dict]) field descriptions
:param metadata: (List[dict]) field descriptions
:param target_name (str) name of target feature

# Return
:return: (Dict) section of auto_train config for input_features and output_features
:return: (dict) section of auto_train config for input_features and output_features
"""
config = {
"input_features": [],
@@ -176,15 +195,15 @@ def get_config_from_metadata(metadata: list, target_name: str = None) -> dict:

for field_meta in metadata:
if field_meta["name"] == target_name:
config["output_features"].append(field_meta["config"])
config["output_features"].append(field_meta[CONFIG])
elif not field_meta["excluded"] and field_meta["mode"] == "input":
config["input_features"].append(field_meta["config"])
config["input_features"].append(field_meta[CONFIG])

return config
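For orientation, the dict this function assembles has roughly the following shape, assuming two usable input fields and a binary target (field names invented for illustration):

# Hypothetical output of get_config_from_metadata: the target's per-field
# config goes under output_features, every non-excluded input field under
# input_features.
example_features_section = {
    "input_features": [
        {"name": "age", "type": "numerical"},
        {"name": "occupation", "type": "category"},
    ],
    "output_features": [
        {"name": "income", "type": "binary"},
    ],
}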


def get_field_metadata(
fields: List[FieldInfo], row_count: int, resources: Dict, target_name: str = None
fields: List[FieldInfo], row_count: int, resources: dict, target_name: str = None
) -> list:
"""
Computes metadata for each field in dataset
@@ -221,7 +240,7 @@ def get_field_metadata(
sum(
not meta["excluded"]
and meta["mode"] == "input"
and meta["config"]["type"] != "text"
and meta[CONFIG][TYPE] != TEXT
for meta in metadata
)
- 1
Expand All @@ -230,7 +249,7 @@ def get_field_metadata(
# Exclude text fields if no GPUs are available
if resources['gpu'] == 0:
for meta in metadata:
if input_count > 2 and meta["config"]["type"] == "text":
if input_count > 2 and meta[CONFIG][TYPE] == TEXT:
# By default, exclude text inputs when there are other candidate inputs
meta["excluded"] = True

@@ -255,20 +274,20 @@ def infer_type(
if distinct_values == 2 and (
missing_value_percent == 0 or field.name == target_name
):
return "binary"
return BINARY

if distinct_values < 20:
# TODO (tgaddair): come up with something better than this, maybe attempt to fit to Gaussian
# NOTE (ASN): edge case -- there are less than 20 samples in dataset
return "category"
return CATEGORY

# add criteria for number of spaces
if field.avg_words and field.avg_words > 2:
return "text"
return TEXT

# TODO (ASN): add other modalities (image, etc. )

return "numerical"
return NUMERICAL


def should_exclude(field: FieldInfo, row_count: int, target_name: str) -> bool:
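The type-inference rules in infer_type reduce to a small decision procedure; a simplified restatement follows, with the signature flattened for readability rather than matching the exact Ludwig API:

def infer_type_sketch(distinct_values: int, missing_value_percent: float,
                      avg_words: float, is_target: bool) -> str:
    # Two distinct values: binary, as long as there are no missing values or
    # the field is the training target.
    if distinct_values == 2 and (missing_value_percent == 0 or is_target):
        return 'binary'
    # Small vocabularies are treated as categorical.
    if distinct_values < 20:
        return 'category'
    # Fields averaging more than two words per cell look like text.
    if avg_words and avg_words > 2:
        return 'text'
    # Everything else falls back to numerical.
    return 'numerical'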
16 changes: 15 additions & 1 deletion ludwig/automl/utils.py
@@ -1,9 +1,23 @@
import logging
import sys
from dataclasses import dataclass

import ray
from dataclasses_json import LetterCase, dataclass_json
from pandas import Series

logger = logging.getLogger(__name__)


try:
import ray
except ImportError:
logger.error(
' ray is not installed. '
'In order to use auto_train please run '
'pip install ludwig[ray]'
)
sys.exit(-1)


@dataclass_json(letter_case=LetterCase.CAMEL)
@dataclass
5 changes: 5 additions & 0 deletions ludwig/constants.py
@@ -124,3 +124,8 @@
TFRECORD = "tfrecord"

SRC = 'dataset_src'

BATCH_SIZE = 'batch_size'
LEARNING_RATE = 'learning_rate'
AUTO = 'auto'
CONFIG = 'config'