[automl] Memory Aware Config Tuning #1257

Merged: 4 commits merged on Aug 20, 2021
Changes from all commits
164 changes: 164 additions & 0 deletions ludwig/automl/auto_tune_config.py
@@ -0,0 +1,164 @@
import copy
from collections import OrderedDict

import psutil
import ray

try:
    import GPUtil
Collaborator comment: Can we add this to the requirements_ray.txt instead of asking the user to install this by hand?

Collaborator comment: We should probably move the `import ray` in here, too, given the error message.

except ImportError:
    raise ImportError(
        ' ray is not installed. '
        'In order to use auto_train please run '
        'pip install ludwig[ray]'
    )
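A minimal sketch of the reviewer suggestions above, assuming both imports share the same `pip install ludwig[ray]` remedy; this is illustrative only and not part of the PR's diff:

```python
# Sketch of the reviewer-suggested import guard: ray is imported inside the
# try block as well, so a missing ray (not just GPUtil) raises the hint below.
try:
    import ray
    import GPUtil
except ImportError:
    raise ImportError(
        'ray is not installed. '
        'In order to use auto_train please run '
        'pip install ludwig[ray]'
    )
```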

from ludwig.api import LudwigModel
from ludwig.automl.utils import get_available_resources
from ludwig.data.preprocessing import preprocess_for_training
from ludwig.features.feature_registries import update_config_with_metadata
from ludwig.utils.defaults import merge_with_defaults
from ludwig.constants import COMBINER, HYPEROPT, BATCH_SIZE, TRAINING, TYPE, PREPROCESSING, SPACE

# maps variable search space that can be modified to minimum permissible value for the range
RANKED_MODIFIABLE_PARAM_LIST = {
    'tabnet': OrderedDict({
        'training.batch_size': 32,
        'combiner.size': 8,
        'combiner.output_size': 8,
    }),
    'concat': OrderedDict({
        'training.batch_size': 32,
        'combiner.fc_size': 64,
        'combiner.num_fc_layers': 1,
    }),
    'tabtransformer': OrderedDict({
        'training.batch_size': 32,
        'combiner.num_heads': 4,
        'combiner.output_size': 8,
        'combiner.num_layers': 4,
        'combiner.num_fc_layers': 1,
    }),
}


BYTES_PER_MiB = 1048576


def get_trainingset_metadata(config, dataset):
    (_, _, _, training_set_metadata) = preprocess_for_training(
        config,
        dataset=dataset,
        preprocessing_params=config[PREPROCESSING])
    return training_set_metadata


def get_machine_memory():

    if ray.is_initialized():  # using ray cluster
        @ray.remote(num_gpus=1)
        def get_remote_gpu():
            gpus = GPUtil.getGPUs()
            total_mem_mb = gpus[0].memory_total
            return total_mem_mb * BYTES_PER_MiB

        @ray.remote(num_cpus=1)
        def get_remote_cpu():
            total_mem = psutil.virtual_memory().total
            return total_mem

        resources = get_available_resources()  # check if cluster has GPUs

        if resources['gpu'] > 0:
            machine_mem = ray.get(get_remote_gpu.remote())
        else:
            machine_mem = ray.get(get_remote_cpu.remote())
    else:  # not using ray cluster
        if GPUtil.getGPUs():
            machine_mem = GPUtil.getGPUs()[0].memory_total * BYTES_PER_MiB
        else:
            machine_mem = psutil.virtual_memory().total

    return machine_mem


def compute_memory_usage(config, training_set_metadata) -> int:
    update_config_with_metadata(config, training_set_metadata)
    lm = LudwigModel.create_model(config)
    lm.get_connected_model()
    model_tensors = lm.collect_weights()
    total_size = 0
    batch_size = config[TRAINING][BATCH_SIZE]
    for tnsr in model_tensors:
        total_size += tnsr[1].numpy().size * batch_size
    total_bytes = total_size * 32  # assumes 32-bit precision
    return total_bytes
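As a rough illustration of how this estimate scales, the numbers below are hypothetical and the arithmetic mirrors the function above:

```python
# Hypothetical back-of-the-envelope check mirroring compute_memory_usage:
num_elements = 1_000_000   # total elements across all collected weight tensors
batch_size = 128           # config[TRAINING][BATCH_SIZE]
estimate = num_elements * batch_size * 32   # 32 for the assumed 32-bit precision
print(estimate)            # 4,096,000,000, compared against get_machine_memory()
```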


def sub_new_params(config: dict, new_param_vals: dict):
    new_config = copy.deepcopy(config)
    for param, val in new_param_vals.items():
        config_section = param.split(".")[0]
        param_name = param.split(".")[1]
        new_config[config_section][param_name] = val
    return new_config


def get_new_params(current_param_values, hyperparam_search_space, params_to_modify):
    for param, _ in params_to_modify.items():
        if hyperparam_search_space[param][SPACE] == "choice":
            current_param_values[param] = hyperparam_search_space[param]['categories'][-1]
        else:
            current_param_values[param] = hyperparam_search_space[param]['upper']
    return current_param_values


def memory_tune_config(config, dataset):
Collaborator comment: I would suggest running this with ray.remote if ray is available, just because TensorFlow has a tendency to have some issues with state corruption when creating models and potentially allocating GPU memory.

    fits_in_memory = False
    raw_config = merge_with_defaults(config)
    training_set_metadata = get_trainingset_metadata(raw_config, dataset)
    modified_hyperparam_search_space = copy.deepcopy(
        raw_config[HYPEROPT]['parameters'])
    params_to_modify = RANKED_MODIFIABLE_PARAM_LIST[raw_config[COMBINER][TYPE]]
    param_list = list(params_to_modify.keys())
    current_param_values = {}
    max_memory = get_machine_memory()

    while param_list is not None:
        # compute memory utilization
        current_param_values = get_new_params(
            current_param_values, modified_hyperparam_search_space, params_to_modify)
        temp_config = sub_new_params(raw_config, current_param_values)
        if compute_memory_usage(temp_config, training_set_metadata) < max_memory:
            fits_in_memory = True
            break
        # check if we have exhausted tuning of current param (e.g. we can no longer reduce the param value)
        param, min_value = param_list[0], params_to_modify[param_list[0]]

        if param in modified_hyperparam_search_space.keys():
            param_space = modified_hyperparam_search_space[param]["space"]
            if param_space == "choice":
                if len(modified_hyperparam_search_space[param]['categories']) > 2 and \
                        modified_hyperparam_search_space[param]['categories'][-2] > min_value:
                    modified_hyperparam_search_space[param][
                        'categories'] = modified_hyperparam_search_space[param]['categories'][:-1]
                else:
                    param_list.pop(0)  # exhausted reduction of this parameter
            else:
                # reduce by 10%
                upper_bound, lower_bound = modified_hyperparam_search_space[param][
                    "upper"], modified_hyperparam_search_space[param]["lower"]
                reduction_val = (upper_bound - lower_bound) * 0.1
                new_upper_bound = upper_bound - reduction_val
                if (new_upper_bound) > lower_bound and new_upper_bound > min_value:
                    modified_hyperparam_search_space[param]["upper"] = new_upper_bound
                else:
                    param_list.pop(0)  # exhausted reduction of this parameter
        else:
            param_list.pop(0)  # param not in hyperopt search space

    modified_config = copy.deepcopy(config)

    modified_config[HYPEROPT]["parameters"] = modified_hyperparam_search_space
    return modified_config, fits_in_memory
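A hedged usage sketch of the new helper; `model_config` stands in for an auto-generated Ludwig config that already contains a `hyperopt` section, and the Ray wrapping follows the reviewer's suggestion above (the `automl.py` change below takes the same route):

```python
import pandas as pd
import ray
from ludwig.automl.auto_tune_config import memory_tune_config

df = pd.read_csv("train.csv")  # hypothetical training dataset
model_config = {}              # placeholder for an auto-generated config with a hyperopt section

if ray.is_initialized():
    # isolate model creation and GPU allocation in a worker, per the review comment
    model_config, fits_in_memory = ray.get(
        ray.remote(num_cpus=1)(memory_tune_config).remote(model_config, df))
else:
    model_config, fits_in_memory = memory_tune_config(model_config, df)

if not fits_in_memory:
    print("even the smallest candidate search-space values exceeded the memory estimate")
```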
13 changes: 12 additions & 1 deletion ludwig/automl/automl.py
@@ -16,6 +16,7 @@

from ludwig.api import LudwigModel
from ludwig.automl.base_config import _create_default_config, DatasetInfo
from ludwig.automl.auto_tune_config import memory_tune_config
from ludwig.automl.utils import _ray_init
from ludwig.constants import COMBINER, TYPE
from ludwig.hyperopt.run import hyperopt
@@ -61,6 +62,7 @@ def auto_train(
    target: str,
    time_limit_s: Union[int, float],
    output_directory: str = OUTPUT_DIR,
    tune_for_memory: bool = False,
    **kwargs
) -> AutoTrainResults:
    """
@@ -81,7 +83,8 @@
    # Returns
    :return: (AutoTrainResults) results containing hyperopt experiments and best model
    """
    config = create_auto_config(
        dataset, target, time_limit_s, tune_for_memory, **kwargs)
    return train_with_config(
        dataset,
        config,
@@ -94,6 +97,7 @@ def create_auto_config(
    dataset: Union[str, pd.DataFrame, dd.core.DataFrame, DatasetInfo],
    target: str,
    time_limit_s: Union[int, float],
    tune_for_memory: bool,
) -> dict:
    """
    Returns an auto-generated Ludwig config with the intent of training
@@ -111,6 +115,13 @@
"""
default_configs = _create_default_config(dataset, target, time_limit_s)
model_config = _model_select(default_configs)
if tune_for_memory:
if ray.is_initialized():
model_config, _ = ray.get(ray.remote(num_cpus=1)(
memory_tune_config
).remote(model_config, dataset))
else:
model_config, _ = memory_tune_config(model_config, dataset)
return model_config


2 changes: 2 additions & 0 deletions requirements_ray.txt
@@ -1,3 +1,5 @@
ray[default,tune]
pickle5
tensorboardX<2.3
GPUtil
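
For end users, a hedged sketch of the flag added by this PR; the dataset path and target column are hypothetical:

```python
import pandas as pd
from ludwig.automl.automl import auto_train  # module touched in this diff

df = pd.read_csv("train.csv")                # hypothetical dataset
results = auto_train(
    dataset=df,
    target="label",                          # hypothetical target column
    time_limit_s=3600,
    tune_for_memory=True,                    # new flag: shrink hyperopt ranges to fit memory
)
# results is an AutoTrainResults holding the hyperopt experiments and best model
```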