Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[automl] refactored to provide extended public API #1235

Merged
merged 8 commits into from
Jul 20, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
92 changes: 69 additions & 23 deletions ludwig/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,10 @@
import copy
import logging
import os
import subprocess
import sys
import tempfile
from collections import OrderedDict
from pprint import pformat
from typing import Dict, List, Optional, Tuple, Union

Expand All @@ -47,7 +50,7 @@
from ludwig.globals import (MODEL_HYPERPARAMETERS_FILE_NAME,
MODEL_WEIGHTS_FILE_NAME,
TRAIN_SET_METADATA_FILE_NAME,
set_disable_progressbar)
set_disable_progressbar, LUDWIG_VERSION)
from ludwig.models.ecd import ECD
from ludwig.models.predictor import (Predictor, calculate_overall_stats,
print_evaluation_stats,
Expand All @@ -58,11 +61,9 @@
DICT_FORMATS,
external_data_reader_registry,
figure_data_format, generate_kfold_splits,
load_json, save_json, load_yaml)
load_json, save_json, load_yaml, load_dataset)
from ludwig.utils.defaults import default_random_seed, merge_with_defaults
from ludwig.utils.misc_utils import (get_experiment_description,
get_file_names, get_from_registry,
get_output_directory)
from ludwig.utils.misc_utils import get_file_names, get_output_directory
from ludwig.utils.print_utils import print_boxed
from ludwig.utils.schema import validate_config

Expand Down Expand Up @@ -1717,30 +1718,18 @@ def kfold_cross_validate(
if not data_format or data_format == 'auto':
data_format = figure_data_format(dataset)

# use appropriate reader to create dataframe
if data_format in DATAFRAME_FORMATS:
data_df = dataset
data_dir = os.getcwd()
elif data_format in DICT_FORMATS:
data_df = pd.DataFrame(dataset)
data_dir = os.getcwd()
elif data_format in CACHEABLE_FORMATS:
data_reader = get_from_registry(data_format,
external_data_reader_registry)
data_df = data_reader(dataset, backend.df_engine.df_lib)
data_dir = os.path.dirname(dataset)
else:
ValueError(
"{} format is not supported for k_fold_cross_validate()"
.format(data_format)
)
data_df = load_dataset(
dataset,
data_format=data_format,
df_lib=backend.df_engine.df_lib
)

kfold_cv_stats = {}
kfold_split_indices = {}

for train_indices, test_indices, fold_num in \
generate_kfold_splits(data_df, num_folds, random_seed):
with tempfile.TemporaryDirectory(dir=data_dir) as temp_dir_name:
with tempfile.TemporaryDirectory() as temp_dir_name:
curr_train_df = data_df.iloc[train_indices]
curr_test_df = data_df.iloc[test_indices]

Expand Down Expand Up @@ -1832,3 +1821,60 @@ def kfold_cross_validate(
logger.info('completed {:d}-fold cross validation'.format(num_folds))

return kfold_cv_stats, kfold_split_indices


def get_experiment_description(
        config,
        dataset=None,
        training_set=None,
        validation_set=None,
        test_set=None,
        training_set_metadata=None,
        data_format=None,
        random_seed=None
):
    """Build an ordered summary of an experiment's provenance and inputs.

    :param config: (dict) the Ludwig configuration used for the experiment.
    :param dataset: (str, optional) path to the full dataset; recorded only
        when given as a string path.
    :param training_set: (str, optional) path to the training split.
    :param validation_set: (str, optional) path to the validation split.
    :param test_set: (str, optional) path to the test split.
    :param training_set_metadata: (optional) metadata describing the
        training set; recorded verbatim when provided.
    :param data_format: (str, optional) format of the dataset; inferred via
        ``figure_data_format`` when missing or ``'auto'``.
    :param random_seed: (int, optional) seed used for the experiment.

    :return: (OrderedDict) description with Ludwig/TF versions, command
        line, optional git commit hash, dataset paths, data format and the
        config itself.
    """
    description = OrderedDict()
    description['ludwig_version'] = LUDWIG_VERSION
    description['command'] = ' '.join(sys.argv)

    # Best-effort: record the current git commit hash when running inside a
    # repository. Any failure (git not installed, not a repo) is ignored,
    # but only the expected error types are swallowed.
    try:
        is_a_git_repo = subprocess.call(
            ['git', 'branch'],
            stderr=subprocess.STDOUT,
            stdout=subprocess.DEVNULL
        ) == 0
        if is_a_git_repo:
            description['commit_hash'] = \
                subprocess.check_output(['git', 'rev-parse', 'HEAD']).decode(
                    'utf-8')[:12]
    except (OSError, subprocess.SubprocessError):
        pass

    if random_seed is not None:
        description['random_seed'] = random_seed

    # Only string paths are recorded; in-memory datasets (DataFrames, dicts)
    # have no stable identifier to store.
    if isinstance(dataset, str):
        description['dataset'] = dataset
    if isinstance(training_set, str):
        description['training_set'] = training_set
    if isinstance(validation_set, str):
        description['validation_set'] = validation_set
    if isinstance(test_set, str):
        description['test_set'] = test_set
    if training_set_metadata is not None:
        description['training_set_metadata'] = training_set_metadata

    # determine data format if not provided or auto
    if not data_format or data_format == 'auto':
        data_format = figure_data_format(
            dataset, training_set, validation_set, test_set
        )

    if data_format:
        description['data_format'] = str(data_format)

    description['config'] = config

    # Imported lazily to keep module import time down and avoid requiring
    # TensorFlow just to import this module's other utilities.
    import tensorflow as tf
    description['tf_version'] = tf.__version__

    return description
1 change: 1 addition & 0 deletions ludwig/automl/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
from ludwig.automl.automl import auto_train, create_auto_config, train_with_config
141 changes: 109 additions & 32 deletions ludwig/automl/automl.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,17 +9,21 @@
(3) Runs hyperparameter optimization experiment
"""
from typing import Dict, Union
import warnings

import numpy as np
import pandas as pd
import warnings
from ludwig.automl.base_config import _create_default_config

from ludwig.api import LudwigModel
from ludwig.automl.base_config import _create_default_config, DatasetInfo
from ludwig.automl.utils import _ray_init
from ludwig.constants import COMBINER, TYPE
from ludwig.hyperopt.run import hyperopt

try:
import dask.dataframe as dd
import ray
from ray.tune import ExperimentAnalysis
except ImportError:
raise ImportError(
' ray is not installed. '
Expand All @@ -31,22 +35,34 @@
OUTPUT_DIR = "."


def _model_select(default_configs):
"""
Performs model selection based on dataset.
Note: Current implementation returns tabnet by default. This will be
improved in subsequent iterations
"""
return default_configs['tabnet']
class AutoTrainResults:
    """Results of an ``auto_train`` run.

    Thin wrapper around the Ray Tune ``ExperimentAnalysis`` produced by the
    hyperopt experiment, exposing convenient accessors for the best trial
    and its trained model.
    """

    def __init__(self, experiment_analysis: ExperimentAnalysis):
        # Keep the raw analysis object; all properties derive from it.
        self._analysis = experiment_analysis

    @property
    def experiment_analysis(self):
        """The underlying Ray Tune ``ExperimentAnalysis`` object."""
        return self._analysis

    @property
    def best_trial_id(self) -> str:
        """Identifier of the best-performing hyperopt trial."""
        return self._analysis.best_trial.trial_id

    @property
    def path_to_best_model(self) -> str:
        """Checkpoint path of the best model found during hyperopt."""
        return self._analysis.best_checkpoint

    @property
    def best_model(self) -> LudwigModel:
        """Load and return the best model from its checkpoint path."""
        return LudwigModel.load(self.path_to_best_model)


def auto_train(
dataset: Union[str, pd.DataFrame, dd.core.DataFrame],
target: str,
time_limit_s: Union[int, float],
output_dir: str = OUTPUT_DIR,
config=None,
):
output_directory: str = OUTPUT_DIR,
**kwargs
) -> AutoTrainResults:
"""
Main auto train API that first builds configs for each model type
(e.g. concat, tabnet, transformer). Then selects model based on dataset
Expand All @@ -56,19 +72,77 @@ def auto_train(

# Inputs
:param dataset: (str) filepath to dataset.
:param target_name: (str) name of target feature
:param target: (str) name of target feature
:param time_limit_s: (int, float) total time allocated to auto_train. acts
as the stopping parameter
:param output_directory: (str) directory into which to write results, defaults to
current working directory.

# Returns
:return: (str) path to best trained model
:return: (AutoTrainResults) results containing hyperopt experiments and best model
"""
config = create_auto_config(dataset, target, time_limit_s)
return train_with_config(
dataset,
config,
output_directory=output_directory,
**kwargs
)


def create_auto_config(
dataset: Union[str, pd.DataFrame, dd.core.DataFrame, DatasetInfo],
target: str,
time_limit_s: Union[int, float],
) -> dict:
"""
Returns an auto-generated Ludwig config with the intent of training
the best model on given given dataset / target in the given time
limit.

# Inputs
:param dataset: (str) filepath to dataset.
:param target: (str) name of target feature
:param time_limit_s: (int, float) total time allocated to auto_train. acts
as the stopping parameter

# Return
:return: (dict) selected model configuration
"""
if config is None:
config = create_auto_config(dataset, target, time_limit_s)
default_configs = _create_default_config(dataset, target, time_limit_s)
model_config = _model_select(default_configs)
return model_config


def train_with_config(
dataset: Union[str, pd.DataFrame, dd.core.DataFrame],
config: dict,
output_directory: str = OUTPUT_DIR,
**kwargs,
) -> AutoTrainResults:
"""
Performs hyperparameter optimization with respect to the given config
and selects the best model.

# Inputs
:param dataset: (str) filepath to dataset.
:param config: (dict) optional Ludwig configuration to use for training, defaults
to `create_auto_config`.
:param output_directory: (str) directory into which to write results, defaults to
current working directory.

# Returns
:return: (AutoTrainResults) results containing hyperopt experiments and best model
"""
_ray_init()
model_name = config[COMBINER][TYPE]
hyperopt_results = _train(config, dataset,
output_dir, model_name=model_name)
experiment_analysis = hyperopt_results.experiment_analysis
hyperopt_results = _train(
config,
dataset,
output_directory=output_directory,
model_name=model_name,
**kwargs
)
# catch edge case where metric_score is nan
# TODO (ASN): Decide how we want to proceed if at least one trial has
# completed
Expand All @@ -80,29 +154,32 @@ def auto_train(
"Consider increasing the time budget for experiment. "
)

autotrain_results = {
'path_to_best_model': experiment_analysis.best_checkpoint,
'trial_id': "_".join(experiment_analysis.best_logdir.split("/")[-1].split("_")[1:])
}
return autotrain_results
experiment_analysis = hyperopt_results.experiment_analysis
return AutoTrainResults(experiment_analysis)


def create_auto_config(dataset, target, time_limit_s) -> dict:
default_configs = _create_default_config(dataset, target, time_limit_s)
model_config = _model_select(default_configs)
return model_config
def _model_select(default_configs):
"""
Performs model selection based on dataset.
Note: Current implementation returns tabnet by default. This will be
improved in subsequent iterations
"""
return default_configs['tabnet']


def _train(
config: Dict,
dataset: Union[str, pd.DataFrame, dd.core.DataFrame],
output_dir: str,
model_name: str
output_directory: str,
model_name: str,
**kwargs
):
hyperopt_results = hyperopt(
config,
dataset=dataset,
output_directory=output_dir,
model_name=model_name
output_directory=output_directory,
model_name=model_name,
backend='local',
**kwargs
)
return hyperopt_results
Loading