From fc55d36af4297e34ece083979652f8bdc92148ae Mon Sep 17 00:00:00 2001 From: Jim Thompson Date: Wed, 18 Nov 2020 20:25:02 -0500 Subject: [PATCH 01/18] feat: add support for transforming numeric predictions --- ludwig/features/numerical_feature.py | 111 ++++++++++++++++++++++++--- 1 file changed, 99 insertions(+), 12 deletions(-) diff --git a/ludwig/features/numerical_feature.py b/ludwig/features/numerical_feature.py index 64d9aa21db8..0e7618ed8bd 100644 --- a/ludwig/features/numerical_feature.py +++ b/ludwig/features/numerical_feature.py @@ -35,6 +35,7 @@ from ludwig.utils.horovod_utils import is_on_master from ludwig.utils.misc_utils import set_default_value from ludwig.utils.misc_utils import set_default_values +from ludwig.utils.misc_utils import get_from_registry logger = logging.getLogger(__name__) @@ -81,17 +82,44 @@ def add_feature_data( dataset[feature[PROC_COLUMN]] = dataset_df[feature[COLUMN]].astype( np.float32).values if preprocessing_parameters['normalization'] is not None: - if preprocessing_parameters['normalization'] == 'zscore': - mean = metadata[feature[NAME]]['mean'] - std = metadata[feature[NAME]]['std'] - dataset[feature[PROC_COLUMN]] = (dataset[ - feature[ - PROC_COLUMN]] - mean) / std - elif preprocessing_parameters['normalization'] == 'minmax': - min_ = metadata[feature[NAME]]['min'] - max_ = metadata[feature[NAME]]['max'] - values = dataset[feature[PROC_COLUMN]] - dataset[feature[PROC_COLUMN]] = (values - min_) / (max_ - min_) + normalization_type = preprocessing_parameters['normalization'] + NumericTransformer = get_from_registry( + normalization_type, + numeric_transformation_registry + ) + + if normalization_type == 'zscore': + numeric_transformer = NumericTransformer( + metadata[feature[NAME]]['mean'], + metadata[feature[NAME]]['std'] + ) + elif normalization_type == 'minmax': + numeric_transformer = NumericTransformer( + metadata[feature[NAME]]['min'], + metadata[feature[NAME]]['max'] + ) + else: + raise ValueError( + 'Normalization "{}" not supported. Valid values are ' + '"minmax" or "zscore"'.format(normalization_type) + ) + + values = dataset[feature[PROC_COLUMN]] + dataset[feature[PROC_COLUMN]] = numeric_transformer.transform( + values) + + # todo: clean up after implementing numeric transformers + # if preprocessing_parameters['normalization'] == 'zscore': + # mean = metadata[feature[NAME]]['mean'] + # std = metadata[feature[NAME]]['std'] + # dataset[feature[PROC_COLUMN]] = (dataset[ + # feature[ + # PROC_COLUMN]] - mean) / std + # elif preprocessing_parameters['normalization'] == 'minmax': + # min_ = metadata[feature[NAME]]['min'] + # max_ = metadata[feature[NAME]]['max'] + # values = dataset[feature[PROC_COLUMN]] + # dataset[feature[PROC_COLUMN]] = (values - min_) / (max_ - min_) class NumericalInputFeature(NumericalFeatureMixin, InputFeature): @@ -267,7 +295,35 @@ def postprocess_predictions( skip_save_unprocessed_output = True if PREDICTIONS in predictions and len(predictions[PREDICTIONS]) > 0: - postprocessed[PREDICTIONS] = predictions[PREDICTIONS].numpy() + if metadata['preprocessing']['normalization'] is not None: + normalization_type = metadata['preprocessing']['normalization'] + NumericTransformer = get_from_registry( + normalization_type, + numeric_transformation_registry + ) + if normalization_type == 'zscore': + numeric_transformer = NumericTransformer( + metadata['mean'], + metadata['std'] + ) + elif normalization_type == 'minmax': + numeric_transformer = NumericTransformer( + metadata['min'], + metadata['max'] + ) + else: + raise ValueError( + 'Normalization "{}" not supported. Valid values are ' + '"minmax" or "zscore"'.format(normalization_type) + ) + + values_to_return = numeric_transformer.inverse_transform( + predictions[PREDICTIONS].numpy() + ) + else: + values_to_return = predictions[PREDICTIONS].numpy() + + postprocessed[PREDICTIONS] = values_to_return if not skip_save_unprocessed_output: np.save( npy_filename.format(name, PREDICTIONS), @@ -314,3 +370,34 @@ def populate_defaults(output_feature): 'None': Regressor, None: Regressor } + + +class ZScoreTransformer: + def __init__(self, mu: float, sigma: float): + self.mu = mu + self.sigma = sigma + + def transform(self, x: np.ndarray) -> np.ndarray: + return (x - self.mu) / self.sigma + + def inverse_transform(self, x: np.ndarray) -> np.ndarray: + return x * self.sigma + self.mu + + +class MinMaxTransformer: + def __init__(self, min_value: float, max_value: float): + self.min_value = min_value + self.max_value = max_value + self.range = max_value - min_value + + def transform(self, x: np.ndarray) -> np.ndarray: + return (x - self.min_value) / self.range + + def inverse_transform(self, x: np.ndarray) -> np.ndarray: + return x * self.range + self.min_value + + +numeric_transformation_registry = { + 'minmax': MinMaxTransformer, + 'zscore': ZScoreTransformer +} From e96fcd2a4b33bbb9d62a96f2b688e4e259d4757e Mon Sep 17 00:00:00 2001 From: Jim Thompson Date: Thu, 19 Nov 2020 20:50:04 -0500 Subject: [PATCH 02/18] feat: add log1p normalization for numerical feature --- ludwig/features/numerical_feature.py | 23 ++++++++++++++++++++++- 1 file changed, 22 insertions(+), 1 deletion(-) diff --git a/ludwig/features/numerical_feature.py b/ludwig/features/numerical_feature.py index 0e7618ed8bd..47200dcf289 100644 --- a/ludwig/features/numerical_feature.py +++ b/ludwig/features/numerical_feature.py @@ -61,6 +61,8 @@ def get_feature_meta(column, preprocessing_parameters): 'min': column.astype(np.float32).min(), 'max': column.astype(np.float32).max() } + elif preprocessing_parameters['normalization'] == 'log1p': + return {} else: logger.info( 'Currently zscore and minmax are the only ' @@ -98,6 +100,8 @@ def add_feature_data( metadata[feature[NAME]]['min'], metadata[feature[NAME]]['max'] ) + elif normalization_type == 'log1p': + numeric_transformer = NumericTransformer() else: raise ValueError( 'Normalization "{}" not supported. Valid values are ' @@ -311,6 +315,8 @@ def postprocess_predictions( metadata['min'], metadata['max'] ) + elif normalization_type == 'log1p': + numeric_transformer = NumericTransformer() else: raise ValueError( 'Normalization "{}" not supported. Valid values are ' @@ -397,7 +403,22 @@ def inverse_transform(self, x: np.ndarray) -> np.ndarray: return x * self.range + self.min_value +class Log1pTransformer: + + def transform(self, x: np.ndarray) -> np.ndarray: + if np.any(x <= 0): + raise ValueError( + 'One or more values are non-positive. ' + 'log1p normalization only defined for positive values.' + ) + return np.log1p(x) + + def inverse_transform(self, x: np.ndarray) -> np.ndarray: + return np.expm1(x) + + numeric_transformation_registry = { 'minmax': MinMaxTransformer, - 'zscore': ZScoreTransformer + 'zscore': ZScoreTransformer, + 'log1p': Log1pTransformer } From 5e076bb396e3328d712d75a7433a0cbbaef2dca9 Mon Sep 17 00:00:00 2001 From: Jim Thompson Date: Fri, 20 Nov 2020 18:55:21 -0500 Subject: [PATCH 03/18] refactor: incorporate reviewer comments --- ludwig/features/numerical_feature.py | 65 +++++----------------------- 1 file changed, 11 insertions(+), 54 deletions(-) diff --git a/ludwig/features/numerical_feature.py b/ludwig/features/numerical_feature.py index 47200dcf289..a2922f12c7e 100644 --- a/ludwig/features/numerical_feature.py +++ b/ludwig/features/numerical_feature.py @@ -90,41 +90,12 @@ def add_feature_data( numeric_transformation_registry ) - if normalization_type == 'zscore': - numeric_transformer = NumericTransformer( - metadata[feature[NAME]]['mean'], - metadata[feature[NAME]]['std'] - ) - elif normalization_type == 'minmax': - numeric_transformer = NumericTransformer( - metadata[feature[NAME]]['min'], - metadata[feature[NAME]]['max'] - ) - elif normalization_type == 'log1p': - numeric_transformer = NumericTransformer() - else: - raise ValueError( - 'Normalization "{}" not supported. Valid values are ' - '"minmax" or "zscore"'.format(normalization_type) - ) + numeric_transformer = NumericTransformer(**metadata[feature[NAME]]) values = dataset[feature[PROC_COLUMN]] dataset[feature[PROC_COLUMN]] = numeric_transformer.transform( values) - # todo: clean up after implementing numeric transformers - # if preprocessing_parameters['normalization'] == 'zscore': - # mean = metadata[feature[NAME]]['mean'] - # std = metadata[feature[NAME]]['std'] - # dataset[feature[PROC_COLUMN]] = (dataset[ - # feature[ - # PROC_COLUMN]] - mean) / std - # elif preprocessing_parameters['normalization'] == 'minmax': - # min_ = metadata[feature[NAME]]['min'] - # max_ = metadata[feature[NAME]]['max'] - # values = dataset[feature[PROC_COLUMN]] - # dataset[feature[PROC_COLUMN]] = (values - min_) / (max_ - min_) - class NumericalInputFeature(NumericalFeatureMixin, InputFeature): encoder = 'passthrough' @@ -305,24 +276,8 @@ def postprocess_predictions( normalization_type, numeric_transformation_registry ) - if normalization_type == 'zscore': - numeric_transformer = NumericTransformer( - metadata['mean'], - metadata['std'] - ) - elif normalization_type == 'minmax': - numeric_transformer = NumericTransformer( - metadata['min'], - metadata['max'] - ) - elif normalization_type == 'log1p': - numeric_transformer = NumericTransformer() - else: - raise ValueError( - 'Normalization "{}" not supported. Valid values are ' - '"minmax" or "zscore"'.format(normalization_type) - ) + numeric_transformer = NumericTransformer(**metadata) values_to_return = numeric_transformer.inverse_transform( predictions[PREDICTIONS].numpy() ) @@ -379,9 +334,9 @@ def populate_defaults(output_feature): class ZScoreTransformer: - def __init__(self, mu: float, sigma: float): - self.mu = mu - self.sigma = sigma + def __init__(self, mean: float = None, std: float = None, **kwargs: dict): + self.mu = mean + self.sigma = std def transform(self, x: np.ndarray) -> np.ndarray: return (x - self.mu) / self.sigma @@ -391,10 +346,10 @@ def inverse_transform(self, x: np.ndarray) -> np.ndarray: class MinMaxTransformer: - def __init__(self, min_value: float, max_value: float): - self.min_value = min_value - self.max_value = max_value - self.range = max_value - min_value + def __init__(self, min: float = None, max: float = None, **kwargs: dict): + self.min_value = min + self.max_value = max + self.range = self.max_value - self.min_value def transform(self, x: np.ndarray) -> np.ndarray: return (x - self.min_value) / self.range @@ -404,6 +359,8 @@ def inverse_transform(self, x: np.ndarray) -> np.ndarray: class Log1pTransformer: + def __init__(self, **kwargs: dict): + pass def transform(self, x: np.ndarray) -> np.ndarray: if np.any(x <= 0): From 43f726132ca1a8ced4493d6869996b9ea802e710 Mon Sep 17 00:00:00 2001 From: Jim Thompson Date: Fri, 20 Nov 2020 18:58:20 -0500 Subject: [PATCH 04/18] doc: incorporated reviewer comments re: error message --- ludwig/features/numerical_feature.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ludwig/features/numerical_feature.py b/ludwig/features/numerical_feature.py index a2922f12c7e..fc40a232800 100644 --- a/ludwig/features/numerical_feature.py +++ b/ludwig/features/numerical_feature.py @@ -366,7 +366,7 @@ def transform(self, x: np.ndarray) -> np.ndarray: if np.any(x <= 0): raise ValueError( 'One or more values are non-positive. ' - 'log1p normalization only defined for positive values.' + 'log1p normalization is defined only for positive values.' ) return np.log1p(x) From 358302f2201aa58e4a4d4e5119eb2354271449c3 Mon Sep 17 00:00:00 2001 From: Jim Thompson Date: Fri, 20 Nov 2020 19:44:30 -0500 Subject: [PATCH 05/18] refactor: incorporated reviewer comments for improved abstraction --- ludwig/features/numerical_feature.py | 50 +++++++++++++++------------- 1 file changed, 27 insertions(+), 23 deletions(-) diff --git a/ludwig/features/numerical_feature.py b/ludwig/features/numerical_feature.py index fc40a232800..ca09e8c4f8b 100644 --- a/ludwig/features/numerical_feature.py +++ b/ludwig/features/numerical_feature.py @@ -83,18 +83,15 @@ def add_feature_data( ): dataset[feature[PROC_COLUMN]] = dataset_df[feature[COLUMN]].astype( np.float32).values - if preprocessing_parameters['normalization'] is not None: - normalization_type = preprocessing_parameters['normalization'] - NumericTransformer = get_from_registry( - normalization_type, - numeric_transformation_registry - ) - numeric_transformer = NumericTransformer(**metadata[feature[NAME]]) + # normalize data as required + numeric_transformer = get_from_registry( + preprocessing_parameters.get('normalization', None), + numeric_transformation_registry + )(**metadata[feature[NAME]]) - values = dataset[feature[PROC_COLUMN]] - dataset[feature[PROC_COLUMN]] = numeric_transformer.transform( - values) + dataset[feature[PROC_COLUMN]] = \ + numeric_transformer.transform(dataset[feature[PROC_COLUMN]]) class NumericalInputFeature(NumericalFeatureMixin, InputFeature): @@ -270,21 +267,16 @@ def postprocess_predictions( skip_save_unprocessed_output = True if PREDICTIONS in predictions and len(predictions[PREDICTIONS]) > 0: - if metadata['preprocessing']['normalization'] is not None: - normalization_type = metadata['preprocessing']['normalization'] - NumericTransformer = get_from_registry( - normalization_type, - numeric_transformation_registry - ) - - numeric_transformer = NumericTransformer(**metadata) - values_to_return = numeric_transformer.inverse_transform( + # as needed convert predictions make to original value space + numeric_transformer = get_from_registry( + metadata['preprocessing'].get('normalization', None), + numeric_transformation_registry + )(**metadata) + postprocessed[PREDICTIONS] = \ + numeric_transformer.inverse_transform( predictions[PREDICTIONS].numpy() ) - else: - values_to_return = predictions[PREDICTIONS].numpy() - postprocessed[PREDICTIONS] = values_to_return if not skip_save_unprocessed_output: np.save( npy_filename.format(name, PREDICTIONS), @@ -374,8 +366,20 @@ def inverse_transform(self, x: np.ndarray) -> np.ndarray: return np.expm1(x) +class IdentityTransformer: + def __init__(self, **kwargs): + pass + + def transform(self, x: np.ndarray) -> np.ndarray: + return x + + def inverse_transform(self, x: np.ndarray) -> np.ndarray: + return x + + numeric_transformation_registry = { 'minmax': MinMaxTransformer, 'zscore': ZScoreTransformer, - 'log1p': Log1pTransformer + 'log1p': Log1pTransformer, + None: IdentityTransformer } From d0d62edcdde3e40101cf8c5951bec66f1bf344b8 Mon Sep 17 00:00:00 2001 From: Jim Thompson Date: Fri, 20 Nov 2020 22:22:46 -0500 Subject: [PATCH 06/18] refactor: add fit_transform_params method --- ludwig/features/numerical_feature.py | 100 +++++++++++++++++++++------ 1 file changed, 77 insertions(+), 23 deletions(-) diff --git a/ludwig/features/numerical_feature.py b/ludwig/features/numerical_feature.py index ca09e8c4f8b..6be4bbf12a2 100644 --- a/ludwig/features/numerical_feature.py +++ b/ludwig/features/numerical_feature.py @@ -50,28 +50,36 @@ class NumericalFeatureMixin(object): @staticmethod def get_feature_meta(column, preprocessing_parameters): - if preprocessing_parameters['normalization'] is not None: - if preprocessing_parameters['normalization'] == 'zscore': - return { - 'mean': column.astype(np.float32).mean(), - 'std': column.astype(np.float32).std() - } - elif preprocessing_parameters['normalization'] == 'minmax': - return { - 'min': column.astype(np.float32).min(), - 'max': column.astype(np.float32).max() - } - elif preprocessing_parameters['normalization'] == 'log1p': - return {} - else: - logger.info( - 'Currently zscore and minmax are the only ' - 'normalization strategies available. No {}'.format( - preprocessing_parameters['normalization']) - ) - return {} - else: - return {} + numeric_transformer = get_from_registry( + preprocessing_parameters.get('normalization', None), + numeric_transformation_registry + )(**preprocessing_parameters) + + return numeric_transformer.fit_transform_params(column) + + # todo clean up if new code works + # if preprocessing_parameters['normalization'] is not None: + # if preprocessing_parameters['normalization'] == 'zscore': + # return { + # 'mean': column.astype(np.float32).mean(), + # 'std': column.astype(np.float32).std() + # } + # elif preprocessing_parameters['normalization'] == 'minmax': + # return { + # 'min': column.astype(np.float32).min(), + # 'max': column.astype(np.float32).max() + # } + # elif preprocessing_parameters['normalization'] == 'log1p': + # return {} + # else: + # logger.info( + # 'Currently zscore and minmax are the only ' + # 'normalization strategies available. No {}'.format( + # preprocessing_parameters['normalization']) + # ) + # return {} + # else: + # return {} @staticmethod def add_feature_data( @@ -327,28 +335,66 @@ def populate_defaults(output_feature): class ZScoreTransformer: def __init__(self, mean: float = None, std: float = None, **kwargs: dict): + # When parameters are None we only need object to use + # the fit_transform_params method, other methods should not be used self.mu = mean self.sigma = std def transform(self, x: np.ndarray) -> np.ndarray: + if self.mu is None or self.sigma is None: + raise ValueError( + 'Numeric transformer needs to be instantiated with ' + 'min and max values.' + ) return (x - self.mu) / self.sigma def inverse_transform(self, x: np.ndarray) -> np.ndarray: + if self.mu is None or self.sigma is None: + raise ValueError( + 'Numeric transformer needs to be instantiated with ' + 'min and max values.' + ) return x * self.sigma + self.mu + @staticmethod + def fit_transform_params(column: np.ndarray) -> dict: + return { + 'mean': column.astype(np.float32).mean(), + 'std': column.astype(np.float32).std() + } + class MinMaxTransformer: def __init__(self, min: float = None, max: float = None, **kwargs: dict): + # When parameters are None we only need object to use + # the fit_transform_params method, other methods should not be used self.min_value = min self.max_value = max - self.range = self.max_value - self.min_value + self.range = None if min is None or max is None else max - min def transform(self, x: np.ndarray) -> np.ndarray: + if self.range is None: + raise ValueError( + 'Numeric transformer needs to be instantiated with ' + 'min and max values.' + ) return (x - self.min_value) / self.range def inverse_transform(self, x: np.ndarray) -> np.ndarray: + if self.range is None: + raise ValueError( + 'Numeric transformer needs to be instantiated with ' + 'min and max values.' + ) return x * self.range + self.min_value + @staticmethod + def fit_transform_params(column: np.ndarray) -> dict: + return { + 'min': column.astype(np.float32).min(), + 'max': column.astype(np.float32).max() + } + class Log1pTransformer: def __init__(self, **kwargs: dict): @@ -365,6 +411,10 @@ def transform(self, x: np.ndarray) -> np.ndarray: def inverse_transform(self, x: np.ndarray) -> np.ndarray: return np.expm1(x) + @staticmethod + def fit_transform_params(column: np.ndarray) -> dict: + return {} + class IdentityTransformer: def __init__(self, **kwargs): @@ -376,6 +426,10 @@ def transform(self, x: np.ndarray) -> np.ndarray: def inverse_transform(self, x: np.ndarray) -> np.ndarray: return x + @staticmethod + def fit_transform_params(column: np.ndarray) -> dict: + return {} + numeric_transformation_registry = { 'minmax': MinMaxTransformer, From 388ca52f7f52e467e3d7ad6990210749cd75fc5b Mon Sep 17 00:00:00 2001 From: Jim Thompson Date: Fri, 20 Nov 2020 22:29:24 -0500 Subject: [PATCH 07/18] doc: fix error message text --- ludwig/features/numerical_feature.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ludwig/features/numerical_feature.py b/ludwig/features/numerical_feature.py index 6be4bbf12a2..57d3e8b9e3e 100644 --- a/ludwig/features/numerical_feature.py +++ b/ludwig/features/numerical_feature.py @@ -344,7 +344,7 @@ def transform(self, x: np.ndarray) -> np.ndarray: if self.mu is None or self.sigma is None: raise ValueError( 'Numeric transformer needs to be instantiated with ' - 'min and max values.' + 'mean and std values.' ) return (x - self.mu) / self.sigma @@ -352,7 +352,7 @@ def inverse_transform(self, x: np.ndarray) -> np.ndarray: if self.mu is None or self.sigma is None: raise ValueError( 'Numeric transformer needs to be instantiated with ' - 'min and max values.' + 'mean and std values.' ) return x * self.sigma + self.mu From 6f446eaef63f00f58dc87780eb43c74e4b51d590 Mon Sep 17 00:00:00 2001 From: Jim Thompson Date: Sun, 22 Nov 2020 20:15:00 -0500 Subject: [PATCH 08/18] refactor: incorporated reviewer comments --- ludwig/features/numerical_feature.py | 21 +-------------------- 1 file changed, 1 insertion(+), 20 deletions(-) diff --git a/ludwig/features/numerical_feature.py b/ludwig/features/numerical_feature.py index 57d3e8b9e3e..d38e224c04d 100644 --- a/ludwig/features/numerical_feature.py +++ b/ludwig/features/numerical_feature.py @@ -53,7 +53,7 @@ def get_feature_meta(column, preprocessing_parameters): numeric_transformer = get_from_registry( preprocessing_parameters.get('normalization', None), numeric_transformation_registry - )(**preprocessing_parameters) + ) return numeric_transformer.fit_transform_params(column) @@ -335,25 +335,13 @@ def populate_defaults(output_feature): class ZScoreTransformer: def __init__(self, mean: float = None, std: float = None, **kwargs: dict): - # When parameters are None we only need object to use - # the fit_transform_params method, other methods should not be used self.mu = mean self.sigma = std def transform(self, x: np.ndarray) -> np.ndarray: - if self.mu is None or self.sigma is None: - raise ValueError( - 'Numeric transformer needs to be instantiated with ' - 'mean and std values.' - ) return (x - self.mu) / self.sigma def inverse_transform(self, x: np.ndarray) -> np.ndarray: - if self.mu is None or self.sigma is None: - raise ValueError( - 'Numeric transformer needs to be instantiated with ' - 'mean and std values.' - ) return x * self.sigma + self.mu @staticmethod @@ -366,18 +354,11 @@ def fit_transform_params(column: np.ndarray) -> dict: class MinMaxTransformer: def __init__(self, min: float = None, max: float = None, **kwargs: dict): - # When parameters are None we only need object to use - # the fit_transform_params method, other methods should not be used self.min_value = min self.max_value = max self.range = None if min is None or max is None else max - min def transform(self, x: np.ndarray) -> np.ndarray: - if self.range is None: - raise ValueError( - 'Numeric transformer needs to be instantiated with ' - 'min and max values.' - ) return (x - self.min_value) / self.range def inverse_transform(self, x: np.ndarray) -> np.ndarray: From 29fc97b20f9e86527e89f934b5f21aad5ed0bd3e Mon Sep 17 00:00:00 2001 From: Jim Thompson Date: Sun, 22 Nov 2020 20:17:04 -0500 Subject: [PATCH 09/18] refactor: old code clean-up --- ludwig/features/numerical_feature.py | 24 ------------------------ 1 file changed, 24 deletions(-) diff --git a/ludwig/features/numerical_feature.py b/ludwig/features/numerical_feature.py index d38e224c04d..af0508401e9 100644 --- a/ludwig/features/numerical_feature.py +++ b/ludwig/features/numerical_feature.py @@ -57,30 +57,6 @@ def get_feature_meta(column, preprocessing_parameters): return numeric_transformer.fit_transform_params(column) - # todo clean up if new code works - # if preprocessing_parameters['normalization'] is not None: - # if preprocessing_parameters['normalization'] == 'zscore': - # return { - # 'mean': column.astype(np.float32).mean(), - # 'std': column.astype(np.float32).std() - # } - # elif preprocessing_parameters['normalization'] == 'minmax': - # return { - # 'min': column.astype(np.float32).min(), - # 'max': column.astype(np.float32).max() - # } - # elif preprocessing_parameters['normalization'] == 'log1p': - # return {} - # else: - # logger.info( - # 'Currently zscore and minmax are the only ' - # 'normalization strategies available. No {}'.format( - # preprocessing_parameters['normalization']) - # ) - # return {} - # else: - # return {} - @staticmethod def add_feature_data( feature, From d6a65cb8bcc5fc9bcc8e238071552e5229058685 Mon Sep 17 00:00:00 2001 From: Jim Thompson Date: Sun, 22 Nov 2020 20:20:24 -0500 Subject: [PATCH 10/18] refactor: marked static methods in numeric transformers --- ludwig/features/numerical_feature.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/ludwig/features/numerical_feature.py b/ludwig/features/numerical_feature.py index af0508401e9..87f08a557ab 100644 --- a/ludwig/features/numerical_feature.py +++ b/ludwig/features/numerical_feature.py @@ -357,7 +357,8 @@ class Log1pTransformer: def __init__(self, **kwargs: dict): pass - def transform(self, x: np.ndarray) -> np.ndarray: + @staticmethod + def transform(x: np.ndarray) -> np.ndarray: if np.any(x <= 0): raise ValueError( 'One or more values are non-positive. ' @@ -365,7 +366,8 @@ def transform(self, x: np.ndarray) -> np.ndarray: ) return np.log1p(x) - def inverse_transform(self, x: np.ndarray) -> np.ndarray: + @staticmethod + def inverse_transform(x: np.ndarray) -> np.ndarray: return np.expm1(x) @staticmethod @@ -377,10 +379,12 @@ class IdentityTransformer: def __init__(self, **kwargs): pass - def transform(self, x: np.ndarray) -> np.ndarray: + @staticmethod + def transform(x: np.ndarray) -> np.ndarray: return x - def inverse_transform(self, x: np.ndarray) -> np.ndarray: + @staticmethod + def inverse_transform(x: np.ndarray) -> np.ndarray: return x @staticmethod From 3eb583796e525f3883bf722bc268dda6cf43bf47 Mon Sep 17 00:00:00 2001 From: Jim Thompson Date: Sun, 22 Nov 2020 20:41:21 -0500 Subject: [PATCH 11/18] refactor: incorporated reviewer comments --- ludwig/features/numerical_feature.py | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/ludwig/features/numerical_feature.py b/ludwig/features/numerical_feature.py index 87f08a557ab..af0508401e9 100644 --- a/ludwig/features/numerical_feature.py +++ b/ludwig/features/numerical_feature.py @@ -357,8 +357,7 @@ class Log1pTransformer: def __init__(self, **kwargs: dict): pass - @staticmethod - def transform(x: np.ndarray) -> np.ndarray: + def transform(self, x: np.ndarray) -> np.ndarray: if np.any(x <= 0): raise ValueError( 'One or more values are non-positive. ' @@ -366,8 +365,7 @@ def transform(x: np.ndarray) -> np.ndarray: ) return np.log1p(x) - @staticmethod - def inverse_transform(x: np.ndarray) -> np.ndarray: + def inverse_transform(self, x: np.ndarray) -> np.ndarray: return np.expm1(x) @staticmethod @@ -379,12 +377,10 @@ class IdentityTransformer: def __init__(self, **kwargs): pass - @staticmethod - def transform(x: np.ndarray) -> np.ndarray: + def transform(self, x: np.ndarray) -> np.ndarray: return x - @staticmethod - def inverse_transform(x: np.ndarray) -> np.ndarray: + def inverse_transform(self, x: np.ndarray) -> np.ndarray: return x @staticmethod From ba184fefb00b2ebf11fa88cbaf8ef78a683d0c0e Mon Sep 17 00:00:00 2001 From: Jim Thompson Date: Sun, 22 Nov 2020 21:26:38 -0500 Subject: [PATCH 12/18] refactor: cache checksum test internal names --- .../test_model_training_options.py | 29 +++++++++++-------- 1 file changed, 17 insertions(+), 12 deletions(-) diff --git a/tests/integration_tests/test_model_training_options.py b/tests/integration_tests/test_model_training_options.py index 5c1b71ad93a..8e8ccb97f5b 100644 --- a/tests/integration_tests/test_model_training_options.py +++ b/tests/integration_tests/test_model_training_options.py @@ -405,48 +405,52 @@ def test_cache_checksum(csv_filename, tmp_path): model = LudwigModel(config) _, _, train_output_directory2 = \ model.train(dataset=source_dataset, output_directory=output_directory) - second_training_timestamp = \ + current_training_timestamp = \ os.path.getmtime(replace_file_extension(source_dataset, 'hdf5')) # time stamps should be the same - assert first_training_timestamp == second_training_timestamp + assert first_training_timestamp == current_training_timestamp # force recreating cache file by changing checksum + prior_training_timestamp = current_training_timestamp config['preprocessing']['text']['most_common_word'] = 2000 model = LudwigModel(config) _, _, train_output_directory3 = \ model.train(dataset=source_dataset, output_directory=output_directory) - third_training_timestamp = \ + current_training_timestamp = \ os.path.getmtime(replace_file_extension(source_dataset, 'hdf5')) # timestamp should differ - assert first_training_timestamp < third_training_timestamp + assert prior_training_timestamp < current_training_timestamp # force recreating cache by updating modification time of source dataset + prior_training_timestamp = current_training_timestamp os.utime(source_dataset) model = LudwigModel(config) _, _, train_output_directory4 = \ model.train(dataset=source_dataset, output_directory=output_directory) - fourth_training_timestamp = \ + current_training_timestamp = \ os.path.getmtime(replace_file_extension(source_dataset, 'hdf5')) # timestamps should be different - assert third_training_timestamp < fourth_training_timestamp + assert prior_training_timestamp < current_training_timestamp # force change in feature preprocessing + prior_training_timestamp = current_training_timestamp input_features = config['input_features'].copy() input_features[0]['preprocessing'] = {'lowercase': True} config['input_features'] = input_features model = LudwigModel(config) _, _, train_output_directory5 = \ model.train(dataset=source_dataset, output_directory=output_directory) - fifth_training_timestamp = \ + current_training_timestamp = \ os.path.getmtime(replace_file_extension(source_dataset, 'hdf5')) # timestamps should be different - assert fourth_training_timestamp < fifth_training_timestamp + assert prior_training_timestamp < current_training_timestamp # force change in features names (and properties) + prior_training_timestamp = current_training_timestamp input_features = [category_feature(vocab_size=5), category_feature()] source_dataset = generate_data(input_features, output_features, source_dataset) @@ -454,19 +458,20 @@ def test_cache_checksum(csv_filename, tmp_path): model = LudwigModel(config) _, _, train_output_directory5 = \ model.train(dataset=source_dataset, output_directory=output_directory) - sixth_training_timestamp = \ + current_training_timestamp = \ os.path.getmtime(replace_file_extension(source_dataset, 'hdf5')) # timestamps should be different - assert fifth_training_timestamp < sixth_training_timestamp + assert prior_training_timestamp < current_training_timestamp # force change in Ludwig version + prior_training_timestamp = current_training_timestamp global_vars.LUDWIG_VERSION = 'new_version' model = LudwigModel(config) _, _, train_output_directory5 = \ model.train(dataset=source_dataset, output_directory=output_directory) - seventh_training_timestamp = \ + current_training_timestamp = \ os.path.getmtime(replace_file_extension(source_dataset, 'hdf5')) # timestamps should be different - assert sixth_training_timestamp < seventh_training_timestamp + assert prior_training_timestamp < current_training_timestamp From dad61dba8da0d68b809df06ede115bc6684844e7 Mon Sep 17 00:00:00 2001 From: Jim Thompson Date: Sun, 22 Nov 2020 23:05:21 -0500 Subject: [PATCH 13/18] feat: add unit test for numeric transformers --- .../test_model_training_options.py | 66 +++++++++++++++++++ 1 file changed, 66 insertions(+) diff --git a/tests/integration_tests/test_model_training_options.py b/tests/integration_tests/test_model_training_options.py index 8e8ccb97f5b..e5e7d6a3c24 100644 --- a/tests/integration_tests/test_model_training_options.py +++ b/tests/integration_tests/test_model_training_options.py @@ -2,6 +2,7 @@ import os.path import re from collections import namedtuple +import logging import numpy as np import pandas as pd @@ -12,8 +13,10 @@ from ludwig import globals as global_vars from ludwig.api import LudwigModel from ludwig.experiment import experiment_cli +from ludwig.features.numerical_feature import numeric_transformation_registry from ludwig.modules.optimization_modules import optimizers_registry from ludwig.utils.data_utils import load_json, replace_file_extension +from ludwig.utils.misc_utils import get_from_registry from tests.integration_tests.utils import category_feature, generate_data RANDOM_SEED = 42 @@ -475,3 +478,66 @@ def test_cache_checksum(csv_filename, tmp_path): # timestamps should be different assert prior_training_timestamp < current_training_timestamp + + +@pytest.mark.parametrize( + 'transformer_key', list(numeric_transformation_registry.keys()) +) +def test_numeric_transformer(transformer_key, tmpdir): + Transformer = get_from_registry(transformer_key, + numeric_transformation_registry) + transformer_name = Transformer().__class__.__name__ + if transformer_name == 'Log1pTransformer': + raw_values = np.random.lognormal(5, 2, size=100) + else: + raw_values = np.random.normal(5, 2, size=100) + + parameters = Transformer.fit_transform_params(raw_values) + if transformer_name in {'Log1pTransformer', 'IdentityTransformer'}: + # should be empty + assert not bool(parameters) + else: + # should not be empty + assert bool(parameters) + + # instantiate numeric transformer + numeric_transfomer = Transformer(**parameters) + + # transform values + transformed_values = numeric_transfomer.transform(raw_values) + + # inverse transform the prior transformed values + reconstructed_values = \ + numeric_transfomer.inverse_transform(transformed_values) + + # should now match + assert np.allclose(raw_values, reconstructed_values) + + # now test numeric transformer with output feature + df = pd.DataFrame(np.array([raw_values, raw_values]).T, columns=['x', 'y']) + config = { + 'input_features': [ + {'name': 'x', 'type': 'numerical'} + ], + 'output_features': [ + {'name': 'y', 'type': 'numerical', + 'preprocessing': {'normalization': transformer_key}} + ], + 'combiner': { + 'type': 'concat', + }, + 'training': { + 'epochs': 2, + 'batch_size': 16, + } + } + + args = { + 'config': config, + 'skip_save_processed_input': True, + 'output_directory': os.path.join(tmpdir, 'results'), + 'logging_level': logging.WARN + } + + # ensure no exceptions are raised + experiment_cli(dataset=df, **args) From 40be903a60a80a8b4ded2c0ba8c71967618f4aa5 Mon Sep 17 00:00:00 2001 From: Jim Thompson Date: Sun, 29 Nov 2020 00:58:26 -0500 Subject: [PATCH 14/18] fix: error retrieving backend df_engine compute --- ludwig/features/numerical_feature.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ludwig/features/numerical_feature.py b/ludwig/features/numerical_feature.py index 9eff0580434..48682aed6f8 100644 --- a/ludwig/features/numerical_feature.py +++ b/ludwig/features/numerical_feature.py @@ -50,7 +50,7 @@ class NumericalFeatureMixin(object): @staticmethod def get_feature_meta(column, preprocessing_parameters, backend): - compute = backend.df_engine_compute + compute = backend.df_engine.compute numeric_transformer = get_from_registry( preprocessing_parameters.get('normalization', None), numeric_transformation_registry From 5bcb7e286362a8618d3eb4bf5210699d869a54f0 Mon Sep 17 00:00:00 2001 From: Jim Thompson Date: Sun, 29 Nov 2020 01:23:27 -0500 Subject: [PATCH 15/18] fix: error preprocessing numeric feature --- ludwig/features/numerical_feature.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/ludwig/features/numerical_feature.py b/ludwig/features/numerical_feature.py index 48682aed6f8..ccf62b92431 100644 --- a/ludwig/features/numerical_feature.py +++ b/ludwig/features/numerical_feature.py @@ -79,6 +79,8 @@ def add_feature_data( proc_df[feature[PROC_COLUMN]] = \ numeric_transformer.transform(proc_df[feature[PROC_COLUMN]]) + return proc_df + class NumericalInputFeature(NumericalFeatureMixin, InputFeature): encoder = 'passthrough' From f2535a6a860b0ce9d1294bfbf8a41688ae5ae2f3 Mon Sep 17 00:00:00 2001 From: Jim Thompson Date: Sun, 29 Nov 2020 09:17:01 -0500 Subject: [PATCH 16/18] refactor: incorporate new backend design --- ludwig/features/numerical_feature.py | 33 +++++++++++++++++++--------- 1 file changed, 23 insertions(+), 10 deletions(-) diff --git a/ludwig/features/numerical_feature.py b/ludwig/features/numerical_feature.py index ccf62b92431..7070a51b087 100644 --- a/ludwig/features/numerical_feature.py +++ b/ludwig/features/numerical_feature.py @@ -50,13 +50,12 @@ class NumericalFeatureMixin(object): @staticmethod def get_feature_meta(column, preprocessing_parameters, backend): - compute = backend.df_engine.compute numeric_transformer = get_from_registry( preprocessing_parameters.get('normalization', None), numeric_transformation_registry ) - return numeric_transformer.fit_transform_params(column) + return numeric_transformer.fit_transform_params(column, backend) @staticmethod def add_feature_data( @@ -327,10 +326,14 @@ def inverse_transform(self, x: np.ndarray) -> np.ndarray: return x * self.sigma + self.mu @staticmethod - def fit_transform_params(column: np.ndarray) -> dict: + def fit_transform_params( + column: np.ndarray, + backend: 'LocalBackend' + ) -> dict: + compute = backend.df_engine.compute return { - 'mean': column.astype(np.float32).mean(), - 'std': column.astype(np.float32).std() + 'mean': compute(column.astype(np.float32).mean()), + 'std': compute(column.astype(np.float32).std()) } @@ -352,10 +355,14 @@ def inverse_transform(self, x: np.ndarray) -> np.ndarray: return x * self.range + self.min_value @staticmethod - def fit_transform_params(column: np.ndarray) -> dict: + def fit_transform_params( + column: np.ndarray, + backend: 'LocalBackend' + ) -> dict: + compute = backend.df_engine.compute return { - 'min': column.astype(np.float32).min(), - 'max': column.astype(np.float32).max() + 'min': compute(column.astype(np.float32).min()), + 'max': compute(column.astype(np.float32).max()) } @@ -375,7 +382,10 @@ def inverse_transform(self, x: np.ndarray) -> np.ndarray: return np.expm1(x) @staticmethod - def fit_transform_params(column: np.ndarray) -> dict: + def fit_transform_params( + column: np.ndarray, + backend: 'LocalBackend' + ) -> dict: return {} @@ -390,7 +400,10 @@ def inverse_transform(self, x: np.ndarray) -> np.ndarray: return x @staticmethod - def fit_transform_params(column: np.ndarray) -> dict: + def fit_transform_params( + column: np.ndarray, + backend: 'LocalBackend' + ) -> dict: return {} From 59fd83b28a45ca87fe04e7dfa6b8996ee4a5aef6 Mon Sep 17 00:00:00 2001 From: Jim Thompson Date: Sun, 29 Nov 2020 09:51:57 -0500 Subject: [PATCH 17/18] refactor: incorporate new backend design to unit test --- tests/integration_tests/test_model_training_options.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/tests/integration_tests/test_model_training_options.py b/tests/integration_tests/test_model_training_options.py index e5e7d6a3c24..f77ec023a20 100644 --- a/tests/integration_tests/test_model_training_options.py +++ b/tests/integration_tests/test_model_training_options.py @@ -12,6 +12,7 @@ from ludwig import globals as global_vars from ludwig.api import LudwigModel +from ludwig.backend import LOCAL_BACKEND from ludwig.experiment import experiment_cli from ludwig.features.numerical_feature import numeric_transformation_registry from ludwig.modules.optimization_modules import optimizers_registry @@ -492,7 +493,8 @@ def test_numeric_transformer(transformer_key, tmpdir): else: raw_values = np.random.normal(5, 2, size=100) - parameters = Transformer.fit_transform_params(raw_values) + backend = LOCAL_BACKEND + parameters = Transformer.fit_transform_params(raw_values, backend) if transformer_name in {'Log1pTransformer', 'IdentityTransformer'}: # should be empty assert not bool(parameters) From 630ef2bf633a4d5dd55f2c65ceda8f3390ea456a Mon Sep 17 00:00:00 2001 From: Jim Thompson Date: Sun, 29 Nov 2020 16:43:30 -0500 Subject: [PATCH 18/18] doc: incorporated reviewer comments --- ludwig/features/numerical_feature.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/ludwig/features/numerical_feature.py b/ludwig/features/numerical_feature.py index 7070a51b087..1e60d55134b 100644 --- a/ludwig/features/numerical_feature.py +++ b/ludwig/features/numerical_feature.py @@ -328,7 +328,7 @@ def inverse_transform(self, x: np.ndarray) -> np.ndarray: @staticmethod def fit_transform_params( column: np.ndarray, - backend: 'LocalBackend' + backend: 'Backend' ) -> dict: compute = backend.df_engine.compute return { @@ -357,7 +357,7 @@ def inverse_transform(self, x: np.ndarray) -> np.ndarray: @staticmethod def fit_transform_params( column: np.ndarray, - backend: 'LocalBackend' + backend: 'Backend' ) -> dict: compute = backend.df_engine.compute return { @@ -384,7 +384,7 @@ def inverse_transform(self, x: np.ndarray) -> np.ndarray: @staticmethod def fit_transform_params( column: np.ndarray, - backend: 'LocalBackend' + backend: 'Backend' ) -> dict: return {} @@ -402,7 +402,7 @@ def inverse_transform(self, x: np.ndarray) -> np.ndarray: @staticmethod def fit_transform_params( column: np.ndarray, - backend: 'LocalBackend' + backend: 'Backend' ) -> dict: return {}