From fc55d36af4297e34ece083979652f8bdc92148ae Mon Sep 17 00:00:00 2001
From: Jim Thompson <jimthompson5802@gmail.com>
Date: Wed, 18 Nov 2020 20:25:02 -0500
Subject: [PATCH 01/18] feat: add support for transforming numeric predictions

---
 ludwig/features/numerical_feature.py | 111 ++++++++++++++++++++++++---
 1 file changed, 99 insertions(+), 12 deletions(-)

diff --git a/ludwig/features/numerical_feature.py b/ludwig/features/numerical_feature.py
index 64d9aa21db8..0e7618ed8bd 100644
--- a/ludwig/features/numerical_feature.py
+++ b/ludwig/features/numerical_feature.py
@@ -35,6 +35,7 @@
 from ludwig.utils.horovod_utils import is_on_master
 from ludwig.utils.misc_utils import set_default_value
 from ludwig.utils.misc_utils import set_default_values
+from ludwig.utils.misc_utils import get_from_registry
 
 logger = logging.getLogger(__name__)
 
@@ -81,17 +82,44 @@ def add_feature_data(
         dataset[feature[PROC_COLUMN]] = dataset_df[feature[COLUMN]].astype(
             np.float32).values
         if preprocessing_parameters['normalization'] is not None:
-            if preprocessing_parameters['normalization'] == 'zscore':
-                mean = metadata[feature[NAME]]['mean']
-                std = metadata[feature[NAME]]['std']
-                dataset[feature[PROC_COLUMN]] = (dataset[
-                                                     feature[
-                                                         PROC_COLUMN]] - mean) / std
-            elif preprocessing_parameters['normalization'] == 'minmax':
-                min_ = metadata[feature[NAME]]['min']
-                max_ = metadata[feature[NAME]]['max']
-                values = dataset[feature[PROC_COLUMN]]
-                dataset[feature[PROC_COLUMN]] = (values - min_) / (max_ - min_)
+            normalization_type = preprocessing_parameters['normalization']
+            NumericTransformer = get_from_registry(
+                normalization_type,
+                numeric_transformation_registry
+            )
+
+            if normalization_type == 'zscore':
+                numeric_transformer = NumericTransformer(
+                    metadata[feature[NAME]]['mean'],
+                    metadata[feature[NAME]]['std']
+                )
+            elif normalization_type == 'minmax':
+                numeric_transformer = NumericTransformer(
+                    metadata[feature[NAME]]['min'],
+                    metadata[feature[NAME]]['max']
+                )
+            else:
+                raise ValueError(
+                    'Normalization "{}" not supported. Valid values are '
+                    '"minmax" or "zscore"'.format(normalization_type)
+                )
+
+            values = dataset[feature[PROC_COLUMN]]
+            dataset[feature[PROC_COLUMN]] = numeric_transformer.transform(
+                values)
+
+            # todo: clean up after implementing numeric transformers
+            # if preprocessing_parameters['normalization'] == 'zscore':
+            #     mean = metadata[feature[NAME]]['mean']
+            #     std = metadata[feature[NAME]]['std']
+            #     dataset[feature[PROC_COLUMN]] = (dataset[
+            #                                          feature[
+            #                                              PROC_COLUMN]] - mean) / std
+            # elif preprocessing_parameters['normalization'] == 'minmax':
+            #     min_ = metadata[feature[NAME]]['min']
+            #     max_ = metadata[feature[NAME]]['max']
+            #     values = dataset[feature[PROC_COLUMN]]
+            #     dataset[feature[PROC_COLUMN]] = (values - min_) / (max_ - min_)
 
 
 class NumericalInputFeature(NumericalFeatureMixin, InputFeature):
@@ -267,7 +295,35 @@ def postprocess_predictions(
             skip_save_unprocessed_output = True
 
         if PREDICTIONS in predictions and len(predictions[PREDICTIONS]) > 0:
-            postprocessed[PREDICTIONS] = predictions[PREDICTIONS].numpy()
+            if metadata['preprocessing']['normalization'] is not None:
+                normalization_type = metadata['preprocessing']['normalization']
+                NumericTransformer = get_from_registry(
+                    normalization_type,
+                    numeric_transformation_registry
+                )
+                if normalization_type == 'zscore':
+                    numeric_transformer = NumericTransformer(
+                        metadata['mean'],
+                        metadata['std']
+                    )
+                elif normalization_type == 'minmax':
+                    numeric_transformer = NumericTransformer(
+                        metadata['min'],
+                        metadata['max']
+                    )
+                else:
+                    raise ValueError(
+                        'Normalization "{}" not supported. Valid values are '
+                        '"minmax" or "zscore"'.format(normalization_type)
+                    )
+
+                values_to_return = numeric_transformer.inverse_transform(
+                    predictions[PREDICTIONS].numpy()
+                )
+            else:
+                values_to_return = predictions[PREDICTIONS].numpy()
+
+            postprocessed[PREDICTIONS] = values_to_return
             if not skip_save_unprocessed_output:
                 np.save(
                     npy_filename.format(name, PREDICTIONS),
@@ -314,3 +370,34 @@ def populate_defaults(output_feature):
         'None': Regressor,
         None: Regressor
     }
+
+
+class ZScoreTransformer:
+    def __init__(self, mu: float, sigma: float):
+        self.mu = mu
+        self.sigma = sigma
+
+    def transform(self, x: np.ndarray) -> np.ndarray:
+        return (x - self.mu) / self.sigma
+
+    def inverse_transform(self, x: np.ndarray) -> np.ndarray:
+        return x * self.sigma + self.mu
+
+
+class MinMaxTransformer:
+    def __init__(self, min_value: float, max_value: float):
+        self.min_value = min_value
+        self.max_value = max_value
+        self.range = max_value - min_value
+
+    def transform(self, x: np.ndarray) -> np.ndarray:
+        return (x - self.min_value) / self.range
+
+    def inverse_transform(self, x: np.ndarray) -> np.ndarray:
+        return x * self.range + self.min_value
+
+
+numeric_transformation_registry = {
+    'minmax': MinMaxTransformer,
+    'zscore': ZScoreTransformer
+}

From e96fcd2a4b33bbb9d62a96f2b688e4e259d4757e Mon Sep 17 00:00:00 2001
From: Jim Thompson <jimthompson5802@gmail.com>
Date: Thu, 19 Nov 2020 20:50:04 -0500
Subject: [PATCH 02/18] feat: add log1p normalization for numerical feature

---
 ludwig/features/numerical_feature.py | 23 ++++++++++++++++++++++-
 1 file changed, 22 insertions(+), 1 deletion(-)

diff --git a/ludwig/features/numerical_feature.py b/ludwig/features/numerical_feature.py
index 0e7618ed8bd..47200dcf289 100644
--- a/ludwig/features/numerical_feature.py
+++ b/ludwig/features/numerical_feature.py
@@ -61,6 +61,8 @@ def get_feature_meta(column, preprocessing_parameters):
                     'min': column.astype(np.float32).min(),
                     'max': column.astype(np.float32).max()
                 }
+            elif preprocessing_parameters['normalization'] == 'log1p':
+                return {}
             else:
                 logger.info(
                     'Currently zscore and minmax are the only '
@@ -98,6 +100,8 @@ def add_feature_data(
                     metadata[feature[NAME]]['min'],
                     metadata[feature[NAME]]['max']
                 )
+            elif normalization_type == 'log1p':
+                numeric_transformer = NumericTransformer()
             else:
                 raise ValueError(
                     'Normalization "{}" not supported. Valid values are '
@@ -311,6 +315,8 @@ def postprocess_predictions(
                         metadata['min'],
                         metadata['max']
                     )
+                elif normalization_type == 'log1p':
+                    numeric_transformer = NumericTransformer()
                 else:
                     raise ValueError(
                         'Normalization "{}" not supported. Valid values are '
@@ -397,7 +403,22 @@ def inverse_transform(self, x: np.ndarray) -> np.ndarray:
         return x * self.range + self.min_value
 
 
+class Log1pTransformer:
+
+    def transform(self, x: np.ndarray) -> np.ndarray:
+        if np.any(x <= 0):
+            raise ValueError(
+                'One or more values are non-positive.  '
+                'log1p normalization only defined for positive values.'
+            )
+        return np.log1p(x)
+
+    def inverse_transform(self, x: np.ndarray) -> np.ndarray:
+        return np.expm1(x)
+
+
 numeric_transformation_registry = {
     'minmax': MinMaxTransformer,
-    'zscore': ZScoreTransformer
+    'zscore': ZScoreTransformer,
+    'log1p': Log1pTransformer
 }

From 5e076bb396e3328d712d75a7433a0cbbaef2dca9 Mon Sep 17 00:00:00 2001
From: Jim Thompson <jimthompson5802@gmail.com>
Date: Fri, 20 Nov 2020 18:55:21 -0500
Subject: [PATCH 03/18] refactor: incorporate reviewer comments

---
 ludwig/features/numerical_feature.py | 65 +++++-----------------------
 1 file changed, 11 insertions(+), 54 deletions(-)

diff --git a/ludwig/features/numerical_feature.py b/ludwig/features/numerical_feature.py
index 47200dcf289..a2922f12c7e 100644
--- a/ludwig/features/numerical_feature.py
+++ b/ludwig/features/numerical_feature.py
@@ -90,41 +90,12 @@ def add_feature_data(
                 numeric_transformation_registry
             )
 
-            if normalization_type == 'zscore':
-                numeric_transformer = NumericTransformer(
-                    metadata[feature[NAME]]['mean'],
-                    metadata[feature[NAME]]['std']
-                )
-            elif normalization_type == 'minmax':
-                numeric_transformer = NumericTransformer(
-                    metadata[feature[NAME]]['min'],
-                    metadata[feature[NAME]]['max']
-                )
-            elif normalization_type == 'log1p':
-                numeric_transformer = NumericTransformer()
-            else:
-                raise ValueError(
-                    'Normalization "{}" not supported. Valid values are '
-                    '"minmax" or "zscore"'.format(normalization_type)
-                )
+            numeric_transformer = NumericTransformer(**metadata[feature[NAME]])
 
             values = dataset[feature[PROC_COLUMN]]
             dataset[feature[PROC_COLUMN]] = numeric_transformer.transform(
                 values)
 
-            # todo: clean up after implementing numeric transformers
-            # if preprocessing_parameters['normalization'] == 'zscore':
-            #     mean = metadata[feature[NAME]]['mean']
-            #     std = metadata[feature[NAME]]['std']
-            #     dataset[feature[PROC_COLUMN]] = (dataset[
-            #                                          feature[
-            #                                              PROC_COLUMN]] - mean) / std
-            # elif preprocessing_parameters['normalization'] == 'minmax':
-            #     min_ = metadata[feature[NAME]]['min']
-            #     max_ = metadata[feature[NAME]]['max']
-            #     values = dataset[feature[PROC_COLUMN]]
-            #     dataset[feature[PROC_COLUMN]] = (values - min_) / (max_ - min_)
-
 
 class NumericalInputFeature(NumericalFeatureMixin, InputFeature):
     encoder = 'passthrough'
@@ -305,24 +276,8 @@ def postprocess_predictions(
                     normalization_type,
                     numeric_transformation_registry
                 )
-                if normalization_type == 'zscore':
-                    numeric_transformer = NumericTransformer(
-                        metadata['mean'],
-                        metadata['std']
-                    )
-                elif normalization_type == 'minmax':
-                    numeric_transformer = NumericTransformer(
-                        metadata['min'],
-                        metadata['max']
-                    )
-                elif normalization_type == 'log1p':
-                    numeric_transformer = NumericTransformer()
-                else:
-                    raise ValueError(
-                        'Normalization "{}" not supported. Valid values are '
-                        '"minmax" or "zscore"'.format(normalization_type)
-                    )
 
+                numeric_transformer = NumericTransformer(**metadata)
                 values_to_return = numeric_transformer.inverse_transform(
                     predictions[PREDICTIONS].numpy()
                 )
@@ -379,9 +334,9 @@ def populate_defaults(output_feature):
 
 
 class ZScoreTransformer:
-    def __init__(self, mu: float, sigma: float):
-        self.mu = mu
-        self.sigma = sigma
+    def __init__(self, mean: float = None, std: float = None, **kwargs: dict):
+        self.mu = mean
+        self.sigma = std
 
     def transform(self, x: np.ndarray) -> np.ndarray:
         return (x - self.mu) / self.sigma
@@ -391,10 +346,10 @@ def inverse_transform(self, x: np.ndarray) -> np.ndarray:
 
 
 class MinMaxTransformer:
-    def __init__(self, min_value: float, max_value: float):
-        self.min_value = min_value
-        self.max_value = max_value
-        self.range = max_value - min_value
+    def __init__(self, min: float = None, max: float = None, **kwargs: dict):
+        self.min_value = min
+        self.max_value = max
+        self.range = self.max_value - self.min_value
 
     def transform(self, x: np.ndarray) -> np.ndarray:
         return (x - self.min_value) / self.range
@@ -404,6 +359,8 @@ def inverse_transform(self, x: np.ndarray) -> np.ndarray:
 
 
 class Log1pTransformer:
+    def __init__(self, **kwargs: dict):
+        pass
 
     def transform(self, x: np.ndarray) -> np.ndarray:
         if np.any(x <= 0):

From 43f726132ca1a8ced4493d6869996b9ea802e710 Mon Sep 17 00:00:00 2001
From: Jim Thompson <jimthompson5802@gmail.com>
Date: Fri, 20 Nov 2020 18:58:20 -0500
Subject: [PATCH 04/18] doc: incorporated reviewer comments re: error message

---
 ludwig/features/numerical_feature.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ludwig/features/numerical_feature.py b/ludwig/features/numerical_feature.py
index a2922f12c7e..fc40a232800 100644
--- a/ludwig/features/numerical_feature.py
+++ b/ludwig/features/numerical_feature.py
@@ -366,7 +366,7 @@ def transform(self, x: np.ndarray) -> np.ndarray:
         if np.any(x <= 0):
             raise ValueError(
                 'One or more values are non-positive.  '
-                'log1p normalization only defined for positive values.'
+                'log1p normalization is defined only for positive values.'
             )
         return np.log1p(x)
 

From 358302f2201aa58e4a4d4e5119eb2354271449c3 Mon Sep 17 00:00:00 2001
From: Jim Thompson <jimthompson5802@gmail.com>
Date: Fri, 20 Nov 2020 19:44:30 -0500
Subject: [PATCH 05/18] refactor: incorporated reviewer comments for improved
 abstraction

---
 ludwig/features/numerical_feature.py | 50 +++++++++++++++-------------
 1 file changed, 27 insertions(+), 23 deletions(-)

diff --git a/ludwig/features/numerical_feature.py b/ludwig/features/numerical_feature.py
index fc40a232800..ca09e8c4f8b 100644
--- a/ludwig/features/numerical_feature.py
+++ b/ludwig/features/numerical_feature.py
@@ -83,18 +83,15 @@ def add_feature_data(
     ):
         dataset[feature[PROC_COLUMN]] = dataset_df[feature[COLUMN]].astype(
             np.float32).values
-        if preprocessing_parameters['normalization'] is not None:
-            normalization_type = preprocessing_parameters['normalization']
-            NumericTransformer = get_from_registry(
-                normalization_type,
-                numeric_transformation_registry
-            )
 
-            numeric_transformer = NumericTransformer(**metadata[feature[NAME]])
+        # normalize data as required
+        numeric_transformer = get_from_registry(
+            preprocessing_parameters.get('normalization', None),
+            numeric_transformation_registry
+        )(**metadata[feature[NAME]])
 
-            values = dataset[feature[PROC_COLUMN]]
-            dataset[feature[PROC_COLUMN]] = numeric_transformer.transform(
-                values)
+        dataset[feature[PROC_COLUMN]] = \
+            numeric_transformer.transform(dataset[feature[PROC_COLUMN]])
 
 
 class NumericalInputFeature(NumericalFeatureMixin, InputFeature):
@@ -270,21 +267,16 @@ def postprocess_predictions(
             skip_save_unprocessed_output = True
 
         if PREDICTIONS in predictions and len(predictions[PREDICTIONS]) > 0:
-            if metadata['preprocessing']['normalization'] is not None:
-                normalization_type = metadata['preprocessing']['normalization']
-                NumericTransformer = get_from_registry(
-                    normalization_type,
-                    numeric_transformation_registry
-                )
-
-                numeric_transformer = NumericTransformer(**metadata)
-                values_to_return = numeric_transformer.inverse_transform(
+            # as needed convert predictions back to original value space
+            numeric_transformer = get_from_registry(
+                metadata['preprocessing'].get('normalization', None),
+                numeric_transformation_registry
+            )(**metadata)
+            postprocessed[PREDICTIONS] = \
+                numeric_transformer.inverse_transform(
                     predictions[PREDICTIONS].numpy()
                 )
-            else:
-                values_to_return = predictions[PREDICTIONS].numpy()
 
-            postprocessed[PREDICTIONS] = values_to_return
             if not skip_save_unprocessed_output:
                 np.save(
                     npy_filename.format(name, PREDICTIONS),
@@ -374,8 +366,20 @@ def inverse_transform(self, x: np.ndarray) -> np.ndarray:
         return np.expm1(x)
 
 
+class IdentityTransformer:
+    def __init__(self, **kwargs):
+        pass
+
+    def transform(self, x: np.ndarray) -> np.ndarray:
+        return x
+
+    def inverse_transform(self, x: np.ndarray) -> np.ndarray:
+        return x
+
+
 numeric_transformation_registry = {
     'minmax': MinMaxTransformer,
     'zscore': ZScoreTransformer,
-    'log1p': Log1pTransformer
+    'log1p': Log1pTransformer,
+    None: IdentityTransformer
 }

From d0d62edcdde3e40101cf8c5951bec66f1bf344b8 Mon Sep 17 00:00:00 2001
From: Jim Thompson <jimthompson5802@gmail.com>
Date: Fri, 20 Nov 2020 22:22:46 -0500
Subject: [PATCH 06/18] refactor: add fit_transform_params method

---
 ludwig/features/numerical_feature.py | 100 +++++++++++++++++++++------
 1 file changed, 77 insertions(+), 23 deletions(-)

diff --git a/ludwig/features/numerical_feature.py b/ludwig/features/numerical_feature.py
index ca09e8c4f8b..6be4bbf12a2 100644
--- a/ludwig/features/numerical_feature.py
+++ b/ludwig/features/numerical_feature.py
@@ -50,28 +50,36 @@ class NumericalFeatureMixin(object):
 
     @staticmethod
     def get_feature_meta(column, preprocessing_parameters):
-        if preprocessing_parameters['normalization'] is not None:
-            if preprocessing_parameters['normalization'] == 'zscore':
-                return {
-                    'mean': column.astype(np.float32).mean(),
-                    'std': column.astype(np.float32).std()
-                }
-            elif preprocessing_parameters['normalization'] == 'minmax':
-                return {
-                    'min': column.astype(np.float32).min(),
-                    'max': column.astype(np.float32).max()
-                }
-            elif preprocessing_parameters['normalization'] == 'log1p':
-                return {}
-            else:
-                logger.info(
-                    'Currently zscore and minmax are the only '
-                    'normalization strategies available. No {}'.format(
-                        preprocessing_parameters['normalization'])
-                )
-                return {}
-        else:
-            return {}
+        numeric_transformer = get_from_registry(
+            preprocessing_parameters.get('normalization', None),
+            numeric_transformation_registry
+        )(**preprocessing_parameters)
+
+        return numeric_transformer.fit_transform_params(column)
+
+        # todo clean up if new code works
+        # if preprocessing_parameters['normalization'] is not None:
+        #     if preprocessing_parameters['normalization'] == 'zscore':
+        #         return {
+        #             'mean': column.astype(np.float32).mean(),
+        #             'std': column.astype(np.float32).std()
+        #         }
+        #     elif preprocessing_parameters['normalization'] == 'minmax':
+        #         return {
+        #             'min': column.astype(np.float32).min(),
+        #             'max': column.astype(np.float32).max()
+        #         }
+        #     elif preprocessing_parameters['normalization'] == 'log1p':
+        #         return {}
+        #     else:
+        #         logger.info(
+        #             'Currently zscore and minmax are the only '
+        #             'normalization strategies available. No {}'.format(
+        #                 preprocessing_parameters['normalization'])
+        #         )
+        #         return {}
+        # else:
+        #     return {}
 
     @staticmethod
     def add_feature_data(
@@ -327,28 +335,66 @@ def populate_defaults(output_feature):
 
 class ZScoreTransformer:
     def __init__(self, mean: float = None, std: float = None, **kwargs: dict):
+        # When parameters are None we only need object to use
+        # the fit_transform_params method, other methods should not be used
         self.mu = mean
         self.sigma = std
 
     def transform(self, x: np.ndarray) -> np.ndarray:
+        if self.mu is None or self.sigma is None:
+            raise ValueError(
+                'Numeric transformer needs to be instantiated with '
+                'min and max values.'
+            )
         return (x - self.mu) / self.sigma
 
     def inverse_transform(self, x: np.ndarray) -> np.ndarray:
+        if self.mu is None or self.sigma is None:
+            raise ValueError(
+                'Numeric transformer needs to be instantiated with '
+                'min and max values.'
+            )
         return x * self.sigma + self.mu
 
+    @staticmethod
+    def fit_transform_params(column: np.ndarray) -> dict:
+        return {
+            'mean': column.astype(np.float32).mean(),
+            'std': column.astype(np.float32).std()
+        }
+
 
 class MinMaxTransformer:
     def __init__(self, min: float = None, max: float = None, **kwargs: dict):
+        # When parameters are None we only need object to use
+        # the fit_transform_params method, other methods should not be used
         self.min_value = min
         self.max_value = max
-        self.range = self.max_value - self.min_value
+        self.range = None if min is None or max is None else max - min
 
     def transform(self, x: np.ndarray) -> np.ndarray:
+        if self.range is None:
+            raise ValueError(
+                'Numeric transformer needs to be instantiated with '
+                'min and max values.'
+            )
         return (x - self.min_value) / self.range
 
     def inverse_transform(self, x: np.ndarray) -> np.ndarray:
+        if self.range is None:
+            raise ValueError(
+                'Numeric transformer needs to be instantiated with '
+                'min and max values.'
+            )
         return x * self.range + self.min_value
 
+    @staticmethod
+    def fit_transform_params(column: np.ndarray) -> dict:
+        return {
+            'min': column.astype(np.float32).min(),
+            'max': column.astype(np.float32).max()
+        }
+
 
 class Log1pTransformer:
     def __init__(self, **kwargs: dict):
@@ -365,6 +411,10 @@ def transform(self, x: np.ndarray) -> np.ndarray:
     def inverse_transform(self, x: np.ndarray) -> np.ndarray:
         return np.expm1(x)
 
+    @staticmethod
+    def fit_transform_params(column: np.ndarray) -> dict:
+        return {}
+
 
 class IdentityTransformer:
     def __init__(self, **kwargs):
@@ -376,6 +426,10 @@ def transform(self, x: np.ndarray) -> np.ndarray:
     def inverse_transform(self, x: np.ndarray) -> np.ndarray:
         return x
 
+    @staticmethod
+    def fit_transform_params(column: np.ndarray) -> dict:
+        return {}
+
 
 numeric_transformation_registry = {
     'minmax': MinMaxTransformer,

From 388ca52f7f52e467e3d7ad6990210749cd75fc5b Mon Sep 17 00:00:00 2001
From: Jim Thompson <jimthompson5802@gmail.com>
Date: Fri, 20 Nov 2020 22:29:24 -0500
Subject: [PATCH 07/18] doc: fix error message text

---
 ludwig/features/numerical_feature.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/ludwig/features/numerical_feature.py b/ludwig/features/numerical_feature.py
index 6be4bbf12a2..57d3e8b9e3e 100644
--- a/ludwig/features/numerical_feature.py
+++ b/ludwig/features/numerical_feature.py
@@ -344,7 +344,7 @@ def transform(self, x: np.ndarray) -> np.ndarray:
         if self.mu is None or self.sigma is None:
             raise ValueError(
                 'Numeric transformer needs to be instantiated with '
-                'min and max values.'
+                'mean and std values.'
             )
         return (x - self.mu) / self.sigma
 
@@ -352,7 +352,7 @@ def inverse_transform(self, x: np.ndarray) -> np.ndarray:
         if self.mu is None or self.sigma is None:
             raise ValueError(
                 'Numeric transformer needs to be instantiated with '
-                'min and max values.'
+                'mean and std values.'
             )
         return x * self.sigma + self.mu
 

From 6f446eaef63f00f58dc87780eb43c74e4b51d590 Mon Sep 17 00:00:00 2001
From: Jim Thompson <jimthompson5802@gmail.com>
Date: Sun, 22 Nov 2020 20:15:00 -0500
Subject: [PATCH 08/18] refactor: incorporated reviewer comments

---
 ludwig/features/numerical_feature.py | 21 +--------------------
 1 file changed, 1 insertion(+), 20 deletions(-)

diff --git a/ludwig/features/numerical_feature.py b/ludwig/features/numerical_feature.py
index 57d3e8b9e3e..d38e224c04d 100644
--- a/ludwig/features/numerical_feature.py
+++ b/ludwig/features/numerical_feature.py
@@ -53,7 +53,7 @@ def get_feature_meta(column, preprocessing_parameters):
         numeric_transformer = get_from_registry(
             preprocessing_parameters.get('normalization', None),
             numeric_transformation_registry
-        )(**preprocessing_parameters)
+        )
 
         return numeric_transformer.fit_transform_params(column)
 
@@ -335,25 +335,13 @@ def populate_defaults(output_feature):
 
 class ZScoreTransformer:
     def __init__(self, mean: float = None, std: float = None, **kwargs: dict):
-        # When parameters are None we only need object to use
-        # the fit_transform_params method, other methods should not be used
         self.mu = mean
         self.sigma = std
 
     def transform(self, x: np.ndarray) -> np.ndarray:
-        if self.mu is None or self.sigma is None:
-            raise ValueError(
-                'Numeric transformer needs to be instantiated with '
-                'mean and std values.'
-            )
         return (x - self.mu) / self.sigma
 
     def inverse_transform(self, x: np.ndarray) -> np.ndarray:
-        if self.mu is None or self.sigma is None:
-            raise ValueError(
-                'Numeric transformer needs to be instantiated with '
-                'mean and std values.'
-            )
         return x * self.sigma + self.mu
 
     @staticmethod
@@ -366,18 +354,11 @@ def fit_transform_params(column: np.ndarray) -> dict:
 
 class MinMaxTransformer:
     def __init__(self, min: float = None, max: float = None, **kwargs: dict):
-        # When parameters are None we only need object to use
-        # the fit_transform_params method, other methods should not be used
         self.min_value = min
         self.max_value = max
         self.range = None if min is None or max is None else max - min
 
     def transform(self, x: np.ndarray) -> np.ndarray:
-        if self.range is None:
-            raise ValueError(
-                'Numeric transformer needs to be instantiated with '
-                'min and max values.'
-            )
         return (x - self.min_value) / self.range
 
     def inverse_transform(self, x: np.ndarray) -> np.ndarray:

From 29fc97b20f9e86527e89f934b5f21aad5ed0bd3e Mon Sep 17 00:00:00 2001
From: Jim Thompson <jimthompson5802@gmail.com>
Date: Sun, 22 Nov 2020 20:17:04 -0500
Subject: [PATCH 09/18] refactor: old code clean-up

---
 ludwig/features/numerical_feature.py | 24 ------------------------
 1 file changed, 24 deletions(-)

diff --git a/ludwig/features/numerical_feature.py b/ludwig/features/numerical_feature.py
index d38e224c04d..af0508401e9 100644
--- a/ludwig/features/numerical_feature.py
+++ b/ludwig/features/numerical_feature.py
@@ -57,30 +57,6 @@ def get_feature_meta(column, preprocessing_parameters):
 
         return numeric_transformer.fit_transform_params(column)
 
-        # todo clean up if new code works
-        # if preprocessing_parameters['normalization'] is not None:
-        #     if preprocessing_parameters['normalization'] == 'zscore':
-        #         return {
-        #             'mean': column.astype(np.float32).mean(),
-        #             'std': column.astype(np.float32).std()
-        #         }
-        #     elif preprocessing_parameters['normalization'] == 'minmax':
-        #         return {
-        #             'min': column.astype(np.float32).min(),
-        #             'max': column.astype(np.float32).max()
-        #         }
-        #     elif preprocessing_parameters['normalization'] == 'log1p':
-        #         return {}
-        #     else:
-        #         logger.info(
-        #             'Currently zscore and minmax are the only '
-        #             'normalization strategies available. No {}'.format(
-        #                 preprocessing_parameters['normalization'])
-        #         )
-        #         return {}
-        # else:
-        #     return {}
-
     @staticmethod
     def add_feature_data(
             feature,

From d6a65cb8bcc5fc9bcc8e238071552e5229058685 Mon Sep 17 00:00:00 2001
From: Jim Thompson <jimthompson5802@gmail.com>
Date: Sun, 22 Nov 2020 20:20:24 -0500
Subject: [PATCH 10/18] refactor: marked static methods in numeric transformers

---
 ludwig/features/numerical_feature.py | 12 ++++++++----
 1 file changed, 8 insertions(+), 4 deletions(-)

diff --git a/ludwig/features/numerical_feature.py b/ludwig/features/numerical_feature.py
index af0508401e9..87f08a557ab 100644
--- a/ludwig/features/numerical_feature.py
+++ b/ludwig/features/numerical_feature.py
@@ -357,7 +357,8 @@ class Log1pTransformer:
     def __init__(self, **kwargs: dict):
         pass
 
-    def transform(self, x: np.ndarray) -> np.ndarray:
+    @staticmethod
+    def transform(x: np.ndarray) -> np.ndarray:
         if np.any(x <= 0):
             raise ValueError(
                 'One or more values are non-positive.  '
@@ -365,7 +366,8 @@ def transform(self, x: np.ndarray) -> np.ndarray:
             )
         return np.log1p(x)
 
-    def inverse_transform(self, x: np.ndarray) -> np.ndarray:
+    @staticmethod
+    def inverse_transform(x: np.ndarray) -> np.ndarray:
         return np.expm1(x)
 
     @staticmethod
@@ -377,10 +379,12 @@ class IdentityTransformer:
     def __init__(self, **kwargs):
         pass
 
-    def transform(self, x: np.ndarray) -> np.ndarray:
+    @staticmethod
+    def transform(x: np.ndarray) -> np.ndarray:
         return x
 
-    def inverse_transform(self, x: np.ndarray) -> np.ndarray:
+    @staticmethod
+    def inverse_transform(x: np.ndarray) -> np.ndarray:
         return x
 
     @staticmethod

From 3eb583796e525f3883bf722bc268dda6cf43bf47 Mon Sep 17 00:00:00 2001
From: Jim Thompson <jimthompson5802@gmail.com>
Date: Sun, 22 Nov 2020 20:41:21 -0500
Subject: [PATCH 11/18] refactor: revert static transformer methods per reviewer comments

---
 ludwig/features/numerical_feature.py | 12 ++++--------
 1 file changed, 4 insertions(+), 8 deletions(-)

diff --git a/ludwig/features/numerical_feature.py b/ludwig/features/numerical_feature.py
index 87f08a557ab..af0508401e9 100644
--- a/ludwig/features/numerical_feature.py
+++ b/ludwig/features/numerical_feature.py
@@ -357,8 +357,7 @@ class Log1pTransformer:
     def __init__(self, **kwargs: dict):
         pass
 
-    @staticmethod
-    def transform(x: np.ndarray) -> np.ndarray:
+    def transform(self, x: np.ndarray) -> np.ndarray:
         if np.any(x <= 0):
             raise ValueError(
                 'One or more values are non-positive.  '
@@ -366,8 +365,7 @@ def transform(x: np.ndarray) -> np.ndarray:
             )
         return np.log1p(x)
 
-    @staticmethod
-    def inverse_transform(x: np.ndarray) -> np.ndarray:
+    def inverse_transform(self, x: np.ndarray) -> np.ndarray:
         return np.expm1(x)
 
     @staticmethod
@@ -379,12 +377,10 @@ class IdentityTransformer:
     def __init__(self, **kwargs):
         pass
 
-    @staticmethod
-    def transform(x: np.ndarray) -> np.ndarray:
+    def transform(self, x: np.ndarray) -> np.ndarray:
         return x
 
-    @staticmethod
-    def inverse_transform(x: np.ndarray) -> np.ndarray:
+    def inverse_transform(self, x: np.ndarray) -> np.ndarray:
         return x
 
     @staticmethod

From ba184fefb00b2ebf11fa88cbaf8ef78a683d0c0e Mon Sep 17 00:00:00 2001
From: Jim Thompson <jimthompson5802@gmail.com>
Date: Sun, 22 Nov 2020 21:26:38 -0500
Subject: [PATCH 12/18] refactor: rename timestamp variables in cache checksum test

---
 .../test_model_training_options.py            | 29 +++++++++++--------
 1 file changed, 17 insertions(+), 12 deletions(-)

diff --git a/tests/integration_tests/test_model_training_options.py b/tests/integration_tests/test_model_training_options.py
index 5c1b71ad93a..8e8ccb97f5b 100644
--- a/tests/integration_tests/test_model_training_options.py
+++ b/tests/integration_tests/test_model_training_options.py
@@ -405,48 +405,52 @@ def test_cache_checksum(csv_filename, tmp_path):
     model = LudwigModel(config)
     _, _, train_output_directory2 = \
         model.train(dataset=source_dataset, output_directory=output_directory)
-    second_training_timestamp = \
+    current_training_timestamp = \
         os.path.getmtime(replace_file_extension(source_dataset, 'hdf5'))
 
     # time stamps should be the same
-    assert first_training_timestamp == second_training_timestamp
+    assert first_training_timestamp == current_training_timestamp
 
     # force recreating cache file by changing checksum
+    prior_training_timestamp = current_training_timestamp
     config['preprocessing']['text']['most_common_word'] = 2000
     model = LudwigModel(config)
     _, _, train_output_directory3 = \
         model.train(dataset=source_dataset, output_directory=output_directory)
-    third_training_timestamp = \
+    current_training_timestamp = \
         os.path.getmtime(replace_file_extension(source_dataset, 'hdf5'))
 
     # timestamp should differ
-    assert first_training_timestamp < third_training_timestamp
+    assert prior_training_timestamp < current_training_timestamp
 
     # force recreating cache by updating modification time of source dataset
+    prior_training_timestamp = current_training_timestamp
     os.utime(source_dataset)
     model = LudwigModel(config)
     _, _, train_output_directory4 = \
         model.train(dataset=source_dataset, output_directory=output_directory)
-    fourth_training_timestamp = \
+    current_training_timestamp = \
         os.path.getmtime(replace_file_extension(source_dataset, 'hdf5'))
 
     # timestamps should be different
-    assert third_training_timestamp < fourth_training_timestamp
+    assert prior_training_timestamp < current_training_timestamp
 
     # force change in feature preprocessing
+    prior_training_timestamp = current_training_timestamp
     input_features = config['input_features'].copy()
     input_features[0]['preprocessing'] = {'lowercase': True}
     config['input_features'] = input_features
     model = LudwigModel(config)
     _, _, train_output_directory5 = \
         model.train(dataset=source_dataset, output_directory=output_directory)
-    fifth_training_timestamp = \
+    current_training_timestamp = \
         os.path.getmtime(replace_file_extension(source_dataset, 'hdf5'))
 
     # timestamps should be different
-    assert fourth_training_timestamp < fifth_training_timestamp
+    assert prior_training_timestamp < current_training_timestamp
 
     # force change in features names (and properties)
+    prior_training_timestamp = current_training_timestamp
     input_features = [category_feature(vocab_size=5), category_feature()]
     source_dataset = generate_data(input_features, output_features,
                                    source_dataset)
@@ -454,19 +458,20 @@ def test_cache_checksum(csv_filename, tmp_path):
     model = LudwigModel(config)
     _, _, train_output_directory5 = \
         model.train(dataset=source_dataset, output_directory=output_directory)
-    sixth_training_timestamp = \
+    current_training_timestamp = \
         os.path.getmtime(replace_file_extension(source_dataset, 'hdf5'))
 
     # timestamps should be different
-    assert fifth_training_timestamp < sixth_training_timestamp
+    assert prior_training_timestamp < current_training_timestamp
 
     # force change in Ludwig version
+    prior_training_timestamp = current_training_timestamp
     global_vars.LUDWIG_VERSION = 'new_version'
     model = LudwigModel(config)
     _, _, train_output_directory5 = \
         model.train(dataset=source_dataset, output_directory=output_directory)
-    seventh_training_timestamp = \
+    current_training_timestamp = \
         os.path.getmtime(replace_file_extension(source_dataset, 'hdf5'))
 
     # timestamps should be different
-    assert sixth_training_timestamp < seventh_training_timestamp
+    assert prior_training_timestamp < current_training_timestamp

From dad61dba8da0d68b809df06ede115bc6684844e7 Mon Sep 17 00:00:00 2001
From: Jim Thompson <jimthompson5802@gmail.com>
Date: Sun, 22 Nov 2020 23:05:21 -0500
Subject: [PATCH 13/18] feat: add unit test for numeric transformers

---
 .../test_model_training_options.py            | 66 +++++++++++++++++++
 1 file changed, 66 insertions(+)

diff --git a/tests/integration_tests/test_model_training_options.py b/tests/integration_tests/test_model_training_options.py
index 8e8ccb97f5b..e5e7d6a3c24 100644
--- a/tests/integration_tests/test_model_training_options.py
+++ b/tests/integration_tests/test_model_training_options.py
@@ -2,6 +2,7 @@
 import os.path
 import re
 from collections import namedtuple
+import logging
 
 import numpy as np
 import pandas as pd
@@ -12,8 +13,10 @@
 from ludwig import globals as global_vars
 from ludwig.api import LudwigModel
 from ludwig.experiment import experiment_cli
+from ludwig.features.numerical_feature import numeric_transformation_registry
 from ludwig.modules.optimization_modules import optimizers_registry
 from ludwig.utils.data_utils import load_json, replace_file_extension
+from ludwig.utils.misc_utils import get_from_registry
 from tests.integration_tests.utils import category_feature, generate_data
 
 RANDOM_SEED = 42
@@ -475,3 +478,66 @@ def test_cache_checksum(csv_filename, tmp_path):
 
     # timestamps should be different
     assert prior_training_timestamp < current_training_timestamp
+
+
+@pytest.mark.parametrize(
+    'transformer_key', list(numeric_transformation_registry.keys())
+)
+def test_numeric_transformer(transformer_key, tmpdir):
+    Transformer = get_from_registry(transformer_key,
+                                    numeric_transformation_registry)
+    transformer_name = Transformer().__class__.__name__
+    if transformer_name == 'Log1pTransformer':
+        raw_values = np.random.lognormal(5, 2, size=100)
+    else:
+        raw_values = np.random.normal(5, 2, size=100)
+
+    parameters = Transformer.fit_transform_params(raw_values)
+    if transformer_name in {'Log1pTransformer', 'IdentityTransformer'}:
+        # should be empty
+        assert not bool(parameters)
+    else:
+        # should not be empty
+        assert bool(parameters)
+
+    # instantiate numeric transformer
+    numeric_transfomer = Transformer(**parameters)
+
+    # transform values
+    transformed_values = numeric_transfomer.transform(raw_values)
+
+    # inverse transform the prior transformed values
+    reconstructed_values = \
+        numeric_transfomer.inverse_transform(transformed_values)
+
+    # should now match
+    assert np.allclose(raw_values, reconstructed_values)
+
+    # now test numeric transformer with output feature
+    df = pd.DataFrame(np.array([raw_values, raw_values]).T, columns=['x', 'y'])
+    config = {
+        'input_features': [
+            {'name': 'x', 'type': 'numerical'}
+        ],
+        'output_features': [
+            {'name': 'y', 'type': 'numerical',
+             'preprocessing': {'normalization': transformer_key}}
+        ],
+        'combiner': {
+            'type': 'concat',
+        },
+        'training': {
+            'epochs': 2,
+            'batch_size': 16,
+        }
+    }
+
+    args = {
+        'config': config,
+        'skip_save_processed_input': True,
+        'output_directory': os.path.join(tmpdir, 'results'),
+        'logging_level': logging.WARN
+    }
+
+    # ensure no exceptions are raised
+    experiment_cli(dataset=df, **args)

From 40be903a60a80a8b4ded2c0ba8c71967618f4aa5 Mon Sep 17 00:00:00 2001
From: Jim Thompson <jimthompson5802@gmail.com>
Date: Sun, 29 Nov 2020 00:58:26 -0500
Subject: [PATCH 14/18] fix: error retrieving backend df_engine compute

---
 ludwig/features/numerical_feature.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/ludwig/features/numerical_feature.py b/ludwig/features/numerical_feature.py
index 9eff0580434..48682aed6f8 100644
--- a/ludwig/features/numerical_feature.py
+++ b/ludwig/features/numerical_feature.py
@@ -50,7 +50,7 @@ class NumericalFeatureMixin(object):
 
     @staticmethod
     def get_feature_meta(column, preprocessing_parameters, backend):
-        compute = backend.df_engine_compute
+        compute = backend.df_engine.compute
         numeric_transformer = get_from_registry(
             preprocessing_parameters.get('normalization', None),
             numeric_transformation_registry

From 5bcb7e286362a8618d3eb4bf5210699d869a54f0 Mon Sep 17 00:00:00 2001
From: Jim Thompson <jimthompson5802@gmail.com>
Date: Sun, 29 Nov 2020 01:23:27 -0500
Subject: [PATCH 15/18] fix: error preprocessing numeric feature

---
 ludwig/features/numerical_feature.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/ludwig/features/numerical_feature.py b/ludwig/features/numerical_feature.py
index 48682aed6f8..ccf62b92431 100644
--- a/ludwig/features/numerical_feature.py
+++ b/ludwig/features/numerical_feature.py
@@ -79,6 +79,8 @@ def add_feature_data(
         proc_df[feature[PROC_COLUMN]] = \
             numeric_transformer.transform(proc_df[feature[PROC_COLUMN]])
 
+        return proc_df
+
 
 class NumericalInputFeature(NumericalFeatureMixin, InputFeature):
     encoder = 'passthrough'

From f2535a6a860b0ce9d1294bfbf8a41688ae5ae2f3 Mon Sep 17 00:00:00 2001
From: Jim Thompson <jimthompson5802@gmail.com>
Date: Sun, 29 Nov 2020 09:17:01 -0500
Subject: [PATCH 16/18] refactor: incorporate new backend design

---
 ludwig/features/numerical_feature.py | 33 +++++++++++++++++++---------
 1 file changed, 23 insertions(+), 10 deletions(-)

diff --git a/ludwig/features/numerical_feature.py b/ludwig/features/numerical_feature.py
index ccf62b92431..7070a51b087 100644
--- a/ludwig/features/numerical_feature.py
+++ b/ludwig/features/numerical_feature.py
@@ -50,13 +50,12 @@ class NumericalFeatureMixin(object):
 
     @staticmethod
     def get_feature_meta(column, preprocessing_parameters, backend):
-        compute = backend.df_engine.compute
         numeric_transformer = get_from_registry(
             preprocessing_parameters.get('normalization', None),
             numeric_transformation_registry
         )
 
-        return numeric_transformer.fit_transform_params(column)
+        return numeric_transformer.fit_transform_params(column, backend)
 
     @staticmethod
     def add_feature_data(
@@ -327,10 +326,14 @@ def inverse_transform(self, x: np.ndarray) -> np.ndarray:
         return x * self.sigma + self.mu
 
     @staticmethod
-    def fit_transform_params(column: np.ndarray) -> dict:
+    def fit_transform_params(
+            column: np.ndarray,
+            backend: 'LocalBackend'
+    ) -> dict:
+        compute = backend.df_engine.compute
         return {
-            'mean': column.astype(np.float32).mean(),
-            'std': column.astype(np.float32).std()
+            'mean': compute(column.astype(np.float32).mean()),
+            'std': compute(column.astype(np.float32).std())
         }
 
 
@@ -352,10 +355,14 @@ def inverse_transform(self, x: np.ndarray) -> np.ndarray:
         return x * self.range + self.min_value
 
     @staticmethod
-    def fit_transform_params(column: np.ndarray) -> dict:
+    def fit_transform_params(
+            column: np.ndarray,
+            backend: 'LocalBackend'
+    ) -> dict:
+        compute = backend.df_engine.compute
         return {
-            'min': column.astype(np.float32).min(),
-            'max': column.astype(np.float32).max()
+            'min': compute(column.astype(np.float32).min()),
+            'max': compute(column.astype(np.float32).max())
         }
 
 
@@ -375,7 +382,10 @@ def inverse_transform(self, x: np.ndarray) -> np.ndarray:
         return np.expm1(x)
 
     @staticmethod
-    def fit_transform_params(column: np.ndarray) -> dict:
+    def fit_transform_params(
+            column: np.ndarray,
+            backend: 'LocalBackend'
+    ) -> dict:
         return {}
 
 
@@ -390,7 +400,10 @@ def inverse_transform(self, x: np.ndarray) -> np.ndarray:
         return x
 
     @staticmethod
-    def fit_transform_params(column: np.ndarray) -> dict:
+    def fit_transform_params(
+            column: np.ndarray,
+            backend: 'LocalBackend'
+    ) -> dict:
         return {}
 
 

From 59fd83b28a45ca87fe04e7dfa6b8996ee4a5aef6 Mon Sep 17 00:00:00 2001
From: Jim Thompson <jimthompson5802@gmail.com>
Date: Sun, 29 Nov 2020 09:51:57 -0500
Subject: [PATCH 17/18] refactor: incorporate new backend design to unit test

---
 tests/integration_tests/test_model_training_options.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/tests/integration_tests/test_model_training_options.py b/tests/integration_tests/test_model_training_options.py
index e5e7d6a3c24..f77ec023a20 100644
--- a/tests/integration_tests/test_model_training_options.py
+++ b/tests/integration_tests/test_model_training_options.py
@@ -12,6 +12,7 @@
 
 from ludwig import globals as global_vars
 from ludwig.api import LudwigModel
+from ludwig.backend import LOCAL_BACKEND
 from ludwig.experiment import experiment_cli
 from ludwig.features.numerical_feature import numeric_transformation_registry
 from ludwig.modules.optimization_modules import optimizers_registry
@@ -492,7 +493,8 @@ def test_numeric_transformer(transformer_key, tmpdir):
     else:
         raw_values = np.random.normal(5, 2, size=100)
 
-    parameters = Transformer.fit_transform_params(raw_values)
+    backend = LOCAL_BACKEND
+    parameters = Transformer.fit_transform_params(raw_values, backend)
     if transformer_name in {'Log1pTransformer', 'IdentityTransformer'}:
         # should be empty
         assert not bool(parameters)

From 630ef2bf633a4d5dd55f2c65ceda8f3390ea456a Mon Sep 17 00:00:00 2001
From: Jim Thompson <jimthompson5802@gmail.com>
Date: Sun, 29 Nov 2020 16:43:30 -0500
Subject: [PATCH 18/18] doc: annotate backend parameter as 'Backend' per reviewer comments

---
 ludwig/features/numerical_feature.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/ludwig/features/numerical_feature.py b/ludwig/features/numerical_feature.py
index 7070a51b087..1e60d55134b 100644
--- a/ludwig/features/numerical_feature.py
+++ b/ludwig/features/numerical_feature.py
@@ -328,7 +328,7 @@ def inverse_transform(self, x: np.ndarray) -> np.ndarray:
     @staticmethod
     def fit_transform_params(
             column: np.ndarray,
-            backend: 'LocalBackend'
+            backend: 'Backend'
     ) -> dict:
         compute = backend.df_engine.compute
         return {
@@ -357,7 +357,7 @@ def inverse_transform(self, x: np.ndarray) -> np.ndarray:
     @staticmethod
     def fit_transform_params(
             column: np.ndarray,
-            backend: 'LocalBackend'
+            backend: 'Backend'
     ) -> dict:
         compute = backend.df_engine.compute
         return {
@@ -384,7 +384,7 @@ def inverse_transform(self, x: np.ndarray) -> np.ndarray:
     @staticmethod
     def fit_transform_params(
             column: np.ndarray,
-            backend: 'LocalBackend'
+            backend: 'Backend'
     ) -> dict:
         return {}
 
@@ -402,7 +402,7 @@ def inverse_transform(self, x: np.ndarray) -> np.ndarray:
     @staticmethod
     def fit_transform_params(
             column: np.ndarray,
-            backend: 'LocalBackend'
+            backend: 'Backend'
     ) -> dict:
         return {}