fix(KDP): DistributionAwareEncoder fix and tests for custom pipelines (#23)

piotrlaczkowski · web-flow · commit ad910969e366 · 2025-02-14T15:03:53.000+01:00
diff --git a/docs/distribution_aware_encoder.md b/docs/distribution_aware_encoder.md
@@ -1,7 +1,7 @@
 # Distribution-Aware Encoder
 
 ## Overview
-The Distribution-Aware Encoder is an advanced preprocessing layer that automatically detects and handles various types of data distributions. It uses TensorFlow Probability (tfp) for accurate modeling and applies specialized transformations while preserving the statistical properties of the data.
+The **Distribution-Aware Encoder** is an advanced preprocessing layer that automatically detects and handles various types of data distributions. It leverages TensorFlow Probability (tfp) for accurate modeling and applies specialized transformations while preserving the statistical properties of the data.
 
 ## Features
 
@@ -80,7 +80,7 @@ The Distribution-Aware Encoder is an advanced preprocessing layer that automatic
 
 ### Basic Usage
 
-The capability only works with numerical features!
+The Distribution-Aware Encoder works only with numerical features. Enable it by setting `use_distribution_aware=True` in the `PreprocessingModel`.
 
 ```python
 from kdp.processor import PreprocessingModel
diff --git a/docs/example_usages.md b/docs/example_usages.md
@@ -298,6 +298,8 @@ However we can also manually set the prefered distribution for each numerical fe
 ```python
 from kdp.features import NumericalFeature, FeatureType
 from kdp.processor import PreprocessingModel, OutputModeOptions
+from kdp.custom_layers import DistributionAwareEncoder
+
 
 # Define features
 features = {
@@ -321,6 +323,7 @@ features = {
         preprocessors=[
             tf.keras.layers.Rescaling,
             tf.keras.layers.Normalization,
+            DistributionAwareEncoder,
         ],
         bin_boundaries=[0.0, 1.0, 2.0],
         mean=0.0,
diff --git a/docs/features.md b/docs/features.md
@@ -13,7 +13,7 @@ Explore various methods to define numerical features tailored to your needs:
         "feat1": "float",
         "feat2": "FLOAT",
         "feat3": "FLOAT_NORMALIZED",
-        "feat3": "FLOAT_RESCALED",
+        "feat4": "FLOAT_RESCALED",
         ...
     }
     ```
@@ -50,7 +50,7 @@ Explore various methods to define numerical features tailored to your needs:
         "feat3": NumericalFeature(
             name="feat3",
             feature_type=FeatureType.FLOAT_DISCRETIZED,
-            bin_boundaries=[(1, 10)],
+            bin_boundaries=[0.0, 1.0, 2.0],
         ),
         "feat4": NumericalFeature(
             name="feat4",
@@ -60,6 +60,10 @@ Explore various methods to define numerical features tailored to your needs:
     }
     ```
 
+### 📊 **Distribution-Aware Encoding**
+
+Enhance your numerical feature processing with the **Distribution-Aware Encoder**. It can detect each feature's distribution automatically, or you can set the preferred distribution manually; in both cases it applies transformations chosen to preserve the statistical properties of your data.
+
 Here's how the numeric preprocessing pipeline looks:
 
 ![Numeric Feature Pipeline](imgs/num_feature_pipeline.png)
diff --git a/docs/imgs/numerical_example_model_with_distribution_aware.png b/docs/imgs/numerical_example_model_with_distribution_aware.png
diff --git a/docs/index.md b/docs/index.md
@@ -65,6 +65,8 @@ features_specs = {
 ppr = PreprocessingModel(
     path_data="data/my_data.csv",
     features_specs=features_spec,
+    use_distribution_aware=True,  # Enable Distribution-Aware Encoding
+    distribution_aware_bins=1000, # Set number of bins for finer data encoding
 )
 # construct the preprocessing pipelines
 ppr.build_preprocessor()
diff --git a/docs/kdp_overview.md b/docs/kdp_overview.md
@@ -33,6 +33,17 @@ KDP (Keras Data Processor) is a powerful preprocessing library designed to strea
   - Dynamic feature filtering
   - Interpretable weights
 
+### 4. 📈 **Distribution-Aware Encoder**
+- **Automatic Distribution Detection**
+  - Identifies underlying data distributions (e.g., Normal, Heavy-Tailed, Multimodal)
+  - Applies specialized transformations to preserve statistical properties
+- **Adaptive Transformations**
+  - Learns optimal parameters during training
+  - Adjusts to data distribution changes dynamically
+- **Robust Handling**
+  - Manages sparse and periodic data effectively
+  - Ensures numerical stability across transformations
+
 ## 🏗️ Architecture Overview
 
 ```mermaid
diff --git a/test/test_distribution_aware.py b/test/test_distribution_aware.py
@@ -382,5 +382,271 @@ def test_invalid_input(self):
             self.encoder(tf.constant([["1", "2"], ["3", "4"]]))
 
 
+class TestAdvancedOptionsDistributionAwareEncoder(tf.test.TestCase):
+    def setUp(self):
+        super().setUp()
+        # Create an instance of the DistributionAwareEncoder with advanced features enabled.
+        self.encoder = DistributionAwareEncoder(
+            name="distribution_aware_encoder",
+            num_bins=1000,
+            epsilon=1e-6,
+            detect_periodicity=True,
+            handle_sparsity=True,
+            adaptive_binning=True,
+            mixture_components=3,
+            trainable=True,
+        )
+
+    def test_config_serialization(self):
+        """Test that the encoder's configuration is correctly saved and restored."""
+        config = self.encoder.get_config()
+        new_encoder = DistributionAwareEncoder.from_config(config)
+        self.assertEqual(new_encoder.num_bins, self.encoder.num_bins)
+        self.assertEqual(new_encoder.epsilon, self.encoder.epsilon)
+        self.assertEqual(
+            new_encoder.detect_periodicity, self.encoder.detect_periodicity
+        )
+        self.assertEqual(new_encoder.handle_sparsity, self.encoder.handle_sparsity)
+        self.assertEqual(new_encoder.adaptive_binning, self.encoder.adaptive_binning)
+        self.assertEqual(
+            new_encoder.mixture_components, self.encoder.mixture_components
+        )
+        self.assertTrue(new_encoder.trainable)
+
+    def test_periodic_processing(self):
+        """Test that periodic input data is encoded with the periodic branch."""
+        # Create periodic data: sin wave with some noise.
+        t = np.linspace(0, 4 * np.pi, 100).astype(np.float32)
+        data = np.sin(t) + 0.05 * np.random.normal(0, 1, 100).astype(np.float32)
+        inputs = tf.convert_to_tensor(data)
+        outputs = self.encoder(inputs, training=False)
+
+        # With detect_periodicity=True, the output is expected to be concatenated
+        # (e.g., sin/cos branches) doubling the dimensionality.
+        self.assertEqual(
+            outputs.shape[0],
+            inputs.shape[0] * 2,
+            "Periodicity encoding failed to double output dimensions.",
+        )
+
+    def test_sparsity_handling(self):
+        """Test that sparse inputs (mostly zeros) produce near-zero outputs in those positions."""
+        data = np.zeros(100, dtype=np.float32)
+        # Set a few indices to non-zero values.
+        indices = np.random.choice(np.arange(100), size=10, replace=False)
+        data[indices] = np.random.normal(1, 0.1, size=10)
+        inputs = tf.convert_to_tensor(data)
+        outputs = self.encoder(inputs, training=False)
+
+        # In regions where input values are near zero the encoder should preserve sparsity.
+        zero_mask = np.abs(data) < self.encoder.epsilon
+        outputs_val = outputs.numpy()
+        self.assertTrue(
+            np.all(np.abs(outputs_val[zero_mask]) < self.encoder.epsilon),
+            "Sparse inputs not preserved as near-zero in outputs.",
+        )
+
+
+class TestEncoderConfigurations(tf.test.TestCase):
+    def test_detect_periodicity_true(self):
+        """When detect_periodicity is True, periodic inputs should produce an output with doubled dimensions."""
+        encoder = DistributionAwareEncoder(
+            name="encoder_periodic_true",
+            num_bins=1000,
+            epsilon=1e-6,
+            detect_periodicity=True,
+            handle_sparsity=True,
+            adaptive_binning=True,
+            mixture_components=3,
+            trainable=True,
+        )
+        # Create a sinusoidal input signal.
+        t = np.linspace(0, 4 * np.pi, 100).astype(np.float32)
+        data = np.sin(t)
+        inputs = tf.convert_to_tensor(data)
+        outputs = encoder(inputs, training=False)
+        # With periodic detection enabled, the encoder output is expected to be (input_length * 2,)
+        self.assertEqual(
+            outputs.shape,
+            (inputs.shape[0] * 2,),
+            "Expected output shape to be twice the input length when detecting periodicity.",
+        )
+
+    def test_detect_periodicity_false(self):
+        """When detect_periodicity is False, the output shape should match the input."""
+        encoder = DistributionAwareEncoder(
+            name="encoder_periodic_false",
+            num_bins=1000,
+            epsilon=1e-6,
+            detect_periodicity=False,
+            handle_sparsity=True,
+            adaptive_binning=True,
+            mixture_components=3,
+            trainable=True,
+        )
+        # Use a sinusoidal input.
+        t = np.linspace(0, 4 * np.pi, 100).astype(np.float32)
+        data = np.sin(t)
+        inputs = tf.convert_to_tensor(data)
+        outputs = encoder(inputs, training=False)
+        self.assertEqual(
+            outputs.shape,
+            inputs.shape,
+            "Expected output shape to be the same as input when periodicity detection is disabled.",
+        )
+
+    def test_handle_sparsity_true(self):
+        """When handle_sparsity is True, input values near zero should be preserved as near-zero in the output."""
+        encoder = DistributionAwareEncoder(
+            name="encoder_sparsity_true",
+            num_bins=1000,
+            epsilon=1e-6,
+            detect_periodicity=False,
+            handle_sparsity=True,
+            adaptive_binning=True,
+            mixture_components=3,
+            trainable=True,
+        )
+        # Generate sparse input data: mostly zeros with some non-zero values.
+        data = np.zeros(200, dtype=np.float32)
+        np.random.seed(42)
+        indices = np.random.choice(200, size=20, replace=False)
+        data[indices] = np.random.normal(0, 1, size=20)
+        inputs = tf.convert_to_tensor(data)
+        outputs = encoder(inputs, training=False)
+
+        # For sparsity handling, zeros (or near-zero) in the input should give near-zero outputs.
+        zero_mask = np.abs(data) < encoder.epsilon
+        outputs_np = outputs.numpy()
+        self.assertTrue(
+            np.all(np.abs(outputs_np[zero_mask]) < encoder.epsilon),
+            "When handle_sparsity is True, inputs near zero should produce near-zero outputs.",
+        )
+
+    def test_handle_sparsity_false(self):
+        """When handle_sparsity is False, there is no requirement to preserve zeros."""
+        encoder = DistributionAwareEncoder(
+            name="encoder_sparsity_false",
+            num_bins=1000,
+            epsilon=1e-6,
+            detect_periodicity=False,
+            handle_sparsity=False,
+            adaptive_binning=True,
+            mixture_components=3,
+            trainable=True,
+        )
+        # Generate similar sparse input.
+        data = np.zeros(200, dtype=np.float32)
+        np.random.seed(42)
+        indices = np.random.choice(200, size=20, replace=False)
+        data[indices] = np.random.normal(0, 1, size=20)
+        inputs = tf.convert_to_tensor(data)
+        outputs = encoder(inputs, training=False)
+
+        # When handle_sparsity is False, we do not insist on preserving zeros; instead, we can check that
+        # at least some non-zero output is produced for non-zero input.
+        non_zero_mask = np.abs(data) > encoder.epsilon
+        outputs_np = outputs.numpy()
+        self.assertTrue(
+            np.any(np.abs(outputs_np[non_zero_mask]) > encoder.epsilon),
+            "When handle_sparsity is False, non-zero inputs should result in non-zero outputs.",
+        )
+
+    def test_adaptive_binning_flag(self):
+        """Test that the adaptive_binning flag is stored correctly."""
+        encoder_true = DistributionAwareEncoder(
+            name="encoder_adaptive_true",
+            num_bins=1000,
+            epsilon=1e-6,
+            detect_periodicity=False,
+            handle_sparsity=True,
+            adaptive_binning=True,
+            mixture_components=3,
+            trainable=True,
+        )
+        encoder_false = DistributionAwareEncoder(
+            name="encoder_adaptive_false",
+            num_bins=1000,
+            epsilon=1e-6,
+            detect_periodicity=False,
+            handle_sparsity=True,
+            adaptive_binning=False,
+            mixture_components=3,
+            trainable=True,
+        )
+        self.assertTrue(
+            encoder_true.adaptive_binning, "Encoder should have adaptive_binning=True."
+        )
+        self.assertFalse(
+            encoder_false.adaptive_binning,
+            "Encoder should have adaptive_binning=False.",
+        )
+
+    def test_mixture_components(self):
+        """Test that the mixture_components parameter is correctly stored."""
+        encoder = DistributionAwareEncoder(
+            name="encoder_mixture",
+            num_bins=1000,
+            epsilon=1e-6,
+            detect_periodicity=False,
+            handle_sparsity=True,
+            adaptive_binning=True,
+            mixture_components=5,
+            trainable=True,
+        )
+        self.assertEqual(
+            encoder.mixture_components,
+            5,
+            "The mixture_components parameter should be correctly set to 5.",
+        )
+
+    def test_trainable_flag(self):
+        """Test that setting the trainable flag correctly updates the layer's trainability."""
+        encoder_trainable = DistributionAwareEncoder(
+            name="encoder_trainable_true",
+            num_bins=1000,
+            epsilon=1e-6,
+            detect_periodicity=False,
+            handle_sparsity=True,
+            adaptive_binning=True,
+            mixture_components=3,
+            trainable=True,
+        )
+        encoder_non_trainable = DistributionAwareEncoder(
+            name="encoder_trainable_false",
+            num_bins=1000,
+            epsilon=1e-6,
+            detect_periodicity=False,
+            handle_sparsity=True,
+            adaptive_binning=True,
+            mixture_components=3,
+            trainable=False,
+        )
+        self.assertTrue(
+            encoder_trainable.trainable,
+            "Encoder should be trainable when trainable=True.",
+        )
+        self.assertFalse(
+            encoder_non_trainable.trainable,
+            "Encoder should not be trainable when trainable=False.",
+        )
+
+    def test_num_bins_parameter(self):
+        """Test that the num_bins parameter is correctly set and stored."""
+        encoder = DistributionAwareEncoder(
+            name="encoder_num_bins",
+            num_bins=500,
+            epsilon=1e-6,
+            detect_periodicity=False,
+            handle_sparsity=True,
+            adaptive_binning=True,
+            mixture_components=3,
+            trainable=True,
+        )
+        self.assertEqual(
+            encoder.num_bins, 500, "The num_bins parameter should be set to 500."
+        )
+
+
 if __name__ == "__main__":
     tf.test.main()

Original file line number	Diff line number	Diff line change
`@@ -65,6 +65,8 @@ features_specs = {`
`65`	`65`	`ppr = PreprocessingModel(`
`66`	`66`	`path_data="data/my_data.csv",`
`67`	`67`	`features_specs=features_spec,`
	`68`	`+ use_distribution_aware=True, # Enable Distribution-Aware Encoding`
	`69`	`+ distribution_aware_bins=1000, # Set number of bins for finer data encoding`
`68`	`70`	`)`
`69`	`71`	`# construct the preprocessing pipelines`
`70`	`72`	`ppr.build_preprocessor()`