UnicoLab
diff --git a/complex_model.png b/complex_model.png
311 KB
diff --git a/kdp/processor.py b/kdp/processor.py
Lines changed: 18 additions & 22 deletions
diff --git a/test/test_feature_selection.py b/test/test_feature_selection.py
Lines changed: 70 additions & 206 deletions
@@ -8,6 +8,7 @@
 from functools import wraps
 from typing import Any
 
+import numpy as np
 import pandas as pd
 import tensorflow as tf
 from loguru import logger
@@ -1437,42 +1438,37 @@ def get_feature_statistics(self) -> dict:
             "output_mode": self.output_mode,
         }
 
-    def transform(self, data: tf.data.Dataset | pd.DataFrame | dict) -> dict[str, Any]:
-        """Transform input data using the built preprocessor model.
+    def _convert_to_dataset(self, data: tf.data.Dataset | pd.DataFrame | dict) -> tf.data.Dataset:
+        """Convert input data to TensorFlow dataset.
 
         Args:
-            data: Input data to transform. Can be a DataFrame, Dataset, or dict.
+            data: Input data to convert. Can be a DataFrame, Dataset, or dict.
 
         Returns:
-            dict[str, Any]: Dictionary containing:
-                - transformed_data: The transformed data output
-                - {feature_name}_weights: Weight for each feature from feature selection
+            tf.data.Dataset: Input sliced into batches of 32, or the given Dataset returned unchanged.
 
         Raises:
-            ValueError: If preprocessor hasn't been built yet.
+            ValueError: If input data is not a supported type.
         """
-        # Convert input data to TensorFlow dataset if needed
         if isinstance(data, pd.DataFrame):
-            dataset = tf.data.Dataset.from_tensor_slices(dict(data)).batch(32)
+            return tf.data.Dataset.from_tensor_slices(dict(data)).batch(32)
         elif isinstance(data, dict):
-            dataset = tf.data.Dataset.from_tensor_slices(data).batch(32)
+            return tf.data.Dataset.from_tensor_slices(data).batch(32)
         elif isinstance(data, tf.data.Dataset):
-            dataset = data
+            return data
         else:
             raise ValueError("Input data must be a DataFrame, dict, or TensorFlow Dataset")
 
-        # Transform the data using the model
-        transformed = self.model.predict(dataset)
+    def _extract_feature_weights(self) -> dict[str, np.ndarray]:
+        """Extract feature importance weights from feature selection layers.
 
-        # Initialize return dictionary with transformed data
-        result = {"transformed_data": transformed}
-
-        # Get feature importance from the feature selection layer if it exists
+        Returns:
+            dict[str, np.ndarray]: Maps "{feature_name}_weights" keys to that feature's weight column.
+        """
+        weights = {}
         for layer in self.model.layers:
             if "feature_selection" in layer.name:
-                weights = layer.get_weights()
+                layer_weights = layer.get_weights()
                 for i, feature_name in enumerate(self.features_specs.keys()):
-                    # Add weights for each feature with the expected key format
-                    result[f"{feature_name}_weights"] = weights[0][:, i]
-
-        return result
+                    weights[f"{feature_name}_weights"] = layer_weights[0][:, i]
+        return weights
@@ -28,11 +28,39 @@ def test_gating_mechanism(self):
         outputs = self.layer(inputs)
         self.assertAllInRange(outputs, -10.0, 10.0)  # Reasonable range for gated outputs
 
-    def test_serialization(self):
+    def test_serialization_basic(self):
         config = self.layer.get_config()
         new_layer = GatedLinearUnit.from_config(config)
         self.assertEqual(self.layer.units, new_layer.units)
 
+    def test_output_types(self):
+        """Test output types for GatedLinearUnit."""
+        gl = GatedLinearUnit(units=64)
+        inputs = tf.random.normal((32, 100))
+        outputs = gl(inputs)
+
+        # Verify output is a tensor with correct dtype
+        self.assertIsInstance(outputs, tf.Tensor)
+        self.assertEqual(outputs.dtype, tf.float32)
+
+    def test_serialization_and_output_consistency(self):
+        """Test GatedLinearUnit config round-trip gives equal outputs with copied weights."""
+        dummy_input = tf.random.normal((1, 100))
+
+        gl = GatedLinearUnit(units=64)
+        gl(dummy_input)  # This builds the layer
+
+        config = gl.get_config()
+        gl_new = GatedLinearUnit.from_config(config)
+        gl_new(dummy_input)  # Build the new layer too
+
+        # Set the weights to be the same
+        gl_new.set_weights(gl.get_weights())
+
+        # Test both layers produce the same output
+        inputs = tf.random.normal((32, 100))
+        self.assertAllClose(gl(inputs), gl_new(inputs))
+
 
 class TestGatedResidualNetwork(tf.test.TestCase):
     def setUp(self):
@@ -85,12 +113,50 @@ def test_dropout_behavior(self):
         for i in range(len(inference_outputs) - 1):
             self.assertAllClose(inference_outputs[i], inference_outputs[i + 1])
 
-    def test_serialization(self):
+    def test_serialization_basic(self):
         config = self.layer.get_config()
         new_layer = GatedResidualNetwork.from_config(config)
         self.assertEqual(self.layer.units, new_layer.units)
         self.assertEqual(self.layer.dropout_rate, new_layer.dropout_rate)
 
+    def test_output_types(self):
+        """Test output types for GatedResidualNetwork."""
+        batch_size = 32
+        input_dim = 64
+        dropout_rate = 0.5
+
+        grn = GatedResidualNetwork(units=input_dim, dropout_rate=dropout_rate)
+        inputs = tf.random.normal((batch_size, input_dim))
+
+        outputs = grn(inputs)
+
+        # Verify output is a tensor with correct dtype
+        self.assertIsInstance(outputs, tf.Tensor)
+        self.assertEqual(outputs.dtype, tf.float32)
+
+        # NOTE: tf.random.normal defaults to float32, so this cast is a no-op; only the output dtype is re-checked
+        inputs_int = tf.cast(inputs, tf.float32)
+        outputs_from_int = grn(inputs_int)
+        self.assertEqual(outputs_from_int.dtype, tf.float32)  # Should always output float32
+
+    def test_serialization_and_output_consistency(self):
+        """Test GatedResidualNetwork config round-trip gives equal outputs with copied weights."""
+        grn = GatedResidualNetwork(units=64, dropout_rate=0.3)
+        # Build the layer first
+        dummy_input = tf.random.normal((1, 64))
+        grn(dummy_input)
+
+        config = grn.get_config()
+        grn_new = GatedResidualNetwork.from_config(config)
+        grn_new(dummy_input)
+
+        # Set the weights to be the same
+        grn_new.set_weights(grn.get_weights())
+
+        # Test both layers produce the same output
+        inputs = tf.random.normal((32, 64))
+        self.assertAllClose(grn(inputs), grn_new(inputs))
+
 
 class TestVariableSelection(tf.test.TestCase):
     def setUp(self):
@@ -148,215 +214,13 @@ def test_dropout_behavior(self):
         for i in range(len(inference_outputs) - 1):
             self.assertAllClose(inference_outputs[i], inference_outputs[i + 1])
 
-    def test_serialization(self):
+    def test_serialization_basic(self):
         config = self.layer.get_config()
         new_layer = VariableSelection.from_config(config)
         self.assertEqual(self.layer.nr_features, new_layer.nr_features)
         self.assertEqual(self.layer.units, new_layer.units)
         self.assertEqual(self.layer.dropout_rate, new_layer.dropout_rate)
 
-
-class TestGatedLinearUnit2(tf.test.TestCase):
-    """Test suite for GatedLinearUnit layer."""
-
-    def test_output_shape(self):
-        """Test that output shape is correct."""
-        batch_size = 32
-        input_dim = 100
-        units = 64
-
-        gl = GatedLinearUnit(units=units)
-        inputs = tf.random.normal((batch_size, input_dim))
-        outputs = gl(inputs)
-
-        self.assertEqual(outputs.shape, (batch_size, units))
-
-    def test_gating_mechanism(self):
-        """Test that gating mechanism properly filters values."""
-        gl = GatedLinearUnit(units=1)
-        inputs = tf.constant([[1.0], [2.0], [3.0]])
-
-        # Get internal gate values
-        gate_values = gl.sigmoid(inputs)
-
-        # Verify gates are between 0 and 1
-        self.assertTrue(tf.reduce_all(gate_values >= 0))
-        self.assertTrue(tf.reduce_all(gate_values <= 1))
-
-    def test_output_types(self):
-        """Test output types for GatedLinearUnit."""
-        gl = GatedLinearUnit(units=64)
-        inputs = tf.random.normal((32, 100))
-        outputs = gl(inputs)
-
-        # Verify output is a tensor with correct dtype
-        self.assertIsInstance(outputs, tf.Tensor)
-        self.assertEqual(outputs.dtype, tf.float32)
-
-    def test_serialization(self):
-        """Test serialization and deserialization of GatedLinearUnit."""
-        dummy_input = tf.random.normal((1, 100))
-
-        gl = GatedLinearUnit(units=64)
-        gl(dummy_input)  # This builds the layer
-
-        config = gl.get_config()
-        gl_new = GatedLinearUnit.from_config(config)
-        gl_new(dummy_input)  # Build the new layer too
-
-        # Set the weights to be the same
-        gl_new.set_weights(gl.get_weights())
-
-        # Test both layers produce the same output
-        inputs = tf.random.normal((32, 100))
-        self.assertAllClose(gl(inputs), gl_new(inputs))
-
-
-class TestGatedResidualNetwork2(tf.test.TestCase):
-    """Test suite for GatedResidualNetwork layer."""
-
-    def test_output_shape(self):
-        """Test that output shape matches input shape."""
-        batch_size = 32
-        input_dim = 64
-        units = 64
-
-        grn = GatedResidualNetwork(units=units)
-        inputs = tf.random.normal((batch_size, input_dim))
-        outputs = grn(inputs)
-
-        self.assertEqual(outputs.shape, (batch_size, units))
-
-    def test_residual_connection(self):
-        """Test that residual connection is working."""
-        grn = GatedResidualNetwork(units=2, dropout_rate=0.0)
-        inputs = tf.constant([[1.0, 2.0]])
-
-        # Get output with and without residual connection
-        with_residual = grn(inputs)
-
-        # Verify output is different from input but related
-        self.assertNotAllClose(with_residual, inputs)
-        self.assertGreater(tf.reduce_max(tf.abs(with_residual - inputs)), 0)
-
-    def test_dropout_behavior(self):
-        """Test dropout behavior in training vs inference."""
-        batch_size = 32
-        input_dim = 64
-        dropout_rate = 0.5
-
-        grn = GatedResidualNetwork(units=input_dim, dropout_rate=dropout_rate)
-        inputs = tf.random.normal((batch_size, input_dim))
-
-        # Training mode (should apply dropout)
-        train_outputs = grn(inputs, training=True)
-
-        # Inference mode (should not apply dropout)
-        inference_outputs = grn(inputs, training=False)
-
-        # Outputs should be different in training vs inference
-        self.assertNotAllClose(train_outputs, inference_outputs)
-
-    def test_output_types(self):
-        """Test output types for GatedResidualNetwork."""
-        batch_size = 32
-        input_dim = 64
-        dropout_rate = 0.5
-
-        grn = GatedResidualNetwork(units=input_dim, dropout_rate=dropout_rate)
-        inputs = tf.random.normal((batch_size, input_dim))
-
-        outputs = grn(inputs)
-
-        # Verify output is a tensor with correct dtype
-        self.assertIsInstance(outputs, tf.Tensor)
-        self.assertEqual(outputs.dtype, tf.float32)
-
-        # Test with different input types
-        inputs_int = tf.cast(inputs, tf.float32)
-        outputs_from_int = grn(inputs_int)
-        self.assertEqual(outputs_from_int.dtype, tf.float32)  # Should always output float32
-
-    def test_serialization(self):
-        """Test serialization and deserialization of GatedResidualNetwork."""
-        grn = GatedResidualNetwork(units=64, dropout_rate=0.3)
-        # Build the layer first
-        dummy_input = tf.random.normal((1, 64))
-        grn(dummy_input)
-
-        config = grn.get_config()
-        grn_new = GatedResidualNetwork.from_config(config)
-        grn_new(dummy_input)
-
-        # Set the weights to be the same
-        grn_new.set_weights(grn.get_weights())
-
-        # Test both layers produce the same output
-        inputs = tf.random.normal((32, 64))
-        self.assertAllClose(grn(inputs), grn_new(inputs))
-
-
-class TestVariableSelection2(tf.test.TestCase):
-    """Test suite for VariableSelection layer."""
-
-    def test_output_shape(self):
-        """Test output shapes for features and weights."""
-        batch_size = 32
-        nr_features = 3
-        feature_dims = [100, 200, 300]
-        units = 64
-
-        vs = VariableSelection(nr_features=nr_features, units=units)
-        inputs = [tf.random.normal((batch_size, dim)) for dim in feature_dims]
-
-        selected_features, feature_weights = vs(inputs)
-
-        # Check selected features shape
-        self.assertEqual(selected_features.shape, (batch_size, units))
-
-        # Check weights shape
-        self.assertEqual(feature_weights.shape, (batch_size, nr_features, 1))
-
-    def test_weight_properties(self):
-        """Test that feature weights sum to 1 and are non-negative."""
-        batch_size = 32
-        nr_features = 3
-        feature_dims = [10, 20, 30]
-        units = 64
-
-        vs = VariableSelection(nr_features=nr_features, units=units)
-        inputs = [tf.random.normal((batch_size, dim)) for dim in feature_dims]
-
-        _, feature_weights = vs(inputs)
-
-        # Remove the last dimension for easier testing
-        weights = tf.squeeze(feature_weights, axis=-1)
-
-        # Test weights sum to 1 for each sample
-        sums = tf.reduce_sum(weights, axis=1)
-        self.assertAllClose(sums, tf.ones_like(sums))
-
-        # Test weights are non-negative
-        self.assertTrue(tf.reduce_all(weights >= 0))
-
-    def test_feature_selection(self):
-        """Test that the layer can select important features."""
-        batch_size = 10
-        nr_features = 2
-        units = 4
-
-        vs = VariableSelection(nr_features=nr_features, units=units)
-
-        # Create one important and one noisy feature
-        important_feature = tf.ones((batch_size, 2))
-        noisy_feature = tf.random.normal((batch_size, 2)) * 0.1
-
-        selected_features, feature_weights = vs([important_feature, noisy_feature])
-
-        # The important feature should get higher weights
-        weights = tf.squeeze(feature_weights, axis=-1)
-        self.assertTrue(tf.reduce_mean(weights[:, 0]) > tf.reduce_mean(weights[:, 1]))
-
     def test_output_types(self):
         """Test output types for VariableSelection."""
         batch_size = 32
@@ -398,7 +262,7 @@ def test_mixed_input_types(self):
         self.assertEqual(selected_features.dtype, tf.float32)
         self.assertEqual(feature_weights.dtype, tf.float32)
 
-    def test_serialization(self):
+    def test_serialization_and_output_consistency(self):
         """Test serialization and deserialization of VariableSelection."""
         vs = VariableSelection(nr_features=3, units=64, dropout_rate=0.2)
         # Build the layer first