
Commit 448f63f

feat(KDP): smart processing for custom pipelines
1 parent c18a59b commit 448f63f

2 files changed: +92 -12 lines changed

kdp/dynamic_pipeline.py

Lines changed: 46 additions & 0 deletions
@@ -0,0 +1,46 @@
+class DynamicPreprocessingPipeline:
+    """
+    Dynamically initializes a sequence of Keras preprocessing layers based on the output
+    from each previous layer, allowing each layer to access the outputs of all prior layers where relevant.
+    """
+
+    def __init__(self, layers):
+        """
+        Initializes the DynamicPreprocessingPipeline with a list of layers.
+
+        Args:
+            layers (list): A list of Keras preprocessing layers, each potentially named for reference.
+        """
+        self.layers = layers
+
+    def initialize_and_transform(self, init_data):
+        """
+        Sequentially processes each layer, applying transformations selectively based on each
+        layer's input requirements and ensuring efficient data usage and processing. Each layer
+        can access the outputs of all previous layers.
+
+        Args:
+            init_data (dict): A dictionary with initialization data, dynamically keyed.
+
+        Returns:
+            dict: The dictionary containing selectively transformed data for each layer.
+        """
+        current_data = init_data
+
+        for i, layer in enumerate(self.layers):
+            # For many layers we may not have a formal input_spec, so assume the layer uses all current data.
+            required_keys = current_data.keys()
+
+            # Prepare input for the current layer based on the determined keys.
+            # Here, we assume that each layer accepts a dictionary of inputs.
+            current_input = {k: current_data[k] for k in required_keys}
+
+            # Apply transformation: if the layer returns a tensor, wrap it in a dict using the layer name.
+            transformed_output = layer(current_input)
+            if not isinstance(transformed_output, dict):
+                transformed_output = {layer.name: transformed_output}
+
+            # Update the current data with the transformed output so that subsequent layers can reuse it.
+            current_data.update(transformed_output)
+
+        return current_data
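
For context, a minimal usage sketch of the new class (not part of the commit, and ScaleLayer is a hypothetical layer): initialize_and_transform passes each layer a dict of every output produced so far, and a plain-tensor result gets wrapped under the layer's name, so a dict-aware layer is assumed here.

import tensorflow as tf

from kdp.dynamic_pipeline import DynamicPreprocessingPipeline


class ScaleLayer(tf.keras.layers.Layer):
    """Hypothetical dict-aware layer: scales the running "input" entry."""

    def __init__(self, factor: float, **kwargs):
        super().__init__(**kwargs)
        self.factor = factor

    def call(self, inputs: dict) -> tf.Tensor:
        # Returns a plain tensor; the pipeline stores it as {"scaled": ...}.
        return inputs["input"] * self.factor


pipeline = DynamicPreprocessingPipeline([ScaleLayer(2.0, name="scaled")])
result = pipeline.initialize_and_transform({"input": tf.constant([1.0, 2.0, 3.0])})
print(result["scaled"])  # [2. 4. 6.], alongside the untouched "input" entry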

kdp/pipeline.py

Lines changed: 46 additions & 12 deletions
@@ -4,6 +4,7 @@
 from loguru import logger
 
 from kdp.layers_factory import PreprocessorLayerFactory
+from kdp.dynamic_pipeline import DynamicPreprocessingPipeline
 
 
 class ProcessingStep:
@@ -87,44 +88,77 @@ def transform(self, input_data: tf.Tensor) -> tf.Tensor:
 
 
 class FeaturePreprocessor:
-    def __init__(self, name: str) -> None:
-        """Initialize a feature preprocessor.
+    def __init__(self, name: str, use_dynamic: bool = False) -> None:
+        """
+        Initializes a feature preprocessor.
 
         Args:
             name (str): The name of the feature preprocessor.
+            use_dynamic (bool): Whether to use the dynamic preprocessing pipeline.
         """
         self.name = name
-        self.pipeline = Pipeline(name=name)
+        self.use_dynamic = use_dynamic
+        if not self.use_dynamic:
+            self.pipeline = Pipeline(name=name)
+        else:
+            self.layers = []  # for dynamic pipeline
 
     def add_processing_step(
         self, layer_creator: Callable[..., tf.keras.layers.Layer] = None, **layer_kwargs
     ) -> None:
-        """Add a processing step to the feature preprocessor.
+        """
+        Add a preprocessing layer to the feature preprocessor pipeline.
+        If using the standard pipeline, a ProcessingStep is added.
+        Otherwise, the layer is added to a list for dynamic handling.
 
         Args:
             layer_creator (Callable[..., tf.keras.layers.Layer]): A callable that creates a layer.
                 If not provided, the default layer creator is used.
             **layer_kwargs: Additional keyword arguments for the layer creator.
         """
         layer_creator = layer_creator or PreprocessorLayerFactory.create_layer
-        step = ProcessingStep(layer_creator=layer_creator, **layer_kwargs)
-        self.pipeline.add_step(step=step)
+        if self.use_dynamic:
+            layer = layer_creator(**layer_kwargs)
+            logger.info(f"Adding {layer.name} to dynamic preprocessing pipeline")
+            self.layers.append(layer)
+        else:
+            step = ProcessingStep(layer_creator=layer_creator, **layer_kwargs)
+            self.pipeline.add_step(step=step)
 
     def chain(self, input_layer) -> tf.keras.layers.Layer:
-        """Chain the preprocessor's pipeline steps starting from the input layer.
+        """
+        Chains the processing steps starting from the given input_layer.
 
-        Args:
-            input_layer: The input layer to start the chain from.
+        For a static pipeline, this delegates to the internal Pipeline's chain() method.
+        For the dynamic pipeline, it constructs the dynamic pipeline on the fly.
         """
-        return self.pipeline.chain(input_layer)
+        if not self.use_dynamic:
+            return self.pipeline.chain(input_layer)
+        else:
+            dynamic_pipeline = DynamicPreprocessingPipeline(self.layers)
+            # In the dynamic case, we use a dict for the input.
+            output_dict = dynamic_pipeline.initialize_and_transform(
+                {"input": input_layer}
+            )
+            # Return the transformed data at key "input" (or adjust as needed).
+            return output_dict.get("input", input_layer)
 
     def transform(self, input_data: tf.Tensor) -> tf.Tensor:
-        """Apply the feature preprocessor to the input data.
+        """
+        Process the input data through the pipeline.
+        For the dynamic pipeline, wrap input in a dictionary and extract final output.
 
         Args:
             input_data: The input data to process.
 
         Returns:
             tf.Tensor: The processed data.
         """
-        return self.pipeline.transform(input_data)
+        if not self.use_dynamic:
+            return self.pipeline.transform(input_data)
+        else:
+            dynamic_pipeline = DynamicPreprocessingPipeline(self.layers)
+            output_dict = dynamic_pipeline.initialize_and_transform(
+                {"input": input_data}
+            )
+            return output_dict.get("input", input_data)