Commit bd90f11
feat(KDP): Enhance Dynamic Preprocessing Pipeline (#24)
2 parents ad91096 + 1d7e48a commit bd90f11

2 files changed: +130 additions, −12 deletions


kdp/dynamic_pipeline.py

Lines changed: 84 additions & 0 deletions
@@ -0,0 +1,84 @@
import tensorflow as tf


class DynamicPreprocessingPipeline:
    """
    Dynamically initializes and manages a sequence of Keras preprocessing layers,
    with selective retention of outputs based on dependencies among layers, and
    supports streaming data through the pipeline.
    """

    def __init__(self, layers):
        """
        Initializes the pipeline with a list of preprocessing layers.

        Args:
            layers (list): A list of TensorFlow preprocessing layers.
        """
        self.layers = layers
        self.dependency_map = self._analyze_dependencies()

    def _analyze_dependencies(self):
        """
        Analyzes and determines the dependencies of each layer on the outputs
        of previous layers.

        Returns:
            dict: A dictionary mapping each layer's name to the set of layer
                outputs it depends on.
        """
        dependencies = {}
        all_outputs = set()
        for layer in self.layers:
            # If the layer has an input_spec (common in Keras layers), inspect it.
            if hasattr(layer, "input_spec") and layer.input_spec is not None:
                # Use a safe getter so that elements without a 'name' attribute
                # yield None, then filter the Nones out.
                required_inputs = {
                    name
                    for name in tf.nest.flatten(
                        tf.nest.map_structure(
                            lambda x: getattr(x, "name", None), layer.input_spec
                        )
                    )
                    if name is not None
                }
            else:
                # Otherwise, assume the layer depends on all outputs seen so far.
                # Copy the set: aliasing all_outputs directly would let the
                # updates below silently grow this layer's dependency set as
                # later layers are registered.
                required_inputs = set(all_outputs)
            dependencies[layer.name] = required_inputs
            all_outputs.update(required_inputs)
            all_outputs.add(layer.name)
        return dependencies

    def process(self, dataset):
        """
        Processes the dataset through the pipeline using the tf.data API.

        Args:
            dataset (tf.data.Dataset): The dataset where each element is a
                dictionary of features.

        Returns:
            tf.data.Dataset: The processed dataset, with the output of each
                layer stored under that layer's name.
        """

        def _apply_transformations(features):
            current_data = features
            for layer in self.layers:
                # Get the required input keys for the current layer.
                required_keys = self.dependency_map[layer.name]
                # Prepare the input by selecting the keys that exist in the
                # current data.
                current_input = {
                    k: current_data[k] for k in required_keys if k in current_data
                }
                # Process each required input through the layer; the layer is
                # assumed to accept one tensor per key. All results share the
                # key layer.name, so if a layer has several dependencies only
                # the last one's output is retained.
                transformed_output = {
                    layer.name: layer(current_input[k])
                    for k in required_keys
                    if k in current_input
                }
                # Merge the transformed output into the working data dictionary.
                current_data.update(transformed_output)
            return current_data

        return dataset.map(_apply_transformations)
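
For a concrete picture of how the dependency analysis and the tf.data mapping fit together, here is a minimal usage sketch. The Rescaling layers, their names, and the manually assigned InputSpec(name=...) are illustrative assumptions, not part of the commit; naming a layer's input_spec gives _analyze_dependencies an explicit key to pick up, while layers without one fall back to depending on every output seen so far.

import tensorflow as tf

from kdp.dynamic_pipeline import DynamicPreprocessingPipeline

# Hypothetical layers for illustration. Naming the InputSpec tells
# _analyze_dependencies which dictionary key each layer reads from.
scale = tf.keras.layers.Rescaling(1.0 / 255.0, name="scale")
scale.input_spec = tf.keras.layers.InputSpec(name="raw")
center = tf.keras.layers.Rescaling(1.0, offset=-0.5, name="center")
center.input_spec = tf.keras.layers.InputSpec(name="scale")

pipe = DynamicPreprocessingPipeline([scale, center])
# pipe.dependency_map == {"scale": {"raw"}, "center": {"scale"}}

# Each dataset element is a dict of features, keyed to match those dependencies.
ds = tf.data.Dataset.from_tensor_slices({"raw": [[255.0], [0.0]]})
for element in pipe.process(ds):
    print(element["center"].numpy())  # [0.5], then [-0.5]

Because each layer's output is stored under the layer's own name, downstream layers reference upstream ones by layer name, and untouched features such as "raw" pass through unchanged.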

kdp/pipeline.py

Lines changed: 46 additions & 12 deletions
@@ -4,6 +4,7 @@
 from loguru import logger
 
 from kdp.layers_factory import PreprocessorLayerFactory
+from kdp.dynamic_pipeline import DynamicPreprocessingPipeline
 
 
 class ProcessingStep:
@@ -87,44 +88,77 @@ def transform(self, input_data: tf.Tensor) -> tf.Tensor:
 
 
 class FeaturePreprocessor:
-    def __init__(self, name: str) -> None:
-        """Initialize a feature preprocessor.
+    def __init__(self, name: str, use_dynamic: bool = False) -> None:
+        """
+        Initializes a feature preprocessor.
 
         Args:
             name (str): The name of the feature preprocessor.
+            use_dynamic (bool): Whether to use the dynamic preprocessing pipeline.
         """
         self.name = name
-        self.pipeline = Pipeline(name=name)
+        self.use_dynamic = use_dynamic
+        if not self.use_dynamic:
+            self.pipeline = Pipeline(name=name)
+        else:
+            self.layers = []  # for dynamic pipeline
 
     def add_processing_step(
         self, layer_creator: Callable[..., tf.keras.layers.Layer] = None, **layer_kwargs
     ) -> None:
-        """Add a processing step to the feature preprocessor.
+        """
+        Add a preprocessing layer to the feature preprocessor pipeline.
+        If using the standard pipeline, a ProcessingStep is added;
+        otherwise, the layer is added to a list for dynamic handling.
 
         Args:
             layer_creator (Callable[..., tf.keras.layers.Layer]): A callable that creates a layer.
                 If not provided, the default layer creator is used.
            **layer_kwargs: Additional keyword arguments for the layer creator.
         """
         layer_creator = layer_creator or PreprocessorLayerFactory.create_layer
-        step = ProcessingStep(layer_creator=layer_creator, **layer_kwargs)
-        self.pipeline.add_step(step=step)
+        if self.use_dynamic:
+            layer = layer_creator(**layer_kwargs)
+            logger.info(f"Adding {layer.name} to dynamic preprocessing pipeline")
+            self.layers.append(layer)
+        else:
+            step = ProcessingStep(layer_creator=layer_creator, **layer_kwargs)
+            self.pipeline.add_step(step=step)
 
     def chain(self, input_layer) -> tf.keras.layers.Layer:
-        """Chain the preprocessor's pipeline steps starting from the input layer.
+        """
+        Chains the processing steps starting from the given input_layer.
 
-        Args:
-            input_layer: The input layer to start the chain from.
+        For a static pipeline, this delegates to the internal Pipeline's chain() method.
+        For the dynamic pipeline, it constructs the dynamic pipeline on the fly.
         """
-        return self.pipeline.chain(input_layer)
+        if not self.use_dynamic:
+            return self.pipeline.chain(input_layer)
+        else:
+            dynamic_pipeline = DynamicPreprocessingPipeline(self.layers)
+            # In the dynamic case, we use a dict for the input.
+            output_dict = dynamic_pipeline.initialize_and_transform(
+                {"input": input_layer}
+            )
+            # Return the transformed data at key "input" (or adjust as needed).
+            return output_dict.get("input", input_layer)
 
     def transform(self, input_data: tf.Tensor) -> tf.Tensor:
-        """Apply the feature preprocessor to the input data.
+        """
+        Process the input data through the pipeline.
+        For the dynamic pipeline, wrap the input in a dictionary and extract the final output.
 
         Args:
             input_data: The input data to process.
 
         Returns:
             tf.Tensor: The processed data.
         """
-        return self.pipeline.transform(input_data)
+        if not self.use_dynamic:
+            return self.pipeline.transform(input_data)
+        else:
+            dynamic_pipeline = DynamicPreprocessingPipeline(self.layers)
+            output_dict = dynamic_pipeline.initialize_and_transform(
+                {"input": input_data}
+            )
+            return output_dict.get("input", input_data)
