docs(KDP): added docs

Gandalfdore · Gandalfdore · commit dd728efc6892 · 2025-02-13T20:16:29.000+02:00
diff --git a/docs/complex_example.md b/docs/complex_example.md
@@ -24,6 +24,7 @@ features = {
     "quantity": NumericalFeature(
         name="quantity",
         feature_type=FeatureType.FLOAT_RESCALED
+        prefered_distribution="poisson" # here we could specify a preferred distribution (normal, periodic, etc.)
     ),
 
     # Categorical features
@@ -118,6 +119,10 @@ ppr = PreprocessingModel(
     feature_selection_placement="all_features", # Choose between (all_features|numeric|categorical)
     feature_selection_units=32,
     feature_selection_dropout=0.15,
+
+    # Distribution aware configuration
+    use_distribution_aware=True, # here we activate the distribution aware encoder
+    distribution_aware_bins=1000, # that's the default value, but you can change it for finer data
 )
 
 # Build the preprocessor
diff --git a/docs/distribution_aware_encoder.md b/docs/distribution_aware_encoder.md
@@ -66,11 +66,6 @@ The Distribution-Aware Encoder is an advanced preprocessing layer that automatic
     - Handled via rate parameter estimation
     - Detection: Integer values and variance≈mean
 
-13. **Weibull Distribution**
-    - For lifetime/failure data
-    - Handled via Weibull CDF
-    - Detection: Shape and scale analysis
-
 14. **Cauchy Distribution**
     - For extremely heavy-tailed data
     - Handled via robust location-scale estimation
@@ -81,29 +76,61 @@ The Distribution-Aware Encoder is an advanced preprocessing layer that automatic
     - Handled via mixture model approach
     - Detection: Zero proportion analysis
 
-16. **Bounded Distribution**
-    - For data with known bounds
-    - Handled via scaled beta transformation
-    - Detection: Value range analysis
-
-17. **Ordinal Distribution**
-    - For ordered categorical data
-    - Handled via learned mapping
-    - Detection: Discrete ordered values
-
 ## Usage
 
 ### Basic Usage
+
+The distribution-aware encoder only works with numerical features.
+
 ```python
 from kdp.processor import PreprocessingModel
-
-preprocessor = PreprocessingModel(
-    features_stats=stats,
-    features_specs=specs,
+from kdp.features import NumericalFeature
+
+# Define features
+features = {
+    # Numerical features
+    "feature1": NumericalFeature(),
+    "feature2": NumericalFeature(),
+    # etc ..
+}
+
+# Initialize the model
+model = PreprocessingModel( # the encoder is switched on via use_distribution_aware
+    features=features,
     use_distribution_aware=True
 )
 ```
 
+### Manual Usage
+
+```python
+from kdp.processor import PreprocessingModel
+from kdp.features import NumericalFeature, FeatureType
+
+# Define features
+features = {
+    # Numerical features
+    # (each one declares its name and feature type explicitly)
+    "feature1": NumericalFeature(
+        name="feature1",
+        feature_type=FeatureType.FLOAT_NORMALIZED
+    ),
+    "feature2": NumericalFeature(
+        name="feature2",
+        feature_type=FeatureType.FLOAT_RESCALED,
+        prefered_distribution="log_normal" # here we could specify a preferred distribution (normal, periodic, etc.)
+    )
+    # etc ..
+}
+
+# Initialize the model
+model = PreprocessingModel( # the encoder is switched on via use_distribution_aware
+    features=features,
+    use_distribution_aware=True,
+    distribution_aware_bins=1000, # 1000 is the default value, but you can change it for finer data
+)
+```
+
 ### Advanced Configuration
 ```python
 encoder = DistributionAwareEncoder(
@@ -272,40 +299,6 @@ The DistributionAwareEncoder is integrated into the numeric feature processing p
    - Enable caching for repeated processing
    - Adjust mixture components based on data
 
-## Example Use Cases
-
-### 1. Financial Data
-```python
-# Handle heavy-tailed return distributions
-preprocessor = PreprocessingModel(
-    use_distribution_aware=True,
-    handle_sparsity=False,
-    mixture_components=2
-)
-```
-
-### 2. Temporal Data
-```python
-# Handle periodic patterns
-preprocessor = PreprocessingModel(
-    use_distribution_aware=True,
-    detect_periodicity=True,
-    adaptive_binning=True
-)
-```
-
-### 3. Sparse Features
-```python
-# Handle sparse categorical data
-preprocessor = PreprocessingModel(
-    use_distribution_aware=True,
-    handle_sparsity=True,
-    mixture_components=1
-)
-```
-
-## Monitoring and Debugging
-
 ### Distribution Detection
 ```python
 # Access distribution information
diff --git a/docs/example_usages.md b/docs/example_usages.md
@@ -288,3 +288,74 @@ print("Feature importances:", feature_importances)
 
 Here is the plot of the model:
 ![Complex Model](imgs/complex_model.png)
+
+
+## Example 4: Numerical features with distribution aware encoder
+
+Normally the distribution-aware encoder works well in automatic mode once `use_distribution_aware=True` is set.
+However, we can also manually set the preferred distribution for each numerical feature if we would like to.
+
+```python
+from kdp.features import NumericalFeature, FeatureType
+from kdp.processor import PreprocessingModel, OutputModeOptions
+
+# Define features
+features = {
+    # 1. Default automatic distribution detection
+    "basic_float": NumericalFeature(
+        name="basic_float",
+        feature_type=FeatureType.FLOAT,
+    ),
+
+    # 2. Manually setting a gamma distribution
+    "rescaled_float": NumericalFeature(
+        name="rescaled_float",
+        feature_type=FeatureType.FLOAT_RESCALED,
+        scale=2.0,
+        prefered_distribution="gamma"
+    ),
+    # 3. Custom preprocessing pipeline with a manually set normal distribution
+    "custom_float": NumericalFeature(
+        name="custom_float",
+        feature_type=FeatureType.FLOAT,
+        preprocessors=[
+            tf.keras.layers.Rescaling,
+            tf.keras.layers.Normalization,
+        ],
+        bin_boundaries=[0.0, 1.0, 2.0],
+        mean=0.0,
+        variance=1.0,
+        scale=4.0,
+        prefered_distribution="normal"
+    ),
+}
+
+# Now we can create a preprocessing model with the features
+ppr = PreprocessingModel(
+    path_data="sample_data.csv",
+    features_specs=features,
+    features_stats_path="features_stats.json",
+    overwrite_stats=True,
+    output_mode=OutputModeOptions.CONCAT,
+
+    # Add feature selection to get the most important features
+    feature_selection_placement="numeric", # Choose between (all_features|numeric|categorical)
+
+    # Add tabular attention to check for feature interactions
+    tabular_attention=True,
+
+    # Add distribution aware encoder
+    use_distribution_aware=True
+)
+
+# Build the preprocessor
+result = ppr.build_preprocessor()
+
+# Transform data using direct model prediction
+transformed_data = ppr.model.predict(test_batch)
+
+# Get feature importances
+feature_importances = ppr.get_feature_importances()
+```
+Here is the plot of the model:
+![Numerical Model with Distribution-Aware Encoder](imgs/numerical_example_model_with_distribution_aware.png)
diff --git a/docs/imgs/complex_model.png b/docs/imgs/complex_model.png
diff --git a/docs/imgs/numerical_example_model_with_distribution_aware.png b/docs/imgs/numerical_example_model_with_distribution_aware.png
diff --git a/docs/quick_start.md b/docs/quick_start.md
@@ -31,6 +31,7 @@ model = PreprocessingModel(
     features=features,
     tabular_attention=True,  # Enable attention mechanism
     feature_selection=True   # Enable feature selection
+    use_distribution_aware=True # Enable distribution aware encoder
 )
 ```
 

Original file line number	Diff line number	Diff line change
`@@ -31,6 +31,7 @@ model = PreprocessingModel(`
`31`	`31`	`features=features,`
`32`	`32`	`tabular_attention=True, # Enable attention mechanism`
`33`	`33`	`feature_selection=True # Enable feature selection`
	`34`	`+ use_distribution_aware=True # Enable distribution aware encoder`
`34`	`35`	`)`
`35`	`36`	```
`36`	`37`