
Commit ea419cf

docs(KDP): adding concrete examples of usage of advanced features to docs (#21)
2 parents a67c634 + 9b0f386 commit ea419cf

File tree

4 files changed: +298 −8 lines changed

docs/example_usages.md

Lines changed: 290 additions & 0 deletions
# Example usages

Let us go through some different example scenarios, to get a feeling for how one could use the library to their advantage with ease and comfort.

## Example 1: Numerical features
```python
import tensorflow as tf  # needed for the custom preprocessors below

from kdp.features import NumericalFeature, FeatureType
from kdp.processor import PreprocessingModel, OutputModeOptions

# Define features
features = {
    # 1. Basic float feature (no preprocessing)
    "basic_float": NumericalFeature(
        name="basic_float",
        feature_type=FeatureType.FLOAT
    ),

    # 2. Another basic float feature (no preprocessing)
    "basic_float2": NumericalFeature(
        name="basic_float2",
        feature_type=FeatureType.FLOAT
    ),

    # 3. Normalized float feature
    "normalized_float": NumericalFeature(
        name="normalized_float",
        feature_type=FeatureType.FLOAT_NORMALIZED
    ),

    # 4. Rescaled float feature
    "rescaled_float": NumericalFeature(
        name="rescaled_float",
        feature_type=FeatureType.FLOAT_RESCALED,
        scale=2.0  # Optional scale parameter
    ),

    # 5. Discretized float feature
    "discretized_float": NumericalFeature(
        name="discretized_float",
        feature_type=FeatureType.FLOAT_DISCRETIZED,
        bin_boundaries=[0.0, 1.0, 2.0]  # Required for discretization
    ),

    # 6. Custom preprocessing pipeline
    "custom_float": NumericalFeature(
        name="custom_float",
        feature_type=FeatureType.FLOAT,
        preprocessors=[
            tf.keras.layers.Rescaling,
            tf.keras.layers.Normalization,
        ],
        # Additional kwargs for the preprocessors
        bin_boundaries=[0.0, 1.0, 2.0],
        mean=0.0,
        variance=1.0,
        scale=4.0  # Required scale parameter for the Rescaling layer
    ),
}

# Define a cross-feature between 2 arbitrary features
# (though tabular attention would be more useful for feature crossings)
feature_crosses = [("normalized_float", "rescaled_float", 10)]  # 10 is the number of bins to hash into

# Now we can create a preprocessing model with the features
ppr = PreprocessingModel(
    path_data="sample_data.csv",
    features_specs=features,
    feature_crosses=feature_crosses,
    features_stats_path="features_stats.json",
    overwrite_stats=True,
    output_mode=OutputModeOptions.CONCAT,

    # Add feature selection to get the most important features
    feature_selection_placement="numeric",  # Choose between (all_features|numeric|categorical)
    feature_selection_units=32,
    feature_selection_dropout=0.10,

    # Add tabular attention to check for feature interactions
    tabular_attention=True,
    tabular_attention_placement="all_features",  # Choose between (none|numeric|categorical|all_features|multi_resolution)
    tabular_attention_heads=3,      # Number of attention heads
    tabular_attention_dim=32,       # Attention dimension
    tabular_attention_dropout=0.1,  # Attention dropout rate
    tabular_attention_embedding_dim=16,  # Embedding dimension
)

# Build the preprocessor
result = ppr.build_preprocessor()

# Transform data using direct model prediction
# (test_batch: a batch of raw inputs keyed by feature name; see the sketch below)
transformed_data = ppr.model.predict(test_batch)

# Get feature importances
feature_importances = ppr.get_feature_importances()
```
Here is the plot of the model:

![Numerical Model](imgs/numerical_example_model.png)
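The snippets above use `test_batch` without defining it. Below is a minimal sketch of what such a batch could look like for this feature set, assuming the preprocessing model accepts a dict keyed by feature name (the column values here are made up purely for illustration):

```python
import tensorflow as tf

# Hypothetical batch of 3 rows; the keys match the feature names defined above.
test_batch = {
    "basic_float": tf.constant([0.1, 0.5, 0.9]),
    "basic_float2": tf.constant([1.2, 3.4, 5.6]),
    "normalized_float": tf.constant([10.0, 20.0, 30.0]),
    "rescaled_float": tf.constant([0.5, 1.5, 2.5]),
    "discretized_float": tf.constant([0.2, 1.3, 2.7]),
    "custom_float": tf.constant([0.0, 1.0, 2.0]),
}

# Keras accepts dict inputs for named multi-input models.
transformed_data = ppr.model.predict(test_batch)
```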

## Example 2: Categorical features

```python
from kdp.features import CategoricalFeature, FeatureType, CategoryEncodingOptions
from kdp.processor import PreprocessingModel, OutputModeOptions

# Define features
features = {
    # 1. Basic string categorical feature with embedding
    "basic_category": CategoricalFeature(
        name="basic_category",
        feature_type=FeatureType.STRING_CATEGORICAL,
        category_encoding=CategoryEncodingOptions.EMBEDDING,
        embedding_size=8  # Custom embedding size
    ),

    # 2. Basic integer categorical feature with one-hot encoding
    "basic_int_category": CategoricalFeature(
        name="basic_int_category",
        feature_type=FeatureType.INTEGER_CATEGORICAL,
        category_encoding=CategoryEncodingOptions.ONE_HOT_ENCODING
    ),

    # 3. High-cardinality categorical feature with embedding
    "high_card_category": CategoricalFeature(
        name="high_card_category",
        feature_type=FeatureType.STRING_CATEGORICAL,
        category_encoding=CategoryEncodingOptions.EMBEDDING,
        # embedding size will be determined automatically based on cardinality
    ),

    # 4. Binary categorical feature with one-hot encoding
    "binary_category": CategoricalFeature(
        name="binary_category",
        feature_type=FeatureType.STRING_CATEGORICAL,
        category_encoding=CategoryEncodingOptions.ONE_HOT_ENCODING
    ),
}

# Cross two of the existing features, hashing into 8 bins
feature_crosses = [("basic_category", "basic_int_category", 8)]

# Now we can create a preprocessing model with the features
ppr = PreprocessingModel(
    path_data="sample_data.csv",
    features_specs=features,
    feature_crosses=feature_crosses,
    features_stats_path="features_stats.json",
    overwrite_stats=True,
    output_mode=OutputModeOptions.CONCAT,

    # Add feature selection
    feature_selection_placement="categorical",
    feature_selection_units=32,
    feature_selection_dropout=0.1,

    # Add tabular attention
    tabular_attention=True,
    tabular_attention_placement="categorical",  # Apply attention only to categorical features
    tabular_attention_heads=4,
    tabular_attention_dim=32,
    tabular_attention_dropout=0.1,
    tabular_attention_embedding_dim=16,

    # Add transformer blocks to learn sophisticated feature interactions
    transfo_nr_blocks=2,
    transfo_nr_heads=4,
    transfo_ff_units=32,
    transfo_dropout_rate=0.1,
    transfo_placement="categorical",  # Apply transformer only to categorical features
)

# Build the preprocessor
result = ppr.build_preprocessor()

# Transform data using direct model prediction
# (test_batch: a batch of raw inputs keyed by feature name, as in Example 1)
transformed_data = ppr.model.predict(test_batch)

# Get feature importances
feature_importances = ppr.get_feature_importances()
```
Here is the plot of the model:

![Categorical Model](imgs/categorical_example_model.png)
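Because the preprocessor is exposed as `ppr.model` (a Keras model, judging by the `.predict` calls above), it can also be composed with downstream layers into an end-to-end network. A minimal sketch, assuming `OutputModeOptions.CONCAT` yields a single concatenated output tensor:

```python
import tensorflow as tf

# Assumption: ppr.model is a standard tf.keras.Model whose inputs are the raw
# feature inputs and whose output is the concatenated preprocessed tensor.
x = tf.keras.layers.Dense(64, activation="relu")(ppr.model.output)
prediction = tf.keras.layers.Dense(1, activation="sigmoid")(x)

# End-to-end model: raw features in, prediction out.
full_model = tf.keras.Model(inputs=ppr.model.inputs, outputs=prediction)
full_model.compile(optimizer="adam", loss="binary_crossentropy")
```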

## Example 3: Multi-feature model

```python
from kdp.features import NumericalFeature, CategoricalFeature, TextFeature, DateFeature, FeatureType
from kdp.processor import PreprocessingModel, OutputModeOptions

# Define features
features = {
    # Numerical features
    "price": NumericalFeature(
        name="price",
        feature_type=FeatureType.FLOAT_NORMALIZED
    ),
    "quantity": NumericalFeature(
        name="quantity",
        feature_type=FeatureType.FLOAT_RESCALED,
        scale=1.0
    ),

    # Categorical features
    "category": CategoricalFeature(
        name="category",
        feature_type=FeatureType.STRING_CATEGORICAL,
        embedding_size=32
    ),
    "brand": CategoricalFeature(
        name="brand",
        feature_type=FeatureType.STRING_CATEGORICAL,
        embedding_size=16
    ),

    # Text features
    "description": TextFeature(
        name="description",
        feature_type=FeatureType.TEXT,
        max_tokens=100
    ),
    "title": TextFeature(
        name="title",
        feature_type=FeatureType.TEXT,
        max_tokens=50  # max number of tokens to keep
    ),

    # Date features
    "sale_date": DateFeature(
        name="sale_date",
        feature_type=FeatureType.DATE,
        add_season=True  # adds a one-hot season indicator (summer, winter, etc.); defaults to False
    )
}

# Create the preprocessor with both transformer blocks and tabular attention
ppr = PreprocessingModel(
    path_data="sample_data.csv",
    features_stats_path="features_stats.json",
    overwrite_stats=True,  # Force stats generation; recommended to be set to True
    features_specs=features,
    output_mode=OutputModeOptions.CONCAT,

    # Transformer block configuration
    transfo_placement="all_features",  # Choose between (categorical|all_features)
    transfo_nr_blocks=2,       # Number of transformer blocks
    transfo_nr_heads=4,        # Number of attention heads in transformer
    transfo_ff_units=64,       # Feed-forward units in transformer
    transfo_dropout_rate=0.1,  # Dropout rate for transformer

    # Tabular attention configuration
    tabular_attention=True,
    tabular_attention_placement="all_features",  # Choose between (none|numeric|categorical|all_features|multi_resolution)
    tabular_attention_heads=3,      # Number of attention heads
    tabular_attention_dim=32,       # Attention dimension
    tabular_attention_dropout=0.1,  # Attention dropout rate
    tabular_attention_embedding_dim=16,  # Embedding dimension

    # Feature selection configuration
    feature_selection_placement="all_features",  # Choose between (all_features|numeric|categorical)
    feature_selection_units=32,
    feature_selection_dropout=0.15,
)

# Build the preprocessor
result = ppr.build_preprocessor()
```

Now, if one wants to plot a block diagram of the model, get the output of the network, or get the importance weights of the features, use the following:

```python
# Plot the model architecture
ppr.plot_model("complex_model.png")

# Transform data using direct model prediction
transformed_data = ppr.model.predict(test_batch)

# Transform data using batch_predict
transformed_data = ppr.batch_predict(test_batch)
transformed_batches = list(transformed_data)  # collect the batches for easier inspection

# Get feature importances
feature_importances = ppr.get_feature_importances()
print("Feature importances:", feature_importances)
```

Here is the plot of the model:

![Complex Model](imgs/complex_model.png)
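
To see which features the selection layer weights most heavily, the importances can be ranked. A small sketch, assuming `get_feature_importances()` returns a mapping of feature name to importance score (the `print` call above suggests a dict):

```python
# Rank features by importance, highest first (assumes a dict of name -> score).
ranked = sorted(feature_importances.items(), key=lambda kv: kv[1], reverse=True)
for name, score in ranked:
    print(f"{name}: {score:.3f}")
```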

docs/layers_factory.md

Lines changed: 8 additions & 8 deletions
````diff
@@ -17,14 +17,14 @@ normalization_layer = PreprocessorLayerFactory.create_layer(
 ```
 Available layers:

-- [x] Normalization
-- [x] Discretization
-- [x] CategoryEncoding
-- [x] Hashing
-- [x] HashedCrossing
-- [x] StringLookup
-- [x] IntegerLookup
-- [x] TextVectorization
+- [x] Normalization - Standardizes numerical features
+- [x] Discretization - Bins continuous features into discrete intervals
+- [x] CategoryEncoding - Converts categorical data into numeric representations
+- [x] Hashing - Performs feature hashing for categorical variables
+- [x] HashedCrossing - Creates feature crosses using hashing
+- [x] StringLookup - Converts string inputs to integer indices
+- [x] IntegerLookup - Maps integer inputs to indexed array positions
+- [x] TextVectorization - Processes raw text into encoded representations
 - [x] ... and more

````
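As a concrete illustration, one of the listed layers could be requested through the factory. This is only a sketch: the exact `create_layer` signature and import path are assumptions based on the snippet above, not confirmed API:

```python
from kdp.layers_factory import PreprocessorLayerFactory  # import path assumed

# Hypothetical usage sketch: ask the factory for a Discretization layer,
# forwarding the underlying Keras layer's constructor arguments as kwargs.
discretization_layer = PreprocessorLayerFactory.create_layer(
    "Discretization",                # layer identified by name (assumed)
    bin_boundaries=[0.0, 1.0, 2.0],  # kwargs passed through to the layer
)
```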
