> Source: commit `078df9d` — feat(KDP): adding categorical features hashing
> (parent `55ae7ce`; 8 files changed, +1325 / −184 lines; this file: 284 lines added)
# Categorical Feature Hashing Example

This example demonstrates how to use feature hashing for categorical variables in the KDP library.

## What is Categorical Feature Hashing?

Feature hashing (also known as the "hashing trick") is a technique used to transform high-cardinality categorical features into a fixed-size vector representation. It's particularly useful for:

- Handling categorical features with very large numbers of unique values
- Dealing with previously unseen categories at inference time
- Reducing memory usage for high-cardinality features

## When to Use Hashing vs. Embeddings or One-Hot Encoding

- **One-Hot Encoding**: Best for low-cardinality features (typically <10 categories)
- **Embeddings**: Good for medium-cardinality features where the relationships between categories are important
- **Hashing**: Ideal for high-cardinality features (hundreds or thousands of unique values)

## Basic Example

```python
from kdp.features import CategoricalFeature, FeatureType, CategoryEncodingOptions
from kdp.processor import PreprocessingModel

# Define a categorical feature with hashing
features = {
    "high_cardinality_feature": CategoricalFeature(
        name="high_cardinality_feature",
        feature_type=FeatureType.STRING_CATEGORICAL,
        category_encoding=CategoryEncodingOptions.HASHING,
        hash_bucket_size=1024  # Number of hash buckets
    )
}

# Create a preprocessing model with the features
model = PreprocessingModel(features_specs=features)
```

## Advanced Hashing Options

### Hash with Embeddings

You can combine hashing with embeddings to reduce dimensionality further:

```python
from kdp.features import CategoricalFeature, FeatureType, CategoryEncodingOptions

features = {
    "hashed_with_embedding": CategoricalFeature(
        name="hashed_with_embedding",
        feature_type=FeatureType.STRING_CATEGORICAL,
        category_encoding=CategoryEncodingOptions.HASHING,
        hash_bucket_size=512,  # Number of hash buckets
        hash_with_embedding=True,  # Enable embedding layer after hashing
        embedding_size=16  # Size of the embedding vectors
    )
}
```

### Custom Hash Salt

Adding a salt value to the hash function can help prevent collisions between different features:

```python
features = {
    "product_id": CategoricalFeature(
        name="product_id",
        feature_type=FeatureType.STRING_CATEGORICAL,
        category_encoding=CategoryEncodingOptions.HASHING,
        hash_bucket_size=2048,
        salt=42  # Custom salt value for hashing
    )
}
```
## Comparison of Different Encoding Methods

```python
features = {
    # Small cardinality - one hot encoding
    "product_category": CategoricalFeature(
        name="product_category",
        feature_type=FeatureType.STRING_CATEGORICAL,
        category_encoding=CategoryEncodingOptions.ONE_HOT_ENCODING
    ),

    # Medium cardinality - embeddings
    "store_id": CategoricalFeature(
        name="store_id",
        feature_type=FeatureType.STRING_CATEGORICAL,
        category_encoding=CategoryEncodingOptions.EMBEDDING,
        embedding_size=8
    ),

    # High cardinality - hashing
    "customer_id": CategoricalFeature(
        name="customer_id",
        feature_type=FeatureType.STRING_CATEGORICAL,
        category_encoding=CategoryEncodingOptions.HASHING,
        hash_bucket_size=1024
    ),

    # Very high cardinality - hashing with embedding
    "product_id": CategoricalFeature(
        name="product_id",
        feature_type=FeatureType.STRING_CATEGORICAL,
        category_encoding=CategoryEncodingOptions.HASHING,
        hash_bucket_size=2048,
        hash_with_embedding=True,
        embedding_size=16
    )
}
```

## Automatic Configuration with ModelAdvisor

KDP's `ModelAdvisor` can automatically determine the best encoding strategy for each feature based on data statistics:

```python
from kdp.stats import DatasetStatistics
from kdp.model_advisor import recommend_model_configuration

# First, analyze your dataset
dataset_stats = DatasetStatistics("e_commerce_data.csv")
dataset_stats.compute_statistics()

# Get recommendations from the ModelAdvisor
recommendations = recommend_model_configuration(dataset_stats.features_stats)

# Print feature-specific recommendations
for feature, config in recommendations["features"].items():
    if "HASHING" in config.get("preprocessing", []):
        print(f"Feature '{feature}' recommended for hashing:")
        print(f"  - Hash bucket size: {config['config'].get('hash_bucket_size')}")
        print(f"  - Use embeddings: {config['config'].get('hash_with_embedding', False)}")
        if config['config'].get('hash_with_embedding'):
            print(f"  - Embedding size: {config['config'].get('embedding_size')}")
        print(f"  - Salt value: {config['config'].get('salt')}")
        print(f"  - Notes: {', '.join(config.get('notes', []))}")
        print()

# Generate ready-to-use code
print("Generated code snippet:")
print(recommendations["code_snippet"])
```

The ModelAdvisor uses these heuristics for categorical features:

- For features with <50 unique values: ONE_HOT_ENCODING
- For features with 50-1000 unique values: EMBEDDING
- For features with >1000 unique values: HASHING
- For features with >10,000 unique values: HASHING with embeddings

It also automatically determines:

- The appropriate hash bucket size based on cardinality
- Whether to add salt values to prevent collisions
- Embedding dimensions when using hash_with_embedding=True
## Choosing the Right Hash Bucket Size
159+
160+
The number of hash buckets is a critical parameter that affects model performance:
161+
162+
- Too few buckets: Many categories will hash to the same bucket (high collision rate)
163+
- Too many buckets: Sparse representation that might not generalize well
164+
165+
A good rule of thumb is to use a bucket size that is 2-4 times the number of unique categories in your data.
166+
167+
## Handling Hash Collisions
168+
169+
Hash collisions occur when different category values hash to the same bucket. There are two common strategies to mitigate this:
170+
171+
1. **Increase bucket size**: Use more buckets to reduce collision probability
172+
2. **Multi-hashing**: Apply multiple hash functions and use all outputs:
173+
174+
```python
175+
# Example of using multi-hash technique (available in advanced settings)
176+
features = {
177+
"complex_id": CategoricalFeature(
178+
name="complex_id",
179+
feature_type=FeatureType.STRING_CATEGORICAL,
180+
category_encoding=CategoryEncodingOptions.HASHING,
181+
hash_bucket_size=1024,
182+
hash_with_embedding=True,
183+
multi_hash=True, # Enable multiple hash functions
184+
num_hash_functions=3 # Number of hash functions to use
185+
)
186+
}
187+
```
188+
189+
## Performance Considerations
190+
191+
Hashing is computationally efficient compared to maintaining a large vocabulary mapping, especially when:
192+
193+
- You have a very large number of unique categories
194+
- New categories appear frequently in production
195+
- Memory is constrained
196+
197+
Feature hashing trades off a small amount of accuracy (due to potential collisions) for significant efficiency gains with very high-cardinality features.
198+
199+
## Complete End-to-End Example
200+
201+
Here's a complete example showing how to use feature hashing for e-commerce product data:
202+
203+
```python
204+
import pandas as pd
205+
from kdp.features import CategoricalFeature, FeatureType, CategoryEncodingOptions, NumericalFeature
206+
from kdp.processor import PreprocessingModel
207+
208+
# Create sample e-commerce data
209+
data = {
210+
"product_id": [f"p{i}" for i in range(1000)], # High cardinality
211+
"category": ["electronics", "clothing", "books", "home"] * 250, # Low cardinality
212+
"store_id": [f"store_{i % 100}" for i in range(1000)], # Medium cardinality
213+
"user_id": [f"user_{i % 10000}" for i in range(1000)], # Very high cardinality
214+
"price": [i * 0.1 for i in range(1000)] # Numerical
215+
}
216+
df = pd.DataFrame(data)
217+
df.to_csv("ecommerce.csv", index=False)
218+
219+
# Define features with appropriate encoding strategies
220+
features = {
221+
# Low cardinality - one hot encoding
222+
"category": CategoricalFeature(
223+
name="category",
224+
feature_type=FeatureType.STRING_CATEGORICAL,
225+
category_encoding=CategoryEncodingOptions.ONE_HOT_ENCODING
226+
),
227+
228+
# Medium cardinality - embedding
229+
"store_id": CategoricalFeature(
230+
name="store_id",
231+
feature_type=FeatureType.STRING_CATEGORICAL,
232+
category_encoding=CategoryEncodingOptions.EMBEDDING,
233+
embedding_size=8
234+
),
235+
236+
# High cardinality - hashing
237+
"product_id": CategoricalFeature(
238+
name="product_id",
239+
feature_type=FeatureType.STRING_CATEGORICAL,
240+
category_encoding=CategoryEncodingOptions.HASHING,
241+
hash_bucket_size=2048,
242+
salt=1 # Use different salt values for different features
243+
),
244+
245+
# Very high cardinality - hashing with embedding
246+
"user_id": CategoricalFeature(
247+
name="user_id",
248+
feature_type=FeatureType.STRING_CATEGORICAL,
249+
category_encoding=CategoryEncodingOptions.HASHING,
250+
hash_bucket_size=4096,
251+
hash_with_embedding=True,
252+
embedding_size=16,
253+
salt=2 # Different salt to avoid collisions with product_id
254+
),
255+
256+
# Numerical feature
257+
"price": NumericalFeature(
258+
name="price",
259+
feature_type=FeatureType.FLOAT_NORMALIZED
260+
)
261+
}
262+
263+
# Create and build the model
264+
model = PreprocessingModel(
265+
path_data="ecommerce.csv",
266+
features_specs=features,
267+
output_mode="CONCAT"
268+
)
269+
270+
# Build the preprocessor
271+
preprocessor = model.build_preprocessor()
272+
273+
# Use the preprocessor for inference
274+
input_data = {
275+
"category": ["electronics"],
276+
"store_id": ["store_42"],
277+
"product_id": ["p999"], # Known product
278+
"user_id": ["user_new"], # New user, not seen in training
279+
"price": [99.99]
280+
}
281+
282+
# Process the data - note how hashing handles both known and unknown values
283+
processed = preprocessor(input_data)
284+
print("Output shape:", processed.shape)
