|
| 1 | +# Categorical Feature Hashing Example |
| 2 | + |
| 3 | +This example demonstrates how to use feature hashing for categorical variables in the KDP library. |
| 4 | + |
| 5 | +## What is Categorical Feature Hashing? |
| 6 | + |
| 7 | +Feature hashing (also known as the "hashing trick") is a technique used to transform high-cardinality categorical features into a fixed-size vector representation. It's particularly useful for: |
| 8 | + |
| 9 | +- Handling categorical features with very large numbers of unique values |
| 10 | +- Dealing with previously unseen categories at inference time |
| 11 | +- Reducing memory usage for high-cardinality features |
| 12 | + |
| 13 | +## When to Use Hashing vs. Embeddings or One-Hot Encoding |
| 14 | + |
| 15 | +- **One-Hot Encoding**: Best for low-cardinality features (typically fewer than 50 categories, matching the ModelAdvisor heuristics below) |
| 16 | +- **Embeddings**: Good for medium-cardinality features where the relationships between categories are important |
| 17 | +- **Hashing**: Ideal for high-cardinality features (hundreds or thousands of unique values) |
| 18 | + |
| 19 | +## Basic Example |
| 20 | + |
| 21 | +```python |
| 22 | +from kdp.features import CategoricalFeature, FeatureType, CategoryEncodingOptions |
| 23 | +from kdp.processor import PreprocessingModel |
| 24 | + |
| 25 | +# Define a categorical feature with hashing |
| 26 | +features = { |
| 27 | + "high_cardinality_feature": CategoricalFeature( |
| 28 | + name="high_cardinality_feature", |
| 29 | + feature_type=FeatureType.STRING_CATEGORICAL, |
| 30 | + category_encoding=CategoryEncodingOptions.HASHING, |
| 31 | + hash_bucket_size=1024 # Number of hash buckets |
| 32 | + ) |
| 33 | +} |
| 34 | + |
| 35 | +# Create a preprocessing model with the features |
| 36 | +model = PreprocessingModel(features_specs=features) |
| 37 | +``` |
| 38 | + |
| 39 | +## Advanced Hashing Options |
| 40 | + |
| 41 | +### Hash with Embeddings |
| 42 | + |
| 43 | +You can combine hashing with embeddings to reduce dimensionality further: |
| 44 | + |
| 45 | +```python |
| 46 | +from kdp.features import CategoricalFeature, FeatureType, CategoryEncodingOptions |
| 47 | + |
| 48 | +features = { |
| 49 | + "hashed_with_embedding": CategoricalFeature( |
| 50 | + name="hashed_with_embedding", |
| 51 | + feature_type=FeatureType.STRING_CATEGORICAL, |
| 52 | + category_encoding=CategoryEncodingOptions.HASHING, |
| 53 | + hash_bucket_size=512, # Number of hash buckets |
| 54 | + hash_with_embedding=True, # Enable embedding layer after hashing |
| 55 | + embedding_size=16 # Size of the embedding vectors |
| 56 | + ) |
| 57 | +} |
| 58 | +``` |
| 59 | + |
| 60 | +### Custom Hash Salt |
| 61 | + |
| 62 | +Adding a salt value to the hash function can help prevent collisions between different features: |
| 63 | + |
| 64 | +```python |
| 65 | +features = { |
| 66 | + "product_id": CategoricalFeature( |
| 67 | + name="product_id", |
| 68 | + feature_type=FeatureType.STRING_CATEGORICAL, |
| 69 | + category_encoding=CategoryEncodingOptions.HASHING, |
| 70 | + hash_bucket_size=2048, |
| 71 | + salt=42 # Custom salt value for hashing |
| 72 | + ) |
| 73 | +} |
| 74 | +``` |
| 75 | + |
| 76 | +## Comparison of Different Encoding Methods |
| 77 | + |
| 78 | +```python |
| 79 | +features = { |
| 80 | + # Small cardinality - one hot encoding |
| 81 | + "product_category": CategoricalFeature( |
| 82 | + name="product_category", |
| 83 | + feature_type=FeatureType.STRING_CATEGORICAL, |
| 84 | + category_encoding=CategoryEncodingOptions.ONE_HOT_ENCODING |
| 85 | + ), |
| 86 | + |
| 87 | + # Medium cardinality - embeddings |
| 88 | + "store_id": CategoricalFeature( |
| 89 | + name="store_id", |
| 90 | + feature_type=FeatureType.STRING_CATEGORICAL, |
| 91 | + category_encoding=CategoryEncodingOptions.EMBEDDING, |
| 92 | + embedding_size=8 |
| 93 | + ), |
| 94 | + |
| 95 | + # High cardinality - hashing |
| 96 | + "customer_id": CategoricalFeature( |
| 97 | + name="customer_id", |
| 98 | + feature_type=FeatureType.STRING_CATEGORICAL, |
| 99 | + category_encoding=CategoryEncodingOptions.HASHING, |
| 100 | + hash_bucket_size=1024 |
| 101 | + ), |
| 102 | + |
| 103 | + # Very high cardinality - hashing with embedding |
| 104 | + "product_id": CategoricalFeature( |
| 105 | + name="product_id", |
| 106 | + feature_type=FeatureType.STRING_CATEGORICAL, |
| 107 | + category_encoding=CategoryEncodingOptions.HASHING, |
| 108 | + hash_bucket_size=2048, |
| 109 | + hash_with_embedding=True, |
| 110 | + embedding_size=16 |
| 111 | + ) |
| 112 | +} |
| 113 | +``` |
| 114 | + |
| 115 | +## Automatic Configuration with ModelAdvisor |
| 116 | + |
| 117 | +KDP's `ModelAdvisor` can automatically determine the best encoding strategy for each feature based on data statistics: |
| 118 | + |
| 119 | +```python |
| 120 | +from kdp.stats import DatasetStatistics |
| 121 | +from kdp.model_advisor import recommend_model_configuration |
| 122 | + |
| 123 | +# First, analyze your dataset |
| 124 | +dataset_stats = DatasetStatistics("e_commerce_data.csv") |
| 125 | +dataset_stats.compute_statistics() |
| 126 | + |
| 127 | +# Get recommendations from the ModelAdvisor |
| 128 | +recommendations = recommend_model_configuration(dataset_stats.features_stats) |
| 129 | + |
| 130 | +# Print feature-specific recommendations |
| 131 | +for feature, config in recommendations["features"].items(): |
| 132 | + if "HASHING" in config.get("preprocessing", []): |
| 133 | + print(f"Feature '{feature}' recommended for hashing:") |
| 134 | + print(f" - Hash bucket size: {config['config'].get('hash_bucket_size')}") |
| 135 | + print(f" - Use embeddings: {config['config'].get('hash_with_embedding', False)}") |
| 136 | + if config['config'].get('hash_with_embedding'): |
| 137 | + print(f" - Embedding size: {config['config'].get('embedding_size')}") |
| 138 | + print(f" - Salt value: {config['config'].get('salt')}") |
| 139 | + print(f" - Notes: {', '.join(config.get('notes', []))}") |
| 140 | + print() |
| 141 | + |
| 142 | +# Generate ready-to-use code |
| 143 | +print("Generated code snippet:") |
| 144 | +print(recommendations["code_snippet"]) |
| 145 | +``` |
| 146 | + |
| 147 | +The ModelAdvisor uses these heuristics for categorical features: |
| 148 | +- For features with <50 unique values: ONE_HOT_ENCODING |
| 149 | +- For features with 50-1000 unique values: EMBEDDING |
| 150 | +- For features with 1,000–10,000 unique values: HASHING |
| 151 | +- For features with >10,000 unique values: HASHING with embeddings |
| 152 | + |
| 153 | +It also automatically determines: |
| 154 | +- The appropriate hash bucket size based on cardinality |
| 155 | +- Whether to add salt values to prevent collisions |
| 156 | +- Embedding dimensions when using hash_with_embedding=True |
| 157 | + |
| 158 | +## Choosing the Right Hash Bucket Size |
| 159 | + |
| 160 | +The number of hash buckets is a critical parameter that affects model performance: |
| 161 | + |
| 162 | +- Too few buckets: Many categories will hash to the same bucket (high collision rate) |
| 163 | +- Too many buckets: Sparse representation that might not generalize well |
| 164 | + |
| 165 | +A good rule of thumb is to use a bucket size that is 2-4 times the number of unique categories in your data. |
| 166 | + |
| 167 | +## Handling Hash Collisions |
| 168 | + |
| 169 | +Hash collisions occur when different category values hash to the same bucket. There are two common strategies to mitigate this: |
| 170 | + |
| 171 | +1. **Increase bucket size**: Use more buckets to reduce collision probability |
| 172 | +2. **Multi-hashing**: Apply multiple hash functions and use all outputs: |
| 173 | + |
| 174 | +```python |
| 175 | +# Example of using multi-hash technique (available in advanced settings) |
| 176 | +features = { |
| 177 | + "complex_id": CategoricalFeature( |
| 178 | + name="complex_id", |
| 179 | + feature_type=FeatureType.STRING_CATEGORICAL, |
| 180 | + category_encoding=CategoryEncodingOptions.HASHING, |
| 181 | + hash_bucket_size=1024, |
| 182 | + hash_with_embedding=True, |
| 183 | + multi_hash=True, # Enable multiple hash functions |
| 184 | + num_hash_functions=3 # Number of hash functions to use |
| 185 | + ) |
| 186 | +} |
| 187 | +``` |
| 188 | + |
| 189 | +## Performance Considerations |
| 190 | + |
| 191 | +Hashing is computationally efficient compared to maintaining a large vocabulary mapping, especially when: |
| 192 | + |
| 193 | +- You have a very large number of unique categories |
| 194 | +- New categories appear frequently in production |
| 195 | +- Memory is constrained |
| 196 | + |
| 197 | +Feature hashing trades off a small amount of accuracy (due to potential collisions) for significant efficiency gains with very high-cardinality features. |
| 198 | + |
| 199 | +## Complete End-to-End Example |
| 200 | + |
| 201 | +Here's a complete example showing how to use feature hashing for e-commerce product data: |
| 202 | + |
| 203 | +```python |
| 204 | +import pandas as pd |
| 205 | +from kdp.features import CategoricalFeature, FeatureType, CategoryEncodingOptions, NumericalFeature |
| 206 | +from kdp.processor import PreprocessingModel |
| 207 | + |
| 208 | +# Create sample e-commerce data |
| 209 | +data = { |
| 210 | + "product_id": [f"p{i}" for i in range(1000)], # High cardinality |
| 211 | + "category": ["electronics", "clothing", "books", "home"] * 250, # Low cardinality |
| 212 | + "store_id": [f"store_{i % 100}" for i in range(1000)], # Medium cardinality |
| 213 | + "user_id": [f"user_{i % 10000}" for i in range(1000)], # Very high cardinality |
| 214 | + "price": [i * 0.1 for i in range(1000)] # Numerical |
| 215 | +} |
| 216 | +df = pd.DataFrame(data) |
| 217 | +df.to_csv("ecommerce.csv", index=False) |
| 218 | + |
| 219 | +# Define features with appropriate encoding strategies |
| 220 | +features = { |
| 221 | + # Low cardinality - one hot encoding |
| 222 | + "category": CategoricalFeature( |
| 223 | + name="category", |
| 224 | + feature_type=FeatureType.STRING_CATEGORICAL, |
| 225 | + category_encoding=CategoryEncodingOptions.ONE_HOT_ENCODING |
| 226 | + ), |
| 227 | + |
| 228 | + # Medium cardinality - embedding |
| 229 | + "store_id": CategoricalFeature( |
| 230 | + name="store_id", |
| 231 | + feature_type=FeatureType.STRING_CATEGORICAL, |
| 232 | + category_encoding=CategoryEncodingOptions.EMBEDDING, |
| 233 | + embedding_size=8 |
| 234 | + ), |
| 235 | + |
| 236 | + # High cardinality - hashing |
| 237 | + "product_id": CategoricalFeature( |
| 238 | + name="product_id", |
| 239 | + feature_type=FeatureType.STRING_CATEGORICAL, |
| 240 | + category_encoding=CategoryEncodingOptions.HASHING, |
| 241 | + hash_bucket_size=2048, |
| 242 | + salt=1 # Use different salt values for different features |
| 243 | + ), |
| 244 | + |
| 245 | + # Very high cardinality - hashing with embedding |
| 246 | + "user_id": CategoricalFeature( |
| 247 | + name="user_id", |
| 248 | + feature_type=FeatureType.STRING_CATEGORICAL, |
| 249 | + category_encoding=CategoryEncodingOptions.HASHING, |
| 250 | + hash_bucket_size=4096, |
| 251 | + hash_with_embedding=True, |
| 252 | + embedding_size=16, |
| 253 | + salt=2 # Different salt to avoid collisions with product_id |
| 254 | + ), |
| 255 | + |
| 256 | + # Numerical feature |
| 257 | + "price": NumericalFeature( |
| 258 | + name="price", |
| 259 | + feature_type=FeatureType.FLOAT_NORMALIZED |
| 260 | + ) |
| 261 | +} |
| 262 | + |
| 263 | +# Create and build the model |
| 264 | +model = PreprocessingModel( |
| 265 | + path_data="ecommerce.csv", |
| 266 | + features_specs=features, |
| 267 | + output_mode="CONCAT" |
| 268 | +) |
| 269 | + |
| 270 | +# Build the preprocessor |
| 271 | +preprocessor = model.build_preprocessor() |
| 272 | + |
| 273 | +# Use the preprocessor for inference |
| 274 | +input_data = { |
| 275 | + "category": ["electronics"], |
| 276 | + "store_id": ["store_42"], |
| 277 | + "product_id": ["p999"], # Known product |
| 278 | + "user_id": ["user_new"], # New user, not seen in training |
| 279 | + "price": [99.99] |
| 280 | +} |
| 281 | + |
| 282 | +# Process the data - note how hashing handles both known and unknown values |
| 283 | +processed = preprocessor(input_data) |
| 284 | +print("Output shape:", processed.shape) |
| 285 | +``` |