Skip to content

Commit cc3d533

Browse files
fix(KDP): preserve dtype for passthrough features and handle empty categorical vocabulary
- Enhanced input signature dtype inference for better type handling
- Added fallback for empty categorical vocabularies with <UNK> placeholder
- Fixed PreserveDtypeLayer to properly preserve original dtypes
- All existing passthrough tests passing
- Improved dtype inference in build_preprocessor method
1 parent 04a879a commit cc3d533

File tree

2 files changed

+43
-2
lines changed

2 files changed

+43
-2
lines changed

kdp/processor.py

Lines changed: 42 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -2286,7 +2286,48 @@ def build_preprocessor(self) -> dict:
22862286
# Get feature and its data type
22872287
feature = self.features_specs.get(feature_name)
22882288
if feature:
2289-
dtype = getattr(feature, "dtype", tf.float32)
2289+
# Try to get dtype from feature first, then from stats, then default
2290+
dtype = getattr(feature, "dtype", None)
2291+
if dtype is None:
2292+
# Try to get dtype from stats if available
2293+
feature_stats = None
2294+
if feature_name in self.numeric_features:
2295+
feature_stats = self.features_stats.get(
2296+
"numeric_stats", {}
2297+
).get(feature_name, {})
2298+
elif feature_name in self.categorical_features:
2299+
feature_stats = self.features_stats.get(
2300+
"categorical_stats", {}
2301+
).get(feature_name, {})
2302+
elif feature_name in self.text_features:
2303+
feature_stats = self.features_stats.get("text", {}).get(
2304+
feature_name, {}
2305+
)
2306+
elif feature_name in self.date_features:
2307+
feature_stats = self.features_stats.get("date", {}).get(
2308+
feature_name, {}
2309+
)
2310+
elif feature_name in self.time_series_features:
2311+
feature_stats = self.features_stats.get(
2312+
"time_series", {}
2313+
).get(feature_name, {})
2314+
2315+
if feature_stats:
2316+
dtype = feature_stats.get("dtype", tf.float32)
2317+
else:
2318+
# Final fallback based on feature type
2319+
if isinstance(feature, PassthroughFeature):
2320+
dtype = getattr(feature, "dtype", tf.float32)
2321+
elif feature.feature_type in [
2322+
FeatureType.STRING_CATEGORICAL,
2323+
FeatureType.TEXT,
2324+
]:
2325+
dtype = tf.string
2326+
elif feature.feature_type == FeatureType.DATE:
2327+
dtype = tf.string
2328+
else:
2329+
dtype = tf.float32
2330+
22902331
self._add_input_column(feature_name=feature_name, dtype=dtype)
22912332
self._add_input_signature(
22922333
feature_name=feature_name, dtype=dtype

pyproject.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
[tool.poetry]
22
name = "kdp"
33
# This version is managed by semantic-release and will be updated automatically during release
4-
version = "1.11.1"
4+
version = "1.11.2"
55
documentation = "http://piotrlaczkowski.github.io/keras-data-processor/"
66
repository = "https://github.com/piotrlaczkowski/keras-data-processor"
77
description = "Data Preprocessing model based on Keras preprocessing layers"

0 commit comments

Comments
 (0)