Skip to content

Commit 11f258d

Browse files
committed
fix(KDP): broke transform method into 2 separate methods and end-to-end tests
1 parent 790f102 commit 11f258d

File tree

5 files changed

+534
-544
lines changed

5 files changed

+534
-544
lines changed

complex_model.png

311 KB
Loading

kdp/processor.py

Lines changed: 18 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
from functools import wraps
99
from typing import Any
1010

11+
import numpy as np
1112
import pandas as pd
1213
import tensorflow as tf
1314
from loguru import logger
@@ -1437,42 +1438,37 @@ def get_feature_statistics(self) -> dict:
14371438
"output_mode": self.output_mode,
14381439
}
14391440

1440-
def transform(self, data: tf.data.Dataset | pd.DataFrame | dict) -> dict[str, Any]:
1441-
"""Transform input data using the built preprocessor model.
1441+
def _convert_to_dataset(self, data: tf.data.Dataset | pd.DataFrame | dict) -> tf.data.Dataset:
1442+
"""Convert input data to TensorFlow dataset.
14421443
14431444
Args:
1444-
data: Input data to transform. Can be a DataFrame, Dataset, or dict.
1445+
data: Input data to convert. Can be a DataFrame, Dataset, or dict.
14451446
14461447
Returns:
1447-
dict[str, Any]: Dictionary containing:
1448-
- transformed_data: The transformed data output
1449-
- {feature_name}_weights: Weight for each feature from feature selection
1448+
tf.data.Dataset: The converted dataset.
14501449
14511450
Raises:
1452-
ValueError: If preprocessor hasn't been built yet.
1451+
ValueError: If input data is not a supported type.
14531452
"""
1454-
# Convert input data to TensorFlow dataset if needed
14551453
if isinstance(data, pd.DataFrame):
1456-
dataset = tf.data.Dataset.from_tensor_slices(dict(data)).batch(32)
1454+
return tf.data.Dataset.from_tensor_slices(dict(data)).batch(32)
14571455
elif isinstance(data, dict):
1458-
dataset = tf.data.Dataset.from_tensor_slices(data).batch(32)
1456+
return tf.data.Dataset.from_tensor_slices(data).batch(32)
14591457
elif isinstance(data, tf.data.Dataset):
1460-
dataset = data
1458+
return data
14611459
else:
14621460
raise ValueError("Input data must be a DataFrame, dict, or TensorFlow Dataset")
14631461

1464-
# Transform the data using the model
1465-
transformed = self.model.predict(dataset)
1462+
def _extract_feature_weights(self) -> dict[str, np.ndarray]:
1463+
"""Extract feature importance weights from feature selection layers.
14661464
1467-
# Initialize return dictionary with transformed data
1468-
result = {"transformed_data": transformed}
1469-
1470-
# Get feature importance from the feature selection layer if it exists
1465+
Returns:
1466+
dict[str, np.ndarray]: Dictionary mapping feature names to their importance weights.
1467+
"""
1468+
weights = {}
14711469
for layer in self.model.layers:
14721470
if "feature_selection" in layer.name:
1473-
weights = layer.get_weights()
1471+
layer_weights = layer.get_weights()
14741472
for i, feature_name in enumerate(self.features_specs.keys()):
1475-
# Add weights for each feature with the expected key format
1476-
result[f"{feature_name}_weights"] = weights[0][:, i]
1477-
1478-
return result
1473+
weights[f"{feature_name}_weights"] = layer_weights[0][:, i]
1474+
return weights

test/test_feature_selection.py

Lines changed: 70 additions & 206 deletions
Original file line numberDiff line numberDiff line change
@@ -28,11 +28,39 @@ def test_gating_mechanism(self):
2828
outputs = self.layer(inputs)
2929
self.assertAllInRange(outputs, -10.0, 10.0) # Reasonable range for gated outputs
3030

31-
def test_serialization(self):
31+
def test_serialization_basic(self):
3232
config = self.layer.get_config()
3333
new_layer = GatedLinearUnit.from_config(config)
3434
self.assertEqual(self.layer.units, new_layer.units)
3535

36+
def test_output_types(self):
37+
"""Test output types for GatedLinearUnit."""
38+
gl = GatedLinearUnit(units=64)
39+
inputs = tf.random.normal((32, 100))
40+
outputs = gl(inputs)
41+
42+
# Verify output is a tensor with correct dtype
43+
self.assertIsInstance(outputs, tf.Tensor)
44+
self.assertEqual(outputs.dtype, tf.float32)
45+
46+
def test_serialization_and_output_consistency(self):
47+
"""Test serialization and deserialization of GatedLinearUnit."""
48+
dummy_input = tf.random.normal((1, 100))
49+
50+
gl = GatedLinearUnit(units=64)
51+
gl(dummy_input) # This builds the layer
52+
53+
config = gl.get_config()
54+
gl_new = GatedLinearUnit.from_config(config)
55+
gl_new(dummy_input) # Build the new layer too
56+
57+
# Set the weights to be the same
58+
gl_new.set_weights(gl.get_weights())
59+
60+
# Test both layers produce the same output
61+
inputs = tf.random.normal((32, 100))
62+
self.assertAllClose(gl(inputs), gl_new(inputs))
63+
3664

3765
class TestGatedResidualNetwork(tf.test.TestCase):
3866
def setUp(self):
@@ -85,12 +113,50 @@ def test_dropout_behavior(self):
85113
for i in range(len(inference_outputs) - 1):
86114
self.assertAllClose(inference_outputs[i], inference_outputs[i + 1])
87115

88-
def test_serialization(self):
116+
def test_serialization_basic(self):
89117
config = self.layer.get_config()
90118
new_layer = GatedResidualNetwork.from_config(config)
91119
self.assertEqual(self.layer.units, new_layer.units)
92120
self.assertEqual(self.layer.dropout_rate, new_layer.dropout_rate)
93121

122+
def test_output_types(self):
123+
"""Test output types for GatedResidualNetwork."""
124+
batch_size = 32
125+
input_dim = 64
126+
dropout_rate = 0.5
127+
128+
grn = GatedResidualNetwork(units=input_dim, dropout_rate=dropout_rate)
129+
inputs = tf.random.normal((batch_size, input_dim))
130+
131+
outputs = grn(inputs)
132+
133+
# Verify output is a tensor with correct dtype
134+
self.assertIsInstance(outputs, tf.Tensor)
135+
self.assertEqual(outputs.dtype, tf.float32)
136+
137+
# Test with different input types
138+
inputs_int = tf.cast(inputs, tf.float32)
139+
outputs_from_int = grn(inputs_int)
140+
self.assertEqual(outputs_from_int.dtype, tf.float32) # Should always output float32
141+
142+
def test_serialization_and_output_consistency(self):
143+
"""Test serialization and deserialization of GatedResidualNetwork."""
144+
grn = GatedResidualNetwork(units=64, dropout_rate=0.3)
145+
# Build the layer first
146+
dummy_input = tf.random.normal((1, 64))
147+
grn(dummy_input)
148+
149+
config = grn.get_config()
150+
grn_new = GatedResidualNetwork.from_config(config)
151+
grn_new(dummy_input)
152+
153+
# Set the weights to be the same
154+
grn_new.set_weights(grn.get_weights())
155+
156+
# Test both layers produce the same output
157+
inputs = tf.random.normal((32, 64))
158+
self.assertAllClose(grn(inputs), grn_new(inputs))
159+
94160

95161
class TestVariableSelection(tf.test.TestCase):
96162
def setUp(self):
@@ -148,215 +214,13 @@ def test_dropout_behavior(self):
148214
for i in range(len(inference_outputs) - 1):
149215
self.assertAllClose(inference_outputs[i], inference_outputs[i + 1])
150216

151-
def test_serialization(self):
217+
def test_serialization_basic(self):
152218
config = self.layer.get_config()
153219
new_layer = VariableSelection.from_config(config)
154220
self.assertEqual(self.layer.nr_features, new_layer.nr_features)
155221
self.assertEqual(self.layer.units, new_layer.units)
156222
self.assertEqual(self.layer.dropout_rate, new_layer.dropout_rate)
157223

158-
159-
class TestGatedLinearUnit2(tf.test.TestCase):
160-
"""Test suite for GatedLinearUnit layer."""
161-
162-
def test_output_shape(self):
163-
"""Test that output shape is correct."""
164-
batch_size = 32
165-
input_dim = 100
166-
units = 64
167-
168-
gl = GatedLinearUnit(units=units)
169-
inputs = tf.random.normal((batch_size, input_dim))
170-
outputs = gl(inputs)
171-
172-
self.assertEqual(outputs.shape, (batch_size, units))
173-
174-
def test_gating_mechanism(self):
175-
"""Test that gating mechanism properly filters values."""
176-
gl = GatedLinearUnit(units=1)
177-
inputs = tf.constant([[1.0], [2.0], [3.0]])
178-
179-
# Get internal gate values
180-
gate_values = gl.sigmoid(inputs)
181-
182-
# Verify gates are between 0 and 1
183-
self.assertTrue(tf.reduce_all(gate_values >= 0))
184-
self.assertTrue(tf.reduce_all(gate_values <= 1))
185-
186-
def test_output_types(self):
187-
"""Test output types for GatedLinearUnit."""
188-
gl = GatedLinearUnit(units=64)
189-
inputs = tf.random.normal((32, 100))
190-
outputs = gl(inputs)
191-
192-
# Verify output is a tensor with correct dtype
193-
self.assertIsInstance(outputs, tf.Tensor)
194-
self.assertEqual(outputs.dtype, tf.float32)
195-
196-
def test_serialization(self):
197-
"""Test serialization and deserialization of GatedLinearUnit."""
198-
dummy_input = tf.random.normal((1, 100))
199-
200-
gl = GatedLinearUnit(units=64)
201-
gl(dummy_input) # This builds the layer
202-
203-
config = gl.get_config()
204-
gl_new = GatedLinearUnit.from_config(config)
205-
gl_new(dummy_input) # Build the new layer too
206-
207-
# Set the weights to be the same
208-
gl_new.set_weights(gl.get_weights())
209-
210-
# Test both layers produce the same output
211-
inputs = tf.random.normal((32, 100))
212-
self.assertAllClose(gl(inputs), gl_new(inputs))
213-
214-
215-
class TestGatedResidualNetwork2(tf.test.TestCase):
216-
"""Test suite for GatedResidualNetwork layer."""
217-
218-
def test_output_shape(self):
219-
"""Test that output shape matches input shape."""
220-
batch_size = 32
221-
input_dim = 64
222-
units = 64
223-
224-
grn = GatedResidualNetwork(units=units)
225-
inputs = tf.random.normal((batch_size, input_dim))
226-
outputs = grn(inputs)
227-
228-
self.assertEqual(outputs.shape, (batch_size, units))
229-
230-
def test_residual_connection(self):
231-
"""Test that residual connection is working."""
232-
grn = GatedResidualNetwork(units=2, dropout_rate=0.0)
233-
inputs = tf.constant([[1.0, 2.0]])
234-
235-
# Get output with and without residual connection
236-
with_residual = grn(inputs)
237-
238-
# Verify output is different from input but related
239-
self.assertNotAllClose(with_residual, inputs)
240-
self.assertGreater(tf.reduce_max(tf.abs(with_residual - inputs)), 0)
241-
242-
def test_dropout_behavior(self):
243-
"""Test dropout behavior in training vs inference."""
244-
batch_size = 32
245-
input_dim = 64
246-
dropout_rate = 0.5
247-
248-
grn = GatedResidualNetwork(units=input_dim, dropout_rate=dropout_rate)
249-
inputs = tf.random.normal((batch_size, input_dim))
250-
251-
# Training mode (should apply dropout)
252-
train_outputs = grn(inputs, training=True)
253-
254-
# Inference mode (should not apply dropout)
255-
inference_outputs = grn(inputs, training=False)
256-
257-
# Outputs should be different in training vs inference
258-
self.assertNotAllClose(train_outputs, inference_outputs)
259-
260-
def test_output_types(self):
261-
"""Test output types for GatedResidualNetwork."""
262-
batch_size = 32
263-
input_dim = 64
264-
dropout_rate = 0.5
265-
266-
grn = GatedResidualNetwork(units=input_dim, dropout_rate=dropout_rate)
267-
inputs = tf.random.normal((batch_size, input_dim))
268-
269-
outputs = grn(inputs)
270-
271-
# Verify output is a tensor with correct dtype
272-
self.assertIsInstance(outputs, tf.Tensor)
273-
self.assertEqual(outputs.dtype, tf.float32)
274-
275-
# Test with different input types
276-
inputs_int = tf.cast(inputs, tf.float32)
277-
outputs_from_int = grn(inputs_int)
278-
self.assertEqual(outputs_from_int.dtype, tf.float32) # Should always output float32
279-
280-
def test_serialization(self):
281-
"""Test serialization and deserialization of GatedResidualNetwork."""
282-
grn = GatedResidualNetwork(units=64, dropout_rate=0.3)
283-
# Build the layer first
284-
dummy_input = tf.random.normal((1, 64))
285-
grn(dummy_input)
286-
287-
config = grn.get_config()
288-
grn_new = GatedResidualNetwork.from_config(config)
289-
grn_new(dummy_input)
290-
291-
# Set the weights to be the same
292-
grn_new.set_weights(grn.get_weights())
293-
294-
# Test both layers produce the same output
295-
inputs = tf.random.normal((32, 64))
296-
self.assertAllClose(grn(inputs), grn_new(inputs))
297-
298-
299-
class TestVariableSelection2(tf.test.TestCase):
300-
"""Test suite for VariableSelection layer."""
301-
302-
def test_output_shape(self):
303-
"""Test output shapes for features and weights."""
304-
batch_size = 32
305-
nr_features = 3
306-
feature_dims = [100, 200, 300]
307-
units = 64
308-
309-
vs = VariableSelection(nr_features=nr_features, units=units)
310-
inputs = [tf.random.normal((batch_size, dim)) for dim in feature_dims]
311-
312-
selected_features, feature_weights = vs(inputs)
313-
314-
# Check selected features shape
315-
self.assertEqual(selected_features.shape, (batch_size, units))
316-
317-
# Check weights shape
318-
self.assertEqual(feature_weights.shape, (batch_size, nr_features, 1))
319-
320-
def test_weight_properties(self):
321-
"""Test that feature weights sum to 1 and are non-negative."""
322-
batch_size = 32
323-
nr_features = 3
324-
feature_dims = [10, 20, 30]
325-
units = 64
326-
327-
vs = VariableSelection(nr_features=nr_features, units=units)
328-
inputs = [tf.random.normal((batch_size, dim)) for dim in feature_dims]
329-
330-
_, feature_weights = vs(inputs)
331-
332-
# Remove the last dimension for easier testing
333-
weights = tf.squeeze(feature_weights, axis=-1)
334-
335-
# Test weights sum to 1 for each sample
336-
sums = tf.reduce_sum(weights, axis=1)
337-
self.assertAllClose(sums, tf.ones_like(sums))
338-
339-
# Test weights are non-negative
340-
self.assertTrue(tf.reduce_all(weights >= 0))
341-
342-
def test_feature_selection(self):
343-
"""Test that the layer can select important features."""
344-
batch_size = 10
345-
nr_features = 2
346-
units = 4
347-
348-
vs = VariableSelection(nr_features=nr_features, units=units)
349-
350-
# Create one important and one noisy feature
351-
important_feature = tf.ones((batch_size, 2))
352-
noisy_feature = tf.random.normal((batch_size, 2)) * 0.1
353-
354-
selected_features, feature_weights = vs([important_feature, noisy_feature])
355-
356-
# The important feature should get higher weights
357-
weights = tf.squeeze(feature_weights, axis=-1)
358-
self.assertTrue(tf.reduce_mean(weights[:, 0]) > tf.reduce_mean(weights[:, 1]))
359-
360224
def test_output_types(self):
361225
"""Test output types for VariableSelection."""
362226
batch_size = 32
@@ -398,7 +262,7 @@ def test_mixed_input_types(self):
398262
self.assertEqual(selected_features.dtype, tf.float32)
399263
self.assertEqual(feature_weights.dtype, tf.float32)
400264

401-
def test_serialization(self):
265+
def test_serialization_and_output_consistency(self):
402266
"""Test serialization and deserialization of VariableSelection."""
403267
vs = VariableSelection(nr_features=3, units=64, dropout_rate=0.2)
404268
# Build the layer first

0 commit comments

Comments
 (0)