
Commit 519bb06

feat(KDP): adding new layers / features to timeseries
1 parent 13e4511 commit 519bb06

31 files changed: +5831 −749 lines

docs/features/time_series_features.md

Lines changed: 234 additions & 100 deletions
Large diffs are not rendered by default.

examples/custom_preprocessing_example.py

Lines changed: 5 additions & 3 deletions
@@ -4,19 +4,21 @@
 This example demonstrates how to define and use custom preprocessing pipelines
 for various feature types in the KDP framework.
 """
+# ruff: noqa: E402

 import os
 import sys
+
+# Add the project root to the Python path to allow module imports
+sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
+
 import numpy as np
 import pandas as pd
 import logging
 import tensorflow as tf
 from sklearn.model_selection import train_test_split
 from sklearn.metrics import mean_squared_error

-# Add the project root to the Python path to allow module imports
-sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
-
 from kdp.processor import PreprocessingModel
 from kdp.features import (
     NumericalFeature,

examples/dynamic_pipeline_examples.py

Lines changed: 15 additions & 13 deletions
@@ -4,6 +4,7 @@
 This script demonstrates how to use the DynamicPreprocessingPipeline to create
 a flexible pipeline of preprocessing layers, with customizable transformations.
 """
+# ruff: noqa: E402

 import numpy as np
 import tensorflow as tf
@@ -25,6 +26,7 @@
 np.random.seed(42)
 tf.random.set_seed(42)

+
 # Example 1: Basic Custom Layers
 class ScalingLayer(tf.keras.layers.Layer):
     """Custom layer to scale numeric input by a factor."""
@@ -294,54 +296,54 @@ def data_generator():
 def example_5_normalize_transform():
     """Create a pipeline that normalizes data and then applies a log transform."""
     print("\n=== Example 5: Normalize and Transform Pipeline ===")
-
+
     # Generate random data - lognormal distribution (right-skewed)
     data = np.random.lognormal(mean=0, sigma=1, size=(1000, 1)).astype(np.float32)
-
+
     # Create a normalization layer
     normalize_layer = tf.keras.layers.Normalization(name="normalize")
     normalize_layer.adapt(data)
-
+
     # Create a log transform layer using our factory
     log_transform = PreprocessorLayerFactory.distribution_transform_layer(
         transform_type="log", name="log_transform"
     )
-
+
     # Create our pipeline with both layers
     pipeline = DynamicPreprocessingPipeline([normalize_layer, log_transform])
-
+
     # Create a dataset
     dataset = tf.data.Dataset.from_tensor_slices({"normalize": data}).batch(32)
-
+
     # Process the data
     processed_data = pipeline.process(dataset)
-
+
     # Examine the results
     for batch in processed_data.take(1):
         original_mean = np.mean(data)
         transformed_mean = batch["log_transform"].numpy().mean()
-
+
         print(f"Original data mean: {original_mean:.4f}")
         print(f"Transformed data mean: {transformed_mean:.4f}")
-
+
         # Visualize the transformation
         plt.figure(figsize=(12, 5))
-
+
         plt.subplot(1, 2, 1)
         plt.hist(data, bins=50, alpha=0.7)
         plt.title("Original Data Distribution")
         plt.xlabel("Value")
         plt.ylabel("Frequency")
-
+
         plt.subplot(1, 2, 2)
         plt.hist(batch["log_transform"].numpy(), bins=50, alpha=0.7)
         plt.title("Normalized + Log Transformed Data")
         plt.xlabel("Value")
         plt.ylabel("Frequency")
-
+
         plt.tight_layout()
         plt.show()
-
+
     return pipeline
Lines changed: 181 additions & 0 deletions
@@ -0,0 +1,181 @@
+#!/usr/bin/env python
+# -*- coding: utf-8 -*-
+
+"""
+Example of using the new time series feature layers in keras-data-processor.
+
+This example demonstrates how to use the WaveletTransformLayer and TSFreshFeatureLayer
+for extracting features from time series data.
+"""
+
+import numpy as np
+import matplotlib.pyplot as plt
+from tensorflow.keras.models import Model
+from tensorflow.keras.layers import Input, Dense, Concatenate
+
+from kdp.layers.time_series import (
+    WaveletTransformLayer,
+    TSFreshFeatureLayer,
+    LagFeatureLayer,
+)
+
+
+def generate_sample_data(n_samples=1000, n_features=1):
+    """Generate a sample time series dataset."""
+    np.random.seed(42)
+
+    # Time steps
+    t = np.linspace(0, 10 * np.pi, n_samples)
+
+    # Base sine wave with increasing frequency
+    base_signal = np.sin(t * (1 + t / (10 * np.pi)))
+
+    # Add trends and seasonality for complexity
+    trend = 0.3 * t / (10 * np.pi)
+    seasonality = 0.5 * np.sin(0.5 * t)
+
+    # Create signal with noise
+    signal = base_signal + trend + seasonality + np.random.normal(0, 0.2, n_samples)
+
+    # Normalize
+    signal = (signal - np.mean(signal)) / np.std(signal)
+
+    # For multiple features, create variations
+    if n_features > 1:
+        signals = [signal]
+        for i in range(1, n_features):
+            # Create different variations with phase shifts and scaling
+            variation = np.sin(t * (1 + t / (10 * np.pi) + i * 0.2)) + trend * (
+                1.0 + 0.1 * i
+            )
+            variation = (variation - np.mean(variation)) / np.std(variation)
+            signals.append(variation)
+        signal = np.column_stack(signals)
+
+    # Create test/train split
+    train_size = int(0.8 * n_samples)
+    X_train = signal[:train_size]
+    X_test = signal[train_size:-1]  # drop the last row so X_test aligns with the next-step targets in y_test
+
+    # Create target variable (for regression task)
+    # We'll predict the next value in the series
+    y_train = (
+        signal[1 : train_size + 1, 0] if n_features > 1 else signal[1 : train_size + 1]
+    )
+    y_test = signal[train_size + 1 :, 0] if n_features > 1 else signal[train_size + 1 :]
+
+    return X_train, y_train, X_test, y_test
+
+
+def build_model_with_feature_layers(input_shape):
+    """Build a model that uses various time series feature layers."""
+    inputs = Input(shape=input_shape)
+
+    # 1. Extract wavelet transform features
+    wavelet_features = WaveletTransformLayer(
+        levels=3, window_sizes=[4, 8, 16], flatten_output=True
+    )(inputs)
+
+    # 2. Extract statistical features using TSFreshFeatureLayer
+    tsfresh_features = TSFreshFeatureLayer(
+        features=["mean", "std", "min", "max", "median", "skewness", "kurtosis"],
+        normalize=True,
+    )(inputs)
+
+    # 3. Extract lag features for temporal patterns
+    lag_features = LagFeatureLayer(
+        lag_indices=[1, 2, 3, 5, 7, 14, 21],
+        drop_na=False,  # We'll get zeros for missing values
+    )(inputs)
+
+    # Combine all features
+    combined_features = Concatenate()(
+        [wavelet_features, tsfresh_features, lag_features]
+    )
+
+    # Dense layers for prediction
+    x = Dense(64, activation="relu")(combined_features)
+    x = Dense(32, activation="relu")(x)
+    outputs = Dense(1)(x)
+
+    model = Model(inputs=inputs, outputs=outputs)
+    model.compile(optimizer="adam", loss="mse", metrics=["mae"])
+
+    return model
+
+
+def main():
+    """Run the example."""
+    # Generate sample data
+    X_train, y_train, X_test, y_test = generate_sample_data(
+        n_samples=1000, n_features=2
+    )
+
+    print(f"X_train shape: {X_train.shape}")
+    print(f"y_train shape: {y_train.shape}")
+
+    # Reshape for the model (add batch dimension if not already present)
+    if len(X_train.shape) == 1:
+        X_train = X_train.reshape(-1, 1)
+        X_test = X_test.reshape(-1, 1)
+
+    # Build model
+    model = build_model_with_feature_layers(input_shape=(X_train.shape[1],))
+
+    # Print model summary
+    model.summary()
+
+    # Train model
+    history = model.fit(
+        X_train,
+        y_train,
+        validation_data=(X_test, y_test),
+        epochs=50,
+        batch_size=32,
+        verbose=1,
+    )
+
+    # Plot training history
+    plt.figure(figsize=(12, 4))
+
+    plt.subplot(1, 2, 1)
+    plt.plot(history.history["loss"])
+    plt.plot(history.history["val_loss"])
+    plt.title("Model loss")
+    plt.ylabel("Loss (MSE)")
+    plt.xlabel("Epoch")
+    plt.legend(["Train", "Validation"], loc="upper right")
+
+    plt.subplot(1, 2, 2)
+    plt.plot(history.history["mae"])
+    plt.plot(history.history["val_mae"])
+    plt.title("Model MAE")
+    plt.ylabel("MAE")
+    plt.xlabel("Epoch")
+    plt.legend(["Train", "Validation"], loc="upper right")
+
+    plt.tight_layout()
+    plt.savefig("time_series_features_training.png")
+    print("Training plot saved as 'time_series_features_training.png'")
+
+    # Evaluate on test set
+    test_loss, test_mae = model.evaluate(X_test, y_test, verbose=0)
+    print(f"Test Loss (MSE): {test_loss:.4f}")
+    print(f"Test MAE: {test_mae:.4f}")
+
+    # Make predictions and plot
+    predictions = model.predict(X_test)
+
+    plt.figure(figsize=(12, 6))
+    plt.plot(y_test, label="Actual")
+    plt.plot(predictions, label="Predicted")
+    plt.title("Time Series Prediction with Feature Layers")
+    plt.xlabel("Time Step")
+    plt.ylabel("Value")
+    plt.legend()
+    plt.savefig("time_series_features_prediction.png")
+    print("Prediction plot saved as 'time_series_features_prediction.png'")
+
+
+if __name__ == "__main__":
+    main()
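
Before wiring these layers into a full model, it can help to probe their output shapes on a dummy batch. A minimal smoke-test sketch: the constructor arguments are copied from the example above, but applying the layers directly to a small eager tensor is an assumption about how they behave outside a Model.

    import numpy as np
    import tensorflow as tf

    from kdp.layers.time_series import (
        WaveletTransformLayer,
        TSFreshFeatureLayer,
        LagFeatureLayer,
    )

    # Assumption: these layers accept a plain (batch, features) tensor eagerly,
    # as suggested by their use on Input(shape=(2,)) in the example above.
    x = tf.constant(np.random.randn(4, 2).astype(np.float32))

    layers = {
        "wavelet": WaveletTransformLayer(levels=3, window_sizes=[4, 8, 16], flatten_output=True),
        "tsfresh": TSFreshFeatureLayer(features=["mean", "std", "min", "max"], normalize=True),
        "lag": LagFeatureLayer(lag_indices=[1, 2, 3], drop_na=False),
    }
    for name, layer in layers.items():
        print(name, layer(x).shape)  # inspect feature dimensionality before Concatenate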

ideas.md

Lines changed: 41 additions & 0 deletions
@@ -0,0 +1,41 @@
+For Timeseries Features:
+
+Based on your current implementation, I can suggest several advanced features to enhance your time series preprocessing capabilities:
+
+Automatic Time Series Decomposition
+Implement seasonal-trend decomposition (STL) to separate time series into trend, seasonal, and residual components
+This would allow models to learn from each component separately, improving performance on seasonal data
+
+Dynamic Feature Generation
+Add configurable lag feature windows that automatically determine optimal lag values based on autocorrelation analysis
+Implement change point detection to identify regime shifts in time series data
+
+Advanced Signal Processing Features
+Fast Fourier Transform (FFT) layers to extract frequency-domain features
+Wavelet transforms for multi-resolution analysis of time series data
+Spectral analysis features to capture cyclical patterns
+
+Improved Missing Value Handling
+Add specialized interpolation methods for time series (cubic spline, LOCF, etc.)
+Implement a masking mechanism to handle irregular time series with missing timestamps
+
+Time-Aware Attention Mechanisms
+Implement temporal attention layers that focus on relevant time steps
+Create a positional encoding layer specifically for time series to encode temporal distance
+
+Multi-Scale Processing
+Implement automatic resampling at multiple time scales (hourly, daily, weekly)
+Create hierarchical time series preprocessors that handle different granularities
+
+
+Enhanced Seasonality Handling
+Add calendar feature generation (holidays, day of week, etc.)
+Implement multiple seasonal period detection and encoding
+
+Causal Inference Features
+Add Granger causality testing as a preprocessing step
+Implement transfer entropy calculations for multivariate time series
+
+Temporal Feature Extraction
+Add automatic feature extraction using tsfresh-inspired statistical features
+Implement shapelet detection for pattern recognition
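
Several of these ideas are straightforward to prototype outside KDP before turning them into layers. For the decomposition item, statsmodels already ships an STL implementation; a rough sketch (the helper name stl_components is hypothetical, and nothing here is part of KDP yet):

    import numpy as np
    from statsmodels.tsa.seasonal import STL  # prototype only, not a KDP layer

    def stl_components(series, period):
        """Split a 1-D series into trend, seasonal, and residual channels."""
        result = STL(series, period=period).fit()
        return {
            "trend": result.trend,
            "seasonal": result.seasonal,
            "residual": result.resid,
        }

    # Daily data with weekly seasonality.
    t = np.arange(365, dtype=float)
    series = 0.01 * t + np.sin(2 * np.pi * t / 7) + np.random.normal(0, 0.1, t.size)
    components = stl_components(series, period=7)
    print({name: comp.shape for name, comp in components.items()})

Each component could then be fed to the model as a separate numeric feature.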
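
The autocorrelation-driven lag selection could start as simply as ranking candidate lags by absolute autocorrelation; a numpy-only sketch (pick_lags is a hypothetical helper, not a KDP API):

    import numpy as np

    def pick_lags(series, max_lag=30, top_k=5):
        """Hypothetical helper: return the top_k lags with the largest absolute autocorrelation."""
        x = series - series.mean()
        denom = np.dot(x, x)
        acf = np.array([np.dot(x[:-k], x[k:]) / denom for k in range(1, max_lag + 1)])
        best = np.argsort(-np.abs(acf))[:top_k]
        return sorted(int(i) + 1 for i in best)  # shift from 0-based index to lag

    # A series with strong lag-7 structure should surface 7 and its multiples.
    t = np.arange(500)
    series = np.sin(2 * np.pi * t / 7) + np.random.normal(0, 0.3, t.size)
    print(pick_lags(series))

The selected lags could then be passed straight into LagFeatureLayer(lag_indices=...).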
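
And the calendar-feature item maps almost directly onto pandas datetime accessors; a sketch (holiday flags would need an extra package such as holidays, omitted here):

    import pandas as pd

    def calendar_features(index):
        """Sketch only, not a KDP API: derive simple calendar features from a DatetimeIndex."""
        return pd.DataFrame(
            {
                "day_of_week": index.dayofweek,  # 0 = Monday
                "day_of_month": index.day,
                "month": index.month,
                "is_weekend": (index.dayofweek >= 5).astype(int),
            },
            index=index,
        )

    idx = pd.date_range("2024-01-01", periods=10, freq="D")
    print(calendar_features(idx))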
