In [1]:
"""
Example: Training TemporalValidator with SignalFlow-NN

Flow:
1. Load raw data
2. Extract features (for ALL bars - this is the feature history)
3. Detect signals (generates signal timestamps)
4. Label signals (assigns labels ONLY to signal timestamps)
5. Train validator (windows created ONLY at signal timestamps)
6. Validate new signals

Key insight: The dataset contains windows ONLY for detected signals,
not for every bar. This is meta-labeling approach.
"""
import signalflow as sf
from signalflow.nn.validator import TemporalValidator
from signalflow.nn.model.temporal_classificator import TrainingConfig
from pathlib import Path
from datetime import datetime
import polars as pl
import torch

# ============================================================================
# 1. Load Raw Data
# ============================================================================

raw_data = sf.data.RawDataFactory.from_duckdb_spot_store(
    spot_store_path=Path("test.duckdb"),
    pairs=["BTCUSDT", "ETHUSDT", "SOLUSDT"],
    start=datetime(2025, 10, 1),
    end=datetime(2025, 12, 31),
    data_types=["spot"],
)
raw_data_view = sf.core.RawDataView(raw_data)

# ============================================================================
# 2. Feature Engineering (for ALL bars - this is feature history)
# ============================================================================

feature_set = sf.feature.FeatureSet(extractors=[
    sf.feature.pandasta.PandasTaRsiExtractor(length=14),
    sf.feature.pandasta.PandasTaMacdExtractor(fast=12, slow=26, signal=9),
    sf.feature.pandasta.PandasTaAtrExtractor(length=14),
    sf.feature.pandasta.PandasTaBbandsExtractor(length=20, std=2.0),
])
features_df = feature_set.extract(raw_data_view)

# Normalize features per pair
feature_cols = [c for c in features_df.columns if c not in ["pair", "timestamp"]]
features_df = features_df.with_columns([
    ((pl.col(c) - pl.col(c).mean().over("pair")) / (pl.col(c).std().over("pair") + 1e-6))
    .alias(c)
    for c in feature_cols
])

print(f"Features shape: {features_df.shape} (full history)")
print(f"Feature columns ({len(feature_cols)}): {feature_cols[:5]}...")

# ============================================================================
# 3. Signal Detection (generates signal TIMESTAMPS)
# ============================================================================

detector = sf.detector.SmaCrossSignalDetector(fast_period=10, slow_period=30)
signals = detector.run(raw_data_view)

# Filter to actionable signals only (RISE/FALL, not NONE)
actionable_signals = signals.value.filter(
    pl.col("signal_type").is_in(["rise", "fall"])
)

print(f"\nDetected {signals.value.height} total signals")
print(f"Actionable signals: {actionable_signals.height}")
print(f"Signal distribution:\n{actionable_signals.group_by('signal_type').len()}")

# ============================================================================
# 4. Labeling (assign labels ONLY to signal timestamps)
# ============================================================================

from signalflow.target import FixedHorizonLabeler

labeler = FixedHorizonLabeler(
    price_col="close",
    horizon=12,
    out_col="label",
    include_meta=True,
)

# Label the raw data, then join with signals
spot_df = raw_data_view.to_polars("spot")
labeled_full = labeler.compute(spot_df)

# Join labels with signal timestamps
# This gives us labels ONLY for detected signal timestamps
labeled_signals = (
    actionable_signals
    .select(["pair", "timestamp", "signal_type"])
    .join(
        labeled_full.select(["pair", "timestamp", "label"]),
        on=["pair", "timestamp"],
        how="inner",
    )
    .filter(pl.col("label").is_not_null())
)

# Encode labels: none=0, rise=1, fall=2
labeled_signals = labeled_signals.with_columns(
    pl.when(pl.col("label") == "rise").then(1)
    .when(pl.col("label") == "fall").then(2)
    .otherwise(0)
    .cast(pl.Int64)
    .alias("label")
)

print(f"\nLabeled signals: {labeled_signals.height}")
print(f"Label distribution:\n{labeled_signals.group_by('label').len()}")

# ============================================================================
# 5. Configure and Train Validator
# ============================================================================

input_size = len(feature_cols)

# Encoder config
encoder_params = {
    "input_size": input_size,
    "hidden_size": 64,
    "num_layers": 2,
    "dropout": 0.2,
    "bidirectional": False,
}

# Head config
head_params = {
    "hidden_sizes": [128, 64],
    "dropout": 0.3,
    "activation": "gelu",
}

# Training config
training_config = {
    "learning_rate": 1e-3,
    "weight_decay": 1e-5,
    "optimizer": "adamw",
    "scheduler": "reduce_on_plateau",
    "scheduler_patience": 5,
    "label_smoothing": 0.1,
}

# Create validator
validator = TemporalValidator(
    encoder_type="encoder/lstm",
    encoder_params=encoder_params,
    head_type="head/cls/mlp",
    head_params=head_params,
    window_size=30,
    num_classes=3,
    class_weights=[1.0, 2.0, 2.0],  # Upweight rise/fall
    training_config=training_config,
    feature_cols=feature_cols,
    max_epochs=10,
    batch_size=64,
    early_stopping_patience=5,
    train_val_test_split=(0.6, 0.2, 0.2),
    split_strategy="temporal",
    num_workers=4,
)

print(f"\n{'='*60}")
print(f"Starting training")
print(f"{'='*60}")
print(f"Input size: {input_size}")
print(f"Window size: {validator.window_size}")
print(f"Training on {labeled_signals.height} labeled signals")
print(f"(NOT on all {features_df.height} bars!)")

# Train: X_train is full feature history, y_train is ONLY labeled signals
validator.fit(
    X_train=features_df,        # Full feature history
    y_train=labeled_signals,    # Only signal timestamps with labels
    log_dir=Path("./logs/temporal_validator"),
    accelerator="auto",
)

print("\nTraining finished.")

# ============================================================================
# 6. Validate New Signals
# ============================================================================

validated_signals = validator.validate_signals(signals, features_df)

# Rename probability columns for clarity
validated_df = validated_signals.value.with_columns([
    pl.col("probability_none").alias("prob_neutral"),
    pl.col("probability_rise").alias("prob_rise"),
    pl.col("probability_fall").alias("prob_fall"),
])

# ============================================================================
# 7. Analyze Results
# ============================================================================

print("\n" + "=" * 60)
print("VALIDATION RESULTS")
print("=" * 60)

# Top Rise Signals
print("\nTop Rise Signals (high probability):")
rise_signals = (
    validated_df
    .filter(pl.col("signal_type") == "rise")
    .sort("prob_rise", descending=True)
    .select(["timestamp", "pair", "prob_rise", "prob_fall", "prob_neutral"])
    .head(10)
)
print(rise_signals)

# Top Fall Signals
print("\nTop Fall Signals (high probability):")
fall_signals = (
    validated_df
    .filter(pl.col("signal_type") == "fall")
    .sort("prob_fall", descending=True)
    .select(["timestamp", "pair", "prob_rise", "prob_fall", "prob_neutral"])
    .head(10)
)
print(fall_signals)

# High-confidence signals (>70% probability)
HIGH_CONF_THRESHOLD = 0.7

high_conf_rise = validated_df.filter(
    (pl.col("signal_type") == "rise") & (pl.col("prob_rise") > HIGH_CONF_THRESHOLD)
)
high_conf_fall = validated_df.filter(
    (pl.col("signal_type") == "fall") & (pl.col("prob_fall") > HIGH_CONF_THRESHOLD)
)

print(f"\nHigh-confidence signals (>{HIGH_CONF_THRESHOLD*100:.0f}%):")
print(f"  Rise signals: {high_conf_rise.height}")
print(f"  Fall signals: {high_conf_fall.height}")

# ============================================================================
# 8. Save Validator
# ============================================================================

validator.save(Path("./models/temporal_validator.pkl"))
print("\nValidator saved to ./models/temporal_validator.pkl")

# To load later:
# loaded_validator = TemporalValidator.load(Path("./models/temporal_validator.pkl"))

  from .autonotebook import tqdm as notebook_tqdm
[32m2026-01-07 19:07:14.577[0m | [1mINFO    [0m | [36msignalflow.data.raw_store.duckdb_stores[0m:[36m_ensure_tables[0m:[36m198[0m - [1mDatabase initialized: test.duckdb (timeframe=1m)[0m


Features shape: (393120, 12) (full history)
Feature columns (10): ['rsi_RSI_14', 'macd_12_26_9_MACD_12_26_9', 'macd_12_26_9_MACDh_12_26_9', 'macd_12_26_9_MACDs_12_26_9', 'atr_14_ATRr_14']...


GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores



Detected 393033 total signals
Actionable signals: 15854
Signal distribution:
shape: (2, 2)
┌─────────────┬──────┐
│ signal_type ┆ len  │
│ ---         ┆ ---  │
│ str         ┆ u32  │
╞═════════════╪══════╡
│ rise        ┆ 7928 │
│ fall        ┆ 7926 │
└─────────────┴──────┘

Labeled signals: 15854
Label distribution:
shape: (3, 2)
┌───────┬──────┐
│ label ┆ len  │
│ ---   ┆ ---  │
│ i64   ┆ u32  │
╞═══════╪══════╡
│ 1     ┆ 8006 │
│ 2     ┆ 7788 │
│ 0     ┆ 60   │
└───────┴──────┘

Starting training
Input size: 10
Window size: 30
Training on 15854 labeled signals
(NOT on all 393120 bars!)


/home/alastor/miniconda3/envs/sfnn/lib/python3.12/site-packages/lightning/pytorch/trainer/connectors/logger_connector/logger_connector.py:76: Starting from v1.9.0, `tensorboardX` has been removed as a dependency of the `lightning.pytorch` package, due to potential conflicts with other packages in the ML ecosystem. For this reason, `logger=True` will use `CSVLogger` as the default logger, unless the `tensorboard` or `tensorboardX` packages are found. Please `pip install lightning[extra]` or one of them to enable TensorBoard support by default
You are using a CUDA device ('NVIDIA GeForce RTX 4070') that has Tensor Cores. To properly utilize them, you should set `torch.set_float32_matmul_precision('medium' | 'high')` which will trade-off precision for performance. For more details, read https://pytorch.org/docs/stable/generated/torch.set_float32_matmul_precision.html#torch.set_float32_matmul_precision
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name    | Type              | Params | Mo

Data split: train=9512, val=3171, test=3171
Epoch 0: 100%|██████████| 148/148 [00:01<00:00, 97.55it/s, v_num=4, val_loss=nan.0, val_acc=0.00378, train_loss=nan.0, train_acc=0.00306]

Training finished.

VALIDATION RESULTS

Top Rise Signals (high probability):
shape: (10, 5)
┌─────────────────────┬─────────┬───────────┬───────────┬──────────────┐
│ timestamp           ┆ pair    ┆ prob_rise ┆ prob_fall ┆ prob_neutral │
│ ---                 ┆ ---     ┆ ---       ┆ ---       ┆ ---          │
│ datetime[μs]        ┆ str     ┆ f64       ┆ f64       ┆ f64          │
╞═════════════════════╪═════════╪═══════════╪═══════════╪══════════════╡
│ 2025-10-01 00:40:00 ┆ BTCUSDT ┆ NaN       ┆ NaN       ┆ NaN          │
│ 2025-10-01 00:56:00 ┆ BTCUSDT ┆ NaN       ┆ NaN       ┆ NaN          │
│ 2025-10-01 01:42:00 ┆ BTCUSDT ┆ NaN       ┆ NaN       ┆ NaN          │
│ 2025-10-01 03:13:00 ┆ BTCUSDT ┆ NaN       ┆ NaN       ┆ NaN          │
│ 2025-10-01 04:07:00 ┆ BTCUSDT ┆ NaN       ┆ NaN       ┆ NaN       

FileNotFoundError: [Errno 2] No such file or directory: 'models/temporal_validator.pkl'