# LSTM Price Prediction (v3)

Import packages and load market data; technical indicators are computed in this notebook (not from DB).

In [1]:
"""
Setup and Imports
"""
import sys
from pathlib import Path
from datetime import datetime, timedelta, timezone
from typing import Tuple, Optional
import warnings
warnings.filterwarnings('ignore')

# Make the project importable: the parent of the current working directory is
# treated as the project root (so the notebook is expected to run from a
# first-level subfolder of the repository).
project_root = Path().resolve().parent
project_root_str = str(project_root)
if project_root_str not in sys.path:
    sys.path.insert(0, project_root_str)

# Read database credentials and other settings from <project_root>/.env when present
from dotenv import load_dotenv
env_path = project_root / ".env"
if not env_path.exists():
    print(f"Warning: .env file not found at {env_path}")
    print("Please ensure your database credentials are set in environment variables or .env file")
else:
    load_dotenv(env_path)
    print(f"Loaded environment variables from: {env_path}")

# Core libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler, StandardScaler
from sklearn.metrics import mean_squared_error, mean_absolute_error

# PyTorch
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader

# PyTorch Forecasting (encoder-only scaling, no look-ahead bias)
from pytorch_forecasting import TimeSeriesDataSet
from pytorch_forecasting.data.encoders import EncoderNormalizer

# optuna
import optuna

# Database
from sqlalchemy import select, desc
from src.shared.database.base import db_readonly_session
from src.shared.database.models.market_data import MarketData

# Yahoo Finance (for direct OHLCV fetch in notebook)
import yfinance as yf

# statsmodel
from statsmodels.tsa.stattools import adfuller
from statsmodels.stats.diagnostic import acorr_ljungbox

Loaded environment variables from: D:\PythonProjects\Trading-System\.env


In [2]:
# Plot appearance: seaborn grid theme and a wide default figure
sns.set_style("whitegrid")
plt.rcParams.update({'figure.figsize': (14, 6)})

# Reproducibility: seed NumPy and PyTorch (CPU, plus the current CUDA device when present)
RANDOM_SEED = 42
cuda_ready = torch.cuda.is_available()

np.random.seed(RANDOM_SEED)
torch.manual_seed(RANDOM_SEED)
if cuda_ready:
    torch.cuda.manual_seed(RANDOM_SEED)
    # Ask cuDNN to pick deterministic kernels
    torch.backends.cudnn.deterministic = True

# Environment report
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {cuda_ready}")
if cuda_ready:
    print(f"CUDA device: {torch.cuda.get_device_name(0)}")

PyTorch version: 2.7.0+cu128
CUDA available: True
CUDA device: NVIDIA GeForce RTX 3050 Laptop GPU


## Helper functions

In [3]:
# Helper functions for inverse transformation (convert predictions back to prices)

def inverse_transform_pct_return(predictions: np.ndarray, base_prices: np.ndarray) -> np.ndarray:
    """
    Turn predicted simple (percentage) returns into predicted prices.

    The returns are fractional (0.01 means +1%), since the price is obtained
    as ``base_prices * (1 + predictions)``.

    Args:
        predictions: Predicted fractional returns
        base_prices: Prices the returns are relative to (typically the
            previous period's close)

    Returns:
        Predicted prices, element-wise
    """
    growth_factor = 1 + predictions
    return base_prices * growth_factor


def inverse_transform_log_return(predictions: np.ndarray, base_prices: np.ndarray) -> np.ndarray:
    """
    Turn predicted log returns into predicted prices.

    Args:
        predictions: Predicted log returns
        base_prices: Prices the returns are relative to (typically the
            previous period's close)

    Returns:
        Predicted prices, element-wise: ``base_prices * exp(predictions)``
    """
    growth_factor = np.exp(predictions)
    return base_prices * growth_factor

# Confirm which helpers this cell defined
for summary_line in (
    "Inverse transformation functions defined:",
    "  - inverse_transform_pct_return(): Convert % returns → prices",
    "  - inverse_transform_log_return(): Convert log returns → prices",
):
    print(summary_line)

Inverse transformation functions defined:
  - inverse_transform_pct_return(): Convert % returns → prices
  - inverse_transform_log_return(): Convert log returns → prices


# Data Preparation

## Load OHLC Data

In [4]:
def load_market_data(
    symbol: str,
    start_date: Optional[datetime] = None,
    end_date: Optional[datetime] = None,
    data_source: str = "yahoo_adjusted",
    min_records: int = 1000
) -> pd.DataFrame:
    """
    Load OHLCV market data from database for a specific symbol.
    Default data_source='yahoo_adjusted' (splits/dividends adjusted). Use 'yahoo' for raw.
    Technical indicators are computed in this notebook, not loaded from DB.

    Only records flagged ``is_complete`` are returned, and ``min_records`` is
    enforced on that filtered set, so the returned frame always has at least
    ``min_records`` rows.

    Args:
        symbol: Stock symbol (e.g., 'AAPL'); upper-cased before querying
        start_date: Inclusive start (default: 500 days before now, UTC)
        end_date: Inclusive end (default: now, UTC)
        data_source: Data source ('yahoo_adjusted', 'yahoo', 'polygon', 'alpaca');
            lower-cased before querying
        min_records: Minimum number of complete records required

    Returns:
        DataFrame with OHLCV columns (timestamp, open, high, low, close, volume),
        ordered by timestamp

    Raises:
        ValueError: If fewer than ``min_records`` complete records are found
    """
    now_utc = datetime.now(timezone.utc)
    if start_date is None:
        start_date = now_utc - timedelta(days=500)
    if end_date is None:
        end_date = now_utc

    ohlcv_columns = ["timestamp", "open", "high", "low", "close", "volume"]

    market_query = (
        select(MarketData)
        .where(MarketData.symbol == symbol.upper())
        .where(MarketData.data_source == data_source.lower())
        .where(MarketData.timestamp >= start_date)
        .where(MarketData.timestamp <= end_date)
        .order_by(MarketData.timestamp)
    )

    with db_readonly_session() as session:
        market_records = session.execute(market_query).scalars().all()
        # Convert to plain dicts while the session is still open so attribute
        # access never depends on how db_readonly_session expires/detaches
        # objects on exit (its implementation is not visible in this notebook).
        rows = [
            {
                'timestamp': record.timestamp,
                'open': float(record.open),
                'high': float(record.high),
                'low': float(record.low),
                'close': float(record.close),
                # Missing/zero volume is stored as 0
                'volume': int(record.volume) if record.volume else 0,
            }
            for record in market_records
            if record.is_complete  # Only include complete OHLCV records
        ]

    # Enforce the minimum on what is actually returned, not on the raw query
    # result: incomplete records are dropped above and must not count.
    if len(rows) < min_records:
        raise ValueError(
            f"Insufficient data: {len(rows)} complete records found "
            f"({len(market_records)} total), minimum {min_records} required"
        )

    # Explicit columns keep the schema stable even when there are no rows
    return pd.DataFrame(rows, columns=ohlcv_columns)

In [5]:
# Optional diagnostic: list every schema that contains a table called market_data
# in the database this notebook is connected to.
# Useful when the DB load fails with "relation does not exist".
# Expected result: schema=data_ingestion, table_name=market_data.
from sqlalchemy import text

TABLE_LOOKUP_SQL = "SELECT table_schema, table_name FROM information_schema.tables WHERE table_name = 'market_data'"
try:
    with db_readonly_session() as session:
        r = session.execute(text(TABLE_LOOKUP_SQL)).fetchall()
    if not r:
        print("No table named 'market_data' in this database. Create it with scripts/01_create_databases.sql and 02_create_core_tables.sql.")
    else:
        found_tables = [dict(zip(["schema", "table"], row)) for row in r]
        print("market_data found:", found_tables)
except Exception as e:
    # Best-effort check only: report the problem and keep the notebook running
    print("Could not check DB:", e)

[32m2026-02-07 15:15:01.497[0m | [34m[1mDEBUG   [0m | [36msrc.shared.database.base[0m:[36mdb_readonly_session[0m:[36m156[0m - [34m[1mRead-only session completed successfully[0m


market_data found: [{'schema': 'data_ingestion', 'table': 'market_data'}]


market_data found: [{'schema': 'data_ingestion', 'table': 'market_data'}]


market_data found: [{'schema': 'data_ingestion', 'table': 'market_data'}]


In [6]:
def fetch_market_data_yahoo(
    symbol: str,
    start_date: Optional[datetime] = None,
    end_date: Optional[datetime] = None,
    interval: str = "1h",
) -> pd.DataFrame:
    """
    Fetch OHLCV from Yahoo Finance (yfinance). No DB required.
    Uses auto_adjust=True so prices are adjusted for splits/dividends.
    Yahoo limits 1h data to the last 730 days; keep start_date within that range.

    yfinance treats ``end`` as exclusive. When ``end_date`` is a datetime it is
    therefore requested through the following calendar day and the result is
    trimmed to ``timestamp <= end_date``, so bars from the end day itself
    (e.g. today, with the default ``end_date=now``) are included. A plain
    ``date`` (or anything without a ``.date()`` method) is passed through
    unchanged and keeps yfinance's exclusive-end semantics.

    Args:
        symbol: Ticker symbol; upper-cased before the request
        start_date: Start of the range (default: 365 days before now, UTC)
        end_date: End of the range (default: now, UTC). Naive datetimes are
            interpreted as UTC for the trimming step.
        interval: yfinance bar interval (default "1h")

    Returns:
        DataFrame with columns: timestamp, open, high, low, close, volume (UTC).

    Raises:
        ValueError: If Yahoo returns no rows for the requested range
    """
    now_utc = datetime.now(timezone.utc)
    if start_date is None:
        start_date = now_utc - timedelta(days=365)
    if end_date is None:
        end_date = now_utc

    start = start_date.date() if hasattr(start_date, "date") else start_date

    if hasattr(end_date, "date"):
        # Datetime-like end: ask Yahoo for one extra day (exclusive end) and
        # trim back to the exact cutoff afterwards.
        end_label = end_date.date()
        end = end_label + timedelta(days=1)
        end_cutoff = pd.Timestamp(end_date)
        if end_cutoff.tzinfo is None:
            end_cutoff = end_cutoff.tz_localize("UTC")
        else:
            end_cutoff = end_cutoff.tz_convert("UTC")
    else:
        end_label = end = end_date
        end_cutoff = None

    ticker = yf.Ticker(symbol.upper())
    hist = ticker.history(start=start, end=end, interval=interval, auto_adjust=True)
    if hist.empty:
        raise ValueError(f"No data from Yahoo for {symbol} between {start} and {end_label}")

    hist = hist.rename(columns={"Open": "open", "High": "high", "Low": "low", "Close": "close", "Volume": "volume"})
    # Explicit copy so the column assignment below does not write into a slice
    hist = hist[["open", "high", "low", "close", "volume"]].copy()

    # Normalise the index to timezone-aware UTC
    hist.index = pd.to_datetime(hist.index)
    if hist.index.tz is None:
        hist.index = hist.index.tz_localize("UTC", ambiguous="infer")
    else:
        hist.index = hist.index.tz_convert("UTC")

    if end_cutoff is not None:
        hist = hist.loc[hist.index <= end_cutoff].copy()
        if hist.empty:
            raise ValueError(f"No data from Yahoo for {symbol} between {start} and {end_label}")

    hist["timestamp"] = hist.index
    df_market = hist.reset_index(drop=True)[["timestamp", "open", "high", "low", "close", "volume"]]
    df_market["volume"] = df_market["volume"].fillna(0).astype(int)
    return df_market

In [7]:
# Load OHLCV: try DB (yahoo_adjusted) first, else fetch from Yahoo directly (1h; 730-day limit).
# DB table must be data_ingestion.market_data in the DB from .env (TRADING_DB_NAME). Run the cell above to verify.
SYMBOL = "MU"
START_DATE = datetime.now(timezone.utc) - timedelta(days=720)
try:
    df_market = load_market_data(SYMBOL, start_date=START_DATE, data_source="yahoo_adjusted", min_records=100)
    print("Loaded from DB (yahoo_adjusted)")
except Exception as db_error:
    # Deliberate best-effort fallback, but say WHY the DB path failed instead of
    # hiding it. Only the first line of the message is shown because SQLAlchemy
    # errors append the full SQL statement and parameters.
    reason_lines = str(db_error).splitlines() or [""]
    print("DB unavailable or insufficient data (e.g. table missing). Loading from Yahoo...")
    print(f"  Reason: {type(db_error).__name__}: {reason_lines[0]}")
    df_market = fetch_market_data_yahoo(SYMBOL, start_date=START_DATE)
    print("Loaded from Yahoo (yfinance)")



[32m2026-02-07 15:15:06.940[0m | [34m[1mDEBUG   [0m | [36msrc.shared.database.base[0m:[36mdb_readonly_session[0m:[36m156[0m - [34m[1mRead-only session completed successfully[0m


Loaded from DB (yahoo_adjusted)


Loaded from DB (yahoo_adjusted)


Loaded from DB (yahoo_adjusted)


DB unavailable or insufficient data ((psycopg2.errors.UndefinedTable) relation "data_ingestion.market_data" does not exist
LINE 2: FROM data_ingestion.market_data 
             ^

[SQL: SELECT data_ingestion.market_data.id, data_ingestion.market_data.symbol, data_ingestion.market_data.timestamp, data_ingestion.market_data.data_source, data_ingestion.market_data.open, data_ingestion.market_data.high, data_ingestion.market_data.low, data_ingestion.market_data.close, data_ingestion.market_data.volume, data_ingestion.market_data.created_at 
FROM data_ingestion.market_data 
WHERE data_ingestion.market_data.symbol = %(symbol_1)s AND data_ingestion.market_data.data_source = %(data_source_1)s AND data_ingestion.market_data.timestamp >= %(timestamp_1)s AND data_ingestion.market_data.timestamp <= %(timestamp_2)s ORDER BY data_ingestion.market_data.timestamp]
[parameters: {'symbol_1': 'MU', 'data_source_1': 'yahoo_adjusted', 'timestamp_1': datetime.datetime(2024, 2, 18, 13, 45, 35, 23709, tzinfo=

## Outlier Detection

**Handling "Wick" Outliers (the "Flash Crash" problem):** Even with `auto_adjust=True`, Yahoo Finance occasionally has "bad prints": data points where the High or Low is unnaturally far from the Open/Close. To protect the PyTorch model from these without deleting data, apply a clipping strategy before passing the data to the normalizer. This prevents a single bad data point from skewing the local $\mu$ and $\sigma$ of a window.

In [8]:
# Cap extreme close values at the 0.5% / 99.5% quantiles of the loaded series.
# Rows are kept (values are capped, not removed); runs before indicators/normalizer.
# NOTE(review): the bounds come from the whole series, so later data influences
# how earlier rows are capped, and only `close` is capped, so a capped close can
# end up outside that row's [low, high]. Confirm both are acceptable.

col = "close"
print("Rows before trimming:", len(df_market))

# Quantile bounds and how many rows fall outside them
lower = df_market[col].quantile(0.005)
upper = df_market[col].quantile(0.995)
below_mask = df_market[col].lt(lower)
above_mask = df_market[col].gt(upper)
n_below = below_mask.sum()
n_above = above_mask.sum()
print(f"close: lower={lower:.4f}, upper={upper:.4f}, n_below={n_below}, n_above={n_above}")

# Preview the affected rows before they are changed
mask_affected = below_mask | above_mask
if not mask_affected.any():
    print("No rows outside 0.5%/99.5% quantiles — no trimming applied.")
else:
    display(df_market.loc[mask_affected].head(20).style.set_caption("Sample of rows that will be trimmed (close, before clip)"))

# Cap close in place; other OHLCV columns are untouched
df_market[col] = df_market[col].clip(lower=lower, upper=upper)

Rows before trimming: 3434
close: lower=66.2942, upper=432.3211, n_below=18, n_above=18


Unnamed: 0,timestamp,open,high,low,close,volume
1962,2025-04-04 04:30:00-05:00,70.66,71.34,65.3539,65.52,14629304
1964,2025-04-04 06:30:00-05:00,66.84,67.23,65.04,65.39,4889152
1965,2025-04-04 07:30:00-05:00,65.35,66.05,64.4712,65.1653,3621558
1966,2025-04-04 08:30:00-05:00,65.14,66.3495,64.6855,64.79,4073711
1967,2025-04-04 09:30:00-05:00,64.795,65.72,64.33,65.57,4546785
1968,2025-04-04 10:30:00-05:00,65.555,66.41,64.37,64.72,6315030
1969,2025-04-07 04:30:00-05:00,63.935,72.46,61.54,66.18,17160459
1970,2025-04-07 05:30:00-05:00,66.555,69.45,64.96,65.93,7627565
1981,2025-04-08 09:30:00-05:00,67.21,68.3886,64.13,64.87,3951761
1982,2025-04-08 10:30:00-05:00,64.85,65.66,63.52,65.55,5912913


In [9]:
# Time span covered by the loaded bars
timestamps = df_market["timestamp"]
print("Date range:", timestamps.min(), "to", timestamps.max())

Date range: 2024-02-20 03:30:00-06:00 to 2026-02-06 09:30:00-06:00


In [10]:
# Preview the first five bars (head() defaults to 5 rows)
df_market.head()

Unnamed: 0,timestamp,open,high,low,close,volume
0,2024-02-20 03:30:00-06:00,79.75,80.31,79.3,79.3,3790205
1,2024-02-20 04:30:00-06:00,79.31,79.96,79.15,79.96,1463230
2,2024-02-20 05:30:00-06:00,79.96,79.96,79.235,79.56,1018712
3,2024-02-20 06:30:00-06:00,79.56,80.09,79.49,79.81,989552
4,2024-02-20 07:30:00-06:00,79.82,80.205,79.755,80.2,905137


***Recommended Workflow for OHLC Data***

- **Load OHLC Data**: we are using adjusted prices from yahoo here

- **Calculate Technical Indicators**: Generate your TIs from the same adjusted OHLCV so indicators are on adjusted prices (before normalization).
- **Clean & Prepare**: Handle any NaN values created by lagging indicators (e.g., a 20-period MA will have 19 NaN rows at the start).
- **Apply Normalization**: Use your TimeSeriesDataSet with the EncoderNormalizer. This will scale each feature—both the adjusted OHLC and the new TIs—dynamically for every window to prevent look-ahead bias. 

***Why this order matters***

- **Mathematical Integrity**: Most TIs are functions of price or volume. If you feed "z-score scaled" values into an RSI formula, the resulting indicator will be mathematically meaningless.
- **Feature Consistency**: Neural networks like LSTMs or Transformers are highly sensitive to input scale. Once you have your TIs, they may have wildly different ranges (e.g., Volume in millions vs. RSI between 0-100). Normalizing all features together after calculation ensures the model weights aren't "swamped" by large-scale features.
- **Leakage Prevention**: By using the EncoderNormalizer after TI calculation, you ensure that even the indicator values are scaled only based on their own local history within the encoder window, strictly avoiding future data leaks. 

***Special Note on Different Indicators***
- **Bounded Indicators** (e.g., RSI, Stochastic): These are already naturally scaled between 0 and 100. Some practitioners choose not to scale these further, but running them through a normalizer is generally safer for deep learning convergence.
- **Unbounded Indicators** (e.g., MACD, Moving Averages): These must be normalized as they follow the price scale and will drift over time. 

## Compute Technical Indicators

Technical indicators are calculated from **adjusted** OHLCV in this notebook (no DB): the same `df_market` with split/dividend-adjusted OHLC is used, so all indicators (SMA, EMA, RSI, MACD, Bollinger Bands, volatility, price changes, volume) are computed on adjusted prices.

In [11]:
def compute_technical_indicators(df: pd.DataFrame, periods_per_year: int = 252) -> pd.DataFrame:
    """
    Compute technical indicators from OHLCV DataFrame (in-notebook, no DB).
    Expects columns: open, high, low, close, volume (all must be adjusted for
    splits/dividends). All indicators are calculated on these adjusted prices.

    All window lengths and lags are counted in ROWS of ``df``. On hourly bars
    ``sma_20`` is a 20-hour average, and ``price_change_1d`` / ``_5d`` /
    ``_30d`` are 1/5/30-row percentage changes despite the "d" suffix.

    Args:
        df: OHLCV frame ordered oldest → newest. It is not modified.
        periods_per_year: Number of rows per year, used to annualize
            ``volatility_20``. The default 252 matches daily bars; pass the
            number of bars per year for intraday data.

    Returns:
        Copy of ``df`` with the indicator columns appended. Rolling-window
        indicators are NaN until their window is full. ``rsi`` is never NaN:
        rows with a zero average loss (including the first row) are set to 100.
    """
    out = df.copy()
    c = out["close"]
    v = out["volume"]

    # Moving averages
    out["sma_20"] = c.rolling(20).mean()
    out["sma_50"] = c.rolling(50).mean()
    out["sma_200"] = c.rolling(200).mean()
    out["ema_12"] = c.ewm(span=12, adjust=False).mean()
    out["ema_26"] = c.ewm(span=26, adjust=False).mean()
    out["ema_50"] = c.ewm(span=50, adjust=False).mean()

    # RSI (14) - Wilder smoothing: avg_gain/avg_loss via EMA of delta
    delta = c.diff()
    gain = delta.where(delta > 0, 0.0)
    loss = (-delta).where(delta < 0, 0.0)
    ag = gain.ewm(alpha=1/14, adjust=False).mean()
    al = loss.ewm(alpha=1/14, adjust=False).mean()
    rs = ag / al.replace(0, np.nan)
    out["rsi"] = (100 - (100 / (1 + rs))).fillna(100)  # no loss -> RSI 100
    out["rsi_14"] = out["rsi"]  # alias of "rsi" (same values)

    # MACD (12, 26, 9) - reuses the EMAs computed above
    out["macd_line"] = out["ema_12"] - out["ema_26"]
    out["macd_signal"] = out["macd_line"].ewm(span=9, adjust=False).mean()
    out["macd_histogram"] = out["macd_line"] - out["macd_signal"]

    # Bollinger Bands (20, 2) - the middle band is the 20-row SMA
    out["bb_middle"] = out["sma_20"]
    bb_std = c.rolling(20).std()
    out["bb_upper"] = out["bb_middle"] + 2 * bb_std
    out["bb_lower"] = out["bb_middle"] - 2 * bb_std
    spread = out["bb_upper"] - out["bb_lower"]
    # Zero denominators become NaN instead of producing inf
    out["bb_position"] = (c - out["bb_lower"]) / spread.replace(0, np.nan)
    out["bb_width"] = (spread / out["bb_middle"].replace(0, np.nan)) * 100

    # Bounded indicators (fixed range; optional to scale, we leave them unscaled in scalers)
    # Stochastic %K (14): 0–100
    low_14 = out["low"].rolling(14).min()
    high_14 = out["high"].rolling(14).max()
    stoch_range = (high_14 - low_14).replace(0, np.nan)
    out["stoch_k"] = ((c - low_14) / stoch_range * 100).clip(0, 100)
    # Williams %R (14): -100 to 0 (oversold near -100, overbought near 0)
    out["williams_r"] = ((high_14 - c) / stoch_range * -100).clip(-100, 0)

    # Volatility (annualized %) and price changes
    returns = c.pct_change()
    out["volatility_20"] = returns.rolling(20).std() * np.sqrt(periods_per_year) * 100
    out["price_change_1d"] = returns * 100  # 1-row change
    out["price_change_5d"] = c.pct_change(5) * 100
    out["price_change_30d"] = c.pct_change(30) * 100

    # Volume
    out["avg_volume_20"] = v.rolling(20).mean()
    out["current_volume"] = v  # alias of "volume" (same values)

    return out

In [12]:
# Indicators are computed on the adjusted OHLC in df_market; rows containing any
# null (e.g. indicator warm-up such as SMA_200) are then dropped.
df_features = compute_technical_indicators(df_market).dropna()
df_features.head()

Unnamed: 0,timestamp,open,high,low,close,volume,sma_20,sma_50,sma_200,ema_12,...,bb_position,bb_width,stoch_k,williams_r,volatility_20,price_change_1d,price_change_5d,price_change_30d,avg_volume_20,current_volume
199,2024-04-01 07:30:00-05:00,126.2,126.3272,124.33,124.5035,3252514,119.741375,114.994808,97.751092,121.727362,...,0.887614,10.260223,71.356608,-28.643392,27.027384,-1.324513,5.600933,4.065112,3862142.0,3252514
200,2024-04-01 08:30:00-05:00,124.52,124.7,123.68,123.75,3497280,119.967875,115.589208,97.973343,122.038537,...,0.795896,10.654474,63.840399,-36.159601,27.003904,-0.605204,4.97964,4.052804,3854194.55,3497280
201,2024-04-01 09:30:00-05:00,123.74,124.41,123.05,123.58,3849502,120.207125,116.163808,98.191443,122.275685,...,0.756998,10.917936,62.144638,-37.855362,26.95519,-0.137374,-2.130356,3.709385,3810491.0,3849502
202,2024-04-01 10:30:00-05:00,123.58,124.76,123.41,124.29,3653443,120.525625,116.724508,98.415093,122.58558,...,0.78045,11.136752,69.226933,-30.773067,26.744969,0.574527,-1.054023,4.1479,3589142.3,3653443
203,2024-04-02 04:30:00-05:00,122.625,123.68,120.92,121.76,8424672,120.698625,117.234508,98.624842,122.458567,...,0.579829,11.015598,43.425693,-56.574307,27.97266,-2.035562,-3.498879,3.097148,3841477.05,8424672


## Normalization

*encoder-only, no look-ahead bias*

Use **TimeSeriesDataSet** with **EncoderNormalizer**: scaling is fit on each encoder sequence only, so no future information leaks into the past (no look-ahead bias).

**Robust scaling (median + IQR):** We use **method="robust"** instead of mean/std. Outliers (e.g. Black Swan events, flash crashes) are **not deleted**—they stay in the data so the model can learn from them—but robust scaling prevents extreme spikes from squishing the rest of the values into a tiny range. Applied after technical indicators, before the model.

**Indicator scaling rationale:**
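- **Bounded indicators** (RSI, bb_position, stoch_k, williams_r): Fixed or near-fixed scale → no scaling (identity). Note that `bb_position` is usually between 0 and 1 but falls below 0 or above 1 whenever the close is outside the Bollinger Bands.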
- **Unbounded indicators** (MACD, MAs, close, volume, etc.): **EncoderNormalizer(method="robust")** (encoder-only, median/IQR).

### Data pipeline (how everything connects)

All later steps (e.g. **Temporal Fusion Transformer**) use the same data chain:

| Step | Variable | Description |
|------|----------|-------------|
| 1 | `df_market` | Adjusted OHLCV from DB or Yahoo (one symbol) |
| 2 | `df_features` | OHLCV + technical indicators, nulls dropped |
| 3 | `df_ts` | Same as `df_features` + `time_idx` + `ticker` (required by TimeSeriesDataSet) |
| 4 | `dataset` | TimeSeriesDataSet built from `df_ts` (encoder-only scaling, no look-ahead) |
| 5 | `dataloader` | `dataset.to_dataloader(...)` → batches for training |

**TFT and any other model in this notebook will use `dataset` and `dataloader` from above.**

In [13]:
# TimeSeriesDataSet needs an integer time index and at least one group id column
df_ts = df_features.assign(
    time_idx=np.arange(len(df_features)),
    ticker=SYMBOL,  # single series → one group
)

# Model inputs (time-varying, unknown in the future). Bounded ones stay in the list but get identity scaling.
# Close-focused: close is the target and the indicators derive from it. open/high/low are not fed as
# inputs (redundancy / multicollinearity); high/low are still used inside compute_technical_indicators
# for stoch_k and williams_r.
# NOTE(review): "rsi_14", "current_volume" and "bb_middle" are assigned the same values as "rsi",
# "volume" and "sma_20" in compute_technical_indicators, so these pairs are exact duplicates — confirm
# whether the duplicates should remain model inputs.
candidate_features = [
    "close", "volume",
    "sma_20", "sma_50", "sma_200", "ema_12", "ema_26", "ema_50",
    "rsi", "rsi_14", "macd_line", "macd_signal", "macd_histogram",
    "bb_upper", "bb_middle", "bb_lower", "bb_position", "bb_width",
    "stoch_k", "williams_r",
    "volatility_20", "price_change_1d", "price_change_5d", "price_change_30d",
    "avg_volume_20", "current_volume",
]
# Keep only the features that actually exist in the frame
time_varying_unknown_reals = [name for name in candidate_features if name in df_ts.columns]

# Indicators treated as bounded get no scaling (identity); everything else → EncoderNormalizer.
BOUNDED_INDICATORS = ["rsi", "rsi_14", "bb_position", "stoch_k", "williams_r"]

# Window sizes: fixed-length encoder and prediction horizons (min == max)
MIN_ENCODER_LENGTH = 60
MAX_ENCODER_LENGTH = 60
MIN_PREDICTION_LENGTH = 5
MAX_PREDICTION_LENGTH = 5

# The target has its own normalizer and must not appear in scalers (library requirement)
scaler_cols = [name for name in time_varying_unknown_reals if name != "close"]

# One robust (median/IQR) normalizer instance per unbounded feature; None = leave unscaled
scalers = {}
for feature_name in scaler_cols:
    if feature_name in BOUNDED_INDICATORS:
        scalers[feature_name] = None
    else:
        scalers[feature_name] = EncoderNormalizer(method="robust", center=True)

# Summary table: which attributes are normalized and which are not (target "close" uses target_normalizer)
normalized_attrs = [name for name in scaler_cols if scalers[name] is not None]
identity_attrs = [name for name in scaler_cols if scalers[name] is None]
summary_rows = (
    [
        (name, "EncoderNormalizer (robust, encoder-only)", "Unbounded; robust scale (median/IQR); outliers retained")
        for name in normalized_attrs
    ]
    + [(name, "None (identity)", "Bounded (fixed range)") for name in identity_attrs]
    + [("close", "EncoderNormalizer (robust, target_normalizer)", "Target; robust encoder-only scale")]
)
norm_summary = pd.DataFrame(summary_rows, columns=["attribute", "normalization", "reason"])
display(norm_summary)



Unnamed: 0,attribute,normalization,reason
0,volume,"EncoderNormalizer (robust, encoder-only)",Unbounded; robust scale (median/IQR); outliers...
1,sma_20,"EncoderNormalizer (robust, encoder-only)",Unbounded; robust scale (median/IQR); outliers...
2,sma_50,"EncoderNormalizer (robust, encoder-only)",Unbounded; robust scale (median/IQR); outliers...
3,sma_200,"EncoderNormalizer (robust, encoder-only)",Unbounded; robust scale (median/IQR); outliers...
4,ema_12,"EncoderNormalizer (robust, encoder-only)",Unbounded; robust scale (median/IQR); outliers...
5,ema_26,"EncoderNormalizer (robust, encoder-only)",Unbounded; robust scale (median/IQR); outliers...
6,ema_50,"EncoderNormalizer (robust, encoder-only)",Unbounded; robust scale (median/IQR); outliers...
7,macd_line,"EncoderNormalizer (robust, encoder-only)",Unbounded; robust scale (median/IQR); outliers...
8,macd_signal,"EncoderNormalizer (robust, encoder-only)",Unbounded; robust scale (median/IQR); outliers...
9,macd_histogram,"EncoderNormalizer (robust, encoder-only)",Unbounded; robust scale (median/IQR); outliers...


In [14]:
# Build the TimeSeriesDataSet. The target ("close") gets its own robust EncoderNormalizer,
# fitted per encoder sequence only → no look-ahead bias.
close_normalizer = EncoderNormalizer(method="robust", center=True)
dataset = TimeSeriesDataSet(
    df_ts,
    time_idx="time_idx",
    target="close",
    group_ids=["ticker"],
    min_encoder_length=MIN_ENCODER_LENGTH,
    max_encoder_length=MAX_ENCODER_LENGTH,
    min_prediction_length=MIN_PREDICTION_LENGTH,
    max_prediction_length=MAX_PREDICTION_LENGTH,
    time_varying_unknown_reals=time_varying_unknown_reals,
    target_normalizer=close_normalizer,
    scalers=scalers,
)

# Training dataloader, then a sanity check on the dataset size and on one batch
dataloader = dataset.to_dataloader(train=True, batch_size=32, num_workers=0)
n_samples = len(dataset)
print(f"TimeSeriesDataSet: {n_samples} samples, encoder={MAX_ENCODER_LENGTH}, prediction={MAX_PREDICTION_LENGTH}")

batch = next(iter(dataloader))
batch_inputs = batch[0]  # first element is a dict of input tensors
print(f"Batch keys: {list(batch_inputs.keys())}")

TimeSeriesDataSet: 3171 samples, encoder=60, prediction=5
Batch keys: ['encoder_cat', 'encoder_cont', 'encoder_target', 'encoder_lengths', 'decoder_cat', 'decoder_cont', 'decoder_target', 'decoder_lengths', 'decoder_time_idx', 'groups', 'target_scale']


In [15]:
# Sanity checks: df_ts must be df_features plus the two columns TimeSeriesDataSet requires
ts_columns = set(df_ts.columns)
assert {"time_idx", "ticker"} <= ts_columns
assert len(df_features) == len(df_ts), "df_ts = df_features + time_idx + ticker"
assert set(df_features.columns) <= ts_columns, "df_ts contains all df_features columns"

In [16]:
# Stage name → (size, detail). DataFrame stages report their row count and column list.
pipeline = {
    stage_name: (len(stage_frame), list(stage_frame.columns))
    for stage_name, stage_frame in (
        ("df_market", df_market),
        ("df_features", df_features),
        ("df_ts", df_ts),
    )
}
pipeline["dataset (TimeSeriesDataSet)"] = (len(dataset), "encoder/decoder batches")
pipeline["dataloader"] = (len(dataloader), "batch_size=32")

In [17]:
# Print one line per pipeline stage, each followed by blank lines for readability
print("Data pipeline (prev → new dataset):")

for name, stage in pipeline.items():
    size, detail = stage
    print(f"{name}: n={size}  ({detail})")
    print("\n")
print("\n→ Use 'dataset' and 'dataloader' for TFT (or any PyTorch Forecasting model).")

Data pipeline (prev → new dataset):
df_market: n=3434  (['timestamp', 'open', 'high', 'low', 'close', 'volume'])


df_features: n=3235  (['timestamp', 'open', 'high', 'low', 'close', 'volume', 'sma_20', 'sma_50', 'sma_200', 'ema_12', 'ema_26', 'ema_50', 'rsi', 'rsi_14', 'macd_line', 'macd_signal', 'macd_histogram', 'bb_middle', 'bb_upper', 'bb_lower', 'bb_position', 'bb_width', 'stoch_k', 'williams_r', 'volatility_20', 'price_change_1d', 'price_change_5d', 'price_change_30d', 'avg_volume_20', 'current_volume'])


df_ts: n=3235  (['timestamp', 'open', 'high', 'low', 'close', 'volume', 'sma_20', 'sma_50', 'sma_200', 'ema_12', 'ema_26', 'ema_50', 'rsi', 'rsi_14', 'macd_line', 'macd_signal', 'macd_histogram', 'bb_middle', 'bb_upper', 'bb_lower', 'bb_position', 'bb_width', 'stoch_k', 'williams_r', 'volatility_20', 'price_change_1d', 'price_change_5d', 'price_change_30d', 'avg_volume_20', 'current_volume', 'time_idx', 'ticker'])


dataset (TimeSeriesDataSet): n=3171  (encoder/decoder batches