In [17]:
# Cell 1: Environment & Source File Validation

import os
import sys
import pandas as pd

# Record the interpreter and pandas versions next to the outputs so a reader
# can tell which environment produced them.
print("Python version:", sys.version)
print("Pandas version:", pd.__version__)

# ---- Contract: Single Source of Truth ----
# Relative path, resolved against the kernel's current working directory.
PARQUET_PATH = "final_output/feature_store/final_features_fixed_f.parquet"

print(f"\nChecking source file at: {PARQUET_PATH}")

# Fail fast with an explicit message instead of a reader-specific error.
if not os.path.exists(PARQUET_PATH):
    raise FileNotFoundError(
        f"‚ùå Contract violation: source file not found at {PARQUET_PATH}"
    )

# Try opening parquet (no inspection): the frame is loaded only to prove the
# file can be read; its columns are deliberately not looked at here.
try:
    _df = pd.read_parquet(PARQUET_PATH)
except Exception as e:
    # `from e` records the reader's exception as the explicit cause, so the
    # traceback shows the underlying failure as the direct reason.
    raise RuntimeError(
        f"‚ùå Contract violation: failed to read parquet file.\nError: {e}"
    ) from e

print("‚úÖ Source parquet file exists and is readable.")
print("Rows loaded (sanity only):", len(_df))

# IMPORTANT: Do not inspect columns yet
# Drop the probe frame so the kernel does not keep a second copy alive.
del _df

print("\nCell 1 PASSED ‚Äî Environment and source file validated.")


Python version: 3.10.11 (tags/v3.10.11:7d4cc5a, Apr  5 2023, 00:38:17) [MSC v.1929 64 bit (AMD64)]
Pandas version: 2.1.4

Checking source file at: final_output/feature_store/final_features_fixed_f.parquet
‚úÖ Source parquet file exists and is readable.
Rows loaded (sanity only): 257860

Cell 1 PASSED ‚Äî Environment and source file validated.


In [18]:
# Cell 2: Load Dataset & Mandatory Column Validation

import pandas as pd

# Location of the feature store consumed by this notebook.
PARQUET_PATH = "final_output/feature_store/final_features_fixed_f.parquet"

# Working frame used by the cells that follow.
df = pd.read_parquet(PARQUET_PATH)

print("Dataset loaded.")
print(f"Shape: {df.shape}")

# ---- Contract: Mandatory Columns ----
REQUIRED_METADATA_COLS = {"symbol", "date"}
REQUIRED_TARGET_COLS = {"ret_fwd_1d", "ret_fwd_5d"}

# Compute both differences against a single snapshot of the column names.
available_cols = set(df.columns)
missing_meta = REQUIRED_METADATA_COLS.difference(available_cols)
missing_targets = REQUIRED_TARGET_COLS.difference(available_cols)

# Metadata is reported first, then targets; either one aborts the cell.
if len(missing_meta) > 0:
    raise ValueError(
        f"‚ùå Contract violation: Missing metadata columns: {sorted(missing_meta)}"
    )
if len(missing_targets) > 0:
    raise ValueError(
        f"‚ùå Contract violation: Missing target columns: {sorted(missing_targets)}"
    )

print("‚úÖ Mandatory metadata columns present:", sorted(REQUIRED_METADATA_COLS))
print("‚úÖ Mandatory target columns present:", sorted(REQUIRED_TARGET_COLS))

print("\nCell 2 PASSED ‚Äî Required columns validated.")


Dataset loaded.
Shape: (257860, 268)
‚úÖ Mandatory metadata columns present: ['date', 'symbol']
‚úÖ Mandatory target columns present: ['ret_fwd_1d', 'ret_fwd_5d']

Cell 2 PASSED ‚Äî Required columns validated.


In [19]:
# Cell 3: Metadata Integrity Validation

import pandas as pd

# ---- Metadata NaN check ----
# Both metadata columns must be fully populated: later cells group by symbol
# and sort by date.
if df["symbol"].isna().any():
    raise ValueError("‚ùå Contract violation: NaNs found in 'symbol' column")

if df["date"].isna().any():
    raise ValueError("‚ùå Contract violation: NaNs found in 'date' column")

print("‚úÖ No NaNs in metadata columns")

# ---- Date parsing (no mutation yet) ----
# The parsed result is kept in a separate variable; df["date"] is not modified
# in this cell.
try:
    parsed_dates = pd.to_datetime(df["date"], errors="raise")
except Exception as e:
    # `from e` keeps the parser's own exception attached as the explicit cause.
    raise ValueError(
        f"‚ùå Contract violation: 'date' column is not parseable as datetime\n{e}"
    ) from e

print("‚úÖ 'date' column is parseable as datetime")

print("\nCell 3 PASSED ‚Äî Metadata integrity confirmed.")


‚úÖ No NaNs in metadata columns
‚úÖ 'date' column is parseable as datetime

Cell 3 PASSED ‚Äî Metadata integrity confirmed.


In [20]:
# Cell 4: Symbol Canonicalization

import re

def canonicalize_symbol(sym: str) -> str:
    """
    Return the canonical spelling of a ticker symbol.

    The input is converted with ``str()`` and upper-cased, then two regex
    substitutions are applied in order:
    - every character outside A-Z / 0-9 becomes an underscore
    - each run of underscores is collapsed to a single underscore
    Finally, underscores at either end are stripped.
    """
    # Order matters: the first rule produces the underscores the second collapses.
    rewrite_rules = (
        (r"[^A-Z0-9]", "_"),
        (r"_+", "_"),
    )

    canonical = str(sym).upper()
    for pattern, replacement in rewrite_rules:
        canonical = re.sub(pattern, replacement, canonical)
    return canonical.strip("_")

# Rewrite the symbol column with its canonical spelling.
canonical_symbols = df["symbol"].apply(canonicalize_symbol)
df["symbol"] = canonical_symbols

# ---- Validation ----
# Checks run against the column as stored in df, in this order:
# no NaNs, no empty strings, every value is a str.
stored_symbols = df["symbol"]

if stored_symbols.isna().any():
    raise ValueError("‚ùå Contract violation: NaNs introduced during symbol canonicalization")

if stored_symbols.str.len().eq(0).any():
    raise ValueError("‚ùå Contract violation: Empty symbol after canonicalization")

if any(not isinstance(value, str) for value in stored_symbols):
    raise ValueError("‚ùå Contract violation: Non-string symbol detected after canonicalization")

print("‚úÖ Symbols canonicalized successfully")

# Optional sanity peek (safe): first ten distinct symbols in row order.
print("\nSample canonical symbols:")
print(stored_symbols.drop_duplicates().iloc[:10].tolist())

print("\nCell 4 PASSED ‚Äî Symbol namespace locked.")


‚úÖ Symbols canonicalized successfully

Sample canonical symbols:
['AAPL', 'ABBV', 'AMZN', 'ASML', 'BA', 'BABA', 'BAC', 'BHP', 'BP', 'BTC_USD']

Cell 4 PASSED ‚Äî Symbol namespace locked.


In [22]:
# Cell 5: Feature Discovery & Schema Freeze

from pathlib import Path

# ---- Contract definitions ----
METADATA_COLS = {"symbol", "date"}
TARGET_COLS = {"ret_fwd_1d", "ret_fwd_5d"}

# Columns that must never be treated as model inputs.
forbidden = METADATA_COLS | TARGET_COLS

# Every remaining column of df is a feature.
feature_cols = [name for name in df.columns if name not in forbidden]

if not feature_cols:
    raise ValueError("‚ùå Contract violation: No feature columns discovered")

# Deterministic ordering: the schema file lists features alphabetically.
feature_cols.sort()

# Safety check: nothing forbidden may appear in the final list.
leaked = forbidden & set(feature_cols)

if leaked:
    raise ValueError(
        f"‚ùå Contract violation: Forbidden columns leaked into feature set: {sorted(leaked)}"
    )

# ---- Freeze schema ----
# One feature name per line, written relative to the working directory.
schema_path = Path("feature_schema.txt")

with open(schema_path, "w") as schema_file:
    schema_file.writelines(name + "\n" for name in feature_cols)

print(f"‚úÖ Feature schema frozen with {len(feature_cols)} features")
print(f"üìÑ Written to: {schema_path.resolve()}")

# Safe summary
print("\nFirst 10 feature columns:")
for name in feature_cols[:10]:
    print(" -", name)

print("\nCell 5 PASSED ‚Äî Feature schema frozen.")


‚úÖ Feature schema frozen with 264 features
üìÑ Written to: D:\MarketSentinel\MarketSentinel\feature_schema.txt

First 10 feature columns:
 - atr_14
 - atr_14_rmean_60
 - atr_14_rstd_60
 - atr_14_rz_60
 - close_z_20
 - close_z_20_rmean_60
 - close_z_20_rstd_60
 - close_z_20_rz_60
 - cpi_change
 - cpi_yoy

Cell 5 PASSED ‚Äî Feature schema frozen.


In [26]:
# Cell 5B: Drop Non-Numeric Features & Re-freeze Schema

from pathlib import Path

# ---- Explicitly dropped features ----
# Columns removed from df before the schema is written again.
DROPPED_FEATURES = {"is_month_end"}

# Refuse to "drop" a column that is not there. Note that this also makes the
# cell fail if it is executed a second time on the same df.
missing = DROPPED_FEATURES - set(df.columns)
if missing:
    # sorted() keeps the message deterministic and matches Cell 5's formatting.
    raise ValueError(f"‚ùå Attempting to drop missing columns: {sorted(missing)}")

df = df.drop(columns=list(DROPPED_FEATURES))
print(f"‚úÖ Dropped features: {sorted(DROPPED_FEATURES)}")

# ---- Re-discover feature columns ----
METADATA_COLS = {"symbol", "date"}
TARGET_COLS = {"ret_fwd_1d", "ret_fwd_5d"}

feature_cols = [
    col for col in df.columns
    if col not in METADATA_COLS and col not in TARGET_COLS
]

if len(feature_cols) == 0:
    raise ValueError("‚ùå Contract violation: No feature columns remain after drop")

# Alphabetical order, same as the first freeze.
feature_cols = sorted(feature_cols)

# Safety check
forbidden = METADATA_COLS.union(TARGET_COLS)
leaked = forbidden.intersection(feature_cols)
if leaked:
    raise ValueError(
        f"‚ùå Contract violation: Forbidden columns leaked into feature set: {sorted(leaked)}"
    )

# ---- Re-freeze schema ----
# Overwrites feature_schema.txt with the reduced feature list, one per line.
schema_path = Path("feature_schema.txt")
with open(schema_path, "w") as f:
    for col in feature_cols:
        f.write(col + "\n")

print(f"üîí Feature schema RE-FROZEN with {len(feature_cols)} features")
print(f"üìÑ Written to: {schema_path.resolve()}")

print("\nCell 5B PASSED ‚Äî Schema re-frozen after drop.")


‚úÖ Dropped features: ['is_month_end']
üîí Feature schema RE-FROZEN with 263 features
üìÑ Written to: D:\MarketSentinel\MarketSentinel\feature_schema.txt

Cell 5B PASSED ‚Äî Schema re-frozen after drop.


In [27]:
# Cell 6 (Revised): Feature Sanitation & Numerical Safety
# Adds HARD numeric feature validation

import numpy as np
from pandas.api.types import is_numeric_dtype

# Symmetric bound applied to every feature value after sanitation.
SAFE_MAX = 10.0

# ---- Drop rows with NaN targets ----
initial_rows = len(df)
df = df.dropna(subset=["ret_fwd_1d", "ret_fwd_5d"])
dropped = initial_rows - len(df)

print(f"Rows dropped due to NaN targets: {dropped}")

# ---- Numeric feature validation ----
# Anything non-numeric must have been removed or encoded in an earlier cell.
non_numeric_features = [
    col for col in feature_cols
    if not is_numeric_dtype(df[col])
]

if non_numeric_features:
    raise TypeError(
        "‚ùå Contract violation: Non-numeric feature columns detected.\n"
        "These must be removed or engineered explicitly BEFORE this stage:\n"
        + "\n".join(f" - {c} ({df[c].dtype})" for c in non_numeric_features)
    )

print("‚úÖ All feature columns are numeric")

# ---- Sanitize feature columns ----
features = df[feature_cols]

# Replace inf/-inf with NaN, then fill every NaN with 0.0
features = features.replace([np.inf, -np.inf], np.nan)
features = features.fillna(0.0)

# Convert dtype
features = features.astype(np.float32)

# Detect pre-clip violations (reported only; clipping below handles them).
# NOTE(review): a very large pre-clip maximum suggests some features are on a
# raw, unscaled range and will be saturated at +/-SAFE_MAX - confirm intended.
pre_clip_max = features.abs().max().max()
if pre_clip_max > SAFE_MAX:
    print(f"‚ö†Ô∏è Values exceed SAFE_MAX before clipping: max={pre_clip_max:.4f}")

# Clip to SAFE_MAX
features = features.clip(-SAFE_MAX, SAFE_MAX)

# Final invariant checks
if not np.isfinite(features.values).all():
    raise ValueError("‚ùå Contract violation: Non-finite values remain in features")

if features.dtypes.nunique() != 1 or features.dtypes.iloc[0] != np.float32:
    raise ValueError("‚ùå Contract violation: Feature dtype is not float32")

# Write back sanitized features.
# Plain column assignment replaces the columns (and their dtype). The previous
# `df.loc[:, feature_cols] = features` sets values in place on pandas >= 2.0,
# which keeps the original column dtype and silently discards the float32 cast.
df[feature_cols] = features

# Verify the invariant on df itself, since df is what later cells read.
if not (df[feature_cols].dtypes == np.float32).all():
    raise ValueError("‚ùå Contract violation: Feature dtype is not float32")

print("‚úÖ Feature sanitation complete")
print("‚úÖ All features finite, float32, and clipped")

print("\nCell 6 PASSED ‚Äî Numerical safety enforced.")


Rows dropped due to NaN targets: 0
‚úÖ All feature columns are numeric
‚ö†Ô∏è Values exceed SAFE_MAX before clipping: max=3961173835776.0000
‚úÖ Feature sanitation complete
‚úÖ All features finite, float32, and clipped

Cell 6 PASSED ‚Äî Numerical safety enforced.


In [28]:
# Cell 7: Temporal Ordering & Window Viability Check

import pandas as pd

# Number of consecutive rows that make up one model input window.
WINDOW_SIZE = 60

# Ensure datetime (safe conversion, already validated earlier)
df["date"] = pd.to_datetime(df["date"])

# symbol -> number of length-WINDOW_SIZE windows its history can supply
viable_symbols = {}
# human-readable entries for symbols whose dates cannot be strictly ordered
invalid_symbols = []

for symbol, sdf in df.groupby("symbol"):
    sorted_dates = sdf["date"].sort_values()

    # A sorted series is always monotonic, so testing monotonicity after the
    # sort can never fail. What sorting cannot repair is a repeated timestamp:
    # the sequence is strictly increasing only if no date occurs twice.
    n_duplicates = int(sorted_dates.duplicated().sum())
    if n_duplicates > 0:
        invalid_symbols.append(f"{symbol} ({n_duplicates} duplicate dates)")
        continue

    n_rows = len(sorted_dates)
    n_windows = n_rows - WINDOW_SIZE + 1

    # Symbols with fewer than WINDOW_SIZE rows contribute no windows and are
    # simply left out of viable_symbols.
    if n_windows > 0:
        viable_symbols[symbol] = n_windows

if invalid_symbols:
    raise ValueError(
        "‚ùå Contract violation: Non-monotonic dates for symbols:\n"
        + "\n".join(invalid_symbols)
    )

if not viable_symbols:
    raise ValueError(
        f"‚ùå Contract violation: No symbols can form a valid {WINDOW_SIZE}-step window"
    )

print(f"‚úÖ Symbols with valid temporal ordering: {len(viable_symbols)}")

# Summary stats
total_windows = sum(viable_symbols.values())
print(f"Total possible windows (pre-split): {total_windows}")

# Safe peek
sample = list(viable_symbols.items())[:5]
print("\nSample symbol window counts:")
for sym, cnt in sample:
    print(f" - {sym}: {cnt}")

print("\nCell 7 PASSED ‚Äî Temporal ordering validated.")


‚úÖ Symbols with valid temporal ordering: 66
Total possible windows (pre-split): 253900

Sample symbol window counts:
 - AAPL: 3899
 - ABBV: 3899
 - AMZN: 3899
 - ASML: 3899
 - BA: 3882

Cell 7 PASSED ‚Äî Temporal ordering validated.


In [29]:
# Cell 8: Temporal Window Construction

import numpy as np

# Length of each temporal window, in rows.
WINDOW_SIZE = 60

# Parallel accumulators: entry k of each list describes the same window.
X_windows = []
y_targets = []
meta = []

for symbol, sdf in df.groupby("symbol"):
    # Work on this symbol's history in chronological order.
    ordered = sdf.sort_values("date").reset_index(drop=True)

    feature_matrix = ordered[feature_cols].values
    targets = ordered[["ret_fwd_1d", "ret_fwd_5d"]].values
    dates = ordered["date"].values

    n_windows = len(ordered) - WINDOW_SIZE + 1
    for first_row in range(n_windows):
        # The window's label and date are taken from its final row.
        last_row = first_row + WINDOW_SIZE - 1

        window_X = feature_matrix[first_row : last_row + 1]
        window_y = targets[last_row]
        window_date = dates[last_row]

        # Hard invariants: skip any window with a non-finite input or label.
        if not (np.isfinite(window_X).all() and np.isfinite(window_y).all()):
            continue

        X_windows.append(window_X)
        y_targets.append(window_y)
        meta.append((symbol, window_date))

# Materialise the accumulated windows as dense arrays.
X = np.stack(X_windows)
y = np.stack(y_targets)

print("‚úÖ Window construction complete")
print(f"X shape: {X.shape}")
print(f"y shape: {y.shape}")
print(f"Feature dim (F): {X.shape[-1]}")

# Final sanity: one label and one metadata entry per window, expected dims.
assert X.shape[0] == y.shape[0] == len(meta)
assert X.shape[1] == WINDOW_SIZE
assert X.shape[2] == len(feature_cols)

print("\nCell 8 PASSED ‚Äî Temporal windows built.")


‚úÖ Window construction complete
X shape: (253900, 60, 263)
y shape: (253900, 2)
Feature dim (F): 263

Cell 8 PASSED ‚Äî Temporal windows built.


In [32]:
# Cell 9 (Final): Chunked Chronological Split & HDF5 Write
# Fix: store date as int64 nanoseconds

import h5py
import numpy as np
from collections import defaultdict

# Fractions of each symbol's windows assigned to train and validation;
# whatever remains goes to test.
TRAIN_RATIO = 0.70
VAL_RATIO = 0.15
# Number of rows written per HDF5 write call (also used for chunk sizing).
BATCH_SIZE = 2048

# ---- Organize windows by symbol ----
# Window indices are appended in the order they appear in meta.
by_symbol = defaultdict(list)
for window_idx, (symbol, _window_date) in enumerate(meta):
    by_symbol[symbol].append(window_idx)

train_idx, val_idx, test_idx = [], [], []

# Cut every symbol's index list at the same two relative positions.
# NOTE(review): windows are built with a stride of one row, so windows on both
# sides of a cut share rows, and no gap is left between splits - confirm that
# this overlap is acceptable for evaluation.
for symbol, indices in by_symbol.items():
    n = len(indices)
    train_end = int(n * TRAIN_RATIO)
    val_end = train_end + int(n * VAL_RATIO)

    train_idx += indices[:train_end]
    val_idx += indices[train_end:val_end]
    test_idx += indices[val_end:]

print("Split sizes:")
print("Train:", len(train_idx))
print("Val:  ", len(val_idx))
print("Test: ", len(test_idx))


def write_h5_chunked(path, indices, chunk_rows=None):
    """Write the windows selected by ``indices`` to a gzip-compressed HDF5 file.

    Reads the notebook-level ``X``, ``y`` and ``meta`` containers together with
    the ``WINDOW_SIZE`` and ``BATCH_SIZE`` constants, and creates four
    row-aligned datasets in ``path``:

    - ``X``      float32, shape ``(n, WINDOW_SIZE, F)``
    - ``y``      float32, shape ``(n, 2)``
    - ``symbol`` fixed-width bytes (``S16``), shape ``(n,)``
    - ``date``   int64 nanoseconds since epoch, shape ``(n,)``

    Rows are copied in batches of ``BATCH_SIZE`` so only one batch of selected
    windows is gathered at a time.

    Parameters
    ----------
    path : str or path-like
        Destination file; an existing file is overwritten.
    indices : list of int
        Positions into ``X`` / ``y`` / ``meta`` to write, in output order.
    chunk_rows : int, optional
        Number of rows per HDF5 chunk for ``X`` and ``y``. Defaults to
        ``BATCH_SIZE`` (the previous fixed behaviour). Reading any row of a
        compressed dataset decompresses its whole chunk, so a smaller value
        may suit consumers that read windows at random.

    Raises
    ------
    ValueError
        If ``indices`` is empty, or if a symbol does not fit in 16 bytes and
        would be truncated by the ``S16`` dataset.
    """
    symbol_width = 16  # capacity of one entry in the "S16" symbol dataset

    n = len(indices)
    if n == 0:
        # A zero-length chunk dimension is not a valid HDF5 chunk shape; stop
        # here with a message that names the real problem.
        raise ValueError(f"Cannot write {path}: the split contains no windows")

    # Casting to a fixed-width bytes dtype truncates silently, which could make
    # two different symbols indistinguishable in the file.
    overlong = sorted(
        {meta[i][0] for i in indices if len(meta[i][0]) > symbol_width}
    )
    if overlong:
        raise ValueError(
            f"Cannot write {path}: symbols longer than {symbol_width} "
            f"characters would be truncated: {overlong}"
        )

    F = X.shape[2]

    if chunk_rows is None:
        chunk_rows = BATCH_SIZE
    # A chunk may not be larger than the dataset, nor smaller than one row.
    chunk_rows = max(1, min(int(chunk_rows), n))

    with h5py.File(path, "w") as f:
        X_ds = f.create_dataset(
            "X",
            shape=(n, WINDOW_SIZE, F),
            dtype="float32",
            compression="gzip",
            chunks=(chunk_rows, WINDOW_SIZE, F),
        )
        y_ds = f.create_dataset(
            "y",
            shape=(n, 2),
            dtype="float32",
            compression="gzip",
            chunks=(chunk_rows, 2),
        )
        sym_ds = f.create_dataset("symbol", shape=(n,), dtype="S16")

        # Dates are stored as int64 nanoseconds since epoch.
        date_ds = f.create_dataset("date", shape=(n,), dtype="int64")

        for start in range(0, n, BATCH_SIZE):
            end = min(start + BATCH_SIZE, n)
            batch_idx = indices[start:end]

            X_ds[start:end] = X[batch_idx]
            y_ds[start:end] = y[batch_idx]
            sym_ds[start:end] = np.array(
                [meta[i][0] for i in batch_idx], dtype="S16"
            )
            date_ds[start:end] = np.array(
                [meta[i][1].astype("datetime64[ns]").astype("int64") for i in batch_idx]
            )

            # Report only the first and last batch to keep the output short.
            if start == 0 or end == n:
                print(f"  wrote rows {start} ‚Üí {end}")


# ---- Write files ----
# One HDF5 file per split, written in train / val / test order.
split_outputs = (
    ("gnn_sequences_train.h5", train_idx),
    ("gnn_sequences_val.h5", val_idx),
    ("gnn_sequences_test.h5", test_idx),
)
for h5_path, split_indices in split_outputs:
    write_h5_chunked(h5_path, split_indices)

print("‚úÖ HDF5 files written successfully (datetime stored as int64 ns)")
print("\nCell 9 PASSED ‚Äî Memory-safe persistence complete.")


Split sizes:
Train: 177703
Val:   38048
Test:  38149
  wrote rows 0 ‚Üí 2048
  wrote rows 176128 ‚Üí 177703
  wrote rows 0 ‚Üí 2048
  wrote rows 36864 ‚Üí 38048
  wrote rows 0 ‚Üí 2048
  wrote rows 36864 ‚Üí 38149
‚úÖ HDF5 files written successfully (datetime stored as int64 ns)

Cell 9 PASSED ‚Äî Memory-safe persistence complete.


In [33]:
# Cell 10: Write node_features.parquet

from pathlib import Path
import numpy as np
import pandas as pd

# Final safety check on the feature block of df before it is persisted.
features = df[feature_cols]

all_finite = np.isfinite(features.values).all()
if not all_finite:
    raise ValueError("‚ùå Contract violation: Non-finite values in node features")

n_feature_columns = features.shape[1]
if n_feature_columns != len(feature_cols):
    raise ValueError("‚ùå Feature dimension mismatch with frozen schema")

# Metadata columns first, then features in schema order, on a fresh 0..n-1 index.
node_columns = ["symbol", "date", *feature_cols]
node_df = df[node_columns].reset_index(drop=True)

# Written relative to the working directory, without the index.
out_path = Path("node_features.parquet")
node_df.to_parquet(out_path, index=False)

print(f"‚úÖ node_features.parquet written")
print(f"Rows: {len(node_df)}")
print(f"Features per node: {len(feature_cols)}")
print(f"Path: {out_path.resolve()}")

print("\nCell 10 PASSED ‚Äî Node features materialized.")


‚úÖ node_features.parquet written
Rows: 257794
Features per node: 263
Path: D:\MarketSentinel\MarketSentinel\node_features.parquet

Cell 10 PASSED ‚Äî Node features materialized.


In [34]:
# Cell 11: Data Precheck (Compiler Gate)

import h5py
import numpy as np
import pandas as pd
from pathlib import Path

# ---- Load frozen schema ----
schema_path = Path("feature_schema.txt")
if not schema_path.exists():
    raise FileNotFoundError("‚ùå Missing feature_schema.txt")

# One feature name per non-blank line.
with open(schema_path) as schema_file:
    schema_features = [line.strip() for line in schema_file if line.strip()]

F = len(schema_features)
print(f"Schema feature count: {F}")

# Window length every stored sample is expected to have.
EXPECTED_WINDOW_SIZE = 60
# Only this many leading rows per split are scanned for non-finite values.
FINITE_CHECK_ROWS = 1000

# ---- Check HDF5 files ----
for split in ["train", "val", "test"]:
    path = Path(f"gnn_sequences_{split}.h5")
    if not path.exists():
        raise FileNotFoundError(f"‚ùå Missing {path}")

    with h5py.File(path, "r") as h5_file:
        # Handle names are deliberately distinct from the notebook-level X / y
        # arrays: rebinding those to handles that are closed when this block
        # exits would break any later re-run of the cells that use them.
        X_ds = h5_file["X"]
        y_ds = h5_file["y"]
        symbols_ds = h5_file["symbol"]
        dates_ds = h5_file["date"]

        n_samples = X_ds.shape[0]

        assert (
            X_ds.ndim == 3
            and X_ds.shape[1] == EXPECTED_WINDOW_SIZE
            and X_ds.shape[2] == F
        ), f"{split}: unexpected X shape {X_ds.shape}"
        assert y_ds.shape[1] == 2, f"{split}: unexpected y shape {y_ds.shape}"
        assert X_ds.dtype == np.float32, f"{split}: X dtype is {X_ds.dtype}"
        assert y_ds.dtype == np.float32, f"{split}: y dtype is {y_ds.dtype}"
        assert dates_ds.dtype == np.int64, f"{split}: date dtype is {dates_ds.dtype}"

        # All four datasets must describe the same samples, row for row.
        assert (
            y_ds.shape[0] == n_samples
            and symbols_ds.shape[0] == n_samples
            and dates_ds.shape[0] == n_samples
        ), (
            f"{split}: row counts differ (X={n_samples}, y={y_ds.shape[0]}, "
            f"symbol={symbols_ds.shape[0]}, date={dates_ds.shape[0]})"
        )

        # Spot check only: the leading rows, not the whole dataset.
        if not np.isfinite(X_ds[:FINITE_CHECK_ROWS]).all():
            raise ValueError(f"‚ùå Non-finite values in X ({split})")
        if not np.isfinite(y_ds[:FINITE_CHECK_ROWS]).all():
            raise ValueError(f"‚ùå Non-finite values in y ({split})")

        # Date round-trip check: stored integers must convert back to datetimes.
        _ = dates_ds[:10].astype("datetime64[ns]")

        print(f"‚úÖ {split} split OK ‚Äî {n_samples} samples")

# ---- Check node_features.parquet ----
node_df = pd.read_parquet("node_features.parquet")

# Column order must be metadata first, then features exactly as in the schema.
expected_cols = ["symbol", "date"] + schema_features
if list(node_df.columns) != expected_cols:
    raise ValueError("‚ùå node_features column order mismatch")

if not np.isfinite(node_df[schema_features].values).all():
    raise ValueError("‚ùå Non-finite values in node_features")

print("‚úÖ node_features.parquet OK")

print("\nüéâ DATA PRECHECK PASSED ‚Äî DATASET IS TRAIN-READY üéâ")


Schema feature count: 263
‚úÖ train split OK ‚Äî 177703 samples
‚úÖ val split OK ‚Äî 38048 samples
‚úÖ test split OK ‚Äî 38149 samples
‚úÖ node_features.parquet OK

üéâ DATA PRECHECK PASSED ‚Äî DATASET IS TRAIN-READY üéâ


In [35]:
# Cell 12: Build edges_static.parquet (baseline self-loop graph)

import pandas as pd
import numpy as np
from pathlib import Path

# ---- Get symbol universe from node features ----
# Distinct symbols found in the persisted node table, in sorted order.
node_df = pd.read_parquet("node_features.parquet")
symbols = sorted(node_df["symbol"].unique())

if not symbols:
    raise ValueError("‚ùå No symbols found in node_features")

print(f"Building static edges for {len(symbols)} symbols")

# ---- Self-loop edges ----
# One edge per symbol, from the symbol to itself, with weight 1.0 (float32).
unit_weights = np.ones(len(symbols), dtype=np.float32)
edges = pd.DataFrame({
    "symbol_i": symbols,
    "symbol_j": symbols,
    "weight": unit_weights,
})

# ---- Validation ----
if not np.isfinite(edges["weight"].values).all():
    raise ValueError("‚ùå Non-finite edge weights")

# Both endpoints of every edge must belong to the node symbol set;
# symbol_i is checked before symbol_j.
endpoint_checks = (
    ("symbol_i", "‚ùå symbol_i outside node_features symbol set"),
    ("symbol_j", "‚ùå symbol_j outside node_features symbol set"),
)
for endpoint, failure_message in endpoint_checks:
    if not set(edges[endpoint]).issubset(symbols):
        raise ValueError(failure_message)

# ---- Write file ----
out_path = Path("edges_static.parquet")
edges.to_parquet(out_path, index=False)

print("‚úÖ edges_static.parquet written")
print(f"Edges: {len(edges)}")
print(f"Path: {out_path.resolve()}")

print("\nCell 12 PASSED ‚Äî Static edge graph created.")


Building static edges for 66 symbols
‚úÖ edges_static.parquet written
Edges: 66
Path: D:\MarketSentinel\MarketSentinel\edges_static.parquet

Cell 12 PASSED ‚Äî Static edge graph created.
