# In [1]:
import pandas as pd
import numpy as np

# Data-quality audit: for a fixed list of feature columns, report which ones
# are absent from the CSV, how many NaN / Inf values each present one holds,
# and how many rows would remain after dropping every row with a NaN/Inf.

# CSV file inspected by this check.
PATH = "data/processed/XAUUSD/combined_data.csv"

# Feature columns to audit (order is preserved in the presence report).
FEATURES = [
    "session_code",
    "open",
    "attempt",
    "rolling_f1",
    "rolling_accuracy",
    "ret30m_voladj",
    "ma14_slope_5",
    "pos_in_day_range",
    "vol_ratio_5_30",
    "range15m_voladj",
    "drawdown_30",
    "daily_volatility",
    "avg_return_30d",
    "hour_sin",
    "dow_sin",
    "dow_cos",
    "volume_shifted",
    "breakout_30_up",
    "breakout_30_dn",
    "t10yie",
    "dgs10",
    "dtwexbgs",
    "ma_14",
    "atr_14",
    "max_price_14",
    "min_price_14",
    "ret_30m",
    "vol_5m",
    "vol_15m",
    "vol_30m",
    "range_15m",
    "atr_z_60",
    "vix_z_60",
    "vix_close",
    "day_of_week",
    "week_number",
    "hour_of_day",
    "cpiaucsl",
    "skew_30",
    "kurt_30",
    "vixz_x_ret30m",
]

# Read the whole file; the "timestamp" column becomes the index and pandas is
# asked to parse it as dates.
df = pd.read_csv(
    PATH,
    index_col="timestamp",
    parse_dates=True,
    low_memory=False,
)
print(f"Loaded {len(df):,} rows, {len(df.columns)} columns")

# Split the requested features into those the frame has and those it lacks.
present = [name for name in FEATURES if name in df.columns]
missing = [name for name in FEATURES if name not in present]
if not missing:
    print("\nAll requested features are present.")
else:
    print("\nMissing feature columns:")
    for name in missing:
        print("  -", name)

# Coerce every present feature to numeric; values that cannot be converted
# become NaN and are therefore included in the NaN counts below.
num = df[present].apply(pd.to_numeric, errors="coerce")
nan_cnt = num.isna().sum()
inf_cnt = np.isinf(num).sum()
nan_rate = nan_cnt.div(len(num)).sort_values(ascending=False)

print("\nNaN rate per present feature (descending):")
for col, rate in nan_rate.items():
    n_nan = nan_cnt[col]
    n_inf = int(inf_cnt[col])
    print(f"  {col:20s}  NaN: {n_nan:8d}  ({rate:6.2%})  Inf: {n_inf:6d}")

# Treat +/-Inf like NaN, then count the rows with no missing value at all.
survivors = len(num.replace([np.inf, -np.inf], np.nan).dropna())
print(f"\nRows surviving dropna on ALL present features: {survivors:,} / {len(df):,}")
if not survivors:
    print("⚠️  Zero survivors — at least one feature is mostly NaN/Inf. Check the list above.")

# Quick look at the five features with the highest NaN rate.
print("\nTop 5 worst (highest NaN rate):")
print(nan_rate.iloc[:5].to_string())


# KeyboardInterrupt: