# Notebook 00: Data Overview & Sanity Checks

**Purpose**: Validate data integrity before any analysis

**Principle**: Trust but verify - every assumption must be checked

---

## Objectives

1. Load and validate all exported data files
2. Verify feature count matches contract (98 features)
3. Check data types, NaN, Inf, missing values
4. Validate categorical features (book_valid, time_regime, mbo_ready, schema_version)
5. Basic statistics per feature category
6. Confirm known relationships hold


In [None]:
# Standard imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
import json
import sys

# Add src to path for lobtrainer imports
sys.path.insert(0, str(Path.cwd().parent / 'src'))

from lobtrainer.constants import (
    FEATURE_COUNT, LOB_FEATURE_COUNT, DERIVED_FEATURE_COUNT,
    MBO_FEATURE_COUNT, SIGNAL_FEATURE_COUNT, SCHEMA_VERSION,
    FeatureIndex, LABEL_DOWN, LABEL_STABLE, LABEL_UP, LABEL_NAMES
)

# Matplotlib defaults applied to every figure drawn in this notebook
plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams.update({
    'figure.figsize': (12, 6),
    'font.size': 10,
})

# Pandas display: show up to 100 columns and six decimal places
pd.options.display.max_columns = 100
pd.options.display.precision = 6

# Echo the feature contract imported from lobtrainer.constants
print("Environment ready")
print(f"Expected feature count: {FEATURE_COUNT}")
feature_breakdown = (
    f"  LOB: {LOB_FEATURE_COUNT}, Derived: {DERIVED_FEATURE_COUNT}, "
    f"MBO: {MBO_FEATURE_COUNT}, Signals: {SIGNAL_FEATURE_COUNT}"
)
print(feature_breakdown)


## 1. Data Loading

Load all data files from train/val/test splits and verify structure.


In [None]:
# Root of the exported dataset and the split sub-directories expected beneath it
DATA_ROOT = Path.cwd().parent.parent / 'data' / 'exports' / 'nvda_98feat'
SPLITS = ['train', 'val', 'test']

print(f"Data root: {DATA_ROOT}")
print(f"Exists: {DATA_ROOT.exists()}")

# Glob pattern for each kind of per-day file; the keys double as inventory keys
FILE_PATTERNS = {
    'feature_files': '*_features.npy',
    'label_files': '*_labels.npy',
    'metadata_files': '*_metadata.json',
}

# Build {split: {kind: sorted list of paths}} for every split directory present
data_inventory = {}
for split in SPLITS:
    split_dir = DATA_ROOT / split
    if split_dir.exists():
        found = {
            kind: sorted(split_dir.glob(pattern))
            for kind, pattern in FILE_PATTERNS.items()
        }
        data_inventory[split] = found
        feature_paths = found['feature_files']

        print(f"\n{split.upper()}:")
        print(f"  Feature files: {len(feature_paths)}")
        print(f"  Label files: {len(found['label_files'])}")
        print(f"  Metadata files: {len(found['metadata_files'])}")

        if feature_paths:
            # The date is the file stem with the '_features' suffix removed
            first_date = feature_paths[0].stem.replace('_features', '')
            last_date = feature_paths[-1].stem.replace('_features', '')
            print(f"  Dates: {first_date} to {last_date}")
    else:
        # A missing split is reported and left out of the inventory
        print(f"Warning: {split} directory not found")


In [None]:
def load_day_data(feature_path: Path, label_path: Path, metadata_path: Path = None):
    """
    Load one day's exported feature array, label array and optional metadata.

    The arrays are returned exactly as stored; no shape or dtype validation is
    performed here (later cells of the notebook do that).

    Args:
        feature_path: Path (or str) to a ``*_features.npy`` file. The date is
            taken from its stem with the ``_features`` suffix removed.
        label_path: Path (or str) to the matching labels ``.npy`` file.
        metadata_path: Optional path to a JSON metadata file. If it is omitted
            or the file does not exist, ``metadata`` is ``None``.

    Returns:
        dict with 'features', 'labels', 'metadata', 'date'

    Raises:
        Whatever ``np.load`` / ``json.load`` raise for missing or malformed files.
    """
    # Accept plain strings as well as Path objects (.stem / .exists need a Path)
    feature_path = Path(feature_path)

    features = np.load(feature_path)
    labels = np.load(label_path)

    metadata = None
    if metadata_path and Path(metadata_path).exists():
        # JSON is UTF-8 by specification; do not depend on the platform's
        # locale encoding when decoding it
        with open(metadata_path, encoding='utf-8') as f:
            metadata = json.load(f)

    date = feature_path.stem.replace('_features', '')

    return {
        'features': features,
        'labels': labels,
        'metadata': metadata,
        'date': date,
    }

# Load all data, pairing every feature file with the label file of the SAME date.
# Pairing the two sorted lists positionally would silently misalign every later
# day (and drop the tail) as soon as one label or feature file is missing.
all_data = {split: [] for split in SPLITS}

for split in SPLITS:
    if split not in data_inventory:
        continue

    inv = data_inventory[split]
    # Index label files by date; assumes labels are named '<date>_labels.npy',
    # mirroring the '<date>_features.npy' / '<date>_metadata.json' convention
    labels_by_date = {p.stem.replace('_labels', ''): p for p in inv['label_files']}

    for feat_file in inv['feature_files']:
        date = feat_file.stem.replace('_features', '')
        label_file = labels_by_date.pop(date, None)
        if label_file is None:
            print(f"Warning: {split}/{date} has features but no labels file - skipped")
            continue

        # Metadata is optional; load_day_data tolerates a non-existent path
        meta_file = feat_file.parent / f"{date}_metadata.json"

        day_data = load_day_data(feat_file, label_file, meta_file)
        all_data[split].append(day_data)

    # Whatever is left in the index never matched a feature file
    for orphan_date in sorted(labels_by_date):
        print(f"Warning: {split}/{orphan_date} has labels but no features file - skipped")

print("Data loaded successfully")
for split in SPLITS:
    if all_data[split]:
        print(f"  {split}: {len(all_data[split])} days")


## 2. Shape and Type Validation

Verify that all files have the expected structure.


In [None]:
def _shape_report(split_name, day_record):
    """Return one summary row (shapes, dtypes, ratio) for a single loaded day."""
    feats = day_record['features']
    labs = day_record['labels']
    n_rows = feats.shape[0]
    n_dims = feats.shape[1]
    n_labs = labs.shape[0]
    return {
        'split': split_name,
        'date': day_record['date'],
        'n_features': n_rows,
        'n_feature_dims': n_dims,
        'n_labels': n_labs,
        'feature_dtype': str(feats.dtype),
        'label_dtype': str(labs.dtype),
        # Second array dimension must equal the contract's feature count
        'feature_dim_ok': n_dims == FEATURE_COUNT,
        # Feature rows per label; 0 when the day has no labels at all
        'ratio': n_rows / n_labs if n_labs > 0 else 0,
    }


# One row per loaded day, in split order
validation_results = [
    _shape_report(split, day)
    for split in SPLITS
    for day in all_data[split]
]

df_validation = pd.DataFrame(validation_results)
print("Shape Validation Summary:")
print(df_validation.to_string(index=False))


In [None]:
# Rows whose feature dimension differs from FEATURE_COUNT
dim_ok = df_validation['feature_dim_ok']
failures = df_validation.loc[~dim_ok]
if failures.empty:
    print(f"✅ All {len(df_validation)} files have correct feature dimension ({FEATURE_COUNT})")
else:
    print("❌ CRITICAL: Feature dimension mismatch detected!")
    print(failures)

# Feature-rows-per-label ratio, summarised across all days
ratios = df_validation['ratio']
ratio_mean, ratio_std = ratios.mean(), ratios.std()
print(f"\nFeature/Label ratio: {ratio_mean:.2f} ± {ratio_std:.2f}")
print("  Expected: ~10 (stride=10, window=100)")

# Warn when the mean ratio is more than 1 away from the expected 10
deviation = abs(ratio_mean - 10)
if deviation > 1:
    print("  ⚠️ Warning: Ratio deviates from expected value")


## 3. Data Quality Checks

Check for NaN, Inf, and other data quality issues.


In [None]:
# Aggregate all training data for analysis.
# Earlier cells tolerate a missing split directory, so fail here with an
# explicit message instead of NumPy's "need at least one array to concatenate".
if not all_data['train']:
    raise ValueError(
        "No training days were loaded; check DATA_ROOT and the 'train' split directory"
    )

train_features = np.vstack([d['features'] for d in all_data['train']])
train_labels = np.concatenate([d['labels'] for d in all_data['train']])

print(f"Training data shape: {train_features.shape}")
print(f"Training labels shape: {train_labels.shape}")


In [None]:
# Element-wise census of NaN / Inf / finite entries in the stacked training matrix
total_count = train_features.size
nan_count = np.isnan(train_features).sum()
inf_count = np.isinf(train_features).sum()
finite_count = np.isfinite(train_features).sum()

print("Data Quality Check:")
print(f"  Total values: {total_count:,}")
print(f"  Finite values: {finite_count:,} ({100*finite_count/total_count:.4f}%)")
print(f"  NaN values: {nan_count:,} ({100*nan_count/total_count:.6f}%)")
print(f"  Inf values: {inf_count:,} ({100*inf_count/total_count:.6f}%)")

if nan_count == 0 and inf_count == 0:
    print("\n✅ All values are finite (no NaN or Inf)")
else:
    print("\n❌ WARNING: Non-finite values detected!")
    # Per-column breakdown so the offending feature indices are visible
    per_column = (
        (c, np.isnan(train_features[:, c]).sum(), np.isinf(train_features[:, c]).sum())
        for c in range(train_features.shape[1])
    )
    for col, col_nan, col_inf in per_column:
        if col_nan or col_inf:
            print(f"    Feature {col}: {col_nan} NaN, {col_inf} Inf")


In [None]:
# Check label values
unique_labels, label_counts = np.unique(train_labels, return_counts=True)

print("Label Distribution:")
for label, count in zip(unique_labels.tolist(), label_counts):
    if float(label).is_integer():
        label_name = LABEL_NAMES.get(int(label), f"Unknown({label})")
        shown = f"{int(label):2d}"
    else:
        # Do not truncate a fractional/NaN label into a valid-looking class id
        label_name = f"Unknown({label})"
        shown = str(label)
    pct = 100 * count / len(train_labels)
    print(f"  {label_name:8s} (label={shown}): {count:7,} ({pct:5.2f}%)")

# Verify expected labels
expected_labels = {LABEL_DOWN, LABEL_STABLE, LABEL_UP}

# Values that are not whole numbers can never be a valid class; int()/astype(int)
# would silently round them onto one, so they are reported separately.
non_integer = [v for v in unique_labels.tolist() if not float(v).is_integer()]

# Plain Python ints: under NumPy >= 2 a set of NumPy scalars prints as
# {np.int64(1), ...}, which makes the mismatch message hard to read.
actual_labels = {int(v) for v in unique_labels.tolist() if float(v).is_integer()}

if actual_labels == expected_labels and not non_integer:
    print(f"\n✅ Label values match expected: {expected_labels}")
else:
    unexpected = actual_labels - expected_labels
    missing = expected_labels - actual_labels
    print(f"\n⚠️ Label mismatch:")
    if unexpected:
        print(f"    Unexpected labels: {unexpected}")
    if missing:
        print(f"    Missing labels: {missing}")
    if non_integer:
        print(f"    Non-integer labels: {non_integer}")


## 4. Categorical Feature Validation

Verify that the categorical features (book_valid, time_regime, mbo_ready, schema_version) take only their expected values; the value distribution of the invalidity_delta count is reported alongside them.


In [None]:
# Categorical feature indices and expected values.
# 'expected_values' is the closed set a column may take (None = unconstrained set);
# 'min_value', when present, is a lower bound checked instead of / in addition to it.
categorical_features = {
    'book_valid': {
        'index': FeatureIndex.BOOK_VALID,
        'expected_values': {0.0, 1.0},
        'description': 'Safety gate: 1=valid book, 0=crossed/empty'
    },
    'time_regime': {
        'index': FeatureIndex.TIME_REGIME,
        'expected_values': {0.0, 1.0, 2.0, 3.0, 4.0},
        'description': '0=Open, 1=Early, 2=Midday, 3=Close, 4=Closed'
    },
    'mbo_ready': {
        'index': FeatureIndex.MBO_READY,
        'expected_values': {0.0, 1.0},
        'description': 'Warmup gate: 1=ready, 0=warming up'
    },
    'invalidity_delta': {
        'index': FeatureIndex.INVALIDITY_DELTA,
        'expected_values': None,  # Count, any non-negative
        'min_value': 0.0,         # ...so the non-negativity is actually verified
        'description': 'Count of feed problems since last sample'
    },
    'schema_version': {
        'index': FeatureIndex.SCHEMA_VERSION_FEATURE,
        'expected_values': {float(SCHEMA_VERSION)},
        'description': f'Schema version constant: {SCHEMA_VERSION}'
    },
}


def _format_values(values, opener='[', closer=']'):
    """Render values via str() so NumPy scalars show as 1.0, not np.float64(1.0)."""
    return opener + ", ".join(str(v) for v in values) + closer


print("Categorical Feature Validation:")
print("=" * 70)

for name, info in categorical_features.items():
    col = train_features[:, info['index']]
    # One pass yields both the sorted distinct values and their frequencies
    values, counts = np.unique(col, return_counts=True)
    unique_vals = set(values)
    expected = info['expected_values']

    print(f"\n{name} (index {info['index']}):")
    print(f"  Description: {info['description']}")
    print(f"  Unique values: {_format_values(values)}")

    if expected is not None:
        if unique_vals == expected:
            print(f"  ✅ Matches expected values")
        elif unique_vals.issubset(expected):
            missing = expected - unique_vals
            print(f"  ⚠️ Subset of expected (missing: {_format_values(sorted(missing), '{', '}')})")
        else:
            unexpected = unique_vals - expected
            print(f"  ❌ Unexpected values: {_format_values(sorted(unexpected), '{', '}')}")

    min_value = info.get('min_value')
    if min_value is not None:
        n_below = int((col < min_value).sum())
        if n_below:
            print(f"  ❌ {n_below:,} values below minimum {min_value}")
        else:
            print(f"  ✅ All values >= {min_value}")

    # Value distribution
    for val, count in zip(values, counts):
        pct = 100 * count / len(col)
        print(f"    {val}: {count:,} ({pct:.2f}%)")


## 5. Signal Feature Analysis

Detailed analysis of the 14 trading signals (indices 84-97).


In [None]:
# Feature index -> (name, short description, value kind) for the signal columns
signal_info = {
    84: ('true_ofi', 'Cont et al. OFI', 'continuous'),
    85: ('depth_norm_ofi', 'OFI / avg_depth', 'continuous'),
    86: ('executed_pressure', 'trades_ask - trades_bid', 'continuous'),
    87: ('signed_mp_delta_bps', 'Microprice deviation', 'continuous'),
    88: ('trade_asymmetry', '(ask-bid)/total trades', 'continuous'),
    89: ('cancel_asymmetry', '(ask-bid)/total cancels', 'continuous'),
    90: ('fragility_score', 'concentration/ln(depth)', 'continuous'),
    91: ('depth_asymmetry', '(bid-ask)/total depth', 'continuous'),
    92: ('book_valid', 'Safety gate', 'binary'),
    93: ('time_regime', 'Market session', 'categorical'),
    94: ('mbo_ready', 'Warmup flag', 'binary'),
    95: ('dt_seconds', 'Sample duration', 'continuous'),
    96: ('invalidity_delta', 'Feed problems', 'count'),
    97: ('schema_version', 'Version constant', 'constant'),
}


def _signal_summary(values, idx, name, kind):
    """Return descriptive statistics for one signal column as a dict row."""
    mean = values.mean()
    std = values.std()
    summary = {
        'index': idx,
        'name': name,
        'type': kind,
        'mean': mean,
        'std': std,
        'min': values.min(),
        'max': values.max(),
        'median': np.median(values),
        'q25': np.percentile(values, 25),
        'q75': np.percentile(values, 75),
        'n_unique': len(np.unique(values)),
        # Stays 0 unless the column is continuous with non-zero spread
        'pct_outliers': 0,
    }
    if kind == 'continuous' and std > 0:
        # Share of samples more than four standard deviations from the mean
        z_scores = np.abs((values - mean) / std)
        summary['pct_outliers'] = 100 * (z_scores > 4).mean()
    return summary


# One row of statistics per signal, in index order
signal_stats = [
    _signal_summary(train_features[:, idx], idx, name, kind)
    for idx, (name, _description, kind) in signal_info.items()
]

df_signals = pd.DataFrame(signal_stats)
print("Signal Feature Statistics:")
shown_columns = ['index', 'name', 'type', 'mean', 'std', 'min', 'max', 'pct_outliers']
print(df_signals[shown_columns].to_string(index=False))


## 6. Sanity Checks: Known Relationships

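Verify that known relationships between signals still hold after normalization: expected correlations between related signals, high rates for the validity and warm-up gates, and a constant schema version.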


In [None]:
# Pearson correlation is unchanged by per-feature positive affine scaling, so the
# correlation checks are meaningful on normalized features. The gate checks compare
# against exactly 1.0 and therefore assume those columns are stored unnormalized
# - TODO confirm against the exporter.
print("Sanity Check: Signal Relationships")
print("=" * 70)


def _correlation_verdict(corr, threshold, ok_message):
    """Return the message for a correlation check, treating NaN as its own case."""
    if np.isnan(corr):
        # np.corrcoef yields NaN for a constant or non-finite column; that is a
        # different problem from a genuinely weak correlation
        return "   ⚠️ Correlation undefined (constant or non-finite column)"
    if corr > threshold:
        return ok_message
    return "   ⚠️ Lower than expected correlation"


# true_ofi and depth_norm_ofi should be highly correlated (same base signal)
true_ofi = train_features[:, 84]
depth_norm_ofi = train_features[:, 85]
corr_ofi = np.corrcoef(true_ofi, depth_norm_ofi)[0, 1]
print(f"\n1. true_ofi ↔ depth_norm_ofi correlation: {corr_ofi:.4f}")
print(_correlation_verdict(
    corr_ofi, 0.7, "   ✅ High correlation (expected: same base signal)"
))

# trade_asymmetry and executed_pressure should be positively correlated
trade_asym = train_features[:, 88]
exec_pressure = train_features[:, 86]
corr_trade = np.corrcoef(trade_asym, exec_pressure)[0, 1]
print(f"\n2. trade_asymmetry ↔ executed_pressure correlation: {corr_trade:.4f}")
print(_correlation_verdict(
    corr_trade, 0.3, "   ✅ Positive correlation (expected: both measure trade imbalance)"
))

# book_valid should be 1 for most samples (data quality).
# Index comes from FeatureIndex so this check agrees with the categorical validation.
book_valid = train_features[:, FeatureIndex.BOOK_VALID]
valid_pct = (book_valid == 1.0).mean() * 100
print(f"\n3. book_valid = 1: {valid_pct:.2f}%")
if valid_pct > 95:
    print("   ✅ Most samples have valid book (expected for quality data)")
elif valid_pct > 80:
    print("   ⚠️ Some invalid book samples")
else:
    print("   ❌ Many invalid samples - data quality issue")

# mbo_ready should be 1 for most samples (after warmup)
mbo_ready = train_features[:, FeatureIndex.MBO_READY]
ready_pct = (mbo_ready == 1.0).mean() * 100
print(f"\n4. mbo_ready = 1: {ready_pct:.2f}%")
if ready_pct > 95:
    print("   ✅ Most samples are warmed up (expected)")
else:
    print("   ⚠️ Significant warmup period")

# schema_version should be constant and equal to SCHEMA_VERSION
schema = train_features[:, FeatureIndex.SCHEMA_VERSION_FEATURE]
unique_schema = np.unique(schema)
print(f"\n5. schema_version values: {unique_schema}")
if len(unique_schema) == 1 and unique_schema[0] == SCHEMA_VERSION:
    print(f"   ✅ Constant at {SCHEMA_VERSION} (expected)")
else:
    print("   ❌ Schema version inconsistency!")


## 7. Summary

Complete data profile for downstream analysis.


In [None]:
# Aggregate statistics across all splits.
# Earlier cells tolerate a missing split directory, so every step below must cope
# with an empty split instead of failing on np.concatenate([]) / [][0] / x/0.


def _stack_labels(days):
    """Concatenate the label arrays of a split; empty array if it has no days."""
    if not days:
        return np.array([], dtype=int)
    return np.concatenate([d['labels'] for d in days])


def _pct(part, whole):
    """Percentage of `part` in `whole`, 0.0 when `whole` is zero."""
    return 100 * part / whole if whole else 0.0


# Every loaded day in split order (train, then val, then test)
all_days = [d for split in SPLITS for d in all_data[split]]

total_samples = sum(d['features'].shape[0] for d in all_days)
total_labels = sum(d['labels'].shape[0] for d in all_days)
total_days = len(all_days)

train_days = len(all_data['train'])
val_days = len(all_data['val'])
test_days = len(all_data['test'])

val_labels = _stack_labels(all_data['val'])
test_labels = _stack_labels(all_data['test'])

print("="*70)
print("DATASET PROFILE SUMMARY")
print("="*70)

print(f"\nSymbol: NVDA")
if all_days:
    # First loaded day to last loaded day; with all splits present this is
    # train[0] .. test[-1]
    print(f"Date range: {all_days[0]['date']} to {all_days[-1]['date']}")
else:
    print("Date range: n/a")
print(f"Total trading days: {total_days}")

print(f"\n--- Sample Counts ---")
print(f"Total feature samples: {total_samples:,}")
print(f"Total labels: {total_labels:,}")
ratio_text = f"{total_samples/total_labels:.2f}" if total_labels else "n/a"
print(f"Feature/Label ratio: {ratio_text}")

print(f"\n--- Split Distribution ---")
print(f"Train: {train_days} days, {len(train_labels):,} labels ({_pct(len(train_labels), total_labels):.1f}%)")
print(f"Val:   {val_days} days, {len(val_labels):,} labels ({_pct(len(val_labels), total_labels):.1f}%)")
print(f"Test:  {test_days} days, {len(test_labels):,} labels ({_pct(len(test_labels), total_labels):.1f}%)")

print(f"\n--- Label Distribution (Train) ---")
for lbl in [LABEL_DOWN, LABEL_STABLE, LABEL_UP]:
    count = (train_labels == lbl).sum()
    print(f"{LABEL_NAMES[lbl]:8s}: {count:6,} ({_pct(count, len(train_labels)):5.2f}%)")

print(f"\n--- Data Quality ---")
print(f"NaN values: {nan_count} ({_pct(nan_count, train_features.size):.6f}%)")
print(f"Inf values: {inf_count} ({_pct(inf_count, train_features.size):.6f}%)")
print(f"book_valid=1: {valid_pct:.2f}%")
print(f"mbo_ready=1: {ready_pct:.2f}%")

print("\n" + "="*70)
print("✅ DATA OVERVIEW COMPLETE - Ready for analysis")
print("="*70)


In [None]:
# Persist the headline numbers as JSON so other notebooks can read them
import os
os.makedirs('../docs/figures', exist_ok=True)

# Training-set class counts, converted to plain ints for JSON
label_distribution = {
    key: int((train_labels == code).sum())
    for key, code in (('down', LABEL_DOWN), ('stable', LABEL_STABLE), ('up', LABEL_UP))
}

# Quality figures computed by the earlier cells of this notebook
data_quality = {
    'nan_count': int(nan_count),
    'inf_count': int(inf_count),
    'book_valid_pct': float(valid_pct),
    'mbo_ready_pct': float(ready_pct),
}

summary = {
    'total_samples': int(total_samples),
    'total_labels': int(total_labels),
    'total_days': total_days,
    'train_days': train_days,
    'val_days': val_days,
    'test_days': test_days,
    'feature_count': FEATURE_COUNT,
    'label_distribution': label_distribution,
    'data_quality': data_quality,
}

with open('../docs/data_overview_summary.json', 'w') as f:
    f.write(json.dumps(summary, indent=2))

print("Summary saved to docs/data_overview_summary.json")
