# Notebook 01: Label Analysis

**Purpose**: Deeply understand the labels we're trying to predict

**Critical Questions**:
1. Is the current horizon (50 samples) optimal?
2. Is the current threshold (8 bps) optimal?
3. Are labels clustered (autocorrelated) or random?
4. What are the transition probabilities between labels?

---

## Label Configuration

Current configuration (from `nvda_98feat.toml`):
- **Horizon**: 50 samples ahead
- **Smoothing**: 10-sample window
- **Threshold**: 8 bps (0.08%)
- **Labels**: -1=Down (>8bps drop), 0=Stable, 1=Up (>8bps rise)


In [None]:
# Standard imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from scipy import stats
from collections import Counter
import json
import sys

# Add src to path
sys.path.insert(0, str(Path.cwd().parent / 'src'))

from lobtrainer.constants import (
    FEATURE_COUNT, FeatureIndex,
    LABEL_DOWN, LABEL_STABLE, LABEL_UP, LABEL_NAMES
)

# Shared matplotlib look for every figure produced by this notebook.
plt.style.use('seaborn-v0_8-whitegrid')
plt.rcParams.update({
    'figure.figsize': (12, 6),
    'font.size': 10,
})

# Exported dataset directory, resolved two levels above the current working directory.
DATA_ROOT = Path.cwd().parent.parent.joinpath('data', 'exports', 'nvda_98feat')

print("Environment ready")


In [None]:
# Load training data
def load_split(split_name):
    """Load and concatenate every exported day of one dataset split.

    Scans ``DATA_ROOT / split_name`` for files named ``<date>_features.npy``
    (in sorted filename order) and loads the sibling ``<date>_labels.npy``
    for each. A ``<date>_metadata.json`` file is parsed when it exists.

    Args:
        split_name: Name of the split sub-directory, e.g. ``'train'``.

    Returns:
        dict with keys:
            ``'features'``: per-day feature arrays stacked with ``np.vstack``.
            ``'labels'``: per-day label arrays joined with ``np.concatenate``.
            ``'metadata'``: parsed JSON objects, one per day that has a
                metadata file (so it can be shorter than ``n_days``).
            ``'n_days'``: number of feature files loaded.

    Raises:
        ValueError: If no ``*_features.npy`` file exists for the split.
        FileNotFoundError: If a feature file has no matching label file
            (raised by ``np.load``).
    """
    split_dir = DATA_ROOT / split_name
    feature_files = sorted(split_dir.glob('*_features.npy'))
    if not feature_files:
        # Fail with an actionable message instead of numpy's generic
        # "need at least one array to concatenate".
        raise ValueError(
            f"No '*_features.npy' files found for split {split_name!r} in {split_dir}"
        )

    suffix = '_features'
    features_list = []
    labels_list = []
    metadata_list = []

    for feat_file in feature_files:
        # The glob guarantees the stem ends with the suffix; strip only that
        # trailing occurrence.
        date = feat_file.stem[:-len(suffix)]
        label_file = feat_file.with_name(f"{date}_labels.npy")
        meta_file = feat_file.with_name(f"{date}_metadata.json")

        features_list.append(np.load(feat_file))
        labels_list.append(np.load(label_file))

        if meta_file.exists():
            with open(meta_file, encoding='utf-8') as f:
                metadata_list.append(json.load(f))

    return {
        'features': np.vstack(features_list),
        'labels': np.concatenate(labels_list),
        'metadata': metadata_list,
        'n_days': len(features_list),
    }

# Load the 'train' split once; the remaining cells read from this dict.
train_data = load_split('train')
print(
    f"Training data: {train_data['features'].shape[0]:,} samples, "
    f"{len(train_data['labels']):,} labels, "
    f"{train_data['n_days']} days"
)


## 1. Label Distribution Analysis

How balanced are the labels? Is there class imbalance we need to address?


In [None]:
labels = train_data['labels']

# Basic distribution: distinct label values and how often each occurs.
unique, counts = np.unique(labels, return_counts=True)
total = len(labels)

print("Label Distribution:")
print("=" * 50)

# label value -> {'count', 'pct', 'name'}; reused by later cells.
label_dist = {}
for lbl, cnt in zip(unique, counts):
    lbl_key = int(lbl)
    cnt = int(cnt)  # plain int so the value is JSON-friendly downstream
    name = LABEL_NAMES.get(lbl_key, f"Unknown({lbl})")
    pct = 100 * cnt / total
    label_dist[lbl_key] = {'count': cnt, 'pct': pct, 'name': name}
    print(f"  {name:8s} (label={lbl_key:2d}): {cnt:7,} ({pct:5.2f}%)")

# Class imbalance metrics
max_count = max(counts)
min_count = min(counts)
imbalance_ratio = max_count / min_count

# Reuse the names resolved above so an unexpected label value is reported as
# "Unknown(...)" here too instead of raising KeyError.
majority_label = int(unique[np.argmax(counts)])
minority_label = int(unique[np.argmin(counts)])

print("\nClass Imbalance Metrics:")
print(f"  Max/Min ratio: {imbalance_ratio:.3f}")
print(f"  Majority class: {label_dist[majority_label]['name']}")
print(f"  Minority class: {label_dist[minority_label]['name']}")

if imbalance_ratio < 1.5:
    print("  ✅ Labels are well-balanced (ratio < 1.5)")
elif imbalance_ratio < 2.0:
    print("  ⚠️ Slight imbalance (consider weighting)")
else:
    print("  ❌ Significant imbalance (requires class weighting or resampling)")


In [None]:
# Visualize label distribution
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Colour classes by label value rather than by position, so a class that is
# absent from `unique` cannot shift the red/gray/green assignment.
label_colors = {
    LABEL_DOWN: '#e74c3c',    # Red
    LABEL_STABLE: '#95a5a6',  # Gray
    LABEL_UP: '#27ae60',      # Green
}
colors = [label_colors.get(int(lbl), '#34495e') for lbl in unique]
class_names = [LABEL_NAMES[lbl] for lbl in unique]

# Bar chart
ax1 = axes[0]
bars = ax1.bar(class_names, counts, color=colors, edgecolor='black')
ax1.set_xlabel('Label')
ax1.set_ylabel('Count')
ax1.set_title('Label Distribution')

# Add value labels on bars
for bar, cnt in zip(bars, counts):
    ax1.text(bar.get_x() + bar.get_width()/2, bar.get_height() + 500,
             f'{cnt:,}\n({100*cnt/total:.1f}%)', ha='center', va='bottom', fontsize=10)

# Pie chart
ax2 = axes[1]
wedges, texts, autotexts = ax2.pie(counts, labels=class_names,
                                    colors=colors, autopct='%1.1f%%', startangle=90)
ax2.set_title('Label Proportions')

plt.tight_layout()

# The output directory is otherwise only created by the final "save results"
# cell, so create it here to keep this cell runnable on a fresh checkout.
figures_dir = Path('../docs/figures')
figures_dir.mkdir(parents=True, exist_ok=True)
plt.savefig(figures_dir / 'label_distribution.png', dpi=150, bbox_inches='tight')
plt.show()

print(f"\nRandom baseline accuracy: {100/len(unique):.1f}% (predicting uniformly)")


## 2. Label Autocorrelation

Are labels clustered (trends persist) or random? This affects:
- Whether sequence models will help
- Whether we should predict sequences of labels
- Potential for momentum strategies


In [None]:
def compute_autocorrelation(series, max_lag=50):
    """
    Compute the sample autocorrelation function of a 1-D series.

    ACF(k) = Corr(X_t, X_{t+k})

    For each lag k the covariance is the mean of the n - k overlapping
    products of the mean-centred series, divided by the variance of the
    full series.

    Args:
        series: 1-D array-like of numeric values (NumPy array, list, ...).
        max_lag: Largest lag to evaluate.

    Returns:
        lags: array of lag values 0..max_lag
        acf: autocorrelation at each lag. acf[0] is always 1.0. A constant
            series yields all ones. Lags that are >= len(series) (and every
            lag of an empty series) are left at 0.
    """
    # asarray accepts plain sequences as well as arrays.
    series = np.asarray(series, dtype=float)
    n = len(series)
    lags = np.arange(max_lag + 1)

    acf = np.zeros(max_lag + 1)
    acf[0] = 1.0  # Correlation with self

    if n == 0:
        # Nothing to correlate; skip mean/var, which would only produce NaN
        # plus runtime warnings.
        return lags, acf

    var = series.var()
    if var == 0:
        return lags, np.ones(max_lag + 1)

    # Centre once instead of on every lag.
    centered = series - series.mean()
    for lag in range(1, min(max_lag, n - 1) + 1):
        # Correlation between series[:-lag] and series[lag:]
        acf[lag] = np.mean(centered[:-lag] * centered[lag:]) / var

    return lags, acf

# Autocorrelation of the label series up to `max_lag`.
max_lag = 100
lags, acf = compute_autocorrelation(labels, max_lag=max_lag)

# White-noise 95% band: ±1.96/sqrt(n)
ci = 1.96 / np.sqrt(len(labels))

print("Label Autocorrelation Analysis:")
print("=" * 50)
print(f"Sample size: {len(labels):,}")
print(f"95% CI for white noise: ±{ci:.4f}")
print("\nAutocorrelation at key lags:")
for lag in (1, 5, 10, 20, 50):
    if lag >= len(acf):
        continue
    # Flag lags whose ACF falls outside the white-noise band.
    sig = "***" if abs(acf[lag]) > ci else ""
    print(f"  Lag {lag:3d}: {acf[lag]:+.4f} {sig}")


In [None]:
# Visualize autocorrelation
fig, ax = plt.subplots(figsize=(14, 5))

ax.bar(lags, acf, color='steelblue', alpha=0.7, width=0.8)
ax.axhline(y=0, color='black', linestyle='-', linewidth=0.5)
# Dashed lines mark the white-noise band computed in the previous cell.
ax.axhline(y=ci, color='red', linestyle='--', alpha=0.7, label=f'95% CI (±{ci:.4f})')
ax.axhline(y=-ci, color='red', linestyle='--', alpha=0.7)

ax.set_xlabel('Lag')
ax.set_ylabel('Autocorrelation')
ax.set_title('Label Autocorrelation Function (ACF)')
ax.legend()
ax.set_xlim(-1, max_lag + 1)

plt.tight_layout()

# The output directory is otherwise only created by the final "save results"
# cell, so create it here to keep this cell runnable on a fresh checkout.
figures_dir = Path('../docs/figures')
figures_dir.mkdir(parents=True, exist_ok=True)
plt.savefig(figures_dir / 'label_autocorrelation.png', dpi=150, bbox_inches='tight')
plt.show()

# Interpretation, driven by the lag-1 autocorrelation only.
if acf[1] > 0.1:
    print("\n✅ Strong positive autocorrelation: Labels cluster (trends persist)")
    print("   → Sequence models may capture momentum patterns")
elif acf[1] > ci:
    print("\n⚠️ Weak positive autocorrelation: Some label persistence")
    print("   → Sequence information may help slightly")
elif acf[1] < -ci:
    print("\n⚠️ Negative autocorrelation: Mean-reversion in labels")
    print("   → Consider mean-reversion strategies")
else:
    print("\n❌ No significant autocorrelation: Labels appear random")
    print("   → Sequence models may not add value over point predictions")


## 3. Transition Matrix Analysis

What are the probabilities of transitioning between labels?

- High P(Up→Up) suggests momentum
- High P(Up→Down) suggests mean-reversion
- Rows that simply match the overall label frequencies suggest no memory (random-walk-like behavior)


In [None]:
def compute_transition_matrix(labels):
    """
    Compute the first-order transition matrix of a label sequence.

    P[i, j] = P(label_{t+1} = j | label_t = i)

    Args:
        labels: 1-D array-like of label values, in time order.

    Returns:
        transition_counts: (k, k) integer array of raw transition counts
        transition_probs: (k, k) float array of row-normalized probabilities;
            a row with no outgoing transitions (its label appears only as
            the final element) is all zeros
        label_order: list of the k distinct labels in ascending order,
            giving the row/column order of both matrices
    """
    # `codes[t]` is the row/column index of labels[t] within the sorted
    # unique values.
    uniq, codes = np.unique(np.asarray(labels), return_inverse=True)
    label_order = list(uniq)
    n_labels = len(label_order)

    # Count transitions between consecutive elements. np.add.at accumulates
    # repeated (from, to) pairs, which plain fancy-index `+=` would not.
    counts = np.zeros((n_labels, n_labels), dtype=int)
    np.add.at(counts, (codes[:-1], codes[1:]), 1)

    # Normalize to probabilities. `out` must be supplied: with `where=` alone,
    # entries of rows whose sum is 0 would be left uninitialized.
    row_sums = counts.sum(axis=1, keepdims=True)
    probs = np.divide(
        counts,
        row_sums,
        out=np.zeros(counts.shape, dtype=float),
        where=row_sums > 0,
    )

    return counts, probs, label_order

# NOTE: this rebinds `counts`, which an earlier cell used for per-label counts.
counts, probs, label_order = compute_transition_matrix(labels)

print("Transition Matrix Analysis:")
print("=" * 60)

# Build the header outside the replacement field: before Python 3.12 a
# backslash inside an f-string `{...}` expression is a SyntaxError.
corner = 'From \\ To'
header = f"{corner:>10s}" + "".join(f"{LABEL_NAMES[lbl]:>10s}" for lbl in label_order)

# Both tables share one layout; only the title, data and cell format differ.
for title, matrix, cell_fmt in (
    ("\nTransition Counts:", counts, ">10,"),
    ("\nTransition Probabilities:", probs, ">10.3f"),
):
    print(title)
    print(header)
    for from_lbl, row in zip(label_order, matrix):
        cells = "".join(format(value, cell_fmt) for value in row)
        print(f"{LABEL_NAMES[from_lbl]:>10s}" + cells)


In [None]:
# Visualize transition matrix
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Probability heatmap
ax1 = axes[0]
label_names_ordered = [LABEL_NAMES[lbl] for lbl in label_order]
sns.heatmap(probs, annot=True, fmt='.3f', cmap='Blues', ax=ax1,
            xticklabels=label_names_ordered, yticklabels=label_names_ordered,
            vmin=0, vmax=1)
ax1.set_xlabel('To')
ax1.set_ylabel('From')
ax1.set_title('Transition Probabilities P(To | From)')

# Compare to stationary (random) probabilities: if the next label were
# independent of the current one, every row would equal the overall label
# frequencies.
ax2 = axes[1]
stationary_probs = np.array([label_dist[lbl]['pct']/100 for lbl in label_order])
expected_probs = np.outer(np.ones(len(label_order)), stationary_probs)
diff = probs - expected_probs

sns.heatmap(diff, annot=True, fmt='+.3f', cmap='RdBu_r', ax=ax2,
            xticklabels=label_names_ordered, yticklabels=label_names_ordered,
            center=0, vmin=-0.1, vmax=0.1)
ax2.set_xlabel('To')
ax2.set_ylabel('From')
ax2.set_title('Deviation from Stationary Distribution')

plt.tight_layout()

# The output directory is otherwise only created by the final "save results"
# cell, so create it here to keep this cell runnable on a fresh checkout.
figures_dir = Path('../docs/figures')
figures_dir.mkdir(parents=True, exist_ok=True)
plt.savefig(figures_dir / 'transition_matrix.png', dpi=150, bbox_inches='tight')
plt.show()

# Interpretation: report only classes whose self-transition probability
# differs from their overall frequency by more than 5 percentage points.
print("\nKey Insights:")
diagonal = np.diag(probs)
for i, lbl in enumerate(label_order):
    name = LABEL_NAMES[lbl]
    stationary = stationary_probs[i]
    persist = diagonal[i]
    if persist > stationary + 0.05:
        print(f"  • {name} persists: P({name}→{name}) = {persist:.1%} > stationary {stationary:.1%}")
    elif persist < stationary - 0.05:
        print(f"  • {name} reverts: P({name}→{name}) = {persist:.1%} < stationary {stationary:.1%}")


## 4. Signal-Label Correlation

Quick preview of which signals correlate with labels (detailed analysis in Notebook 04).


In [None]:
# Align features with labels
# Features are at higher frequency; labels align with sequence ends
features = train_data['features']
stride = 10  # From config
window = 100  # From config

# Sample features at label positions (end of each sequence)
n_labels = len(labels)

# Feature row paired with label i: i * stride + window (the same value as
# (i + 1) * stride + window - stride), computed for all labels at once.
# NOTE(review): these indices run over the concatenated multi-day arrays; if
# sequences are built per day, the mapping would drift at each day boundary —
# TODO confirm against the export pipeline.
last_row = features.shape[0] - 1
raw_rows = np.arange(n_labels) * stride + window

# Rows past the end are clamped to the last row. Report how many, because
# clamped labels are all paired with the same feature vector.
n_clamped = int(np.count_nonzero(raw_rows > last_row))
if n_clamped:
    print(f"Warning: {n_clamped:,} of {n_labels:,} labels point past the last feature row and were clamped to it")

aligned_features = features[np.minimum(raw_rows, last_row)]
print(f"Aligned {len(aligned_features)} feature samples with {len(labels)} labels")

# Signal indices and names
signal_indices = {
    84: 'true_ofi',
    85: 'depth_norm_ofi',
    86: 'executed_pressure',
    87: 'signed_mp_delta_bps',
    88: 'trade_asymmetry',
    89: 'cancel_asymmetry',
    90: 'fragility_score',
    91: 'depth_asymmetry',
}

# Compute correlations (Pearson r between each signal column and the labels)
print("\nSignal-Label Correlations:")
print("=" * 50)
correlations = {}
for idx, name in signal_indices.items():
    signal = aligned_features[:, idx]
    corr = np.corrcoef(signal, labels)[0, 1]
    correlations[name] = corr
    # Stars mark |r| magnitude bands (>0.05, >0.02, >0.01), not p-values.
    significance = "***" if abs(corr) > 0.05 else "**" if abs(corr) > 0.02 else "*" if abs(corr) > 0.01 else ""
    print(f"  {name:25s}: {corr:+.4f} {significance}")

# Sort by absolute correlation. A NaN correlation (e.g. from a constant
# signal) is ranked last; NaN keys would otherwise make the order arbitrary.
sorted_corrs = sorted(
    correlations.items(),
    key=lambda item: -1.0 if np.isnan(item[1]) else abs(item[1]),
    reverse=True,
)
print("\nTop Predictors:")
for name, corr in sorted_corrs[:5]:
    print(f"  {name:25s}: {corr:+.4f}")


## 5. Label Dynamics by Time Regime

Do label patterns vary by market session?


In [None]:
# Regime code of the feature row paired with each label.
time_regimes = aligned_features[:, FeatureIndex.TIME_REGIME]

regime_names = {
    0: 'Open (9:30-9:45)',
    1: 'Early (9:45-10:30)',
    2: 'Midday (10:30-15:30)',
    3: 'Close (15:30-16:00)',
    4: 'Closed'
}

print("Label Distribution by Time Regime:")
print("=" * 70)

regime_stats = []
for regime in sorted(np.unique(time_regimes)):
    mask = time_regimes == regime
    regime_labels = labels[mask]

    # Regimes with fewer than 100 labels are left out of the report.
    if len(regime_labels) >= 100:
        regime_name = regime_names.get(int(regime), 'Unknown')
        n_samples = len(regime_labels)

        # Share of each class within this regime, in percent.
        up_pct = 100 * (regime_labels == LABEL_UP).mean()
        down_pct = 100 * (regime_labels == LABEL_DOWN).mean()
        stable_pct = 100 * (regime_labels == LABEL_STABLE).mean()

        # Correlation of column 84 (true_ofi) with the labels inside this regime.
        regime_ofi = aligned_features[mask, 84]
        ofi_corr = np.corrcoef(regime_ofi, regime_labels)[0, 1]

        regime_stats.append(dict(
            regime=int(regime),
            name=regime_name,
            n_samples=n_samples,
            up_pct=up_pct,
            down_pct=down_pct,
            stable_pct=stable_pct,
            ofi_corr=ofi_corr,
        ))

        print(f"\n{regime_name} (n={n_samples:,}):")
        print(f"  Up: {up_pct:.1f}%, Down: {down_pct:.1f}%, Stable: {stable_pct:.1f}%")
        print(f"  true_ofi correlation: {ofi_corr:+.4f}")

df_regimes = pd.DataFrame(regime_stats)
print("\n" + df_regimes.to_string(index=False))


## 6. Summary & Key Findings

Synthesize label analysis findings to guide modeling decisions.


In [None]:
print("=" * 70)
print("LABEL ANALYSIS SUMMARY")
print("=" * 70)

# Classify the lag-1 autocorrelation by sign as well as size. A weak but
# significant positive value (between ci and 0.05) must not be reported as
# mean reversion.
if acf[1] > 0.05:
    acf_assessment = 'Labels cluster (momentum)'
elif acf[1] > ci:
    acf_assessment = 'Weak label persistence'
elif acf[1] < -ci:
    acf_assessment = 'Labels mean-revert'
else:
    acf_assessment = 'Labels appear random'

print(f"""
1. LABEL DISTRIBUTION
   • Total labels: {len(labels):,}
   • Class balance: Max/Min ratio = {imbalance_ratio:.2f}
   • Down: {label_dist[LABEL_DOWN]['pct']:.1f}%, Stable: {label_dist[LABEL_STABLE]['pct']:.1f}%, Up: {label_dist[LABEL_UP]['pct']:.1f}%
   • Assessment: {'Well-balanced' if imbalance_ratio < 1.5 else 'Needs class weighting'}

2. AUTOCORRELATION
   • Lag-1 ACF: {acf[1]:.4f}
   • Lag-5 ACF: {acf[5]:.4f}
   • Assessment: {acf_assessment}

3. TRANSITION DYNAMICS
   • P(Up→Up): {probs[label_order.index(LABEL_UP), label_order.index(LABEL_UP)]:.3f}
   • P(Down→Down): {probs[label_order.index(LABEL_DOWN), label_order.index(LABEL_DOWN)]:.3f}
   • Assessment: {'Momentum patterns present' if np.diag(probs).mean() > 0.4 else 'Near-random transitions'}

4. SIGNAL PREDICTABILITY
   • Best predictor: {sorted_corrs[0][0]} (r={sorted_corrs[0][1]:+.4f})
   • Second best: {sorted_corrs[1][0]} (r={sorted_corrs[1][1]:+.4f})
   • Assessment: {'Signals have predictive power' if abs(sorted_corrs[0][1]) > 0.05 else 'Weak signal-label relationship'}

5. REGIME EFFECTS
   • Strongest OFI correlation: {max(df_regimes['ofi_corr']):.4f} in {df_regimes.loc[df_regimes['ofi_corr'].idxmax(), 'name']}
   • Weakest OFI correlation: {min(df_regimes['ofi_corr']):.4f} in {df_regimes.loc[df_regimes['ofi_corr'].idxmin(), 'name']}
   • Assessment: {'Regime-specific models may help' if df_regimes['ofi_corr'].std() > 0.02 else 'Uniform predictability across regimes'}
""")

print("=" * 70)
print("RECOMMENDATIONS")
print("=" * 70)

# One recommendation per finding: class balance, label persistence, and
# strength of the best single signal.
recommendations = []
if imbalance_ratio < 1.5:
    recommendations.append("✅ Use standard cross-entropy loss (balanced classes)")
else:
    recommendations.append("⚠️ Use class-weighted loss or oversampling")

if acf[1] > 0.05:
    recommendations.append("✅ Sequence models (LSTM/Transformer) likely beneficial")
else:
    recommendations.append("⚠️ Point prediction models may suffice")

if abs(sorted_corrs[0][1]) > 0.05:
    recommendations.append(f"✅ Focus on {sorted_corrs[0][0]} as primary feature")
else:
    recommendations.append("⚠️ Need feature engineering or more signals")

for rec in recommendations:
    print(f"  {rec}")

print("\n" + "=" * 70)
print("✅ LABEL ANALYSIS COMPLETE")
print("=" * 70)


In [None]:
# Persist the headline numbers from the cells above as JSON.
import os
os.makedirs('../docs/figures', exist_ok=True)

analysis_results = {
    # Per-class counts plus the max/min imbalance ratio.
    'label_distribution': {
        **{
            key: int(label_dist[lbl]['count'])
            for key, lbl in (('down', LABEL_DOWN), ('stable', LABEL_STABLE), ('up', LABEL_UP))
        },
        'imbalance_ratio': float(imbalance_ratio),
    },
    # ACF at lags 1, 5 and 10 together with the white-noise band.
    'autocorrelation': {
        **{f'lag_{k}': float(acf[k]) for k in (1, 5, 10)},
        'confidence_interval': float(ci),
    },
    'transition_matrix': probs.tolist(),
    'signal_correlations': {name: float(value) for name, value in correlations.items()},
    'top_predictors': [(name, float(value)) for name, value in sorted_corrs[:5]],
    'regime_stats': df_regimes.to_dict('records'),
}

with open('../docs/label_analysis_results.json', 'w') as f:
    json.dump(analysis_results, f, indent=2)

print("Results saved to docs/label_analysis_results.json")
