# Backtesting with Data Vintages

This notebook demonstrates production-realistic backtesting using point-in-time data vintages.

**Topics Covered:**
1. Creating synthetic vintage data
2. VintageCV setup for time-series cross-validation
3. WorkflowSet backtesting across multiple models
4. Vintage drift analysis
5. Forecast horizon performance
6. Comparing vintage vs final data
7. Production forecasting workflow

**Use Case:** Commodity price forecasting with data revisions

**Why Vintages Matter:**
- Real-world data gets revised (GDP, employment, earnings)
- Backtesting on "final" (fully revised) data = data leakage, because those values were not available at forecast time
- Vintage data simulates production conditions

In [None]:
# Import libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta

from py_parsnip import linear_reg, rand_forest
from py_workflows import workflow
from py_workflowsets import WorkflowSet
from py_backtest import create_vintage_data, VintageCV, validate_vintage_data
from py_yardstick import metric_set, rmse, mae, r_squared

# Seed NumPy's global RNG so the np.random draws in later cells are reproducible
np.random.seed(42)
# Use seaborn's 'whitegrid' style for the matplotlib figures in this notebook
sns.set_style('whitegrid')

print("Imports successful!")

## 1. Generate Commodity Price Data

Create realistic commodity price time series with:
- Trend component
- Seasonal pattern
- Exogenous predictors (USD index, demand)
- Random shocks

In [None]:
# Build a synthetic monthly commodity price series (60 month-start dates = 5 years)
n_months = 60
start_date = pd.to_datetime('2020-01-01')
dates = pd.date_range(start=start_date, periods=n_months, freq='MS')
month_idx = np.arange(n_months)

# Deterministic structure: linear trend from 50 to 80 plus a 12-month sine cycle
trend = np.linspace(50, 80, n_months)
seasonality = 10 * np.sin(2 * np.pi * month_idx / 12)

# Exogenous drivers are random walks around a base level (100 for USD, 80 for demand).
# The draw order (USD, demand, noise) is kept fixed so a given seed yields the same data.
usd_index = 100 + np.cumsum(np.random.randn(n_months) * 2)
demand_index = 80 + np.cumsum(np.random.randn(n_months) * 1.5)
noise = np.random.randn(n_months) * 3

# Price = structure + driver effects (measured from each base level) + noise
usd_effect = 0.2 * (usd_index - 100)
demand_effect = 0.3 * (demand_index - 80)
true_price = trend + seasonality + usd_effect + demand_effect + noise

# "Final" dataset: the values we would see with perfect hindsight (no revisions)
final_columns = {
    'date': dates,
    'price': true_price,
    'usd_index': usd_index,
    'demand_index': demand_index,
}
final_data = pd.DataFrame(final_columns)

first_day = final_data['date'].min().date()
last_day = final_data['date'].max().date()

print(f"Generated {len(final_data)} months of commodity price data")
print(f"Date range: {first_day} to {last_day}")
print(f"\nData summary:")
print(final_data.describe())

In [None]:
# Visualize the data: one panel per series, stacked vertically in a single figure
fig, axes = plt.subplots(3, 1, figsize=(14, 10))

# (column, legend label, y-axis label, panel title, line colour; None = matplotlib default)
panel_specs = [
    ('price', 'Price', 'Price ($)', 'Commodity Price Over Time', None),
    ('usd_index', 'USD Index', 'USD Index', 'USD Strength Index', 'orange'),
    ('demand_index', 'Demand Index', 'Demand Index', 'Demand Index', 'green'),
]

for panel, (column, legend_label, y_label, panel_title, line_color) in zip(axes, panel_specs):
    line_kwargs = {'linewidth': 2, 'label': legend_label}
    if line_color is not None:
        line_kwargs['color'] = line_color

    panel.plot(final_data['date'], final_data[column], **line_kwargs)
    panel.set_xlabel('Date')
    panel.set_ylabel(y_label)
    panel.set_title(panel_title)
    panel.legend()
    panel.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 2. Create Vintage Data

Simulate data revisions by creating multiple vintages.

**Key Concepts:**
- **Observation Date (date):** When the event occurred
- **As-of Date (as_of_date):** When that version of the data became available
- **Vintages:** Multiple snapshots of the same observation at different as-of dates

**Real-world examples:**
- GDP gets revised 3+ times
- Employment data revised monthly
- Corporate earnings restated

In [None]:
# Create synthetic vintage data from the final dataset, with 5% revision noise.
# NOTE(review): this header previously said "3 revisions" while n_revisions=4 is passed.
# Whether 4 means "initial release + 3 revisions" depends on create_vintage_data — TODO confirm.
vintage_data = create_vintage_data(
    final_data=final_data,
    date_col='date',
    n_revisions=4,  # Create 4 vintages per observation
    revision_std=0.05,  # 5% std deviation in revisions
    revision_delay_months=1,  # First revision available 1 month after observation
    value_cols=['price', 'usd_index', 'demand_index']  # Columns to revise
)

print(f"\nVintage data created!")
print(f"Total rows: {len(vintage_data)} (original: {len(final_data)})")
# Floor division: exact only if every observation has the same number of vintage rows
print(f"Vintages per observation: {len(vintage_data) // len(final_data)}")
print(f"\nColumns: {vintage_data.columns.tolist()}")
print(f"\nFirst few rows:")
print(vintage_data.head(12))

In [None]:
# Validate vintage data structure
# NOTE(review): the return value is ignored, so the success message below assumes
# validate_vintage_data raises on invalid input — confirm against its documentation.
validate_vintage_data(
    data=vintage_data,
    as_of_col='as_of_date',
    date_col='date'
)
print("✓ Vintage data validation passed!")

# Examine revisions for a specific observation: all rows for one observation date,
# ordered from the earliest to the latest as-of date
example_date = pd.to_datetime('2020-06-01')
example_vintages = vintage_data[vintage_data['date'] == example_date].sort_values('as_of_date')

print(f"\nRevisions for {example_date.date()}:")
print(example_vintages[['date', 'as_of_date', 'price', 'usd_index', 'demand_index']])

# Calculate revision magnitude: absolute percentage change from the earliest to the
# latest vintage of the price (only meaningful when more than one vintage exists)
if len(example_vintages) > 1:
    first_vintage = example_vintages.iloc[0]['price']
    final_vintage = example_vintages.iloc[-1]['price']
    revision_pct = abs((final_vintage - first_vintage) / first_vintage) * 100
    print(f"\nPrice revision: {first_vintage:.2f} → {final_vintage:.2f} ({revision_pct:.1f}% change)")

In [None]:
# Visualize vintage revisions: revision paths for a few observations (left) and the
# distribution of first-to-last revision sizes across all observations (right)
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Select a few observations to track revisions
sample_dates = final_data['date'][::12][:3]  # Every 12 months, first 3

for sample_date in sample_dates:
    # Order each observation's vintages by release date so the line reads left to right
    vintages = vintage_data[vintage_data['date'] == sample_date].sort_values('as_of_date')

    axes[0].plot(vintages['as_of_date'], vintages['price'],
                marker='o', label=f'Obs: {sample_date.date()}')

axes[0].set_xlabel('As-of Date (When Data Available)')
axes[0].set_ylabel('Price')
axes[0].set_title('Price Revisions Over Time')
axes[0].legend()
axes[0].grid(True, alpha=0.3)

# Revision magnitude distribution.
# groupby().first()/.last() return the first/last row *in frame order*, so sort by
# as-of date within each observation first; otherwise "first" and "final" vintage
# depend on however vintage_data happens to be ordered.
price_by_obs = vintage_data.sort_values(['date', 'as_of_date']).groupby('date')['price']
first_vintages = price_by_obs.first()
final_vintages = price_by_obs.last()
revision_pct = abs((final_vintages - first_vintages) / first_vintages * 100)

axes[1].hist(revision_pct, bins=20, edgecolor='black', alpha=0.7)
axes[1].set_xlabel('Absolute Revision (%)')
axes[1].set_ylabel('Count')
axes[1].set_title('Distribution of Price Revisions')
axes[1].axvline(revision_pct.mean(), color='red', linestyle='--',
               label=f'Mean: {revision_pct.mean():.1f}%')
axes[1].legend()
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

## 3. VintageCV: Cross-Validation with Vintages

Create vintage-aware CV splits that respect point-in-time constraints.

In [None]:
# Cross-validation settings, defined once so the VintageCV call and the printed
# summary below cannot drift apart when a value is changed
initial_window = '2 years'   # Initial training window
assess_window = '6 months'   # Test/assessment period
skip_window = '3 months'     # Gap between folds
use_cumulative = False       # Rolling window (not expanding)

# Create VintageCV
vintage_cv = VintageCV(
    data=vintage_data,
    as_of_col='as_of_date',
    date_col='date',
    initial=initial_window,
    assess=assess_window,
    skip=skip_window,
    cumulative=use_cumulative,
    vintage_selection='latest'  # Use most recent vintage available
)

window_type = 'Expanding' if use_cumulative else 'Rolling'

print(f"VintageCV created with {vintage_cv.n_splits} splits")
print(f"\nConfiguration:")
print(f"  Initial training: {initial_window}")
print(f"  Assessment period: {assess_window}")
print(f"  Skip between folds: {skip_window}")
print(f"  Window type: {window_type} (cumulative={use_cumulative})")

In [None]:
# Examine CV splits: print the observation-date range, row count, and latest as-of
# date of the train and test frames for the first few splits
print("\nCV Split Details:")
print("="*80)

for i, split in enumerate(vintage_cv.splits[:3]):  # Show first 3 splits
    train_data = split.train_data()
    test_data = split.test_data()

    train_dates = train_data['date']
    test_dates = test_data['date']
    # Latest release date present in each frame
    train_as_of = train_data['as_of_date'].max()
    test_as_of = test_data['as_of_date'].max()

    print(f"\nSplit {i+1}:")
    print(f"  Train dates: {train_dates.min().date()} to {train_dates.max().date()} ({len(train_data)} rows)")
    print(f"  Test dates:  {test_dates.min().date()} to {test_dates.max().date()} ({len(test_data)} rows)")
    print(f"  Train as-of: {train_as_of.date()}")
    print(f"  Test as-of:  {test_as_of.date()}")
    # NOTE(review): this line is informational only; nothing in this cell checks the
    # printed dates for leakage — compare the train as-of date with the test dates by eye.
    print(f"  → Ensures no future information leakage!")

In [None]:
# Visualize CV splits: one horizontal lane per split, with a translucent bar for the
# training span and a more opaque, black-outlined bar for the assessment span
fig, ax = plt.subplots(figsize=(14, 6))

# One colour per split, sampled evenly from the Set3 colormap
colors = plt.cm.Set3(np.linspace(0, 1, vintage_cv.n_splits))

for i, split in enumerate(vintage_cv.splits):
    train_obs_dates = split.train_data()['date']
    test_obs_dates = split.test_data()['date']

    train_start, train_end = train_obs_dates.min(), train_obs_dates.max()
    test_start, test_end = test_obs_dates.min(), test_obs_dates.max()

    # Only the first three training bars get a legend entry, to keep the legend short
    train_label = f'Split {i+1} Train' if i < 3 else None

    # Bar widths are whole days; `left` anchors each bar at the span's first date
    ax.barh(i, (train_end - train_start).days,
            left=train_start, height=0.3,
            color=colors[i], alpha=0.6, label=train_label)

    ax.barh(i, (test_end - test_start).days,
            left=test_start, height=0.3,
            color=colors[i], alpha=0.9, edgecolor='black', linewidth=2)

ax.set_xlabel('Date')
ax.set_ylabel('CV Split')
ax.set_title('VintageCV Time Splits (Dark = Test, Light = Train)')
ax.legend(loc='upper left')
ax.grid(True, alpha=0.3, axis='x')
plt.tight_layout()
plt.show()

## 4. WorkflowSet Backtesting

Evaluate multiple model workflows across all vintage CV splits.

In [None]:
# Create multiple workflows for comparison: two formula specifications ...
formulas = [
    "price ~ usd_index + demand_index",  # Simple linear
    "price ~ usd_index + demand_index + I(usd_index**2)",  # With polynomial
]

# ... and two model specifications (a linear regression and a 50-tree random forest)
models = [
    linear_reg(),
    rand_forest(trees=50, min_n=5).set_mode('regression')
]

# Create WorkflowSet
# presumably from_cross pairs every formula with every model (2 x 2 = 4 workflows) —
# the count printed below confirms it at run time
wf_set = WorkflowSet.from_cross(preproc=formulas, models=models)

print(f"Created WorkflowSet with {len(wf_set.workflows)} workflows:")
for wf_id in wf_set.workflows.keys():
    print(f"  - {wf_id}")

In [None]:
# Run backtest across all CV splits, scoring each workflow with RMSE, MAE and R²
print("\nRunning backtest...")
print("This evaluates all workflows on all CV splits with vintage data")

# backtest_results is reused by the analysis cells that follow
backtest_results = wf_set.fit_backtests(
    vintage_cv,
    metrics=metric_set(rmse, mae, r_squared)
)

print("\nBacktest complete!")
print(f"Evaluated {len(wf_set.workflows)} workflows × {vintage_cv.n_splits} CV splits")

## 5. Analyze Backtest Results

Examine performance across workflows and time periods.

In [None]:
# Collect metrics across all splits from the backtest results object
metrics_summary = backtest_results.collect_metrics()

print("\nBacktest Metrics Summary:")
print(metrics_summary)

In [None]:
# Rank workflows by performance on RMSE (n=10 requests up to ten rows).
# `ranked` is reused later: row 0 is treated as the best workflow.
ranked = backtest_results.rank_results('rmse', n=10)

print("\nWorkflow Rankings (by RMSE):")
# The ranking frame is expected to carry per-metric mean/std summary columns
print(ranked[['wflow_id', 'rmse_mean', 'rmse_std', 'mae_mean', 'r_squared_mean']])

In [None]:
# Visualize workflow comparison: the same plot for each metric, differing only in
# the metric name passed to autoplot and the figure title
metric_titles = [
    ('rmse', 'Backtest Performance Comparison (RMSE)'),
    ('r_squared', 'Backtest Performance Comparison (R²)'),
]

for metric_name, plot_title in metric_titles:
    backtest_results.autoplot(metric_name, top_n=4)
    # plt.title acts on pyplot's current axes, i.e. the plot autoplot just drew
    plt.title(plot_title)
    plt.tight_layout()
    plt.show()

## 6. Vintage Drift Analysis

Analyze how model performance changes over time (concept drift).

In [None]:
# Analyze vintage drift on RMSE; the result is indexed below by its
# 'wflow_id', 'vintage_date' and 'rmse' columns
drift_analysis = backtest_results.analyze_vintage_drift('rmse')

print("\nVintage Drift Analysis (RMSE over time):")
print(drift_analysis)

In [None]:
# Visualize drift over time: one RMSE-vs-vintage-date line per workflow
fig, ax = plt.subplots(figsize=(12, 6))

# First 4 workflow ids in order of appearance in drift_analysis — not necessarily
# the 4 best-ranked ones
for wflow_id in drift_analysis['wflow_id'].unique()[:4]:
    wflow_data = drift_analysis[drift_analysis['wflow_id'] == wflow_id]
    ax.plot(wflow_data['vintage_date'], wflow_data['rmse'],
           marker='o', label=wflow_id, linewidth=2)

ax.set_xlabel('Vintage Date (Forecast Date)')
ax.set_ylabel('RMSE')
ax.set_title('Model Performance Over Time (Vintage Drift)')
ax.legend()
ax.grid(True, alpha=0.3)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

print("\nInterpretation:")
print("- Increasing RMSE over time suggests concept drift")
print("- Stable RMSE suggests robust model")
print("- Models may perform differently in different time periods")

## 7. Forecast Horizon Analysis

Examine how accuracy degrades with forecast horizon.

In [None]:
# Analyze forecast horizon performance on RMSE; the plotting cell that follows reads
# the 'wflow_id', 'horizon_months', 'rmse' and 'r_squared' columns of this result
horizon_analysis = backtest_results.analyze_forecast_horizon('rmse')

print("\nForecast Horizon Analysis:")
print(horizon_analysis)

In [None]:
# Visualize horizon degradation: RMSE (left) and R² (right) against forecast horizon,
# one line per workflow
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# (metric column, axis/title label) for the left and right panel respectively.
# NOTE(review): horizon_analysis was requested for 'rmse'; confirm it also carries
# an 'r_squared' column, otherwise the right panel raises a KeyError.
panel_metrics = [('rmse', 'RMSE'), ('r_squared', 'R²')]

# First 4 workflow ids in order of appearance
shown_workflows = horizon_analysis['wflow_id'].unique()[:4]

for panel, (metric_col, metric_label) in zip(axes, panel_metrics):
    for wflow_id in shown_workflows:
        wflow_data = horizon_analysis[horizon_analysis['wflow_id'] == wflow_id]
        panel.plot(wflow_data['horizon_months'], wflow_data[metric_col],
                   marker='o', label=wflow_id, linewidth=2)

    panel.set_xlabel('Forecast Horizon (Months)')
    panel.set_ylabel(metric_label)
    panel.set_title(f'{metric_label} vs Forecast Horizon')
    panel.legend()
    panel.grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print("\nTypical pattern: Accuracy decreases with longer horizons")

## 8. Compare Vintage vs Final Data

Demonstrate the importance of using vintage data for realistic evaluation.

In [None]:
# Fit model on FINAL data (data leakage!): the first formula with the first model,
# trained on the fully revised dataset, including the period it is then scored on
wf_final = workflow().add_formula(formulas[0]).add_model(models[0])
fit_final = wf_final.fit(final_data)

# Evaluate on same test period as last CV split
last_split = vintage_cv.splits[-1]
test_dates_last = last_split.test_data()['date'].unique()

# Reset the index so the truth column and the predictions share a 0..n-1 index and
# cannot be misaligned if the metric aligns Series by label
final_test = final_data[final_data['date'].isin(test_dates_last)].reset_index(drop=True)
preds_final = fit_final.predict(final_test)

rmse_final = rmse(final_test['price'], preds_final['.pred']).iloc[0]['value']

# Compare with vintage-based backtest
# NOTE(review): this is not a like-for-like comparison — rmse_final is one workflow
# scored in-sample on the last split's dates, while rmse_vintage is the best
# workflow's mean over all splits. Treat the gap as illustrative.
rmse_vintage = ranked.iloc[0]['rmse_mean']  # Best vintage-based model

print("\n" + "="*70)
print("VINTAGE vs FINAL DATA COMPARISON")
print("="*70)
print(f"\nRMSE with FINAL data (data leakage):  {rmse_final:.4f}")
print(f"RMSE with VINTAGE data (realistic):   {rmse_vintage:.4f}")
print(f"\nDifference: {abs(rmse_vintage - rmse_final):.4f}")
print(f"Vintage RMSE is {(rmse_vintage/rmse_final - 1)*100:+.1f}% vs final")

# Only claim optimism when the numbers actually show it; the data is synthetic and
# random, so the direction of the gap is not guaranteed on every run
if rmse_final < rmse_vintage:
    print(f"\nUsing final data gives OVERLY OPTIMISTIC estimates!")
    print(f"Vintage backtesting provides REALISTIC production performance.")
else:
    print(f"\nFinal-data RMSE is not lower than the vintage RMSE in this run,")
    print(f"so this comparison does not show optimism from using final data.")

## 9. Production Forecasting Workflow

Demonstrate the full production workflow with vintage-aware training.

In [None]:
# Select best workflow from backtest (row 0 of the RMSE ranking)
best_wflow_id = ranked.iloc[0]['wflow_id']
best_wf = wf_set.workflows[best_wflow_id]

print(f"Best workflow: {best_wflow_id}")

# Get latest available vintage for training.
# For every observation date, keep the most recently released row available as of
# `latest_as_of`. Filtering on `as_of_date == latest_as_of` instead would keep only
# the rows released on that single date — typically just the newest observation(s) —
# and leave the production model with almost no training data.
latest_as_of = vintage_data['as_of_date'].max()
latest_vintage = (
    vintage_data[vintage_data['as_of_date'] <= latest_as_of]
    .sort_values(['date', 'as_of_date'])
    .drop_duplicates(subset='date', keep='last')  # newest release per observation
    .reset_index(drop=True)                        # rows end up ordered by date
)

print(f"\nLatest vintage as-of: {latest_as_of.date()}")
print(f"Training data: {len(latest_vintage)} observations")
print(f"Date range: {latest_vintage['date'].min().date()} to {latest_vintage['date'].max().date()}")

In [None]:
# Train production model on latest vintage using the best-ranked workflow
prod_model = best_wf.fit(latest_vintage)

print("\nProduction model trained on latest vintage data!")
print(f"Model: {prod_model.spec.model_type}")
print(f"Engine: {prod_model.spec.engine}")

# Get coefficients: extract_outputs returns three objects, unpacked here as
# (outputs, coeffs, stats)
outputs, coeffs, stats = prod_model.extract_outputs()

print("\nModel Coefficients:")
# NOTE(review): the best workflow is chosen at run time and may be the random forest;
# confirm that coeffs still has 'std_error' and 'p_value' columns in that case,
# otherwise this selection raises a KeyError.
print(coeffs[['variable', 'estimate', 'std_error', 'p_value']])

print("\nModel Statistics:")
print(stats[['split', 'rmse', 'mae', 'r_squared', 'n']])

In [None]:
# Generate forecasts for the next n_forecast_months months
n_forecast_months = 6  # single source for the horizon used throughout this cell
last_date = latest_vintage['date'].max()
forecast_dates = pd.date_range(
    start=last_date + pd.DateOffset(months=1), periods=n_forecast_months, freq='MS'
)

# Create forecast data (would come from external source in production)
# For demo, continue each predictor as a random walk from its most recent value.
# The most recent row is picked explicitly by date (then as-of date) rather than
# relying on the row order of latest_vintage.
last_obs = latest_vintage.sort_values(['date', 'as_of_date']).iloc[-1]
last_usd = last_obs['usd_index']
last_demand = last_obs['demand_index']

forecast_data = pd.DataFrame({
    'date': forecast_dates,
    'usd_index': last_usd + np.cumsum(np.random.randn(n_forecast_months) * 2),
    'demand_index': last_demand + np.cumsum(np.random.randn(n_forecast_months) * 1.5)
})

# Generate forecasts
forecasts = prod_model.predict(forecast_data)

# Predictions side by side with the predictor values that produced them
forecast_summary = pd.DataFrame({
    'date': forecast_dates,
    'predicted_price': forecasts['.pred'].values,
    'usd_index': forecast_data['usd_index'].values,
    'demand_index': forecast_data['demand_index'].values
})

print(f"\nProduction Forecasts (Next {n_forecast_months} Months):")
print(forecast_summary)

In [None]:
# Visualize production forecast: historical prices followed by the forecast path
fig, ax = plt.subplots(figsize=(14, 6))

# Historical data (latest vintage)
# NOTE(review): the line is drawn in row order — assumes latest_vintage is sorted by
# 'date'; verify, otherwise the line will zig-zag.
ax.plot(latest_vintage['date'], latest_vintage['price'],
       linewidth=2, label='Historical (Latest Vintage)', color='blue')

# Forecasts
ax.plot(forecast_dates, forecast_summary['predicted_price'],
       linewidth=2, linestyle='--', marker='o', markersize=8,
       label='Forecast', color='red')

# Add vertical line at forecast start (drawn at the last historical observation date)
ax.axvline(last_date, color='black', linestyle=':', alpha=0.5, label='Forecast Start')

ax.set_xlabel('Date')
ax.set_ylabel('Price ($)')
ax.set_title('Production Forecast with Vintage-Trained Model')
ax.legend()
ax.grid(True, alpha=0.3)
plt.xticks(rotation=45)
plt.tight_layout()
plt.show()

## 10. Summary and Best Practices

**Key Takeaways:**

1. **Why Vintages Matter:**
   - Real-world data gets revised (GDP, earnings, economic indicators)
   - Training on "final" data = data leakage = overly optimistic evaluation
   - Vintage data simulates production conditions accurately

2. **VintageCV Features:**
   - Point-in-time data selection (no future information)
   - Respects data revision timelines
   - Multiple vintage selection strategies
   - Integration with WorkflowSet

3. **Backtest Analysis:**
   - Rank workflows by realistic performance
   - Detect concept drift (performance over time)
   - Analyze forecast horizon degradation
   - Compare multiple models on same vintages

4. **Production Workflow:**
   - Train on latest available vintage
   - Track model performance over time
   - Retrain when drift detected
   - Document vintage metadata

5. **Best Practices:**
   - Use vintage data whenever you backtest forecasts on data that is subject to revision
   - Validate vintage data structure
   - Monitor drift over time
   - Document as-of dates for reproducibility
   - Test on realistic forecast horizons

6. **When to Use:**
   - Economic forecasting (GDP, employment, inflation)
   - Financial modeling (earnings, revenue)
   - Supply chain forecasting (inventory, demand)
   - Any domain with data revisions

In [None]:
# Final summary visualization: a 3x2 grid recapping the notebook — revision timeline
# (top, full width), workflow ranking and drift (middle), production forecast (bottom)
fig = plt.figure(figsize=(16, 10))
gs = fig.add_gridspec(3, 2, hspace=0.3, wspace=0.3)

# 1. Vintage revision timeline
ax1 = fig.add_subplot(gs[0, :])
# Same sampling as the earlier revision plot: every 12th observation, first 3
sample_dates = final_data['date'][::12][:3]
for sample_date in sample_dates:
    vintages = vintage_data[vintage_data['date'] == sample_date].sort_values('as_of_date')
    ax1.plot(vintages['as_of_date'], vintages['price'],
            marker='o', label=f'Obs: {sample_date.date()}', linewidth=2)
ax1.set_xlabel('As-of Date')
ax1.set_ylabel('Price')
ax1.set_title('Data Revision Timeline (Multiple Vintages per Observation)')
ax1.legend()
ax1.grid(True, alpha=0.3)

# 2. Workflow comparison: mean RMSE per workflow with its std as the error bar
ax2 = fig.add_subplot(gs[1, 0])
top_workflows = ranked.head(4)
ax2.barh(range(len(top_workflows)), top_workflows['rmse_mean'], xerr=top_workflows['rmse_std'])
ax2.set_yticks(range(len(top_workflows)))
ax2.set_yticklabels(top_workflows['wflow_id'])
ax2.set_xlabel('RMSE (Mean ± Std)')
ax2.set_title('Workflow Performance Comparison')
ax2.invert_yaxis()  # first row of `ranked` appears at the top
ax2.grid(True, alpha=0.3, axis='x')

# 3. Vintage drift (first 3 workflow ids in order of appearance in drift_analysis)
ax3 = fig.add_subplot(gs[1, 1])
for wflow_id in drift_analysis['wflow_id'].unique()[:3]:
    wflow_data = drift_analysis[drift_analysis['wflow_id'] == wflow_id]
    ax3.plot(wflow_data['vintage_date'], wflow_data['rmse'],
            marker='o', label=wflow_id, linewidth=2)
ax3.set_xlabel('Vintage Date')
ax3.set_ylabel('RMSE')
ax3.set_title('Model Drift Over Time')
ax3.legend()
ax3.grid(True, alpha=0.3)
plt.setp(ax3.xaxis.get_majorticklabels(), rotation=45)

# 4. Production forecast
ax4 = fig.add_subplot(gs[2, :])
# NOTE(review): tail(24) takes the last 24 rows, which are the most recent 24 months
# only if latest_vintage is sorted by 'date' — verify.
ax4.plot(latest_vintage['date'].tail(24), latest_vintage['price'].tail(24),
        linewidth=2, label='Historical', color='blue')
ax4.plot(forecast_dates, forecast_summary['predicted_price'],
        linewidth=2, linestyle='--', marker='o', markersize=8,
        label='Forecast', color='red')
ax4.axvline(last_date, color='black', linestyle=':', alpha=0.5, label='Forecast Start')
ax4.set_xlabel('Date')
ax4.set_ylabel('Price ($)')
ax4.set_title('Production Forecast (Vintage-Trained Model)')
ax4.legend()
ax4.grid(True, alpha=0.3)
plt.setp(ax4.xaxis.get_majorticklabels(), rotation=45)

plt.suptitle('Vintage Backtesting Summary', fontsize=16, fontweight='bold')
plt.show()

print("\n" + "="*70)
print("DEMO COMPLETE")
print("="*70)
print("Vintage backtesting provides realistic production performance estimates!")