# Bayesian Modeling with py-tidymodels

This notebook demonstrates Bayesian modeling capabilities using PyMC integration.

**Topics Covered:**
1. Basic Bayesian linear regression
2. Prior specification (default and custom)
3. Prediction types (numeric, conf_int, posterior, predictive)
4. Convergence diagnostics
5. Comparing Bayesian vs Frequentist models
6. Model diagnostics (residual plots and fit statistics)
7. Interpretation of credible intervals

**Use Case:** Sales forecasting with uncertainty quantification

In [None]:
# Import libraries
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy import stats as sp_stats

from py_parsnip import linear_reg
from py_workflows import workflow
from py_recipes import recipe
from py_yardstick import rmse, mae, r_squared
from py_bayes import check_convergence

# Reproducibility and plot styling
SEED = 42  # seeds NumPy's legacy global generator, which the data-generation cell draws from
# NOTE(review): whether the PyMC sampler also honours this seed is not visible here -- confirm
sns.set_style('whitegrid')
np.random.seed(SEED)

print("Imports successful!")

## 1. Generate Synthetic Sales Data

We'll create a realistic sales dataset with:
- Multiple predictors (advertising, price, seasonality)
- An advertising × competition interaction that the fitted linear models do not include
- Heteroscedastic noise (varying uncertainty)

In [None]:
# Build the synthetic sales dataset used throughout the notebook
n = 150  # total number of observations

# Predictors. The draw order (advertising, price, competition) decides which
# random numbers each predictor receives, so it must stay stable.
advertising = np.random.uniform(0, 100, n)
price = np.random.uniform(10, 50, n)
seasonality = np.sin(np.linspace(0, 4*np.pi, n))  # sine wave over [0, 4*pi]: two full cycles
competition = np.random.uniform(0, 1, n)

# Noise-free sales, assembled term by term (summed left to right)
baseline = 100
advertising_effect = 2.5 * advertising
price_effect = -1.5 * price
seasonal_effect = 30 * seasonality
competition_effect = -20 * competition
interaction_effect = 0.05 * advertising * (1 - competition)  # advertising pays off more when competition is low
true_sales = (
    baseline
    + advertising_effect
    + price_effect
    + seasonal_effect
    + competition_effect
    + interaction_effect
)

# Heteroscedastic noise: the standard deviation grows with |true_sales|
noise_std = 5 + 0.1 * np.abs(true_sales)
sales = true_sales + np.random.randn(n) * noise_std

# Assemble the modelling frame (response first, then predictors)
data = pd.DataFrame(dict(
    sales=sales,
    advertising=advertising,
    price=price,
    seasonality=seasonality,
    competition=competition,
))

# Ordered split: first 120 rows train, remaining rows test.
# The test slice keeps its original row labels (120 onwards).
n_train = 120
train_data, test_data = data.iloc[:n_train], data.iloc[n_train:]

print(f"Training data: {len(train_data)} observations")
print(f"Test data: {len(test_data)} observations")
print("\nData summary:")
print(data.describe())

In [None]:
# Scatter each predictor against sales on a 2x2 grid (training rows only)
fig, axes = plt.subplots(2, 2, figsize=(12, 10))

# (column, x-axis label, panel title), listed in row-major panel order
panels = [
    ('advertising', 'Advertising Spend', 'Sales vs Advertising'),
    ('price', 'Price', 'Sales vs Price'),
    ('seasonality', 'Seasonality', 'Sales vs Seasonality'),
    ('competition', 'Competition', 'Sales vs Competition'),
]

for panel_ax, (column, xlabel, title) in zip(axes.flat, panels):
    panel_ax.scatter(train_data[column], train_data['sales'], alpha=0.6)
    panel_ax.set_xlabel(xlabel)
    panel_ax.set_ylabel('Sales')
    panel_ax.set_title(title)

plt.tight_layout()
plt.show()

## 2. Basic Bayesian Linear Regression

We'll start with default priors and explore the posterior distributions.

In [None]:
# Bayesian linear regression on the PyMC engine; no priors are passed, so the engine defaults apply
sampler_args = {
    "chains": 4,    # independent MCMC chains
    "draws": 2000,  # samples kept per chain
    "tune": 1000,   # tuning samples per chain (discarded)
}
spec_bayes = linear_reg().set_engine("pymc", **sampler_args)

print("Fitting Bayesian model...")
fit_bayes = spec_bayes.fit(
    train_data,
    "sales ~ advertising + price + seasonality + competition",
)
print("Bayesian model fitted!")

In [None]:
# Summarise MCMC convergence for the default-prior fit
diagnostics = check_convergence(fit_bayes)
rule = "=" * 70  # horizontal rule used around the section banner

print("\n" + rule)
print("CONVERGENCE DIAGNOSTICS")
print(rule)
print(f"\nAll chains converged: {diagnostics['converged']}")
print("\nRhat Summary:")
print(diagnostics['rhat_summary'])
print("\nEffective Sample Size Summary:")
print(diagnostics['ess_summary'])

# Success path first; 'rhat_issues' is only read when convergence failed
if diagnostics['converged']:
    print("\n✓ All chains converged successfully (Rhat < 1.01)")
else:
    print("\nWARNING: Chains did not converge properly!")
    print("Problematic parameters:", diagnostics['rhat_issues'])

## 3. Coefficient Interpretation

Extract posterior summaries (mean, standard deviation, 95% HDI, Rhat, ESS) for the coefficients.

In [None]:
# Pull the three result tables from the fit; `coeffs` is reused later in the OLS comparison
outputs, coeffs, stats = fit_bayes.extract_outputs()

banner = "=" * 70
summary_cols = ['variable', 'mean', 'std', 'hdi_2.5%', 'hdi_97.5%', 'rhat', 'ess_bulk']

print("\n" + banner)
print("POSTERIOR COEFFICIENT ESTIMATES")
print(banner)
print(coeffs[summary_cols])

# Reader's guide to the columns printed above
column_notes = [
    "- mean: Posterior mean (point estimate)",
    "- std: Posterior standard deviation (uncertainty)",
    "- hdi_2.5%, hdi_97.5%: 95% Highest Density Interval (credible interval)",
    "- rhat: Convergence diagnostic (should be < 1.01)",
    "- ess_bulk: Effective sample size (higher is better)",
]

print("\n" + banner)
print("INTERPRETATION")
print(banner)
for note in column_notes:
    print(note)

## 4. Prediction Types

Bayesian models support 4 prediction types:
1. **numeric**: Posterior mean predictions
2. **conf_int**: 95% credible intervals
3. **posterior**: Full posterior samples (for uncertainty analysis)
4. **predictive**: Posterior predictive (includes observation noise)

In [None]:
# Exercise the four prediction types on the held-out rows, one after another.
# Each result is printed right after it is computed so the output order stays fixed.

# numeric: point predictions
preds_mean = fit_bayes.predict(test_data, type="numeric")
print("Point predictions (posterior mean):")
print(preds_mean.head())

# conf_int: interval bounds per observation
preds_ci = fit_bayes.predict(test_data, type="conf_int")
print("\nCredible intervals (95% HDI):")
print(preds_ci.head())

# posterior: one column per draw; only the first ten column names are shown
preds_posterior = fit_bayes.predict(test_data, type="posterior")
posterior_columns = preds_posterior.columns.tolist()
print(f"\nPosterior samples shape: {preds_posterior.shape}")
print(f"Columns: {posterior_columns[:10]}...")

# predictive: posterior predictive draws (these include observation noise)
preds_predictive = fit_bayes.predict(test_data, type="predictive")
predictive_shape = preds_predictive.shape
print(f"\nPosterior predictive shape: {predictive_shape}")

In [None]:
# Visualize predictions with uncertainty
fig, ax = plt.subplots(figsize=(12, 6))

# One positional x-axis (0 .. n_test-1) shared by every layer. Passing a bare
# pandas Series to ax.plot() uses the Series index as x, which only lines up
# with the other layers when that index happens to be 0..n-1. test_data itself
# is labelled from 120, so every layer is drawn from plain arrays instead.
obs_pos = np.arange(len(test_data))

# Actual values
ax.scatter(obs_pos, test_data['sales'].to_numpy(),
           label='Actual', color='black', s=60, zorder=3)

# Posterior mean
ax.plot(obs_pos, preds_mean['.pred'].to_numpy(),
        label='Posterior Mean', color='blue', linewidth=2)

# 95% credible interval
ax.fill_between(
    obs_pos,
    preds_ci['.pred_lower'].to_numpy(),
    preds_ci['.pred_upper'].to_numpy(),
    alpha=0.3,
    color='blue',
    label='95% Credible Interval'
)

# Posterior predictive interval (wider - includes noise): row-wise 2.5% / 97.5%
# quantiles over the sample columns (those whose name contains 'posterior_')
predictive_draws = preds_predictive.filter(like='posterior_')
preds_pred_lower = predictive_draws.quantile(0.025, axis=1)
preds_pred_upper = predictive_draws.quantile(0.975, axis=1)
ax.fill_between(
    obs_pos,
    preds_pred_lower.to_numpy(),
    preds_pred_upper.to_numpy(),
    alpha=0.2,
    color='red',
    label='95% Predictive Interval'
)

ax.set_xlabel('Test Observation')
ax.set_ylabel('Sales')
ax.set_title('Bayesian Predictions with Uncertainty')
ax.legend()
ax.grid(True, alpha=0.3)
plt.tight_layout()
plt.show()

print("\nInterpretation:")
print("- Blue band: Parameter uncertainty (we're uncertain about coefficients)")
print("- Red band: Total uncertainty (parameter + observation noise)")

## 5. Custom Priors

Specify custom priors based on domain knowledge.

In [None]:
# Same model as before, but with explicit priors handed to the PyMC engine
custom_engine_args = dict(
    prior_intercept="normal(100, 50)",  # baseline sales expected to be around 100
    prior_coefs="normal(0, 10)",        # weakly informative prior on the slopes
    prior_sigma="half_cauchy(10)",      # heavy-tailed prior on the noise scale
    chains=4,
    draws=2000,
)
spec_custom = linear_reg().set_engine("pymc", **custom_engine_args)

print("Fitting model with custom priors...")
fit_custom = spec_custom.fit(
    train_data,
    "sales ~ advertising + price + seasonality + competition",
)
print("Custom prior model fitted!")

# Only the coefficient table (second element) is needed here
_, coeffs_custom, _ = fit_custom.extract_outputs()
custom_cols = ['variable', 'mean', 'std', 'hdi_2.5%', 'hdi_97.5%']

print("\nCoefficients with custom priors:")
print(coeffs_custom[custom_cols])

## 6. Bayesian vs Frequentist Comparison

Compare Bayesian and frequentist (OLS) approaches.

In [None]:
# Frequentist baseline: same formula, no engine set, so the library default is used
# (sklearn, according to the original author's note -- TODO confirm)
spec_ols = linear_reg()
fit_ols = spec_ols.fit(train_data, "sales ~ advertising + price + seasonality + competition")

# Only the coefficient table is needed from the OLS fit
_, coeffs_ols, _ = fit_ols.extract_outputs()

# Output column -> source column, for each of the two coefficient tables
ols_columns = {
    'Variable': 'variable',
    'OLS_Estimate': 'estimate',
    'OLS_Std_Error': 'std_error',
}
bayes_columns = {
    'Bayes_Mean': 'mean',
    'Bayes_Std': 'std',
    'Bayes_CI_Lower': 'hdi_2.5%',
    'Bayes_CI_Upper': 'hdi_97.5%',
}

# NOTE(review): the two tables are combined by row index, not by variable name;
# this assumes both engines list the terms in the same order -- confirm.
comparison = pd.DataFrame({
    **{out_name: coeffs_ols[src] for out_name, src in ols_columns.items()},
    **{out_name: coeffs[src] for out_name, src in bayes_columns.items()},
})

divider = "=" * 70
print("\n" + divider)
print("BAYESIAN VS FREQUENTIST COMPARISON")
print(divider)
print(comparison)

print("\n" + divider)
print("KEY DIFFERENCES")
print(divider)
for takeaway in (
    "1. Estimates: Both methods produce similar point estimates",
    "2. Uncertainty: Bayesian provides full posterior, OLS gives asymptotic SE",
    "3. Interpretation: Bayesian CI is probability-based, OLS CI is frequentist",
):
    print(takeaway)

In [None]:
# Point predictions from both fits on the held-out rows
preds_ols = fit_ols.predict(test_data)
preds_bayes = fit_bayes.predict(test_data, type="numeric")


def _metric_value(metric, preds):
    """Apply a yardstick metric to the test-set sales and return the 'value' of its first row."""
    return metric(test_data['sales'], preds['.pred']).iloc[0]['value']


# Same evaluation order as a hand-written version: OLS first, then Bayesian, per metric
rmse_ols = _metric_value(rmse, preds_ols)
rmse_bayes = _metric_value(rmse, preds_bayes)

mae_ols = _metric_value(mae, preds_ols)
mae_bayes = _metric_value(mae, preds_bayes)

r2_ols = _metric_value(r_squared, preds_ols)
r2_bayes = _metric_value(r_squared, preds_bayes)

# Fixed-width table; the last column is Bayesian minus OLS
print("\nPrediction Performance:")
print(f"{'Metric':<10} {'OLS':<12} {'Bayesian':<12} {'Difference':<12}")
print("-" * 50)
for label, ols_val, bayes_val in (
    ('RMSE', rmse_ols, rmse_bayes),
    ('MAE', mae_ols, mae_bayes),
    ('R²', r2_ols, r2_bayes),
):
    print(f"{label:<10} {ols_val:<12.4f} {bayes_val:<12.4f} {bayes_val - ols_val:<12.4f}")

## 7. Model Diagnostics

Examine model fit quality and assumptions.

In [None]:
# Extract diagnostics: per-observation outputs and the summary statistics table
outputs_bayes, _, stats_bayes = fit_bayes.extract_outputs()

# Plot residuals
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Residuals vs fitted, with a dashed zero line as the reference
axes[0].scatter(outputs_bayes['fitted'], outputs_bayes['residuals'], alpha=0.6)
axes[0].axhline(y=0, color='red', linestyle='--')
axes[0].set_xlabel('Fitted Values')
axes[0].set_ylabel('Residuals')
axes[0].set_title('Residuals vs Fitted')
axes[0].grid(True, alpha=0.3)

# Q-Q plot of the residuals against a normal distribution.
# sp_stats is scipy.stats, imported with the other libraries in the first code
# cell (aliased so it does not clash with the `stats` table from extract_outputs).
sp_stats.probplot(outputs_bayes['residuals'], dist="norm", plot=axes[1])
axes[1].set_title('Q-Q Plot')
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

# Print model statistics
print("\nModel Statistics:")
print(stats_bayes[['split', 'rmse', 'mae', 'r_squared', 'n']])

## 8. Uncertainty Quantification

Analyze prediction uncertainty for individual observations.

In [None]:
# Draw posterior predictive samples for the held-out rows
preds_predictive = fit_bayes.predict(test_data, type="predictive")

# Sample columns are those whose name starts with 'posterior_'
sample_cols = list(filter(lambda name: name.startswith('posterior_'), preds_predictive.columns))
draws = preds_predictive[sample_cols]

# Row-wise summaries across draws: centre, spread and a central 95% interval
pred_means = draws.mean(axis=1)
pred_stds = draws.std(axis=1)
pred_lower = draws.quantile(0.025, axis=1)
pred_upper = draws.quantile(0.975, axis=1)

# Observed sales as a plain array, compared position by position with the bounds
actual = test_data['sales'].values
inside_interval = (actual >= pred_lower) & (actual <= pred_upper)

# One row per test observation
uncertainty_summary = pd.DataFrame({
    'Actual': actual,
    'Pred_Mean': pred_means,
    'Pred_Std': pred_stds,
    'CI_Lower': pred_lower,
    'CI_Upper': pred_upper,
    'CI_Width': pred_upper - pred_lower,
    'In_Interval': inside_interval,
})

print("\nPrediction Uncertainty Summary:")
print(uncertainty_summary.head(10))

# Empirical coverage and interval-width statistics
coverage = uncertainty_summary['In_Interval'].mean()
widths = uncertainty_summary['CI_Width']

print(f"\nCoverage: {coverage:.1%} of actuals fall within 95% CI")
print("Expected: 95%")
print(f"\nMean CI width: {widths.mean():.2f}")
print(f"Min CI width: {widths.min():.2f}")
print(f"Max CI width: {widths.max():.2f}")

## 9. Summary and Best Practices

**Key Takeaways:**

1. **Bayesian Advantages:**
   - Full uncertainty quantification
   - Credible intervals have probability interpretation
   - Can incorporate prior knowledge
   - Natural handling of small samples

2. **When to Use Bayesian:**
   - Need uncertainty quantification
   - Small sample sizes
   - Have informative priors
   - Decision-making under uncertainty

3. **Best Practices:**
   - Always check convergence (Rhat < 1.01)
   - Verify ESS is sufficient (>400)
   - Use informative priors when available
   - Compare with frequentist methods
   - Validate coverage on test data

4. **Prior Selection:**
   - Default priors: Weakly informative, good starting point
   - Custom priors: Use domain knowledge
   - Prior predictive checks: Verify priors are reasonable

5. **Computational:**
   - Multiple chains (4+) for convergence diagnosis
   - Sufficient samples (2000+)
   - Tune phase to adapt sampler

In [None]:
# Final comparison visualization
fig, axes = plt.subplots(1, 2, figsize=(14, 5))

# Observed test-set sales as a plain array, used by both panels
actual_sales = test_data['sales'].to_numpy()

# Left: Point predictions comparison, with the y = x line as the "perfect" reference
axes[0].scatter(preds_ols['.pred'], actual_sales, alpha=0.6, label='OLS')
axes[0].scatter(preds_bayes['.pred'], actual_sales, alpha=0.6, label='Bayesian')
axes[0].plot([actual_sales.min(), actual_sales.max()],
            [actual_sales.min(), actual_sales.max()],
            'k--', label='Perfect')
axes[0].set_xlabel('Predicted Sales')
axes[0].set_ylabel('Actual Sales')
axes[0].set_title('Predictions: OLS vs Bayesian')
axes[0].legend()
axes[0].grid(True, alpha=0.3)

# Right: Uncertainty quantification (Bayesian only)
# Work on plain arrays so the sort order is applied by POSITION. np.argsort on
# a pandas Series returns a Series, and indexing pred_means with it is a
# label-based lookup when the index is integer-valued; that only matches
# positions if the index is exactly 0..n-1.
mean_vals = np.asarray(pred_means)
std_vals = np.asarray(pred_stds)
sorted_idx = np.argsort(mean_vals)
x_pos = np.arange(len(sorted_idx))
axes[1].errorbar(
    x_pos,
    mean_vals[sorted_idx],
    yerr=1.96*std_vals[sorted_idx],
    fmt='o',
    alpha=0.6,
    label='Posterior Mean ± 1.96σ'
)
axes[1].scatter(x_pos, actual_sales[sorted_idx],
               color='red', marker='x', s=100, label='Actual', zorder=3)
axes[1].set_xlabel('Observation (sorted by prediction)')
axes[1].set_ylabel('Sales')
axes[1].set_title('Bayesian Prediction Uncertainty')
axes[1].legend()
axes[1].grid(True, alpha=0.3)

plt.tight_layout()
plt.show()

print("\n" + "="*70)
print("DEMO COMPLETE")
print("="*70)
print("Bayesian modeling provides full uncertainty quantification!")