# Bootstrap Regression Experiments

### Run bootstrap experiments for linear regression with simulated heteroskedastic, heavy-tailed data.

In [None]:
import pandas as pd
from src.simulate import generate_linear_data, save_df, plot_data
from src.bootstrap_methods import (
    fit_ols,
    bootstrap_parametric_normal,
    bootstrap_pairs,
    bootstrap_summary,
)
from src.evaluate import (
    plot_bootstrap_lines,
    plot_coef_histogram,
    compute_bias_var,
)

ImportError: cannot import name 'plot_coef_hist' from 'src.evaluate' (/Users/mani/Desktop/regression/bootstrap-regression-fall2025/src/evaluate.py)

## Experiment Setup

In [None]:
# Seed handed to generate_linear_data below.
random_seed = 2025
# Number of bootstrap replicates; read by both bootstrap cells further down.
n_boot = 10000   # lower for quicker runtime

# Simulate messy data (heteroskedastic + heavy-tailed).
# The call returns the data frame plus a params mapping; its "beta0" and
# "beta1" entries are used later as the reference values for bias/variance.
# NOTE(review): presumably these are the coefficients the data was generated
# from -- confirm in src.simulate.
df, true_params = generate_linear_data(
    seed=random_seed,
    heteroskedastic=True,
    hetero_strength=5.0,
    heavy_tails=True,
)
# Persist the simulated sample; the path is relative to the working directory.
save_df(df, "data/simulated.csv")

# Pull the "x" and "y" columns out as arrays for the fitting/bootstrap helpers.
X = df["x"].values
y = df["y"].values

# Quick visualization of the simulated data
plot_data(df, true_params)

In [None]:
# Baseline OLS fit on the full simulated sample.
# ols_fit is kept around because the coefficient-histogram plots in the
# bootstrap cells receive it as their ols_res argument.

ols_fit = fit_ols(X, y)
# NOTE(review): .summary() suggests a statsmodels-style results object --
# confirm against fit_ols in src.bootstrap_methods.
print(ols_fit.summary())

## 1. Parametric Normal Bootstrap

In [None]:
# Run the parametric-normal bootstrap with n_boot replicates and a fixed seed.
boot_param = bootstrap_parametric_normal(X, y, n_boot=n_boot, seed=1)
# Relabel column 0 as "x1" so it matches the key used for the slope below.
boot_param = boot_param.rename(columns={0: "x1"})  # nicer column names

# Print the label and the table with separate calls: passing both to a single
# print() puts the default " " separator right after the "\n", which pushes
# the first line of a multi-line table one column to the right of the rest.
print("Parametric bootstrap summary:")
print(bootstrap_summary(boot_param))

# Visual checks: a sample of 50 bootstrap fits over the data, then the
# distribution of the "x1" coefficient with the OLS fit passed for reference.
plot_bootstrap_lines(X, y, boot_param, n_lines=50)
plot_coef_histogram(boot_param, coef="x1", alpha=0.05, ols_res=ols_fit)

# Compare the bootstrap draws against the reference values in true_params.
print("Parametric bootstrap bias/variance:")
print(
    compute_bias_var(
        boot_param,
        {
            "const": true_params["beta0"],
            "x1": true_params["beta1"],
        },
    ).round(4)
)

## 2. Pairs Bootstrap

In [None]:
# Run the pairs bootstrap with n_boot replicates and a fixed seed.
boot_pairs = bootstrap_pairs(X, y, n_boot=n_boot, seed=1)
# Relabel column 0 as "x1" so it matches the key used for the slope below.
boot_pairs = boot_pairs.rename(columns={0: "x1"})

# Print the label and the table with separate calls: passing both to a single
# print() puts the default " " separator right after the "\n", which pushes
# the first line of a multi-line table one column to the right of the rest.
print("Pairs bootstrap summary:")
print(bootstrap_summary(boot_pairs))

# Visual checks: a sample of 50 bootstrap fits over the data, then the
# distribution of the "x1" coefficient with the OLS fit passed for reference.
plot_bootstrap_lines(X, y, boot_pairs, n_lines=50)
plot_coef_histogram(boot_pairs, coef="x1", alpha=0.05, ols_res=ols_fit)

# Compare the bootstrap draws against the reference values in true_params.
print("Pairs bootstrap bias/variance:")
print(
    compute_bias_var(
        boot_pairs,
        {
            "const": true_params["beta0"],
            "x1": true_params["beta1"],
        },
    ).round(4)
)

## Notes/Insights

- Both bootstraps recover the slope (x1) very well, with almost no bias (~+0.05).
- The intercept (const) is estimated less precisely and shows an upward bias (~+0.47).
- The parametric bootstrap assumes normally distributed residuals, so it yields slightly narrower CIs.
- With heavy-tailed data, the pairs bootstrap is more robust: it resamples (x, y) pairs directly and makes no assumption about the residual distribution.