# BLR (Conjugate / g-prior)

#### Speed specifications & Imports

In [3]:
#  SPEED HEADER (single-BLAS + joblib CV)
import os

# Cap each native math backend at one thread. This runs ahead of the numeric
# imports further down so the limits are already set when those libraries load.
for _thread_var in (
    "MKL_NUM_THREADS",
    "OPENBLAS_NUM_THREADS",
    "OMP_NUM_THREADS",
    "NUMEXPR_MAX_THREADS",
):
    os.environ[_thread_var] = "1"

# Parallelism knob: worker count handed to joblib when configs run in parallel.
N_JOBS = 18
RANDOM_STATE = 42

# Print fold-by-fold metrics?
# True  -> configs are evaluated one after another with per-fold logging.
# False -> configs are evaluated in parallel with clean logs.
VERBOSE_CV = True

# Imports 
import warnings; warnings.filterwarnings("ignore")
import re
from math import sqrt

import numpy as np
import pandas as pd

from IPython.display import display
from joblib import Parallel, delayed, dump
from threadpoolctl import threadpool_limits

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, RegressorMixin

#### Paths, Data Loading & Checks

In [5]:
# Root folder holding the train/test splits and every generated artefact.
SAVE_DIR   = '../Extended Parametric Regression Files+Plots'
TRAIN_CSV  = SAVE_DIR + '/train.csv'
TEST_CSV   = SAVE_DIR + '/test.csv'
FOLDS_NPY  = SAVE_DIR + '/train_folds.npy'   # must exist (precomputed)

# Output locations for the saved pipeline and the residual tables.
# NOTE(review): the model folder is named "MLR" although this notebook stores
# BLR pipelines there — confirm that sharing the folder is intended.
MODELS_DIR    = f"{SAVE_DIR}/Models/MLR"
RESID_DIR     = f"{SAVE_DIR}/Reports/Residuals_Test"   # residuals per model on the held-out test split
RESID_DIR_OOF = f"{SAVE_DIR}/Reports/Residuals_OOF"    # out-of-fold residuals

# Create the output folders up front; existing folders are left untouched.
for _out_dir in (MODELS_DIR, RESID_DIR, RESID_DIR_OOF):
    os.makedirs(_out_dir, exist_ok=True)

def slug(obj):
    """Build a filename-safe tag from a parameter dict (or any other object).

    A non-empty dict becomes ``key=value`` pairs joined by ``__`` in sorted key
    order, with float values passed through ``float()`` first. ``None``, ``{}``
    and ``[]`` give an empty tag; anything else is stringified. Every run of
    characters outside ``A-Za-z0-9._=-`` collapses to a single ``_`` and
    leading/trailing underscores are removed.
    """
    if isinstance(obj, dict) and obj:
        pairs = []
        for key in sorted(obj):
            value = obj[key]
            if isinstance(value, (float, np.floating)):
                value = float(value)
            pairs.append(f"{key}={value}")
        text = "__".join(pairs)
    elif obj in (None, {}, []):
        text = ""
    else:
        text = str(obj)
    cleaned = re.sub(r"[^A-Za-z0-9._=-]+", "_", text)
    return cleaned.strip("_")

#  Load splits
df_train = pd.read_csv(TRAIN_CSV)
# Positional row id, used further down to look up each row's saved fold label.
df_train["_rowpos"] = np.arange(len(df_train), dtype=int)
df_test  = pd.read_csv(TEST_CSV)

#  Required columns check
required_cols = [
    'PL','distance','frequency','c_walls','w_walls',
    'co2','humidity','pm25','pressure','temperature','snr'
]
# 'device_id' is not a model input, but the residual exports index it directly
# on both splits. Checking it here fails fast instead of raising a KeyError
# after the whole cross-validation has already run.
_passthrough_cols = ['device_id']
missing = [
    c for c in required_cols + _passthrough_cols
    if c not in df_train.columns or c not in df_test.columns
]
if missing:
    raise ValueError(f"Missing required columns in train/test: {missing}")

#### Cross-Validation Folds

In [7]:
# Saved fold labels, looked up by each training row's position.
fold_assignments_full = np.load(FOLDS_NPY)

rowpos = df_train["_rowpos"].to_numpy(dtype=int)
if rowpos.size == 0:
    # rowpos.max() below would otherwise fail with an opaque reduction error.
    raise ValueError("[folds] train.csv contains no rows; cannot build CV folds.")
if rowpos.max() >= len(fold_assignments_full):
    raise ValueError(
        f"[folds] Mismatch: train_folds.npy len={len(fold_assignments_full)} < max(_rowpos)={rowpos.max()}.\n"
        f"train.csv and train_folds.npy are out of sync."
    )
if len(fold_assignments_full) != len(rowpos):
    # Indexing below would silently drop the surplus labels; make that visible.
    print(f"[CV] WARNING: train_folds.npy len={len(fold_assignments_full)} != n_train={len(rowpos)}; "
          f"only the labels selected by _rowpos are used.")
fold_assignments = fold_assignments_full[rowpos]

# Labels are taken to run 0..K-1; each fold is a (train_idx, val_idx) pair.
K = int(fold_assignments.max()) + 1
folds = [(np.where(fold_assignments != k)[0], np.where(fold_assignments == k)[0]) for k in range(K)]

# A label with no rows would only surface later as a failure inside CV.
_empty_folds = [k for k, (_, v) in enumerate(folds) if len(v) == 0]
if _empty_folds:
    raise ValueError(f"[folds] No training rows are assigned to fold(s) {_empty_folds}.")

print(f"[CV] Using saved folds (remapped) | K={K} | n_train={len(df_train)} | "
      f"fold sizes={[len(v) for _, v in folds]}")


[CV] Using saved folds (remapped) | K=5 | n_train=1415913 | fold sizes=[283183, 283183, 283183, 283182, 283182]


#### Physics, Features & Metrics

In [9]:
# Physics helpers
d0 = 1.0  # reference distance in meters

def z_of_d(d):
    """Return the log-distance term ``10 * log10(d / d0)``.

    Parameters
    ----------
    d : array-like or scalar
        Distances. Anything ``np.asarray`` accepts (ndarray, list, pandas
        Series, plain number) is allowed. Values below ``1e-6`` are clipped up
        to ``1e-6`` so the logarithm stays finite.

    Returns
    -------
    numpy.ndarray or numpy scalar
        Same shape as ``d``.
    """
    d = np.clip(np.asarray(d, dtype=float), 1e-6, None)
    return 10.0 * np.log10(d / d0)

def f_term(f):
    """Return the frequency term ``20 * log10(f)``.

    Parameters
    ----------
    f : array-like or scalar
        Frequencies. Anything ``np.asarray`` accepts (ndarray, list, pandas
        Series, plain number) is allowed. Values below ``1e-12`` are clipped up
        to ``1e-12`` so the logarithm stays finite.

    Returns
    -------
    numpy.ndarray or numpy scalar
        Same shape as ``f``.
    """
    f = np.clip(np.asarray(f, dtype=float), 1e-12, None)
    return 20.0 * np.log10(f)

#  Features & targets
raw_feats  = ['distance','frequency','c_walls','w_walls',
              'co2','humidity','pm25','pressure','temperature','snr']
target_col = 'PL'

# Raw predictors and path-loss targets for both splits.
Xtr_raw, Xte_raw = df_train[raw_feats].copy(), df_test[raw_feats].copy()
ytr_pl = df_train[target_col].astype(float).values
yte_pl = df_test[target_col].astype(float).values

# Friis adjustment: y_adj = PL - 20*log10(f)
ftr_tr = f_term(Xtr_raw['frequency'].values)
ftr_te = f_term(Xte_raw['frequency'].values)
ytr_adj = ytr_pl - ftr_tr
yte_adj = yte_pl - ftr_te

# Linear feature map used by BLR
LIN_COLS = ['z_d','c_walls','w_walls','co2','humidity','pm25','pressure','temperature','snr']

def _linear_design(X_raw):
    """Return the BLR design matrix for one split, columns ordered as LIN_COLS.

    'z_d' is the log-distance term computed from 'distance'; every other
    column is copied unchanged from ``X_raw``.
    """
    data = {'z_d': z_of_d(X_raw['distance'].values)}
    for col in LIN_COLS[1:]:
        data[col] = X_raw[col].values
    return pd.DataFrame(data, columns=LIN_COLS).values

Xtr_lin = _linear_design(Xtr_raw)
Xte_lin = _linear_design(Xte_raw)

# Metrics (PL-domain)
def rmse_r2_on_PL(y_true_pl, y_pred_adj, fterm):
    """Score adjusted-target predictions in the path-loss (PL) domain.

    ``fterm`` is added back to ``y_pred_adj`` before comparing with
    ``y_true_pl``. Returns ``(rmse, r2)``; ``r2`` is NaN when ``y_true_pl``
    has zero total sum of squares.
    """
    sq_err = (y_true_pl - (y_pred_adj + fterm)) ** 2
    ss_tot = ((y_true_pl - y_true_pl.mean()) ** 2).sum()
    if ss_tot > 0:
        r2 = 1.0 - sq_err.sum() / ss_tot
    else:
        r2 = np.nan
    return sqrt(sq_err.mean()), r2

#### BLR Estimators (Conjugate & g-prior)

In [11]:
# BLR estimators (Conjugate)
class FullBLRConjugate(BaseEstimator, RegressorMixin):
    """
    Conjugate Bayesian Linear Regression with Normal–Inverse-Gamma prior:
        beta | sigma^2 ~ N(beta0, sigma^2 V0),  sigma^2 ~ Inv-Gamma(a0, b0)
    - Works on adjusted target (y = PL - 20 log10 f).
    - Adds intercept internally (augments X with a column of ones).
    - Assumes you standardized X upstream if using spherical V0 (we do via StandardScaler).

    Parameters
    ----------
    beta0 : array-like of length p+1 or None
        Prior mean for [intercept, slopes]; None means all zeros.
    V0_scale : float
        Prior covariance is ``V0_scale * I``; must be finite and > 0.
    a0, b0 : float
        Inverse-Gamma hyper-parameters for sigma^2.

    Attributes set by ``fit``
    -------------------------
    beta_n_ : posterior mean of [intercept, slopes]
    Vn_     : posterior scale matrix, (p+1) x (p+1)
    an_, bn_: posterior Inverse-Gamma parameters
    """
    def __init__(self, beta0=None, V0_scale=1e6, a0=1e-2, b0=1e-2):
        self.beta0 = beta0
        self.V0_scale = float(V0_scale)
        self.a0 = float(a0)
        self.b0 = float(b0)
        # learned (None until fit() has run)
        self.beta_n_ = None
        self.Vn_ = None
        self.an_ = None
        self.bn_ = None

    def _augment(self, X):
        """Prepend a column of ones so the intercept is the first coefficient."""
        n = X.shape[0]
        return np.hstack([np.ones((n, 1)), X])

    def fit(self, X, y):
        """Compute the posterior in closed form and store it on the estimator.

        Raises ValueError if X is not 2-D, if X and y disagree on the number
        of rows, if ``V0_scale`` is not a positive finite number, or if
        ``beta0`` does not have p+1 entries.
        """
        X = np.asarray(X, dtype=float)
        y = np.asarray(y, dtype=float).reshape(-1)
        if X.ndim != 2:
            raise ValueError(f"X must be 2-D (n_samples, n_features); got ndim={X.ndim}.")
        if X.shape[0] != y.shape[0]:
            raise ValueError(f"X has {X.shape[0]} rows but y has {y.shape[0]} entries.")
        v0_scale = float(self.V0_scale)
        if not (np.isfinite(v0_scale) and v0_scale > 0.0):
            raise ValueError(f"V0_scale must be a positive finite number; got {self.V0_scale!r}.")

        X_aug = self._augment(X)         # [n x (p+1)]
        n, d = X_aug.shape

        beta0 = np.zeros(d) if self.beta0 is None else np.asarray(self.beta0, dtype=float).reshape(-1)
        if beta0.shape[0] != d:
            raise ValueError("beta0 size mismatch.")

        # Prior covariance: V0 = V0_scale * I_d  (weakly informative on standardized X)
        V0_inv = np.eye(d) / v0_scale

        XtX = X_aug.T @ X_aug
        Vn_inv = V0_inv + XtX
        # small jitter for numerical stability
        Vn = np.linalg.inv(Vn_inv + 1e-12*np.eye(d))

        Xty = X_aug.T @ y
        beta_n = Vn @ (V0_inv @ beta0 + Xty)

        an = self.a0 + 0.5 * n
        # Residual form of bn. It is algebraically equal to
        #   b0 + 0.5*(y'y + beta0' V0^-1 beta0 - beta_n' Vn^-1 beta_n)
        # but sums two non-negative quadratic forms instead of subtracting
        # large, nearly equal numbers, so it cannot come out negative.
        resid = y - X_aug @ beta_n
        shift = beta_n - beta0
        bn = self.b0 + 0.5 * (resid @ resid + shift @ (V0_inv @ shift))

        self.beta_n_ = beta_n
        self.Vn_ = Vn
        self.an_ = an
        self.bn_ = float(bn)
        return self

    def predict(self, X, return_std=False):
        """Return the posterior-mean prediction, optionally with a predictive std.

        With ``return_std=True`` a second array is returned holding
        ``sqrt((bn/an) * (1 + x^T Vn x))`` for every row.
        Raises NotFittedError when called before ``fit``.
        """
        if getattr(self, "beta_n_", None) is None:
            from sklearn.exceptions import NotFittedError
            raise NotFittedError(
                f"This {type(self).__name__} instance is not fitted yet; call fit() first."
            )
        X = np.asarray(X, dtype=float)
        X_aug = self._augment(X)
        mean = X_aug @ self.beta_n_
        if not return_std:
            return mean
        # squared predictive scale for y: (bn/an) * (1 + x^T Vn x)
        pred_var = (self.bn_ / self.an_) * (1.0 + np.sum((X_aug @ self.Vn_) * X_aug, axis=1))
        pred_std = np.sqrt(np.maximum(pred_var, 0.0))
        return mean, pred_std

# BLR estimators ( g-prior)
class BLR_GPrior(BaseEstimator, RegressorMixin):
    """
    Zellner g-prior on slopes (intercept gets flat/improper prior):
      beta = [beta0 (intercept); beta_s] ;  beta_s | sigma^2 ~ N(0, g sigma^2 (X'X)^{-1})
    - g_mode: 'uip' (g = n), or 'eb' (empirical Bayes from OLS R^2).
    - Works on adjusted target; adds intercept internally; expects standardized X.

    Parameters
    ----------
    g_mode : {'uip', 'eb'}
        How g is chosen when ``g_fixed`` is None (case-insensitive).
    a0, b0 : float
        Inverse-Gamma hyper-parameters for sigma^2.
    g_fixed : float or None
        If given, used as g (floored at 1e-8) and ``g_mode`` is ignored.

    Attributes set by ``fit``
    -------------------------
    g_      : the value of g actually used
    beta_n_ : posterior mean of [intercept, slopes]
    Vn_     : posterior scale matrix, (p+1) x (p+1)
    an_, bn_: posterior Inverse-Gamma parameters
    """
    def __init__(self, g_mode='uip', a0=1e-2, b0=1e-2, g_fixed=None):
        self.g_mode = str(g_mode)
        self.a0 = float(a0)
        self.b0 = float(b0)
        self.g_fixed = None if g_fixed is None else float(g_fixed)
        # learned (None until fit() has run)
        self.beta_n_ = None
        self.Vn_ = None
        self.an_ = None
        self.bn_ = None
        self.g_ = None

    def _augment(self, X):
        """Prepend a column of ones so the intercept is the first coefficient."""
        n = X.shape[0]
        return np.hstack([np.ones((n, 1)), X])

    def _ols_fit(self, X_aug, y):
        """Least-squares fit; returns (coefficients, fitted values, residuals)."""
        beta_ols, *_ = np.linalg.lstsq(X_aug, y, rcond=None)
        yhat = X_aug @ beta_ols
        resid = y - yhat
        return beta_ols, yhat, resid

    def _choose_g(self, X_aug, y):
        """Pick g from ``g_fixed`` or ``g_mode``.

        Raises ValueError for an unrecognised ``g_mode`` rather than silently
        treating it as 'eb'.
        """
        n, d = X_aug.shape
        p = d - 1  # number of slopes
        if self.g_fixed is not None:
            return max(self.g_fixed, 1e-8)
        mode = self.g_mode.lower()
        if mode == 'uip':
            return float(n)
        if mode != 'eb':
            raise ValueError(f"Unknown g_mode {self.g_mode!r}; expected 'uip' or 'eb'.")
        # empirical Bayes from OLS R^2: g = R2/(1-R2) * (n - p - 1)
        _, _, resid = self._ols_fit(X_aug, y)
        tss = ((y - y.mean())**2).sum()
        rss = (resid**2).sum()
        R2 = 0.0 if tss <= 0 else max(0.0, 1.0 - rss / tss)
        R2 = min(R2, 1.0 - 1e-8)  # keep the ratio below finite
        g_hat = (R2 / (1.0 - R2)) * max(n - p - 1, 1.0)
        return float(np.clip(g_hat, 1e-8, 1e12))

    def fit(self, X, y):
        """Compute the g-prior posterior in closed form and store it on the estimator.

        Raises ValueError if X is not 2-D, if X and y disagree on the number
        of rows, or if ``g_mode`` is unrecognised.
        """
        X = np.asarray(X, dtype=float)
        y = np.asarray(y, dtype=float).reshape(-1)
        if X.ndim != 2:
            raise ValueError(f"X must be 2-D (n_samples, n_features); got ndim={X.ndim}.")
        if X.shape[0] != y.shape[0]:
            raise ValueError(f"X has {X.shape[0]} rows but y has {y.shape[0]} entries.")

        X_aug = self._augment(X)  # [n x (p+1)]
        n, d = X_aug.shape
        p = d - 1

        g = self._choose_g(X_aug, y)
        self.g_ = g

        # One Gram matrix serves both the likelihood term and the prior: its
        # slope block XtX[1:, 1:] equals Z'Z for the non-intercept columns Z.
        XtX = X_aug.T @ X_aug

        # Prior precision: V0_inv = diag([0, (1/g) * Z'Z]) in augmented coords
        # (the zero in the corner is the flat prior on the intercept).
        V0_inv = np.zeros((d, d), dtype=float)
        if p > 0:
            V0_inv[1:, 1:] = (1.0 / g) * XtX[1:, 1:]

        Vn_inv = V0_inv + XtX
        # small jitter for numerical stability
        Vn = np.linalg.inv(Vn_inv + 1e-12*np.eye(d))

        Xty = X_aug.T @ y
        beta_n = Vn @ Xty  # prior mean = 0 (on slopes), flat on intercept

        an = self.a0 + 0.5 * n
        # Residual form of bn. It is algebraically equal to
        #   b0 + 0.5*(y'y - beta_n' Vn^-1 beta_n)
        # but sums two non-negative quadratic forms instead of subtracting
        # large, nearly equal numbers, so it cannot come out negative.
        resid = y - X_aug @ beta_n
        bn = self.b0 + 0.5 * (resid @ resid + beta_n @ (V0_inv @ beta_n))

        self.beta_n_ = beta_n
        self.Vn_ = Vn
        self.an_ = an
        self.bn_ = float(bn)
        return self

    def predict(self, X, return_std=False):
        """Return the posterior-mean prediction, optionally with a predictive std.

        With ``return_std=True`` a second array is returned holding
        ``sqrt((bn/an) * (1 + x^T Vn x))`` for every row.
        Raises NotFittedError when called before ``fit``.
        """
        if getattr(self, "beta_n_", None) is None:
            from sklearn.exceptions import NotFittedError
            raise NotFittedError(
                f"This {type(self).__name__} instance is not fitted yet; call fit() first."
            )
        X = np.asarray(X, dtype=float)
        X_aug = self._augment(X)
        mean = X_aug @ self.beta_n_
        if not return_std:
            return mean
        # squared predictive scale for y: (bn/an) * (1 + x^T Vn x)
        pred_var = (self.bn_ / self.an_) * (1.0 + np.sum((X_aug @ self.Vn_) * X_aug, axis=1))
        pred_std = np.sqrt(np.maximum(pred_var, 0.0))
        return mean, pred_std

#### Pipeline Builders & Grids

In [13]:
# Pipeline builders
def build_blr_conjugate(cfg):
    """Return a StandardScaler -> FullBLRConjugate pipeline for one grid config.

    ``cfg`` must provide the keys "V0_scale", "a0" and "b0"; the prior mean is
    left at its default (None).
    """
    scaler = StandardScaler(with_mean=True, with_std=True)
    model = FullBLRConjugate(
        beta0=None,
        V0_scale=cfg["V0_scale"],
        a0=cfg["a0"],
        b0=cfg["b0"],
    )
    return make_pipeline(scaler, model)

def build_blr_gprior(cfg):
    """Return a StandardScaler -> BLR_GPrior pipeline for one grid config.

    ``cfg`` must provide the keys "g_mode", "a0" and "b0".
    """
    scaler = StandardScaler(with_mean=True, with_std=True)
    model = BLR_GPrior(g_mode=cfg["g_mode"], a0=cfg["a0"], b0=cfg["b0"])
    return make_pipeline(scaler, model)

# Grids
# Both families share the same Inverse-Gamma hyper-parameters; only the prior
# scale (conjugate) or the g-selection rule (g-prior) is varied.
_NIG_DEFAULTS = {"a0": 1e-2, "b0": 1e-2}

conj_grid = []
for _v0 in (1e2, 1e3, 1e4, 1e5, 1e6):
    conj_grid.append(dict(V0_scale=_v0, **_NIG_DEFAULTS))

gprior_grid = []
for _mode in ("uip", "eb"):
    gprior_grid.append(dict(g_mode=_mode, **_NIG_DEFAULTS))

# (display name, pipeline factory, hyper-parameter grid) for each model family.
blr_specs = [
    ("BLR-Linear (Conjugate)", build_blr_conjugate, conj_grid),
    ("BLR-Linear (g-prior)",   build_blr_gprior,   gprior_grid),
]

#### Posterior Unscaling (Original Units)

In [15]:
#  Unscale BLR posterior to original units
def unscale_blr_posterior(pipeline, feat_names):
    """
    Express a fitted pipeline's posterior in the features' original units.

    The estimator's coefficients refer to standardized inputs
    z = (x - mean) / scale. A linear map T converts [intercept, slopes] on z
    into [intercept, slopes] on x; the posterior scale matrix is transformed
    as T Vn T'.

    Returns (names, mean_orig, cov_orig), where names is
    ["Intercept"] + list(feat_names).
    """
    steps = pipeline.named_steps
    scaler = steps['standardscaler']
    est_key = 'fullblrconjugate' if 'fullblrconjugate' in steps else 'blr_gprior'
    est = steps[est_key]

    beta_std = est.beta_n_.copy()         # length p+1
    Vn_std   = est.Vn_.copy()             # (p+1)x(p+1)

    mu = scaler.mean_.astype(float)
    sig = scaler.scale_.astype(float)
    p = len(mu)

    # T maps [intercept, beta_std] -> [intercept_orig, beta_orig]:
    #   slope_orig_j   = slope_std_j / sig_j
    #   intercept_orig = intercept_std - sum_j slope_std_j * mu_j / sig_j
    T = np.zeros((p+1, p+1), dtype=float)
    T[0, 0] = 1.0
    T[0, 1:] = -mu / sig
    slope_idx = np.arange(1, p+1)
    T[slope_idx, slope_idx] = 1.0 / sig

    names = ["Intercept"]
    names.extend(feat_names)
    return names, T @ beta_std, T @ Vn_std @ T.T

#### CV, Refit, Test & Residuals

In [17]:
# CV, refit on train, test, residuals & coefficients
def eval_cfg_blr(factory, cfg, folds, verbose=False, label="BLR"):
    """Cross-validate one hyper-parameter config and summarise PL-domain scores.

    For every (train_idx, val_idx) pair in ``folds`` a fresh pipeline from
    ``factory(cfg)`` is fitted on the adjusted target and scored on both the
    training and the validation rows. Data come from the module-level arrays
    Xtr_lin, ytr_adj, ytr_pl and ftr_tr.

    Returns a dict holding ``cfg`` plus the mean and standard deviation
    (np.std default) across folds of train/val RMSE and R2.
    """
    scores = {"rmse_train": [], "rmse_val": [], "r2_train": [], "r2_val": []}
    if verbose:
        print(f"-- {label} | cfg: {cfg}")

    for fold_no, (tr_idx, val_idx) in enumerate(folds, start=1):
        X_fit, y_fit = Xtr_lin[tr_idx], ytr_adj[tr_idx]
        X_hold = Xtr_lin[val_idx]

        pipe = factory(cfg)
        # Keep BLAS single-threaded while fitting.
        with threadpool_limits(limits=1, user_api="blas"):
            pipe.fit(X_fit, y_fit)

        rmse_fit, r2_fit = rmse_r2_on_PL(ytr_pl[tr_idx], pipe.predict(X_fit), ftr_tr[tr_idx])
        scores["rmse_train"].append(rmse_fit)
        scores["r2_train"].append(r2_fit)

        rmse_hold, r2_hold = rmse_r2_on_PL(ytr_pl[val_idx], pipe.predict(X_hold), ftr_tr[val_idx])
        scores["rmse_val"].append(rmse_hold)
        scores["r2_val"].append(r2_hold)

        if verbose:
            print(f"   Fold {fold_no}: RMSE_train={rmse_fit:.4f}, RMSE_val={rmse_hold:.4f}")

    summary = {"cfg": cfg}
    for metric in ("rmse_train", "rmse_val", "r2_train", "r2_val"):
        summary[f"{metric}_mean"] = float(np.mean(scores[metric]))
        summary[f"{metric}_sd"] = float(np.std(scores[metric]))
    return summary

blr_results = []
residuals_list = []

# Test-set columns copied unchanged into every residual table, in output order.
_RESID_PASSTHROUGH = (
    "device_id", "distance", "frequency", "c_walls", "w_walls",
    "co2", "humidity", "pm25", "pressure", "temperature", "snr",
)

for name, factory, grid in blr_specs:
    print(f"=== {name} — grid size: {len(grid)}")

    # Evaluate every config: sequentially with fold logs, or in parallel threads.
    if not VERBOSE_CV:
        grid_results = Parallel(n_jobs=N_JOBS, backend="threading", prefer="threads", verbose=0)(
            delayed(eval_cfg_blr)(factory, cfg, folds, verbose=False, label=name) for cfg in grid
        )
    else:
        grid_results = [eval_cfg_blr(factory, cfg, folds, verbose=True, label=name) for cfg in grid]

    # Pick best by mean Val RMSE
    best = min(grid_results, key=lambda r: r["rmse_val_mean"])
    best_cv = dict(best)
    best_cfg = best_cv.pop("cfg")
    print(f"=== Best (Val RMSE) for {name}: {best_cv['rmse_val_mean']:.4f} ± {best_cv['rmse_val_sd']:.4f} | cfg={best_cfg}")

    # Final fit on all training data
    final_pipe = factory(best_cfg)
    with threadpool_limits(limits=1, user_api="blas"):
        final_pipe.fit(Xtr_lin, ytr_adj)

    # Test performance (PL domain)
    yte_pred_adj, yte_pred_std = final_pipe.predict(Xte_lin, return_std=True)
    test_rmse, test_r2 = rmse_r2_on_PL(yte_pl, yte_pred_adj, ftr_te)

    # Residuals on 20% test (frequency term added back to get PL)
    PL_pred  = yte_pred_adj + ftr_te
    resid_db = yte_pl - PL_pred

    # Tag used in file names: family name plus the winning hyper-parameters.
    tag_cfg = slug(best_cfg)
    base_tag = "Conjugate" if "Conjugate" in name else "gprior"
    model_tag = f"BLR_{base_tag}"
    if tag_cfg:
        model_tag = f"{model_tag}__{tag_cfg}"

    res_cols = {
        "model":  model_tag,
        "split":  "test",
        "row_id": np.arange(len(df_test), dtype=int),
        # 'time' is optional; fall back to an empty float column when absent.
        "time":   df_test.get("time", pd.Series(index=df_test.index, dtype=float)),
    }
    for _col in _RESID_PASSTHROUGH:
        res_cols[_col] = df_test[_col].values
    res_cols["PL_true"]  = yte_pl
    res_cols["PL_pred"]  = PL_pred
    res_cols["resid_db"] = resid_db
    res_df = pd.DataFrame(res_cols)
    residuals_list.append((model_tag, res_df))

    # Posterior means & std errors (original units)
    names, mean_orig, cov_orig = unscale_blr_posterior(final_pipe, LIN_COLS)
    se_orig = np.sqrt(np.clip(np.diag(cov_orig), 0.0, None))
    coef_tbl = pd.DataFrame({"mean": mean_orig, "std_err": se_orig}, index=names)

    blr_results.append({
        "model": name,
        "best_cfg": best_cfg,
        "cv": best_cv,
        "test": {"rmse": float(test_rmse), "r2": float(test_r2)},
        "final_pipe": final_pipe,
        "coef_tbl": coef_tbl,
        "coeffs": coef_tbl["mean"].copy(),   # for side-by-side display
        "model_tag": model_tag
    })

print("Done.")

=== BLR-Linear (Conjugate) — grid size: 5
-- BLR-Linear (Conjugate) | cfg: {'V0_scale': 100.0, 'a0': 0.01, 'b0': 0.01}
   Fold 1: RMSE_train=8.1216, RMSE_val=8.1188
   Fold 2: RMSE_train=8.1204, RMSE_val=8.1236
   Fold 3: RMSE_train=8.1230, RMSE_val=8.1133
   Fold 4: RMSE_train=8.1216, RMSE_val=8.1188
   Fold 5: RMSE_train=8.1186, RMSE_val=8.1311
-- BLR-Linear (Conjugate) | cfg: {'V0_scale': 1000.0, 'a0': 0.01, 'b0': 0.01}
   Fold 1: RMSE_train=8.1216, RMSE_val=8.1188
   Fold 2: RMSE_train=8.1204, RMSE_val=8.1236
   Fold 3: RMSE_train=8.1230, RMSE_val=8.1133
   Fold 4: RMSE_train=8.1216, RMSE_val=8.1188
   Fold 5: RMSE_train=8.1186, RMSE_val=8.1311
-- BLR-Linear (Conjugate) | cfg: {'V0_scale': 10000.0, 'a0': 0.01, 'b0': 0.01}
   Fold 1: RMSE_train=8.1216, RMSE_val=8.1188
   Fold 2: RMSE_train=8.1204, RMSE_val=8.1236
   Fold 3: RMSE_train=8.1230, RMSE_val=8.1133
   Fold 4: RMSE_train=8.1216, RMSE_val=8.1188
   Fold 5: RMSE_train=8.1186, RMSE_val=8.1311
-- BLR-Linear (Conjugate) | cfg: {

#### Save Best Pipeline & Residuals

In [19]:
# Save best BLR pipeline + residuals (CSV)
# Best-of-family (BLR) by mean Val RMSE (no test peeking)
best_overall = min(blr_results, key=lambda r: r["cv"]["rmse_val_mean"])
best_pipe = best_overall["final_pipe"]
best_tag  = best_overall["model_tag"]      # e.g., "BLR_Conjugate__V0_scale=..._a0=..._b0=..."
pipe_path = f"{MODELS_DIR}/{best_tag}.joblib"
dump(best_pipe, pipe_path)
print(f"[Models] Saved best BLR pipeline: {pipe_path}")

# Residuals per model (CSV) on 20% test
for tag, df_res in residuals_list:
    out_path = f"{RESID_DIR}/residuals__{tag}__test.csv"
    df_res.to_csv(out_path, index=False)
    print(f"[Residuals] Saved (csv): {out_path}")

# Canonical alias for the chosen/best BLR model
best_alias_path = f"{RESID_DIR}/residuals__BLR__BEST__test.csv"
best_res_df = next(df for tag, df in residuals_list if tag == best_tag)
best_res_df.to_csv(best_alias_path, index=False)
print(f"[Residuals] Saved (csv, alias for BEST): {best_alias_path}")

# OOF residuals for BEST BLR model
# Build train-only out-of-fold residuals using the same CV fold assignments
best_model_name  = best_overall["model"]
best_cfg         = best_overall["best_cfg"]
factory_for_best = next(f for (n, f, g) in blr_specs if n == best_model_name)

K = int(np.max(fold_assignments)) + 1
# Start from NaN rather than uninitialised memory, so a row that no fold ever
# validates is exported as missing instead of as an arbitrary number.
y_pred_adj_oof = np.full(ytr_adj.shape, np.nan, dtype=float)
_oof_filled = np.zeros(len(ytr_adj), dtype=bool)

for k in range(K):
    tr_idx = np.where(fold_assignments != k)[0]
    val_idx = np.where(fold_assignments == k)[0]
    pipe = factory_for_best(best_cfg)
    with threadpool_limits(limits=1, user_api="blas"):
        pipe.fit(Xtr_lin[tr_idx], ytr_adj[tr_idx])
    # BLR pipeline returns mean by default; no std needed for residuals
    y_pred_adj_oof[val_idx] = pipe.predict(Xtr_lin[val_idx])
    _oof_filled[val_idx] = True

_n_unfilled = int((~_oof_filled).sum())
if _n_unfilled:
    print(f"[Residuals] WARNING: {_n_unfilled} training rows are not in any validation fold; "
          f"their OOF predictions are NaN.")

PL_pred_oof = y_pred_adj_oof + ftr_tr
resid_oof   = ytr_pl - PL_pred_oof

res_oof_df = pd.DataFrame({
    "model":       best_tag,
    "split":       "oof",
    "row_id":      np.arange(len(df_train), dtype=int),
    "fold":        fold_assignments.astype(int),
    # 'time' is optional; fall back to an empty float column when absent.
    "time":        df_train.get("time", pd.Series(index=df_train.index, dtype=float)),
    "device_id":   df_train["device_id"].values,
    "distance":    df_train["distance"].values,
    "frequency":   df_train["frequency"].values,
    "c_walls":     df_train["c_walls"].values,
    "w_walls":     df_train["w_walls"].values,
    "co2":         df_train["co2"].values,
    "humidity":    df_train["humidity"].values,
    "pm25":        df_train["pm25"].values,
    "pressure":    df_train["pressure"].values,
    "temperature": df_train["temperature"].values,
    "snr":         df_train["snr"].values,
    "PL_true":     ytr_pl,
    "PL_pred":     PL_pred_oof,
    "resid_db":    resid_oof
})

oof_path = f"{RESID_DIR_OOF}/residuals__{best_tag}__oof.csv"
res_oof_df.to_csv(oof_path, index=False)
print(f"[Residuals] Saved OOF residuals (BLR best): {oof_path}")

# Canonical alias for BLR BEST
best_oof_alias = f"{RESID_DIR_OOF}/residuals__BLR__BEST__oof.csv"
res_oof_df.to_csv(best_oof_alias, index=False)
print(f"[Residuals] Saved (csv, alias for BLR BEST OOF): {best_oof_alias}")

[Models] Saved best BLR pipeline: ../Extended Parametric Regression Files+Plots/Models/MLR/BLR_Conjugate__V0_scale=100.0__a0=0.01__b0=0.01.joblib
[Residuals] Saved (csv): ../Extended Parametric Regression Files+Plots/Reports/Residuals_Test/residuals__BLR_Conjugate__V0_scale=100.0__a0=0.01__b0=0.01__test.csv
[Residuals] Saved (csv): ../Extended Parametric Regression Files+Plots/Reports/Residuals_Test/residuals__BLR_gprior__a0=0.01__b0=0.01__g_mode=eb__test.csv
[Residuals] Saved (csv, alias for BEST): ../Extended Parametric Regression Files+Plots/Reports/Residuals_Test/residuals__BLR__BEST__test.csv
[Residuals] Saved OOF residuals (BLR best): ../Extended Parametric Regression Files+Plots/Reports/Residuals_OOF/residuals__BLR_Conjugate__V0_scale=100.0__a0=0.01__b0=0.01__oof.csv
[Residuals] Saved (csv, alias for BLR BEST OOF): ../Extended Parametric Regression Files+Plots/Reports/Residuals_OOF/residuals__BLR__BEST__oof.csv


#### CV Summary Table

In [21]:
def fmt(mu, sd):
    """Format a mean and a spread as 'mean ± sd' with four decimals each."""
    return "{:.4f} ± {:.4f}".format(mu, sd)

# One row per model family: mean ± sd across folds for each CV metric.
cv_rows = [
    {
        "Model":        res['model'],
        "RMSE (Train)": fmt(res['cv']["rmse_train_mean"], res['cv']["rmse_train_sd"]),
        "RMSE (Val)":   fmt(res['cv']["rmse_val_mean"],   res['cv']["rmse_val_sd"]),
        "R2 (Train)":   fmt(res['cv']["r2_train_mean"],   res['cv']["r2_train_sd"]),
        "R2 (Val)":     fmt(res['cv']["r2_val_mean"],     res['cv']["r2_val_sd"]),
    }
    for res in blr_results
]
cv_blr_df = pd.DataFrame(cv_rows)
print("\n=== Cross-Validation Results (BLR, linear basis) ===")
display(cv_blr_df)


=== Cross-Validation Results (BLR, linear basis) ===


Unnamed: 0,Model,RMSE (Train),RMSE (Val),R2 (Train),R2 (Val)
0,BLR-Linear (Conjugate),8.1211 ± 0.0015,8.1211 ± 0.0060,0.8147 ± 0.0001,0.8147 ± 0.0004
1,BLR-Linear (g-prior),8.1211 ± 0.0015,8.1211 ± 0.0060,0.8147 ± 0.0001,0.8147 ± 0.0004


#### Test Summary

In [23]:
# One row per model family: held-out test RMSE and R2 of the final refit.
test_rows = [
    {
        "Model":     res['model'],
        "Test RMSE": f"{res['test']['rmse']:.4f}",
        "Test R2":   f"{res['test']['r2']:.4f}",
    }
    for res in blr_results
]

test_blr_df = pd.DataFrame(test_rows)
print("\n=== Test (final fit on all train) ===")
display(test_blr_df)


=== Test (final fit on all train) ===


Unnamed: 0,Model,Test RMSE,Test R2
0,BLR-Linear (Conjugate),8.1133,0.8149
1,BLR-Linear (g-prior),8.1133,0.8149


#### Coefficient Table

In [25]:
# Side-by-side posterior means: one column per model family, one row per coefficient.
_coef_columns = []
_coef_labels = []
for _entry in blr_results:
    _coef_columns.append(_entry['coeffs'])
    _coef_labels.append(_entry['model'])

coef_blr_df = pd.concat(_coef_columns, axis=1)
coef_blr_df.columns = _coef_labels

print("\n=== Posterior Means (original units) — BLR, linear basis ===")
display(coef_blr_df)


=== Posterior Means (original units) — BLR, linear basis ===


Unnamed: 0,BLR-Linear (Conjugate),BLR-Linear (g-prior)
Intercept,-0.437634,-0.43763
z_d,3.839249,3.839249
c_walls,6.753458,6.753457
w_walls,1.991284,1.991284
co2,-0.002507,-0.002507
humidity,-0.068243,-0.068243
pm25,-0.070585,-0.070585
pressure,-0.002642,-0.002642
temperature,-0.101301,-0.101301
snr,-2.063356,-2.063355
