#  Regularized MLR (OLS / Ridge / Lasso / ElasticNet) 

#### Imports — core utils, data wrangling, ML, and stats

In [1]:
# SPEED HEADER (7950X, sklearn CV parallel)
import os
# To prevent BLAS oversubscription; we parallelize at sklearn/joblib level
os.environ["MKL_NUM_THREADS"] = "1"
os.environ["OPENBLAS_NUM_THREADS"] = "1"
os.environ["OMP_NUM_THREADS"] = "1"
os.environ["NUMEXPR_MAX_THREADS"] = "1"

# Parallelism knob
N_JOBS = 18

# Core + utils
from math import sqrt
import re

# Data handling
import numpy as np
import pandas as pd
from IPython.display import display

# ML (scikit-learn)
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.metrics import mean_squared_error, r2_score

# Parallel
from joblib import Parallel, delayed, dump

# Stats (statsmodels)
import statsmodels.formula.api as smf
from statsmodels.stats.anova import anova_lm

#### Data paths, load, and prep

In [2]:
# Paths (aligned with earlier prep)
SAVE_DIR   = '../Extended Parametric Regression Files+Plots'
TRAIN_CSV  = f'{SAVE_DIR}/train.csv'
TEST_CSV   = f'{SAVE_DIR}/test.csv'
FOLDS_NPY  = f'{SAVE_DIR}/train_folds.npy'

# Load splits
df_train = pd.read_csv(TRAIN_CSV)
df_test  = pd.read_csv(TEST_CSV)
fold_assignments = np.load(FOLDS_NPY)

# Feature/target setup (physics-consistent)
raw_feats  = ['distance','frequency','c_walls','w_walls',
              'co2','humidity','pm25','pressure','temperature','snr']
target_col = 'PL'

# Train/test matrices
Xtr_raw = df_train[raw_feats].copy()
ytr_pl  = df_train[target_col].astype(float).values
Xte_raw = df_test[raw_feats].copy()
yte_pl  = df_test[target_col].astype(float).values

# Output paths for models & residuals
MODELS_DIR = "../Extended Parametric Regression Files+Plots/Models/MLR"           # one CV-chosen best pipeline for MLR models
RESID_DIR  = "../Extended Parametric Regression Files+Plots/Reports/Residuals_Test"    # residuals per model (from the 80/20 test) 
RESID_DIR_OOF = "../Extended Parametric Regression Files+Plots/Reports/Residuals_OOF"  # OOF Residuals

os.makedirs(MODELS_DIR, exist_ok=True)
os.makedirs(RESID_DIR,  exist_ok=True)
os.makedirs(RESID_DIR_OOF, exist_ok=True)

def slug(obj):
    """Filename-safe tag for model/config (kept readable)."""
    if isinstance(obj, dict) and obj:
        items = []
        for k in sorted(obj.keys()):
            v = obj[k]
            if isinstance(v, (float, np.floating)): v = float(v)
            items.append(f"{k}={v}")
        s = "__".join(items)
    else:
        s = str(obj) if obj not in (None, {}, []) else ""
    return re.sub(r"[^A-Za-z0-9._=-]+", "_", s).strip("_")

#### Physics-consistent linearization + helpers + model specs

In [3]:
# Linearize (Friis-adjusted): y_adj = PL - 20*log10(f)
d0 = 1.0
def z_of_d(d): 
    return 10.0*np.log10(np.clip(d.astype(float), 1e-6, None)/d0)
def f_term(f):
    return 20.0*np.log10(np.clip(f.astype(float), 1e-12, None))

# Adjusted targets
ftr_tr, ftr_te = f_term(Xtr_raw['frequency'].values), f_term(Xte_raw['frequency'].values)
ytr_adj, yte_adj = ytr_pl - ftr_tr, yte_pl - ftr_te

# Linear feature maps
cols = ['z_d','c_walls','w_walls','co2','humidity','pm25','pressure','temperature','snr']
Xtr_lin = pd.DataFrame({
    'z_d': z_of_d(Xtr_raw['distance'].values),
    'c_walls': Xtr_raw['c_walls'].values,
    'w_walls': Xtr_raw['w_walls'].values,
    'co2': Xtr_raw['co2'].values,
    'humidity': Xtr_raw['humidity'].values,
    'pm25': Xtr_raw['pm25'].values,
    'pressure': Xtr_raw['pressure'].values,
    'temperature': Xtr_raw['temperature'].values,
    'snr': Xtr_raw['snr'].values
}, columns=cols).values
Xte_lin = pd.DataFrame({
    'z_d': z_of_d(Xte_raw['distance'].values),
    'c_walls': Xte_raw['c_walls'].values,
    'w_walls': Xte_raw['w_walls'].values,
    'co2': Xte_raw['co2'].values,
    'humidity': Xte_raw['humidity'].values,
    'pm25': Xte_raw['pm25'].values,
    'pressure': Xte_raw['pressure'].values,
    'temperature': Xte_raw['temperature'].values,
    'snr': Xte_raw['snr'].values
}, columns=cols).values

# Param labels (for reporting)
param_names = [
    'PL(d0) [dB]', 'Path loss exponent (n)',
    'Brick Wall Loss (L_c) [dB]', 'Wood Wall Loss (L_w) [dB]',
    'CO2 coef. [dB/unit]', 'Humidity coef. [dB/%]',
    'PM2.5 coef. [dB/µg/m³]', 'Pressure coef. [dB/hPa]',
    'Temp. coef. [dB/°C]', 'SNR scaling (k_snr)'
]

# Helpers
def unscale_coefficients(pipeline):
    """Undo StandardScaler effect → coeffs in original units."""
    steps = pipeline.named_steps
    est = steps.get('ridge') or steps.get('lasso') or steps.get('elasticnet') or steps.get('linearregression')
    if 'standardscaler' not in steps:
        return float(est.intercept_), est.coef_.astype(float).copy()
    scaler = steps['standardscaler']
    beta_scaled = est.coef_.astype(float)
    mu, sig = scaler.mean_, scaler.scale_
    beta_orig = beta_scaled / sig
    intercept_orig = float(est.intercept_ - np.sum(beta_scaled * mu / sig))
    return intercept_orig, beta_orig

def fold_indices(folds, k):
    val_idx = np.where(folds == k)[0]
    tr_idx  = np.where(folds != k)[0]
    return tr_idx, val_idx

def rmse_r2_on_PL(y_true_pl, y_pred_adj, fterm):
    """Score in PL-domain (add back freq term)."""
    y_pred_pl = y_pred_adj + fterm
    rmse = sqrt(mean_squared_error(y_true_pl, y_pred_pl))
    r2   = r2_score(y_true_pl, y_pred_pl)
    return rmse, r2

# Model factories
def make_OLS(_): return make_pipeline(LinearRegression())
def make_Ridge(cfg): return make_pipeline(StandardScaler(), Ridge(alpha=cfg["alpha"], random_state=42))
def make_Lasso(cfg): return make_pipeline(StandardScaler(), Lasso(alpha=cfg["alpha"], max_iter=20000, random_state=42))
def make_ElasticNet(cfg): return make_pipeline(
    StandardScaler(),
    ElasticNet(alpha=cfg["alpha"], l1_ratio=cfg["l1_ratio"], max_iter=20000, random_state=42)
)

# Grids
ridge_grid = [dict(alpha=a) for a in np.logspace(-4, 3, 15)]
lasso_grid = [dict(alpha=a) for a in np.logspace(-4, 1, 15)]
enet_grid  = [dict(alpha=a, l1_ratio=r) for a in np.logspace(-4, 1, 10) for r in (0.2, 0.5, 0.8)]

# Spec list
specs = [
    ("OLS",        make_OLS,        [dict()]),
    ("Ridge",      make_Ridge,      ridge_grid),
    ("Lasso",      make_Lasso,      lasso_grid),
    ("ElasticNet", make_ElasticNet, enet_grid),
]

#### CV search → best hyperparams → refit on full train → evaluate on test

In [4]:
# K-fold CV over each spec; pick by mean Val RMSE; refit on all train; score on test
# also collects final_pipe and builds residuals per model
results = []
K = int(np.max(fold_assignments)) + 1

# Precompute indices once (saves a bit)
folds = [fold_indices(fold_assignments, k) for k in range(K)]

residuals_list = []  # collect per-model test residual DataFrames for saving

def eval_cfg(factory, cfg, folds):
    tr_rmse_list, val_rmse_list, tr_r2_list, val_r2_list = [], [], [], []
    for tr_idx, val_idx in folds:
        X_tr, X_val = Xtr_lin[tr_idx], Xtr_lin[val_idx]
        y_tr, y_val = ytr_adj[tr_idx], ytr_adj[val_idx]
        ypl_tr, ypl_val = ytr_pl[tr_idx], ytr_pl[val_idx]
        f_tr,  f_val  = ftr_tr[tr_idx],  ftr_tr[val_idx]

        pipe = factory(cfg)
        pipe.fit(X_tr, y_tr)

        y_tr_pred_adj = pipe.predict(X_tr)
        rmse_tr, r2_tr = rmse_r2_on_PL(ypl_tr, y_tr_pred_adj, f_tr)
        tr_rmse_list.append(rmse_tr); tr_r2_list.append(r2_tr)

        y_val_pred_adj = pipe.predict(X_val)
        rmse_val, r2_val = rmse_r2_on_PL(ypl_val, y_val_pred_adj, f_val)
        val_rmse_list.append(rmse_val); val_r2_list.append(r2_val)

    return {
        "cfg": cfg,
        "rmse_train_mean": float(np.mean(tr_rmse_list)), "rmse_train_sd": float(np.std(tr_rmse_list)),
        "rmse_val_mean":   float(np.mean(val_rmse_list)), "rmse_val_sd":   float(np.std(val_rmse_list)),
        "r2_train_mean":   float(np.mean(tr_r2_list)),    "r2_train_sd":    float(np.std(tr_r2_list)),
        "r2_val_mean":     float(np.mean(val_r2_list)),   "r2_val_sd":      float(np.std(val_r2_list)),
    }

for name, factory, grid in specs:
    if len(grid) == 1:  # OLS: no sweep
        grid_results = [eval_cfg(factory, grid[0], folds)]
    else:
        # Parallelize across configs; BLAS is single-threaded (from header), so this scales well.
        grid_results = Parallel(n_jobs=N_JOBS, backend="threading", prefer="threads", verbose=0)(
            delayed(eval_cfg)(factory, cfg, folds) for cfg in grid
        )

    best_res = min(grid_results, key=lambda r: r["rmse_val_mean"])
    best_cfg, best_cv = best_res["cfg"], {k: v for k, v in best_res.items() if k != "cfg"}

    # Refit on all training data with best hyperparams
    final_pipe = factory(best_cfg)
    final_pipe.fit(Xtr_lin, ytr_adj)

    # Test metrics (PL domain)
    yte_pred_adj = final_pipe.predict(Xte_lin)
    test_rmse, test_r2 = rmse_r2_on_PL(yte_pl, yte_pred_adj, ftr_te)

    # ---- build residuals on the 20% test split (uniform schema) ----
    PL_pred  = yte_pred_adj + ftr_te
    resid_db = yte_pl - PL_pred
    tag_cfg  = slug(best_cfg)
    model_tag = f"MLR_{name}" if tag_cfg == "" else f"MLR_{name}__{tag_cfg}"

    res_df = pd.DataFrame({
        "model":       model_tag,
        "split":       "test",
        "row_id":      np.arange(len(df_test), dtype=int),
        "time":        df_test.get("time", pd.Series(index=df_test.index, dtype=float)),
        "device_id":   df_test["device_id"].values,
        "distance":    df_test["distance"].values,
        "frequency":   df_test["frequency"].values,
        "c_walls":     df_test["c_walls"].values,
        "w_walls":     df_test["w_walls"].values,
        "co2":         df_test["co2"].values,
        "humidity":    df_test["humidity"].values,
        "pm25":        df_test["pm25"].values,
        "pressure":    df_test["pressure"].values,
        "temperature": df_test["temperature"].values,
        "snr":         df_test["snr"].values,
        "PL_true":     yte_pl,
        "PL_pred":     PL_pred,
        "resid_db":    resid_db
    })
    residuals_list.append((model_tag, res_df))

    # Coefficients back in original units
    intercept_orig, beta_orig = unscale_coefficients(final_pipe)
    coeffs = np.concatenate(([intercept_orig], beta_orig))
    coeffs_series = pd.Series(coeffs, index=param_names, name=name)

    results.append({
        "model":       name,
        "best_cfg":    best_cfg,
        "cv":          best_cv,
        "test":        {"rmse": float(test_rmse), "r2": float(test_r2)},
        "coeffs":      coeffs_series,
        "final_pipe":  final_pipe,   # keep the fitted pipeline (for best-of-family save)
        "model_tag":   model_tag     # tag used for filenames
    })

# Save ONE best pipeline (by CV) + residuals per model (test split)

# Best-of-family (MLR) by mean Val RMSE (no test peeking)
best_overall = min(results, key=lambda r: r["cv"]["rmse_val_mean"])
best_pipe = best_overall["final_pipe"]
best_tag  = best_overall["model_tag"]
pipe_path = f"{MODELS_DIR}/{best_tag}.joblib"
dump(best_pipe, pipe_path)
print(f"\n[Models] Saved best MLR pipeline: {pipe_path} \n")

# Residuals per model (CSV only)
for tag, df_res in residuals_list:
    out_path = f"{RESID_DIR}/residuals__{tag}__test.csv"
    df_res.to_csv(out_path, index=False)
    print(f"[Residuals] Saved (csv): {out_path}")

# Canonical alias for the chosen/best MLR model for SF/FM consumption
best_alias_path = f"{RESID_DIR}/residuals__MLR__BEST__test.csv"
best_res_df = next(df for tag, df in residuals_list if tag == best_tag)
best_res_df.to_csv(best_alias_path, index=False)
print(f"\n [Residuals] Saved (csv, alias for BEST): {best_alias_path}\n")

# Recover factory + cfg for the best model family
best_model_name = best_overall["model"]
best_cfg        = best_overall["best_cfg"]
factory_for_best = next(f for (n, f, g) in specs if n == best_model_name)

# Build true out-of-fold predictions on TRAIN (using the CV folds)
y_pred_adj_oof = np.empty_like(ytr_adj, dtype=float)
for tr_idx, val_idx in folds:
    pipe = factory_for_best(best_cfg)
    pipe.fit(Xtr_lin[tr_idx], ytr_adj[tr_idx])
    y_pred_adj_oof[val_idx] = pipe.predict(Xtr_lin[val_idx])

# Back to PL domain and residuals
PL_pred_oof = y_pred_adj_oof + ftr_tr
resid_oof   = ytr_pl - PL_pred_oof

# Save OOF residuals (schema mirrors test, plus 'fold')
res_oof_df = pd.DataFrame({
    "model":       best_tag,
    "split":       "oof",
    "row_id":      np.arange(len(df_train), dtype=int),
    "fold":        fold_assignments.astype(int),
    "time":        df_train.get("time", pd.Series(index=df_train.index, dtype=float)),
    "device_id":   df_train["device_id"].values,
    "distance":    df_train["distance"].values,
    "frequency":   df_train["frequency"].values,
    "c_walls":     df_train["c_walls"].values,
    "w_walls":     df_train["w_walls"].values,
    "co2":         df_train["co2"].values,
    "humidity":    df_train["humidity"].values,
    "pm25":        df_train["pm25"].values,
    "pressure":    df_train["pressure"].values,
    "temperature": df_train["temperature"].values,
    "snr":         df_train["snr"].values,
    "PL_true":     ytr_pl,
    "PL_pred":     PL_pred_oof,
    "resid_db":    resid_oof
})
oof_path = f"{RESID_DIR_OOF}/residuals__{best_tag}__oof.csv"
res_oof_df.to_csv(oof_path, index=False)
print(f"[Residuals] Saved OOF residuals (best model): {oof_path}")

# Canonical alias for BEST
best_oof_alias = f"{RESID_DIR_OOF}/residuals__MLR__BEST__oof.csv"
res_oof_df.to_csv(best_oof_alias, index=False)
print(f"[Residuals] Saved (csv, alias for BEST OOF): {best_oof_alias}")


[Models] Saved best MLR pipeline: ../Extended Parametric Regression Files+Plots/Models/MLR/MLR_Ridge__alpha=3.1622776601683795.joblib 

[Residuals] Saved (csv): ../Extended Parametric Regression Files+Plots/Reports/Residuals_Test/residuals__MLR_OLS__test.csv
[Residuals] Saved (csv): ../Extended Parametric Regression Files+Plots/Reports/Residuals_Test/residuals__MLR_Ridge__alpha=3.1622776601683795__test.csv
[Residuals] Saved (csv): ../Extended Parametric Regression Files+Plots/Reports/Residuals_Test/residuals__MLR_Lasso__alpha=0.0001__test.csv
[Residuals] Saved (csv): ../Extended Parametric Regression Files+Plots/Reports/Residuals_Test/residuals__MLR_ElasticNet__alpha=0.0001__l1_ratio=0.8__test.csv

 [Residuals] Saved (csv, alias for BEST): ../Extended Parametric Regression Files+Plots/Reports/Residuals_Test/residuals__MLR__BEST__test.csv

[Residuals] Saved OOF residuals (best model): ../Extended Parametric Regression Files+Plots/Reports/Residuals_OOF/residuals__MLR_Ridge__alpha=3.1622

####  CV summary (train/val RMSE + R²)

In [5]:
def fmt(mu, sd): 
    return f"{mu:.4f} ± {sd:.4f}"

# Collect per-model CV stats
cv_rows = []
for res in results:
    cv = res['cv']
    cv_rows.append({
        "Model":       res['model'],
        "RMSE (Train)": fmt(cv["rmse_train_mean"], cv["rmse_train_sd"]),
        "RMSE (Val)":   fmt(cv["rmse_val_mean"],   cv["rmse_val_sd"]),
        "R2 (Train)":   fmt(cv["r2_train_mean"],   cv["r2_train_sd"]),
        "R2 (Val)":     fmt(cv["r2_val_mean"],     cv["r2_val_sd"]),
    })

# Wrap in DataFrame for organized display
cv_df = pd.DataFrame(cv_rows)
print("\n=== Cross-Validation Results (training set) ===")
display(cv_df)


=== Cross-Validation Results (training set) ===


Unnamed: 0,Model,RMSE (Train),RMSE (Val),R2 (Train),R2 (Val)
0,OLS,8.0971 ± 0.0027,8.0972 ± 0.0109,0.8149 ± 0.0001,0.8149 ± 0.0005
1,Ridge,8.0971 ± 0.0027,8.0972 ± 0.0109,0.8149 ± 0.0001,0.8149 ± 0.0005
2,Lasso,8.0971 ± 0.0027,8.0972 ± 0.0109,0.8149 ± 0.0001,0.8149 ± 0.0005
3,ElasticNet,8.0971 ± 0.0027,8.0972 ± 0.0109,0.8149 ± 0.0001,0.8149 ± 0.0005


#### Test summary (final fit on full train)

In [6]:
# Collect per-model test scores
test_rows = []
for res in results:
    te = res['test']
    test_rows.append({
        "Model": res['model'],
        "Test RMSE": f"{te['rmse']:.4f}",
        "Test R2":   f"{te['r2']:.4f}"
    })

# Frame + display
test_df = pd.DataFrame(test_rows)
print("\n=== Test (final fit on all train) ===")
display(test_df)


=== Test (final fit on all train) ===


Unnamed: 0,Model,Test RMSE,Test R2
0,OLS,8.0738,0.8155
1,Ridge,8.0738,0.8155
2,Lasso,8.0738,0.8155
3,ElasticNet,8.0739,0.8155


#### Coefficient table (all models, final fits, original units)

In [7]:
# Collect coeffs side by side
coef_df = pd.concat([res['coeffs'] for res in results], axis=1)
coef_df.columns = [res['model'] for res in results]

print("\n=== Harmonized Coefficients (final all-train fits, original units) ===")
display(coef_df)


=== Harmonized Coefficients (final all-train fits, original units) ===


Unnamed: 0,OLS,Ridge,Lasso,ElasticNet
PL(d0) [dB],-4.303158,-4.302936,-4.302045,-4.299936
Path loss exponent (n),4.047943,4.047918,4.048023,4.047754
Brick Wall Loss (L_c) [dB],6.24009,6.240106,6.239798,6.240027
Wood Wall Loss (L_w) [dB],1.727841,1.727872,1.72768,1.728029
CO2 coef. [dB/unit],-0.002313,-0.002313,-0.002312,-0.002313
Humidity coef. [dB/%],-0.083998,-0.083998,-0.083979,-0.083976
PM2.5 coef. [dB/µg/m³],-0.046312,-0.046309,-0.046272,-0.046254
Pressure coef. [dB/hPa],0.004836,0.004835,0.004823,0.004825
Temp. coef. [dB/°C],-0.1046,-0.104601,-0.104569,-0.104582
SNR scaling (k_snr),-2.07009,-2.070087,-2.070068,-2.070038


#### Chosen hyperparameters (picked by mean Val RMSE)

In [8]:
# Collect best config per model
hp_rows = []
for res in results:
    cfg = res['best_cfg']
    hp_rows.append({"Model": res['model'], **({} if cfg is None else cfg)})

# Frame + display
hp_df = pd.DataFrame(hp_rows).fillna("—")
print("\n=== Chosen Hyperparameters (by mean Val RMSE) ===")
display(hp_df)


=== Chosen Hyperparameters (by mean Val RMSE) ===


Unnamed: 0,Model,alpha,l1_ratio
0,OLS,—,—
1,Ridge,3.162278,—
2,Lasso,0.0001,—
3,ElasticNet,0.0001,0.8


##  ANOVA for MLR (physics form)

#### OLS analysis frames + formulas (PL adj by Friis; linear predictors)

In [9]:
# Build analysis DataFrame (PL_adj + linear drivers)
dfA = pd.DataFrame({
    "PL_adj":       df_train[target_col].astype(float).values - f_term(df_train["frequency"].values),
    "z_d":          z_of_d(df_train["distance"].values),
    "c_walls":      df_train["c_walls"].values,
    "w_walls":      df_train["w_walls"].values,
    "co2":          df_train["co2"].values,
    "humidity":     df_train["humidity"].values,
    "pm25":         df_train["pm25"].values,
    "pressure":     df_train["pressure"].values,
    "temperature":  df_train["temperature"].values,
    "snr":          df_train["snr"].values,
})

# Formulas
full_terms        = "z_d + c_walls + w_walls + co2 + humidity + pm25 + pressure + temperature + snr"
formula_full      = f"PL_adj ~ {full_terms}"
struct_terms      = "z_d + c_walls + w_walls"
formula_struct    = f"PL_adj ~ {struct_terms}"
env_terms         = "co2 + humidity + pm25 + pressure + temperature"
formula_structenv = f"PL_adj ~ {struct_terms} + {env_terms}"

# Fit OLS (HC3 robust SEs)
model_full   = smf.ols(formula=formula_full,      data=dfA).fit(cov_type="HC3")
model_struct = smf.ols(formula=formula_struct,    data=dfA).fit(cov_type="HC3")
model_env    = smf.ols(formula=formula_structenv, data=dfA).fit(cov_type="HC3")

#### ANOVA (Type II, HC3 robust)

In [10]:
# Run Type-II ANOVA (HC3)
anova_type2 = anova_lm(model_full, typ=2, robust="hc3")

print("\n=== Type-II ANOVA (HC3 robust) — additive linear model ===")
display(anova_type2)


=== Type-II ANOVA (HC3 robust) — additive linear model ===


Unnamed: 0,sum_sq,df,F,PR(>F)
z_d,31903220.0,1.0,486600.46478,0.0
c_walls,17617300.0,1.0,268706.013042,0.0
w_walls,2771150.0,1.0,42266.674888,0.0
co2,106311.6,1.0,1621.506613,0.0
humidity,409737.2,1.0,6249.474185,0.0
pm25,16551.29,1.0,252.446828,7.677337e-57
pressure,2598.599,1.0,39.634852,3.062462e-10
temperature,244353.4,1.0,3726.974244,0.0
snr,17836260.0,1.0,272045.645932,0.0
Residual,109083100.0,1663778.0,,


#### ANOVA (Type III, HC3 robust)

In [11]:
# Run Type-III ANOVA (HC3)
anova_type3 = anova_lm(model_full, typ=3, robust="hc3")

print("\n=== Type-III ANOVA (HC3 robust) — additive linear model ===")
display(anova_type3)


=== Type-III ANOVA (HC3 robust) — additive linear model ===


Unnamed: 0,sum_sq,df,F,PR(>F)
Intercept,16307.87,1.0,248.734133,4.948598e-56
z_d,31903220.0,1.0,486600.46478,0.0
c_walls,17617300.0,1.0,268706.013042,0.0
w_walls,2771150.0,1.0,42266.674888,0.0
co2,106311.6,1.0,1621.506613,0.0
humidity,409737.2,1.0,6249.474185,0.0
pm25,16551.29,1.0,252.446828,7.677337e-57
pressure,2598.599,1.0,39.634852,3.062462e-10
temperature,244353.4,1.0,3726.974244,0.0
snr,17836260.0,1.0,272045.645932,0.0


#### Nested partial-F tests (RSS-based, classical)

In [12]:
# Refit non-robust (needed for compare_f_test → uses RSS/df)
model_full_nr   = smf.ols(formula=formula_full,      data=dfA).fit()
model_struct_nr = smf.ols(formula=formula_struct,    data=dfA).fit()
model_env_nr    = smf.ols(formula=formula_structenv, data=dfA).fit()

# (A) struct → +env
F_A, p_A, _ = model_full_nr.compare_f_test(model_struct_nr)
df_num_A = model_full_nr.df_model - model_struct_nr.df_model
df_den_A = model_full_nr.df_resid
partial_eta2_A = (F_A * df_num_A) / (F_A * df_num_A + df_den_A)

# (B) +env → +snr
F_B, p_B, _ = model_full_nr.compare_f_test(model_env_nr)
df_num_B = model_full_nr.df_model - model_env_nr.df_model
df_den_B = model_full_nr.df_resid
partial_eta2_B = (F_B * df_num_B) / (F_B * df_num_B + df_den_B)

print("\n=== Nested block tests (partial-F on RSS) ===")
print(f"(A) Struct → +Env:  F = {F_A:.4f}, df = ({int(df_num_A)}, {int(df_den_A)}), "
      f"p = {p_A:.3e}, partial η² = {partial_eta2_A:.4f}")
print(f"(B) +Env   → +SNR:  F = {F_B:.4f}, df = ({int(df_num_B)}, {int(df_den_B)}), "
      f"p = {p_B:.3e}, partial η² = {partial_eta2_B:.4f}")


=== Nested block tests (partial-F on RSS) ===
(A) Struct → +Env:  F = 231850.5230, df = (6, 1663778), p = 0.000e+00, partial η² = 0.4554
(B) +Env   → +SNR:  F = 1368200.6716, df = (1, 1663778), p = 0.000e+00, partial η² = 0.4513


#### HC3 coefficient table (interpretable params)

In [13]:
# Extract HC3-robust coefficients + CIs
coef_tbl = model_full.get_robustcov_results(cov_type="HC3").summary2().tables[1].copy()
coef_tbl.rename(columns={
    "Coef.": "coef",
    "Std.Err.": "std_err",
    "P>|t|": "pval",
    "[0.025": "ci_low",
    "0.975]": "ci_high"
}, inplace=True)

print("\n=== Coefficients (HC3 robust) — key parameters ===")
display(coef_tbl.loc[['Intercept','z_d','c_walls','w_walls',
                      'co2','humidity','pm25','pressure','temperature','snr']])


=== Coefficients (HC3 robust) — key parameters ===


Unnamed: 0,coef,std_err,z,P>|z|,ci_low,ci_high
Intercept,-4.303158,0.272847,-15.771307,4.902446e-56,-4.837929,-3.768388
z_d,4.047943,0.005803,697.567534,0.0,4.036569,4.059316
c_walls,6.24009,0.012038,518.368607,0.0,6.216496,6.263684
w_walls,1.727841,0.008404,205.588606,0.0,1.711369,1.744314
co2,-0.002313,5.7e-05,-40.267935,0.0,-0.002426,-0.002201
humidity,-0.083998,0.001063,-79.053616,0.0,-0.086081,-0.081916
pm25,-0.046312,0.002915,-15.888575,7.603601e-57,-0.052025,-0.040599
pressure,0.004836,0.000768,6.295622,3.061703e-10,0.00333,0.006341
temperature,-0.1046,0.001713,-61.04895,0.0,-0.107958,-0.101242
snr,-2.07009,0.003969,-521.579952,0.0,-2.077869,-2.062311
