<p style="font-family: 'Courier New', Courier, monospace; font-size: 30px; font-weight: bold; color: blue;  text-align: left;">
 Parametric Fitting + Regularization Methods for Indoor LoRaWAN Signal Propagation 
</p>

In [2]:
# ============================== Core & Data Libraries ==============================
import os
import pickle
import numpy as np
import pandas as pd

# ============================== Machine Learning & Stats ===========================
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.linear_model import LinearRegression, RidgeCV, LassoCV, ElasticNetCV
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.pipeline import make_pipeline
from sklearn.base import BaseEstimator, RegressorMixin, clone
from numpy.linalg import inv

# neat prints
np.set_printoptions(suppress=True, linewidth=120)

# fixed seed for CV components that use randomness
RANDOM_STATE = 42

###  Load data + fold index + basic split

In [4]:
# Path to the standardized database directory
base_path = '../Extended Parametric Regression Files+Plots.'

# Load train and test splits# Path to standardized data
base_path = '../Extended Parametric Regression Files+Plots.'

# Train/test splits
df_train = pd.read_csv(f"{base_path}/train.csv")
df_test  = pd.read_csv(f"{base_path}/test.csv")

df_train = df_train.sort_values('time').reset_index(drop=True).iloc[::20].reset_index(drop=True)
df_test  = df_test.sort_values('time').reset_index(drop=True).iloc[::20].reset_index(drop=True)


# Raw feature set (distance, freq, walls, env, snr)
feature_names = [
    'distance', 'frequency', 'c_walls', 'w_walls', 'co2', 'humidity',
    'pm25', 'pressure', 'temperature', 'snr'
]

X_train_raw = df_train[feature_names].values
y_train     = df_train['PL'].values
X_test_raw  = df_test[feature_names].values
y_test      = df_test['PL'].values

# Optional time columns if needed later
time_train = df_train.get('time', pd.Series(np.arange(len(df_train)))).values
time_test  = df_test.get('time',  pd.Series(np.arange(len(df_test)))).values

print(f"Training samples: {X_train_raw.shape[0]}, Test samples: {X_test_raw.shape[0]}")

# Leakage-safe fold assignments (grouped-by-device, time-blocked) prepared offline
fold_assignments = np.load(f"{base_path}/train_folds.npy")
unique, counts = np.unique(fold_assignments, return_counts=True)
print("Fold distribution:", dict(zip(unique, counts)))

Training samples: 60483, Test samples: 15121
Fold distribution: {np.int64(0): np.int64(241929), np.int64(1): np.int64(241929), np.int64(2): np.int64(241929), np.int64(3): np.int64(241928), np.int64(4): np.int64(241928)}


### Model linearization (log-distance + freq offset)

In [6]:
# Linearization separates the non-linear frequency term and transforms distance
d0 = 1.0# Linearize LDPL: move 20*log10(f) to RHS, transform distance to 10*log10(d/d0)
d0 = 1.0

# Train transforms
log_d_train  = np.log10(np.clip(X_train_raw[:, 0], 1e-9, None) / d0)
offset_train = 20.0 * np.log10(np.clip(X_train_raw[:, 1], 1e-9, None))  # frequency offset
X_lin_train  = np.column_stack([
    10.0 * log_d_train,           # distance term for exponent n
    X_train_raw[:, 2:10]          # c_walls, w_walls, co2, humidity, pm25, pressure, temperature, snr
])
y_train_adj  = y_train - offset_train

# Test transforms
log_d_test  = np.log10(np.clip(X_test_raw[:, 0], 1e-9, None) / d0)
offset_test = 20.0 * np.log10(np.clip(X_test_raw[:, 1], 1e-9, None))
X_lin_test  = np.column_stack([
    10.0 * log_d_test,
    X_test_raw[:, 2:10]
])
y_test_adj  = y_test - offset_test

p = X_lin_train.shape[1]
print(f"Linearized feature dimensionality (p): {p}")

Linearized feature dimensionality (p): 9


### Model builders (linear + poly(2))

In [8]:
# ---------- Linear (degree=1): OLS + L2 + L1 + EN ----------
def build_mlr_linear():
    # Standardize features; plain OLS on adjusted target
    return make_pipeline(
        StandardScaler(with_mean=True, with_std=True),
        LinearRegression()
    )

def build_ridge_linear():
    alphas = np.logspace(-6, 6, 100)
    return make_pipeline(
        StandardScaler(with_mean=True, with_std=True),
        RidgeCV(alphas=alphas, cv=5, scoring='neg_mean_squared_error')
    )

def build_lasso_linear():
    alphas = np.logspace(-6, 2, 60)  # Lasso needs tighter range
    return make_pipeline(
        StandardScaler(with_mean=True, with_std=True),
        LassoCV(alphas=alphas, cv=5, random_state=RANDOM_STATE, max_iter=20000)
    )

def build_enet_linear():
    alphas = np.logspace(-6, 2, 40)
    l1_ratios = [0.1, 0.3, 0.5, 0.7, 0.9]
    return make_pipeline(
        StandardScaler(with_mean=True, with_std=True),
        ElasticNetCV(alphas=alphas, l1_ratio=l1_ratios, cv=5,
                     random_state=RANDOM_STATE, max_iter=30000)
    )

# ---------- Polynomial (degree=2 on the linearized features) ----------
def build_poly_regr():
    # OLS on degree-2 map (squares + pairwise interactions); walls remain additive in X
    return make_pipeline(
        PolynomialFeatures(degree=2, include_bias=False),
        StandardScaler(with_mean=True, with_std=True),
        LinearRegression()
    )

def build_ridge_poly2():
    alphas = np.logspace(-6, 6, 100)
    return make_pipeline(
        PolynomialFeatures(degree=2, include_bias=False),
        StandardScaler(with_mean=True, with_std=True),
        RidgeCV(alphas=alphas, cv=5, scoring='neg_mean_squared_error')
    )

def build_lasso_poly2():
    alphas = np.logspace(-6, 0, 50)  # keep stable
    return make_pipeline(
        PolynomialFeatures(degree=2, include_bias=False),
        StandardScaler(with_mean=True, with_std=True),
        LassoCV(alphas=alphas, cv=5, random_state=RANDOM_STATE, max_iter=200000)
    )

def build_enet_poly2():
    alphas = np.logspace(-3, 0, 30)
    l1_ratios = [0.2, 0.4, 0.6, 0.8]
    return make_pipeline(
        PolynomialFeatures(degree=2, include_bias=False),
        StandardScaler(with_mean=True, with_std=True),
        ElasticNetCV(alphas=alphas, l1_ratio=l1_ratios, cv=5,
                     random_state=RANDOM_STATE, max_iter=300000, tol=1e-3, 
                     n_jobs=-1, precompute='auto')
    )

### Full conjugate BLR on the linear design

In [10]:
class FullBLRConjugate(BaseEstimator, RegressorMixin):
    """
    Conjugate Bayesian Linear Regression with Normal-Inverse-Gamma prior:
        beta | sigma^2 ~ N(beta0, sigma^2 V0)
        sigma^2 ~ Inv-Gamma(a0, b0)
    Works on adjusted target; intercept added internally.
    """
    def __init__(self, beta0=None, V0_scale=1e6, a0=1e-2, b0=1e-2):
        self.beta0 = beta0
        self.V0_scale = V0_scale
        self.a0 = a0
        self.b0 = b0
        self.beta_n_ = None
        self.Vn_ = None
        self.an_ = None
        self.bn_ = None

    def _augment(self, X):
        n = X.shape[0]
        return np.hstack([np.ones((n, 1)), X])

    def fit(self, X, y):
        X = np.asarray(X, dtype=float)
        y = np.asarray(y, dtype=float).reshape(-1)

        X_aug = self._augment(X)   # [n x (p+1)]
        n, d = X_aug.shape

        if self.beta0 is None:
            self.beta0 = np.zeros(d)
        beta0 = self.beta0

        # prior covariance V0 = V0_scale * I  (weakly informative on standardized X)
        V0_inv = np.eye(d) / self.V0_scale

        XtX = X_aug.T @ X_aug
        Vn_inv = V0_inv + XtX
        Vn = inv(Vn_inv)

        Xty = X_aug.T @ y
        beta_n = Vn @ (V0_inv @ beta0 + Xty)

        an = self.a0 + 0.5 * n
        resid = y - X_aug @ beta_n
        term = (beta_n - beta0).T @ V0_inv @ (beta_n - beta0)
        bn = self.b0 + 0.5 * (resid @ resid + term)

        self.beta_n_ = beta_n
        self.Vn_ = Vn
        self.an_ = an
        self.bn_ = bn
        return self

    def predict(self, X, return_std=False):
        X = np.asarray(X, dtype=float)
        X_aug = self._augment(X)
        mean = X_aug @ self.beta_n_
        if not return_std:
            return mean
        # predictive var = (bn/an) * (1 + x^T Vn x)
        pred_var = (self.bn_ / self.an_) * (1.0 + np.sum(X_aug @ self.Vn_ * X_aug, axis=1))
        pred_std = np.sqrt(np.maximum(pred_var, 0.0))
        return mean, pred_std

def build_full_blr_linear():
    # Standardize first; BLR on adjusted target. This is *not* BayesianRidge.
    return make_pipeline(
        StandardScaler(with_mean=True, with_std=True),
        FullBLRConjugate(beta0=None, V0_scale=1e6, a0=1e-2, b0=1e-2)
    )

### Model dictionary

In [12]:
models = {
    # Linear (degree=1)
    "MLR_linear":         build_mlr_linear(),
    "Ridge_linear":       build_ridge_linear(),
    "Lasso_linear":       build_lasso_linear(),
    "ElasticNet_linear":  build_enet_linear(),

    # Polynomial (degree=2 on linearized features)
    "Poly2_OLS":          build_poly_regr(),
    "Ridge_poly2":        build_ridge_poly2(),
    "Lasso_poly2":        build_lasso_poly2(),
    "ElasticNet_poly2":   build_enet_poly2(),

    # Full conjugate Bayesian Linear Regression on linear design
    "BLR_linear":         build_full_blr_linear(),
}
print("Models defined:", list(models.keys()))

Models defined: ['MLR_linear', 'Ridge_linear', 'Lasso_linear', 'ElasticNet_linear', 'Poly2_OLS', 'Ridge_poly2', 'Lasso_poly2', 'ElasticNet_poly2', 'BLR_linear']


### Leakage‑safe 5‑fold CV (outer) and summary

In [14]:
# Ensure fold_assignments matches the current dataset size
fold_assignments = np.array(fold_assignments)[:X_lin_train.shape[0]]

def reconstruct_full(y_pred_adj, offset):
    # put 20*log10(f) back to compare in PL (dB)
    return y_pred_adj + offset

cv_results = []

for name, pipe in models.items():
    rmse_tr, rmse_val = [], []
    r2_tr, r2_val = [], []

    for fold in range(5):
        tr_idx = np.where(fold_assignments != fold)[0]
        val_idx = np.where(fold_assignments == fold)[0]

        X_tr, y_tr, y_tr_adj, off_tr = (
            X_lin_train[tr_idx],
            y_train[tr_idx],
            y_train_adj[tr_idx],
            offset_train[tr_idx]
        )

        X_val, y_val, y_val_adj, off_val = (
            X_lin_train[val_idx],
            y_train[val_idx],
            y_train_adj[val_idx],
            offset_train[val_idx]
        )

        est = clone(pipe)
        est.fit(X_tr, y_tr_adj)

        # Train metrics in PL units
        y_tr_pred = reconstruct_full(est.predict(X_tr), off_tr)
        rmse_tr.append(np.sqrt(mean_squared_error(y_tr, y_tr_pred)))
        r2_tr.append(r2_score(y_tr, y_tr_pred))

        # Validation metrics in PL units
        y_val_pred = reconstruct_full(est.predict(X_val), off_val)
        rmse_val.append(np.sqrt(mean_squared_error(y_val, y_val_pred)))
        r2_val.append(r2_score(y_val, y_val_pred))

    row = {
        "model": name,
        "RMSE_train_mean": np.mean(rmse_tr), "RMSE_train_std": np.std(rmse_tr),
        "RMSE_val_mean":   np.mean(rmse_val), "RMSE_val_std":   np.std(rmse_val),
        "R2_train_mean":   np.mean(r2_tr),    "R2_train_std":   np.std(r2_tr),
        "R2_val_mean":     np.mean(r2_val),   "R2_val_std":     np.std(r2_val),
    }
    cv_results.append(row)
    print(f"{name:>16} | RMSE_val: {row['RMSE_val_mean']:.3f} ± {row['RMSE_val_std']:.3f} | "
          f"R2_val: {row['R2_val_mean']:.3f} ± {row['R2_val_std']:.3f}")

cv_df = pd.DataFrame(cv_results).sort_values(by="RMSE_val_mean")
cv_df

      MLR_linear | RMSE_val: 8.205 ± 0.068 | R2_val: 0.816 ± 0.003
    Ridge_linear | RMSE_val: 8.210 ± 0.066 | R2_val: 0.816 ± 0.003
    Lasso_linear | RMSE_val: 8.205 ± 0.068 | R2_val: 0.816 ± 0.003
ElasticNet_linear | RMSE_val: 8.215 ± 0.067 | R2_val: 0.816 ± 0.003
       Poly2_OLS | RMSE_val: 7.008 ± 0.098 | R2_val: 0.866 ± 0.003
     Ridge_poly2 | RMSE_val: 7.054 ± 0.101 | R2_val: 0.864 ± 0.003
     Lasso_poly2 | RMSE_val: 7.052 ± 0.099 | R2_val: 0.864 ± 0.003
ElasticNet_poly2 | RMSE_val: 7.054 ± 0.100 | R2_val: 0.864 ± 0.003
      BLR_linear | RMSE_val: 8.205 ± 0.068 | R2_val: 0.816 ± 0.003


Unnamed: 0,model,RMSE_train_mean,RMSE_train_std,RMSE_val_mean,RMSE_val_std,R2_train_mean,R2_train_std,R2_val_mean,R2_val_std
4,Poly2_OLS,6.991773,0.024207,7.008468,0.098382,0.866408,0.000782,0.865759,0.003166
6,Lasso_poly2,7.03879,0.024346,7.051644,0.098677,0.864606,0.000786,0.864099,0.003217
5,Ridge_poly2,7.041912,0.024653,7.053656,0.101059,0.864485,0.000802,0.864021,0.003318
7,ElasticNet_poly2,7.042654,0.024405,7.054281,0.099649,0.864457,0.000791,0.863997,0.003258
8,BLR_linear,8.203852,0.01689,8.20489,0.068137,0.816075,0.000733,0.816018,0.002954
0,MLR_linear,8.203852,0.01689,8.20489,0.068137,0.816075,0.000733,0.816018,0.002954
2,Lasso_linear,8.20455,0.016997,8.205498,0.068066,0.816044,0.000741,0.81599,0.002959
1,Ridge_linear,8.209306,0.017996,8.210337,0.066245,0.815831,0.000786,0.815774,0.002895
3,ElasticNet_linear,8.214319,0.016834,8.215289,0.067167,0.815606,0.000733,0.815551,0.00295


### Fit full‑train models + persist

In [16]:
os.makedirs('Models', exist_ok=True)
fitted_models = {}

for name, pipe in models.items():
    est = clone(pipe)
    est.fit(X_lin_train, y_train_adj)
    fitted_models[name] = est

    with open(f"Models/{name}.pkl", "wb") as f:
        pickle.dump(est, f)

print("Saved models:", list(fitted_models.keys()))

Saved models: ['MLR_linear', 'Ridge_linear', 'Lasso_linear', 'ElasticNet_linear', 'Poly2_OLS', 'Ridge_poly2', 'Lasso_poly2', 'ElasticNet_poly2', 'BLR_linear']


### Test‑set evaluation + summaries + poly term count

In [18]:
test_rows = []
for name, est in fitted_models.items():
    y_pred_adj = est.predict(X_lin_test)
    y_pred = y_pred_adj + offset_test
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    r2   = r2_score(y_test, y_pred)
    test_rows.append({"model": name, "RMSE_test": rmse, "R2_test": r2})

test_df = pd.DataFrame(test_rows).sort_values(by="RMSE_test")
print(test_df.to_string(index=False))

# Persist summaries
cv_df.to_csv("Models/cv_summary.csv", index=False)
test_df.to_csv("Models/test_summary.csv", index=False)

# sanity: degree-2 expansion size on p linearized features
tmp_poly = PolynomialFeatures(degree=2, include_bias=False).fit(X_lin_train)
print("Polynomial degree-2 expanded feature count:", tmp_poly.transform(X_lin_train[:1]).shape[1])

# choose best-by-CV and best-by-Test
best_cv  = cv_df.iloc[0]["model"]
best_tst = test_df.iloc[0]["model"]
print(f"\nBest by CV (lowest RMSE_val_mean): {best_cv}")
print(f"Best on Test (lowest RMSE_test):    {best_tst}")

            model  RMSE_test  R2_test
        Poly2_OLS   7.322138 0.855506
      Lasso_poly2   7.372619 0.853507
 ElasticNet_poly2   7.387541 0.852913
      Ridge_poly2   7.395745 0.852586
       MLR_linear   8.486832 0.805882
       BLR_linear   8.486832 0.805882
     Lasso_linear   8.489267 0.805771
     Ridge_linear   8.492559 0.805620
ElasticNet_linear   8.498211 0.805361
Polynomial degree-2 expanded feature count: 54

Best by CV (lowest RMSE_val_mean): Poly2_OLS
Best on Test (lowest RMSE_test):    Poly2_OLS
