<p style="font-family: 'Courier New', Courier, monospace; font-size: 30px; font-weight: bold; color: blue;  text-align: left;">
Ridge Regression Model Fitting
</p>

In [None]:
# ==== SPEED HEADER (minimal, safe) ============================================
import os

# joblib supplies the parallelism, so every native math library is pinned to a
# single thread to keep the workers from oversubscribing the CPU.
os.environ.update(
    dict.fromkeys(
        (
            "MKL_NUM_THREADS",
            "OPENBLAS_NUM_THREADS",
            "OMP_NUM_THREADS",
            "NUMEXPR_MAX_THREADS",
        ),
        "1",
    )
)

# Prefer the RAM-backed /dev/shm for joblib's temp files when it exists; a value
# already present in the environment is left untouched.
if "JOBLIB_TEMP_FOLDER" not in os.environ and os.path.isdir("/dev/shm"):
    os.environ["JOBLIB_TEMP_FOLDER"] = "/dev/shm/joblib"

# Parallelism knob for bootstrap work
N_JOBS = 16   # start at 16
# ============================================================================

In [31]:
# ============================== Core & Data Libraries ==============================
import os                                   # File and directory operations
import pickle                               # Object serialization
import numpy as np                          # Numerical computations
import pandas as pd                         # Data manipulation and analysis

# ============================== Machine Learning & Stats ===========================
from sklearn.metrics import mean_squared_error, r2_score      # Model evaluation metrics
from sklearn.linear_model import RidgeCV                      # Ridge regression with CV
from sklearn.preprocessing import StandardScaler              # Feature scaling
from sklearn.pipeline import make_pipeline                    # Pipeline for scaling and model

<p style="font-family: 'Courier New', Courier, monospace; font-size: 30px; font-weight: bold; color: blue;  text-align: left;">
 Dataset: Load Saved Splits and Fold Assignments
</p>

In [32]:
# Directory holding the saved train/test splits and the fold assignments
base_path = '../Extended Parametric Regression Files+Plots'

# Load train and test splits
df_train = pd.read_csv(f"{base_path}/train.csv")
df_test = pd.read_csv(f"{base_path}/test.csv")

# Feature columns. The order matters: the linearization step addresses the
# resulting array columns by position (0 = distance, 1 = frequency, 2-9 = rest).
feature_names = [
    'distance', 'frequency', 'c_walls', 'w_walls', 'co2', 'humidity',
    'pm25', 'pressure', 'temperature', 'snr'
]
X_train = df_train[feature_names].values
y_train = df_train['PL'].values
X_test = df_test[feature_names].values
y_test = df_test['PL'].values

# Timestamps are kept in case they are needed for plotting
time_train = df_train['time'].values
time_test = df_test['time'].values

# Print number of samples in train and test sets
print(f"\nTraining samples: {X_train.shape[0]}, Test samples: {X_test.shape[0]}")

# Load the saved fold assignments (one fold label per training sample)
fold_assignments = np.load(f"{base_path}/train_folds.npy")

# Fail fast when the fold file does not line up with the training rows: the
# cross-validation step selects training rows through these labels, so a stale or
# mismatched file would otherwise pick the wrong rows or fail far from the cause.
if len(fold_assignments) != X_train.shape[0]:
    raise ValueError(
        f"train_folds.npy has {len(fold_assignments)} entries but train.csv has "
        f"{X_train.shape[0]} rows; the fold assignments do not match the training split."
    )

# Print fold distribution
unique, counts = np.unique(fold_assignments, return_counts=True)
print(dict(zip(unique, counts)))

print('\nDataset loaded successfully!\n')

# Build the linearized design matrices and frequency-adjusted targets for the
# linear models: the fixed 20*log10(frequency) term is moved to the target side and
# the distance column is replaced by 10*log10(d/d0).
d0 = 1.0


def _linearize(features, target, ref_distance):
    """Linearize one split.

    `features` is indexed by position: column 0 is distance, column 1 is
    frequency and columns 2-9 are passed through unchanged.

    Returns a 4-tuple:
        log10(distance / ref_distance),
        the fixed frequency offset 20*log10(frequency),
        the design matrix [10*log10(d/d0) | columns 2-9],
        the target with the frequency offset subtracted.
    """
    log_d = np.log10(features[:, 0] / ref_distance)
    freq_offset = 20 * np.log10(features[:, 1])
    design = np.column_stack((10 * log_d, features[:, 2:10]))
    return log_d, freq_offset, design, target - freq_offset


# Train
log_d_train, offset_train, X_lin_train, y_train_adj = _linearize(X_train, y_train, d0)

# Test
log_d_test, offset_test, X_lin_test, y_test_adj = _linearize(X_test, y_test, d0)


Training samples: 1341431, Test samples: 335358
{np.int64(0): np.int64(268287), np.int64(1): np.int64(268286), np.int64(2): np.int64(268286), np.int64(3): np.int64(268286), np.int64(4): np.int64(268286)}

Dataset loaded successfully!



<p style="font-family: 'Courier New', Courier, monospace; font-size: 30px; font-weight: bold; color: blue;  text-align: left;">
  Ridge Regression Model
</p>

In [33]:
# ===================  Model Function ===================

def log_distance_path_loss_with_env_params(x, PL_d0, n, L_c, L_w,
                                           a_co2, a_hum, a_pm25,
                                           a_pres, a_temp, k_snr):
    """
    Log-distance path loss with additive wall and environmental terms.

    x: sequence/array whose first axis has exactly 10 entries, unpacked in
       this order: distance, frequency, c_walls, w_walls, co2, humidity,
       pm25, pressure, temperature, snr (e.g. a 2D array of shape (10, N)).

    The remaining arguments are the model coefficients: the intercept PL_d0,
    the path loss exponent n, and one multiplier per wall/environment feature.

    Returns PL_d0 + 10*n*log10(d/d0) + 20*log10(frequency) plus each
    remaining feature times its coefficient, with d0 fixed at 1.
    """
    (d, frequency, c_walls, w_walls, co2,
     humidity, pm25, pressure, temperature, snr) = x
    d0 = 1  # Reference distance

    # Distance and frequency contributions
    loss = PL_d0 + 10 * n * np.log10(d / d0) + 20 * np.log10(frequency)

    # Linear contributions, accumulated in a fixed order
    linear_terms = (
        (c_walls, L_c),
        (w_walls, L_w),
        (co2, a_co2),
        (humidity, a_hum),
        (pm25, a_pm25),
        (pressure, a_pres),
        (temperature, a_temp),
        (snr, k_snr),
    )
    for feature, coefficient in linear_terms:
        loss = loss + feature * coefficient
    return loss

<p style="font-family: 'Courier New', Courier, monospace; font-size: 30px; font-weight: bold; color: blue;  text-align: left;">
 5-Fold Cross-Validation on Training Set
</p>

In [34]:
# =================== 5-Fold Cross-Validation (Training set only) ===================
from joblib import Parallel, delayed

# Candidate ridge penalties: 200 log-spaced values from 1e-6 to 1e6
alphas = np.logspace(start=-6, stop=6, num=200)

def _eval_one_fold(fold_num):
    """Fit the scaled RidgeCV pipeline with one fold held out and score it.

    Training rows whose label in `fold_assignments` equals `fold_num` form the
    validation split; every other training row is used for fitting. The
    pipeline is fitted on the frequency-adjusted target, and the metrics are
    computed against `y_train` after adding the frequency offset back to the
    predictions.

    Returns a dict with:
        "fold": fold_num + 1,
        "rmse_train", "r2_train", "rmse_val", "r2_val": float metrics,
        "alpha_star": the alpha selected by RidgeCV,
        "coeffs": intercept followed by the coefficients, both mapped back to
                  the unscaled feature space.
    """
    fit_rows = np.flatnonzero(fold_assignments != fold_num)
    held_out_rows = np.flatnonzero(fold_assignments == fold_num)

    X_fit = X_lin_train[fit_rows]
    X_held_out = X_lin_train[held_out_rows]

    # StandardScaler keeps its default copy=True, so the inputs are not mutated in place
    model = make_pipeline(
        StandardScaler(),
        RidgeCV(alphas=alphas, cv=5, scoring='neg_mean_squared_error'),
    )
    model.fit(X_fit, y_train_adj[fit_rows])

    scaler_step = model.named_steps['standardscaler']
    ridge_step = model.named_steps['ridgecv']

    # Undo the standardization: w_raw = w_scaled / scale and
    # b_raw = b_scaled - sum(w_scaled * mean / scale)
    scaled_coef = ridge_step.coef_.astype(float)
    scaled_intercept = float(ridge_step.intercept_)
    raw_coef = scaled_coef / scaler_step.scale_
    raw_intercept = scaled_intercept - np.sum(
        (scaled_coef * scaler_step.mean_) / scaler_step.scale_
    )

    def _score(design, rows):
        """Return (RMSE, R2) of the offset-corrected predictions for `rows`."""
        predicted = model.predict(design) + offset_train[rows]
        observed = y_train[rows]
        rmse = float(np.sqrt(mean_squared_error(observed, predicted)))
        return rmse, float(r2_score(observed, predicted))

    rmse_train, r2_train = _score(X_fit, fit_rows)
    rmse_val, r2_val = _score(X_held_out, held_out_rows)

    return {
        "fold": fold_num + 1,
        "rmse_train": rmse_train,
        "r2_train": r2_train,
        "rmse_val": rmse_val,
        "r2_val": r2_val,
        "alpha_star": float(ridge_step.alpha_),
        "coeffs": np.concatenate(([raw_intercept], raw_coef)),
    }

# Run one job per fold label actually present in the saved assignments, rather
# than assuming exactly five folds numbered 0-4. `.tolist()` yields native Python
# scalars, so the reported fold numbers stay plain ints for integer labels.
fold_ids = np.unique(fold_assignments).tolist()

# An explicit backend is given, so no additional `prefer` hint is needed.
# n_jobs is clamped to at least 1 because joblib rejects n_jobs=0.
fold_results = Parallel(n_jobs=max(1, min(N_JOBS, len(fold_ids))), backend="threading")(
    delayed(_eval_one_fold)(fold_id) for fold_id in fold_ids
)

# Collect per-fold metrics and coefficients (same order as fold_results)
rmse_train_folds = [r["rmse_train"] for r in fold_results]
rmse_val_folds   = [r["rmse_val"]   for r in fold_results]
r2_train_folds   = [r["r2_train"]   for r in fold_results]
r2_val_folds     = [r["r2_val"]     for r in fold_results]
cv_coeffs        = [r["coeffs"]     for r in fold_results]

# Per-fold report, ordered by fold number
for r in sorted(fold_results, key=lambda z: z["fold"]):
    print(f"Fold {r['fold']}: RMSE_train={r['rmse_train']:.4f}, RMSE_val={r['rmse_val']:.4f} (alpha={r['alpha_star']:.2e})")

# Aggregate report: mean ± standard deviation across folds
print("\n=== Cross-Validation Results on the training set ===")
print(f"RMSE (Train): {np.mean(rmse_train_folds):.4f} ± {np.std(rmse_train_folds):.4f}")
print(f"RMSE (Val):   {np.mean(rmse_val_folds):.4f} ± {np.std(rmse_val_folds):.4f}")
print(f"R2 (Train):   {np.mean(r2_train_folds):.4f} ± {np.std(r2_train_folds):.4f}")
print(f"R2 (Val):     {np.mean(r2_val_folds):.4f} ± {np.std(r2_val_folds):.4f}")

Fold 1: RMSE_train=8.0718, RMSE_val=8.0796 (alpha=1.14e+01)
Fold 2: RMSE_train=8.0743, RMSE_val=8.0695 (alpha=7.49e+00)
Fold 3: RMSE_train=8.0677, RMSE_val=8.0959 (alpha=1.14e+01)
Fold 4: RMSE_train=8.0744, RMSE_val=8.0690 (alpha=1.30e+01)
Fold 5: RMSE_train=8.0784, RMSE_val=8.0531 (alpha=1.30e+01)

=== Cross-Validation Results on the training set ===
RMSE (Train): 8.0733 ± 0.0035
RMSE (Val):   8.0734 ± 0.0141
R2 (Train):   0.8172 ± 0.0002
R2 (Val):     0.8171 ± 0.0010


<p style="font-family: 'Courier New', Courier, monospace; font-size: 30px; font-weight: bold; color: blue;  text-align: left;">
 Retrain Final Model on All Training Data
</p>

In [35]:
# =================== Final Model Training (All Training Data) ===================

# Regularization grid searched by RidgeCV: 200 log-spaced values from 1e-6 to 1e6
alphas = np.logspace(-6, 6, 200)

# Standardize the features, then let RidgeCV pick alpha with 5-fold CV on MSE.
# StandardScaler keeps its default copy=True.
pipeline = make_pipeline(
    StandardScaler(),
    RidgeCV(alphas=alphas, cv=5, scoring='neg_mean_squared_error'),
)
pipeline.fit(X_lin_train, y_train_adj)

scaler = pipeline.named_steps['standardscaler']
ridge_cv = pipeline.named_steps['ridgecv']

# Map the coefficients learned on standardized features back to raw feature units:
# w_raw = w_scaled / scale, b_raw = b_scaled - sum(w_scaled * mean / scale)
coef_scaled = ridge_cv.coef_.astype(float)
intercept_scaled = float(ridge_cv.intercept_)
coef_unscaled = coef_scaled / scaler.scale_
intercept_unscaled = intercept_scaled - np.sum(
    (coef_scaled * scaler.mean_) / scaler.scale_
)

# Intercept first, followed by one coefficient per linearized feature
final_coeffs = np.concatenate(([intercept_unscaled], coef_unscaled))

# Persist the unscaled coefficient vector
os.makedirs('Models', exist_ok=True)
with open('Models/ridge_final_coeffs.pkl', 'wb') as f:
    pickle.dump(final_coeffs, f)

print("\nFinal Ridge model coefficients saved to Models/ridge_final_coeffs.pkl")
print(f"Chosen alpha on full train: {ridge_cv.alpha_:.2e}")

# Unpack the coefficient vector: intercept first, then one coefficient per
# linearized feature in design-matrix order.
PL_d0, n, L_c, L_w, a_co2, a_hum, a_pm25, a_pres, a_temp, k_snr = final_coeffs

# Human-readable labels for the report table.
# The humidity unit label previously had a mismatched bracket ("[dB/%)").
params_final = {
    'PL(d0) [dB]': PL_d0,
    'Path loss exponent (n)': n,
    'Brick Wall Loss (L_c) [dB]': L_c,
    'Wood Wall Loss (L_w) [dB]': L_w,
    'CO2 coef. (a_co2) [dB/unit]': a_co2,
    'Humidity coef. (a_hum) [dB/%]': a_hum,
    'PM2.5 coef. (a_pm25) [dB/µg/m³]': a_pm25,
    'Pressure coef. (a_pres) [dB/hPa]': a_pres,
    'Temp. coef. (a_temp) [dB/°C]': a_temp,
    'SNR scaling (k_snr)': k_snr
}

# Two-column table: parameter label and fitted value
params_final_df = pd.DataFrame({
    'Parameter': list(params_final.keys()),
    'Final Model': list(params_final.values())
})

print("\n=== Model Coefficients ===\n")
display(params_final_df)


Final Ridge model coefficients saved to Models/ridge_final_coeffs.pkl
Chosen alpha on full train: 1.30e+01

=== Model Coefficients ===



Unnamed: 0,Parameter,Final Model
0,PL(d0) [dB],2.982535
1,Path loss exponent (n),3.846061
2,Brick Wall Loss (L_c) [dB],6.873007
3,Wood Wall Loss (L_w) [dB],2.01269
4,CO2 coef. (a_co2) [dB/unit],-0.002361
5,Humidity coef. (a_hum) [dB/%),-0.087422
6,PM2.5 coef. (a_pm25) [dB/µg/m³],-0.100693
7,Pressure coef. (a_pres) [dB/hPa],-0.009539
8,Temp. coef. (a_temp) [dB/°C],-0.146799
9,SNR scaling (k_snr),-2.034711


In [36]:
# Show the alpha selected by the fitted RidgeCV step without any rounding
print(f"Selected alpha: {ridge_cv.alpha_}")

Selected alpha: 13.049019780144016


<p style="font-family: 'Courier New', Courier, monospace; font-size: 30px; font-weight: bold; color: blue;  text-align: left;">
 Final Evaluation on Test Set
</p>

In [37]:
# =================== Final Evaluation (Test Set) ===================

# Score the final pipeline on the test split. The pipeline predicts the
# frequency-adjusted target, so the frequency offset is added back before the
# metrics are computed against the original target.
y_test_pred_adj = pipeline.predict(X_lin_test)
y_test_pred = offset_test + y_test_pred_adj

mse_test = mean_squared_error(y_test, y_test_pred)
rmse_test = np.sqrt(mse_test)
r2_test = r2_score(y_test, y_test_pred)

print(f"\nTest RMSE: {rmse_test:.4f}")
print(f"Test R2:   {r2_test:.4f}")


Test RMSE: 8.0642
Test R2:   0.8172
