<p style="font-family: 'Courier New', Courier, monospace; font-size: 30px; font-weight: bold; color: blue;  text-align: left;">
 Polynomial Regression Fitting
</p>

In [10]:
# ============================== Core & Data Libraries ==============================
import os                                   # File and directory operations
import pickle                               # Object serialization
import numpy as np                          # Numerical computations
import pandas as pd                         # Data manipulation and analysis

# ============================== Machine Learning & Stats ===========================
from sklearn.metrics import mean_squared_error, r2_score      # Model evaluation metrics
from sklearn.linear_model import RidgeCV                      # Ridge for regularization with poly
from sklearn.preprocessing import StandardScaler, PolynomialFeatures  # Feature scaling and poly
from sklearn.pipeline import make_pipeline                    # Pipeline for scaling and model

<p style="font-family: 'Courier New', Courier, monospace; font-size: 30px; font-weight: bold; color: blue;  text-align: left;">
 Dataset: Load Saved Splits and Fold Assignments
</p>

In [11]:
# Directory holding the saved train/test splits and the fold assignments
base_path = '../Extended Parametric Regression Files+Plots.'

# Load train and test splits
df_train = pd.read_csv(f"{base_path}/train.csv")
df_test = pd.read_csv(f"{base_path}/test.csv")

# Extract features and targets.
# The column order matters: later code addresses the feature matrix by position
# (column 0 = distance, column 1 = frequency, columns 2-9 = the remaining features).
feature_names = [
    'distance', 'frequency', 'c_walls', 'w_walls', 'co2', 'humidity',
    'pm25', 'pressure', 'temperature', 'snr'
]
X_train = df_train[feature_names].values
y_train = df_train['PL'].values
X_test = df_test[feature_names].values
y_test = df_test['PL'].values

# 'time' is kept separately in case it is needed for plotting
time_train = df_train['time'].values
time_test = df_test['time'].values

# Print number of samples in train and test sets
print(f"\nTraining samples: {X_train.shape[0]}, Test samples: {X_test.shape[0]}")

# Load fold assignments (array holding one fold number per training sample)
fold_assignments = np.load(f"{base_path}/train_folds.npy")

# Fail fast if the fold file does not line up with the training split: the fold
# labels are later turned into row positions of the training matrix, so a length
# mismatch would otherwise mis-split the data or fail far from the cause.
if fold_assignments.shape[0] != X_train.shape[0]:
    raise ValueError(
        f"train_folds.npy has {fold_assignments.shape[0]} entries but "
        f"train.csv has {X_train.shape[0]} rows"
    )

# Print fold distribution as native Python values (avoids np.int64(...) reprs)
unique, counts = np.unique(fold_assignments, return_counts=True)
print(dict(zip(unique.tolist(), counts.tolist())))

print('\nDataset loaded successfully!\n')

# Prepare linearized features and adjusted targets for linear models
# Linearization separates the non-linear frequency term and transforms distance term
d0 = 1.0  # Reference distance


def linearize_path_loss_inputs(X, y, d0=1.0):
    """
    Build the linearized design matrix and frequency-adjusted target.

    Parameters
    ----------
    X : 2D array whose column 0 is distance, column 1 is frequency and
        columns 2-9 are the remaining features (same order as `feature_names`).
    y : 1D array of targets, one per row of X.
    d0 : reference distance that the distance column is divided by.

    Returns
    -------
    log_d : log10(distance / d0)
    offset : 20 * log10(frequency), the fixed frequency contribution
    X_lin : column stack of 10 * log_d and columns 2-9 of X
    y_adj : y - offset

    Raises
    ------
    ValueError
        If any distance or frequency is <= 0. log10 would otherwise produce
        -inf/NaN that only surfaces later, inside model fitting.
    """
    distance, frequency = X[:, 0], X[:, 1]
    if np.any(distance <= 0) or np.any(frequency <= 0):
        raise ValueError("distance and frequency must be strictly positive to take log10")

    log_d = np.log10(distance / d0)
    offset = 20 * np.log10(frequency)  # Fixed frequency contribution
    X_lin = np.column_stack((
        10 * log_d,   # Transformed distance term for path loss exponent
        X[:, 2:10]    # Remaining linear features
    ))
    return log_d, offset, X_lin, y - offset  # Target adjusted by the frequency offset


# Train
log_d_train, offset_train, X_lin_train, y_train_adj = linearize_path_loss_inputs(
    X_train, y_train, d0
)

# Test (same transform, so train and test cannot drift apart)
log_d_test, offset_test, X_lin_test, y_test_adj = linearize_path_loss_inputs(
    X_test, y_test, d0
)


Training samples: 1209643, Test samples: 302411
{np.int64(0): np.int64(241929), np.int64(1): np.int64(241929), np.int64(2): np.int64(241929), np.int64(3): np.int64(241928), np.int64(4): np.int64(241928)}

Dataset loaded successfully!



<p style="font-family: 'Courier New', Courier, monospace; font-size: 30px; font-weight: bold; color: blue;  text-align: left;">
  Polynomial Regression Model
</p>

In [12]:
# =================== Model Function ===================

# Retained for reference; not used directly in fitting but defines the full model
def log_distance_path_loss_with_env_params(x, PL_d0, n, L_c, L_w,
                                           a_co2, a_hum, a_pm25,
                                           a_pres, a_temp, k_snr):
    """
    Log-distance path loss model extended with wall and environmental terms.

    x must unpack along its first axis into exactly ten entries, in this order:
    distance, frequency, c_walls, w_walls, co2, humidity, pm25, pressure,
    temperature, snr (for example a 2D array of shape (10, N)).

    Returns
        PL_d0 + 10*n*log10(distance/d0) + 20*log10(frequency)
        + L_c*c_walls + L_w*w_walls + a_co2*co2 + a_hum*humidity
        + a_pm25*pm25 + a_pres*pressure + a_temp*temperature + k_snr*snr
    with the reference distance d0 fixed to 1.
    """
    dist, freq, n_c_walls, n_w_walls, co2_lvl, hum, pm, pres, temp, snr_val = x
    d0 = 1  # Reference distance

    # Distance and frequency enter through log10; everything else is linear
    path_loss = PL_d0 + 10 * n * np.log10(dist / d0)
    path_loss = path_loss + 20 * np.log10(freq)

    # Linear terms are added one at a time, in the order listed in the docstring
    linear_terms = (
        (L_c, n_c_walls),
        (L_w, n_w_walls),
        (a_co2, co2_lvl),
        (a_hum, hum),
        (a_pm25, pm),
        (a_pres, pres),
        (a_temp, temp),
        (k_snr, snr_val),
    )
    for coefficient, regressor in linear_terms:
        path_loss = path_loss + coefficient * regressor
    return path_loss

<p style="font-family: 'Courier New', Courier, monospace; font-size: 30px; font-weight: bold; color: blue;  text-align: left;">
 5-Fold Cross-Validation on Training Set
</p>

In [13]:
# =================== 5-Fold Cross-Validation (Training set only) ===================

# Candidate regularization strengths handed to RidgeCV (100 log-spaced values, 1e-6 to 1e6)
alphas = np.logspace(-6, 6, 100)


def _rmse_and_r2(y_true, y_pred):
    """Return the pair (RMSE, R2) of `y_pred` measured against `y_true`."""
    return np.sqrt(mean_squared_error(y_true, y_pred)), r2_score(y_true, y_pred)


# Per-fold results, appended in fold order
rmse_train_folds, rmse_val_folds = [], []
r2_train_folds, r2_val_folds = [], []
cv_coeffs = []

for fold_num in range(5):
    # Rows labelled `fold_num` are held out for validation; all others are fitted on
    val_idx = np.flatnonzero(fold_assignments == fold_num)
    tr_idx = np.flatnonzero(fold_assignments != fold_num)

    X_tr, X_val = X_lin_train[tr_idx], X_lin_train[val_idx]
    y_tr_adj, y_val_adj = y_train_adj[tr_idx], y_train_adj[val_idx]
    offset_tr, offset_val = offset_train[tr_idx], offset_train[val_idx]

    # Scale -> degree-2 polynomial expansion -> ridge regression, with alpha
    # chosen by RidgeCV's own 5-fold search over `alphas`
    pipeline = make_pipeline(
        StandardScaler(),
        PolynomialFeatures(degree=2),
        RidgeCV(alphas=alphas, cv=5, scoring='neg_mean_squared_error'),
    )
    pipeline.fit(X_tr, y_tr_adj)

    # Keep intercept + expanded coefficients for this fold. They live in the
    # scaled/polynomial space, so there is no simple back-transform to raw features.
    ridge_cv = pipeline.named_steps['ridgecv']
    cv_coeffs.append(np.concatenate(([ridge_cv.intercept_], ridge_cv.coef_)))

    # Training-fold metrics: add the frequency offset back to score the full target
    y_tr_pred_adj = pipeline.predict(X_tr)
    y_tr_pred = y_tr_pred_adj + offset_tr
    rmse_train, r2_train = _rmse_and_r2(y_train[tr_idx], y_tr_pred)
    rmse_train_folds.append(rmse_train)
    r2_train_folds.append(r2_train)

    # Validation-fold metrics, computed the same way on the held-out rows
    y_val_pred_adj = pipeline.predict(X_val)
    y_val_pred = y_val_pred_adj + offset_val
    rmse_val, r2_val = _rmse_and_r2(y_train[val_idx], y_val_pred)
    rmse_val_folds.append(rmse_val)
    r2_val_folds.append(r2_val)

    print(f"Fold {fold_num+1}: RMSE_train={rmse_train:.4f}, RMSE_val={rmse_val:.4f}")

# Mean ± standard deviation across the folds
print("\n=== Cross-Validation Results on the training set ===")
print(f"RMSE (Train): {np.mean(rmse_train_folds):.4f} ± {np.std(rmse_train_folds):.4f}")
print(f"RMSE (Val):   {np.mean(rmse_val_folds):.4f} ± {np.std(rmse_val_folds):.4f}")
print(f"R2 (Train):   {np.mean(r2_train_folds):.4f} ± {np.std(r2_train_folds):.4f}")
print(f"R2 (Val):     {np.mean(r2_val_folds):.4f} ± {np.std(r2_val_folds):.4f}")

Fold 1: RMSE_train=6.9840, RMSE_val=7.0129
Fold 2: RMSE_train=6.9911, RMSE_val=6.9845
Fold 3: RMSE_train=6.9937, RMSE_val=6.9741
Fold 4: RMSE_train=6.9904, RMSE_val=6.9873
Fold 5: RMSE_train=6.9887, RMSE_val=6.9936

=== Cross-Validation Results on the training set ===
RMSE (Train): 6.9896 ± 0.0032
RMSE (Val):   6.9905 ± 0.0129
R2 (Train):   0.8665 ± 0.0001
R2 (Val):     0.8664 ± 0.0004


<p style="font-family: 'Courier New', Courier, monospace; font-size: 30px; font-weight: bold; color: blue;  text-align: left;">
 Retrain Final Model on All Training Data
</p>

In [None]:
# =================== Final Model Training (All Training Data) ===================

# Use the same pipeline as in CV for consistency: full poly on all features with scaling
alphas = np.logspace(-6, 6, 100)
pipeline = make_pipeline(
    StandardScaler(),
    PolynomialFeatures(degree=2),
    RidgeCV(alphas=alphas, cv=5, scoring='neg_mean_squared_error'),
)
pipeline.fit(X_lin_train, y_train_adj)

# Extract the fitted RidgeCV for coefficient access (note: coeffs are in scaled/poly space, no simple unpacking)
ridge_cv = pipeline.named_steps['ridgecv']
final_coeffs = np.concatenate(([ridge_cv.intercept_], ridge_cv.coef_))

# ========== Save coefficients and fitted pipeline ==========
os.makedirs('Models', exist_ok=True)  # Create 'Models' folder if it doesn't exist
with open('Models/poly_final_coeffs.pkl', 'wb') as f:
    pickle.dump(final_coeffs, f)
print("\nFinal Polynomial model coefficients saved to Models/poly_final_coeffs.pkl")
# No simple unpacking/display since coeffs are expanded (56 terms: intercept + 55
# polynomial columns) and scaled; for interpretability, we could consider partial
# polynomial instead

# The coefficient vector alone cannot reproduce predictions: it only has meaning
# together with the fitted scaler statistics and the polynomial term ordering.
# Persist the whole fitted pipeline as well so the model can be reloaded and used.
# NOTE: pickle files must only be loaded from trusted sources.
with open('Models/poly_final_pipeline.pkl', 'wb') as f:
    pickle.dump(pipeline, f)
print("Final Polynomial pipeline saved to Models/poly_final_pipeline.pkl")


Final Polynomial model coefficients saved to Models/poly_final_coeffs.pkl


<p style="font-family: 'Courier New', Courier, monospace; font-size: 30px; font-weight: bold; color: blue;  text-align: left;">
 Final Evaluation on Test Set
</p>

In [15]:
# =================== Final Evaluation (Test Set) ===================

# The pipeline predicts the frequency-adjusted target; adding the per-sample
# frequency offset back gives predictions comparable with y_test
y_test_pred_adj = pipeline.predict(X_lin_test)
y_test_pred = offset_test + y_test_pred_adj

# Held-out test metrics
r2_test = r2_score(y_test, y_test_pred)
rmse_test = np.sqrt(mean_squared_error(y_test, y_test_pred))

print(f"\nTest RMSE: {rmse_test:.4f}")
print(f"Test R2:   {r2_test:.4f}")


Test RMSE: 7.0069
Test R2:   0.8659
