In [None]:
import pandas as pd
import numpy as np
from sklearn.metrics import mean_squared_error, r2_score
import pickle
import os
from sklearn.preprocessing import PolynomialFeatures, StandardScaler

In [5]:

# Directory holding the standardized database (train/test CSVs and the fold file)
base_path = '../Extended Parametric Regression Files+Plots.'

# Read the pre-made train / test splits
df_train = pd.read_csv(f"{base_path}/train.csv")
df_test = pd.read_csv(f"{base_path}/test.csv")

# Feature columns; the order is significant because the feature matrices
# built below are plain arrays and are indexed by column position.
feature_names = [
    'distance', 'frequency',
    'c_walls', 'w_walls',
    'co2', 'humidity', 'pm25', 'pressure', 'temperature',
    'snr',
]

# Feature matrices and 'PL' targets as NumPy arrays
X_train, y_train = df_train[feature_names].values, df_train['PL'].values
X_test, y_test = df_test[feature_names].values, df_test['PL'].values

# 'time' column of each split, kept in case it is needed for plotting
time_train, time_test = df_train['time'].values, df_test['time'].values

# Report the size of each split
print(f"\nTraining samples: {X_train.shape[0]}, Test samples: {X_test.shape[0]}")

# Fold number of every training sample (5-fold assignment stored next to the CSVs)
fold_assignments = np.load(f"{base_path}/train_folds.npy")

# Show how many training samples fall into each fold
unique, counts = np.unique(fold_assignments, return_counts=True)
print(dict(zip(unique, counts)))

print('\nDataset loaded successfully!\n')


Training samples: 1209643, Test samples: 302411
{np.int64(0): np.int64(241929), np.int64(1): np.int64(241929), np.int64(2): np.int64(241929), np.int64(3): np.int64(241928), np.int64(4): np.int64(241928)}

Dataset loaded successfully!



In [6]:

# Linearized features and adjusted targets for the linear models.
#   * column 0 (distance) enters as 10*log10(d/d0), so its coefficient plays
#     the role of the path loss exponent;
#   * column 1 (frequency) contributes the fixed term 20*log10(f), which is
#     subtracted from the target instead of being fitted.
d0 = 1.0  # value the distance column is divided by before taking the log


def _linearize(X, y, ref_dist):
    """Return (log_d, offset, X_lin, y_adj) for one split.

    X is a 2-D array whose columns follow ``feature_names``; y is the matching
    target vector; ref_dist divides the distance column before the log.

    log_d  : log10(distance / ref_dist)
    offset : 20 * log10(frequency), the fixed frequency contribution
    X_lin  : [10 * log_d, columns 2..9 of X] stacked column-wise
    y_adj  : y with the frequency offset removed
    """
    log_d = np.log10(X[:, 0] / ref_dist)
    offset = 20 * np.log10(X[:, 1])
    X_lin = np.column_stack((10 * log_d, X[:, 2:10]))
    return log_d, offset, X_lin, y - offset


# Train
log_d_train, offset_train, X_lin_train, y_train_adj = _linearize(X_train, y_train, d0)

# Test
log_d_test, offset_test, X_lin_test, y_test_adj = _linearize(X_test, y_test, d0)

# Models to evaluate; each is expected as "<name>_final_coeffs.pkl" in models_dir
model_names = ['bayesian', 'elasticnet', 'lasso', 'mlr', 'poly', 'ridge']

# Directory for models
models_dir = 'Models'

# One dict per successfully evaluated model, with keys 'Model', 'RMSE', 'R2'
results = []

# Number of features in X_lin
n_features = X_lin_test.shape[1]

for name in model_names:
    model_path = os.path.join(models_dir, f"{name}_final_coeffs.pkl")
    if not os.path.exists(model_path):
        print(f"Model file not found: {model_path}")
        continue

    # SECURITY NOTE: pickle.load can execute arbitrary code stored in the file.
    # Only load coefficient files that come from a trusted source.
    # A file that cannot be opened or unpickled is reported and skipped, the
    # same way a missing file is, instead of aborting the whole evaluation.
    try:
        with open(model_path, 'rb') as f:
            coeffs = pickle.load(f)
    except Exception as e:
        print(f"Error loading {name}: {str(e)}")
        continue

    if not isinstance(coeffs, np.ndarray):
        print(f"Loaded object for {name} is not a numpy array.")
        continue

    # Normalize to a plain 1-D vector. reshape(-1) also turns a 0-d array into
    # a length-1 vector, so len() below cannot raise outside the try block; the
    # length checks further down then report the mismatch for that model.
    coeffs = np.asarray(coeffs).reshape(-1)

    num_coeffs = len(coeffs)

    # Predict on test set and reconstruct full predictions
    try:
        if name == 'poly':
            # Full polynomial model: degree-2 expansion of all standardized features.
            # NOTE(review): the scaler is re-fitted here on X_lin_train; this assumes
            # training used the same standardization -- TODO confirm.
            scaler = StandardScaler().fit(X_lin_train)
            # NOTE(review): the expansion is fitted on the unscaled matrix but applied
            # to the scaled one; PolynomialFeatures should only learn the feature
            # count during fit, so this is expected to be harmless.
            poly = PolynomialFeatures(degree=2).fit(X_lin_train)  # include_bias=True by default

            scaled_test = scaler.transform(X_lin_test)
            poly_test = poly.transform(scaled_test)

            # Expected layout: one intercept followed by one weight per polynomial
            # term (the bias column included), hence the "+ 1".
            if num_coeffs != poly_test.shape[1] + 1:
                raise ValueError(f"Coefficients length {num_coeffs} does not match expected for full polynomial: {poly_test.shape[1] + 1}")

            intercept = coeffs[0]
            poly_coef = coeffs[1:]
            y_test_pred_adj = intercept + np.dot(poly_test, poly_coef)
        else:
            # Linear models, assuming no scaling
            if num_coeffs == n_features + 1:
                # Leading coefficient is treated as the intercept
                X_aug_test = np.column_stack((np.ones(X_lin_test.shape[0]), X_lin_test))
                y_test_pred_adj = np.dot(X_aug_test, coeffs)
            elif num_coeffs == n_features:
                # No intercept stored
                y_test_pred_adj = np.dot(X_lin_test, coeffs)
            else:
                raise ValueError(f"For linear model {name}, coefficients length {num_coeffs} does not match n_features={n_features} or n_features+1")

        # Add the fixed frequency term back to obtain predictions on the original target scale
        y_test_pred = y_test_pred_adj + offset_test

        rmse_test = np.sqrt(mean_squared_error(y_test, y_test_pred))
        r2_test = r2_score(y_test, y_test_pred)

        results.append({
            'Model': name.capitalize(),
            'RMSE': rmse_test,
            'R2': r2_test
        })
    except Exception as e:
        # Best-effort evaluation: report the failure and move on to the next model
        print(f"Error evaluating {name}: {str(e)}")

# Build the summary table, best (lowest RMSE) first.
# The columns are given explicitly so that 'RMSE' exists even when `results`
# is empty; otherwise sort_values('RMSE') raises KeyError on an empty frame.
df_results = pd.DataFrame(results, columns=['Model', 'RMSE', 'R2'])
df_results = df_results.sort_values('RMSE', ascending=True)

# Display the table with 5 decimal places
print("\nRegression Models Evaluation on Test Set (sorted by RMSE):\n")
if df_results.empty:
    # Nothing was appended to `results`, so there is no table to show
    print("No models were evaluated.")
else:
    print(df_results.to_string(index=False, float_format=lambda x: '{:.5f}'.format(x)))


Regression Models Evaluation on Test Set (sorted by RMSE):

     Model    RMSE      R2
      Poly 7.00687 0.86587
     Ridge 8.20975 0.81586
  Bayesian 8.20975 0.81586
Elasticnet 8.20975 0.81586
     Lasso 8.20975 0.81586
       Mlr 8.20975 0.81586
