In [None]:
# Import all necessary libraries
import numpy as np
import pandas as pd
from typing import Callable, Tuple, List
from sklearn.model_selection import KFold, train_test_split, GridSearchCV

from scipy.optimize import minimize
import time
from typing import Dict, Any, Tuple, List

# Import functions from Functions.py file
from Functions_11_Avino_Lombardi import (
    forward, backward,
    g1, dg1_dx, g2, dg2_dx,
    mse_loss, mape,
    initialize_parameters, unroll_params, roll_params,
    check_gradients_with_central_differences
)


# --- Define Objective Function for scipy.optimize.minimize ---

def objective_function(flat_params: np.ndarray,
                       input_data: np.ndarray,
                       target_data: np.ndarray,
                       W_shapes: List[Tuple[int, ...]],
                       b_shapes: List[Tuple[int, ...]],
                       v_shape: Tuple[int, ...],
                       activation_func: Callable[[np.ndarray], np.ndarray],
                       activation_prime: Callable[[np.ndarray], np.ndarray],
                       regularization_factor: float,
                       num_layers: int) -> Tuple[float, np.ndarray]:
    """
    Objective function for scipy.optimize.minimize.
    Returns loss and flattened gradients.
    """
    W_list, b_list, v = roll_params(flat_params, W_shapes, b_shapes, v_shape)

    # Forward pass
    y_pred, a_list, z_list = forward(input_data, W_list, b_list, v, activation_func, num_layers)

    # Calculate MSE loss (1/N * sum((y_i - y_hat_i)^2))
    loss = mse_loss(target_data, y_pred)

    # Add L2 regularization to the loss: lambda * sum( ||omega_l||_F^2 )
    regularization_term = 0
    for W in W_list:
        regularization_term += np.sum(W**2)
    regularization_term += np.sum(v**2) # Sum of squares for final output weights
    loss += regularization_factor * regularization_term 

    # Backward pass to get gradients
    grad_W_list, grad_b_list, grad_v = backward(input_data, target_data, W_list, b_list, v, a_list, z_list, activation_prime, num_layers)

    # Add regularization term to gradients
    for i in range(len(grad_W_list)):
        grad_W_list[i] += 2 * regularization_factor * W_list[i] # d/dW(||W||^2) = 2W
    grad_v += 2 * regularization_factor * v # d/dv(||v||^2) = 2v

    # Flatten gradients for the optimizer
    flattened_gradients = unroll_params(grad_W_list, grad_b_list, grad_v)

    return loss, flattened_gradients


### Data loading and splitting
After loading the data in our environment, we randomly partition it into training ( $80\% $) and testing ( $20\%$ ) sets. This separation is fundamental to evaluate the generalization capability of our trained model on unseen (test) data. 

In [None]:
# --- Data Loading ---

df = pd.read_csv('/Users/Val/Documents/GitHub/OMDS-Project/dataset/AGE_PREDICTION.csv')

# Separate features (X_full) and target (y_full) from the entire dataset
feature_columns = [f'feat_{i}' for i in range(1, 33)]
X_full = df[feature_columns].values # Shape will be [N, D]
y_full = df['gt'].values.reshape(-1, 1) # Shape will be [N, 1]


print(f"Initial X shape: {X_full.shape}")
print(f"Initial y shape: {y_full.shape}")

# Get total number of samples
N_samples_full = X_full.shape[0]
X_train, X_test, y_train, y_test = train_test_split(X_full, y_full, test_size=0.2, random_state=1234)

X_train = X_train.T # Transpose to [D, N]
y_train = y_train.T # Transpose to [1, N]
X_test = X_test.T # Transpose to [D, N]
y_test = y_test.T # Transpose to [1, N]

# Determine D_input (number of input features) - from training data
D_input = X_train.shape[0]
y_output_dim = y_train.shape[0] # Should be 1 for your problem
print(f"D_input (number of input features): {D_input}")
print(f"y_output_dim (number of output dimensions): {y_output_dim}\n")
print(f"X_train shape after split: {X_train.shape}")
print(f"y_train shape after split: {y_train.shape}") # This should be (1, N_train)

Initial X shape: (20475, 32)
Initial y shape: (20475, 1)
D_input (number of input features): 32
y_output_dim (number of output dimensions): 1

X_train shape after split: (32, 16380)
y_train shape after split: (1, 16380)


### Feature standardization

Before applying any optimization routine, we normalize our data
For each feature $x_i$:
$$x_i^{\text{normalized}}=\frac{x_i-\mu_i}{\sigma_i}$$

where $\mu_i$ and $\sigma_i$ are the **mean** and **standard deviation** of the $i^{th}$ feature, computed over the **training set**, since at this stage we do not have access to test information. The same transformation is then applied to test data.

Standardization ensures __all features contribute equally__ to the loss landscape, thus to the gradient updates, prevents issues like vanishing or exploding gradients due to varying feature scales, and accelerates the convergence of our L-BFGS-B optimizer.

In [None]:
# --FEATURES NORMALIZATION -- also with "StandardScaler"
# only computed from train data
mu = X_train.mean(axis=1, keepdims=True)
sigma = X_train.std(axis=1, keepdims=True)

# Handle cases where standard deviation might be zero (e.g.,  constant feature)
sigma[sigma == 0] = 1e-8 

# Apply the transformation to the TRAINING DATA
X_train_normalized = (X_train - mu) / sigma

# Apply the *SAME* transformation (using mu and sigma from training) to the TEST DATA
X_test_normalized = (X_test - mu) / sigma

In [None]:
# GRADIENT CHECKING
print("\nPerforming gradient check...")

# toy subset of training data for gradient check 
num_samples_for_check = min(1000, X_train_normalized.shape[1])
X_check_subset = X_train_normalized[:, :num_samples_for_check]
y_check_subset = y_train[:, :num_samples_for_check]

# representative set of hyperparameters for the check
check_L = 3
check_neurons_config = [5, 5]
check_activation_func = g1 # Using g1 for this check, then switching
check_activation_prime = dg1_dx
check_reg_factor = 0.01

# Initialize parameters for the check (ensure these are fresh initializations)
W_check_init, b_check_init, v_check_init = initialize_parameters(
    D_input, check_neurons_config, y_output_dim, check_reg_factor
)
initial_flat_params_for_check = unroll_params(W_check_init, b_check_init, v_check_init)

W_shapes_for_check = [W.shape for W in W_check_init]
b_shapes_for_check = [b.shape for b in b_check_init]
v_shape_for_check = v_check_init.shape

# Call the gradient check function
check_gradients_with_central_differences(
    initial_flat_params_for_check,
    X_check_subset, y_check_subset,
    W_shapes_for_check, b_shapes_for_check, v_shape_for_check,
    check_activation_func, check_activation_prime,
    check_reg_factor, check_L,
    objective_function # Pass a reference to your objective_function
)

print("Gradient check finished.")



Performing gradient check...
DEBUG IN BACKWARD: Shape of y_pred_for_grad_calc: (1, 1000)
DEBUG IN BACKWARD: Shape of dLoss_dout: (1, 1000)

--- Gradient Check (Central Differences) ---
Checking 200 parameters...
Analytical loss at initial point: 825.524543
DEBUG IN BACKWARD: Shape of y_pred_for_grad_calc: (1, 1000)
DEBUG IN BACKWARD: Shape of dLoss_dout: (1, 1000)
DEBUG IN BACKWARD: Shape of y_pred_for_grad_calc: (1, 1000)
DEBUG IN BACKWARD: Shape of dLoss_dout: (1, 1000)
DEBUG IN BACKWARD: Shape of y_pred_for_grad_calc: (1, 1000)
DEBUG IN BACKWARD: Shape of dLoss_dout: (1, 1000)
DEBUG IN BACKWARD: Shape of y_pred_for_grad_calc: (1, 1000)
DEBUG IN BACKWARD: Shape of dLoss_dout: (1, 1000)
DEBUG IN BACKWARD: Shape of y_pred_for_grad_calc: (1, 1000)
DEBUG IN BACKWARD: Shape of dLoss_dout: (1, 1000)
DEBUG IN BACKWARD: Shape of y_pred_for_grad_calc: (1, 1000)
DEBUG IN BACKWARD: Shape of dLoss_dout: (1, 1000)
DEBUG IN BACKWARD: Shape of y_pred_for_grad_calc: (1, 1000)
DEBUG IN BACKWARD: Sha

### K-Fold Cross Validation

We now perform **5-fold cross-validation** on the training data to assess the optimal Multi-Layer Perceptron (MLP) network architecture. This method involves partitioning the training set into k=5 equally sized, disjoint subsets. For each fold $i\in\{1,…,5\}$, the model is trained on the data from the other 4 folds and subsequently evaluated on the held-out fold $F_i$

This cross-validation procedure is integrated with a **Grid Search** strategy. The Grid Search exhaustively explores a predefined hyperparameter space **H**, which includes combinations of:

- Number of layers ($L\in\{2,3,4\}$)

- Number of neurons per hidden layer (e.g., $[8]$ for L=2, $[8,8]$ for L=3, etc.)

- Choice of activation function ( sigmoid or hyperbolic tangent)

- Regularization factor ($\lambda$) for the L2 penalty.

For each unique combination of hyperparameters within this grid, a model is trained and evaluated $5$ times (once for each fold). The "best performance on average on validation sets" refers to the mean evaluation metric (specifically, the Mean Absolute Percentage Error, MAPE) computed across these $5$ validation folds for that particular hyperparameter combination. The **optimal MLP architecture** is then identified as the combination of hyperparameters from the grid that yields the **lowest average MAPE** across its respective cross-validation folds. This averaging process provides a more robust and reliable estimate of the model's performance by reducing the variance associated with a single train/validation split.

In [5]:
# --- Hyperparameter Search Space ---
hyperparameter_grid = {
    'num_layers': [2, 3, 4],
    'num_neurons_per_layer': {
        2: [[4], [8], [16], [32]], # For L=2
        3: [[4, 4], [8, 8], [16, 16], [32, 32]], # For L=3
        4: [[4, 4, 4], [8, 8, 8], [16, 16, 16]], # For L=4
    },
    'activation_function': [(g1, dg1_dx), (g2, dg2_dx)],
    'regularization_factor': [0.001, 0.01, 0.1],
    'learning_rate': [0.01, 0.001, 0.0001], # This was in a previous version, but might not be if you removed it for minimize.
    'num_iterations': [500, 1000, 2000] # Same as above
}

# --- K-Fold Cross-Validation Setup ---
n_splits = 5 # For 5-fold cross-validation
kf = KFold(n_splits=n_splits, shuffle=True, random_state=1234)

best_mape = float('inf')
best_hyperparameters = {}
best_training_results = {}

# --- Hyperparameter Tuning Loop ---
print("Starting Hyperparameter Tuning...\n")

for L in hyperparameter_grid['num_layers']:
    hidden_neurons_options = hyperparameter_grid['num_neurons_per_layer'][L]

    for neurons_config in hidden_neurons_options:
        for activation_tuple in hyperparameter_grid['activation_function']:
            activation_func, activation_prime = activation_tuple
            activation_name = activation_func.__name__ # For printing

            for reg_factor in hyperparameter_grid['regularization_factor']:

                print(f"Testing L={L}, Neurons={neurons_config}, Activation={activation_name}, Reg Factor={reg_factor}")

                fold_mape_scores = []
                fold_training_errors = []
                fold_optimization_times = []
                fold_iterations = []

                # Perform K-Fold Cross-Validation on the TRAINING DATA
                for fold, (train_index, val_index) in enumerate(kf.split(X_train.T)): # X_train.T to split along samples
                    X_fold_train, X_fold_val = X_train_normalized[:, train_index], X_train_normalized[:, val_index]
                    y_fold_train, y_fold_val = y_train[:, train_index], y_train[:, val_index]

                    # Initialize parameters for this fold
                    W_init, b_init, v_init = initialize_parameters(D_input, neurons_config, y_output_dim, reg_factor)
                    initial_flat_params = unroll_params(W_init, b_init, v_init)

                    # Store shapes for reshaping inside objective function
                    W_shapes = [W.shape for W in W_init]
                    b_shapes = [b.shape for b in b_init]
                    v_shape = v_init.shape

                    start_time = time.time()
                    # Optimization using scipy.optimize.minimize on fold training data
                    result = minimize(
                        fun=objective_function,
                        x0=initial_flat_params,
                        args=(X_fold_train, y_fold_train, W_shapes, b_shapes, v_shape, activation_func, activation_prime, reg_factor, L),
                        method='L-BFGS-B',
                        jac=True,
                        options={'disp': False, 'maxiter': 5000}
                    )
                    end_time = time.time()

                    optimization_time = end_time - start_time
                    fold_optimization_times.append(optimization_time)
                    fold_iterations.append(result.nit)

                    # Reshape optimized parameters
                    W_optimized, b_optimized, v_optimized = roll_params(result.x, W_shapes, b_shapes, v_shape)

                    # Evaluate on training set (without regularization for training error as per instructions)
                    y_fold_train_pred_no_reg, _, _ = forward(X_fold_train, W_optimized, b_optimized, v_optimized, activation_func, L)
                    train_error_no_reg = mse_loss(y_fold_train, y_fold_train_pred_no_reg)
                    fold_training_errors.append(train_error_no_reg)

                    # Evaluate on validation set using MAPE
                    y_fold_val_pred, _, _ = forward(X_fold_val, W_optimized, b_optimized, v_optimized, activation_func, L)
                    val_mape = mape(y_fold_val, y_fold_val_pred)
                    fold_mape_scores.append(val_mape)

                avg_mape = np.mean(fold_mape_scores)
                avg_training_error = np.mean(fold_training_errors)
                avg_optimization_time = np.mean(fold_optimization_times)
                avg_iterations = np.mean(fold_iterations)

                print(f"  Average Validation MAPE: {avg_mape:.4f}%")

                if avg_mape < best_mape:
                    best_mape = avg_mape
                    best_hyperparameters = {
                        'num_layers': L,
                        'num_neurons_per_layer': neurons_config,
                        'activation_function': activation_name,
                        'regularization_factor': reg_factor
                    }
                    best_training_results = {
                        'optimization_solver': result.message,
                        'num_iterations': avg_iterations,
                        'optimization_time': avg_optimization_time,
                        'training_error_no_reg': avg_training_error,
                        'best_model_params': result.x # Store the flat optimized parameters
                    }
                print("-" * 50)

print("\nHyperparameter Tuning Complete.")

# --- Final Evaluation and Output ---

print("\n--- Optimal Hyperparameters and Performance ---")
print(f"Optimal Number of Layers (L): {best_hyperparameters['num_layers']}")
print(f"Optimal Number of Neurons per Layer (N): {best_hyperparameters['num_neurons_per_layer']}")
print(f"Optimal Activation Function: {best_hyperparameters['activation_function']}")
print(f"Optimal Regularization Factor (lambda): {best_hyperparameters['regularization_factor']}")
print(f"Other Hyperparameters (e.g., initialization strategy): Xavier/Glorot initialization (scaled random normal)")
print(f"Optimization Solver: L-BFGS-B ({best_training_results['optimization_solver']})")
print(f"Average Number of Iterations for Optimization: {best_training_results['num_iterations']:.2f}")
print(f"Average Optimization Time per Fold: {best_training_results['optimization_time']:.4f} seconds")
print(f"Average Training Error (MSE, without regularization) of Optimal Model: {best_training_results['training_error_no_reg']:.4f}")

Starting Hyperparameter Tuning...

Testing L=2, Neurons=[4], Activation=g1, Reg Factor=0.001
  Average Validation MAPE: 20.2758%
--------------------------------------------------
Testing L=2, Neurons=[4], Activation=g1, Reg Factor=0.01
  Average Validation MAPE: 20.4946%
--------------------------------------------------
Testing L=2, Neurons=[4], Activation=g1, Reg Factor=0.1
  Average Validation MAPE: 23.5948%
--------------------------------------------------
Testing L=2, Neurons=[4], Activation=g2, Reg Factor=0.001
  Average Validation MAPE: 20.2387%
--------------------------------------------------
Testing L=2, Neurons=[4], Activation=g2, Reg Factor=0.01
  Average Validation MAPE: 20.4948%
--------------------------------------------------
Testing L=2, Neurons=[4], Activation=g2, Reg Factor=0.1
  Average Validation MAPE: 23.6110%
--------------------------------------------------
Testing L=2, Neurons=[8], Activation=g1, Reg Factor=0.001
  Average Validation MAPE: 20.2650%
-------

### Testing
Having identified the optimal set of hyperparameters, the model will be retrained using these **best-performing parameters** on the **entire available training dataset**. This step ensures the model leverages all learning opportunities before its ultimate evaluation. Subsequently, we will assess its true generalization capability by making predictions on the completely unseen test dataset.

In [None]:
print("\nRetraining optimal model on full TRAINING dataset for final evaluation...")
best_L = best_hyperparameters['num_layers']
best_neurons_config = best_hyperparameters['num_neurons_per_layer']
# Map activation function name back to the callable objects
best_activation_func = g1 if best_hyperparameters['activation_function'] == 'g1' else g2
best_activation_prime = dg1_dx if best_hyperparameters['activation_function'] == 'g1' else dg2_dx
best_reg_factor = best_hyperparameters['regularization_factor']

# Re-initialize for full training on the *entire training set* (X_train_normalized, y_train)
W_final_init, b_final_init, v_final_init = initialize_parameters(D_input, best_neurons_config, y_output_dim, best_reg_factor)
initial_flat_params_final = unroll_params(W_final_init, b_final_init, v_final_init)

W_shapes_final = [W.shape for W in W_final_init]
b_shapes_final = [b.shape for b in b_final_init]
v_shape_final = v_final_init.shape

start_time_final_train = time.time()
result_final_train = minimize(
    fun=objective_function,
    x0=initial_flat_params_final,
    args=(X_train_normalized, y_train, W_shapes_final, b_shapes_final, v_shape_final, best_activation_func, best_activation_prime, best_reg_factor, best_L),
    method='L-BFGS-B',
    jac=True,
    options={'disp': False, 'maxiter': 1000}
)
end_time_final_train = time.time()

W_final_trained, b_final_trained, v_final_trained = roll_params(result_final_train.x, W_shapes_final, b_shapes_final, v_shape_final)

# Evaluate on the TEST data (X_test_normalized, y_test)
print(f"\nFinal Test Data Shape (from split): {X_test.shape}, {y_test.shape}")
y_test_pred, _, _ = forward(X_test_normalized, W_final_trained, b_final_trained, v_final_trained, best_activation_func, best_L)
test_error_mape = mape(y_test, y_test_pred)

print(f"Test Error (MAPE) of Optimal Model: {test_error_mape:.4f}%")

# Also calculate MSE test error
test_error_mse = mse_loss(y_test, y_test_pred)
print(f"Test Error (MSE) of Optimal Model: {test_error_mse:.4f}")



Retraining optimal model on full TRAINING dataset for final evaluation...

Final Test Data Shape (from split): (32, 4095), (1, 4095)
Test Error (MAPE) of Optimal Model: 20.9393%
Test Error (MSE) of Optimal Model: 49.4616


In [8]:
print("\n--- Summary of Optimal Configuration ---")
print(f"Optimal Number of Layers (L): {best_L}")
print(f"Optimal Number of Neurons per Layer: {best_neurons_config}")
print(f"Optimal Activation Function: {best_activation_func.__name__}")
print(f"Optimal Regularization Factor: {best_reg_factor}")
best_max_iter = best_training_results['num_iterations']
print(f"Max Iterations for L-BFGS-B (from Best CV Training Results): {best_max_iter}")


# --- Training Set Performance (After Final Retraining) ---
print("\n--- Training Set Performance (Optimal Model on Full Training Data) ---")

# Training MAPE (already has y_train_pred from previous evaluation logic if you had it,
# but let's re-calculate to be explicit)
y_train_pred, _, _ = forward(X_train_normalized, W_final_trained, b_final_trained, v_final_trained, best_activation_func, best_L)
train_error_mape = mape(y_train, y_train_pred)
print(f"Training Error (MAPE): {train_error_mape:.4f}%")

# Training MSE (Regularized)
# Reuse the objective_function with the final trained parameters and full training data
train_error_mse_reg, _ = objective_function(
    result_final_train.x, # This is the flattened trained parameters from the last minimize call
    X_train_normalized,   # Your original [D,N] training data
    y_train,              # Your original [1,N] training data
    W_shapes_final, b_shapes_final, v_shape_final, # Shapes derived from final init
    best_activation_func, best_activation_prime, best_reg_factor, best_L
)
print(f"Training Error (MSE, Regularized): {train_error_mse_reg:.4f}")


# --- Average Validation Performance (From Cross-Validation) ---
print("\n--- Average Validation Performance (from GridSearchCV Optimal Combination) ---")
# This `best_mape` variable should contain the average validation MAPE from your GridSearchCV output.
print(f"Average Validation Error (MAPE): {best_mape:.4f}%")

# Average Validation MSE (Regularized)
# Note: Unless you explicitly configured GridSearchCV to also track 'neg_mean_squared_error'
# (regularized) and stored its 'mean_test_score' for the best combination, this value
# is not directly available from `best_mape`. If you did, you would access it from `grid_search.cv_results_`.
# For now, we will note this as "not directly available" if you only tracked MAPE as the scoring metric.
# If your `best_training_results` dict contains an average regularized MSE, you can use it here.
try:
    avg_val_mse_reg = best_training_results['training_error_no_reg'] # Placeholder - check your dict key carefully
    print(f"Average Validation Error (MSE, Regularized): {avg_val_mse_reg:.4f} (check if this is truly regularized MSE)")
except KeyError:
    print(f"Average Validation Error (MSE, Regularized): (Not directly available from original GridSearchCV output if only MAPE was tracked.)")


# --- Test Set Performance (Optimal Model) ---
# (test_error_mape and test_error_mse are already computed in the cell you ran)
print("\n--- Test Set Performance (Optimal Model) ---")
print(f"Test Error (MAPE): {test_error_mape:.4f}%")

# Test MSE (Regularized)
# Reuse the objective_function with the final trained parameters and test data
test_error_mse_reg, _ = objective_function(
    result_final_train.x, # Flattened trained parameters from final minimize call
    X_test_normalized,    # Original [D,N] test data
    y_test,               # Original [1,N] test data
    W_shapes_final, b_shapes_final, v_shape_final, # Shapes derived from final init
    best_activation_func, best_activation_prime, best_reg_factor, best_L
)
print(f"Test Error (MSE, Regularized): {test_error_mse_reg:.4f}")


--- Summary of Optimal Configuration ---
Optimal Number of Layers (L): 2
Optimal Number of Neurons per Layer: [32]
Optimal Activation Function: g2
Optimal Regularization Factor: 0.001
Max Iterations for L-BFGS-B (from Best CV Training Results): 2788.2

--- Training Set Performance (Optimal Model on Full Training Data) ---
Training Error (MAPE): 20.0496%
Training Error (MSE, Regularized): 46.8311

--- Average Validation Performance (from GridSearchCV Optimal Combination) ---
Average Validation Error (MAPE): 20.2029%
Average Validation Error (MSE, Regularized): 46.0791 (check if this is truly regularized MSE)

--- Test Set Performance (Optimal Model) ---
Test Error (MAPE): 20.9393%
Test Error (MSE, Regularized): 50.0484
