In [None]:
import sys, os
sys.path.insert(0, '/Users/livestorborg/Desktop/FYS-STK4155/project2/code')

import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso

from src.neural_network import NeuralNetwork
from src.activations import Sigmoid, Linear
from src.losses import MSE
from src.optimizers import Adam, RMSprop
from src.training import train
from src.metrics import mse
from src.utils import runge, polynomial_features, scale_data, inverse_scale_y
from src.plotting import lambda_eta_heatmap

In [None]:
# Seed NumPy's global RNG first so the noise drawn below is reproducible.
SEED = 42
np.random.seed(SEED)

# N equally spaced sample points on the interval [-1, 1].
N = 100
x = np.linspace(start=-1, stop=1, num=N)

# Noise-free targets come from the project's `runge` helper; the noisy targets
# add zero-mean Gaussian noise with standard deviation 0.1, one draw per point.
y_true = runge(x)
y_noise = y_true + np.random.normal(loc=0, scale=0.1, size=N)

# Setup for Lasso Regression (using Scikit-learn)

In [None]:
# Lasso baseline: degree-9 polynomial features of x (intercept column disabled),
# fitted with scikit-learn's Lasso at a fixed alpha.
X_poly = polynomial_features(x, p=9, intercept=False)

X_train_poly, X_test_poly, y_train, y_test = train_test_split(
    X_poly, y_noise, test_size=0.2, random_state=SEED
)

# Scale data.
# The scaled arrays and statistics get Lasso-specific names on purpose: the
# neural-network cell binds X_train_s / y_train_s / X_test_s / y_test_s /
# X_mean / X_std / y_mean to the raw 1-D input. Sharing those names here would
# let an out-of-order re-run of this cell silently hand the polynomial design
# matrix (and its y_mean) to the grid-search cells.
X_train_poly_s, y_train_poly_s, X_poly_mean, X_poly_std, y_poly_mean = scale_data(
    X_train_poly, y_train
)
# The training-split statistics are passed back in for the test split.
X_test_poly_s, y_test_poly_s, _, _, _ = scale_data(
    X_test_poly, y_test, X_poly_mean, X_poly_std, y_poly_mean
)

lasso_model = Lasso(alpha=0.01, max_iter=10000, random_state=SEED)
lasso_model.fit(X_train_poly_s, y_train_poly_s.ravel())
y_pred_lasso = lasso_model.predict(X_test_poly_s)

# NOTE(review): this MSE is computed on the scaled targets, whereas the NN
# cells compare on inverse-scaled targets. The two are directly comparable only
# if scale_data merely shifts y by y_mean (no division) - confirm in src.utils.
lasso_mse = mse(y_test_poly_s.reshape(-1, 1), y_pred_lasso.reshape(-1, 1))

In [None]:
# The network takes the raw 1-D input, so inputs and targets are reshaped into
# single-column 2-D arrays before splitting.
nn_inputs = x.reshape(-1, 1)
nn_targets = y_noise.reshape(-1, 1)

# Same test_size and random_state as the Lasso split.
X_train_raw, X_test_raw, y_train_nn, y_test_nn = train_test_split(
    nn_inputs, nn_targets, test_size=0.2, random_state=SEED
)

# Scale for NN: the first call returns the scaling statistics for the training
# split; the second call is given those same statistics for the test split.
X_train_s, y_train_s, X_mean, X_std, y_mean = scale_data(X_train_raw, y_train_nn)
X_test_s, y_test_s, _, _, _ = scale_data(
    X_test_raw, y_test_nn, X_mean, X_std, y_mean
)

# Test targets mapped back with inverse_scale_y; computed once here and reused
# by every grid-search loop.
y_test_real = inverse_scale_y(y_test_s, y_mean)

# Stochastic Gradient Descent with RMSprop (L1 Regularization)

In [None]:
# Grid search over learning rate (eta) and L1 penalty strength (lambda) for a
# two-hidden-layer sigmoid network trained with RMSprop.
eta_vals = np.logspace(-5, -1, 5)
lambda_vals = np.logspace(-5, 1, 5)
hidden_layers = [50, 50]

n_eta = len(eta_vals)
n_lambda = len(lambda_vals)

# Storage: one trained model and one train/test MSE per (eta, lambda) pair.
# Rows index eta, columns index lambda.
models = [[None for _ in range(n_lambda)] for _ in range(n_eta)]
train_mse = np.zeros((n_eta, n_lambda))
test_mse = np.zeros((n_eta, n_lambda))

# The inverse-scaled training targets do not depend on eta or lambda, so they
# are computed once here rather than at every grid point (as is already done
# for y_test_real).
y_train_real = inverse_scale_y(y_train_s, y_mean)

# Grid search
for i, eta in enumerate(eta_vals):
    for j, lam in enumerate(lambda_vals):
        # Fresh network per grid point, built with the same seed each time.
        model = NeuralNetwork(
            network_input_size=1,
            layer_output_sizes=hidden_layers + [1],
            activations=[Sigmoid(), Sigmoid(), Linear()],
            loss=MSE(),
            seed=SEED,
            lambda_reg=lam,
            reg_type='l1' if lam > 0 else None,
            weight_init='xavier'
        )

        optimizer = RMSprop(eta=eta)

        # NOTE(review): the test split is passed as X_val/y_val (presumably
        # what early stopping monitors) and the best grid point is later chosen
        # by test MSE, so the reported best test MSE is optimistically biased.
        # Consider carving a separate validation split out of the training data.
        train(
            nn=model,
            X_train=X_train_s,
            y_train=y_train_s,
            X_val=X_test_s,
            y_val=y_test_s,
            optimizer=optimizer,
            epochs=500,
            batch_size=16,
            stochastic=True,
            task='regression',
            early_stopping=True,
            patience=50,
            verbose=False,
            seed=SEED
        )

        models[i][j] = model

        # Evaluate on inverse-scaled predictions and targets.
        y_train_pred = inverse_scale_y(model.predict(X_train_s), y_mean)
        y_test_pred = inverse_scale_y(model.predict(X_test_s), y_mean)

        train_mse[i, j] = mse(y_train_real, y_train_pred)
        test_mse[i, j] = mse(y_test_real, y_test_pred)

# Locate the grid point with the lowest test MSE and keep its settings under
# RMSprop-specific names for the final comparison.
min_idx_rms = np.unravel_index(np.argmin(test_mse), test_mse.shape)
i_best_rms, j_best_rms = min_idx_rms

best_eta_rms = eta_vals[i_best_rms]
best_lambda_rms = lambda_vals[j_best_rms]
best_test_mse_rms = test_mse[i_best_rms, j_best_rms]
best_train_mse_rms = train_mse[i_best_rms, j_best_rms]
print(f'Best eta:    {best_eta_rms}')
print(f'Best lambda: {best_lambda_rms}')
print(f'Best train MSE: {best_train_mse_rms}')
print(f'Best test MSE:  {best_test_mse_rms}')

In [None]:
# Draw one eta/lambda heatmap per data split from the currently bound
# train_mse / test_mse grids, showing each figure before the next is drawn.
for mse_grid, split_name in ((train_mse, 'Training'), (test_mse, 'Testing')):
    lambda_eta_heatmap(
        mse_grid, eta_vals, lambda_vals,
        metric_name='MSE', dataset=split_name
    )
    plt.show()

# Stochastic Gradient Descent with Adam (L1 Regularization)

In [None]:
# Grid search over learning rate (eta) and L1 penalty strength (lambda) for a
# two-hidden-layer sigmoid network trained with Adam.
eta_vals = np.logspace(-5, -1, 5)
lambda_vals = np.logspace(-5, 1, 5)
hidden_layers = [50, 50]

n_eta = len(eta_vals)
n_lambda = len(lambda_vals)

# Storage: one trained model and one train/test MSE per (eta, lambda) pair.
# Rows index eta, columns index lambda. These names are rebound here, so any
# grids stored under them by an earlier cell are replaced.
models = [[None for _ in range(n_lambda)] for _ in range(n_eta)]
train_mse = np.zeros((n_eta, n_lambda))
test_mse = np.zeros((n_eta, n_lambda))

# The inverse-scaled training targets do not depend on eta or lambda, so they
# are computed once here rather than at every grid point (as is already done
# for y_test_real).
y_train_real = inverse_scale_y(y_train_s, y_mean)

# Grid search
for i, eta in enumerate(eta_vals):
    for j, lam in enumerate(lambda_vals):
        # Fresh network per grid point, built with the same seed each time.
        model = NeuralNetwork(
            network_input_size=1,
            layer_output_sizes=hidden_layers + [1],
            activations=[Sigmoid(), Sigmoid(), Linear()],
            loss=MSE(),
            seed=SEED,
            lambda_reg=lam,
            reg_type='l1' if lam > 0 else None,
            weight_init='xavier'
        )

        optimizer = Adam(eta=eta)

        # NOTE(review): the test split is passed as X_val/y_val (presumably
        # what early stopping monitors) and the best grid point is later chosen
        # by test MSE, so the reported best test MSE is optimistically biased.
        # Consider carving a separate validation split out of the training data.
        train(
            nn=model,
            X_train=X_train_s,
            y_train=y_train_s,
            X_val=X_test_s,
            y_val=y_test_s,
            optimizer=optimizer,
            epochs=500,
            batch_size=16,
            stochastic=True,
            task='regression',
            early_stopping=True,
            patience=50,
            verbose=False,
            seed=SEED
        )

        models[i][j] = model

        # Evaluate on inverse-scaled predictions and targets.
        y_train_pred = inverse_scale_y(model.predict(X_train_s), y_mean)
        y_test_pred = inverse_scale_y(model.predict(X_test_s), y_mean)

        train_mse[i, j] = mse(y_train_real, y_train_pred)
        test_mse[i, j] = mse(y_test_real, y_test_pred)


# Locate the grid point with the lowest test MSE and keep its settings under
# Adam-specific names for the final comparison.
min_idx_adam = np.unravel_index(np.argmin(test_mse), test_mse.shape)
i_best_adam, j_best_adam = min_idx_adam

best_eta_adam = eta_vals[i_best_adam]
best_lambda_adam = lambda_vals[j_best_adam]
best_test_mse_adam = test_mse[i_best_adam, j_best_adam]
best_train_mse_adam = train_mse[i_best_adam, j_best_adam]
print(f'Best eta:    {best_eta_adam}')
print(f'Best lambda: {best_lambda_adam}')
print(f'Best train MSE: {best_train_mse_adam}')
print(f'Best test MSE:  {best_test_mse_adam}')

In [None]:
# Draw one eta/lambda heatmap per data split from the currently bound
# train_mse / test_mse grids, showing each figure before the next is drawn.
for mse_grid, split_name in ((train_mse, 'Training'), (test_mse, 'Testing')):
    lambda_eta_heatmap(
        mse_grid, eta_vals, lambda_vals,
        metric_name='MSE', dataset=split_name
    )
    plt.show()

# Comparing optimization algorithms with Lasso Regression

In [None]:
# Side-by-side summary of the Lasso baseline MSE and the best test MSE found by
# each optimizer's grid search.
# eta is printed in scientific notation: the grid goes down to 1e-5, which a
# fixed four-decimal format would display as 0.0000.
print("\n" + "="*50)
print("....")
print("="*50)
print(f"Lasso (Scikit-Learn):     {lasso_mse:.6f}")
print(f"NN + RMSprop:             {best_test_mse_rms:.6f}   (eta={best_eta_rms:.1e})   (lambda={best_lambda_rms:.6f})")
print(f"NN + Adam:                {best_test_mse_adam:.6f}  (eta={best_eta_adam:.1e})  (lambda={best_lambda_adam:.6f})")
print("="*50)