In [None]:
import os
import sys

# Directory that contains the `src` package imported below. The default is the
# original author's checkout; set the PROJECT2_CODE_DIR environment variable to
# point at a different checkout instead of editing this cell.
CODE_DIR = os.environ.get(
    "PROJECT2_CODE_DIR",
    '/Users/livestorborg/Desktop/FYS-STK4155/project2/code',
)

# Guard the insert so re-running this cell does not stack duplicate entries
# on sys.path.
if CODE_DIR not in sys.path:
    sys.path.insert(0, CODE_DIR)

import numpy as np
from sklearn.model_selection import train_test_split
from src.neural_network import NeuralNetwork
from src.activations import Sigmoid, Linear
from src.losses import MSE
from src.optimizers import GD, RMSprop, Adam
from src.training import train
from src.metrics import mse
from src.utils import runge, polynomial_features, scale_data, OLS_parameters, inverse_scale_y

# Setup for OLS Regression

In [None]:
# Seed NumPy's global RNG so the noise draw below is reproducible.
SEED = 42
np.random.seed(SEED)

# Evaluate `runge` on N evenly spaced points in [-1, 1].
N = 300
x = np.linspace(-1, 1, num=N)
y_true = runge(x)

# Noisy targets: additive Gaussian noise with mean 0 and standard deviation 0.01.
noise = np.random.normal(loc=0, scale=0.01, size=N)
y_noise = y_true + noise

In [None]:
# Polynomial design matrix (p=14) built from x, requested without an
# intercept column.
X_poly = polynomial_features(x, p=14, intercept=False)

# Hold out 20% of the samples; the split is seeded with SEED.
ols_split = train_test_split(X_poly, y_noise, test_size=0.2, random_state=SEED)
X_train_poly, X_test_poly, y_train, y_test = ols_split

# NOTE(review): the features carry no intercept column and y is used
# uncentered here — confirm that OLS_parameters accounts for the offset,
# otherwise this baseline may be biased.
theta_ols = OLS_parameters(X_train_poly, y_train)
y_pred_ols = X_test_poly @ theta_ols

# Score targets and predictions as column vectors.
y_test_col = y_test.reshape(-1, 1)
y_pred_ols_col = y_pred_ols.reshape(-1, 1)
ols_mse = mse(y_test_col, y_pred_ols_col)

print(f"OLS Test MSE: {ols_mse:.6f}")

# Setup for FFNN

In [None]:
# Column-vector views of the raw inputs and the noisy targets.
x_col = x.reshape(-1, 1)
y_col = y_noise.reshape(-1, 1)

# 80/20 split, seeded with SEED.
X_train_raw, X_test_raw, y_train_nn, y_test_nn = train_test_split(
    x_col, y_col, test_size=0.2, random_state=SEED
)

# Scale the training data first, then pass the values it returned
# (X_mean, X_std, y_mean) into the test-set call — presumably so the test set
# reuses the training statistics; verify against scale_data.
X_train_s, y_train_s, X_mean, X_std, y_mean = scale_data(X_train_raw, y_train_nn)
X_test_s, y_test_s, *_ = scale_data(X_test_raw, y_test_nn, X_mean, X_std, y_mean)

# Test targets mapped back through inverse_scale_y; every network MSE below
# is computed against this array.
y_test_real = inverse_scale_y(y_test_s, y_mean)

# Learning rates to test

In [None]:
# Log-spaced learning-rate grids, 20 candidates per optimizer.
# Plain GD is swept over 1e-3 .. 1e1; RMSprop and Adam over 1e-3 .. 1e-1.
eta_gd = np.logspace(start=-3, stop=1, num=20)
eta_rms = np.logspace(start=-3, stop=-1, num=20)
eta_adam = np.logspace(start=-3, stop=-1, num=20)

# Parameters for FFNN

In [None]:
# Settings shared by both architectures: one input feature and an MSE loss.
network_input_size = 1
loss = MSE()

# Experiment 1: one hidden layer and 50 hidden nodes
# (sigmoid hidden layer, linear output).
layer_output_sizes_1 = [50, 1]
activations_1 = [Sigmoid(), Linear()]

# Experiment 2: two hidden layers and 100 hidden nodes each
# (sigmoid hidden layers, linear output).
layer_output_sizes_2 = [100] * 2 + [1]
activations_2 = [Sigmoid() for _ in range(2)] + [Linear()]

# Training budget.
num_iter = 500     # iterations for full-batch GD
epochs = 500       # epochs for mini-batch RMSprop and Adam
batch_size = 32    # mini-batch size for RMSprop and Adam

# Full batch Gradient Descent

In [None]:
# Learning-rate sweep for full-batch gradient descent on both architectures.
# For each eta a fresh network is built and trained, then scored by test MSE
# against y_test_real; the best (eta, MSE) pair per architecture is kept.
# NOTE(review): eta is selected on the test set, so the reported best MSE is
# optimistically biased — consider a separate validation split.
best_gd_eta_1 = None
best_gd_mse_1 = float('inf')

best_gd_eta_2 = None
best_gd_mse_2 = float('inf')

# Full batch: a single batch spans the whole training set.
full_batch_size = len(X_train_s)

for eta in eta_gd:

    # Experiment 1: one hidden layer and 50 hidden nodes
    nn_gd_1 = NeuralNetwork(
        network_input_size=network_input_size,
        layer_output_sizes=layer_output_sizes_1,
        activations=activations_1,
        loss=loss,
        seed=SEED,
    )
    # Use the configured iteration budget rather than a duplicated literal.
    train(nn_gd_1, X_train_s, y_train_s, X_test_s, y_test_s, GD(eta),
          epochs=num_iter, batch_size=full_batch_size, verbose=False, seed=SEED)
    y_pred_gd_s = nn_gd_1.predict(X_test_s)
    y_pred_gd = inverse_scale_y(y_pred_gd_s, y_mean)
    gd_mse = mse(y_test_real, y_pred_gd)

    print(f"  eta={eta:.4f}  MSE={gd_mse:.6f}")

    # Strict '<' keeps the first eta on ties; a NaN score (e.g. from a
    # diverged run) never compares as smaller, so it cannot be selected.
    if gd_mse < best_gd_mse_1:
        best_gd_mse_1 = gd_mse
        best_gd_eta_1 = eta

    # Experiment 2: two hidden layers and 100 hidden nodes each
    nn_gd_2 = NeuralNetwork(
        network_input_size=network_input_size,
        layer_output_sizes=layer_output_sizes_2,
        activations=activations_2,
        loss=loss,
        seed=SEED,
    )
    train(nn_gd_2, X_train_s, y_train_s, X_test_s, y_test_s, GD(eta),
          epochs=num_iter, batch_size=full_batch_size, verbose=False, seed=SEED)
    y_pred_gd_s = nn_gd_2.predict(X_test_s)
    y_pred_gd = inverse_scale_y(y_pred_gd_s, y_mean)
    gd_mse = mse(y_test_real, y_pred_gd)

    print(f"  eta={eta:.4f}  MSE={gd_mse:.6f}")

    if gd_mse < best_gd_mse_2:
        best_gd_mse_2 = gd_mse
        best_gd_eta_2 = eta

# Stochastic Gradient Descent with RMSprop

In [None]:
# Learning-rate sweep for mini-batch training with RMSprop on both
# architectures. For each eta a fresh network is built and trained, then
# scored by test MSE against y_test_real; the best (eta, MSE) pair per
# architecture is kept.
# NOTE(review): eta is selected on the test set, so the reported best MSE is
# optimistically biased — consider a separate validation split.
best_rms_eta_1 = None
best_rms_mse_1 = float('inf')

best_rms_eta_2 = None
best_rms_mse_2 = float('inf')

for eta in eta_rms:

    # Experiment 1: one hidden layer and 50 hidden nodes
    nn_rms_1 = NeuralNetwork(
        network_input_size=network_input_size,
        layer_output_sizes=layer_output_sizes_1,
        activations=activations_1,
        loss=loss,
        seed=SEED,
    )
    # Use the configured epoch budget rather than a duplicated literal.
    train(nn_rms_1, X_train_s, y_train_s, X_test_s, y_test_s, RMSprop(eta),
          epochs=epochs, batch_size=batch_size, verbose=False, seed=SEED)
    y_pred_rms_s = nn_rms_1.predict(X_test_s)
    y_pred_rms = inverse_scale_y(y_pred_rms_s, y_mean)
    rms_mse = mse(y_test_real, y_pred_rms)

    print(f"  eta={eta:.4f}  MSE={rms_mse:.6f}")

    # Strict '<' keeps the first eta on ties; a NaN score never compares as
    # smaller, so it cannot be selected.
    if rms_mse < best_rms_mse_1:
        best_rms_mse_1 = rms_mse
        best_rms_eta_1 = eta

    # Experiment 2: two hidden layers and 100 hidden nodes each
    nn_rms_2 = NeuralNetwork(
        network_input_size=network_input_size,
        layer_output_sizes=layer_output_sizes_2,
        activations=activations_2,
        loss=loss,
        seed=SEED,
    )
    train(nn_rms_2, X_train_s, y_train_s, X_test_s, y_test_s, RMSprop(eta),
          epochs=epochs, batch_size=batch_size, verbose=False, seed=SEED)
    y_pred_rms_s = nn_rms_2.predict(X_test_s)
    y_pred_rms = inverse_scale_y(y_pred_rms_s, y_mean)
    rms_mse = mse(y_test_real, y_pred_rms)

    print(f"  eta={eta:.4f}  MSE={rms_mse:.6f}")

    if rms_mse < best_rms_mse_2:
        best_rms_mse_2 = rms_mse
        best_rms_eta_2 = eta

# Stochastic Gradient Descent with Adam

In [None]:
# Learning-rate sweep for mini-batch training with Adam on both architectures.
# For each eta a fresh network is built and trained, then scored by test MSE
# against y_test_real; the best (eta, MSE) pair per architecture is kept.
# NOTE(review): eta is selected on the test set, so the reported best MSE is
# optimistically biased — consider a separate validation split.
best_adam_eta_1 = None
best_adam_mse_1 = float('inf')

best_adam_eta_2 = None
best_adam_mse_2 = float('inf')

for eta in eta_adam:

    # Experiment 1: one hidden layer and 50 hidden nodes
    nn_adam_1 = NeuralNetwork(
        network_input_size=network_input_size,
        layer_output_sizes=layer_output_sizes_1,
        activations=activations_1,
        loss=loss,
        seed=SEED,
    )
    # Use the configured epoch budget rather than a duplicated literal.
    train(nn_adam_1, X_train_s, y_train_s, X_test_s, y_test_s, Adam(eta),
          epochs=epochs, batch_size=batch_size, verbose=False, seed=SEED)
    y_pred_adam_s = nn_adam_1.predict(X_test_s)
    y_pred_adam = inverse_scale_y(y_pred_adam_s, y_mean)
    adam_mse = mse(y_test_real, y_pred_adam)

    # Report and select on this run's own Adam score, not on an MSE variable
    # left behind by another optimizer's cell.
    print(f"  eta={eta:.4f}  MSE={adam_mse:.6f}")

    # Strict '<' keeps the first eta on ties; a NaN score never compares as
    # smaller, so it cannot be selected.
    if adam_mse < best_adam_mse_1:
        best_adam_mse_1 = adam_mse
        best_adam_eta_1 = eta

    # Experiment 2: two hidden layers and 100 hidden nodes each
    nn_adam_2 = NeuralNetwork(
        network_input_size=network_input_size,
        layer_output_sizes=layer_output_sizes_2,
        activations=activations_2,
        loss=loss,
        seed=SEED,
    )
    train(nn_adam_2, X_train_s, y_train_s, X_test_s, y_test_s, Adam(eta),
          epochs=epochs, batch_size=batch_size, verbose=False, seed=SEED)
    y_pred_adam_s = nn_adam_2.predict(X_test_s)
    y_pred_adam = inverse_scale_y(y_pred_adam_s, y_mean)
    adam_mse = mse(y_test_real, y_pred_adam)

    print(f"  eta={eta:.4f}  MSE={adam_mse:.6f}")

    if adam_mse < best_adam_mse_2:
        best_adam_mse_2 = adam_mse
        best_adam_eta_2 = eta

# Results for one hidden layer and 50 hidden nodes

In [None]:
# Summary table for the one-hidden-layer network: the OLS baseline MSE next to
# the best test MSE (and the eta that produced it) found for each optimizer.
rule = "=" * 50

print("\n" + rule)
print("ONE HIDDEN LAYER & 50 HIDDEN NODES")
print(rule)
print(f"OLS (deg 14):      {ols_mse:.6f}")
print(f"NN + GD:           {best_gd_mse_1:.6f}  (eta={best_gd_eta_1:.4f})")
print(f"NN + RMSprop:      {best_rms_mse_1:.6f}  (eta={best_rms_eta_1:.4f})")
print(f"NN + Adam:         {best_adam_mse_1:.6f}  (eta={best_adam_eta_1:.4f})")
print(rule)

# Results for two hidden layers and 100 hidden nodes each

In [None]:
# Summary table for the two-hidden-layer network: the OLS baseline MSE next to
# the best test MSE (and the eta that produced it) found for each optimizer.
rule = "=" * 50

print("\n" + rule)
print("TWO HIDDEN LAYERS & 100 HIDDEN NODES EACH")
print(rule)
print(f"OLS (deg 14):      {ols_mse:.6f}")
print(f"NN + GD:           {best_gd_mse_2:.6f}  (eta={best_gd_eta_2:.4f})")
print(f"NN + RMSprop:      {best_rms_mse_2:.6f}  (eta={best_rms_eta_2:.4f})")
print(f"NN + Adam:         {best_adam_mse_2:.6f}  (eta={best_adam_eta_2:.4f})")
print(rule)