In [1]:
# Imports
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from functions import runge, MSE, R2, Ridge_parameters, OLS_parameters
from functions import OLS_gradient, Ridge_gradient, Lasso_gradient, polynomial_features
import json

In [2]:
# Stochastic GD

# Data generation
n = 1000
np.random.seed(42)
x = np.linspace(-1,1, n)
y = runge(x) + 0.1*np.random.normal(0,1)

# Split into training and test sets, scale data and create polynomial features
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
scaler = StandardScaler()
X_train = polynomial_features(x_train, 10)
X_test = polynomial_features(x_test, 10)
scaler.fit(X_train)
X_train_s = scaler.transform(X_train)
X_test_s = scaler.transform(X_test)
y_offset = np.mean(y_train)

lmbda = 0.001

# Calculate parameters using OLS and Ridge closed form solutions
beta_r = Ridge_parameters(X_train_s, y_train, lmbda)
beta_o = OLS_parameters(X_train_s, y_train)

# Initialize weights for gradient descent
beta_gd_ols = np.zeros(len(beta_r))
beta_gd_ridge = np.zeros(len(beta_r))
beta_gd_lasso = np.zeros(len(beta_r))

# Initialize hyperparameters
n_epochs = 1000000
stopping_criteria = [1e-10]*len(beta_r)
M = 100   #size of each minibatch
m = int(n*0.8/M) #number of minibatches
t0, t1 = 5, 50
def learning_schedule(t):
    return t0/(t+t1)

# Stochastic Gradient Descent for Lasso Regression
for epoch in range(n_epochs):
    for i in range(m):
        random_index = M*np.random.randint(m)
        xi = X_train_s[random_index:random_index+M]
        yi = y_train[random_index:random_index+M]
        gradients = Lasso_gradient(xi, yi, beta_gd_lasso, lmbda)
        eta = learning_schedule(epoch*m+i)
        if np.linalg.norm(eta*gradients) < np.linalg.norm(stopping_criteria):
            break
        beta_gd_lasso = beta_gd_lasso - eta*gradients

# Stochastic Gradient Descent for OLS
for epoch in range(n_epochs):
    for i in range(m):
        random_index = M*np.random.randint(m)
        xi = X_train_s[random_index:random_index+M]
        yi = y_train[random_index:random_index+M]
        gradients = OLS_gradient(xi, yi, beta_gd_ols)
        eta = learning_schedule(epoch*m+i)
        if np.linalg.norm(eta*gradients) < np.linalg.norm(stopping_criteria):
            break
        beta_gd_ols = beta_gd_ols - eta*gradients

# Stochastic Gradient Descent for Ridge Regression
for epoch in range(n_epochs):
    for i in range(m):
        random_index = M*np.random.randint(m)
        xi = X_train_s[random_index:random_index+M]
        yi = y_train[random_index:random_index+M]
        gradients = Ridge_gradient(xi, yi, beta_gd_ridge, lmbda)
        eta = learning_schedule(epoch*m+i)
        if np.linalg.norm(eta*gradients) < np.linalg.norm(stopping_criteria):
            break
        beta_gd_ridge = beta_gd_ridge - eta*gradients

y_gd_lasso = X_test_s @ beta_gd_lasso + y_offset
y_gd_ols = X_test_s @ beta_gd_ols + y_offset
y_gd_ridge = X_test_s @ beta_gd_ridge + y_offset

mse_sgd_ols = MSE(y_test, y_gd_ols)
mse_sgd_ridge = MSE(y_test, y_gd_ridge)
mse_sgd_lasso = MSE(y_test, y_gd_lasso)
r2_sgd_ols = R2(y_test, y_gd_ols)
r2_sgd_ridge = R2(y_test, y_gd_ridge)
r2_sgd_lasso = R2(y_test, y_gd_lasso)

print(f"MSE GD Lasso: {mse_sgd_lasso}, MSE GD OLS: {mse_sgd_ols}, MSE GD Ridge: {mse_sgd_ridge}")
print(f"R2 GD Lasso: {r2_sgd_lasso}, R2 GD OLS: {r2_sgd_ols}, R2 GD Ridge: {r2_sgd_ridge}")
print(f"Beta GD Lasso: {beta_gd_lasso}")
print(f"Beta GD OLS: {beta_gd_ols}")
print(f"Beta GD Ridge: {beta_gd_ridge}")    
print("--------------------------------------------------")

dict_sgd = {'MSE GD Lasso': mse_sgd_lasso,
                        'R2 GD Lasso': r2_sgd_lasso,
                        'Beta GD Lasso': beta_gd_lasso,
                        'MSE GD OLS': mse_sgd_ols,
                        'R2 GD OLS': r2_sgd_ols,
                        'Beta GD OLS': beta_gd_ols,
                        'MSE GD Ridge': mse_sgd_ridge,
                        'R2 GD Ridge': r2_sgd_ridge,
                        'Beta GD Ridge': beta_gd_ridge}
with open('sgd_results.json', 'w') as f:
    json.dump(dict_sgd, f, indent=4, default=lambda x: x.tolist() if hasattr(x, 'tolist') else x)

MSE GD Lasso: 0.013901044020342122, MSE GD OLS: 0.013470444942618677, MSE GD Ridge: 0.013828406386372775
R2 GD Lasso: 0.8133188130154931, R2 GD OLS: 0.8191096404544187, R2 GD Ridge: 0.8143024904396763
Beta GD Lasso: [ 0.00000000e+00 -1.02815588e-03 -7.22962345e-01  5.60234355e-07
  4.61894282e-01  3.35114045e-06  2.92973095e-01  1.42769517e-05
 -8.74013097e-03  1.89483013e-05 -2.34684049e-01]
Beta GD OLS: [ 0.00000000e+00 -5.25026009e-03 -7.49042314e-01  3.99678092e-03
  4.87113854e-01  2.07421831e-03  3.20995979e-01 -6.00197867e-04
 -9.48468487e-04 -1.84911613e-03 -2.72835902e-01]
Beta GD Ridge: [ 0.00000000e+00 -5.20385769e-03 -7.26724866e-01  3.10652380e-03
  4.58602303e-01  2.79579197e-03  3.08431078e-01 -6.58993373e-05
  4.14592351e-03 -2.21554616e-03 -2.56953559e-01]
--------------------------------------------------


In [3]:
#Addition of momentum

# Data generation
n = 1000
np.random.seed(42)
x = np.linspace(-1,1, n)
y = runge(x) + 0.1*np.random.normal(0,1)

# Split into training and test sets, scale data and create polynomial features
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
scaler = StandardScaler()
X_train = polynomial_features(x_train, 10)
X_test = polynomial_features(x_test, 10)
scaler.fit(X_train)
X_train_s = scaler.transform(X_train)
X_test_s = scaler.transform(X_test)
y_offset = np.mean(y_train)

lmbda = 0.001

# Calculate parameters using OLS and Ridge closed form solutions
beta_r = Ridge_parameters(X_train_s, y_train, lmbda)
beta_o = OLS_parameters(X_train_s, y_train)

# Initialize weights for gradient descent
beta_gd_ols = np.zeros(len(beta_r))
beta_gd_ridge = np.zeros(len(beta_r))
beta_gd_lasso = np.zeros(len(beta_r))

# Initialize hyperparameters
momentum = 0.3
stopping_criteria = [1e-10]*len(beta_r)
change = 0.0
n_epochs = 1000000
M = 50   #size of each minibatch
m = int(n*0.8/M) #number of minibatches
t0, t1 = 5, 50
def learning_schedule(t):
    return t0/(t+t1)

for epoch in range(n_epochs):
    for i in range(m):
        random_index = M*np.random.randint(m)
        xi = X_train_s[random_index:random_index+M]
        yi = y_train[random_index:random_index+M]
        gradients = Lasso_gradient(xi, yi, beta_gd_lasso, lmbda)
        eta = learning_schedule(epoch*m+i)
        # calculate update
        new_change = eta*gradients+momentum*change
        if np.linalg.norm(new_change) < np.linalg.norm(stopping_criteria):
            break
        # take a step
        beta_gd_lasso -= new_change
        # save the change
        change = new_change

change = 0.0
# Gradient descent loop
for epoch in range(n_epochs):
    for i in range(m):
        random_index = M*np.random.randint(m)
        xi = X_train_s[random_index:random_index+M]
        yi = y_train[random_index:random_index+M]
        gradients = Ridge_gradient(xi, yi, beta_gd_ridge, lmbda)
        eta = learning_schedule(epoch*m+i)
        # calculate update
        new_change = eta*gradients+momentum*change
        if np.linalg.norm(new_change) < np.linalg.norm(stopping_criteria):
            break
        # take a step
        beta_gd_ridge -= new_change
        # save the change
        change = new_change

change = 0.0
for epoch in range(n_epochs):
    for i in range(m):
        random_index = M*np.random.randint(m)
        xi = X_train_s[random_index:random_index+M]
        yi = y_train[random_index:random_index+M]
        gradients = OLS_gradient(xi, yi, beta_gd_ols)
        eta = learning_schedule(epoch*m+i)
        # calculate update
        new_change = eta*gradients+momentum*change
        if np.linalg.norm(new_change) < np.linalg.norm(stopping_criteria):
            break
        # take a step
        beta_gd_ols -= new_change
        # save the change
        change = new_change

y_gd_lasso = X_test_s @ beta_gd_lasso + y_offset
y_gd_ols = X_test_s @ beta_gd_ols + y_offset
y_gd_ridge = X_test_s @ beta_gd_ridge + y_offset

mse_sgd_ols = MSE(y_test, y_gd_ols)
mse_sgd_ridge = MSE(y_test, y_gd_ridge)
mse_sgd_lasso = MSE(y_test, y_gd_lasso)
r2_sgd_ols = R2(y_test, y_gd_ols)
r2_sgd_ridge = R2(y_test, y_gd_ridge)
r2_sgd_lasso = R2(y_test, y_gd_lasso)

print(f"MSE GD Lasso: {mse_sgd_lasso}, MSE GD OLS: {mse_sgd_ols}, MSE GD Ridge: {mse_sgd_ridge}")
print(f"R2 GD Lasso: {r2_sgd_lasso}, R2 GD OLS: {r2_sgd_ols}, R2 GD Ridge: {r2_sgd_ridge}")
print(f"Beta GD Lasso: {beta_gd_lasso}")
print(f"Beta GD OLS: {beta_gd_ols}")
print(f"Beta GD Ridge: {beta_gd_ridge}")    
print("--------------------------------------------------")

dict_sgd_momentum = {'MSE GD Lasso': mse_sgd_lasso,
                        'R2 GD Lasso': r2_sgd_lasso,
                        'Beta GD Lasso': beta_gd_lasso,
                        'MSE GD OLS': mse_sgd_ols,
                        'R2 GD OLS': r2_sgd_ols,
                        'Beta GD OLS': beta_gd_ols,
                        'MSE GD Ridge': mse_sgd_ridge,
                        'R2 GD Ridge': r2_sgd_ridge,
                        'Beta GD Ridge': beta_gd_ridge}
with open('sgd_momentum_results.json', 'w') as f:
    json.dump(dict_sgd_momentum, f, indent=4, default=lambda x: x.tolist() if hasattr(x, 'tolist') else x)

MSE GD Lasso: 0.01262056973799782, MSE GD OLS: 0.012285719326575469, MSE GD Ridge: 0.012625390738191997
R2 GD Lasso: 0.8305095383403167, R2 GD OLS: 0.8350078144836346, R2 GD Ridge: 0.8304471649420873
Beta GD Lasso: [ 0.00000000e+00 -1.22643179e-03 -8.06382727e-01 -1.70912445e-06
  5.91605118e-01 -1.20755289e-06  3.07588732e-01 -1.01143392e-06
 -2.45148841e-02 -1.09056131e-06 -2.87338463e-01]
Beta GD OLS: [ 0.00000000e+00 -3.66729107e-03 -8.40440780e-01  1.58353935e-03
  6.25675248e-01  2.82022404e-03  3.51549380e-01 -3.47505264e-04
 -3.68625049e-02 -2.70222663e-03 -3.22930517e-01]
Beta GD Ridge: [ 0.00000000e+00 -3.70976252e-03 -8.07108793e-01  8.37345077e-04
  5.80712762e-01  3.65068169e-03  3.35079477e-01  5.90741696e-04
 -2.79617874e-02 -3.47308360e-03 -3.00482054e-01]
--------------------------------------------------


In [4]:
#ADAgrad

# Data generation
n = 1000
np.random.seed(42)
x = np.linspace(-1,1, n)
y = runge(x) + 0.1*np.random.normal(0,1)

# Split into training and test sets, scale data and create polynomial features
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
scaler = StandardScaler()
X_train = polynomial_features(x_train, 10)
X_test = polynomial_features(x_test, 10)
scaler.fit(X_train)
X_train_s = scaler.transform(X_train)
X_test_s = scaler.transform(X_test)
y_offset = np.mean(y_train)

lmbda = 0.001

# Calculate parameters using OLS and Ridge closed form solutions
beta_r = Ridge_parameters(X_train_s, y_train, lmbda)
beta_o = OLS_parameters(X_train_s, y_train)

# Initialize weights for gradient descent
beta_gd_ols = np.zeros(len(beta_r))
beta_gd_ridge = np.zeros(len(beta_r))
beta_gd_lasso = np.zeros(len(beta_r))

# Hessian matrix for Ridge
H = (2.0/n)* X_train_s.T @ X_train_s + 2*lmbda*np.eye(X_train_s.shape[1])
EigValues, EigVectors = np.linalg.eig(H)
print(f"Eigenvalues of Hessian Matrix:{EigValues}")

# Initialize hyperparameters
lr = 1.0 / np.max(EigValues)
stopping_criteria = [1e-10]*len(beta_r)
delta = 1e-8
n_epochs = 1000000
M = 50   #size of each minibatch
m = int(n*0.8/M) #number of minibatches

for epoch in range(n_epochs):
    Giter = 0.0
    for i in range(m):
        random_index = M*np.random.randint(m)
        xi = X_train_s[random_index:random_index+M]
        yi = y_train[random_index:random_index+M]
        gradients = Ridge_gradient(xi, yi, beta_gd_ridge, lmbda)
        Giter += gradients*gradients
        update = gradients*lr/(delta+np.sqrt(Giter))
        if np.linalg.norm(update) < np.linalg.norm(stopping_criteria):
            break
        beta_gd_ridge -= update

# Hessian matrix for Ridge
H = (2.0/n)* X_train_s.T @ X_train_s
EigValues, EigVectors = np.linalg.eig(H)
print(f"Eigenvalues of Hessian Matrix:{EigValues}")

# Initialize hyperparameters
lr = 1.0 / np.max(EigValues)

for epoch in range(n_epochs):
    Giter = 0.0
    for i in range(m):
        random_index = M*np.random.randint(m)
        xi = X_train_s[random_index:random_index+M]
        yi = y_train[random_index:random_index+M]
        gradients = Lasso_gradient(xi, yi, beta_gd_lasso, lmbda)
        Giter += gradients*gradients
        update = gradients*lr/(delta+np.sqrt(Giter))
        if np.linalg.norm(update) < np.linalg.norm(stopping_criteria):
            break
        beta_gd_lasso -= update

for epoch in range(n_epochs):
    Giter = 0.0
    for i in range(m):
        random_index = M*np.random.randint(m)
        xi = X_train_s[random_index:random_index+M]
        yi = y_train[random_index:random_index+M]
        gradients = OLS_gradient(xi, yi, beta_gd_ols)
        Giter += gradients*gradients
        update = gradients*lr/(delta+np.sqrt(Giter))
        if np.linalg.norm(update) < np.linalg.norm(stopping_criteria):
            break
        beta_gd_ols -= update

y_gd_lasso = X_test_s @ beta_gd_lasso + y_offset
y_gd_ols = X_test_s @ beta_gd_ols + y_offset
y_gd_ridge = X_test_s @ beta_gd_ridge + y_offset

mse_sgd_ols = MSE(y_test, y_gd_ols)
mse_sgd_ridge = MSE(y_test, y_gd_ridge)
mse_sgd_lasso = MSE(y_test, y_gd_lasso)
r2_sgd_ols = R2(y_test, y_gd_ols)
r2_sgd_ridge = R2(y_test, y_gd_ridge)
r2_sgd_lasso = R2(y_test, y_gd_lasso)

print(f"MSE GD Lasso: {mse_sgd_lasso}, MSE GD OLS: {mse_sgd_ols}, MSE GD Ridge: {mse_sgd_ridge}")
print(f"R2 GD Lasso: {r2_sgd_lasso}, R2 GD OLS: {r2_sgd_ols}, R2 GD Ridge: {r2_sgd_ridge}")
print(f"Beta GD Lasso: {beta_gd_lasso}")
print(f"Beta GD OLS: {beta_gd_ols}")
print(f"Beta GD Ridge: {beta_gd_ridge}")    
print("--------------------------------------------------")

dict_sgd_adagrad = {'MSE GD Lasso': mse_sgd_lasso,
                        'R2 GD Lasso': r2_sgd_lasso,
                        'Beta GD Lasso': beta_gd_lasso,
                        'MSE GD OLS': mse_sgd_ols,
                        'R2 GD OLS': r2_sgd_ols,
                        'Beta GD OLS': beta_gd_ols,
                        'MSE GD Ridge': mse_sgd_ridge,
                        'R2 GD Ridge': r2_sgd_ridge,
                        'Beta GD Ridge': beta_gd_ridge}
with open('sgd_adagrad_results.json', 'w') as f:
    json.dump(dict_sgd_adagrad, f, indent=4, default=lambda x: x.tolist() if hasattr(x, 'tolist') else x)

Eigenvalues of Hessian Matrix:[7.81988102e+00 7.04651821e+00 6.45820952e-01 4.34618809e-01
 4.34213818e-02 1.99862028e-02 3.31440644e-03 2.41897568e-03
 2.00413804e-03 2.01589893e-03 2.00000000e-03]
Eigenvalues of Hessian Matrix:[7.81788102e+00 7.04451821e+00 6.43820952e-01 4.32618809e-01
 4.14213818e-02 1.79862028e-02 1.31440644e-03 4.18975683e-04
 4.13804044e-06 1.58989315e-05 0.00000000e+00]
MSE GD Lasso: 0.01888055341267186, MSE GD OLS: 0.004551946353780506, MSE GD Ridge: 0.028368686222391925
R2 GD Lasso: 0.746437706350397, R2 GD OLS: 0.9386814008148571, R2 GD Ridge: 0.6207881576758889
Beta GD Lasso: [ 0.00000000e+00 -9.31203724e-02 -1.15082557e+00  1.58078216e-01
  1.59114472e+00 -1.59703338e-02 -6.21278762e-01 -4.90739595e-02
 -8.30962834e-02 -3.03997795e-05 -6.38269226e-02]
Beta GD OLS: [  0.           0.20488812  -3.27780143  -1.27403624  15.418296
   3.55199227 -32.09378228  -4.38894342  30.4265526    1.90156448
 -10.7442997 ]
Beta GD Ridge: [ 0.          0.08484041 -1.0435166

In [5]:
#RMSprop

# Data generation
n = 1000
np.random.seed(42)
x = np.linspace(-1,1, n)
y = runge(x) + 0.1*np.random.normal(0,1)

# Split into training and test sets, scale data and create polynomial features
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
scaler = StandardScaler()
X_train = polynomial_features(x_train, 10)
X_test = polynomial_features(x_test, 10)
scaler.fit(X_train)
X_train_s = scaler.transform(X_train)
X_test_s = scaler.transform(X_test)
y_offset = np.mean(y_train)

lmbda = 0.001

# Calculate parameters using OLS and Ridge closed form solutions
beta_r = Ridge_parameters(X_train_s, y_train, lmbda)
beta_o = OLS_parameters(X_train_s, y_train)

# Initialize weights for gradient descent
beta_gd_ols = np.zeros(len(beta_r))
beta_gd_ridge = np.zeros(len(beta_r))
beta_gd_lasso = np.zeros(len(beta_r))

# Hessian matrix for Ridge
H = (2.0/n)* X_train_s.T @ X_train_s + 2*lmbda*np.eye(X_train_s.shape[1])
EigValues, EigVectors = np.linalg.eig(H)
print(f"Eigenvalues of Hessian Matrix:{EigValues}")

# Initialize hyperparameters
lr = 1.0 / np.max(EigValues)
stopping_criteria = [1e-10]*len(beta_r)
delta = 1e-8
rho = 0.99
n_epochs = 1000000
M = 50   #size of each minibatch
m = int(n*0.8/M) #number of minibatches

for epoch in range(n_epochs):
    Giter = 0.0
    for i in range(m):
        random_index = M*np.random.randint(m)
        xi = X_train_s[random_index:random_index+M]
        yi = y_train[random_index:random_index+M]
        gradients = Ridge_gradient(xi, yi, beta_gd_ridge, lmbda)
	# Accumulated gradient
	# Scaling with rho the new and the previous results
        Giter = (rho*Giter+(1-rho)*gradients*gradients)
	# Taking the diagonal only and inverting
        update = gradients*lr/(delta+np.sqrt(Giter))
        if np.linalg.norm(update) < np.linalg.norm(stopping_criteria):
            break
	# Hadamard product
        beta_gd_ridge -= update

# Hessian matrix for OLS
H = (2.0/n)* X_train_s.T @ X_train_s 
EigValues, EigVectors = np.linalg.eig(H)
print(f"Eigenvalues of Hessian Matrix:{EigValues}")

# Initialize hyperparameters
lr = 1.0 / np.max(EigValues)

for epoch in range(n_epochs):
    Giter = 0.0
    for i in range(m):
        random_index = M*np.random.randint(m)
        xi = X_train_s[random_index:random_index+M]
        yi = y_train[random_index:random_index+M]
        gradients = Lasso_gradient(xi, yi, beta_gd_lasso, lmbda)
	# Accumulated gradient
	# Scaling with rho the new and the previous results
        Giter = (rho*Giter+(1-rho)*gradients*gradients)
	# Taking the diagonal only and inverting
        update = gradients*lr/(delta+np.sqrt(Giter))
        if np.linalg.norm(update) < np.linalg.norm(stopping_criteria):
            break
	# Hadamard product
        beta_gd_lasso -= update

G_iter = 0.0

for epoch in range(n_epochs):
    Giter = 0.0
    for i in range(m):
        random_index = M*np.random.randint(m)
        xi = X_train_s[random_index:random_index+M]
        yi = y_train[random_index:random_index+M]
        gradients = OLS_gradient(xi, yi, beta_gd_ols)
	# Accumulated gradient
	# Scaling with rho the new and the previous results
        Giter = (rho*Giter+(1-rho)*gradients*gradients)
	# Taking the diagonal only and inverting
        update = gradients*lr/(delta+np.sqrt(Giter))
        if np.linalg.norm(update) < np.linalg.norm(stopping_criteria):
            break
	# Hadamard product
        beta_gd_ols -= update

y_gd_lasso = X_test_s @ beta_gd_lasso + y_offset
y_gd_ols = X_test_s @ beta_gd_ols + y_offset
y_gd_ridge = X_test_s @ beta_gd_ridge + y_offset

mse_sgd_ols = MSE(y_test, y_gd_ols)
mse_sgd_ridge = MSE(y_test, y_gd_ridge)
mse_sgd_lasso = MSE(y_test, y_gd_lasso)
r2_sgd_ols = R2(y_test, y_gd_ols)
r2_sgd_ridge = R2(y_test, y_gd_ridge)
r2_sgd_lasso = R2(y_test, y_gd_lasso)

print(f"MSE GD Lasso: {mse_sgd_lasso}, MSE GD OLS: {mse_sgd_ols}, MSE GD Ridge: {mse_sgd_ridge}")
print(f"R2 GD Lasso: {r2_sgd_lasso}, R2 GD OLS: {r2_sgd_ols}, R2 GD Ridge: {r2_sgd_ridge}")
print(f"Beta GD Lasso: {beta_gd_lasso}")
print(f"Beta GD OLS: {beta_gd_ols}")
print(f"Beta GD Ridge: {beta_gd_ridge}")    
print("--------------------------------------------------")

dict_sgd_rmsprop = {'MSE GD Lasso': mse_sgd_lasso,
                        'R2 GD Lasso': r2_sgd_lasso,
                        'Beta GD Lasso': beta_gd_lasso,
                        'MSE GD OLS': mse_sgd_ols,
                        'R2 GD OLS': r2_sgd_ols,
                        'Beta GD OLS': beta_gd_ols,
                        'MSE GD Ridge': mse_sgd_ridge,
                        'R2 GD Ridge': r2_sgd_ridge,
                        'Beta GD Ridge': beta_gd_ridge}
with open('sgd_rmsprop_results.json', 'w') as f:
    json.dump(dict_sgd_rmsprop, f, indent=4, default=lambda x: x.tolist() if hasattr(x, 'tolist') else x)

Eigenvalues of Hessian Matrix:[7.81988102e+00 7.04651821e+00 6.45820952e-01 4.34618809e-01
 4.34213818e-02 1.99862028e-02 3.31440644e-03 2.41897568e-03
 2.00413804e-03 2.01589893e-03 2.00000000e-03]
Eigenvalues of Hessian Matrix:[7.81788102e+00 7.04451821e+00 6.43820952e-01 4.32618809e-01
 4.14213818e-02 1.79862028e-02 1.31440644e-03 4.18975683e-04
 4.13804044e-06 1.58989315e-05 0.00000000e+00]
MSE GD Lasso: 0.06700834447518494, MSE GD OLS: 0.11489694852781421, MSE GD Ridge: 0.16262036219586815
R2 GD Lasso: 0.10181174696347595, R2 GD OLS: -0.5075180556184553, R2 GD Ridge: -1.1775154195999065
Beta GD Lasso: [ 0.         -0.0610915  -3.29291909 -0.80159559  8.89824547  2.70997298
 -8.43857938 -3.88472191  0.88163617  2.18579371  1.58724708]
Beta GD OLS: [  0.          -1.34002289  -4.89408271   4.32162118  19.79836162
  -5.92651212 -33.41463376   5.13674268  25.52170907  -2.39690169
  -7.30006864]
Beta GD Ridge: [ 0.          0.11202038 -1.25201631  0.37869214  0.63543462 -0.84820825
 -0

In [6]:
# ADAM

# Data generation
n = 1000
np.random.seed(42)
x = np.linspace(-1,1, n)
y = runge(x) + 0.1*np.random.normal(0,1)

# Split into training and test sets, scale data and create polynomial features
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
scaler = StandardScaler()
X_train = polynomial_features(x_train, 10)
X_test = polynomial_features(x_test, 10)
scaler.fit(X_train)
X_train_s = scaler.transform(X_train)
X_test_s = scaler.transform(X_test)
y_offset = np.mean(y_train)

lmbda = 0.001

# Calculate parameters using OLS and Ridge closed form solutions
beta_r = Ridge_parameters(X_train_s, y_train, lmbda)
beta_o = OLS_parameters(X_train_s, y_train)

# Initialize weights for gradient descent
beta_gd_ols = np.zeros(len(beta_r))
beta_gd_ridge = np.zeros(len(beta_r))
beta_gd_lasso = np.zeros(len(beta_r))

# Hessian matrix for Ridge
H = (2.0/n)* X_train_s.T @ X_train_s + 2*lmbda*np.eye(X_train_s.shape[1])
EigValues, EigVectors = np.linalg.eig(H)
print(f"Eigenvalues of Hessian Matrix:{EigValues}")

# Initialize hyperparameters
lr = 1.0 / np.max(EigValues)
stopping_criteria = [1e-10]*len(beta_r)
delta = 1e-8
rho_1 = 0.9
rho_2 = 0.999
n_epochs = 1000000
M = 50   #size of each minibatch
m = int(n*0.8/M) #number of minibatches
iter = 0

for epoch in range(n_epochs):
    first_moment = 0.0
    second_moment = 0.0
    iter += 1
    for i in range(m):
        random_index = M*np.random.randint(m)
        xi = X_train_s[random_index:random_index+M]
        yi = y_train[random_index:random_index+M]
        gradients = Ridge_gradient(xi, yi, beta_gd_ridge, lmbda)
        # Computing moments first
        first_moment = rho_1*first_moment + (1-rho_1)*gradients
        second_moment = rho_2*second_moment+(1-rho_2)*gradients*gradients
        first_term = first_moment/(1.0-rho_1**iter)
        second_term = second_moment/(1.0-rho_2**iter)
	# Scaling with rho the new and the previous results
        update = lr*first_term/(np.sqrt(second_term)+delta)
        if np.linalg.norm(update) < np.linalg.norm(stopping_criteria):
            break
        beta_gd_ridge -= update

# Hessian matrix for OLS
H = (2.0/n)* X_train_s.T @ X_train_s 
EigValues, EigVectors = np.linalg.eig(H)
print(f"Eigenvalues of Hessian Matrix:{EigValues}")

# Initialize hyperparameters
lr = 1.0 / np.max(EigValues)
iter = 0

for epoch in range(n_epochs):
    first_moment = 0.0
    second_moment = 0.0
    iter += 1
    for i in range(m):
        random_index = M*np.random.randint(m)
        xi = X_train_s[random_index:random_index+M]
        yi = y_train[random_index:random_index+M]
        gradients = Lasso_gradient(xi, yi, beta_gd_lasso, lmbda)
        # Computing moments first
        first_moment = rho_1*first_moment + (1-rho_1)*gradients
        second_moment = rho_2*second_moment+(1-rho_2)*gradients*gradients
        first_term = first_moment/(1.0-rho_1**iter)
        second_term = second_moment/(1.0-rho_2**iter)
	# Scaling with rho the new and the previous results
        update = lr*first_term/(np.sqrt(second_term)+delta)
        if np.linalg.norm(update) < np.linalg.norm(stopping_criteria):
            break
        beta_gd_lasso -= update

iter = 0

for epoch in range(n_epochs):
    first_moment = 0.0
    second_moment = 0.0
    iter += 1
    for i in range(m):
        random_index = M*np.random.randint(m)
        xi = X_train_s[random_index:random_index+M]
        yi = y_train[random_index:random_index+M]
        gradients = OLS_gradient(xi, yi, beta_gd_ols)
        # Computing moments first
        first_moment = rho_1*first_moment + (1-rho_1)*gradients
        second_moment = rho_2*second_moment+(1-rho_2)*gradients*gradients
        first_term = first_moment/(1.0-rho_1**iter)
        second_term = second_moment/(1.0-rho_2**iter)
	# Scaling with rho the new and the previous results
        update = lr*first_term/(np.sqrt(second_term)+delta)
        if np.linalg.norm(update) < np.linalg.norm(stopping_criteria):
            break
        beta_gd_ols -= update

y_gd_lasso = X_test_s @ beta_gd_lasso + y_offset
y_gd_ols = X_test_s @ beta_gd_ols + y_offset
y_gd_ridge = X_test_s @ beta_gd_ridge + y_offset

mse_sgd_ols = MSE(y_test, y_gd_ols)
mse_sgd_ridge = MSE(y_test, y_gd_ridge)
mse_sgd_lasso = MSE(y_test, y_gd_lasso)
r2_sgd_ols = R2(y_test, y_gd_ols)
r2_sgd_ridge = R2(y_test, y_gd_ridge)
r2_sgd_lasso = R2(y_test, y_gd_lasso)

print(f"MSE GD Lasso: {mse_sgd_lasso}, MSE GD OLS: {mse_sgd_ols}, MSE GD Ridge: {mse_sgd_ridge}")
print(f"R2 GD Lasso: {r2_sgd_lasso}, R2 GD OLS: {r2_sgd_ols}, R2 GD Ridge: {r2_sgd_ridge}")
print(f"Beta GD Lasso: {beta_gd_lasso}")
print(f"Beta GD OLS: {beta_gd_ols}")
print(f"Beta GD Ridge: {beta_gd_ridge}")    
print("--------------------------------------------------")

dict_sgd_adam = {'MSE GD Lasso': mse_sgd_lasso,
                        'R2 GD Lasso': r2_sgd_lasso,
                        'Beta GD Lasso': beta_gd_lasso,
                        'MSE GD OLS': mse_sgd_ols,
                        'R2 GD OLS': r2_sgd_ols,
                        'Beta GD OLS': beta_gd_ols,
                        'MSE GD Ridge': mse_sgd_ridge,
                        'R2 GD Ridge': r2_sgd_ridge,
                        'Beta GD Ridge': beta_gd_ridge}
with open('sgd_adam_results.json', 'w') as f:
    json.dump(dict_sgd_adam, f, indent=4, default=lambda x: x.tolist() if hasattr(x, 'tolist') else x)

Eigenvalues of Hessian Matrix:[7.81988102e+00 7.04651821e+00 6.45820952e-01 4.34618809e-01
 4.34213818e-02 1.99862028e-02 3.31440644e-03 2.41897568e-03
 2.00413804e-03 2.01589893e-03 2.00000000e-03]
Eigenvalues of Hessian Matrix:[7.81788102e+00 7.04451821e+00 6.43820952e-01 4.32618809e-01
 4.14213818e-02 1.79862028e-02 1.31440644e-03 4.18975683e-04
 4.13804044e-06 1.58989315e-05 0.00000000e+00]
MSE GD Lasso: 14.82022105350011, MSE GD OLS: 0.9371191468708926, MSE GD Ridge: 0.490705612375676
R2 GD Lasso: -167.38507989381998, R2 GD OLS: -10.66638843371981, R2 GD Ridge: -5.3419023415426565
Beta GD Lasso: [ 0.          0.47112312 -0.58796864  0.85083021  2.00191613  0.30718158
  0.34418783  0.5639419   0.33894091  0.30154983  0.45454693]
Beta GD OLS: [  0.           0.02370772  -2.8604561    0.19476483  11.66062018
   0.74670853 -21.65356828  -0.58057181  18.84023102   0.65319818
  -6.29624628]
Beta GD Ridge: [ 0.         -0.07077638 -0.9023026   0.2370278   0.90256986  0.47475453
 -0.34303