In [1]:
# Imports
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from functions import runge, MSE, R2, Ridge_parameters, OLS_parameters
from functions import OLS_gradient, Ridge_gradient, Lasso_gradient, polynomial_features
import json

In [2]:
# Load data arrays
data = np.load('data_arrays_last.npz')
X_train_s = data['X_train_s']
X_test_s = data['X_test_s']
y_test = data['y_test']
y_train = data['y_train']
y_offset = data['y_offset']

lmbda = 0.001

# Calculate parameters using OLS and Ridge closed form solutions
beta_r = Ridge_parameters(X_train_s, y_train, lmbda)
beta_o = OLS_parameters(X_train_s, y_train)

# Hessian matrix
H_ridge = (2.0/len(X_train_s))* X_train_s.T @ X_train_s + 2*lmbda*np.eye(X_train_s.shape[1])
EigValues_ridge, EigVectors_ridge = np.linalg.eig(H_ridge)

H_OLS = (2.0/len(X_train_s))* X_train_s.T @ X_train_s 
EigValues_OLS, EigVectors_OLS = np.linalg.eig(H_OLS)

In [3]:
# Stochastic GD

# Initialize weights for gradient descent
beta_gd_ols = np.zeros(len(beta_r))
beta_gd_ridge = np.zeros(len(beta_r))
beta_gd_lasso = np.zeros(len(beta_r))

# Initialize hyperparameters
n_epochs = 1000000
stopping_criteria = [1e-10]*len(beta_r)
M = 100   #size of each minibatch
m = int(n*0.8/M) #number of minibatches
t0, t1 = 5, 50
def learning_schedule(t):
    return t0/(t+t1)

# Stochastic Gradient Descent for Lasso Regression
for epoch in range(n_epochs):
    for i in range(m):
        random_index = M*np.random.randint(m)
        xi = X_train_s[random_index:random_index+M]
        yi = y_train[random_index:random_index+M]
        gradients = Lasso_gradient(xi, yi, beta_gd_lasso, lmbda)
        eta = learning_schedule(epoch*m+i)
        if np.linalg.norm(eta*gradients) < np.linalg.norm(stopping_criteria):
            break
        beta_gd_lasso = beta_gd_lasso - eta*gradients

# Stochastic Gradient Descent for OLS
for epoch in range(n_epochs):
    for i in range(m):
        random_index = M*np.random.randint(m)
        xi = X_train_s[random_index:random_index+M]
        yi = y_train[random_index:random_index+M]
        gradients = OLS_gradient(xi, yi, beta_gd_ols)
        eta = learning_schedule(epoch*m+i)
        if np.linalg.norm(eta*gradients) < np.linalg.norm(stopping_criteria):
            break
        beta_gd_ols = beta_gd_ols - eta*gradients

# Stochastic Gradient Descent for Ridge Regression
for epoch in range(n_epochs):
    for i in range(m):
        random_index = M*np.random.randint(m)
        xi = X_train_s[random_index:random_index+M]
        yi = y_train[random_index:random_index+M]
        gradients = Ridge_gradient(xi, yi, beta_gd_ridge, lmbda)
        eta = learning_schedule(epoch*m+i)
        if np.linalg.norm(eta*gradients) < np.linalg.norm(stopping_criteria):
            break
        beta_gd_ridge = beta_gd_ridge - eta*gradients

y_gd_lasso = X_test_s @ beta_gd_lasso + y_offset
y_gd_ols = X_test_s @ beta_gd_ols + y_offset
y_gd_ridge = X_test_s @ beta_gd_ridge + y_offset

mse_sgd_ols = MSE(y_test, y_gd_ols)
mse_sgd_ridge = MSE(y_test, y_gd_ridge)
mse_sgd_lasso = MSE(y_test, y_gd_lasso)
r2_sgd_ols = R2(y_test, y_gd_ols)
r2_sgd_ridge = R2(y_test, y_gd_ridge)
r2_sgd_lasso = R2(y_test, y_gd_lasso)

print(f"MSE GD Lasso: {mse_sgd_lasso}, MSE GD OLS: {mse_sgd_ols}, MSE GD Ridge: {mse_sgd_ridge}")
print(f"R2 GD Lasso: {r2_sgd_lasso}, R2 GD OLS: {r2_sgd_ols}, R2 GD Ridge: {r2_sgd_ridge}")
print(f"Beta GD Lasso: {beta_gd_lasso}")
print(f"Beta GD OLS: {beta_gd_ols}")
print(f"Beta GD Ridge: {beta_gd_ridge}")    
print("--------------------------------------------------")

dict_sgd = {'MSE GD Lasso': mse_sgd_lasso,
                        'R2 GD Lasso': r2_sgd_lasso,
                        'Beta GD Lasso': beta_gd_lasso,
                        'MSE GD OLS': mse_sgd_ols,
                        'R2 GD OLS': r2_sgd_ols,
                        'Beta GD OLS': beta_gd_ols,
                        'MSE GD Ridge': mse_sgd_ridge,
                        'R2 GD Ridge': r2_sgd_ridge,
                        'Beta GD Ridge': beta_gd_ridge}
with open('sgd_results.json', 'w') as f:
    json.dump(dict_sgd, f, indent=4, default=lambda x: x.tolist() if hasattr(x, 'tolist') else x)

NameError: name 'n' is not defined

In [None]:
#Addition of momentum

# Initialize weights for gradient descent
beta_gd_ols = np.zeros(len(beta_r))
beta_gd_ridge = np.zeros(len(beta_r))
beta_gd_lasso = np.zeros(len(beta_r))

# Initialize hyperparameters
momentum = 0.3
stopping_criteria = [1e-10]*len(beta_r)
change = 0.0
n_epochs = 1000000
M = 50   #size of each minibatch
m = int(n*0.8/M) #number of minibatches
t0, t1 = 5, 50
def learning_schedule(t):
    return t0/(t+t1)

for epoch in range(n_epochs):
    for i in range(m):
        random_index = M*np.random.randint(m)
        xi = X_train_s[random_index:random_index+M]
        yi = y_train[random_index:random_index+M]
        gradients = Lasso_gradient(xi, yi, beta_gd_lasso, lmbda)
        eta = learning_schedule(epoch*m+i)
        # calculate update
        new_change = eta*gradients+momentum*change
        # take a step
        beta_gd_lasso -= new_change
        # save the change
        change = new_change
        if np.linalg.norm(new_change) < np.linalg.norm(stopping_criteria):
            break
        

change = 0.0
# Gradient descent loop
for epoch in range(n_epochs):
    for i in range(m):
        random_index = M*np.random.randint(m)
        xi = X_train_s[random_index:random_index+M]
        yi = y_train[random_index:random_index+M]
        gradients = Ridge_gradient(xi, yi, beta_gd_ridge, lmbda)
        eta = learning_schedule(epoch*m+i)
        # calculate update
        new_change = eta*gradients+momentum*change
        # take a step
        beta_gd_ridge -= new_change
        # save the change
        change = new_change
        if np.linalg.norm(new_change) < np.linalg.norm(stopping_criteria):
            break

change = 0.0
for epoch in range(n_epochs):
    for i in range(m):
        random_index = M*np.random.randint(m)
        xi = X_train_s[random_index:random_index+M]
        yi = y_train[random_index:random_index+M]
        gradients = OLS_gradient(xi, yi, beta_gd_ols)
        eta = learning_schedule(epoch*m+i)
        # calculate update
        new_change = eta*gradients+momentum*change
        # take a step
        beta_gd_ols -= new_change
        # save the change
        change = new_change
        if np.linalg.norm(new_change) < np.linalg.norm(stopping_criteria):
            break

y_gd_lasso = X_test_s @ beta_gd_lasso + y_offset
y_gd_ols = X_test_s @ beta_gd_ols + y_offset
y_gd_ridge = X_test_s @ beta_gd_ridge + y_offset

mse_sgd_ols = MSE(y_test, y_gd_ols)
mse_sgd_ridge = MSE(y_test, y_gd_ridge)
mse_sgd_lasso = MSE(y_test, y_gd_lasso)
r2_sgd_ols = R2(y_test, y_gd_ols)
r2_sgd_ridge = R2(y_test, y_gd_ridge)
r2_sgd_lasso = R2(y_test, y_gd_lasso)

print(f"MSE GD Lasso: {mse_sgd_lasso}, MSE GD OLS: {mse_sgd_ols}, MSE GD Ridge: {mse_sgd_ridge}")
print(f"R2 GD Lasso: {r2_sgd_lasso}, R2 GD OLS: {r2_sgd_ols}, R2 GD Ridge: {r2_sgd_ridge}")
print(f"Beta GD Lasso: {beta_gd_lasso}")
print(f"Beta GD OLS: {beta_gd_ols}")
print(f"Beta GD Ridge: {beta_gd_ridge}")    
print("--------------------------------------------------")

dict_sgd_momentum = {'MSE GD Lasso': mse_sgd_lasso,
                        'R2 GD Lasso': r2_sgd_lasso,
                        'Beta GD Lasso': beta_gd_lasso,
                        'MSE GD OLS': mse_sgd_ols,
                        'R2 GD OLS': r2_sgd_ols,
                        'Beta GD OLS': beta_gd_ols,
                        'MSE GD Ridge': mse_sgd_ridge,
                        'R2 GD Ridge': r2_sgd_ridge,
                        'Beta GD Ridge': beta_gd_ridge}
with open('sgd_momentum_results.json', 'w') as f:
    json.dump(dict_sgd_momentum, f, indent=4, default=lambda x: x.tolist() if hasattr(x, 'tolist') else x)

MSE GD Lasso: 0.010587090019651826, MSE GD OLS: 0.010521229260240844, MSE GD Ridge: 0.010632194203567424
R2 GD Lasso: 0.8594417592689789, R2 GD OLS: 0.8603198173781904, R2 GD Ridge: 0.8588434765844091
Beta GD Lasso: [ 0.00000000e+00  7.77174568e-07 -8.20974252e-01  8.12355502e-07
  6.10291796e-01  6.61165737e-07  3.11682329e-01  3.25102918e-07
 -2.58790718e-02 -6.21360925e-04 -2.93009177e-01]
Beta GD OLS: [ 0.00000000e+00  4.58533312e-05 -8.55973807e-01  7.11011068e-04
  6.43920257e-01  4.76451874e-04  3.58535588e-01 -1.08588305e-03
 -3.94304675e-02 -1.48149455e-03 -3.28863654e-01]
Beta GD Ridge: [ 0.00000000e+00  4.59699349e-04 -8.19418910e-01 -9.89611168e-04
  5.95604009e-01  2.11941815e-03  3.39449495e-01 -2.92580740e-04
 -3.04964114e-02 -2.40764139e-03 -3.03577085e-01]
--------------------------------------------------


In [None]:
#ADAgrad

# Initialize weights for gradient descent
beta_gd_ols = np.zeros(len(beta_r))
beta_gd_ridge = np.zeros(len(beta_r))
beta_gd_lasso = np.zeros(len(beta_r))

# Initialize hyperparameters
lr = 1.0 / np.max(EigValues_ridge)
stopping_criteria = [1e-10]*len(beta_r)
delta = 1e-8
n_epochs = 1000000
M = 50   #size of each minibatch
m = int(n*0.8/M) #number of minibatches

for epoch in range(n_epochs):
    Giter = 0.0
    for i in range(m):
        random_index = M*np.random.randint(m)
        xi = X_train_s[random_index:random_index+M]
        yi = y_train[random_index:random_index+M]
        gradients = Ridge_gradient(xi, yi, beta_gd_ridge, lmbda)
        Giter += gradients*gradients
        update = gradients*lr/(delta+np.sqrt(Giter))
        beta_gd_ridge -= update
        if np.linalg.norm(update) < np.linalg.norm(stopping_criteria):
            break

# Initialize hyperparameters
lr = 1.0 / np.max(EigValues_OLS)

for epoch in range(n_epochs):
    Giter = 0.0
    for i in range(m):
        random_index = M*np.random.randint(m)
        xi = X_train_s[random_index:random_index+M]
        yi = y_train[random_index:random_index+M]
        gradients = Lasso_gradient(xi, yi, beta_gd_lasso, lmbda)
        Giter += gradients*gradients
        update = gradients*lr/(delta+np.sqrt(Giter))
        beta_gd_lasso -= update
        if np.linalg.norm(update) < np.linalg.norm(stopping_criteria):
            break

for epoch in range(n_epochs):
    Giter = 0.0
    for i in range(m):
        random_index = M*np.random.randint(m)
        xi = X_train_s[random_index:random_index+M]
        yi = y_train[random_index:random_index+M]
        gradients = OLS_gradient(xi, yi, beta_gd_ols)
        Giter += gradients*gradients
        update = gradients*lr/(delta+np.sqrt(Giter))
        beta_gd_ols -= update
        if np.linalg.norm(update) < np.linalg.norm(stopping_criteria):
            break

y_gd_lasso = X_test_s @ beta_gd_lasso + y_offset
y_gd_ols = X_test_s @ beta_gd_ols + y_offset
y_gd_ridge = X_test_s @ beta_gd_ridge + y_offset

mse_sgd_ols = MSE(y_test, y_gd_ols)
mse_sgd_ridge = MSE(y_test, y_gd_ridge)
mse_sgd_lasso = MSE(y_test, y_gd_lasso)
r2_sgd_ols = R2(y_test, y_gd_ols)
r2_sgd_ridge = R2(y_test, y_gd_ridge)
r2_sgd_lasso = R2(y_test, y_gd_lasso)

print(f"MSE GD Lasso: {mse_sgd_lasso}, MSE GD OLS: {mse_sgd_ols}, MSE GD Ridge: {mse_sgd_ridge}")
print(f"R2 GD Lasso: {r2_sgd_lasso}, R2 GD OLS: {r2_sgd_ols}, R2 GD Ridge: {r2_sgd_ridge}")
print(f"Beta GD Lasso: {beta_gd_lasso}")
print(f"Beta GD OLS: {beta_gd_ols}")
print(f"Beta GD Ridge: {beta_gd_ridge}")    
print("--------------------------------------------------")

dict_sgd_adagrad = {'MSE GD Lasso': mse_sgd_lasso,
                        'R2 GD Lasso': r2_sgd_lasso,
                        'Beta GD Lasso': beta_gd_lasso,
                        'MSE GD OLS': mse_sgd_ols,
                        'R2 GD OLS': r2_sgd_ols,
                        'Beta GD OLS': beta_gd_ols,
                        'MSE GD Ridge': mse_sgd_ridge,
                        'R2 GD Ridge': r2_sgd_ridge,
                        'Beta GD Ridge': beta_gd_ridge}
with open('sgd_adagrad_results.json', 'w') as f:
    json.dump(dict_sgd_adagrad, f, indent=4, default=lambda x: x.tolist() if hasattr(x, 'tolist') else x)

MSE GD Lasso: 0.008732403716334425, MSE GD OLS: 0.010255434443650256, MSE GD Ridge: 0.011300545287658576
R2 GD Lasso: 0.8840608825903667, R2 GD OLS: 0.8642256172898259, R2 GD Ridge: 0.849973816769186
Beta GD Lasso: [ 0.          0.02371808 -1.16549808 -0.0896047   1.83630366  0.05179959
 -0.79854751  0.01435301 -0.29385334  0.00786607  0.16386655]
Beta GD OLS: [  0.           0.10143781  -3.2685619   -1.08983918  15.50101467
   3.4228095  -32.12097803  -4.40167068  30.37770345   1.91338992
 -10.73740189]
Beta GD Ridge: [ 0.          0.0359991  -0.99097522  0.03272938  1.07446492 -0.13860456
 -0.00418639 -0.05314808 -0.16670444  0.15665191 -0.1703635 ]
--------------------------------------------------


In [None]:
#RMSprop

#WHAT IS WRONG HERE?

# There are a few issues in the RMSprop code:

# 1. Giter should be initialized as a vector (np.zeros) with the same shape as beta, not as a scalar 0.0.
# 2. You should use a separate Giter for each optimizer (Ridge, Lasso, OLS), not reuse the same variable.
# 3. The learning rate (lr) should be set before each optimizer loop, as you do, but you should not reuse the same Giter variable.
# 4. Remove the unused G_iter = 0.0 line.
# 5. The update formula is correct, but make sure Giter is a vector.

# Here is the corrected code:

# Initialize weights for gradient descent
beta_gd_ols = np.zeros(len(beta_r))
beta_gd_ridge = np.zeros(len(beta_r))
beta_gd_lasso = np.zeros(len(beta_r))

# Initialize hyperparameters
lr = 1.0 / np.max(EigValues_ridge)
stopping_criteria = [1e-10]*len(beta_r)
delta = 1e-8
rho = 0.99
n_epochs = 1000000
M = 50   # size of each minibatch
m = int(len(X_train_s)/M) # number of minibatches

# RMSprop for Ridge
Giter_ridge = np.zeros(len(beta_r))
for epoch in range(n_epochs):
    for i in range(m):
        random_index = M*np.random.randint(m)
        xi = X_train_s[random_index:random_index+M]
        yi = y_train[random_index:random_index+M]
        gradients = Ridge_gradient(xi, yi, beta_gd_ridge, lmbda)
        Giter_ridge = rho * Giter_ridge + (1 - rho) * gradients**2
        update = lr * gradients / (delta + np.sqrt(Giter_ridge))
        beta_gd_ridge -= update
        if np.linalg.norm(update) < np.linalg.norm(stopping_criteria):
            break

# RMSprop for Lasso
lr = 1.0 / np.max(EigValues_OLS)
Giter_lasso = np.zeros(len(beta_r))
for epoch in range(n_epochs):
    for i in range(m):
        random_index = M*np.random.randint(m)
        xi = X_train_s[random_index:random_index+M]
        yi = y_train[random_index:random_index+M]
        gradients = Lasso_gradient(xi, yi, beta_gd_lasso, lmbda)
        Giter_lasso = rho * Giter_lasso + (1 - rho) * gradients**2
        update = lr * gradients / (delta + np.sqrt(Giter_lasso))
        beta_gd_lasso -= update
        if np.linalg.norm(update) < np.linalg.norm(stopping_criteria):
            break

# RMSprop for OLS
Giter_ols = np.zeros(len(beta_r))
for epoch in range(n_epochs):
    for i in range(m):
        random_index = M*np.random.randint(m)
        xi = X_train_s[random_index:random_index+M]
        yi = y_train[random_index:random_index+M]
        gradients = OLS_gradient(xi, yi, beta_gd_ols)
        Giter_ols = rho * Giter_ols + (1 - rho) * gradients**2
        update = lr * gradients / (delta + np.sqrt(Giter_ols))
        beta_gd_ols -= update
        if np.linalg.norm(update) < np.linalg.norm(stopping_criteria):
            break    

y_gd_lasso = X_test_s @ beta_gd_lasso + y_offset
y_gd_ols = X_test_s @ beta_gd_ols + y_offset
y_gd_ridge = X_test_s @ beta_gd_ridge + y_offset

mse_sgd_ols = MSE(y_test, y_gd_ols)
mse_sgd_ridge = MSE(y_test, y_gd_ridge)
mse_sgd_lasso = MSE(y_test, y_gd_lasso)
r2_sgd_ols = R2(y_test, y_gd_ols)
r2_sgd_ridge = R2(y_test, y_gd_ridge)
r2_sgd_lasso = R2(y_test, y_gd_lasso)

print(f"MSE GD Lasso: {mse_sgd_lasso}, MSE GD OLS: {mse_sgd_ols}, MSE GD Ridge: {mse_sgd_ridge}")
print(f"R2 GD Lasso: {r2_sgd_lasso}, R2 GD OLS: {r2_sgd_ols}, R2 GD Ridge: {r2_sgd_ridge}")
print(f"Beta GD Lasso: {beta_gd_lasso}")
print(f"Beta GD OLS: {beta_gd_ols}")
print(f"Beta GD Ridge: {beta_gd_ridge}")    
print("--------------------------------------------------")

dict_sgd_rmsprop = {'MSE GD Lasso': mse_sgd_lasso,
                        'R2 GD Lasso': r2_sgd_lasso,
                        'Beta GD Lasso': beta_gd_lasso,
                        'MSE GD OLS': mse_sgd_ols,
                        'R2 GD OLS': r2_sgd_ols,
                        'Beta GD OLS': beta_gd_ols,
                        'MSE GD Ridge': mse_sgd_ridge,
                        'R2 GD Ridge': r2_sgd_ridge,
                        'Beta GD Ridge': beta_gd_ridge}
with open('sgd_rmsprop_results.json', 'w') as f:
    json.dump(dict_sgd_rmsprop, f, indent=4, default=lambda x: x.tolist() if hasattr(x, 'tolist') else x)

MSE GD Lasso: 0.04134056586779405, MSE GD OLS: 0.43819229748147154, MSE GD Ridge: 0.056230677949028626
R2 GD Lasso: 0.447693749420948, R2 GD OLS: -4.7162103249144325, R2 GD Ridge: 0.24253545535699939
Beta GD Lasso: [ 0.          0.03633336 -1.19801708  0.04833248  1.89385325  0.03536947
 -0.98411937  0.02615496 -0.10820222  0.02127051  0.02720054]
Beta GD OLS: [  0.           0.04811783  -2.99799412   0.06328426  12.31516304
   0.06438068 -24.19965365   0.19998768  21.39493408   0.12775397
  -7.37872624]
Beta GD Ridge: [ 0.         -0.05990633 -0.95697317 -0.06029727  1.2547493  -0.0261139
 -0.01948476 -0.00305751 -0.3596999   0.00940338  0.05647358]
--------------------------------------------------


In [None]:
# ADAM

# Initialize weights for gradient descent
beta_gd_ols = np.zeros(len(beta_r))
beta_gd_ridge = np.zeros(len(beta_r))
beta_gd_lasso = np.zeros(len(beta_r))

# Initialize hyperparameters for Ridge
lr = 1.0 / np.max(EigValues_ridge)
stopping_criteria = 1e-10  # Using a scalar
delta = 1e-8
rho_1 = 0.9
rho_2 = 0.99
n_epochs = 1000000
M = 50   # size of each minibatch
m = int(len(X_train_s)/M) # number of minibatches
iter = 0

# Initialize moments only once
first_moment_ridge = np.zeros(len(beta_r))
second_moment_ridge = np.zeros(len(beta_r))

for epoch in range(n_epochs):
    iter += 1
    for i in range(m):
        random_index = np.random.randint(0, n - M)  # Number of samples should be adjusted
        xi = X_train_s[random_index:random_index + M]
        yi = y_train[random_index:random_index + M]
        gradients = Ridge_gradient(xi, yi, beta_gd_ridge, lmbda)
        
        # Updating moments
        first_moment_ridge = rho_1 * first_moment_ridge + (1 - rho_1) * gradients
        second_moment_ridge = rho_2 * second_moment_ridge + (1 - rho_2) * gradients * gradients
        
        # Bias-corrected moments
        first_term = first_moment_ridge / (1 - rho_1 ** (iter))
        second_term = second_moment_ridge / (1 - rho_2 ** (iter))

        # Update parameters beta
        update = (lr / (np.sqrt(second_term) + delta)) * first_term
        beta_gd_ridge -= update
        
        # Check for convergence
        if np.linalg.norm(update) < stopping_criteria:
            print("Convergence reached for Ridge at epoch", epoch)
            break

# Repeat the similar structure for Lasso and OLS

# For Lasso
lr = 1.0 / np.max(EigValues_OLS)
# Initialize moments
first_moment_lasso = np.zeros(len(beta_r))
second_moment_lasso = np.zeros(len(beta_r))
iter = 0

for epoch in range(n_epochs):
    iter += 1
    for i in range(m):
        random_index = np.random.randint(0, n - M)
        xi = X_train_s[random_index:random_index + M]
        yi = y_train[random_index:random_index + M]
        gradients = Lasso_gradient(xi, yi, beta_gd_lasso, lmbda)
        
        # Updating moments
        first_moment_lasso = rho_1 * first_moment_lasso + (1 - rho_1) * gradients
        second_moment_lasso = rho_2 * second_moment_lasso + (1 - rho_2) * gradients * gradients

        first_term = first_moment_lasso / (1 - rho_1 ** (iter))
        second_term = second_moment_lasso / (1 - rho_2 ** (iter))

        # Update parameters beta
        update = (lr / (np.sqrt(second_term) + delta)) * first_term
        beta_gd_lasso -= update
        
        # Check for convergence
        if np.linalg.norm(update) < stopping_criteria:
            print("Convergence reached for Lasso at epoch", epoch)
            break

# For OLS
# Initialize moments
first_moment_ols = np.zeros(len(beta_r))
second_moment_ols = np.zeros(len(beta_r))
iter = 0

for epoch in range(n_epochs):
    iter = 0
    for i in range(m):
        random_index = np.random.randint(0, n - M)
        xi = X_train_s[random_index:random_index + M]
        yi = y_train[random_index:random_index + M]
        gradients = OLS_gradient(xi, yi, beta_gd_ols)
        
        # Updating moments
        first_moment_ols = rho_1 * first_moment_ols + (1 - rho_1) * gradients
        second_moment_ols = rho_2 * second_moment_ols + (1 - rho_2) * gradients * gradients
        
        first_term = first_moment_ols / (1 - rho_1 ** (iter))
        second_term = second_moment_ols / (1 - rho_2 ** (iter))

        # Update parameters beta
        update = (lr / (np.sqrt(second_term) + delta)) * first_term
        beta_gd_ols -= update
        
        # Check for convergence
        if np.linalg.norm(update) < stopping_criteria:
            print("Convergence reached for OLS at epoch", epoch)
            break

# Prediction and evaluation
y_gd_lasso = X_test_s @ beta_gd_lasso + y_offset
y_gd_ols = X_test_s @ beta_gd_ols + y_offset
y_gd_ridge = X_test_s @ beta_gd_ridge + y_offset

mse_sgd_ols = MSE(y_test, y_gd_ols)
mse_sgd_ridge = MSE(y_test, y_gd_ridge)
mse_sgd_lasso = MSE(y_test, y_gd_lasso)
r2_sgd_ols = R2(y_test, y_gd_ols)
r2_sgd_ridge = R2(y_test, y_gd_ridge)
r2_sgd_lasso = R2(y_test, y_gd_lasso)

# Print results
print(f"MSE GD Lasso: {mse_sgd_lasso}, MSE GD OLS: {mse_sgd_ols}, MSE GD Ridge: {mse_sgd_ridge}")
print(f"R2 GD Lasso: {r2_sgd_lasso}, R2 GD OLS: {r2_sgd_ols}, R2 GD Ridge: {r2_sgd_ridge}")
print(f"Beta GD Lasso: {beta_gd_lasso}")
print(f"Beta GD OLS: {beta_gd_ols}")
print(f"Beta GD Ridge: {beta_gd_ridge}")    
print("--------------------------------------------------")

dict_sgd_adam = {'MSE GD Lasso': mse_sgd_lasso,
                  'R2 GD Lasso': r2_sgd_lasso,
                  'Beta GD Lasso': beta_gd_lasso,
                  'MSE GD OLS': mse_sgd_ols,
                  'R2 GD OLS': r2_sgd_ols,
                  'Beta GD OLS': beta_gd_ols,
                  'MSE GD Ridge': mse_sgd_ridge,
                  'R2 GD Ridge': r2_sgd_ridge,
                  'Beta GD Ridge': beta_gd_ridge}

# Save results to JSON
with open('sgd_adam_results.json', 'w') as f:
    json.dump(dict_sgd_adam, f, indent=4, default=lambda x: x.tolist() if hasattr(x, 'tolist') else x)

MSE GD Lasso: 0.11852301333784303, MSE GD OLS: 0.012081920512245794, MSE GD Ridge: 0.36413125928611406
R2 GD Lasso: -0.5602829348121219, R2 GD OLS: 0.8395942218334782, R2 GD Ridge: -3.7975577192891343
Beta GD Lasso: [ 0.          0.0545191  -1.28518425  0.02268624  1.92513844  0.00818364
 -0.91115743  0.02165781 -0.10878883  0.05234437 -0.12757656]
Beta GD OLS: [ 0.00000000e+00 -4.42246027e-03 -2.93663548e+00  2.19981897e-01
  1.22593123e+01 -2.16768232e-01 -2.37399061e+01  1.60263990e-01
  2.14396164e+01 -6.57634322e-02 -7.25623321e+00]
Beta GD Ridge: [ 0.          0.05390852 -1.04992198  0.22694852  1.24049142 -0.04981653
  0.060943    0.09186162 -0.26677001  0.22493202  0.01778813]
--------------------------------------------------
