In [1]:
# Imports
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from functions import runge, MSE, R2, Ridge_parameters, OLS_parameters
from functions import OLS_gradient, Ridge_gradient, Lasso_gradient, polynomial_features
import json

In [None]:
# Load data arrays
data = np.load('data/data_arrays_last.npz')
X_train_s = data['X_train_s']
X_test_s = data['X_test_s']
y_test = data['y_test']
y_train = data['y_train']
y_offset = data['y_offset']

lmbda = 0.001
lr = 0.01

# Calculate parameters using OLS and Ridge closed form solutions
beta_r = Ridge_parameters(X_train_s, y_train, lmbda)
beta_o = OLS_parameters(X_train_s, y_train)

In [None]:
# Stochastic GD

# Initialize weights for gradient descent
beta_gd_ols = np.zeros(len(beta_r))
beta_gd_ridge = np.zeros(len(beta_r))
beta_gd_lasso = np.zeros(len(beta_r))

# Initialize hyperparameters
n_epochs = 1000000
stopping_criteria = [1e-10]*len(beta_r)
M = 100   #size of each minibatch
m = int(len(X_train_s)/M) #number of minibatches
t0, t1 = 5, 50
def learning_schedule(t):
    return t0/(t+t1)
n_data = X_train_s.shape[0]

# Stochastic Gradient Descent for Lasso Regression
for epoch in range(n_epochs):
    indices = np.random.permutation(n_data)
    x_shuffled = X_train_s[indices]
    y_shuffled = y_train[indices]
    for i in range(m):
        xi = x_shuffled[i : i + M]
        yi = y_shuffled[i : i + M]
        gradients = Lasso_gradient(xi, yi, beta_gd_lasso, lmbda)
        eta = learning_schedule(epoch*m+i)
        if np.linalg.norm(eta*gradients) < np.linalg.norm(stopping_criteria):
            break
        beta_gd_lasso = beta_gd_lasso - eta*gradients

# Stochastic Gradient Descent for OLS
for epoch in range(n_epochs):
    indices = np.random.permutation(n_data)
    x_shuffled = X_train_s[indices]
    y_shuffled = y_train[indices]
    for i in range(m):
        xi = x_shuffled[i : i + M]
        yi = y_shuffled[i : i + M]
        gradients = OLS_gradient(xi, yi, beta_gd_ols)
        eta = learning_schedule(epoch*m+i)
        if np.linalg.norm(eta*gradients) < np.linalg.norm(stopping_criteria):
            break
        beta_gd_ols = beta_gd_ols - eta*gradients

# Stochastic Gradient Descent for Ridge Regression
for epoch in range(n_epochs):
    indices = np.random.permutation(n_data)
    x_shuffled = X_train_s[indices]
    y_shuffled = y_train[indices]
    for i in range(m):
        xi = x_shuffled[i : i + M]
        yi = y_shuffled[i : i + M]
        gradients = Ridge_gradient(xi, yi, beta_gd_ridge, lmbda)
        eta = learning_schedule(epoch*m+i)
        if np.linalg.norm(eta*gradients) < np.linalg.norm(stopping_criteria):
            break
        beta_gd_ridge = beta_gd_ridge - eta*gradients

y_gd_lasso = X_test_s @ beta_gd_lasso + y_offset
y_gd_ols = X_test_s @ beta_gd_ols + y_offset
y_gd_ridge = X_test_s @ beta_gd_ridge + y_offset

mse_sgd_ols = MSE(y_test, y_gd_ols)
mse_sgd_ridge = MSE(y_test, y_gd_ridge)
mse_sgd_lasso = MSE(y_test, y_gd_lasso)
r2_sgd_ols = R2(y_test, y_gd_ols)
r2_sgd_ridge = R2(y_test, y_gd_ridge)
r2_sgd_lasso = R2(y_test, y_gd_lasso)

print(f"MSE GD Lasso: {mse_sgd_lasso}, MSE GD OLS: {mse_sgd_ols}, MSE GD Ridge: {mse_sgd_ridge}")
print(f"R2 GD Lasso: {r2_sgd_lasso}, R2 GD OLS: {r2_sgd_ols}, R2 GD Ridge: {r2_sgd_ridge}")
print(f"Beta GD Lasso: {beta_gd_lasso}")
print(f"Beta GD OLS: {beta_gd_ols}")
print(f"Beta GD Ridge: {beta_gd_ridge}")    
print("--------------------------------------------------")

dict_sgd = {'MSE GD Lasso': mse_sgd_lasso,
                        'R2 GD Lasso': r2_sgd_lasso,
                        'Beta GD Lasso': beta_gd_lasso,
                        'MSE GD OLS': mse_sgd_ols,
                        'R2 GD OLS': r2_sgd_ols,
                        'Beta GD OLS': beta_gd_ols,
                        'MSE GD Ridge': mse_sgd_ridge,
                        'R2 GD Ridge': r2_sgd_ridge,
                        'Beta GD Ridge': beta_gd_ridge}
with open('data/sgd_results.json', 'w') as f:
    json.dump(dict_sgd, f, indent=4, default=lambda x: x.tolist() if hasattr(x, 'tolist') else x)

MSE GD Lasso: 0.01387925761386048, MSE GD OLS: 0.013481469136331992, MSE GD Ridge: 0.013804182505921796
R2 GD Lasso: 0.8136112368995749, R2 GD OLS: 0.8189617695489504, R2 GD Ridge: 0.814627997075275
Beta GD Lasso: [ 0.00000000e+00 -1.05572734e-03 -7.24046481e-01  3.27143607e-06
  4.62774429e-01  5.68674608e-06  2.94507923e-01  8.68292075e-06
 -8.68786673e-03  1.30973797e-05 -2.35980699e-01]
Beta GD OLS: [ 0.00000000e+00 -5.76705112e-03 -7.48426727e-01  6.87534224e-03
  4.84827752e-01 -1.41870013e-03  3.22688643e-01 -2.94970009e-03
  7.33764650e-04  1.40636672e-03 -2.74468398e-01]
Beta GD Ridge: [ 0.00000000e+00 -5.86633021e-03 -7.28332065e-01  3.71592525e-03
  4.60289786e-01  3.96337543e-03  3.09808758e-01  4.89523924e-04
  4.02385358e-03 -3.84896849e-03 -2.58544807e-01]
--------------------------------------------------


In [None]:
#Addition of momentum

# Initialize weights for gradient descent
beta_gd_ols = np.zeros(len(beta_r))
beta_gd_ridge = np.zeros(len(beta_r))
beta_gd_lasso = np.zeros(len(beta_r))

# Initialize hyperparameters
momentum = 0.3
stopping_criteria = [1e-10]*len(beta_r)
change = 0.0
n_epochs = 1000000
M = 50   #size of each minibatch
m = int(len(X_train_s)/M) #number of minibatches
t0, t1 = 5, 50
def learning_schedule(t):
    return t0/(t+t1)
n_data = X_train_s.shape[0]

for epoch in range(n_epochs):
    indices = np.random.permutation(n_data)
    x_shuffled = X_train_s[indices]
    y_shuffled = y_train[indices]
    for i in range(m):
        xi = x_shuffled[i : i + M]
        yi = y_shuffled[i : i + M]
        gradients = Lasso_gradient(xi, yi, beta_gd_lasso, lmbda)
        eta = learning_schedule(epoch*m+i)
        # calculate update
        new_change = eta*gradients+momentum*change
        # take a step
        beta_gd_lasso -= new_change
        # save the change
        change = new_change
        if np.linalg.norm(new_change) < np.linalg.norm(stopping_criteria):
            break
        

change = 0.0
# Gradient descent loop
for epoch in range(n_epochs):
    indices = np.random.permutation(n_data)
    x_shuffled = X_train_s[indices]
    y_shuffled = y_train[indices]
    for i in range(m):
        xi = x_shuffled[i : i + M]
        yi = y_shuffled[i : i + M]
        gradients = Ridge_gradient(xi, yi, beta_gd_ridge, lmbda)
        eta = learning_schedule(epoch*m+i)
        # calculate update
        new_change = eta*gradients+momentum*change
        # take a step
        beta_gd_ridge -= new_change
        # save the change
        change = new_change
        if np.linalg.norm(new_change) < np.linalg.norm(stopping_criteria):
            break

change = 0.0
for epoch in range(n_epochs):
    indices = np.random.permutation(n_data)
    x_shuffled = X_train_s[indices]
    y_shuffled = y_train[indices]
    for i in range(m):
        xi = x_shuffled[i : i + M]
        yi = y_shuffled[i : i + M]
        gradients = OLS_gradient(xi, yi, beta_gd_ols)
        eta = learning_schedule(epoch*m+i)
        # calculate update
        new_change = eta*gradients+momentum*change
        # take a step
        beta_gd_ols -= new_change
        # save the change
        change = new_change
        if np.linalg.norm(new_change) < np.linalg.norm(stopping_criteria):
            break

y_gd_lasso = X_test_s @ beta_gd_lasso + y_offset
y_gd_ols = X_test_s @ beta_gd_ols + y_offset
y_gd_ridge = X_test_s @ beta_gd_ridge + y_offset

mse_sgd_ols = MSE(y_test, y_gd_ols)
mse_sgd_ridge = MSE(y_test, y_gd_ridge)
mse_sgd_lasso = MSE(y_test, y_gd_lasso)
r2_sgd_ols = R2(y_test, y_gd_ols)
r2_sgd_ridge = R2(y_test, y_gd_ridge)
r2_sgd_lasso = R2(y_test, y_gd_lasso)

print(f"MSE GD Lasso: {mse_sgd_lasso}, MSE GD OLS: {mse_sgd_ols}, MSE GD Ridge: {mse_sgd_ridge}")
print(f"R2 GD Lasso: {r2_sgd_lasso}, R2 GD OLS: {r2_sgd_ols}, R2 GD Ridge: {r2_sgd_ridge}")
print(f"Beta GD Lasso: {beta_gd_lasso}")
print(f"Beta GD OLS: {beta_gd_ols}")
print(f"Beta GD Ridge: {beta_gd_ridge}")    
print("--------------------------------------------------")

dict_sgd_momentum = {'MSE GD Lasso': mse_sgd_lasso,
                        'R2 GD Lasso': r2_sgd_lasso,
                        'Beta GD Lasso': beta_gd_lasso,
                        'MSE GD OLS': mse_sgd_ols,
                        'R2 GD OLS': r2_sgd_ols,
                        'Beta GD OLS': beta_gd_ols,
                        'MSE GD Ridge': mse_sgd_ridge,
                        'R2 GD Ridge': r2_sgd_ridge,
                        'Beta GD Ridge': beta_gd_ridge}
with open('data/sgd_momentum_results.json', 'w') as f:
    json.dump(dict_sgd_momentum, f, indent=4, default=lambda x: x.tolist() if hasattr(x, 'tolist') else x)

MSE GD Lasso: 0.012637818005049466, MSE GD OLS: 0.012308260764546138, MSE GD Ridge: 0.012657346736828466
R2 GD Lasso: 0.8302766120747471, R2 GD OLS: 0.8347050733398249, R2 GD Ridge: 0.8300191493235124
Beta GD Lasso: [ 0.00000000e+00 -1.38505386e-03 -8.04944914e-01 -5.22926064e-05
  5.89353304e-01 -2.78902001e-05  3.08219683e-01 -2.17889412e-05
 -2.51155134e-02 -1.77349301e-05 -2.86193750e-01]
Beta GD OLS: [ 0.         -0.00163039 -0.83827968 -0.00669924  0.62275737  0.01080659
  0.3500451   0.00496426 -0.03652481 -0.00952961 -0.32111431]
Beta GD Ridge: [ 0.         -0.00466412 -0.80436501  0.00378578  0.57719347  0.00186503
  0.33351867 -0.00168841 -0.02802253 -0.00130649 -0.29794466]
--------------------------------------------------


In [None]:
#ADAgrad

# Initialize weights for gradient descent
beta_gd_ols = np.zeros(len(beta_r))
beta_gd_ridge = np.zeros(len(beta_r))
beta_gd_lasso = np.zeros(len(beta_r))

# Initialize hyperparameters
stopping_criteria = [1e-10]*len(beta_r)
delta = 1e-8
n_epochs = 1000000
M = 50   #size of each minibatch
m = int(len(X_train_s)/M) #number of minibatches
n_data = X_train_s.shape[0]
Giter = 0.0

for epoch in range(n_epochs):
    indices = np.random.permutation(n_data)
    x_shuffled = X_train_s[indices]
    y_shuffled = y_train[indices]
    for i in range(m):
        xi = x_shuffled[i : i + M]
        yi = y_shuffled[i : i + M]
        gradients = Ridge_gradient(xi, yi, beta_gd_ridge, lmbda)
        Giter += gradients*gradients
        update = gradients*lr/(delta+np.sqrt(Giter))
        beta_gd_ridge -= update
        if np.linalg.norm(update) < np.linalg.norm(stopping_criteria):
            break

Giter = 0.0

for epoch in range(n_epochs):
    indices = np.random.permutation(n_data)
    x_shuffled = X_train_s[indices]
    y_shuffled = y_train[indices]
    for i in range(m):
        xi = x_shuffled[i : i + M]
        yi = y_shuffled[i : i + M]
        gradients = Lasso_gradient(xi, yi, beta_gd_lasso, lmbda)
        Giter += gradients*gradients
        update = gradients*lr/(delta+np.sqrt(Giter))
        beta_gd_lasso -= update
        if np.linalg.norm(update) < np.linalg.norm(stopping_criteria):
            break

Giter = 0.0

for epoch in range(n_epochs):
    indices = np.random.permutation(n_data)
    x_shuffled = X_train_s[indices]
    y_shuffled = y_train[indices]
    for i in range(m):
        xi = x_shuffled[i : i + M]
        yi = y_shuffled[i : i + M]
        gradients = OLS_gradient(xi, yi, beta_gd_ols)
        Giter += gradients*gradients
        update = gradients*lr/(delta+np.sqrt(Giter))
        beta_gd_ols -= update
        if np.linalg.norm(update) < np.linalg.norm(stopping_criteria):
            break

y_gd_lasso = X_test_s @ beta_gd_lasso + y_offset
y_gd_ols = X_test_s @ beta_gd_ols + y_offset
y_gd_ridge = X_test_s @ beta_gd_ridge + y_offset

mse_sgd_ols = MSE(y_test, y_gd_ols)
mse_sgd_ridge = MSE(y_test, y_gd_ridge)
mse_sgd_lasso = MSE(y_test, y_gd_lasso)
r2_sgd_ols = R2(y_test, y_gd_ols)
r2_sgd_ridge = R2(y_test, y_gd_ridge)
r2_sgd_lasso = R2(y_test, y_gd_lasso)

print(f"MSE GD Lasso: {mse_sgd_lasso}, MSE GD OLS: {mse_sgd_ols}, MSE GD Ridge: {mse_sgd_ridge}")
print(f"R2 GD Lasso: {r2_sgd_lasso}, R2 GD OLS: {r2_sgd_ols}, R2 GD Ridge: {r2_sgd_ridge}")
print(f"Beta GD Lasso: {beta_gd_lasso}")
print(f"Beta GD OLS: {beta_gd_ols}")
print(f"Beta GD Ridge: {beta_gd_ridge}")    
print("--------------------------------------------------")

dict_sgd_adagrad = {'MSE GD Lasso': mse_sgd_lasso,
                        'R2 GD Lasso': r2_sgd_lasso,
                        'Beta GD Lasso': beta_gd_lasso,
                        'MSE GD OLS': mse_sgd_ols,
                        'R2 GD OLS': r2_sgd_ols,
                        'Beta GD OLS': beta_gd_ols,
                        'MSE GD Ridge': mse_sgd_ridge,
                        'R2 GD Ridge': r2_sgd_ridge,
                        'Beta GD Ridge': beta_gd_ridge}
with open('data/sgd_adagrad_results.json', 'w') as f:
    json.dump(dict_sgd_adagrad, f, indent=4, default=lambda x: x.tolist() if hasattr(x, 'tolist') else x)

MSE GD Lasso: 0.009393607473463355, MSE GD OLS: 0.00790543585565678, MSE GD Ridge: 0.010079863822402946
R2 GD Lasso: 0.8737303488225683, R2 GD OLS: 0.8936694629200577, R2 GD Ridge: 0.8645308831714662
Beta GD Lasso: [ 0.00000000e+00 -4.08205466e-04 -1.08507775e+00 -5.75797484e-04
  1.35010175e+00 -3.33040449e-04 -1.05389791e-01 -2.37512293e-04
 -3.94243950e-01 -2.44381896e-04 -1.02150091e-04]
Beta GD OLS: [ 0.          0.00469368 -1.2119128  -0.01605349  1.78269107  0.01212615
 -0.39317682  0.01178905 -0.70520876 -0.01400288  0.29476575]
Beta GD Ridge: [ 0.00000000e+00 -2.21442561e-04 -9.99134405e-01 -6.53981092e-03
  1.13179234e+00  6.92987088e-03  2.11896810e-02  5.69188425e-03
 -3.38922348e-01 -7.32884736e-03 -4.02855170e-02]
--------------------------------------------------


In [None]:
#RMSprop

# Initialize weights for gradient descent
beta_gd_ols = np.zeros(len(beta_r))
beta_gd_ridge = np.zeros(len(beta_r))
beta_gd_lasso = np.zeros(len(beta_r))

# Initialize hyperparameters
stopping_criteria = [1e-10]*len(beta_r)
delta = 1e-8
rho = 0.99
n_epochs = 1000000
M = 50   # size of each minibatch
m = int(len(X_train_s)/M) # number of minibatches
n_data = X_train_s.shape[0]

# RMSprop for Ridge
Giter_ridge = np.zeros(len(beta_r))
for epoch in range(n_epochs):
    indices = np.random.permutation(n_data)
    x_shuffled = X_train_s[indices]
    y_shuffled = y_train[indices]
    for i in range(m):
        xi = x_shuffled[i : i + M]
        yi = y_shuffled[i : i + M]
        gradients = Ridge_gradient(xi, yi, beta_gd_ridge, lmbda)
        Giter_ridge = rho * Giter_ridge + (1 - rho) * gradients**2
        update = lr * gradients / (delta + np.sqrt(Giter_ridge))
        beta_gd_ridge -= update
        if np.linalg.norm(update) < np.linalg.norm(stopping_criteria):
            break

# RMSprop for Lasso
Giter_lasso = np.zeros(len(beta_r))
for epoch in range(n_epochs):
    indices = np.random.permutation(n_data)
    x_shuffled = X_train_s[indices]
    y_shuffled = y_train[indices]
    for i in range(m):
        xi = x_shuffled[i : i + M]
        yi = y_shuffled[i : i + M]
        gradients = Lasso_gradient(xi, yi, beta_gd_lasso, lmbda)
        Giter_lasso = rho * Giter_lasso + (1 - rho) * gradients**2
        update = lr * gradients / (delta + np.sqrt(Giter_lasso))
        beta_gd_lasso -= update
        if np.linalg.norm(update) < np.linalg.norm(stopping_criteria):
            break

# RMSprop for OLS
Giter_ols = np.zeros(len(beta_r))
for epoch in range(n_epochs):
    indices = np.random.permutation(n_data)
    x_shuffled = X_train_s[indices]
    y_shuffled = y_train[indices]
    for i in range(m):
        xi = x_shuffled[i : i + M]
        yi = y_shuffled[i : i + M]
        gradients = OLS_gradient(xi, yi, beta_gd_ols)
        Giter_ols = rho * Giter_ols + (1 - rho) * gradients**2
        update = lr * gradients / (delta + np.sqrt(Giter_ols))
        beta_gd_ols -= update
        if np.linalg.norm(update) < np.linalg.norm(stopping_criteria):
            break    

y_gd_lasso = X_test_s @ beta_gd_lasso + y_offset
y_gd_ols = X_test_s @ beta_gd_ols + y_offset
y_gd_ridge = X_test_s @ beta_gd_ridge + y_offset

mse_sgd_ols = MSE(y_test, y_gd_ols)
mse_sgd_ridge = MSE(y_test, y_gd_ridge)
mse_sgd_lasso = MSE(y_test, y_gd_lasso)
r2_sgd_ols = R2(y_test, y_gd_ols)
r2_sgd_ridge = R2(y_test, y_gd_ridge)
r2_sgd_lasso = R2(y_test, y_gd_lasso)

print(f"MSE GD Lasso: {mse_sgd_lasso}, MSE GD OLS: {mse_sgd_ols}, MSE GD Ridge: {mse_sgd_ridge}")
print(f"R2 GD Lasso: {r2_sgd_lasso}, R2 GD OLS: {r2_sgd_ols}, R2 GD Ridge: {r2_sgd_ridge}")
print(f"Beta GD Lasso: {beta_gd_lasso}")
print(f"Beta GD OLS: {beta_gd_ols}")
print(f"Beta GD Ridge: {beta_gd_ridge}")    
print("--------------------------------------------------")

dict_sgd_rmsprop = {'MSE GD Lasso': mse_sgd_lasso,
                        'R2 GD Lasso': r2_sgd_lasso,
                        'Beta GD Lasso': beta_gd_lasso,
                        'MSE GD OLS': mse_sgd_ols,
                        'R2 GD OLS': r2_sgd_ols,
                        'Beta GD OLS': beta_gd_ols,
                        'MSE GD Ridge': mse_sgd_ridge,
                        'R2 GD Ridge': r2_sgd_ridge,
                        'Beta GD Ridge': beta_gd_ridge}
with open('data/sgd_rmsprop_results.json', 'w') as f:
    json.dump(dict_sgd_rmsprop, f, indent=4, default=lambda x: x.tolist() if hasattr(x, 'tolist') else x)

MSE GD Lasso: 0.009313853142163547, MSE GD OLS: 0.0034020824948612673, MSE GD Ridge: 0.01749971150747832
R2 GD Lasso: 0.8747619889316359, R2 GD OLS: 0.9541791283759, R2 GD Ridge: 0.7646073642028023
Beta GD Lasso: [ 0.         -0.0023125  -1.19221432  0.01268693  1.63392988  0.00912297
 -0.48614708 -0.00449687 -0.26878401 -0.01356541  0.08761167]
Beta GD OLS: [ 0.00000000e+00 -1.80956727e-03 -3.00920608e+00 -5.78940863e-02
  1.29990867e+01 -8.57375617e-02 -2.56868920e+01  2.17300068e-01
  2.36916472e+01 -9.48929962e-02 -8.21823579e+00]
Beta GD Ridge: [ 0.          0.01572337 -1.0820716  -0.02426346  1.10936926  0.05128045
 -0.00547691  0.01812964 -0.31361575 -0.09693796  0.01582343]
--------------------------------------------------


In [None]:
# ADAM

# Initialize weights for gradient descent
beta_gd_ols = np.zeros(len(beta_r))
beta_gd_ridge = np.zeros(len(beta_r))
beta_gd_lasso = np.zeros(len(beta_r))

# Initialize hyperparameters for Ridge
stopping_criteria = 1e-10  # Using a scalar
delta = 1e-8
rho_1 = 0.9
rho_2 = 0.99
n_epochs = 1000000
M = 50   # size of each minibatch
m = int(len(X_train_s)/M) # number of minibatches
iter = 0
n_data = X_train_s.shape[0]

# Initialize moments only once
first_moment_ridge = np.zeros(len(beta_r))
second_moment_ridge = np.zeros(len(beta_r))

for epoch in range(n_epochs):
    iter += 1
    indices = np.random.permutation(n_data)
    x_shuffled = X_train_s[indices]
    y_shuffled = y_train[indices]
    for i in range(m):
        xi = x_shuffled[i : i + M]
        yi = y_shuffled[i : i + M]
        gradients = Ridge_gradient(xi, yi, beta_gd_ridge, lmbda)
        
        # Updating moments
        first_moment_ridge = rho_1 * first_moment_ridge + (1 - rho_1) * gradients
        second_moment_ridge = rho_2 * second_moment_ridge + (1 - rho_2) * gradients * gradients
        
        # Bias-corrected moments
        first_term = first_moment_ridge / (1 - rho_1 ** (iter))
        second_term = second_moment_ridge / (1 - rho_2 ** (iter))

        # Update parameters beta
        update = (lr / (np.sqrt(second_term) + delta)) * first_term
        beta_gd_ridge -= update
        
        # Check for convergence
        if np.linalg.norm(update) < stopping_criteria:
            print("Convergence reached for Ridge at epoch", epoch)
            break

# Repeat the similar structure for Lasso and OLS

# Initialize moments
first_moment_lasso = np.zeros(len(beta_r))
second_moment_lasso = np.zeros(len(beta_r))
iter = 0

for epoch in range(n_epochs):
    iter += 1
    indices = np.random.permutation(n_data)
    x_shuffled = X_train_s[indices]
    y_shuffled = y_train[indices]
    for i in range(m):
        xi = x_shuffled[i : i + M]
        yi = y_shuffled[i : i + M]
        gradients = Lasso_gradient(xi, yi, beta_gd_lasso, lmbda)
        
        # Updating moments
        first_moment_lasso = rho_1 * first_moment_lasso + (1 - rho_1) * gradients
        second_moment_lasso = rho_2 * second_moment_lasso + (1 - rho_2) * gradients * gradients

        first_term = first_moment_lasso / (1 - rho_1 ** (iter))
        second_term = second_moment_lasso / (1 - rho_2 ** (iter))

        # Update parameters beta
        update = (lr / (np.sqrt(second_term) + delta)) * first_term
        beta_gd_lasso -= update
        
        # Check for convergence
        if np.linalg.norm(update) < stopping_criteria:
            print("Convergence reached for Lasso at epoch", epoch)
            break

# For OLS
# Initialize moments
first_moment_ols = np.zeros(len(beta_r))
second_moment_ols = np.zeros(len(beta_r))
iter = 0

for epoch in range(n_epochs):
    iter += 1
    indices = np.random.permutation(n_data)
    x_shuffled = X_train_s[indices]
    y_shuffled = y_train[indices]
    for i in range(m):
        xi = x_shuffled[i : i + M]
        yi = y_shuffled[i : i + M]
        gradients = OLS_gradient(xi, yi, beta_gd_ols)
        
        # Updating moments
        first_moment_ols = rho_1 * first_moment_ols + (1 - rho_1) * gradients
        second_moment_ols = rho_2 * second_moment_ols + (1 - rho_2) * gradients * gradients
        
        first_term = first_moment_ols / (1 - rho_1 ** (iter))
        second_term = second_moment_ols / (1 - rho_2 ** (iter))

        # Update parameters beta
        update = (lr / (np.sqrt(second_term) + delta)) * first_term
        beta_gd_ols -= update
        
        # Check for convergence
        if np.linalg.norm(update) < stopping_criteria:
            print("Convergence reached for OLS at epoch", epoch)
            break

# Prediction and evaluation
y_gd_lasso = X_test_s @ beta_gd_lasso + y_offset
y_gd_ols = X_test_s @ beta_gd_ols + y_offset
y_gd_ridge = X_test_s @ beta_gd_ridge + y_offset

mse_sgd_ols = MSE(y_test, y_gd_ols)
mse_sgd_ridge = MSE(y_test, y_gd_ridge)
mse_sgd_lasso = MSE(y_test, y_gd_lasso)
r2_sgd_ols = R2(y_test, y_gd_ols)
r2_sgd_ridge = R2(y_test, y_gd_ridge)
r2_sgd_lasso = R2(y_test, y_gd_lasso)

# Print results
print(f"MSE GD Lasso: {mse_sgd_lasso}, MSE GD OLS: {mse_sgd_ols}, MSE GD Ridge: {mse_sgd_ridge}")
print(f"R2 GD Lasso: {r2_sgd_lasso}, R2 GD OLS: {r2_sgd_ols}, R2 GD Ridge: {r2_sgd_ridge}")
print(f"Beta GD Lasso: {beta_gd_lasso}")
print(f"Beta GD OLS: {beta_gd_ols}")
print(f"Beta GD Ridge: {beta_gd_ridge}")    
print("--------------------------------------------------")

dict_sgd_adam = {'MSE GD Lasso': mse_sgd_lasso,
                  'R2 GD Lasso': r2_sgd_lasso,
                  'Beta GD Lasso': beta_gd_lasso,
                  'MSE GD OLS': mse_sgd_ols,
                  'R2 GD OLS': r2_sgd_ols,
                  'Beta GD OLS': beta_gd_ols,
                  'MSE GD Ridge': mse_sgd_ridge,
                  'R2 GD Ridge': r2_sgd_ridge,
                  'Beta GD Ridge': beta_gd_ridge}

# Save results to JSON
with open('data/sgd_adam_results.json', 'w') as f:
    json.dump(dict_sgd_adam, f, indent=4, default=lambda x: x.tolist() if hasattr(x, 'tolist') else x)

MSE GD Lasso: 0.015661681167338876, MSE GD OLS: 0.025607435536306147, MSE GD Ridge: 0.020221932760799092
R2 GD Lasso: 0.7892532596502582, R2 GD OLS: 0.6554148838047796, R2 GD Ridge: 0.7277653067577401
Beta GD Lasso: [ 0.          0.021385   -1.1747555  -0.01486861  1.9283405  -0.00981143
 -0.80290198  0.00280918 -0.13219289  0.00955085  0.04253397]
Beta GD OLS: [ 0.00000000e+00  3.92554842e-02 -2.99425390e+00  1.32465967e-01
  1.27975784e+01 -1.17959862e-01 -2.52914156e+01  1.26900213e-01
  2.32325987e+01 -1.03563600e-02 -8.04418427e+00]
Beta GD Ridge: [ 0.         -0.05627975 -1.0157445  -0.00316304  1.2092698   0.00398384
 -0.0023898  -0.00817765 -0.36626494 -0.03884472 -0.05670138]
--------------------------------------------------
