# Kodekladd

According to Week 36:

OLS:
$$
\nabla_{\theta} C(\theta) = \frac{2}{n}X^T(X\theta - \mathbf{y})
$$

Ridge:
$$
\frac{\partial C(\boldsymbol{X},\boldsymbol{\theta})}{\partial \boldsymbol{\theta}}=-\frac{2}{n}\boldsymbol{X}^T(\boldsymbol{y}-\boldsymbol{X}\boldsymbol{\theta})+2\lambda \theta
$$

Lasso:
$$
\frac{\partial C(\boldsymbol{X},\boldsymbol{\theta})}{\partial \boldsymbol{\theta}}=-\frac{2}{n}\boldsymbol{X}^T(\boldsymbol{y}-\boldsymbol{X}\boldsymbol{\theta})+\lambda sgn(\boldsymbol{\theta})
$$

In [1]:
# Imports
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from functions import runge, MSE, R2, Ridge_parameters, OLS_parameters
from functions import OLS_gradient, Ridge_gradient, polynomial_features
import json

In [2]:
# Data generation
n = 1000
np.random.seed(42)
x = np.linspace(-1,1, n)
np.random.seed(42)
y = runge(x) + 0.1*np.random.normal(0,1)

# Split into training and test sets, scale data and create polynomial features
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
scaler = StandardScaler()
X_train = polynomial_features(x_train, 10)
X_test = polynomial_features(x_test, 10)
scaler.fit(X_train)
X_train_s = scaler.transform(X_train)
X_test_s = scaler.transform(X_test)
y_offset = np.mean(y_train)

lmbda = 0.001

# Calculate parameters using OLS and Ridge closed form solutions
beta_r = Ridge_parameters(X_train_s, y_train, lmbda)
beta_o = OLS_parameters(X_train_s, y_train)

# Hessian matrix
H_ridge = (2.0/len(X_train_s))* X_train_s.T @ X_train_s + 2*lmbda*np.eye(X_train_s.shape[1])
EigValues_ridge, EigVectors_ridge = np.linalg.eig(H_ridge)

H_OLS = (2.0/len(X_train_s))* X_train_s.T @ X_train_s 
EigValues_OLS, EigVectors_OLS = np.linalg.eig(H_OLS)

np.savez('data_arrays_last.npz', X_train_s=X_train_s, X_test_s=X_test_s, y_test=y_test, y_train=y_train, y_offset=y_offset)

In [None]:
#Addition of momentum

# Initialize weights for gradient descent
beta_gd_r = np.zeros(len(beta_r))
beta_gd_o = np.zeros(len(beta_o))

# Initialize hyperparameters
lr = 1.0 / np.max(EigValues_ridge)
num_iters = 100000000
momentum = 0.3
stopping_criteria = [1e-10]*len(beta_r)
change = 0.0

# Gradient descent loop
for t in range(num_iters):
    # Compute gradients for Ridge
    grad_Ridge = Ridge_gradient(X_train_s, y_train, beta_gd_r, lmbda)
    # Calculate change with momentum
    new_change = lr * grad_Ridge + momentum * change
    # Update parameters beta
    beta_gd_r = beta_gd_r - new_change
    # Save change for next iteration
    change = new_change
    # Check for convergence
    if (np.abs(new_change) < stopping_criteria).all():
        print("Convergence reached at iteration for Ridge", t)
        break

# Initialize hyperparameters
lr = 1.0 / np.max(EigValues_OLS)
change = 0.0

for t in range(num_iters):
    # Compute gradients for OLS
    grad_OLS = OLS_gradient(X_train_s, y_train, beta_gd_o)
    # Calculate change with momentum
    new_change = lr * grad_OLS + momentum * change
    # Update parameters beta
    beta_gd_o = beta_gd_o - new_change
    # Save change for next iteration
    change = new_change
    # Check for convergence
    if (np.abs(new_change) < stopping_criteria).all():
        print("Convergence reached at iteration for OLS", t)
        break

y_gd_ols = X_test_s @ beta_gd_o + y_offset
y_gd_ridge = X_test_s @ beta_gd_r + y_offset

mse_gd_ols = MSE(y_test, y_gd_ols)
mse_gd_ridge = MSE(y_test, y_gd_ridge)
r2_gd_ols = R2(y_test, y_gd_ols)
r2_gd_ridge = R2(y_test, y_gd_ridge)

print(f"Learning rate: {lr}")
print(f"MSE GD OLS: {mse_gd_ols}, MSE GD Ridge: {mse_gd_ridge}")
print(f"R2 GD OLS: {r2_gd_ols}, R2 GD Ridge: {r2_gd_ridge}")
print(f"Beta GD OLS: {beta_gd_o}, Beta GD Ridge: {beta_gd_r}")
print("--------------------------------------------------")

dict_momentum = {'Beta Ridge': beta_gd_r, 'Beta OLS': beta_gd_o, 'R2 Ridge': r2_gd_ridge, 'R2 OLS': r2_gd_ols,
                 'MSE Ridge': mse_gd_ridge, 'MSE OLS': mse_gd_ols}
with open('momentum_results.json', 'w') as f:
    json.dump(dict_momentum, f, indent=4, default=lambda x: x.tolist() if hasattr(x, 'tolist') else x)

Eigenvalues of Hessian Matrix:[7.81988102e+00 7.04651821e+00 6.45820952e-01 4.34618809e-01
 4.34213818e-02 1.99862028e-02 3.31440644e-03 2.41897568e-03
 2.00413804e-03 2.01589893e-03 2.00000000e-03]
Convergence reached at iteration for Ridge 33485
Convergence reached at iteration for OLS 12672432
Learning rate: 0.12787918344577412
MSE GD OLS: 0.0014845451758879675, MSE GD Ridge: 0.00975592438969502
R2 GD OLS: 0.9800049877260462, R2 GD Ridge: 0.8688669087299332
Beta GD OLS: [ 0.00000000e+00  3.74361341e-03 -2.97425898e+00  9.69009393e-03
  1.25135785e+01 -7.10139824e-02 -2.40997597e+01  1.12351950e-01
  2.15579200e+01 -5.44658235e-02 -7.24697234e+00], Beta GD Ridge: [ 0.00000000e+00  1.91891745e-04 -1.02262431e+00 -6.90113712e-03
  1.21924672e+00  6.72253417e-03 -4.72644304e-02  5.79551322e-03
 -3.91829642e-01 -7.44607258e-03  1.65158549e-02]
--------------------------------------------------


In [None]:
#ADAgrad

# Initialize weights for gradient descent
beta_gd_r = np.zeros(len(beta_r))
beta_gd_o = np.zeros(len(beta_o))

# Initialize hyperparameters
lr = 1.0 / np.max(EigValues_ridge)
num_iters = 100000000
stopping_criteria = [1e-10]*len(beta_r)
delta = 1e-8
G_iter = 0.0

# Gradient descent loop
for t in range(num_iters):
    # Compute gradients for Ridge
    grad_Ridge = Ridge_gradient(X_train_s, y_train, beta_gd_r, lmbda)
    G_iter += grad_Ridge*grad_Ridge
    # Update parameters beta
    update = (lr / (np.sqrt(G_iter) + delta)) * grad_Ridge
    beta_gd_r = beta_gd_r - update
    # Check for convergence
    if (np.abs(update) < stopping_criteria).all():
        print("Convergence reached at iteration for Ridge", t)
        break

# Initialize hyperparameters
lr = 1.0 / np.max(EigValues_OLS)
G_iter = 0.0

for t in range(num_iters):
    # Compute gradients for OLS
    grad_OLS = OLS_gradient(X_train_s, y_train, beta_gd_o)
    G_iter += grad_OLS*grad_OLS
    # Update parameters beta
    update = (lr / (np.sqrt(G_iter) + delta)) * grad_OLS
    beta_gd_o = beta_gd_o - update
    # Check for convergence
    if (np.abs(update) < stopping_criteria).all():
        print("Convergence reached at iteration for OLS", t)
        break

y_gd_ols = X_test_s @ beta_gd_o + y_offset
y_gd_ridge = X_test_s @ beta_gd_r + y_offset

mse_gd_ols = MSE(y_test, y_gd_ols)
mse_gd_ridge = MSE(y_test, y_gd_ridge)
r2_gd_ols = R2(y_test, y_gd_ols)
r2_gd_ridge = R2(y_test, y_gd_ridge)

print(f"Learning rate: {lr}")
print(f"MSE GD OLS: {mse_gd_ols}, MSE GD Ridge: {mse_gd_ridge}")
print(f"R2 GD OLS: {r2_gd_ols}, R2 GD Ridge: {r2_gd_ridge}")
print(f"Beta GD OLS: {beta_gd_o}, Beta GD Ridge: {beta_gd_r}")
print("--------------------------------------------------")

dict_adagrad = {'Beta Ridge': beta_gd_r, 'Beta OLS': beta_gd_o, 'R2 Ridge': r2_gd_ridge, 'R2 OLS': r2_gd_ols,
                 'MSE Ridge': mse_gd_ridge, 'MSE OLS': mse_gd_ols}
with open('adagrad_results.json', 'w') as f:
    json.dump(dict_adagrad, f, indent=4, default=lambda x: x.tolist() if hasattr(x, 'tolist') else x)

Eigenvalues of Hessian Matrix:[7.81988102e+00 7.04651821e+00 6.45820952e-01 4.34618809e-01
 4.34213818e-02 1.99862028e-02 3.31440644e-03 2.41897568e-03
 2.00413804e-03 2.01589893e-03 2.00000000e-03]
Convergence reached at iteration for Ridge 50479
Eigenvalues of Hessian Matrix:[7.81788102e+00 7.04451821e+00 6.43820952e-01 4.32618809e-01
 4.14213818e-02 1.79862028e-02 1.31440644e-03 4.18975683e-04
 4.13804044e-06 1.58989315e-05 0.00000000e+00]
Convergence reached at iteration for OLS 20125984
Learning rate: 0.12791189798390853
MSE GD OLS: 0.0014845451832575852, MSE GD Ridge: 0.009755924657122474
R2 GD OLS: 0.9800049876269525, R2 GD Ridge: 0.8688669051486893
Beta GD OLS: [ 0.00000000e+00  3.74361625e-03 -2.97425901e+00  9.69006813e-03
  1.25135788e+01 -7.10139059e-02 -2.40997604e+01  1.12351860e-01
  2.15579208e+01 -5.44657868e-02 -7.24697262e+00], Beta GD Ridge: [ 0.00000000e+00  1.91891916e-04 -1.02262430e+00 -6.90113739e-03
  1.21924667e+00  6.72253030e-03 -4.72644483e-02  5.79552228e

In [4]:
#RMSprop

# Initialize weights for gradient descent
beta_gd_r = np.zeros(len(beta_r))
beta_gd_o = np.zeros(len(beta_o))

# Initialize hyperparameters
lr = 1.0 / np.max(EigValues_ridge)
num_iters = 100000000
stopping_criteria = [1e-10]*len(beta_r)
delta = 1e-8
G_iter = 0.0
rho = 0.9

# Gradient descent loop
for t in range(num_iters):
    # Compute gradients for Ridge
    grad_Ridge = Ridge_gradient(X_train_s, y_train, beta_gd_r, lmbda)
    G_iter = rho*G_iter + (1-rho)*grad_Ridge*grad_Ridge
    # Update parameters beta
    update = (lr / (np.sqrt(G_iter) + delta)) * grad_Ridge
    beta_gd_r = beta_gd_r - update
    # Check for convergence
    if (np.abs(update) < stopping_criteria).all():
        print("Convergence reached at iteration for Ridge", t)
        break

# Initialize hyperparameters
lr = 1.0 / np.max(EigValues_OLS)
G_iter = 0.0

for t in range(num_iters):
    # Compute gradients for OLS
    grad_OLS = OLS_gradient(X_train_s, y_train, beta_gd_o)
    G_iter = (rho*G_iter + (1-rho)*grad_OLS*grad_OLS)
    # Update parameters beta
    update = (lr / (np.sqrt(G_iter) + delta)) * grad_OLS
    beta_gd_o = beta_gd_o - update
    # Check for convergence
    if (np.abs(update) < stopping_criteria).all():
        print("Convergence reached at iteration for OLS", t)
        break

y_gd_ols = X_test_s @ beta_gd_o + y_offset
y_gd_ridge = X_test_s @ beta_gd_r + y_offset

mse_gd_ols = MSE(y_test, y_gd_ols)
mse_gd_ridge = MSE(y_test, y_gd_ridge)
r2_gd_ols = R2(y_test, y_gd_ols)
r2_gd_ridge = R2(y_test, y_gd_ridge)

print(f"Learning rate: {lr}")
print(f"MSE GD OLS: {mse_gd_ols}, MSE GD Ridge: {mse_gd_ridge}")
print(f"R2 GD OLS: {r2_gd_ols}, R2 GD Ridge: {r2_gd_ridge}")
print(f"Beta GD OLS: {beta_gd_o}, Beta GD Ridge: {beta_gd_r}")
print("--------------------------------------------------")

dict_rmsprop = {'Beta Ridge': beta_gd_r, 'Beta OLS': beta_gd_o, 'R2 Ridge': r2_gd_ridge, 'R2 OLS': r2_gd_ols,
                 'MSE Ridge': mse_gd_ridge, 'MSE OLS': mse_gd_ols}
with open('rmsprop_results.json', 'w') as f:
    json.dump(dict_rmsprop, f, indent=4, default=lambda x: x.tolist() if hasattr(x, 'tolist') else x)

Learning rate: 0.10232951838712687
MSE GD OLS: 0.0931552876294208, MSE GD Ridge: 0.10201309005678538
R2 GD OLS: -0.24278558954982699, R2 GD Ridge: -0.37209131819408325
Beta GD OLS: [  0.          -0.0474212   -2.92310155  -0.04147419  12.5647975
  -0.12218037 -24.04873546   0.06118938  21.60923597  -0.10563158
  -7.19586528], Beta GD Ridge: [ 0.         -0.0509624  -0.97147008 -0.05805543  1.27040125 -0.04443175
  0.00388949 -0.04535878 -0.34067516 -0.05860036  0.06767013]
--------------------------------------------------


In [3]:
# ADAM

# Initialize weights for gradient descent
beta_gd_r = np.zeros(len(beta_r))
beta_gd_o = np.zeros(len(beta_o))

# Initialize hyperparameters
lr = 1.0 / np.max(EigValues_ridge)
num_iters = 100000000
stopping_criteria = [1e-10]*len(beta_r)
delta = 1e-8
rho_1 = 0.9
rho_2 = 0.99
first_moment = 0.0
second_moment = 0.0

# Gradient descent loop
for t in range(num_iters):
    t += 1
    # Compute gradients for Ridge
    grad_Ridge = Ridge_gradient(X_train_s, y_train, beta_gd_r, lmbda)
    # Computing moments first
    first_moment = rho_1 * first_moment + (1 - rho_1) * grad_Ridge
    second_moment = rho_2 * second_moment + (1 - rho_2) * grad_Ridge * grad_Ridge
    first_term = first_moment / (1 - rho_1**(t))
    second_term = second_moment / (1 - rho_2**(t))
    # Update parameters beta
    update = (lr / (np.sqrt(second_term) + delta)) * first_term
    beta_gd_r = beta_gd_r - update
    # Check for convergence
    if (np.abs(update) < stopping_criteria).all():
        print("Convergence reached at iteration for Ridge", t)
        break

# Initialize hyperparameters
lr = 1.0 / np.max(EigValues_OLS)
first_moment = 0.0
second_moment = 0.0

for t in range(num_iters):
    t += 1
    # Compute gradients for OLS
    grad_OLS = OLS_gradient(X_train_s, y_train, beta_gd_o)
    # Computing moments first
    first_moment = rho_1 * first_moment + (1 - rho_1) * grad_OLS
    second_moment = rho_2 * second_moment + (1 - rho_2) * grad_OLS * grad_OLS
    first_term = first_moment / (1 - rho_1**(t))
    second_term = second_moment / (1 - rho_2**(t))
    # Update parameters beta
    update = (lr / (np.sqrt(second_term) + delta)) * first_term
    beta_gd_o = beta_gd_o - update
    # Check for convergence
    if (np.abs(update) < stopping_criteria).all():
        print("Convergence reached at iteration for OLS", t)
        break

y_gd_ols = X_test_s @ beta_gd_o + y_offset
y_gd_ridge = X_test_s @ beta_gd_r + y_offset

mse_gd_ols = MSE(y_test, y_gd_ols)
mse_gd_ridge = MSE(y_test, y_gd_ridge)
r2_gd_ols = R2(y_test, y_gd_ols)
r2_gd_ridge = R2(y_test, y_gd_ridge)

print(f"Learning rate: {lr}")
print(f"MSE GD OLS: {mse_gd_ols}, MSE GD Ridge: {mse_gd_ridge}")
print(f"R2 GD OLS: {r2_gd_ols}, R2 GD Ridge: {r2_gd_ridge}")
print(f"Beta GD OLS: {beta_gd_o}, Beta GD Ridge: {beta_gd_r}")
print("--------------------------------------------------")

dict_adam = {'Beta Ridge': beta_gd_r, 'Beta OLS': beta_gd_o, 'R2 Ridge': r2_gd_ridge, 'R2 OLS': r2_gd_ols,
                 'MSE Ridge': mse_gd_ridge, 'MSE OLS': mse_gd_ols}
with open('adam_results.json', 'w') as f:
    json.dump(dict_adam, f, indent=4, default=lambda x: x.tolist() if hasattr(x, 'tolist') else x)

Convergence reached at iteration for Ridge 20579220
Learning rate: 0.10232951838712687
MSE GD OLS: 0.0014949855074251352, MSE GD Ridge: 0.009755557664900051
R2 GD OLS: 0.9798638841650656, R2 GD Ridge: 0.8688723565479857
Beta GD OLS: [ 0.00000000e+00  4.33871766e-03 -2.97486146e+00  1.02857261e-02
  1.25130376e+01 -7.04204568e-02 -2.41004954e+01  1.12949295e-01
  2.15574761e+01 -5.38716670e-02 -7.24762519e+00], Beta GD Ridge: [ 0.00000000e+00  2.16998659e-04 -1.02264947e+00 -6.87603043e-03
  1.21922185e+00  6.74764417e-03 -4.72899023e-02  5.82061314e-03
 -3.91854561e-01 -7.42096174e-03  1.64907360e-02]
--------------------------------------------------
