# Kodekladd

According to Week 36:

OLS:
$$
\nabla_{\theta} C(\theta) = \frac{2}{n}X^T(X\theta - \mathbf{y})
$$

Ridge:
$$
\frac{\partial C(\boldsymbol{X},\boldsymbol{\theta})}{\partial \boldsymbol{\theta}}=-\frac{2}{n}\boldsymbol{X}^T(\boldsymbol{y}-\boldsymbol{X}\boldsymbol{\theta})+2\lambda \theta
$$

Lasso:
$$
\frac{\partial C(\boldsymbol{X},\boldsymbol{\theta})}{\partial \boldsymbol{\theta}}=-\frac{2}{n}\boldsymbol{X}^T(\boldsymbol{y}-\boldsymbol{X}\boldsymbol{\theta})+\lambda sgn(\boldsymbol{\theta})
$$

In [1]:
# Imports
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from functions import runge, MSE, R2, Ridge_parameters, OLS_parameters
from functions import OLS_gradient, Ridge_gradient, polynomial_features
import json

In [2]:
# Data generation
n = 1000
np.random.seed(42)
x = np.linspace(-1,1, n)
np.random.seed(42)
y = runge(x) + 0.1*np.random.normal(0,1)

# Split into training and test sets, scale data and create polynomial features
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
scaler = StandardScaler()
X_train = polynomial_features(x_train, 10)
X_test = polynomial_features(x_test, 10)
scaler.fit(X_train)
X_train_s = scaler.transform(X_train)
X_test_s = scaler.transform(X_test)
y_offset = np.mean(y_train)

lmbda = 0.001
lr = 0.01

# Calculate parameters using OLS and Ridge closed form solutions
beta_r = Ridge_parameters(X_train_s, y_train, lmbda)
beta_o = OLS_parameters(X_train_s, y_train)

np.savez('data_arrays_last.npz', X_train_s=X_train_s, X_test_s=X_test_s, y_test=y_test, y_train=y_train, y_offset=y_offset)

In [3]:
#Addition of momentum

# Initialize weights for gradient descent
beta_gd_r = np.zeros(len(beta_r))
beta_gd_o = np.zeros(len(beta_o))

# Initialize hyperparameters
num_iters = 100000000
momentum = 0.3
stopping_criteria = [1e-10]*len(beta_r)
change = 0.0

# Gradient descent loop
for t in range(num_iters):
    # Compute gradients for Ridge
    grad_Ridge = Ridge_gradient(X_train_s, y_train, beta_gd_r, lmbda)
    # Calculate change with momentum
    new_change = lr * grad_Ridge + momentum * change
    # Update parameters beta
    beta_gd_r = beta_gd_r - new_change
    # Save change for next iteration
    change = new_change
    # Check for convergence
    if (np.abs(new_change) < stopping_criteria).all():
        print("Convergence reached at iteration for Ridge", t)
        break

# Initialize hyperparameters
change = 0.0

for t in range(num_iters):
    # Compute gradients for OLS
    grad_OLS = OLS_gradient(X_train_s, y_train, beta_gd_o)
    # Calculate change with momentum
    new_change = lr * grad_OLS + momentum * change
    # Update parameters beta
    beta_gd_o = beta_gd_o - new_change
    # Save change for next iteration
    change = new_change
    # Check for convergence
    if (np.abs(new_change) < stopping_criteria).all():
        print("Convergence reached at iteration for OLS", t)
        break

y_gd_ols = X_test_s @ beta_gd_o + y_offset
y_gd_ridge = X_test_s @ beta_gd_r + y_offset

mse_gd_ols = MSE(y_test, y_gd_ols)
mse_gd_ridge = MSE(y_test, y_gd_ridge)
r2_gd_ols = R2(y_test, y_gd_ols)
r2_gd_ridge = R2(y_test, y_gd_ridge)

print(f"Learning rate: {lr}")
print(f"MSE GD OLS: {mse_gd_ols}, MSE GD Ridge: {mse_gd_ridge}")
print(f"R2 GD OLS: {r2_gd_ols}, R2 GD Ridge: {r2_gd_ridge}")
print(f"Beta GD OLS: {beta_gd_o}, Beta GD Ridge: {beta_gd_r}")
print("--------------------------------------------------")

dict_momentum = {'Beta Ridge': beta_gd_r, 'Beta OLS': beta_gd_o, 'R2 Ridge': r2_gd_ridge, 'R2 OLS': r2_gd_ols,
                 'MSE Ridge': mse_gd_ridge, 'MSE OLS': mse_gd_ols}
with open('momentum_results.json', 'w') as f:
    json.dump(dict_momentum, f, indent=4, default=lambda x: x.tolist() if hasattr(x, 'tolist') else x)

Convergence reached at iteration for Ridge 358451
Learning rate: 0.01
MSE GD OLS: 0.0014844050649363589, MSE GD Ridge: 0.009755930977492997
R2 GD OLS: 0.9800068691929409, R2 GD Ridge: 0.8688668204969766
Beta GD OLS: [ 0.00000000e+00  3.74857078e-03 -2.97354776e+00  9.64382499e-03
  1.25083194e+01 -7.08558248e-02 -2.40861275e+01  1.12139439e-01
  2.15432490e+01 -5.43688960e-02 -7.24137358e+00], Beta GD Ridge: [ 0.00000000e+00  1.91891847e-04 -1.02262379e+00 -6.90114500e-03
  1.21924449e+00  6.72252475e-03 -4.72616115e-02  5.79556250e-03
 -3.91830338e-01 -7.44610701e-03  1.65154352e-02]
--------------------------------------------------


In [4]:
#ADAgrad

# Initialize weights for gradient descent
beta_gd_r = np.zeros(len(beta_r))
beta_gd_o = np.zeros(len(beta_o))

# Initialize hyperparameters
num_iters = 100000000
stopping_criteria = [1e-10]*len(beta_r)
delta = 1e-8
G_iter = 0.0

# Gradient descent loop
for t in range(num_iters):
    # Compute gradients for Ridge
    grad_Ridge = Ridge_gradient(X_train_s, y_train, beta_gd_r, lmbda)
    G_iter += grad_Ridge*grad_Ridge
    # Update parameters beta
    update = (lr / (np.sqrt(G_iter) + delta)) * grad_Ridge
    beta_gd_r = beta_gd_r - update
    # Check for convergence
    if (np.abs(update) < stopping_criteria).all():
        print("Convergence reached at iteration for Ridge", t)
        break

# Initialize hyperparameters
G_iter = 0.0

for t in range(num_iters):
    # Compute gradients for OLS
    grad_OLS = OLS_gradient(X_train_s, y_train, beta_gd_o)
    G_iter += grad_OLS*grad_OLS
    # Update parameters beta
    update = (lr / (np.sqrt(G_iter) + delta)) * grad_OLS
    beta_gd_o = beta_gd_o - update
    # Check for convergence
    if (np.abs(update) < stopping_criteria).all():
        print("Convergence reached at iteration for OLS", t)
        break

y_gd_ols = X_test_s @ beta_gd_o + y_offset
y_gd_ridge = X_test_s @ beta_gd_r + y_offset

mse_gd_ols = MSE(y_test, y_gd_ols)
mse_gd_ridge = MSE(y_test, y_gd_ridge)
r2_gd_ols = R2(y_test, y_gd_ols)
r2_gd_ridge = R2(y_test, y_gd_ridge)

print(f"Learning rate: {lr}")
print(f"MSE GD OLS: {mse_gd_ols}, MSE GD Ridge: {mse_gd_ridge}")
print(f"R2 GD OLS: {r2_gd_ols}, R2 GD Ridge: {r2_gd_ridge}")
print(f"Beta GD OLS: {beta_gd_o}, Beta GD Ridge: {beta_gd_r}")
print("--------------------------------------------------")

dict_adagrad = {'Beta Ridge': beta_gd_r, 'Beta OLS': beta_gd_o, 'R2 Ridge': r2_gd_ridge, 'R2 OLS': r2_gd_ols,
                 'MSE Ridge': mse_gd_ridge, 'MSE OLS': mse_gd_ols}
with open('adagrad_results.json', 'w') as f:
    json.dump(dict_adagrad, f, indent=4, default=lambda x: x.tolist() if hasattr(x, 'tolist') else x)

Convergence reached at iteration for Ridge 369750
Learning rate: 0.01
MSE GD OLS: 0.0014845180569028782, MSE GD Ridge: 0.009755932193597897
R2 GD OLS: 0.980005351916389, R2 GD Ridge: 0.8688668041923406
Beta GD OLS: [ 0.00000000e+00  3.74423783e-03 -2.97412571e+00  9.68415136e-03
  1.25125987e+01 -7.09926302e-02 -2.40972296e+01  1.12322150e-01
  2.15552046e+01 -5.44518438e-02 -7.24593821e+00], Beta GD Ridge: [ 0.00000000e+00  1.91887126e-04 -1.02262366e+00 -6.90113717e-03
  1.21924431e+00  6.72254380e-03 -4.72619020e-02  5.79551035e-03
 -3.91829741e-01 -7.44607680e-03  1.65151868e-02]
--------------------------------------------------


In [5]:
#RMSprop

# Initialize weights for gradient descent
beta_gd_r = np.zeros(len(beta_r))
beta_gd_o = np.zeros(len(beta_o))

# Initialize hyperparameters
num_iters = 100000000
stopping_criteria = [1e-10]*len(beta_r)
delta = 1e-8
G_iter = np.zeros(len(beta_r))
rho = 0.9

# Gradient descent loop
for t in range(num_iters):
    # Compute gradients for Ridge
    grad_Ridge = Ridge_gradient(X_train_s, y_train, beta_gd_r, lmbda)
    G_iter = rho*G_iter + (1-rho)*grad_Ridge*grad_Ridge
    # Update parameters beta
    update = (lr / (np.sqrt(G_iter) + delta)) * grad_Ridge
    beta_gd_r = beta_gd_r - update
    # Check for convergence
    if (np.abs(update) < stopping_criteria).all():
        print("Convergence reached at iteration for Ridge", t)
        break

# Initialize hyperparameters
G_iter = np.zeros(len(beta_o))

for t in range(num_iters):
    # Compute gradients for OLS
    grad_OLS = OLS_gradient(X_train_s, y_train, beta_gd_o)
    G_iter = (rho*G_iter + (1-rho)*grad_OLS*grad_OLS)
    # Update parameters beta
    update = (lr / (np.sqrt(G_iter) + delta)) * grad_OLS
    beta_gd_o = beta_gd_o - update
    # Check for convergence
    if (np.abs(update) < stopping_criteria).all():
        print("Convergence reached at iteration for OLS", t)
        break

y_gd_ols = X_test_s @ beta_gd_o + y_offset
y_gd_ridge = X_test_s @ beta_gd_r + y_offset

mse_gd_ols = MSE(y_test, y_gd_ols)
mse_gd_ridge = MSE(y_test, y_gd_ridge)
r2_gd_ols = R2(y_test, y_gd_ols)
r2_gd_ridge = R2(y_test, y_gd_ridge)

print(f"Learning rate: {lr}")
print(f"MSE GD OLS: {mse_gd_ols}, MSE GD Ridge: {mse_gd_ridge}")
print(f"R2 GD OLS: {r2_gd_ols}, R2 GD Ridge: {r2_gd_ridge}")
print(f"Beta GD OLS: {beta_gd_o}, Beta GD Ridge: {beta_gd_r}")
print("--------------------------------------------------")

dict_rmsprop = {'Beta Ridge': beta_gd_r, 'Beta OLS': beta_gd_o, 'R2 Ridge': r2_gd_ridge, 'R2 OLS': r2_gd_ols,
                 'MSE Ridge': mse_gd_ridge, 'MSE OLS': mse_gd_ols}
with open('rmsprop_results.json', 'w') as f:
    json.dump(dict_rmsprop, f, indent=4, default=lambda x: x.tolist() if hasattr(x, 'tolist') else x)

Learning rate: 0.01
MSE GD OLS: 0.0023747134340582495, MSE GD Ridge: 0.010707072948244595
R2 GD OLS: 0.9680245144006304, R2 GD Ridge: 0.855978969899131
Beta GD OLS: [ 0.00000000e+00 -1.25643651e-03 -2.96926631e+00  4.69057174e-03
  1.25186327e+01 -7.60156112e-02 -2.40949002e+01  1.07354141e-01
  2.15630712e+01 -5.94668213e-02 -7.24203004e+00], Beta GD Ridge: [ 0.00000000e+00 -4.80810701e-03 -1.01762437e+00 -1.19011362e-02
  1.22424696e+00  1.72253834e-03 -4.22647965e-02  7.95507315e-04
 -3.86829455e-01 -1.24460675e-02  2.15158418e-02]
--------------------------------------------------


In [7]:
# ADAM

# Initialize weights for gradient descent
beta_gd_r = np.zeros(len(beta_r))
beta_gd_o = np.zeros(len(beta_o))

# Initialize hyperparameters
num_iters = 100000000
stopping_criteria = [1e-10]*len(beta_r)
delta = 1e-8
rho_1 = 0.9
rho_2 = 0.99
first_moment = 0.0
second_moment = 0.0

# Gradient descent loop
for t in range(num_iters):
    t += 1
    # Compute gradients for Ridge
    grad_Ridge = Ridge_gradient(X_train_s, y_train, beta_gd_r, lmbda)
    # Computing moments first
    first_moment = rho_1 * first_moment + (1 - rho_1) * grad_Ridge
    second_moment = rho_2 * second_moment + (1 - rho_2) * grad_Ridge * grad_Ridge
    first_term = first_moment / (1 - rho_1**(t))
    second_term = second_moment / (1 - rho_2**(t))
    # Update parameters beta
    update = (lr / (np.sqrt(second_term) + delta)) * first_term
    beta_gd_r = beta_gd_r - update
    # Check for convergence
    if (np.abs(update) < stopping_criteria).all():
        print("Convergence reached at iteration for Ridge", t)
        break

# Initialize hyperparameters
first_moment = 0.0
second_moment = 0.0

for t in range(num_iters):
    t += 1
    # Compute gradients for OLS
    grad_OLS = OLS_gradient(X_train_s, y_train, beta_gd_o)
    # Computing moments first
    first_moment = rho_1 * first_moment + (1 - rho_1) * grad_OLS
    second_moment = rho_2 * second_moment + (1 - rho_2) * grad_OLS * grad_OLS
    first_term = first_moment / (1 - rho_1**(t))
    second_term = second_moment / (1 - rho_2**(t))
    # Update parameters beta
    update = (lr / (np.sqrt(second_term) + delta)) * first_term
    beta_gd_o = beta_gd_o - update
    # Check for convergence
    if (np.abs(update) < stopping_criteria).all():
        print("Convergence reached at iteration for OLS", t)
        break

y_gd_ols = X_test_s @ beta_gd_o + y_offset
y_gd_ridge = X_test_s @ beta_gd_r + y_offset

mse_gd_ols = MSE(y_test, y_gd_ols)
mse_gd_ridge = MSE(y_test, y_gd_ridge)
r2_gd_ols = R2(y_test, y_gd_ols)
r2_gd_ridge = R2(y_test, y_gd_ridge)

print(f"Learning rate: {lr}")
print(f"MSE GD OLS: {mse_gd_ols}, MSE GD Ridge: {mse_gd_ridge}")
print(f"R2 GD OLS: {r2_gd_ols}, R2 GD Ridge: {r2_gd_ridge}")
print(f"Beta GD OLS: {beta_gd_o}, Beta GD Ridge: {beta_gd_r}")
print("--------------------------------------------------")

dict_adam = {'Beta Ridge': beta_gd_r, 'Beta OLS': beta_gd_o, 'R2 Ridge': r2_gd_ridge, 'R2 OLS': r2_gd_ols,
                 'MSE Ridge': mse_gd_ridge, 'MSE OLS': mse_gd_ols}
with open('adam_results.json', 'w') as f:
    json.dump(dict_adam, f, indent=4, default=lambda x: x.tolist() if hasattr(x, 'tolist') else x)

Convergence reached at iteration for Ridge 5182022
Convergence reached at iteration for OLS 2175289
Learning rate: 0.01
MSE GD OLS: 0.0014844716177567874, MSE GD Ridge: 0.009755593889671099
R2 GD OLS: 0.9800059441048328, R2 GD Ridge: 0.8688718153835356
Beta GD OLS: [ 0.00000000e+00  3.78450056e-03 -2.97430724e+00  9.73150898e-03
  1.25135918e+01 -7.09746739e-02 -2.40999412e+01  1.12395078e-01
  2.15580303e+01 -5.44258841e-02 -7.24707098e+00], Beta GD Ridge: [ 0.00000000e+00  2.14372859e-04 -1.02264685e+00 -6.87865622e-03
  1.21922448e+00  6.74501838e-03 -4.72872765e-02  5.81798735e-03
 -3.91851935e-01 -7.42358753e-03  1.64933618e-02]
--------------------------------------------------
