# Kodekladd

According to Week 36:

OLS:
$$
\nabla_{\theta} C(\theta) = \frac{2}{n}X^T(X\theta - \mathbf{y})
$$

Ridge:
$$
\frac{\partial C(\boldsymbol{X},\boldsymbol{\theta})}{\partial \boldsymbol{\theta}}=-\frac{2}{n}\boldsymbol{X}^T(\boldsymbol{y}-\boldsymbol{X}\boldsymbol{\theta})+2\lambda \theta
$$

Lasso:
$$
\frac{\partial C(\boldsymbol{X},\boldsymbol{\theta})}{\partial \boldsymbol{\theta}}=-\frac{2}{n}\boldsymbol{X}^T(\boldsymbol{y}-\boldsymbol{X}\boldsymbol{\theta})+\lambda sgn(\boldsymbol{\theta})
$$

In [2]:
# Imports
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from functions import runge, MSE, R2, Ridge_parameters, OLS_parameters
from functions import OLS_gradient, Ridge_gradient, polynomial_features

In [3]:
#Addition of momentum

n = 1000

np.random.seed(42)

x = np.linspace(-1,1, n)
y = runge(x) + 0.1*np.random.normal(0,1)

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
scaler = StandardScaler()
X_train = polynomial_features(x_train, 10)
X_test = polynomial_features(x_test, 10)
scaler.fit(X_train)
X_train_s = scaler.transform(X_train)
X_test_s = scaler.transform(X_test)

lmbda = 0.001

beta_r = Ridge_parameters(X_train_s, y_train, lmbda)
beta_o = OLS_parameters(X_train_s, y_train)
print('Ridge parameters:', beta_r)
print('OLS parameters:', beta_o)
y_ols = X_test_s @ beta_o
y_ridge = X_test_s @ beta_r
mse_ols = MSE(y_test, y_ols)
mse_ridge = MSE(y_test, y_ridge)
r2_ols = R2(y_test, y_ols)
r2_ridge = R2(y_test, y_ridge)

lr = 0.2
num_iters = 100000000
momentum = 0.3

# Initialize weights for gradient descent
beta_gd_r = np.zeros(len(beta_r))
beta_gd_o = np.zeros(len(beta_o))

stopping_criteria = [1e-10]*len(beta_r)
change = 0.0

# Gradient descent loop
for t in range(num_iters):
    # Compute gradients for Ridge
    grad_Ridge = Ridge_gradient(X_train_s, y_train, beta_gd_r, lmbda)
    # Calculate change with momentum
    new_change = lr * grad_Ridge + momentum * change
    # Update parameters beta
    beta_gd_r = beta_gd_r - new_change
    # Save change for next iteration
    change = new_change
    # Check for convergence
    if (np.abs(- lr * grad_Ridge) < stopping_criteria).all():
        print("Convergence reached at iteration for Ridge", t)
        break

change = 0.0
for t in range(num_iters):
    # Compute gradients for OLS
    grad_OLS = OLS_gradient(X_train_s, y_train, beta_gd_o)
    # Calculate change with momentum
    new_change = lr * grad_OLS + momentum * change
    # Update parameters beta
    beta_gd_o = beta_gd_o - new_change
    # Save change for next iteration
    change = new_change
    # Check for convergence
    if (np.abs(- lr * grad_OLS) < stopping_criteria).all():
        print("Convergence reached at iteration for OLS", t)
        break

y_gd_ols = X_test_s @ beta_gd_o
y_gd_ridge = X_test_s @ beta_gd_r

print(f"Learning rate: {lr}")
print(f"MSE OLS: {mse_ols}, MSE Ridge: {mse_ridge}, MSE GD OLS: {MSE(y_test, y_gd_ols)}, MSE GD Ridge: {MSE(y_test, y_gd_ridge)}")
print(f"R2 OLS: {r2_ols}, R2 Ridge: {r2_ridge}, R2 GD OLS: {R2(y_test, y_gd_ols)}, R2 GD Ridge: {R2(y_test, y_gd_ridge)}")
print(f"Beta GD OLS: {beta_gd_o}, Beta GD Ridge: {beta_gd_r}")
print("--------------------------------------------------")

Ridge parameters: [ 0.00000000e+00  5.54300134e-03 -2.59074814e+00 -7.44146292e-03
  9.69505866e+00 -9.50339784e-03 -1.68243883e+01  2.65540645e-02
  1.37520890e+01 -1.42341245e-02 -4.27508408e+00]
OLS parameters: [ 0.00000000e+00  3.74356233e-03 -2.97426630e+00  9.69057072e-03
  1.25136327e+01 -7.10156122e-02 -2.40999002e+01  1.12354140e-01
  2.15580712e+01 -5.44668224e-02 -7.24703004e+00]
Convergence reached at iteration for Ridge 22131
Convergence reached at iteration for OLS 8405293
Learning rate: 0.2
MSE OLS: 0.10997884456748792, MSE Ridge: 0.10913833858639004, MSE GD OLS: 0.10997883040673997, MSE GD Ridge: 0.10770193829327168
R2 OLS: 0.39816788476202536, R2 Ridge: 0.3991367745434813, R2 GD OLS: 0.3981679182266413, R2 GD Ridge: 0.37507265066214635
Beta GD OLS: [ 0.00000000e+00  3.74359499e-03 -2.97426162e+00  9.69026586e-03
  1.25135981e+01 -7.10145701e-02 -2.40998104e+01  1.12352739e-01
  2.15579745e+01 -5.44661837e-02 -7.24699315e+00], Beta GD Ridge: [ 0.00000000e+00  1.91891764

# STOPED HERE

The examples of how to implement the methods bellow all use automatic differentiation. Can we use it? 

For now, I'll skip it.

In [None]:
#ADAgrad

n = 1000

np.random.seed(42)

x = np.linspace(-1,1, n)
y = runge(x) + 0.1*np.random.normal(0,1)

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
scaler = StandardScaler()
X_train = polynomial_features(x_train, 10)
X_test = polynomial_features(x_test, 10)
scaler.fit(X_train)
X_train_s = scaler.transform(X_train)
X_test_s = scaler.transform(X_test)

lmbda = 0.001

beta_r = Ridge_parameters(X_train_s, y_train, lmbda)
beta_o = OLS_parameters(X_train_s, y_train)
print('Ridge parameters:', beta_r)
print('OLS parameters:', beta_o)
y_ols = X_test_s @ beta_o
y_ridge = X_test_s @ beta_r
mse_ols = MSE(y_test, y_ols)
mse_ridge = MSE(y_test, y_ridge)
r2_ols = R2(y_test, y_ols)
r2_ridge = R2(y_test, y_ridge)

num_iters = 100000000

# Initialize weights for gradient descent
beta_gd_r = np.zeros(len(beta_r))
beta_gd_o = np.zeros(len(beta_o))

stopping_criteria = [1e-10]*len(beta_r)

# Hessian matrix
H = (2.0/n)* X_train_s.T @ X_train_s + 2*lmbda*np.eye(X_train_s.shape[1])
EigValues, EigVectors = np.linalg.eig(H)
print(f"Eigenvalues of Hessian Matrix:{EigValues}")

lr = 1.0 / np.max(EigValues)

# Gradient descent loop
for t in range(num_iters):
    # Compute gradients for Ridge
    grad_Ridge = Ridge_gradient(X_train_s, y_train, beta_gd_r, lmbda)
    # Calculate change with momentum
    new_change = lr * grad_Ridge + momentum * change
    # Update parameters beta
    beta_gd_r = beta_gd_r - new_change
    # Save change for next iteration
    change = new_change
    # Check for convergence
    if (np.abs(- lr * grad_Ridge) < stopping_criteria).all():
        print("Convergence reached at iteration for Ridge", t)
        break

H = (2.0/n)* X_train_s.T @ X_train_s 
EigValues, EigVectors = np.linalg.eig(H)
print(f"Eigenvalues of Hessian Matrix:{EigValues}")

lr = 1.0 / np.max(EigValues)

for t in range(num_iters):
    # Compute gradients for OLS
    grad_OLS = OLS_gradient(X_train_s, y_train, beta_gd_o)
    # Calculate change with momentum
    new_change = lr * grad_OLS + momentum * change
    # Update parameters beta
    beta_gd_o = beta_gd_o - new_change
    # Save change for next iteration
    change = new_change
    # Check for convergence
    if (np.abs(- lr * grad_OLS) < stopping_criteria).all():
        print("Convergence reached at iteration for OLS", t)
        break

y_gd_ols = X_test_s @ beta_gd_o
y_gd_ridge = X_test_s @ beta_gd_r

print(f"Learning rate: {lr}")
print(f"MSE OLS: {mse_ols}, MSE Ridge: {mse_ridge}, MSE GD OLS: {MSE(y_test, y_gd_ols)}, MSE GD Ridge: {MSE(y_test, y_gd_ridge)}")
print(f"R2 OLS: {r2_ols}, R2 Ridge: {r2_ridge}, R2 GD OLS: {R2(y_test, y_gd_ols)}, R2 GD Ridge: {R2(y_test, y_gd_ridge)}")
print(f"Beta GD OLS: {beta_gd_o}, Beta GD Ridge: {beta_gd_r}")
print("--------------------------------------------------")

In [None]:
#RMSprop

n = 1000

np.random.seed(42)

x = np.linspace(-1,1, n)
y = runge(x) + 0.1*np.random.normal(0,1)

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
scaler = StandardScaler()
X_train = polynomial_features(x_train, 10)
X_test = polynomial_features(x_test, 10)
scaler.fit(X_train)
X_train_s = scaler.transform(X_train)
X_test_s = scaler.transform(X_test)

lmbda = 0.001

beta_r = Ridge_parameters(X_train_s, y_train, lmbda)
beta_o = OLS_parameters(X_train_s, y_train)
print('Ridge parameters:', beta_r)
print('OLS parameters:', beta_o)
y_ols = X_test_s @ beta_o
y_ridge = X_test_s @ beta_r
mse_ols = MSE(y_test, y_ols)
mse_ridge = MSE(y_test, y_ridge)
r2_ols = R2(y_test, y_ols)
r2_ridge = R2(y_test, y_ridge)

lr = 0.2
num_iters = 100000000
momentum = 0.3

# Initialize weights for gradient descent
beta_gd_r = np.zeros(len(beta_r))
beta_gd_o = np.zeros(len(beta_o))

stopping_criteria = [1e-10]*len(beta_r)
change = 0.0

# Gradient descent loop
for t in range(num_iters):
    # Compute gradients for Ridge
    grad_Ridge = Ridge_gradient(X_train_s, y_train, beta_gd_r, lmbda)
    # Calculate change with momentum
    new_change = lr * grad_Ridge + momentum * change
    # Update parameters beta
    beta_gd_r = beta_gd_r - new_change
    # Save change for next iteration
    change = new_change
    # Check for convergence
    if (np.abs(- lr * grad_Ridge) < stopping_criteria).all():
        print("Convergence reached at iteration for Ridge", t)
        break

change = 0.0
for t in range(num_iters):
    # Compute gradients for OLS
    grad_OLS = OLS_gradient(X_train_s, y_train, beta_gd_o)
    # Calculate change with momentum
    new_change = lr * grad_OLS + momentum * change
    # Update parameters beta
    beta_gd_o = beta_gd_o - new_change
    # Save change for next iteration
    change = new_change
    # Check for convergence
    if (np.abs(- lr * grad_OLS) < stopping_criteria).all():
        print("Convergence reached at iteration for OLS", t)
        break

y_gd_ols = X_test_s @ beta_gd_o
y_gd_ridge = X_test_s @ beta_gd_r

print(f"Learning rate: {lr}")
print(f"MSE OLS: {mse_ols}, MSE Ridge: {mse_ridge}, MSE GD OLS: {MSE(y_test, y_gd_ols)}, MSE GD Ridge: {MSE(y_test, y_gd_ridge)}")
print(f"R2 OLS: {r2_ols}, R2 Ridge: {r2_ridge}, R2 GD OLS: {R2(y_test, y_gd_ols)}, R2 GD Ridge: {R2(y_test, y_gd_ridge)}")
print(f"Beta GD OLS: {beta_gd_o}, Beta GD Ridge: {beta_gd_r}")
print("--------------------------------------------------")

In [None]:
# ADAM

n = 1000

np.random.seed(42)

x = np.linspace(-1,1, n)
y = runge(x) + 0.1*np.random.normal(0,1)

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
scaler = StandardScaler()
X_train = polynomial_features(x_train, 10)
X_test = polynomial_features(x_test, 10)
scaler.fit(X_train)
X_train_s = scaler.transform(X_train)
X_test_s = scaler.transform(X_test)

lmbda = 0.001

beta_r = Ridge_parameters(X_train_s, y_train, lmbda)
beta_o = OLS_parameters(X_train_s, y_train)
print('Ridge parameters:', beta_r)
print('OLS parameters:', beta_o)
y_ols = X_test_s @ beta_o
y_ridge = X_test_s @ beta_r
mse_ols = MSE(y_test, y_ols)
mse_ridge = MSE(y_test, y_ridge)
r2_ols = R2(y_test, y_ols)
r2_ridge = R2(y_test, y_ridge)

lr = 0.2
num_iters = 100000000
momentum = 0.3

# Initialize weights for gradient descent
beta_gd_r = np.zeros(len(beta_r))
beta_gd_o = np.zeros(len(beta_o))

stopping_criteria = [1e-10]*len(beta_r)
change = 0.0

# Gradient descent loop
for t in range(num_iters):
    # Compute gradients for Ridge
    grad_Ridge = Ridge_gradient(X_train_s, y_train, beta_gd_r, lmbda)
    # Calculate change with momentum
    new_change = lr * grad_Ridge + momentum * change
    # Update parameters beta
    beta_gd_r = beta_gd_r - new_change
    # Save change for next iteration
    change = new_change
    # Check for convergence
    if (np.abs(- lr * grad_Ridge) < stopping_criteria).all():
        print("Convergence reached at iteration for Ridge", t)
        break

change = 0.0
for t in range(num_iters):
    # Compute gradients for OLS
    grad_OLS = OLS_gradient(X_train_s, y_train, beta_gd_o)
    # Calculate change with momentum
    new_change = lr * grad_OLS + momentum * change
    # Update parameters beta
    beta_gd_o = beta_gd_o - new_change
    # Save change for next iteration
    change = new_change
    # Check for convergence
    if (np.abs(- lr * grad_OLS) < stopping_criteria).all():
        print("Convergence reached at iteration for OLS", t)
        break

y_gd_ols = X_test_s @ beta_gd_o
y_gd_ridge = X_test_s @ beta_gd_r

print(f"Learning rate: {lr}")
print(f"MSE OLS: {mse_ols}, MSE Ridge: {mse_ridge}, MSE GD OLS: {MSE(y_test, y_gd_ols)}, MSE GD Ridge: {MSE(y_test, y_gd_ridge)}")
print(f"R2 OLS: {r2_ols}, R2 Ridge: {r2_ridge}, R2 GD OLS: {R2(y_test, y_gd_ols)}, R2 GD Ridge: {R2(y_test, y_gd_ridge)}")
print(f"Beta GD OLS: {beta_gd_o}, Beta GD Ridge: {beta_gd_r}")
print("--------------------------------------------------")