# Kodekladd

According to Week 36:

OLS:
$$
\nabla_{\theta} C(\theta) = \frac{2}{n}X^T(X\theta - \mathbf{y})
$$

Ridge:
$$
\frac{\partial C(\boldsymbol{X},\boldsymbol{\theta})}{\partial \boldsymbol{\theta}}=-\frac{2}{n}\boldsymbol{X}^T(\boldsymbol{y}-\boldsymbol{X}\boldsymbol{\theta})+2\lambda \theta
$$

Lasso:
$$
\frac{\partial C(\boldsymbol{X},\boldsymbol{\theta})}{\partial \boldsymbol{\theta}}=-\frac{2}{n}\boldsymbol{X}^T(\boldsymbol{y}-\boldsymbol{X}\boldsymbol{\theta})+\lambda sgn(\boldsymbol{\theta})
$$

In [1]:
# Imports
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from functions import runge, MSE, R2, Ridge_parameters, OLS_parameters
from functions import OLS_gradient, Ridge_gradient, polynomial_features

In [None]:
#Analysis of role of learning rate

n = 1000

np.random.seed(42)

x = np.linspace(-1,1, n)
y = runge(x) + 0.1*np.random.normal(0,1)

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
scaler = StandardScaler()
X_train = polynomial_features(x_train, 10)
X_test = polynomial_features(x_test, 10)
scaler.fit(X_train)
X_train_s = scaler.transform(X_train)
X_test_s = scaler.transform(X_test)
y_offset = np.mean(y_train)

lmbda = 0.001

beta_r = Ridge_parameters(X_train_s, y_train, lmbda)
beta_o = OLS_parameters(X_train_s, y_train)
print('Ridge parameters:', beta_r)
print('OLS parameters:', beta_o)
y_ols = X_test_s @ beta_o + y_offset
y_ridge = X_test_s @ beta_r + y_offset
mse_ols = MSE(y_test, y_ols)
mse_ridge = MSE(y_test, y_ridge)
r2_ols = R2(y_test, y_ols)
r2_ridge = R2(y_test, y_ridge)

learning_rates = [0.01, 0.1, 0.2, 0.3]

mse_gd_ols = []
mse_gd_ridge = []

r2_gd_ols = []
r2_gd_ridge = []

beta_gd_ols = []
beta_gd_ridge = []

for lr in learning_rates:

    num_iters = 100000000

    # Initialize weights for gradient descent
    beta_gd_r = np.zeros(len(beta_r))
    beta_gd_o = np.zeros(len(beta_o))

    stopping_criteria = [1e-10]*len(beta_r)

    # Gradient descent loop
    for t in range(num_iters):
        # Compute gradients for Ridge
        grad_Ridge = Ridge_gradient(X_train_s, y_train, beta_gd_r, lmbda)
        # Update parameters beta
        beta_gd_r = beta_gd_r - lr * grad_Ridge
        if (np.abs(- lr * grad_Ridge) < stopping_criteria).all():
            print("Convergence reached at iteration for Ridge", t)
            break
    
    for t in range(num_iters):
        # Compute gradients for OLS
        grad_OLS = OLS_gradient(X_train_s, y_train, beta_gd_o)
        # Update parameters beta
        beta_gd_o = beta_gd_o - lr * grad_OLS
        if (np.abs(- lr * grad_OLS) < stopping_criteria).all():
            print("Convergence reached at iteration for OLS", t)
            break

    beta_gd_ols.append(beta_gd_o)
    beta_gd_ridge.append(beta_gd_r)

    y_gd_ols = X_test_s @ beta_gd_ols[-1] + y_offset
    y_gd_ridge = X_test_s @ beta_gd_ridge[-1] + y_offset

    mse_gd_ols.append(MSE(y_test, y_gd_ols))
    mse_gd_ridge.append(MSE(y_test, y_gd_ridge))

    r2_gd_ols.append(R2(y_test, y_gd_ols))
    r2_gd_ridge.append(R2(y_test, y_gd_ridge))

    print(f"Learning rate: {lr}")
    print(f"MSE OLS: {mse_ols}, MSE Ridge: {mse_ridge}, MSE GD OLS: {MSE(y_test, y_gd_ols)}, MSE GD Ridge: {MSE(y_test, y_gd_ridge)}")
    print(f"R2 OLS: {r2_ols}, R2 Ridge: {r2_ridge}, R2 GD OLS: {R2(y_test, y_gd_ols)}, R2 GD Ridge: {R2(y_test, y_gd_ridge)}")
    for beta3, beta4 in zip(beta_gd_ols, beta_gd_ridge):
        print(f"Beta GD OLS: {beta3}, Beta GD Ridge: {beta4}")
    print("--------------------------------------------------")

Ridge parameters: [ 0.00000000e+00  5.54300134e-03 -2.59074814e+00 -7.44146292e-03
  9.69505866e+00 -9.50339784e-03 -1.68243883e+01  2.65540645e-02
  1.37520890e+01 -1.42341245e-02 -4.27508408e+00]
OLS parameters: [ 0.00000000e+00  3.74356233e-03 -2.97426630e+00  9.69057072e-03
  1.25136327e+01 -7.10156122e-02 -2.40999002e+01  1.12354140e-01
  2.15580712e+01 -5.44668224e-02 -7.24703004e+00]
Convergence reached at iteration for Ridge 496103
Learning rate: 0.01
MSE OLS: 0.001484546631866829, MSE Ridge: 0.0017395603039358032, MSE GD OLS: 0.001483336902967267, MSE GD Ridge: 0.00975593448081572
R2 OLS: 0.9800049681741101, R2 Ridge: 0.976567517228484, R2 GD OLS: 0.980021209527389, R2 GD Ridge: 0.8688667735767661
Beta GD OLS: [ 0.00000000e+00  3.78952978e-03 -2.96767152e+00  9.26153929e-03
  1.24648671e+01 -6.95490864e-02 -2.39734940e+01  1.10383619e-01
  2.14220328e+01 -5.35680562e-02 -7.19511517e+00], Beta GD Ridge: [ 0.00000000e+00  1.91892035e-04 -1.02262351e+00 -6.90115050e-03
  1.219243

  gradient = 2.0/n * (X.T @ (X @ theta) - X.T @ y) + 2*lam*theta
  gradient = 2.0/n * (X.T @ (X @ theta) - X.T @ y) + 2*lam*theta
  gradient = 2.0/n * (X.T @ X @ theta - X.T @ y)
  gradient = 2.0/n * (X.T @ X @ theta - X.T @ y)
  beta_gd_o = beta_gd_o - lr * grad_OLS


Learning rate: 0.3
MSE OLS: 0.001484546631866829, MSE Ridge: 0.0017395603039358032, MSE GD OLS: nan, MSE GD Ridge: nan
R2 OLS: 0.9800049681741101, R2 Ridge: 0.976567517228484, R2 GD OLS: nan, R2 GD Ridge: nan
Beta GD OLS: [ 0.00000000e+00  3.78952978e-03 -2.96767152e+00  9.26153929e-03
  1.24648671e+01 -6.95490864e-02 -2.39734940e+01  1.10383619e-01
  2.14220328e+01 -5.35680562e-02 -7.19511517e+00], Beta GD Ridge: [ 0.00000000e+00  1.91892035e-04 -1.02262351e+00 -6.90115050e-03
  1.21924335e+00  6.72252324e-03 -4.72602678e-02  5.79558509e-03
 -3.91830521e-01 -7.44612405e-03  1.65151350e-02]
Beta GD OLS: [ 0.00000000e+00  3.74362765e-03 -2.97425693e+00  9.68996101e-03
  1.25135634e+01 -7.10135281e-02 -2.40997206e+01  1.12351339e-01
  2.15578779e+01 -5.44655451e-02 -7.24695626e+00], Beta GD Ridge: [ 0.00000000e+00  1.91891734e-04 -1.02262430e+00 -6.90113722e-03
  1.21924665e+00  6.72253352e-03 -4.72643298e-02  5.79551506e-03
 -3.91829683e-01 -7.44607374e-03  1.65158504e-02]
Beta GD OLS: 

In [3]:
#Validation of everything

from sklearn.linear_model import LinearRegression
# Imports
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from functions import runge, MSE, R2, Ridge_parameters, OLS_parameters
from functions import OLS_gradient, Ridge_gradient, polynomial_features

#My gradient descent implementation

n = 1000

np.random.seed(42)

x = np.linspace(-1,1, n)
#y = runge(x) + 0.1*np.random.normal(0,1, x.shape)
y = 1 + 2*x

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)
scaler = StandardScaler()
X_train = polynomial_features(x_train, 1)
X_test = polynomial_features(x_test, 1)
scaler.fit(X_train)
X_train_s = scaler.transform(X_train)
X_test_s = scaler.transform(X_test)

# Scale data before fitting the OLS model
scaler_x = StandardScaler()
X_train_s = scaler_x.fit_transform(X_train)
ols_model = LinearRegression(fit_intercept=False)
ols_model.fit(X_train_s, y_train)
print("OLS coefficients (scikit-learn):", ols_model.coef_)

lmbda = 0.001

beta_r = Ridge_parameters(X_train_s, y_train, lmbda)
beta_o = OLS_parameters(X_train_s, y_train)
print('Ridge parameters:', beta_r)
print('OLS parameters:', beta_o)
y_ols = X_test_s @ beta_o
y_ridge = X_test_s @ beta_r
mse_ols = MSE(y_test, y_ols)
mse_ridge = MSE(y_test, y_ridge)
r2_ols = R2(y_test, y_ols)
r2_ridge = R2(y_test, y_ridge)

learning_rates = [0.001] #, 0.01, 0.1, 0.2, 0.3]

mse_gd_ols = []
mse_gd_ridge = []

r2_gd_ols = []
r2_gd_ridge = []

beta_gd_ols = []
beta_gd_ridge = []

for lr in learning_rates:

    num_iters = 100000000

    # Initialize weights for gradient descent
    beta_gd_r = np.zeros(len(beta_r))
    beta_gd_o = np.zeros(len(beta_o))

    stopping_criteria = [1e-10]*len(beta_r)

    # Gradient descent loop
    for t in range(num_iters):
        # Compute gradients for Ridge
        grad_Ridge = Ridge_gradient(X_train_s, y_train, beta_gd_r, lmbda)
        # Update parameters beta
        beta_gd_r = beta_gd_r - lr * grad_Ridge
        if (np.abs(- lr * grad_Ridge) < stopping_criteria).all():
            print("Convergence reached at iteration for Ridge", t)
            break
    
    for t in range(num_iters):
        # Compute gradients for OLS
        grad_OLS = OLS_gradient(X_train_s, y_train, beta_gd_o)
        # Update parameters beta
        beta_gd_o = beta_gd_o - lr * grad_OLS
        if (np.abs(- lr * grad_OLS) < stopping_criteria).all():
            print("Convergence reached at iteration for OLS", t)
            break

    print("Final beta GD OLS:", beta_gd_o)
    print("Final beta GD Ridge:", beta_gd_r)
    beta_gd_ols.append(beta_gd_o)
    beta_gd_ridge.append(beta_gd_r)

    y_gd_ols = X_test_s @ beta_gd_ols[-1]
    y_gd_ridge = X_test_s @ beta_gd_ridge[-1]

    mse_gd_ols.append(MSE(y_test, y_gd_ols))
    mse_gd_ridge.append(MSE(y_test, y_gd_ridge))

    r2_gd_ols.append(R2(y_test, y_gd_ols))
    r2_gd_ridge.append(R2(y_test, y_gd_ridge))

    print(f"Learning rate: {lr}")
    print(f"MSE OLS: {mse_ols}, MSE Ridge: {mse_ridge}, MSE GD OLS: {MSE(y_test, y_gd_ols)}, MSE GD Ridge: {MSE(y_test, y_gd_ridge)}")
    print(f"R2 OLS: {r2_ols}, R2 Ridge: {r2_ridge}, R2 GD OLS: {R2(y_test, y_gd_ols)}, R2 GD Ridge: {R2(y_test, y_gd_ridge)}")
    for beta3, beta4 in zip(beta_gd_ols, beta_gd_ridge):
        print(f"Beta GD OLS: {beta3}, Beta GD Ridge: {beta4}")
    print("--------------------------------------------------")

OLS coefficients (scikit-learn): [0.         1.15848065]
Ridge parameters: [0.         1.15847921]
OLS parameters: [0.         1.15848065]
Convergence reached at iteration for Ridge 8463
Convergence reached at iteration for OLS 8471
Final beta GD OLS: [0.        1.1584806]
Final beta GD Ridge: [0.         1.15732328]
Learning rate: 0.001
MSE OLS: 0.9834027777777774, MSE Ridge: 0.9834028810783095, MSE GD OLS: 0.9834027813351598, MSE GD Ridge: 0.9834866472041199
R2 OLS: 0.5712624740324754, R2 Ridge: 0.5712624483045504, R2 GD OLS: 0.5712624731464901, R2 GD Ridge: 0.5712413422155926
Beta GD OLS: [0.        1.1584806], Beta GD Ridge: [0.         1.15732328]
--------------------------------------------------
