In [1]:
import numpy as np
import scipy.linalg
import sklearn.metrics
from sklearn.linear_model import Ridge, Lasso
import torch

import metrics
import utils

np.random.seed(12)

# Linear regression with regularization

## Ridge regression

The error function is $E(\beta) = MSE(\beta) + \alpha * ||\beta||_2^2$
$$E(\beta) = \frac{1}{n} \sum_{i=1}^n(y_i - X_i\beta)^2 + \alpha * ||\beta||_2^2$$

With $\alpha$ real number, the normalization coefficient

We can solve it with gradient descent.  
See linear_regresion to compute the gradient of MSE.  
$$\frac{\partial}{\partial \beta} \alpha * ||\beta||_2^2 = 2 * \alpha * \beta$$

$$\frac{\partial E(\beta)}{\partial \beta} = \frac{\partial MSE(\beta)}{\partial \beta} + 2 * \alpha * \beta$$

In [2]:
def mse(y, y_pred):
    n = len(y)
    return (1 / n) * np.sum((y - y_pred)**2)

def ridge_error(X, y, w, alpha):
    y_pred = X @ w
    return mse(y, y_pred) + alpha * w.T @ w

In [3]:
def mse_dy_pred(y, y_pred):
    n = len(y)
    return (2 / n) * (y_pred - y)

def matmul_dw(X, dout):
    return X.T @ dout

def norm2_prime(x):
    return 2 * x

In [4]:
X = np.random.randn(73, 4)
y = 3 * X[:, 0] + 2 * X[:, 1]**2 + 5 * X[:, 2] - 3.4 * X[:, 3]
y += np.random.randn(73) * 0.2
w = np.random.randn(4)
alpha = 0.3
print(X.shape)
print(y.shape)
print(w.shape)

tX = torch.tensor(X, requires_grad=True)
ty = torch.tensor(y, requires_grad=False)
tw = torch.tensor(w, requires_grad=True)

ty_pred = torch.matmul(tX, tw)
criterion = torch.nn.MSELoss()
tmse = criterion(ty_pred, ty)
tloss = tmse + alpha * torch.dot(tw, tw)
tloss.backward()

(73, 4)
(73,)
(4,)


In [5]:
loss = ridge_error(X, y, w, alpha)
loss_sol = tloss.data.numpy()

print(loss)
print(loss_sol)
print(metrics.tdist(loss, loss_sol))

54.418558937558814
54.4185589375588
1.4210854715202004e-14


In [6]:
y_pred = X @ w
dy_pred = mse_dy_pred(y, y_pred)
dw = matmul_dw(X, dy_pred) + alpha * norm2_prime(w)
dw_sol = tw.grad.data.numpy()

print(dw)
print(dw_sol)
print(metrics.tdist(dw, dw_sol))

[ -3.17626852   2.52904771 -12.1927451    2.38583691]
[ -3.17626852   2.52904771 -12.1927451    2.38583691]
3.66205343881779e-15


In [7]:
def fit_sk_ridge(X, y, alpha):
    m = Ridge(alpha, fit_intercept=False)
    m.fit(X, y)
    return m.coef_

def fit_sgd(X, y, alpha, lr, nepochs):
    w = np.random.randn(X.shape[1])
    
    for i in range(nepochs):
        y_pred = X @ w
        dy_pred = mse_dy_pred(y, y_pred)
        dw = matmul_dw(X, dy_pred) + alpha * norm2_prime(w)
        w -= lr * dw
    
    return w

w1 = fit_sk_ridge(X, y, alpha)
w2 = fit_sgd(X, y, alpha, 0.001, 10000)
print(w1)
print(w2)
print(metrics.tdist(w1, w2))

[ 2.85207477 -0.3950604   4.78187303 -3.77590846]
[ 2.19942873 -0.36638889  3.69827639 -2.69610033]
1.6634110056006282


## Lasso regression

The error function is $E(\beta) = MSE(\beta) + \alpha * ||\beta||_1$
$$E(\beta) = \frac{1}{n} \sum_{i=1}^n(y_i - X_i\beta)^2 + \alpha * ||\beta||_1$$

With $\alpha$ real number, the normalization coefficient

We can solve it with gradient descent.  
See linear_regresion to compute the gradient of MSE.  
$$\frac{\partial}{\partial \beta} \alpha * ||\beta||_1 =  \alpha * sign(\beta)$$

$$\frac{\partial E(\beta)}{\partial \beta} = \frac{\partial MSE(\beta)}{\partial \beta} + \alpha * sign(\beta)$$

In [14]:
def mse(y, y_pred):
    n = len(y)
    return (1 / n) * np.sum((y - y_pred)**2)

def lasso_error(X, y, w, alpha):
    y_pred = X @ w
    return mse(y, y_pred) + alpha * np.linalg.norm(w, ord=1)

def norm1_prime(x):
    return np.sign(x)

In [10]:
X = np.random.randn(73, 4)
y = 3 * X[:, 0] + 2 * X[:, 1]**2 + 5 * X[:, 2] - 3.4 * X[:, 3]
y += np.random.randn(73) * 0.2
w = np.random.randn(4)
alpha = 0.3
print(X.shape)
print(y.shape)
print(w.shape)

tX = torch.tensor(X, requires_grad=True)
ty = torch.tensor(y, requires_grad=False)
tw = torch.tensor(w, requires_grad=True)

ty_pred = torch.matmul(tX, tw)
criterion = torch.nn.MSELoss()
tmse = criterion(ty_pred, ty)
tloss = tmse + alpha * torch.norm(tw, p=1)
tloss.backward()

(73, 4)
(73,)
(4,)


In [15]:
loss = lasso_error(X, y, w, alpha)
loss_sol = tloss.data.numpy()

print(loss)
print(loss_sol)
print(metrics.tdist(loss, loss_sol))

41.24025952134388
41.24025952134388
0.0


In [17]:
y_pred = X @ w
dy_pred = mse_dy_pred(y, y_pred)
dw = matmul_dw(X, dy_pred) + alpha * norm1_prime(w)
dw_sol = tw.grad.data.numpy()

print(dw)
print(dw_sol)
print(metrics.tdist(dw, dw_sol))

[-3.49040297 -0.9919511  -9.04640128  4.16354108]
[-3.49040297 -0.9919511  -9.04640128  4.16354108]
1.831026719408895e-15


In [18]:
def fit_sk_lasso(X, y, alpha):
    m = Lasso(alpha, fit_intercept=False)
    m.fit(X, y)
    return m.coef_

def fit_lasso(X, y, alpha, lr, nepochs):
    w = np.random.randn(X.shape[1])
    
    for i in range(nepochs):
        y_pred = X @ w
        dy_pred = mse_dy_pred(y, y_pred)
        dw = matmul_dw(X, dy_pred) + alpha * norm1_prime(w)
        w -= lr * dw
    
    return w

w1 = fit_sk_lasso(X, y, alpha)
w2 = fit_lasso(X, y, alpha, 0.001, 10000)
print(w1)
print(w2)
print(metrics.tdist(w1, w2))

[ 2.02274191  0.47450009  4.70531987 -3.3719941 ]
[ 2.23302879  0.64797166  4.86608616 -3.53701612]
0.3569187942384015
