In [1]:
import numpy as np
import scipy.linalg
import sklearn.metrics
from sklearn.linear_model import LinearRegression, SGDRegressor
import torch

import metrics
import utils

np.random.seed(12)

# Linear regression

Let $X$ training input of size $n * p$.  
It contains $n$ examples, each with $p$ features.  
Let $y$ traiing target of size $n$.  
Each input $X_i$, vector of size $p$, is associated with it's target, $y_i$, a real number.  
Linear regression tries to fit a linear model to predict the target $y$ of a new input vector $x$.

The predictions of the model are denoted $\hat{y}$.
$$\hat{y_i} = X_i\beta = \sum_{j=1}^{p} X_{ij}\beta_j$$
$$\hat{y} = X\beta$$

Where $\beta$, vector of size $p$, are the model parameters.  
The goal is to find $\beta$ in order the minimizes the Mean Squared Error.
$$MSE(\beta) = \frac{1}{n} \sum_{i=1}^n(y_i - \hat{y}_i)^2$$
$$MSE(\beta) = \frac{1}{n} \sum_{i=1}^n(y_i - X_i\beta)^2$$

In [2]:
X = np.random.randn(73, 4)
y = 3 * X[:, 0] + 2 * X[:, 1]**2 + 5 * X[:, 2] - 3.4 * X[:, 3]
y += np.random.randn(73) * 0.2
w = np.random.randn(4)
print(X.shape)
print(y.shape)
print(w.shape)

tX = torch.tensor(X, requires_grad=True)
ty = torch.tensor(y, requires_grad=False)
tw = torch.tensor(w, requires_grad=True)

(73, 4)
(73,)
(4,)


In [3]:
def mse(y, y_pred):
    n = len(y)
    return (1 / n) * np.sum((y - y_pred)**2)

In [4]:
y_pred = X @ w
ty_pred = torch.matmul(tX, tw)
utils.save_grad(ty_pred)

criterion = torch.nn.MSELoss()
tloss = criterion(ty_pred, ty)
tloss.backward()

err = mse(y, y_pred)
err_sol = tloss.data.numpy()
print(err)
print(err_sol)
print(metrics.tdist(err, err_sol))

53.37392132995149
53.37392132995148
1.4210854715202004e-14


## Closed form solution

It's possible to find the direct solution by solving for the gradient of the MSE relative to the weights equals 0 (gradient details in gradient descent part).

$$\frac{\partial MSE(\beta)}{\partial \beta} = \frac{2}{n} X^T (\hat{y} - y)$$
$$X^T (y - X\beta) = 0$$
$$\hat{\beta} = (X^TX)^{-1}X^Ty$$

$\hat{\beta}$ are the best parameters

In [5]:
def fit_sk_lq(X, y):
    m = LinearRegression(fit_intercept=False)
    m.fit(X, y)
    return m.coef_

def fit_cf(X, y):
    return np.linalg.inv(X.T @ X) @ X.T @ y

w1 = fit_sk_lq(X, y)
w2 = fit_cf(X, y)
print(w1)
print(w2)
print(metrics.tdist(w1, w2))

[ 2.86409491 -0.39515774  4.80155806 -3.7966973 ]
[ 2.86409491 -0.39515774  4.80155806 -3.7966973 ]
9.064649753153016e-15


## Closest solution by orthogonal projection

Another solution use orthogonal projection.  
Compute the $QR$ projection of $X$: $X = QR$.  
$$\hat{\beta} = R^{-1}Q^Ty$$

$\hat{\beta}$ can be found by solving the upper triangular system $R\hat{\beta}=c$, with $c = Q^Ty$

In [6]:
def fit_qr(X, y):
    Q, R = np.linalg.qr(X)
    c = np.dot(Q.T, y)
    return scipy.linalg.solve_triangular(R, c)

w1 = fit_sk_lq(X, y)
w2 = fit_qr(X, y)
print(w1)
print(w2)
print(metrics.tdist(w1, w2))

[ 2.86409491 -0.39515774  4.80155806 -3.7966973 ]
[ 2.86409491 -0.39515774  4.80155806 -3.7966973 ]
1.1107918681964452e-14


## Gradient descent

This technique works by computing the gradient of the error relative to the 
weights, and update the weights to minimize the eror.

$$\frac{\partial MSE(\beta)}{\partial \hat{y}_j} = \frac{2}{n}(\hat{y_j} - y_j)$$
$$\frac{\partial MSE(\beta)}{\partial \hat{y}} = \frac{2}{n}(\hat{y} - y)$$
$$\frac{\partial MSE(\beta)}{\partial y_j} = \frac{2}{n}(y_j - \hat{y_j})$$
$$\frac{\partial MSE(\beta)}{\partial y} = \frac{2}{n}(y - \hat{y})$$

In [7]:
def mse_dy_pred(y, y_pred):
    n = len(y)
    return (2 / n) * (y_pred - y)

def mse_dy(y, y_pred):
    n = len(y)
    return (2 / n) * (y - y_pred)

In [8]:
dy_pred = mse_dy_pred(y, y_pred)
dy_pred_sol = utils.get_grad(ty_pred).data.numpy()
print(metrics.tdist(dy_pred, dy_pred_sol))

9.484532191350646e-17


The chain rule of partial derivatives:
$$\frac{\partial E}{\partial u} = \sum_{x_i \in preds(E)} \frac{\partial E}{\partial x_i} * \frac{\partial x_i}{\partial u}$$

Where $preds(E)$ representents the required variables to compute $E$.

$$\frac{\partial MSE(\beta)}{\partial \beta_i} = \sum_{j=1}^n \frac{\partial MSE(\beta)}{\partial \hat{y}_j} * \frac{\partial \hat{y}_j}{\partial \beta_i}$$

$$\frac{\partial \hat{y}_j}{\partial \beta_i} = X_{ji}$$
Let $e = \frac{\partial MSE(\beta)}{\partial \hat{y}} = \frac{2}{n}(\hat{y} - y)$

$$\frac{\partial MSE(\beta)}{\partial \beta_i} = \sum_{j=1}^n X^T_{ij} e_j$$ 
$$\frac{\partial MSE(\beta)}{\partial \beta} = X^T e = \frac{2}{n} X^T (\hat{y} - y)$$

In [9]:
def matmul_dw(X, dout):
    return X.T @ dout

In [10]:
dw = matmul_dw(X, dy_pred)
dw_sol = tw.grad.data.numpy()
print(metrics.tdist(dw, dw_sol))

3.66205343881779e-15


## Batch Gradient descent
All the dataset is fit at each epoch.  
Let $\mu$ the learning rate, number close to $0$.  
At each epoch, compute the gradient of the whole dataset and perform the following update:
$$\beta = \beta - \mu * \frac{\partial MSE(\beta)}{\partial \beta}$$

In [11]:
def fit_sk_sgd(X, y, w_init, lr, nepochs):
    
    m = SGDRegressor(fit_intercept=False, learning_rate='constant',
                    eta0=lr)
    
    m.coef_ = w_init.copy()
    m.intercept_ = np.array([0])
    
    for i in range(nepochs):
        m.partial_fit(X, y)
    
    return m.coef_

def fit_sgd(X, y, w_init, lr, nepochs):
    
    w = w_init.copy()
    
    for i in range(nepochs):
        y_pred = X @ w
        dy_pred = mse_dy_pred(y, y_pred)
        dw = matmul_dw(X, dy_pred)
        w -= lr * dw
    
    return w


w_init = np.random.randn(X.shape[1])
w1 = fit_sk_lq(X, y)
w2 = fit_sk_sgd(X, y, w_init, 0.01 / X.shape[0] * 2, 1000)
w3 = fit_sgd(X, y, w_init, 0.01, 1000)

print(w1)
print(w2)
print(w3)
print(metrics.tdist(w1, w2))
print(metrics.tdist(w1, w3))
print(metrics.tdist(w2, w3))

[ 2.86409491 -0.39515774  4.80155806 -3.7966973 ]
[ 2.86394295 -0.39633776  4.80088141 -3.7964394 ]
[ 2.86409476 -0.39515788  4.80155796 -3.79669702]
0.0013928060078077804
3.649091577064423e-07
0.0013925685810385509


## Linear regresion with multiple targets

Linear regression can be extend to multiple targets for each input, it's the same that solving one linear regression problem for each target.  
Let $X$ training input of size $n * p$.  
It contains $n$ examples, each with $p$ features.  
Let $y$ traiing target, matrix of size $n*m$. Each example has $m$ targets.    
Each input $X_i$, vector of size $p$, is associated with it's target, $y_i$, a vector of size $n$. 

The predictions of the model are denoted $\hat{y}$.
$$\hat{y_i} = X_i W$$
$$\hat{y} = XW$$

Where $W$, matrice of size $p * m$, are the model parameters.  
The goal is to find $W$ in order the minimizes the Mean Squared Error.
$$MSE(W) = \frac{1}{n*m} \sum_{i=1}^n||y_i - \hat{y}_i||^2$$
$$MSE(W) = \frac{1}{n*m} \sum_{i=1}^n||y_i - X_iW||^2$$

In [12]:
X = np.random.randn(73, 4)
y =  3 * X[:, 0] + 2 * X[:, 1] * X[:, 1] + 5 * X[:, 2] - 3.4 * X[:, 3]
y += np.random.randn(73) * 0.2

y2 =  3.3 * X[:, 0] - 0.4 * X[:, 1] * X[:, 1] + 1 * X[:, 2] - 2.5 * X[:, 3]
y2 += np.random.randn(73) * 0.2
y3 =  0.2*X[:, 0] - 0.4*X[:, 1]**3 * + 1.9 * X[:, 2] - 2.56 * X[:, 3]
y3 += np.random.randn(73) * 0.2

y = np.column_stack((y, y2, y3))
W = np.random.randn(4, 3)
print(X.shape)
print(y.shape)
print(W.shape)

tX = torch.tensor(X, requires_grad=True)
ty = torch.tensor(y, requires_grad=False)
tW = torch.tensor(W, requires_grad=True)

(73, 4)
(73, 3)
(4, 3)


The formulas to compute the gradient are similar.

$$\frac{\partial MSE(W)}{\partial \hat{y}} = \frac{2}{n*m}(\hat{y} - y)$$
$$\frac{\partial MSE(W)}{\partial y} = \frac{2}{n*m}(y - \hat{y})$$

Let $e = \frac{\partial MSE(W)}{\partial \hat{y}} = \frac{2}{n*m}(\hat{y} - y)$

$$\frac{\partial MSE(W)}{\partial W_{ij}} = \sum_{k=1}^n X^T_{ik} e_{kj}$$ 
$$\frac{\partial MSE(W)}{\partial W} = X^T e = \frac{2}{n*m} X^T (\hat{y} - y)$$

In [13]:
def mse(y, y_pred):
    n = y.shape[0]
    m = y.shape[1]
    return (1 / (n*m)) * np.sum((y - y_pred)**2)

def mse_dy_pred(y, y_pred):
    n = y.shape[0]
    m = y.shape[1]
    return (2 / (n*m)) * (y_pred - y)

def mse_dy(y, y_pred):
    n = y.shape[0]
    m = y.shape[1]
    return (2 / (n*m)) * (y - y_pred)

In [14]:
y_pred = X @ W
ty_pred = torch.matmul(tX, tW)
utils.save_grad(ty_pred)

criterion = torch.nn.MSELoss()
tloss = criterion(ty_pred, ty)
tloss.backward()

err = mse(y, y_pred)
err_sol = tloss.data.numpy()
print(err)
print(err_sol)
print(metrics.tdist(err, err_sol))

46.36640883620705
46.36640883620706
7.105427357601002e-15


In [15]:
dy_pred = mse_dy_pred(y, y_pred)
dy_pred_sol = utils.get_grad(ty_pred).data.numpy()
print(metrics.tdist(dy_pred, dy_pred_sol))

4.787324280642369e-17


In [16]:
dW = matmul_dw(X, dy_pred)
dW_sol = tW.grad.data.numpy()
print(metrics.tdist(dW, dW_sol))

1.5059797815742444e-15


For Gradient descent, the algorithms are the same

In [18]:
w_init = np.random.randn(X.shape[1], y.shape[1])
w1 = fit_sk_lq(X, y).T
w2 = fit_sgd(X, y, w_init, 0.01, 10000)

print(w1)
print(w2)
print(metrics.tdist(w1, w2))

[[ 3.21515455  3.26578491  0.19849848]
 [-0.17254224  0.07420195 -0.88803732]
 [ 4.94163407  0.96301013  0.04748709]
 [-3.79287317 -2.40189836 -2.77119892]]
[[ 3.21515455  3.26578491  0.19849848]
 [-0.17254224  0.07420195 -0.88803732]
 [ 4.94163407  0.96301013  0.04748709]
 [-3.79287317 -2.40189836 -2.77119892]]
1.646949566984573e-13
