In [12]:
import time
import numpy as np
import torch
from torch import nn

First, we assume that the relationship
between features $\mathbf{x}$ and target $y$
is approximately linear,
i.e., that the conditional mean $E[Y \mid X=\mathbf{x}]$
can be expressed as a weighted sum
of the features $\mathbf{x}$.
This setup allows that the target value
may still deviate from its expected value
on account of observation noise.
Next, we can impose the assumption that any such noise
is well-behaved, following a Gaussian distribution.
Typically, we will use $n$ to denote
the number of examples in our dataset.
We use superscripts to enumerate samples and targets,
and subscripts to index coordinates.
More concretely,
$\mathbf{x}^{(i)}$ denotes the $i$-th sample
and $x_j^{(i)}$ denotes its $j$-th coordinate.

In [15]:
# generate synthetic data
class RegressionData():
    def __init__(self, w, b, noise=0.01, num_train=1000, num_val=1000, batch_size=32):
        self.w = w
        self.b = b
        self.num_train = num_train
        self.num_val = num_val
        self.batch_size = batch_size

        self.n = self.num_train + self.num_val
        self.X = torch.randn(self.n, self.w.shape[0])
        self.noise = torch.randn(self.n, 1) * noise
        self.Y = self.X @ w.reshape(-1, 1) + b + self.noise

        self.X_train, self.Y_train = self.X[:self.num_train], self.Y[:self.num_train]
        self.X_val, self.Y_val = self.X[self.num_train:], self.Y[self.num_train:]

    def get_data(self, batch_size, train=True):
        return (self.X_train, self.Y_train) if train else (self.X_val, self.Y_val)

In [19]:
gen = RegressionData(
    w = torch.tensor([2, -3.4]),
    b = 4.2,
)

X_train, Y_train = gen.get_data(gen.batch_size, train=True)

In [24]:
dataset = torch.utils.data.TensorDataset(X_train, Y_train)
data_iter = torch.utils.data.DataLoader(dataset, batch_size=gen.batch_size, shuffle=True)

In [25]:
len(dataset), len(data_iter)

(1000, 32)

In [26]:
class LinearModel(nn.Module):
    def __init__(self, num_inputs, sigma=0.01):
        super().__init__()
        self.w = torch.normal(0, sigma, size=(num_inputs, 1), requires_grad=True)
        self.b = torch.zeros(1, requires_grad=True)

    def forward(self, X):
        return X @ self.w + self.b

In [33]:
model = LinearModel(num_inputs=2)

In [43]:
class SGD(torch.optim.Optimizer):
    def __init__(self, params, lr):
        defaults = dict(lr=lr)
        super().__init__(params, defaults)

    def step(self, closure=None):
        """
        Performs a single optimization step.
        """
        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                p.data -= group['lr'] * p.grad.data

    def zero_grad(self):
        """
        clear the gradients of all optimized parameters
        """
        for group in self.param_groups:
            for p in group['params']:
                if p.grad is None:
                    continue
                p.grad.data.zero_()

In [44]:
# loss
def squared_loss(y_hat, y):
    return (y_hat - y.reshape(y_hat.shape)) ** 2 / 2

In [50]:
# optimization
def train(model, data_iter, loss, optimizer, num_epochs):
    for epoch in range(1, num_epochs + 1):
        for X, y in data_iter:
            y_hat = model(X)
            l = loss(y_hat, y)
            l.sum().backward()
            optimizer.step()
            optimizer.zero_grad()
        print(f'epoch {epoch}, loss: {l.mean():f}')

In [51]:
train(model, data_iter, squared_loss, SGD([model.w, model.b], lr=0.03), num_epochs=3)

epoch 1, loss: 0.000066
epoch 2, loss: 0.000044
epoch 3, loss: 0.000020


In [57]:
W_true = gen.w
b_true = gen.b

W_model = model.w
b_model = model.b

with torch.no_grad():
    print(f"error in estimating w: {W_true - W_model.reshape(-1)}")
    print(f"error in estimating b: {b_true - b_model.reshape(-1)}")

# Here we have used `torch.no_grad()` to avoid tracking the gradient. 
# When performing any external operations outside of training loop, 
# we don't want to accumulate the gradient of those operations. 

error in estimating w: tensor([-0.0002, -0.0008])
error in estimating b: tensor([0.0021])
