In [None]:
%load_ext jupyter_black

In [None]:
import matplotlib.pyplot as plt
import numpy as np
import torch
from torch import nn

# Generate Data

In [None]:
# Definition of the function to be estimated
def function_to_be_estimated(
    x: torch.Tensor, random_seed: int = 0, noise: bool = True, noise_level: float = 5
) -> torch.Tensor:
    """Evaluate the ground-truth line ``y = 2 * x - 1``, optionally with Gaussian noise.

    Args:
        x: Input tensor of any shape.
        random_seed: Seed handed to ``torch.manual_seed`` before the noise is drawn.
        noise: When True, add ``noise_level * randn`` to every element.
        noise_level: Multiplier applied to the standard-normal noise.

    Returns:
        A new tensor with the same shape as ``x``; ``x`` itself is left untouched.

    Note:
        ``torch.manual_seed`` reseeds PyTorch's global generator on every call,
        including calls with ``noise=False``, so later random draws in the
        notebook (e.g. model initialisation) are affected by calling this.
    """
    torch.manual_seed(random_seed)
    y = 2 * x - 1  # builds a fresh tensor, so the in-place add below cannot alter x
    if noise:
        # Sample using the full shape of x instead of x.shape[0], x.shape[1]:
        # 2-D inputs get exactly the same draws, and 1-D / N-D inputs no longer
        # fail with an IndexError.
        y += noise_level * torch.randn(x.shape)
    return y

In [None]:
# Random sample generation
torch.manual_seed(0)
n_samples = 50
# torch.rand draws from [0, 1); scaling by 20 and shifting by -5 gives inputs in [-5, 15)
x_train = 20 * torch.rand(n_samples, 1) - 5
# Noisy observations of the target function (default seed and noise level)
y_train = function_to_be_estimated(x_train)

# Plot the samples
plt.figure(figsize=(5, 5))
plt.scatter(x_train, y_train)
# `visible` is a boolean flag; the string "on" only worked because it is truthy
plt.grid(True)
plt.xlabel("x")
plt.ylabel("y")

# Define the Model

In [None]:
class LinearModel(nn.Module):
    """One-input, one-output affine model wrapping a single ``nn.Linear`` layer."""

    def __init__(self):
        """Register the only learnable layer: one weight and one bias."""
        super().__init__()
        # A single input feature mapped to a single output feature, bias enabled.
        self.linear_layer = nn.Linear(in_features=1, out_features=1, bias=True)

    def forward(self, X):
        """Pass ``X`` through the linear layer and return the result."""
        prediction = self.linear_layer(X)
        return prediction

In [None]:
# Instantiate the model; weight and bias start from their random initialisation
model = LinearModel()
# Show both initial parameters, one per line
print(model.linear_layer.weight, model.linear_layer.bias, sep="\n")

# Learn the model with observations
## Gradient descent
We have some data and the model. To learn the model parameters we need two additional ingredients: the loss function and the optimizer. In this simple case, we will use the squared error, so the objective function will be the mean squared error. For the optimizer, we will use simple gradient descent.

In [None]:
# Training set-up: number of passes over the data, objective, and optimizer
n_epochs = 100
loss_fn = nn.MSELoss()  # mean squared error between predictions and targets
optimizer = torch.optim.SGD(params=model.parameters(), lr=0.001)

We can iterate the gradient update until convergence, using autograd to let PyTorch compute the derivatives by itself.

In [None]:
model.train()
loss_val = []  # one training-loss value per epoch, plotted afterwards
for epoch in range(n_epochs):
    # Reset gradients before computing new ones, so a stale .grad (for example
    # from an interrupted run of this cell) can never leak into the update
    optimizer.zero_grad()

    # Forward pass on the whole training set
    y_hat = model(x_train)

    # Compute objective; PyTorch losses take (input, target), i.e. prediction first
    loss = loss_fn(y_hat, y_train)
    epoch_loss = loss.item()
    loss_val.append(epoch_loss)

    # Backpropagation, then one gradient-descent step
    loss.backward()
    optimizer.step()

    if epoch % 10 == 0:
        print(f" Epoch {epoch}: {epoch_loss:.5f}")

In [None]:
# Training loss recorded at each epoch
plt.plot(loss_val)
plt.xlabel("epoch")
plt.ylabel("training loss")
# `visible` is a boolean flag; the string "on" only worked because it is truthy
plt.grid(True)

In [None]:
# Learned parameters (the data was generated from slope 2 and intercept -1)
print(model.linear_layer.weight, model.linear_layer.bias, sep="\n")

### Plot the final estimation

In [None]:
# Compare the fitted line with the noise-free target over the sampled input range
model.eval()
x = torch.linspace(-5, 15, 1000)
y_true = function_to_be_estimated(x, noise=False)
with torch.no_grad():
    # The model takes a column of samples, so add a feature axis: (1000,) -> (1000, 1)
    y_hat = model(torch.unsqueeze(x, 1))
plt.plot(x, y_true, "k", label="true function")
# No .detach() needed: tensors produced under no_grad do not track gradients
plt.plot(x, y_hat.numpy(), "r", label="model prediction")
plt.scatter(x_train, y_train, label="training samples")
plt.grid(True)
plt.xlabel("x")
plt.ylabel("y")
plt.legend()

## Some points to investigate before going further
- Change the learning rate of the optimizer to see how it changes the convergence.
- Switch to mini-batch gradient updates, using a PyTorch `DataLoader` (https://pytorch.org/docs/stable/data.html#torch.utils.data.DataLoader)
- Try to improve your fit:
    - Increase the number of training samples: what does it change in the final result?
    - Add another linear layer in your model: what does it change in the final result?

# Visualizing convergence - optional

In [None]:
def plot_convergence(
    X_train: torch.Tensor,
    y_train: torch.Tensor,
    model: LinearModel,
    loss_fn: nn.MSELoss,
    lr: float = 0.001,
    n_epochs: int = 100,
) -> None:
    """Plot the gradient-descent path of ``model`` over the loss landscape.

    Draws on the current matplotlib axes:
      * filled contours of ``log(loss)`` on a 100 x 100 grid with weights in
        [-1, 4] and biases in [-1, 1];
      * a red star at the model's (weight, bias) every 10th epoch of SGD;
      * a blue star at the least-squares solution fitted by scikit-learn.

    ``model`` is trained in place for ``n_epochs`` SGD steps.

    Args:
        X_train: Training inputs fed to ``model`` and to the grid evaluation.
        y_train: Training targets.
        model: Model exposing ``linear_layer`` with exactly two parameters.
        loss_fn: Loss called as ``loss_fn(prediction, target)``.
        lr: Learning rate of the SGD optimizer.
        n_epochs: Number of gradient updates to run (default 100).

    Raises:
        AssertionError: If ``model`` does not have exactly two parameters.
    """
    # Local import: scikit-learn is only needed when this helper is called
    from sklearn.linear_model import LinearRegression as LR

    assert (
        len(list(model.parameters())) == 2
    ), "This code works only for a linear 1D model"

    # Plot the error function
    n_w, n_b = 100, 100
    W = torch.linspace(-1, 4, n_w)
    B = torch.linspace(-1, 1, n_b)

    objective = torch.zeros((n_w, n_b))
    for i, w in enumerate(W):
        for j, b in enumerate(B):
            y_hat = w * X_train + b
            # Log scale keeps the contour levels readable near the minimum
            objective[i, j] = np.log(loss_fn(y_hat, y_train).item())
    # contourf expects Z with shape (len(y), len(x)), hence the transpose
    plt.contourf(W.numpy(), B.numpy(), objective.T.numpy())

    # Plot the iteration
    # (swap in torch.optim.Adam(model.parameters(), lr=lr) to compare optimizers)
    optimizer = torch.optim.SGD(model.parameters(), lr=lr)
    model.train()  # must be called; a bare `model.train` does nothing
    for epoch in range(n_epochs):
        if epoch % 10 == 0:
            # Mark the parameters as they are before this epoch's update
            plt.plot(
                model.linear_layer.weight.item(),
                model.linear_layer.bias.item(),
                "*r",
            )
        # Clear gradients first so any stale .grad on the incoming model is ignored
        optimizer.zero_grad()
        y_hat = model(X_train)
        loss = loss_fn(y_hat, y_train)
        loss.backward()
        optimizer.step()

    # Plot optimal solution
    skmodel = LR().fit(
        X_train.detach().cpu().numpy(), y_train.detach().cpu().numpy()
    )
    plt.plot(np.ravel(skmodel.coef_), np.ravel(skmodel.intercept_), "b*")

In [None]:
# Start from a freshly, reproducibly initialised model and trace its optimisation path
torch.manual_seed(10)
model = LinearModel()
plot_convergence(
    X_train=x_train,
    y_train=y_train,
    model=model,
    loss_fn=loss_fn,
    lr=0.001,
)