In this notebook, we will try to better understand how stochastic gradient descent works. We fit a very simple non-convex model to data generated from a linear ground truth model.

We will also observe how the (stochastic) loss landscape changes when selecting different samples.

In [None]:
import matplotlib.pyplot as plt
import torch
import torch.nn as nn
from torch.nn import Parameter
from torch.nn.functional import mse_loss

import numpy as np

from torch.autograd import Variable
from torch.nn.functional import relu

Data is generated from a simple model:
$$y=  2x + \epsilon$$

where:

- $\epsilon \sim \mathcal{N}(0, \sigma^2)$ with standard deviation $\sigma = 0.1$
- $x \sim \mathcal{U}(-1, 1)$

In [None]:
def sample_from_ground_truth(n_samples=100):
    """Draw a noisy dataset from the linear ground truth ``y = 2 * x + epsilon``.

    Args:
        n_samples: number of ``(x, y)`` pairs to draw.

    Returns:
        A pair ``(x, y)`` of float32 tensors, each of shape ``(n_samples, 1)``.
        ``x`` is drawn uniformly between -1 and 1; ``epsilon`` is Gaussian
        noise with mean 0 and standard deviation 0.1.
    """
    shape = (n_samples, 1)
    # Inputs are drawn before the noise so the random stream is consumed in
    # a fixed order (matters when the caller seeds the generator).
    inputs = torch.empty(shape, dtype=torch.float32).uniform_(-1, 1)
    noise = torch.empty(shape, dtype=torch.float32).normal_(0, .1)
    return inputs, 2 * inputs + noise


# Draw the dataset used throughout the notebook (default: 100 samples).
x, y = sample_from_ground_truth()

We propose a minimal single-hidden-layer perceptron with a single hidden unit and no bias. The model has two tunable parameters, $w_1$ and $w_2$, such that:

$$f(x) = w_1 \cdot \sigma(w_2 \cdot x)$$

where $\sigma$ is the ReLU function.

In [None]:
class SimpleMLP(nn.Module):
    """Bias-free perceptron with one hidden ReLU unit.

    The model computes ``f(x) = w1 * relu(w2 * x)`` where ``w1`` and ``w2``
    are one-element trainable parameters.

    Args:
        w: optional indexable pair; ``w[0]`` initialises ``w1`` and ``w[1]``
            initialises ``w2``. When ``None``, both weights are drawn
            uniformly between -0.1 and 0.1.
    """

    def __init__(self, w=None):
        super(SimpleMLP, self).__init__()
        # One-element parameters. ``torch.empty(1)`` states the size
        # explicitly; the values are always overwritten just below.
        self.w1 = Parameter(torch.empty(1, dtype=torch.float32))
        self.w2 = Parameter(torch.empty(1, dtype=torch.float32))
        if w is None:
            self.reset_parameters()
        else:
            self.set_parameters(w)

    def reset_parameters(self):
        """Re-draw both weights uniformly between -0.1 and 0.1."""
        # Initialisation must not be recorded by autograd.
        with torch.no_grad():
            self.w1.uniform_(-.1, .1)
            self.w2.uniform_(-.1, .1)

    def set_parameters(self, w):
        """Overwrite the weights with ``w[0]`` (for w1) and ``w[1]`` (for w2)."""
        with torch.no_grad():
            self.w1[0] = w[0]
            self.w2[0] = w[1]

    def forward(self, x):
        """Return ``w1 * relu(w2 * x)``, broadcasting the weights over ``x``."""
        return self.w1 * relu(self.w2 * x)

As in the previous notebook, we define a function to sample from and plot loss landscapes.

In [None]:
def make_grids(x, y, grid_size=100, model_factory=None):
    """Evaluate the per-sample squared loss on a regular grid of weights.

    The grid covers ``[-3, 3] x [-3, 3]`` with ``grid_size`` points per axis.
    For every grid point a model is built from the weight pair and its
    element-wise squared error on ``(x, y)`` is stored.

    Args:
        x: input samples, one per row along the first axis.
        y: targets; must have the same length as ``x``.
        grid_size: number of grid points along each weight axis.
        model_factory: optional callable taking a two-element weight tensor
            ``w`` (``w[0]`` is w1, ``w[1]`` is w2) and returning a callable
            model. Defaults to ``SimpleMLP``.

    Returns:
        A tuple ``(W1, W2, grids)``. ``W1`` and ``W2`` are float32 numpy
        arrays of shape ``(grid_size, grid_size)`` as produced by
        ``np.meshgrid``. ``grids`` is a float32 tensor of shape
        ``(n_samples, grid_size, grid_size)`` where ``grids[k, i, j]`` is the
        loss of sample ``k`` at weights ``(W1[i, j], W2[i, j])``.

    Raises:
        ValueError: if ``x`` and ``y`` do not have the same length.
    """
    n_samples = len(x)
    if n_samples != len(y):
        raise ValueError(
            "x and y must have the same length, got %d and %d"
            % (n_samples, len(y)))
    if model_factory is None:
        model_factory = SimpleMLP

    # Grid logic
    w_min, w_max = -3, 3
    w1 = np.linspace(w_min, w_max, grid_size, dtype=np.float32)
    w2 = np.linspace(w_min, w_max, grid_size, dtype=np.float32)
    W1, W2 = np.meshgrid(w1, w2)
    # W[i, j] is the weight pair (W1[i, j], W2[i, j]).
    W = torch.from_numpy(np.stack((W1, W2), axis=2))

    # We will store the results in this tensor
    grids = torch.empty(n_samples, grid_size, grid_size, dtype=torch.float32)

    # Only loss values are needed, so no autograd graph is recorded.
    with torch.no_grad():
        for i in range(grid_size):
            for j in range(grid_size):
                model = model_factory(W[i, j])
                loss = mse_loss(model(x), y, reduction='none')
                # Flatten the per-sample losses so they match the
                # (n_samples,) slice of the result tensor.
                grids[:, i, j] = loss.reshape(n_samples)
    return W1, W2, grids

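- `G[k, i, j]` holds the single-sample loss value $\ell(f(w_1 = W_1[i, j], w_2 = W_2[i, j], x_k), y_k)$, i.e. the loss of sample $k$ at the grid point with coordinates `(W1[i, j], W2[i, j])`

- `G_mean[i, j]` corresponds to the empirical risk at the same grid point:

$$ \frac{1}{n} \sum_{k=1}^{n} \ell(f(w_1 = W_1[i, j], w_2 = W_2[i, j], x_k), y_k)$$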

In [None]:
# Per-sample loss landscapes on the default 100 x 100 weight grid.
W1, W2, G = make_grids(x, y)
# Average over the sample axis (dim 0) to obtain the mean loss landscape.
G_mean = torch.mean(G, dim=0)

Let's define our training loop and train our model.

In [None]:
from torch.optim import SGD

def train(x, y, init, lr=.1):
    """Run one pass of single-sample SGD and record the trajectory.

    A ``SimpleMLP`` is initialised from ``init`` and updated once per
    ``(x, y)`` pair, in the order the pairs are given.

    Args:
        x: input samples, iterated along the first axis.
        y: targets, iterated in lockstep with ``x``.
        init: initial weight pair passed to ``SimpleMLP``.
        lr: SGD step size.

    Returns:
        A tuple ``(iterates, grads)`` of float arrays of shape
        ``(n_steps, 2)``. Row ``k`` of ``iterates`` holds ``(w1, w2)`` just
        before the k-th update; row ``k`` of ``grads`` holds the gradient of
        the k-th sample loss with respect to ``(w1, w2)`` at that point.
    """
    model = SimpleMLP(init)
    optimizer = SGD(model.parameters(), lr=lr)
    iterate_rec = []
    grad_rec = []
    for this_x, this_y in zip(x, y):
        optimizer.zero_grad()
        # Add a leading batch axis of size 1 to the single sample.
        pred = model(this_x[None, :])
        loss = mse_loss(pred, this_y[None, :])
        loss.backward()
        # Record plain Python floats (not tensors) *before* the step, so the
        # stored gradient is the one evaluated at the stored iterate.
        iterate_rec.append([model.w1.item(), model.w2.item()])
        grad_rec.append([model.w1.grad.item(), model.w2.grad.item()])
        optimizer.step()
    # reshape keeps the arrays two-dimensional even when no step was taken.
    return (np.array(iterate_rec).reshape(-1, 2),
            np.array(grad_rec).reshape(-1, 2))

# One SGD pass over the data, starting from (w1, w2) = (1.2, -2.3) with step size 0.05.
iterate_rec, grad_rec = train(x, y, lr=.05, init=torch.FloatTensor([1.2, -2.3]))

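We now plot the (log) loss of the sample used at iteration $k$ on the left side, together with the current iterate and its negative gradient direction, and the (log) empirical mean loss on the right side, together with the trajectory of the iterates so far.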

In [None]:
from ipywidgets import interact, interactive, fixed, interact_manual
import ipywidgets as widgets

def plot_map(sample):
    """Plot the landscape seen at step ``sample`` next to the mean landscape.

    Left panel: log of the loss of sample ``sample`` over the weight grid,
    the iterate recorded at that step, and an arrow along the negative
    gradient (scaled by 5). Right panel: log of the mean loss over all
    samples, with the iterates recorded before step ``sample``.

    Reads the notebook globals ``W1``, ``W2``, ``G``, ``G_mean``,
    ``iterate_rec`` and ``grad_rec``.
    """
    _, (left, right) = plt.subplots(ncols=2, figsize=(12, 6))

    w1_now = iterate_rec[sample, 0]
    w2_now = iterate_rec[sample, 1]
    step_w1 = -5 * grad_rec[sample, 0]
    step_w2 = -5 * grad_rec[sample, 1]

    # Landscape of the single sample used at this step.
    left.contourf(W1, W2, torch.log(G[sample]))
    left.scatter(w1_now, w2_now, color='orange')
    left.arrow(w1_now, w2_now, step_w1, step_w2,
               head_width=0.1, head_length=0.2, fc='orange', ec='orange')

    # Mean landscape with the path followed so far.
    right.contourf(W1, W2, torch.log(G_mean))
    past = iterate_rec[:sample]
    right.plot(past[:, 0], past[:, 1], linestyle='-', marker='o',
               markersize=8, color='orange', linewidth=4)
    plt.show()

In [None]:
# IPython magic: render matplotlib figures inline in the notebook.
%matplotlib inline
# Wrap plot_map in a widget; `sample` is driven by a control built from (0, 99),
# presumably an integer slider over the 100 recorded steps.
interactive_plot = interactive(plot_map, sample=(0, 99))
# Last child of the widget container (presumably its output area); not used
# again in the visible cells.
output = interactive_plot.children[-1]
# Evaluating the widget as the last expression of the cell displays it.
interactive_plot

Observe and comment. Change the model and the initialisation to observe interesting behaviors.