In [1]:
%matplotlib inline
import copy
from IPython.core.display import HTML, display
import plotly
import plotly.graph_objects as go
import noise
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import axes3d
%matplotlib qt 
# for this run pip install pyqt5

In [2]:
def generate_loss_surface(shape, scale, octaves, persistence, lacunarity):
    surface = np.zeros(shape)
    for i in range(shape[0]):
        for j in range(shape[1]):
            surface[i][j] = noise.pnoise2(i/scale, 
                                        j/scale, 
                                        octaves=octaves, 
                                        persistence=persistence, 
                                        lacunarity=lacunarity, 
                                        repeatx=1024, 
                                        repeaty=1024, 
                                        base=42)
    return surface

In [3]:
# generate grid interpolation
from scipy.interpolate import RegularGridInterpolator


def interpolate_loss_surface(surface, shape):
    lin_x = np.linspace(0, 1, shape[0], endpoint=False)
    lin_y = np.linspace(0, 1, shape[1], endpoint=False)
    interp_loss_surface = RegularGridInterpolator((lin_x, lin_y), surface)
    return interp_loss_surface


def plot_interp_loss_surface(interp_surface, shape, optim_trajectory=None):

    lin_x = np.linspace(0, 1, shape[0], endpoint=False)
    lin_y = np.linspace(0, 1, shape[1], endpoint=False)
    x, y = np.meshgrid(lin_x, lin_y)
    xy = np.stack([x, y], axis=2)
    z = interp_surface(xy)
    fig = plt.figure()
    # contour plot
    ax0 = plt.subplot(1, 2, 1)
    ax0.contourf(x, y, z, cmap='terrain')
    # 3d plot
    ax1 = plt.subplot(1, 2, 2, projection='3d')
    ax1.plot_surface(x, y, z, alpha=0.6, cmap='terrain')
    if optim_trajectory:
        thetas = np.stack([x.theta for x in optim_trajectory])
        ax0.plot(thetas[:, 0], thetas[:, 1], 'o-', lw=3, c='r')
        loss_values = np.array([x.value for x in optim_trajectory]) #+ 0.02 # add offset such that it is plot on top
        ax1.plot3D(thetas[:, 0],
                   thetas[:, 1],
                   loss_values[:, 0],
                   'o-',
                   lw=3,
                   c='r')
    return fig

In [4]:
shape = (50, 50)
scale = 100.0
octaves = 6
persistence = 0.5
lacunarity = 2.0
surface = generate_loss_surface(shape, scale, octaves, persistence, lacunarity)
interp_noise_loss = interpolate_loss_surface(surface, shape)
# fig = plot_interp_loss_surface(interp_noise_loss, shape)

In [5]:
# optimize quadratic loss for testing
def quadratic_loss(x):
    assert len(x) == 2
    z = x[0]**2 + x[1]**2
    return z
lin_x = np.linspace(0,1,shape[0],endpoint=False)
lin_y = np.linspace(0,1,shape[1],endpoint=False)
x,y = np.meshgrid(lin_x, lin_y, sparse=False)
z = x**2+y**2
interp_quadratic_loss = interpolate_loss_surface(z, shape)
# fig = plot_interp_loss_surface(interp_quadratic_loss, shape)

In [6]:
from typing import Callable, NamedTuple, List

class DescentStep(NamedTuple):
    theta: np.ndarray
    value: float

class GradientDescent(object):
    def __init__(self, loss_fn: Callable[[np.ndarray], np.ndarray], 
                    theta: np.ndarray, lr: float, gradient_noise_std: float=0.1, fd_h: float = 1e-3):
        self.loss_fn = loss_fn
        self.theta = theta
        self.lr = lr # stepsize
        self.finite_difference_h = fd_h
        self.gradient_noise_std = gradient_noise_std
        self.rng = np.random.default_rng()

        self.at_boundary = False
        self.last_step = DescentStep(theta_0, loss_fn(theta_0))

    def _compute_gradient(self):
        # forward finite difference gradient
        h = self.finite_difference_h
        f = self.loss_fn
        x = self.theta
        gradient = np.zeros_like(x)
        try:
            for i in range(len(gradient)):
                h_vec = np.zeros_like(x)
                h_vec[i] = h
                gradient[i] = (f(x+h_vec) - f(x))/(h)
        except ValueError:
            print(f'Try to compute gradient at function support boundary at {str(x)}. Setting gradient to zero!')
            gradient = np.zeros_like(x)
        return gradient

    def _safe_decent_step_creation(self, theta):
        try:
            loss = self.loss_fn(theta)
            return DescentStep(theta.copy(), loss)
        except ValueError:
            print('Reached function support boundary!')
            self.at_boundary = True
            return None 

    def _get_descent_step(self, theta):
        descent_step = self._safe_decent_step_creation(self.theta)
        if descent_step is None or self.at_boundary:
            print('At function support boundary, using last step.')
            return copy.deepcopy(self.last_step)
        else:
            self.last_step = descent_step
        return descent_step

    def step(self):
        if not self.at_boundary:
            grad = self._compute_gradient()
            self.theta = self.theta - self.lr * grad
        return self._get_descent_step(self.theta)

        
    def noisy_step(self):
        if not self.at_boundary:
            grad_noise = self.rng.normal(loc=0, scale=self.gradient_noise_std)
            grad = self._compute_gradient() + grad_noise
            self.theta = self.theta - self.lr * grad
        return self._get_descent_step(self.theta)

In [7]:
# use quadratic loss
theta_0 = np.array([0.8,0.6])
interp_loss = interp_quadratic_loss
optim = GradientDescent(loss_fn=interp_loss, theta=theta_0, lr=0.1)
optim_trajectory = []
print(f'Start from theta_0: {theta_0}, Loss: {interp_loss(theta_0)}')
optim_trajectory.append(DescentStep(theta_0.copy(), interp_loss(theta_0)))
for i in range(10):
    step = optim.step()
    optim_trajectory.append(step)
    print(f'Step {i}: theta: {step.theta} Loss: {step.value}')

fig = plot_interp_loss_surface(interp_loss, shape, optim_trajectory)

Start from theta_0: [0.8 0.6], Loss: [1.]
Step 0: theta: [0.638 0.478] Loss: [0.6356]
Step 1: theta: [0.512 0.384] Loss: [0.40976]
Step 2: theta: [0.41  0.306] Loss: [0.26192]
Step 3: theta: [0.328 0.244] Loss: [0.16728]
Step 4: theta: [0.262 0.194] Loss: [0.1064]
Step 5: theta: [0.208 0.156] Loss: [0.06776]
Step 6: theta: [0.166 0.126] Loss: [0.0436]
Step 7: theta: [0.132 0.1  ] Loss: [0.02752]
Step 8: theta: [0.106 0.078] Loss: [0.01744]
Step 9: theta: [0.084 0.064] Loss: [0.01128]


In [8]:
# use noise loss
theta_0 = np.array([0.58,0.63])
interp_loss = interp_noise_loss
optim = GradientDescent(loss_fn=interp_loss, theta=theta_0, lr=0.01)
optim_trajectory = []
print(f'Start from theta_0: {theta_0}, Loss: {interp_loss(theta_0)}')
optim_trajectory.append(DescentStep(theta_0.copy(), interp_loss(theta_0)))
for i in range(80):
    step = optim.step()
    optim_trajectory.append(step)
    print(f'Step {i}: theta: {step.theta} Loss: {step.value}')

fig = plot_interp_loss_surface(interp_loss, shape, optim_trajectory)

Start from theta_0: [0.58 0.63], Loss: [0.09264963]
Step 0: theta: [0.58327346 0.6281772 ] Loss: [0.09125477]
Step 1: theta: [0.58651957 0.62640353] Loss: [0.08989509]
Step 2: theta: [0.58973907 0.62467858] Loss: [0.08856937]
Step 3: theta: [0.59293267 0.62300194] Loss: [0.08727638]
Step 4: theta: [0.59610111 0.62137323] Loss: [0.08601495]
Step 5: theta: [0.59924511 0.61979207] Loss: [0.0847517]
Step 6: theta: [0.6025331  0.61793574] Loss: [0.08299507]
Step 7: theta: [0.60634857 0.61501361] Loss: [0.08071798]
Step 8: theta: [0.61007866 0.61220295] Loss: [0.07856727]
Step 9: theta: [0.61372665 0.60950127] Loss: [0.07653538]
Step 10: theta: [0.6172957  0.60690616] Loss: [0.07461517]
Step 11: theta: [0.62078894 0.60441532] Loss: [0.07277625]
Step 12: theta: [0.62450904 0.60194044] Loss: [0.07070625]
Step 13: theta: [0.62842694 0.59916825] Loss: [0.06819665]
Step 14: theta: [0.632586 0.59489 ] Loss: [0.06434913]
Step 15: theta: [0.63718788 0.58993997] Loss: [0.05954535]
Step 16: theta: [0.

In [9]:
# use noise loss
theta_0 = np.array([0.28,0.36])
interp_loss = interp_noise_loss
optim = GradientDescent(loss_fn=interp_loss, theta=theta_0, lr=0.01)
optim_trajectory = []
print(f'Start from theta_0: {theta_0}, Loss: {interp_loss(theta_0)}')
optim_trajectory.append(DescentStep(theta_0.copy(), interp_loss(theta_0)))
for i in range(200):
    step = optim.step()
    optim_trajectory.append(step)
    print(f'Step {i}: theta: {step.theta} Loss: {step.value}')

fig = plot_interp_loss_surface(interp_loss, shape, optim_trajectory)

Start from theta_0: [0.28 0.36], Loss: [0.06214212]
Step 0: theta: [0.28153945 0.35826598] Loss: [0.06159741]
Step 1: theta: [0.28309966 0.35649141] Loss: [0.06103576]
Step 2: theta: [0.2846811  0.35469816] Loss: [0.0604607]
Step 3: theta: [0.28628402 0.35288599] Loss: [0.05987189]
Step 4: theta: [0.28790862 0.35105463] Loss: [0.05926901]
Step 5: theta: [0.28955515 0.34920382] Loss: [0.05865171]
Step 6: theta: [0.29122383 0.34733331] Loss: [0.05801964]
Step 7: theta: [0.2929149  0.34544282] Loss: [0.05737244]
Step 8: theta: [0.2946286  0.34353208] Loss: [0.05670976]
Step 9: theta: [0.29636517 0.34160084] Loss: [0.0560312]
Step 10: theta: [0.29812486 0.33964881] Loss: [0.05527856]
Step 11: theta: [0.29990329 0.33709735] Loss: [0.05403922]
Step 12: theta: [0.30457145 0.33347956] Loss: [0.05035463]
Step 13: theta: [0.3097309  0.32963247] Loss: [0.04611303]
Step 14: theta: [0.3150834  0.32552645] Loss: [0.04145187]
Step 15: theta: [0.32064196 0.32115183] Loss: [0.03636552]
Step 16: theta: 

In [10]:
# use noise loss + noise gradient steps
theta_0 = np.array([0.28,0.36])
interp_loss = interp_noise_loss
optim = GradientDescent(loss_fn=interp_loss, theta=theta_0, lr=0.01, gradient_noise_std=0.7)
optim_trajectory = []
print(f'Start from theta_0: {theta_0}, Loss: {interp_loss(theta_0)}')
optim_trajectory.append(DescentStep(theta_0.copy(), interp_loss(theta_0)))
for i in range(150):
    step = optim.noisy_step()
    optim_trajectory.append(step)
    print(f'Step {i}: theta: {step.theta} Loss: {step.value}')

fig = plot_interp_loss_surface(interp_loss, shape, optim_trajectory)

Start from theta_0: [0.28 0.36], Loss: [0.06214212]
Step 0: theta: [0.28531776 0.36204428] Loss: [0.06162735]
Step 1: theta: [0.28643368 0.36003912] Loss: [0.06115729]
Step 2: theta: [0.28581477 0.35644442] Loss: [0.0605978]
Step 3: theta: [0.27698963 0.34421153] Loss: [0.05860903]
Step 4: theta: [0.2595599  0.32766386] Loss: [0.04970715]
Step 5: theta: [0.25026392 0.31709125] Loss: [0.04465323]
Step 6: theta: [0.25432416 0.31764088] Loss: [0.04538479]
Step 7: theta: [0.24570206 0.30537311] Loss: [0.0388733]
Step 8: theta: [0.25081778 0.3067029 ] Loss: [0.03985342]
Step 9: theta: [0.2475895  0.29954303] Loss: [0.03637483]
Step 10: theta: [0.23754481 0.28566207] Loss: [0.03182949]
Step 11: theta: [0.23358001 0.27656933] Loss: [0.02995273]
Step 12: theta: [0.23022583 0.26890683] Loss: [0.02879002]
Step 13: theta: [0.22928522 0.26369508] Loss: [0.02772577]
Step 14: theta: [0.22854052 0.25871609] Loss: [0.02649989]
Step 15: theta: [0.23108484 0.25561581] Loss: [0.02480357]
Step 16: theta: 

In [11]:
# TODO for final plot:

# - use starting point as in figure
# - Crop perlin noise to contain roughly only the gradient trajectory
# - for SGD: use noisy steps with std=0.7
# - for SubGD: calculate subspace as line from start point to end point
# - project gradient steps of SGD onto this line
# - use plotly for final plot