# **11.1 Optimization and Deep Learning**

11.1.1 Goal of Optimization

In [None]:
%matplotlib inline
import numpy as np
import torch
from mpl_toolkits import mplot

In [None]:
def f(x):
  """Return x * cos(pi * x), evaluated elementwise on a tensor `x`."""
  phase = np.pi * x
  return x * torch.cos(phase)
  
def g(x):
  """Return f(x) with a smaller, higher-frequency cosine term added on top."""
  ripple = 0.2 * torch.cos(5 * np.pi * x)
  return f(x) + ripple

In [None]:
def annotate(text, xy, xytext): #@save
  """Label a point on the current matplotlib axes with an arrow.

  Args:
    text: Label to draw.
    xy: `(x, y)` point passed to matplotlib as the annotated location.
    xytext: `(x, y)` position passed to matplotlib for the label text.
  """
  d2l.plt.gca().annotate(text, xy=xy, xytext=xytext,
                         arrowprops=dict(arrowstyle='->'))
# Plot f and g together on [0.5, 1.5) and point an arrow at one marked point
# per curve label.
x = torch.arange(0.5, 1.5, 0.01)
d2l.set_figsize((4.5, 2.5))
d2l.plot(x, [f(x), g(x)], 'x', 'risk')
# (1.0, -1.2) lies on g: f(1.0) = -1 and 0.2 * cos(5 * pi) = -0.2.
annotate('min of\nempirical risk', (1.0, -1.2), (0.5, -1.1))
# (1.1, -1.05) is approximately f(1.1).
annotate('min of risk', (1.1, -1.05), (0.95, -0.5))

11.1.2 Optimization Challenges in Deep Learning

In [None]:
# Plot f alone on the wider range [-1, 2) and label two of its valleys.
x = torch.arange(-1.0, 2.0, 0.01)
d2l.plot(x, [f(x), ], 'x', 'f(x)')
# Annotation coordinates are hand-picked (arrow target, then text position).
annotate('local minimum', (-0.3, -0.25), (-0.77, -1.0))
annotate('global minimum', (1.1, -0.95), (0.6, 0.8))

In [None]:
# x**3 has zero slope at x = 0 but is neither a minimum nor a maximum there.
x = torch.arange(-2.0, 2.0, 0.01)
d2l.plot(x, [x**3], 'x', 'f(x)')
# The arrow targets a point just below the origin; the label sits lower left.
annotate('saddle point', (0, -0.2), (-0.52, -5.0))

In [None]:
# Build a 101 x 101 grid over [-1, 1] x [-1, 1]. `indexing='ij'` is the layout
# torch.meshgrid has always produced by default; passing it explicitly keeps
# the result unchanged and avoids the warning newer PyTorch emits otherwise.
x, y = torch.meshgrid(
    torch.linspace(-1.0, 1.0, 101), torch.linspace(-1.0, 1.0, 101),
    indexing='ij')
# z = x^2 - y^2 curves upward along x and downward along y.
z = x**2 - y**2

ax = d2l.plt.figure().add_subplot(111, projection='3d')
# Draw every 10th row/column of the grid so the wireframe stays readable.
ax.plot_wireframe(x, y, z, rstride=10, cstride=10)
# Mark the origin with a red cross.
ax.plot([0], [0], [0], 'rx')
ticks = [-1, 0, 1]
d2l.plt.xticks(ticks)
d2l.plt.yticks(ticks)
ax.set_zticks(ticks)
d2l.plt.xlabel('x')
d2l.plt.ylabel('y');

In [None]:
# tanh flattens out as x grows, so its slope around x = 4 is close to zero.
x = torch.arange(-2.0, 5.0, 0.01)
d2l.plot(x, [torch.tanh(x)], 'x', 'f(x)')
# Arrow targets (4, 1), on the flat part of the curve.
annotate('vanishing gradient', (4, 1), (2, 0.0))

# **11.2 Convexity**

11.2.1 Definitions

In [None]:
%matplotlib inline
import numpy as np
import torch
from mpl_toolkits import mplot3d
from d2l import torch as d2l

In [None]:
def f(x):
  """Convex example: 0.5 * x^2."""
  return 0.5 * x**2


def g(x):
  """Nonconvex example: cos(pi * x)."""
  return torch.cos(np.pi * x)


def h(x):
  """Convex example: exp(0.5 * x)."""
  return torch.exp(0.5 * x)

# `x` is a dense grid on [-2, 2); `segment` holds just two abscissas, -1.5 and
# 1, so plotting func(segment) against it gives a two-point series per panel
# (presumably rendered as the straight chord between them -- depends on d2l.plot).
x, segment = torch.arange(-2, 2, 0.01), torch.tensor([-1.5, 1])
d2l.use_svg_display()
# One panel per function, side by side.
_, axes = d2l.plt.subplots(1, 3, figsize=(9, 3))
for ax, func in zip(axes, [f, g, h]):
  d2l.plot([x, segment], [func(x), func(segment)], axes=ax)

11.2.2 Properties

In [None]:
def f(x):
  """Convex objective (x - 1)^2, which is zero at x = 1."""
  return (x - 1) ** 2
# `x` and `segment` are not defined in this cell; they are whatever the kernel
# still holds from the cell that plotted the three convexity examples.
d2l.set_figsize()
d2l.plot([x, segment], [f(x), f(segment)], 'x', 'f(x)')

11.2.3 Constraints

# **11.3 Gradient Descent**

11.3.1 One-Dimensional Gradient Descent

In [None]:
%matplotlib inline
import numpy as np
import torch
from d2l import torch as d2l

In [None]:
def f(x):
  """Objective function: f(x) = x^2."""
  squared = x ** 2
  return squared
def f_grad(x):
  """Gradient (derivative) of the objective function: 2 * x."""
  slope = 2 * x
  return slope

In [None]:
def gd(eta, f_grad, steps=10):
  """Run one-dimensional gradient descent starting from x = 10.0.

  Args:
    eta: Learning rate multiplying the gradient in every update.
    f_grad: Callable returning the derivative of the objective at `x`.
    steps: Number of updates to perform (default 10).

  Returns:
    A list of `steps + 1` floats: the start point followed by the iterate
    after each update.
  """
  x = 10.0
  results = [x]
  for _ in range(steps):
    x -= eta * f_grad(x)
    # `f_grad` may hand back a tensor; keep plain floats in the trace.
    results.append(float(x))
  # Report once, after the last update, so the epoch label matches the value.
  print(f'epoch {steps}, x: {float(x):f}')
  return results
# Learning rate 0.2; with f_grad(x) = 2 * x every update multiplies x by 0.6.
results = gd(0.2, f_grad)

In [None]:
def show_trace(results, f):
  """Plot the objective `f` together with the optimization trace `results`.

  The curve is sampled on [-n, n) in steps of 0.01, where n is the larger of
  the absolute values of the smallest and largest iterate; the iterates are
  drawn on top with the '-o' format.
  """
  lowest, highest = min(results), max(results)
  radius = max(abs(lowest), abs(highest))
  grid = torch.arange(-radius, radius, 0.01)
  d2l.set_figsize()
  # Evaluate point by point so `f` sees scalars for both series.
  curve_values = [f(point) for point in grid]
  trace_values = [f(point) for point in results]
  d2l.plot([grid, results], [curve_values, trace_values], 'x', 'f(x)',
           fmts=['-', '-o'])
  
# Visualize the iterates recorded in `results` on top of the objective f.
show_trace(results, f)

In [None]:
# Small learning rate: with f_grad(x) = 2 * x each update only shrinks x by a
# factor of 0.9, so ten steps leave x far from 0.
show_trace(gd(0.05, f_grad), f)

In [None]:
# Large learning rate: with f_grad(x) = 2 * x each update multiplies x by
# 1 - 2.2 = -1.2, so the iterates flip sign and grow in magnitude.
show_trace(gd(1.1, f_grad), f)

In [None]:
# Frequency constant read as a module-level global by the f and f_grad
# defined in this cell.
c = torch.tensor(0.15 * np.pi)

def f(x):
  """Objective function x * cos(c * x); `c` is read from module scope."""
  angle = c * x
  return x * torch.cos(angle)

def f_grad(x):
  """Gradient of x * cos(c * x): cos(c * x) - c * x * sin(c * x)."""
  angle = c * x
  return torch.cos(angle) - angle * torch.sin(angle)
  
# Gradient descent with learning rate 2 on x * cos(c * x), then plot the trace.
show_trace(gd(2, f_grad), f)

11.3.2 Multivariate Gradient Descent

In [None]:
def train_2d(trainer, steps=20, f_grad=None): #@save
  """Optimize a 2D objective function with a customized trainer.

  Args:
    trainer: Callable taking `(x1, x2, s1, s2)` -- plus `f_grad` as a fifth
      argument when one is supplied -- and returning the updated
      `(x1, x2, s1, s2)`.
    steps: Number of trainer calls to make; 0 is allowed.
    f_grad: Optional gradient callable forwarded to `trainer`.

  Returns:
    A list of `steps + 1` `(x1, x2)` tuples, beginning with the start point
    `(-5, -2)`.
  """
  # `s1` and `s2` are internal state variables that will be used later
  x1, x2, s1, s2 = -5, -2, 0, 0
  results = [(x1, x2)]
  for _ in range(steps):
    if f_grad is not None:
      x1, x2, s1, s2 = trainer(x1, x2, s1, s2, f_grad)
    else:
      x1, x2, s1, s2 = trainer(x1, x2, s1, s2)
    results.append((x1, x2))
  # Print `steps` rather than the loop variable: the two agree whenever the
  # loop ran, and `steps=0` no longer fails on an unbound name.
  print(f'epoch {steps}, x1: {float(x1):f}, x2: {float(x2):f}')
  return results

def show_trace_2d(f, results): #@save
  """Show the trace of 2D variables during optimization.

  Args:
    f: Objective taking two grid tensors `(x1, x2)` and returning the values
      to contour.
    results: Sequence of `(x1, x2)` points, drawn as a connected '-o' line.
  """
  d2l.set_figsize()
  d2l.plt.plot(*zip(*results), '-o', color='#ff7f0e')
  # `indexing='ij'` is the layout torch.meshgrid has always used by default;
  # naming it keeps the grid unchanged and avoids the warning newer PyTorch
  # emits when the argument is left out.
  x1, x2 = torch.meshgrid(torch.arange(-5.5, 1.0, 0.1),
                          torch.arange(-3.0, 1.0, 0.1), indexing='ij')
  d2l.plt.contour(x1, x2, f(x1, x2), colors='#1f77b4')
  d2l.plt.xlabel('x1')
  d2l.plt.ylabel('x2')

In [None]:
def f_2d(x1, x2):
  """Objective function: x1^2 + 2 * x2^2."""
  first_term = x1 ** 2
  second_term = 2 * x2 ** 2
  return first_term + second_term
def f_2d_grad(x1, x2):
  """Gradient of the objective function as the tuple (2 * x1, 4 * x2)."""
  d_x1 = 2 * x1
  d_x2 = 4 * x2
  return (d_x1, d_x2)
def gd_2d(x1, x2, s1, s2, f_grad):
  """Take one gradient-descent step in 2D.

  The learning rate `eta` is read from module scope. `s1` and `s2` are
  accepted but unused, and both state slots are returned as 0.
  """
  grad_x1, grad_x2 = f_grad(x1, x2)
  next_x1 = x1 - eta * grad_x1
  next_x2 = x2 - eta * grad_x2
  return (next_x1, next_x2, 0, 0)
  
# Learning rate that gd_2d picks up as a module-level global.
eta = 0.1
# Run train_2d with its default number of steps and draw the trajectory over
# the contours of f_2d.
show_trace_2d(f_2d, train_2d(gd_2d, f_grad=f_2d_grad))

11.3.3 Adaptive Methods

In [None]:
# Scale constant read as a module-level global by f, f_grad and f_hess in
# this cell.
c = torch.tensor(0.5)

def f(x):
  """Objective function cosh(c * x); `c` is read from module scope."""
  scaled = c * x
  return torch.cosh(scaled)
def f_grad(x):
  """Gradient of cosh(c * x): c * sinh(c * x)."""
  scaled = c * x
  return c * torch.sinh(scaled)
def f_hess(x):
  """Hessian (second derivative) of cosh(c * x): c^2 * cosh(c * x)."""
  scaled = c * x
  return c**2 * torch.cosh(scaled)
def newton(eta=1, steps=10):
  """Run Newton's method in one dimension starting from x = 10.0.

  Each update subtracts `eta * f_grad(x) / f_hess(x)`. `f_grad` and `f_hess`
  are looked up in module scope at call time, so redefining them in a later
  cell changes what this function optimizes.

  Args:
    eta: Step-size multiplier applied to the Newton update.
    steps: Number of updates to perform (default 10).

  Returns:
    A list of `steps + 1` floats: the start point followed by the iterate
    after each update.
  """
  x = 10.0
  results = [x]
  for _ in range(steps):
    x -= eta * f_grad(x) / f_hess(x)
    # `x` may be a tensor after the update; keep plain floats in the trace.
    results.append(float(x))
  # Format as a float (same style as `gd`) instead of printing a tensor repr.
  print(f'epoch {steps}, x: {float(x):f}')
  return results
  
# Newton's method with the default eta = 1 on cosh(c * x), then plot the trace.
show_trace(newton(), f)

In [None]:
# Frequency constant read as a module-level global by f, f_grad and f_hess in
# this cell.
c = torch.tensor(0.15 * np.pi)

def f(x):
  """Objective function x * cos(c * x); `c` is read from module scope."""
  angle = c * x
  return x * torch.cos(angle)
def f_grad(x):
  """Gradient of x * cos(c * x): cos(c * x) - c * x * sin(c * x)."""
  angle = c * x
  return torch.cos(angle) - angle * torch.sin(angle)
def f_hess(x):
  """Hessian of x * cos(c * x): -2c * sin(c * x) - x * c^2 * cos(c * x)."""
  angle = c * x
  return - 2 * c * torch.sin(angle) - x * c**2 * torch.cos(angle)

# Newton's method with the default eta = 1 on x * cos(c * x); newton() picks
# up the f_grad and f_hess redefined in this cell.
show_trace(newton(), f)

In [None]:
# Same objective, but each Newton update is scaled by eta = 0.5.
show_trace(newton(0.5), f)