# 2-3 自动微分机制

神经网络通常依赖反向传播求梯度来更新网络参数, 求梯度的过程又十分复杂且容易出错, 所以TensorFlow这种框架最突出的一个特点就是自动完成求梯度的运算。
TensorFlow一般使用梯度磁带tf.GradientTape来记录正向运算过程, 然后反向传播得到梯度值。这种利用tf.GradientTape求微分的方法叫做TensorFlow的自动微分机制。

## 一 利用梯度磁带求导数

In [1]:
import tensorflow as tf
import numpy as np

# Differentiate y = a*x**2 + b*x + c at x = 0.
# x is a tf.Variable, while the coefficients a, b, c are tf.constant values.
x = tf.Variable(0.0, name="x", dtype=tf.float32)
a = tf.constant(1.0, name="a")
b = tf.constant(-2.0, name="b")
c = tf.constant(1.0, name="c")

# Record the forward computation on the tape.
with tf.GradientTape() as tape:
    y = a * x**2 + b * x + c

# Ask for the gradients with respect to x and a in a single call.
# The constant a was not watched by the tape, so its gradient prints as None.
dy_dx, dy_da = tape.gradient(target=y, sources=[x, a])
print(dy_dx, dy_da, sep="\n")

tf.Tensor(-2.0, shape=(), dtype=float32)
None


In [2]:
# To differentiate with respect to constants, the tape must be told to
# watch them explicitly.
with tf.GradientTape() as tape:
    tape.watch([a, b, c])
    y = a * x**2 + b * x + c

# gradient() can only be called once on this tape, so all four gradients
# are requested together.
dy_dx, dy_da, dy_db, dy_dc = tape.gradient(target=y, sources=[x, a, b, c])
print(dy_da, dy_dc, sep="\n")

tf.Tensor(0.0, shape=(), dtype=float32)
tf.Tensor(1.0, shape=(), dtype=float32)


In [3]:
# Second derivative of y = a*x**2 + b*x + c with respect to x.
# Two tapes are nested: tape1 records the forward pass, and tape2 records
# the gradient computation that tape1 performs.
with tf.GradientTape() as tape2:
    with tf.GradientTape() as tape1:
        y = a * x**2 + b * x + c
    # Evaluated after tape1's block has closed but still inside tape2's
    # block, so the first-derivative computation is visible to tape2.
    dy_dx = tape1.gradient(y, x)
# Differentiate the first derivative once more to obtain d2y/dx2.
dy2_dx2 = tape2.gradient(dy_dx, x)
print(dy2_dx2)

tf.Tensor(2.0, shape=(), dtype=float32)


## 二 利用梯度磁带和最优化器求最小值

### 使用optimizer.apply_gradients求最小值

In [4]:
# Find the minimum of f(x) = a*x**2 + b*x + c by computing the gradient with
# a tape and handing it to the optimizer manually.
x = tf.Variable(0.0, name="x", dtype=tf.float32)
a = tf.constant(1.0, name="a")
b = tf.constant(-2.0, name="b")
c = tf.constant(1.0, name="c")

opt = tf.keras.optimizers.SGD(learning_rate=0.01)
for _ in range(1000):
    # Forward pass under a fresh tape on every step.
    with tf.GradientTape() as tape:
        y = a * x**2 + b * x + c
    dy_dx = tape.gradient(y, x)
    # One SGD update of x using the (gradient, variable) pair.
    opt.apply_gradients([(dy_dx, x)])
# y holds the value from the last forward pass, i.e. before the final update.
tf.print("y = ", y, "; x = ", x)

y =  0 ; x =  0.999998569


### 使用optimizer.minimize求最小值

In [5]:
x = tf.Variable(0.0, name="x", dtype=tf.float32)


# minimize is a simple convenience wrapper, so the loss callable passed to it
# cannot take parameters; anything more customised needs GradientTape together
# with apply_gradients.
def f():
    """Return a*x**2 + b*x + c evaluated at the module-level variable x."""
    a = tf.constant(1.0, name="a")
    b = tf.constant(-2.0, name="b")
    c = tf.constant(1.0, name="c")
    return a * x**2 + b * x + c


opt = tf.keras.optimizers.SGD(learning_rate=0.01)

# Each call performs one optimisation step on x for the loss f.
for _ in range(1000):
    opt.minimize(f, var_list=[x])

tf.print("y = ", f(), "; x = ", x)

y =  0 ; x =  0.999998569


### 在AutoGraph中完成最小值求解

In [6]:
# Optimise with apply_gradients inside a tf.function.
x = tf.Variable(0.0, name="x", dtype=tf.float32)
opt = tf.keras.optimizers.Adam(learning_rate=0.01)


@tf.function
def minimizef_apply_gradients():
    """Run 1000 Adam steps on a*x**2 + b*x + c and return the final loss.

    Updates the module-level variable x in place through the module-level
    optimizer opt.
    """
    a = tf.constant(1.0, name="a")
    b = tf.constant(-2.0, name="b")
    c = tf.constant(1.0, name="c")

    def loss():
        return a * x**2 + b * x + c

    # Note: the loop must be driven by tf.range here.
    for _ in tf.range(1000):
        with tf.GradientTape() as tape:
            y = loss()
        dy_dx = tape.gradient(y, x)
        opt.apply_gradients([(dy_dx, x)])
    # Loss re-evaluated at the optimised x.
    return loss()


tf.print(minimizef_apply_gradients())
tf.print(x)

0
1.00000083


In [7]:
# Optimise with opt.minimize inside a tf.function.
x = tf.Variable(0.0, name="x", dtype=tf.float32)
opt = tf.keras.optimizers.Adam(learning_rate=0.01)


@tf.function
def f():
    """Return a*x**2 + b*x + c evaluated at the module-level variable x.

    Takes no arguments because it is passed to opt.minimize as the loss
    callable.
    """
    a = tf.constant(1.0, name="a")
    b = tf.constant(-2.0, name="b")
    c = tf.constant(1.0, name="c")
    return a * x**2 + b * x + c


@tf.function
def minimizef_minimize():
    """Run 100 Adam steps on f via opt.minimize and return the loss afterwards.

    Updates the module-level variable x in place through the module-level
    optimizer opt.
    """
    # Use tf.range rather than Python range: a Python range inside a
    # tf.function is unrolled while tracing, producing 100 copies of the
    # minimize step in the graph and making the call slow to build.
    # NOTE(review): 100 steps is fewer than the 1000 used in the other
    # examples, so the returned loss is not expected to have reached 0.
    for _ in tf.range(100):
        opt.minimize(f, [x])
    return f()


tf.print(minimizef_minimize())

0.0503761768
