In [1]:
import tensorflow as tf
from tensorflow import keras

In [None]:
# tf AutoGrad
# 这里persistent如果为false, 每次tape.gradient()之后会自动释放资源，True则需要手动释放
# 如果需要多次计算梯度（tape.gradient()）则需要persistent=True
# with tf.GradientTape(persistent=True) as tape:
# grads = tape.gradient(loss, [w1, b1, w2, b2, w3, b3])


### 链式法则

反向传播的本质，其实就是梯度

$ \frac{dy}{dx} = \frac{dy}{du}\frac{du}{dx} $

In [11]:
# 链式法则
x = tf.constant(1.)
w1 = tf.constant(2.)
b1 = tf.constant(1.)
w2 = tf.constant(2.)
b2 = tf.constant(1.)


with tf.GradientTape(persistent=True) as tape:

	tape.watch([w1, b1, w2, b2])

	y1 = x * w1 + b1
	y2 = y1 * w2 + b2

dy2_dy1 = tape.gradient(y2, [y1])[0]
dy1_dw1 = tape.gradient(y1, [w1])[0]
dy2_dw1 = tape.gradient(y2, [w1])[0]


print(dy2_dy1 * dy1_dw1)
print(dy2_dw1)

tf.Tensor(2.0, shape=(), dtype=float32)
tf.Tensor(2.0, shape=(), dtype=float32)


In [2]:
# 二阶梯度 （二次求导）
w = tf.Variable(1.0)
b = tf.Variable(2.0)
x = tf.Variable(3.0)

with tf.GradientTape() as t1:
  with tf.GradientTape() as t2:
    y = x * w + b
  dy_dw, dy_db = t2.gradient(y, [w, b])
d2y_dw2 = t1.gradient(dy_dw, w)

print(dy_dw)
print(dy_db)
print(d2y_dw2)

tf.Tensor(3.0, shape=(), dtype=float32)
tf.Tensor(1.0, shape=(), dtype=float32)
None


### 激活函数及其梯度
#### Sigmoid / Logistic

sigmoid很常用，但当x取值在[-3, 3]之外时也会遇到梯度离散的情况（梯度消失）

常用于二分类且只输出1个值时（比如True的概率）;

还常用于对预测值的限制上，比如Yolo。这是为了防止预测值过于离谱或发生突变。

![sigmoid](./images/sigmoid.png)
![sigmoid_derivative](./images/sigmoid_derivative.png)

In [4]:
# sigmoid
a = tf.linspace(-10., 10., 10)

with tf.GradientTape() as tape:
    tape.watch(a) # 因为调用了watch 所以不需要将a用tf.Variable包裹
    y = tf.sigmoid(a)

grads = tape.gradient(y, [a])
print(a)
print(grads)

tf.Tensor(
[-10.         -7.7777777  -5.5555553  -3.333333   -1.1111107   1.1111116
   3.333334    5.5555563   7.7777786  10.       ], shape=(10,), dtype=float32)
[<tf.Tensor: shape=(10,), dtype=float32, numpy=
array([4.5395806e-05, 4.1859140e-04, 3.8362024e-03, 3.3258736e-02,
       1.8632649e-01, 1.8632638e-01, 3.3258699e-02, 3.8362255e-03,
       4.1860685e-04, 4.5416677e-05], dtype=float32)>]


#### tanh
在RNN中很常用
![tanh](./images/tanh.png)
![tanh_derivative](./images/tanh_derivative.png)

In [None]:
# tanh
a = tf.linspace(-5., 5., 10)
tf.tanh(a)

#### Rectified Linear Unit(ReLU)
ReLU梯度很好计算，而且当>0时，梯度为1，最大程度的减少了梯度爆炸和梯度离散的发生。
![relu](./images/relu.png)
![relu_derivative](./images/relu_derivative.png)


In [5]:
#relu
a = tf.linspace(-1., 1., 10)
tf.nn.relu(a)
# 当x<0时，会从某个值开始逐步降为接近0，防止梯度消失
tf.nn.leaky_relu(a)

<tf.Tensor: shape=(10,), dtype=float32, numpy=
array([-0.2       , -0.15555556, -0.11111112, -0.06666666, -0.02222222,
        0.11111116,  0.33333337,  0.5555556 ,  0.7777778 ,  1.        ],
      dtype=float32)>

#### Softmax
常常和CrossEntropy一起使用，适用于多分类。它可以保证转换之后的值域之和为1。它可以让大的更大，小的更小。
![softmax](./images/softmax_explain.png)
![softmax_derivative](./images/softmax_derivative.png)
![softmax_derivativeij](./images/softmax_derivativeij.png)

### Loss及其梯度
MSE与Cross Entropy Loss


In [9]:
# MSE gradient
x=tf.random.normal([1,3])

w=tf.ones([3,2])

b=tf.ones([2])

y = tf.constant([0, 1])


with tf.GradientTape() as tape:

	tape.watch([w, b])

	# logits = tf.sigmoid(x@w+b) 
	logits = tf.nn.softmax(x@w+b, axis=1)
	loss = tf.reduce_mean(tf.losses.MSE(y, logits))

grads = tape.gradient(loss, [w, b])
print('w grad:', grads[0])

print('b grad:', grads[1])

w grad: tf.Tensor(
[[-0.02904284  0.02904284]
 [-0.02101646  0.02101646]
 [ 0.19016582 -0.19016582]], shape=(3, 2), dtype=float32)
b grad: tf.Tensor([ 0.25 -0.25], shape=(2,), dtype=float32)


In [15]:
# Crossentropy gradient
tf.random.set_seed(4323)

x=tf.random.normal([2,4])

w=tf.random.normal([4,3])

b=tf.random.normal([3])

y = tf.constant([2, 1])


with tf.GradientTape() as tape:

	tape.watch([w, b])
	logits = tf.nn.softmax(x@w+b, axis=1)
	loss = tf.reduce_mean(tf.losses.categorical_crossentropy(tf.one_hot(y, depth=3), logits, from_logits=True))

grads = tape.gradient(loss, [w, b])
print('w grad:', grads[0])
print('b grad:', grads[1])

w grad: tf.Tensor(
[[ 0.00861339 -0.01581438  0.00720099]
 [ 0.02661874  0.12178067 -0.14839941]
 [ 0.02604978 -0.15760535  0.13155556]
 [-0.01489652  0.11392362 -0.09902708]], shape=(4, 3), dtype=float32)
b grad: tf.Tensor([ 0.02462438 -0.05216555  0.02754116], shape=(3,), dtype=float32)


### 单层感知机及梯度  也就是最简单的1层神经网络  课时65

$ y =  XW + b $

$ \frac{\partial{E}}{\partial{w_{j0}}} = (O_0 - t)O_0(1-O_0)x_j^0 $

![single](./images/single_cell.png)



In [12]:
x=tf.random.normal([1,3])

w=tf.ones([3,1])

b=tf.ones([1])

y = tf.constant([1])


with tf.GradientTape() as tape:

	tape.watch([w, b])
	logits = tf.sigmoid(x@w+b) 
	loss = tf.reduce_mean(tf.losses.MSE(y, logits))

grads = tape.gradient(loss, [w, b])
print('w grad:', grads[0])

print('b grad:', grads[1])

w grad: tf.Tensor(
[[-0.0684463 ]
 [ 0.26229477]
 [ 0.3117205 ]], shape=(3, 1), dtype=float32)
b grad: tf.Tensor([-0.15517864], shape=(1,), dtype=float32)


### 多输出感知机及梯度 课时66

在梯度计算中，主要点是，由于是求和，只有相关的权重求导，有效，其他支线的权重求导都是0.

$ \frac{\partial{E}}{\partial{w_{jk}}} = (O_k - t_k)O_k(1-O_k)x_j^0 $

![multiple](./images/multiple_cell.png)

In [14]:
x=tf.random.normal([2, 4])

w=tf.ones([4, 3])

b=tf.ones([3])

y = tf.constant([2, 0])


with tf.GradientTape() as tape:

	tape.watch([w, b])
	# x@w+b 得到的shape是[b, 3], b是batch, 我们需要在axis=1维度上进行softmax
	logits = tf.nn.softmax(x@w+b, axis=1)
	loss = tf.reduce_mean(tf.losses.MSE(tf.one_hot(y, depth=3), logits))

grads = tape.gradient(loss, [w, b])
print('w grad:', grads[0])

print('b grad:', grads[1])

w grad: tf.Tensor(
[[-0.06555234  0.04063531  0.02491703]
 [ 0.03510559  0.01809573 -0.05320131]
 [ 0.18592252 -0.09184361 -0.09407891]
 [-0.03733121 -0.02389582  0.06122703]], shape=(4, 3), dtype=float32)
b grad: tf.Tensor([-0.03703704  0.07407407 -0.03703704], shape=(3,), dtype=float32)


### 反向传播  多(K)层感知机

$ \frac{\partial{E}}{\partial{W_{jk}}} = (O_k - t_k)O_k(1-O_k)O_j^J $

$ \delta_k^K = (O_k - t_k)O_k(1-O_k) $

$ \frac{\partial{E}}{\partial{W_{jk}}} = \delta_k^KO_j^J $

$ \frac{\partial{E}}{\partial{W_{ij}}} = O_j(1 - O_j)O_i\sum_{k\in{K}}((O_k - t_k)O_k(1-O_k)W_{jk}) $

$ \frac{\partial{E}}{\partial{W_{ij}}} = O_j(1 - O_j)O_i\sum_{k\in{K}}(\delta_kW_{jk}) $

图中$x_n^J$是$x_2^0$与$w_{ij}^J$的加权求和， $O_n^J$是$x_n^J$经过sigma函数得到的

![multi-layer](./images/multi_layer.png)

![bp-conclusion](./images/bp-conclusion.png)