In [2]:
import numpy as np
import matplotlib.pylab as plt
import pickle

from IPython.core.interactiveshell import InteractiveShell 
InteractiveShell.ast_node_interactivity = "all"

本章我们将学习一个能够高效计算权重参数的梯度的方法——误差反向传播法。

要正确理解误差反向传播法，我个人认为有两种方法：
- 一种是基于数学式；
- 另一种是基于计算图（computational graph）。

前者是比较常见的方法，机器学习相关的图书中多数都是以数学式为中心展开论述的。因为这种方法严密且简洁，所以确实非常合理，但如果一上来就围绕数学式进行探讨，会忽略一些根本的东西，止步于式子的罗列。因此，本章希望大家通过计算图，直观地理解误差反向传播法。然后，再结合实际的代码加深理解，相信大家一定会有种“原来如此！”的感觉。

# 计算图

<img src="img/5_3.png" alt="Drawing" style="width: 500px;"/>


综上，用计算图解题的情况下，需要按如下流程进行。

1. 构建计算图。
2. 在计算图上，从左向右进行计算。

- 这里的第2步“从左向右进行计算”是一种正方向上的传播，简称为正向传播（forward propagation）。正向传播是从计算图出发点到结束点的传播。
- 既然有正向传播这个名称，当然也可以考虑反向（从图上看的话，就是从右向左）的传播。实际上，这种传播称为反向传播（backward propagation）。反向传播将在接下来的导数计算中发挥重要作用。

计算图的特征是可以通过传递**“局部计算”**获得最终结果。“局部”这个词的意思是“与自己相关的某个小范围”。局部计算是指，无论全局发生了什么，都能只根据与自己相关的信息输出接下来的结果。

# 链式法则

# 反向传播

# 简单层的实现

### 乘法层

In [4]:
# 乘法层的实现
class MulLayer:
    """Multiplication node of a computational graph.

    The two forward operands are cached on the instance because the
    derivative of a product with respect to one operand is the other operand.
    """

    def __init__(self):
        # Operands seen by the most recent forward() call (None until then).
        self.x = self.y = None

    def forward(self, x, y):
        """Remember both operands and return their product ``x * y``."""
        self.x, self.y = x, y
        return x * y

    def backward(self, dout):
        """Propagate the upstream derivative ``dout`` to both inputs.

        Returns:
            A ``(dx, dy)`` tuple where ``dx = dout * y`` and ``dy = dout * x``;
            each input's gradient uses the *other* cached operand.
        """
        grad_x = dout * self.y
        grad_y = dout * self.x
        return grad_x, grad_y


In [31]:
# Worked example for the multiplication layer:
# buy `apple_num` apples at `apple` each, then multiply by the tax factor.
apple = 100
apple_num = 2
tax = 1.1

# layer
mul_apple_layer = MulLayer() # computes the pre-tax apple price (apple * apple_num)
mul_tax_layer = MulLayer() # applies the tax factor to the pre-tax price

# forward
# Note: besides computing the result, each forward() call also stores that
# layer's x and y, which is what lets backward() compute the derivatives below.
apple_price = mul_apple_layer.forward(apple, apple_num)
print(apple_price)
price = mul_tax_layer.forward(apple_price, tax)
print(price) # 220

# backward
# Start from d(price)/d(price) = 1 and walk the graph in reverse order.
dprice = 1
dapple_price, dtax = mul_tax_layer.backward(dprice)
print(dapple_price, dtax)

dapple, dapple_num = mul_apple_layer.backward(dapple_price)
print(dapple, dapple_num, dtax) # 2.2 110 200

200
220.00000000000003
1.1 200
2.2 110.00000000000001 200


### 加法层

In [8]:
class AddLayer:
    """Addition node of a computational graph.

    The layer is stateless: the derivative of a sum with respect to either
    operand is 1, so nothing from the forward pass has to be remembered.
    """

    def __init__(self):
        """No state to initialise."""

    def forward(self, x, y):
        """Return the sum ``x + y``."""
        return x + y

    def backward(self, dout):
        """Return ``(dx, dy)``; the upstream derivative is passed to both inputs."""
        # d(x + y)/dx == d(x + y)/dy == 1
        return dout * 1, dout * 1
    

In [10]:
# Worked example combining addition and multiplication layers:
# (apple * apple_num + orange * orange_num) * tax
apple = 100
apple_num = 2
orange = 150
orange_num = 3
tax = 1.1

# layer
# One layer instance per node of the graph, because each multiplication
# layer caches its own forward operands.
mul_apple_layer = MulLayer()
mul_orange_layer = MulLayer()
add_apple_orange_layer = AddLayer()
mul_tax_layer = MulLayer()

# forward
# The numbers (1)-(4) mark the forward order; backward visits them in reverse.
apple_price = mul_apple_layer.forward(apple, apple_num) #(1)
orange_price = mul_orange_layer.forward(orange, orange_num) #(2)
all_price = add_apple_orange_layer.forward(apple_price, orange_price) #(3)
price = mul_tax_layer.forward(all_price, tax) #(4)

# backward
# Seed with d(price)/d(price) = 1 and call the layers in reverse order.
dprice = 1
dall_price, dtax = mul_tax_layer.backward(dprice) #(4)
dapple_price, dorange_price = add_apple_orange_layer.backward(dall_price) #(3)
dorange, dorange_num = mul_orange_layer.backward(dorange_price) #(2)
dapple, dapple_num = mul_apple_layer.backward(dapple_price) #(1)

print(price) # 715
print(dapple_num, dapple, dorange, dorange_num, dtax) # 110 2.2 3.3 165 650



715.0000000000001
110.00000000000001 2.2 3.3000000000000003 165.0 650


# 激活函数层的实现

### ReLU层

In [32]:
class Relu:
    """ReLU activation layer operating on NumPy arrays.

    forward() passes positive entries through and replaces entries that are
    ``<= 0`` with 0. The boolean mask of the replaced positions is cached so
    that backward() can block the gradient at exactly those positions.
    """

    def __init__(self):
        # Boolean array, True where the last forward input was <= 0.
        self.mask = None

    def forward(self, x):
        """Return a copy of ``x`` with every entry ``<= 0`` set to 0.

        ``x`` itself is not modified; the mask is stored for backward().
        """
        self.mask = (x <= 0)
        out = x.copy()
        out[self.mask] = 0
        return out

    def backward(self, dout):
        """Return the gradient w.r.t. the forward input.

        Entries of ``dout`` at positions where the forward input was ``<= 0``
        are zeroed; all others pass through unchanged.

        The result is a new array. Writing into ``dout`` directly would
        silently overwrite the caller's upstream gradient, which breaks any
        code that reuses that array after the call.
        """
        dx = dout.copy()
        dx[self.mask] = 0
        return dx
    
    

In [20]:
# Step-by-step look at the mask used by the ReLU layer.
x = np.array([[1.0, -0.5], [-2.0, 3.0]])
print(x)
# Element-wise comparison: True wherever the entry is <= 0.
mask = (x <= 0)
print(mask)

# In the Relu code above, the positions that are True (<= 0) are the ones set to 0.

[[ 1.  -0.5]
 [-2.   3. ]]
[[False  True]
 [ True False]]


In [38]:
# ReLU layer test.
# `x` is the 2x2 array defined in the mask walkthrough cell above; it is not
# redefined here, so that cell must have been run first.
relu_test = Relu()
a = relu_test.forward(x)
# Bare expressions are displayed because ast_node_interactivity is set to "all"
# in the first cell.
a
dout = np.array([[1, 1], [1, 1]])
b = relu_test.backward(dout)
b

array([[1., 0.],
       [0., 3.]])

array([[1, 0],
       [0, 1]])

### Sigmoid 层

In [40]:
class Sigmoid:
    """Sigmoid activation layer, ``y = 1 / (1 + exp(-x))``.

    The forward output is cached because the derivative can be written purely
    in terms of it: ``dy/dx = y * (1 - y)``.
    """

    def __init__(self):
        # Output of the most recent forward() call (None until then).
        self.out = None

    def forward(self, x):
        """Apply the sigmoid element-wise, cache the result and return it."""
        self.out = 1 / (1 + np.exp(-x))
        return self.out

    def backward(self, dout):
        """Return ``dout * (1 - y) * y`` using the cached forward output ``y``."""
        y = self.out
        return dout * (1.0 - y) * y
    
    

In [42]:
# Sigmoid layer test: forward on a small 2x2 input, then backward with an
# all-ones upstream gradient.
sig_test = Sigmoid()
x = np.array([[1.0, -0.5], [-2.0, 3.0]])
forward = sig_test.forward(x)
print(forward)

# With dout == 1 everywhere, the result is just the local derivative y * (1 - y).
dout = np.array([[1, 1], [1, 1]])
backward = sig_test.backward(dout)
print(backward)


[[0.73105858 0.37754067]
 [0.11920292 0.95257413]]
[[0.19661193 0.23500371]
 [0.10499359 0.04517666]]


# Affine/Softmax层的实现

### Affine 层

In [43]:
# A short note about the bias B:
# X_dot_W stands in for the product X·W of a batch of two samples.
X_dot_W = np.array([[0, 0, 0], [10, 10, 10]])
B = np.array([1, 2, 3])   
X_dot_W
# NumPy broadcasting adds the 1-D B to every row of the 2-D array.
X_dot_W + B

# In forward propagation the bias is added to every sample (row) of X·W.


array([[ 0,  0,  0],
       [10, 10, 10]])

array([[ 1,  2,  3],
       [11, 12, 13]])

array([[1, 2, 3],
       [4, 5, 6]])

array([5, 7, 9])

In [44]:
# Upstream gradient for a batch of two samples (one row per sample).
dY = np.array([[1, 2, 3,], [4, 5, 6]])
dY

# The same bias was added to every row in the forward pass, so its gradient
# is the sum of the upstream gradient over the batch axis (axis 0).
dB = np.sum(dY, axis=0)
dB

array([[1, 2, 3],
       [4, 5, 6]])

array([5, 7, 9])

In [25]:
# 批版本的Affine层
class Affine:
    """Batched affine (fully connected) layer: ``out = x · W + b``.

    Attributes:
        W, b: weight matrix and bias, stored by reference as passed in.
        x:    input of the most recent forward() call.
        dW, db: parameter gradients filled in by backward().
    """

    def __init__(self, W, b):
        self.W, self.b = W, b
        # Filled in by forward() / backward(); None until then.
        self.x = None
        self.dW = self.db = None

    def forward(self, x):
        """Cache ``x`` and return ``np.dot(x, W) + b``."""
        self.x = x
        return np.dot(x, self.W) + self.b

    def backward(self, dout):
        """Store the parameter gradients and return the input gradient.

        Sets ``self.dW = x.T · dout`` and ``self.db`` to ``dout`` summed over
        axis 0 (the bias is shared by every row of the batch), then returns
        ``dout · W.T``.
        """
        self.dW = np.dot(self.x.T, dout)
        self.db = np.sum(dout, axis=0)
        return np.dot(dout, self.W.T)
    
    

### Softmax-with-Loss 层

In [None]:
# 代码实现
class SoftmaxWithLoss:
    """Softmax activation fused with the cross-entropy loss.

    NOTE(review): forward() relies on `softmax` and `cross_entropy_error`,
    which are not defined in the visible cells of this notebook. They
    presumably come from a helper module of the project and must be in scope
    before forward() is called.
    """

    def __init__(self):
        self.loss = None  # loss value from the last forward()
        self.y = None     # softmax output
        self.t = None     # supervised labels (one-hot vectors)

    def forward(self, x, t):
        """Cache labels and softmax output, then return the loss."""
        self.t = t
        self.y = softmax(x)
        self.loss = cross_entropy_error(self.y, self.t)
        return self.loss

    def backward(self, dout=1):
        """Return the gradient of the loss w.r.t. the forward input ``x``.

        For one-hot labels the combined softmax + cross-entropy gradient is
        ``(y - t) / batch_size``, where ``batch_size`` is the first dimension
        of ``t``. The result is scaled by the upstream derivative ``dout``;
        with the default ``dout=1`` the values are unchanged.
        """
        batch_size = self.t.shape[0]
        # Honour the upstream derivative instead of silently assuming it is 1.
        dx = dout * (self.y - self.t) / batch_size
        return dx


# 误差反向传播法的实现

In [None]:
# 应用误差反向传播法的神经网络的实现

import sys, os
sys.path.append(os.pardir)
import numpy as np
from common.layers import *
from common.gradient import numerical_gradient
from collections import OrderedDict


class TwoLayerNet:
    """Two-layer network: Affine -> ReLU -> Affine, followed by a softmax loss.

    Gradients can be obtained either by numerical differentiation
    (`numerical_gradient`) or by backpropagation through the layers
    (`gradient`).

    NOTE(review): `Affine`, `Relu`, `SoftmaxWithLoss` and the module-level
    `numerical_gradient` function must already be in scope. In this cell they
    come from the `common.layers` / `common.gradient` imports.
    """

    def __init__(self, input_size, hidden_size, output_size, weight_init_std=0.01):
        """Create the parameters and the layer stack.

        Args:
            input_size: number of input features.
            hidden_size: number of hidden units.
            output_size: number of output classes.
            weight_init_std: scale applied to the standard-normal initial weights.
        """
        # Initialise weights with scaled Gaussian noise and biases with zeros.
        self.params = {}
        self.params['W1'] = weight_init_std * np.random.randn(input_size, hidden_size)
        self.params['b1'] = np.zeros(hidden_size)
        self.params['W2'] = weight_init_std * np.random.randn(hidden_size, output_size)
        self.params['b2'] = np.zeros(output_size)

        # Build the layers. The OrderedDict keeps them in forward order so
        # that backward can simply iterate over them in reverse.
        # The parameter arrays are handed to the Affine layers directly;
        # presumably the layers keep references, so in-place updates of
        # self.params[...] are seen by the layers -- verify against Affine.
        self.layers = OrderedDict()
        self.layers['Affine1'] = Affine(self.params['W1'], self.params['b1'])
        self.layers['Relu1'] = Relu()
        self.layers['Affine2'] = Affine(self.params['W2'], self.params['b2'])
        self.lastLayer = SoftmaxWithLoss()

    def predict(self, x):
        """Run ``x`` through every layer except the loss layer and return the scores."""
        for layer in self.layers.values():
            x = layer.forward(x)
        return x

    def loss(self, x, t):
        """Return the loss for input data ``x`` and supervised labels ``t``."""
        y = self.predict(x)
        return self.lastLayer.forward(y, t)

    def accuracy(self, x, t):
        """Return the fraction of rows of ``x`` whose predicted class matches ``t``.

        ``t`` may be a 1-D array of class indices or a 2-D one-hot array;
        one-hot labels are converted to indices with argmax.
        """
        y = self.predict(x)
        y = np.argmax(y, axis=1)
        if t.ndim != 1:
            t = np.argmax(t, axis=1)
        accuracy = np.sum(y == t) / float(x.shape[0])
        return accuracy

    def numerical_gradient(self, x, t):
        """Return the parameter gradients computed by numerical differentiation.

        ``x`` is the input data and ``t`` the supervised labels. Inside this
        method the name `numerical_gradient` resolves to the module-level
        function, not to this method.
        """
        # The lambda ignores its argument: the helper is presumably expected
        # to perturb the parameter array in place and re-evaluate the loss.
        loss_W = lambda W: self.loss(x, t)
        grads = {}
        grads['W1'] = numerical_gradient(loss_W, self.params['W1'])
        grads['b1'] = numerical_gradient(loss_W, self.params['b1'])
        grads['W2'] = numerical_gradient(loss_W, self.params['W2'])
        grads['b2'] = numerical_gradient(loss_W, self.params['b2'])
        return grads

    def gradient(self, x, t):
        """Return the parameter gradients computed by backpropagation."""
        # forward: populates the values each layer caches for its backward pass
        self.loss(x, t)

        # backward: start at the loss layer, then visit the layers in reverse
        dout = 1
        dout = self.lastLayer.backward(dout)
        layers = list(self.layers.values())
        layers.reverse()
        for layer in layers:
            dout = layer.backward(dout)

        # Collect the gradients the Affine layers stored during backward.
        grads = {}
        grads['W1'] = self.layers['Affine1'].dW
        grads['b1'] = self.layers['Affine1'].db
        grads['W2'] = self.layers['Affine2'].dW
        grads['b2'] = self.layers['Affine2'].db
        return grads

In [46]:
# 误差反向传播法的梯度确认: 微分法 和 解析法 求差值；
import sys, os
sys.path.append(os.pardir)
import numpy as np
from dataset.mnist import load_mnist
from two_layer_net import TwoLayerNet

# Load the data
(x_train, t_train), (x_test, t_test) = load_mnist(normalize=True, one_hot_label = True)
network = TwoLayerNet(input_size=784, hidden_size=50, output_size=10)
# Only the first three samples are used for the check.
x_batch = x_train[:3]

t_batch = t_train[:3]
grad_numerical = network.numerical_gradient(x_batch, t_batch)
grad_backprop = network.gradient(x_batch, t_batch)

# Mean absolute difference between the two gradients, per parameter
for key in grad_numerical.keys():
    diff = np.average( np.abs(grad_backprop[key] - grad_numerical[key]))
    print(key + ":" + str(diff))
    print('grad_numerical:',grad_numerical[key])
    print('grad_backprop:', grad_backprop[key])

# The result shows that the gradients obtained by numerical differentiation
# and by backpropagation differ only by a very small amount.

W1:4.01485263996341e-10
grad_numerical: [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
grad_backprop: [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
b1:2.429664222683989e-09
grad_numerical: [-3.37947164e-03  4.88785920e-03 -6.94066149e-04 -2.29474873e-03
  0.00000000e+00  0.00000000e+00 -2.67046518e-05 -4.33291440e-03
  2.76217285e-03  4.73053130e-04  3.53758641e-03  4.09163211e-03
  5.89817531e-03 -1.84379453e-03  0.00000000e+00  0.00000000e+00
 -6.21474051e-03  0.00000000e+00 -5.12573816e-03  2.15976792e-03
  0.00000000e+00  5.80148352e-04  0.00000000e+00  6.31418314e-03
 -2.09430849e-03  1.40039111e-03 -3.27826762e-03 -4.21013644e-04
 -1.10998321e-03  3.15677806e-03  4.06703081e-03  5.79197049e-03
  6.90318001e-03  1.88692855e-03  0.00000000e+00 -3.73503519e-03
 -4.610

In [30]:
# 使用误差反向传播法的学习
import sys, os
sys.path.append(os.pardir)
import numpy as np
from dataset.mnist import load_mnist
from two_layer_net import TwoLayerNet
# Load the data
(x_train, t_train), (x_test, t_test) = load_mnist(normalize=True, one_hot_label=True)
network = TwoLayerNet(input_size=784, hidden_size=50, output_size=10)

# Hyperparameters
iters_num = 10000
train_size = x_train.shape[0]
batch_size = 100
learning_rate = 0.1

# Histories collected during training
train_loss_list = []
train_acc_list = []
test_acc_list = []

# Whole number of iterations per epoch. Integer division keeps the modulo
# test below exact: with true division the value is a float, and when
# train_size is not a multiple of batch_size `i % iter_per_epoch == 0`
# would only ever be true for i == 0.
iter_per_epoch = max(train_size // batch_size, 1)

for i in range(iters_num):
    # Draw a random mini-batch of sample indices.
    batch_mask = np.random.choice(train_size, batch_size)
    x_batch = x_train[batch_mask]
    t_batch = t_train[batch_mask]

    # Compute the gradients by backpropagation
    grad = network.gradient(x_batch, t_batch)

    # Gradient-descent update, applied in place to the parameter arrays
    for key in ('W1', 'b1', 'W2', 'b2'):
        network.params[key] -= learning_rate * grad[key]

    # Record the loss on the same mini-batch after the update.
    loss = network.loss(x_batch, t_batch)
    train_loss_list.append(loss)

    # Once per epoch, evaluate accuracy on the full training and test sets.
    if i % iter_per_epoch == 0:
        train_acc = network.accuracy(x_train, t_train)
        test_acc = network.accuracy(x_test, t_test)
        train_acc_list.append(train_acc)
        test_acc_list.append(test_acc)
        print(train_acc, test_acc)


0.05608333333333333 0.0538
0.9048833333333334 0.9068
0.92255 0.9233
0.9353666666666667 0.9334
0.9452166666666667 0.9441
0.9520666666666666 0.9502
0.9578 0.9533
0.9617666666666667 0.9576
0.9637166666666667 0.959
0.96805 0.9627
0.9703 0.9653
0.97155 0.9647
0.9744666666666667 0.9674
0.9746666666666667 0.9675
0.9762833333333333 0.9672
0.97805 0.9703
0.9792 0.9692
