In [1]:
from mxnet import nd 
def pure_batch_norm(X,gamma,beta,eps=1e-5):
    assert len(X.shape) in (2,4)
    # 全连接：batch_size x feature 
    if len(X.shape)==2:
        mean=X.mean(axis=0)
        variance=((X-mean)**2).mean(axis=0) 
    else:
        mean=X.mean(axis=(0,2,3),keepdims=True)
        variance=((X-mean)**2).mean(axis=(0,2,3),keepdims=True) 
    # 均一化
    X_hat= (X-mean)/nd.sqrt(variance+eps) 
    # 拉升和偏移 
    return gamma.reshape(mean.shape)*X_hat+beta.reshape(mean.shape) 

In [2]:
A=nd.arange(6).reshape((3,2))
A


[[0. 1.]
 [2. 3.]
 [4. 5.]]
<NDArray 3x2 @cpu(0)>

In [3]:
pure_batch_norm(A,gamma=nd.array([1,11]),beta=nd.array([0,0]))


[[ -1.2247427 -13.472169 ]
 [  0.          0.       ]
 [  1.2247427  13.472169 ]]
<NDArray 3x2 @cpu(0)>

下面我们定义二维卷积网络层的输入是这样的。

In [4]:
B=nd.arange(18).reshape((1,2,3,3))
B


[[[[ 0.  1.  2.]
   [ 3.  4.  5.]
   [ 6.  7.  8.]]

  [[ 9. 10. 11.]
   [12. 13. 14.]
   [15. 16. 17.]]]]
<NDArray 1x2x3x3 @cpu(0)>

结果也如预期那样，我们对每个通道做了归一化

In [5]:
pure_batch_norm(B,gamma=nd.array([1,11]),beta=nd.array([0,0]))


[[[[ -1.5491922   -1.1618942   -0.7745961 ]
   [ -0.38729805   0.           0.38729805]
   [  0.7745961    1.1618942    1.5491922 ]]

  [[-17.041115   -12.780836    -8.520557  ]
   [ -4.2602787    0.           4.2602787 ]
   [  8.520557    12.780836    17.041115  ]]]]
<NDArray 1x2x3x3 @cpu(0)>

# 批量归一化 
事实上，我们测试时还是需要继续使用批量归一化的，只是需要做些改动。
当训练数据很大时。我们用移动平均的方法来近似计算。
（参见实现中的moving_mean 和 moving_variance) 

In [6]:
def batch_norm(X,gamma,beta,is_training,moving_mean,moving_variance,eps=1e-5,moving_momentum=0.9):
    assert len(X.shape) in (2,4) 
    # 全连接：batch_size x feature 
    if len(X.shape)==2:
        # 每个输入纬度在样本上的平均和方差 
        mean=X.mean(axis=0)
        variance=((X-mean)**2).mean(axis=0)
    #2D 卷积；batch_size * channel * height * width 
    else:
        # 对每个通道算均值和方差，需要保持4D形状使得可以正确的广播 
        mean=X.mean(axis=(0,2,3),keepdims=True) 
        variance=((X-mean)**2).mean(axis=(0,2,3),keepdims=True) 
        # 变形使得可以正确的广播 
        moving_mean=moving_mean.reshape(mean.shape)
        moving_variance=moving_variance.reshape(mean.shape) 
    # 均一化 
    if is_training:
        X_hat=(X-mean)/nd.sqrt(variance+eps) 
        # !!! 更新全局的均值和方差 
        moving_mean[:]=moving_momentum+moving_mean+( 
        1.0*-moving_momentum)*variance 
    else:
        #!!! 测试阶段使用全局的均值和方差 
        X_hat=(X-moving_mean)/nd.sqrt(moving_variance+eps) 
    # 拉升和偏移 
    return gamma.reshape(mean.shape)*X_hat+beta.reshape(mean.shape) 
    

# 定义模型 
我们尝试使用GPU运行本教程代码

In [7]:
import sys 
sys.path.append('..') 
import utils 
ctx=utils.try_gpu()
ctx 

aaaaaaaa


cpu(0)

先定义参数 

In [8]:
weight_scale=0.01 
# output channels=20,kernel=(5,5) 
c1=20 
W1=nd.random.normal(shape=(c1,1,5,5),scale=weight_scale,ctx=ctx) 
b1=nd.zeros(c1,ctx=ctx) 
# batch norm 1 
gamma1=nd.random.normal(shape=c1,scale=weight_scale,ctx=ctx) 
beta1=nd.random.normal(shape=c1,scale=weight_scale,ctx=ctx) 
moving_mean1=nd.zeros(c1,ctx=ctx) 
mvoing_variance1=nd.zeros(c1,ctx=ctx) 
# output channels = 50 ,kernel=(3,3) 
c2=50 
W2=nd.random_normal(shape=(c2,c1,3,3),scale=weight_scale,ctx=ctx)
b2=nd.zeros(c2,ctx=ctx) 

# batch norm 2 
gamma2=nd.random.normal(shape=c2,scale=weight_scale,ctx=ctx) 
beta2=nd.random.normal(shape=c2,scale=weight_scale,ctx=ctx) 
moving_mean2=nd.zeros(c2,ctx=ctx )
moving_variance2=nd.zeros(c2,ctx=ctx) 

# output dim = 128 
o3=128 
W3=nd.random.normal(shape=(1250,o3),scale=weight_scale,ctx=ctx) 
b3=nd.zeros(o3,ctx=ctx) 

# output dim = 10 
W4=nd.random_normal(shape=(W3.shape[1],10),scale=weight_scale,
                   ctx=ctx) 
b4=nd.zeros(W4.shape[1],ctx=ctx) 

# 注意这里moving_* 是不需要更新的恶
params=[W1,b1,gamma1,beta1,
       W2,b2,gamma2,beta2,
       W3,b3,W4,b4]
for param in params: 
    param.attach_grad()



下面定义模型。我们添加了批量归一化层。特别是注意我们添加的位置：在
卷积层后，在激活函数前 

In [9]:
def net(X,is_training=False,verbose=False):
    X=X.as_in_context(W1.context)
    # 第一层卷积 
    h1_conv=nd.Convolution(data=X,weight=W1,bias=b1,kernel=W1.shape[2:],num_filter=c1) 
    ### 添加了批量归一化层 
    h1_bn=batch_norm(h1_conv,gamma1,beta1,is_training,
                    moving_mean1,mvoing_variance1)
    h1_activation=nd.relu(h1_bn)
    h1=nd.Pooling(
        data=h1_activation,pool_type='max',kernel=(2,2),stride=(2,2))
    #  第二层 
    h2_conv=nd.Convolution(
        data=h1,weight=W2,bias=b2,kernel=W2.shape[2:],num_filter=c2) 
    ## 添加了批量归一化 
    h2_bn=batch_norm(h2_conv,gamma2,beta2,is_training,
                    moving_mean2,moving_variance2) 
    h2_activation=nd.relu(h2_bn) 
    h2=nd.Pooling(data=h2_activation,pool_type='max',kernel=(2,2),stride=(2,2))
    h2=nd.flatten(h2) 
    # 第一层全连接 
    h3_linear=nd.dot(h2,W3)+b3 
    h3=nd.relu(h3_linear) 
    # 第二层全连接 
    h4_linear=nd.dot(h3,W4)+b4 
    if verbose:
        print('lst conv block:',h1.shape) 
        print('2nd conv block:',h2.shape) 
        print('lst dense:',h3.shape) 
        print('2nd dense:',h4_linear.shape) 
        print('output:',h4_linear) 
    return h4_linear 
    

In [10]:
from mxnet import autograd
from mxnet import gluon 
from time import * 
batch_size=256 
train_data,test_data=utils.load_data_fashion_mnist(batch_size) 
softmax_cross_entropy=gluon.loss.SoftmaxCrossEntropyLoss()
learning_rate=0.2 
start=time()  
for epoch in range(5):
    train_loss=0.
    train_acc= 0. 
    for data,label in train_data:
        label =label.as_in_context(ctx)
        with autograd.record():
            output=net(data,is_training=True)
            loss=softmax_cross_entropy(output,label)
        loss.backward()
        utils.SGD(params,learning_rate/batch_size) 
        train_loss+=nd.mean(loss).asscalar()
        train_acc+=utils.accuracy(output,label)
    test_acc=utils.evaluate_accuracy(test_data,net,ctx) 
    print("Epoch %d . loss :%f ,train acc %f,Test acc%f "% 
          (epoch,train_loss/len(train_data),train_acc/len(train_data),test_acc)
          ) 
end=time()  
print(end-start) 

Epoch 0 . loss :2.198269 ,train acc 0.150574,Test acc0.100060 
Epoch 1 . loss :0.644039 ,train acc 0.756193,Test acc0.100060 
Epoch 2 . loss :0.408433 ,train acc 0.848708,Test acc0.100060 
Epoch 3 . loss :0.347054 ,train acc 0.870777,Test acc0.100060 
Epoch 4 . loss :0.312347 ,train acc 0.884732,Test acc0.100060 
191.68880820274353


# 总结 
相比 卷积神经网络--从0 开始来说。通过加入批量归一化层，即使是同样的参数，
测试精度也有明显的提升，尤其是最开始几轮 