In [5]:
from mxnet import nd
from mxnet.gluon import nn
from mxnet import gluon


def puer_batch_norm(x, gamma, beta, eps=1e-5):
    """Batch-normalize ``x`` using only the statistics of the current batch.

    Args:
        x: 2-D input (fully connected case; statistics are taken over
            axis 0) or 4-D input (2-D convolution case; statistics are
            taken over axes 0, 2 and 3 with ``keepdims=True``, i.e. one
            mean/variance per entry of axis 1).
        gamma: scale, reshaped to the shape of the batch mean before use.
        beta: shift, reshaped the same way as ``gamma``.
        eps: constant added to the variance before taking the square root.

    Returns:
        ``gamma * (x - mean) / sqrt(variance + eps) + beta``.

    Raises:
        AssertionError: if ``x`` is neither 2-D nor 4-D.
    """
    rank = len(x.shape)
    assert rank in (2, 4)

    # Pick the reduction once so mean and variance are guaranteed to agree.
    if rank == 4:
        reduce_args = {'axis': (0, 2, 3), 'keepdims': True}
    else:
        reduce_args = {'axis': 0}

    mean = x.mean(**reduce_args)
    centered = x - mean
    variance = (centered ** 2).mean(**reduce_args)

    # Normalize, then apply the scale and shift.
    x_hat = centered / nd.sqrt(variance + eps)
    stat_shape = mean.shape
    return gamma.reshape(stat_shape) * x_hat + beta.reshape(stat_shape)

In [6]:
# Toy 2-D input: the values 0..5 arranged as 3 rows x 2 columns.
a = nd.arange(6).reshape((3,2))

In [7]:
# Normalize `a` with scale gamma=1 and shift beta=0, exercising the 2-D branch.
puer_batch_norm(a, gamma=nd.array([1,1]), beta=nd.array([0,0]))


[[-1.2247427 -1.2247427]
 [ 0.         0.       ]
 [ 1.2247427  1.2247427]]
<NDArray 3x2 @cpu(0)>

In [9]:
# Toy 4-D input of shape (1, 2, 3, 3) filled with the values 0..17.
b = nd.arange(18).reshape((1, 2, 3,3))
print(b)


[[[[ 0.  1.  2.]
   [ 3.  4.  5.]
   [ 6.  7.  8.]]

  [[ 9. 10. 11.]
   [12. 13. 14.]
   [15. 16. 17.]]]]
<NDArray 1x2x3x3 @cpu(0)>


In [10]:
# Same call on the 4-D input: statistics are reduced over axes (0, 2, 3),
# so each of the two slices along axis 1 is normalized separately.
puer_batch_norm(b, gamma=nd.array([1,1]), beta=nd.array([0,0]))


[[[[-1.5491922  -1.1618942  -0.7745961 ]
   [-0.38729805  0.          0.38729805]
   [ 0.7745961   1.1618942   1.5491922 ]]

  [[-1.5491922  -1.1618942  -0.7745961 ]
   [-0.38729805  0.          0.38729805]
   [ 0.7745961   1.1618942   1.5491922 ]]]]
<NDArray 1x2x3x3 @cpu(0)>

In [11]:
def batch_norm(x, gamma, beta, is_training, moving_mean,
               moving_variance, eps=1e-5, moving_momentum=0.9):
    """Batch normalization with moving statistics for inference.

    In training mode the input is normalized with the statistics of the
    current batch, and ``moving_mean`` / ``moving_variance`` are updated
    in place as exponential moving averages. In inference mode the stored
    moving statistics are used and no batch statistics are computed.

    Args:
        x: 2-D input (statistics over axis 0) or 4-D input (statistics
            over axes 0, 2 and 3, one value per entry of axis 1).
        gamma: scale, reshaped to the statistic shape before use.
        beta: shift, reshaped the same way as ``gamma``.
        is_training: selects batch statistics (True) or moving
            statistics (False).
        moving_mean: running mean; written in place when training.
        moving_variance: running variance; written in place when training.
        eps: constant added to the variance before the square root.
        moving_momentum: weight kept from the previous moving value;
            ``1 - moving_momentum`` is given to the new batch value.

    Returns:
        ``gamma * x_hat + beta`` where ``x_hat`` is the normalized input.

    Raises:
        AssertionError: if ``x`` is neither 2-D nor 4-D.
    """
    rank = len(x.shape)
    assert rank in (2, 4)

    # The statistic shape follows from x.shape alone, so inference does not
    # need to compute a batch mean just to learn it.
    if rank == 2:
        reduce_args = {'axis': 0}
        stat_shape = (x.shape[1],)
    else:
        reduce_args = {'axis': (0, 2, 3), 'keepdims': True}
        stat_shape = (1, x.shape[1], 1, 1)
        # Reshape so the moving statistics broadcast against 4-D input.
        # The in-place writes below rely on reshape sharing storage with
        # the caller's arrays, exactly as the original code did.
        moving_mean = moving_mean.reshape(stat_shape)
        moving_variance = moving_variance.reshape(stat_shape)

    if is_training:
        mean = x.mean(**reduce_args)
        centered = x - mean
        variance = (centered ** 2).mean(**reduce_args)
        x_hat = centered / nd.sqrt(variance + eps)
        # Exponential moving averages, written in place via [:] so the
        # caller's arrays see the update.
        moving_mean[:] = moving_momentum * moving_mean + (
            1.0 - moving_momentum) * mean
        moving_variance[:] = moving_momentum * moving_variance + (
            1.0 - moving_momentum) * variance
    else:
        x_hat = (x - moving_mean) / nd.sqrt(moving_variance + eps)

    return gamma.reshape(stat_shape) * x_hat + beta.reshape(stat_shape)

In [12]:
import utils
# Device context on which the parameters below are allocated.
# NOTE(review): presumably a GPU when one is available, otherwise CPU —
# confirm against utils.try_gpu.
ctx = utils.try_gpu()

In [14]:
# Scale passed to every random-normal initializer below.
weight_scale = .01

# --- Conv block 1: c1 filters of size 5x5 over a single input channel ---
c1 = 20
w1 = nd.random_normal(shape=(c1, 1, 5, 5),
                      scale=weight_scale, ctx=ctx)
b1 = nd.zeros(c1, ctx=ctx)

# Batch-norm scale/shift plus the moving statistics for block 1.
gamma1 = nd.random_normal(shape=c1, scale=weight_scale, ctx=ctx)
beta1 = nd.random_normal(shape=c1, scale=weight_scale, ctx=ctx)
moving_mean1 = nd.zeros(c1, ctx=ctx)
moving_variance1 = nd.zeros(c1, ctx=ctx)

# --- Conv block 2: c2 filters of size 3x3 over the c1 channels ---
c2 = 50
w2 = nd.random_normal(shape=(c2, c1, 3, 3),
                      scale=weight_scale, ctx=ctx)
b2 = nd.zeros(c2, ctx=ctx)

# Batch-norm scale/shift plus the moving statistics for block 2.
gamma2 = nd.random_normal(shape=c2, scale=weight_scale, ctx=ctx)
beta2 = nd.random_normal(shape=c2, scale=weight_scale, ctx=ctx)
moving_mean2 = nd.zeros(c2, ctx=ctx)
moving_variance2 = nd.zeros(c2, ctx=ctx)

# --- Dense layer 1 ---
# NOTE(review): 1250 is presumably c2 * 5 * 5, the flattened size of the
# second conv block's output for 28x28 inputs — confirm against the data.
o3 = 128
w3 = nd.random_normal(shape=(1250, o3), scale=weight_scale, ctx=ctx)
b3 = nd.zeros(o3, ctx=ctx)

# --- Dense layer 2 (output): o3 inputs -> 10 outputs ---
w4 = nd.random_normal(
    shape=(w3.shape[1], 10), scale=weight_scale, ctx=ctx)
b4 = nd.zeros(w4.shape[1], ctx=ctx)

# Parameters trained by gradient descent. The moving statistics are left
# out on purpose: batch_norm updates them in place, not via gradients.
# Fix: the first bias entry was `b2`, so b1 never had a gradient attached
# and b2 appeared (and was stepped) twice.
params = [w1, b1, gamma1, beta1,
          w2, b2, gamma2, beta2,
          w3, b3, w4, b4]

for param in params:
    param.attach_grad()

In [17]:
def net(x, is_training=False, verbose=False):
    """Forward pass: two conv/batch-norm/ReLU/max-pool blocks, then two dense layers.

    Args:
        x: input batch; it is copied to the context of ``w1`` first.
        is_training: forwarded to ``batch_norm`` to choose between batch
            statistics (and moving-stat updates) and stored moving statistics.
        verbose: when True, print the output shape of each stage.

    Returns:
        The output of the last dense layer (``h3 . w4 + b4``), with no
        activation applied.
    """
    x = x.as_in_context(w1.context)

    # Block 1: convolution -> batch norm -> ReLU -> 2x2 max pooling.
    h1_conv = nd.Convolution(data=x, weight=w1, bias=b1,
                             kernel=w1.shape[2:], num_filter=c1)
    h1_bn = batch_norm(
        h1_conv, gamma1, beta1, is_training,
        moving_mean1, moving_variance1)
    h1_activation = nd.relu(h1_bn)
    h1 = nd.Pooling(data=h1_activation, pool_type='max',
                    kernel=(2,2), stride=(2,2))

    # Block 2: same structure, then flatten for the dense layers.
    h2_conv = nd.Convolution(data=h1, weight=w2, bias=b2,
                             kernel=w2.shape[2:], num_filter=c2)
    h2_bn = batch_norm(h2_conv, gamma2, beta2, is_training,
                       moving_mean2, moving_variance2)
    h2_activation = nd.relu(h2_bn)
    h2 = nd.Pooling(data=h2_activation, pool_type='max',
                    kernel=(2,2), stride=(2, 2))
    h2 = nd.flatten(h2)

    # Dense layer 1 with ReLU.
    h3_linear = nd.dot(h2, w3) + b3
    h3 = nd.relu(h3_linear)

    # Dense layer 2 (output); no activation is applied here.
    h4_linear = nd.dot(h3, w4) + b4
    if verbose:
        print('1st conv block: ', h1.shape)
        print('2nd conv block: ', h2.shape)
        print('1st dense: ', h3.shape)
        # Fix: this printed `h4.shape`, but no `h4` exists in this function,
        # so verbose=True raised NameError.
        print('2nd dense: ', h4_linear.shape)
    return h4_linear

In [18]:
from mxnet import autograd
from mxnet import gluon

# Data pipeline, loss and optimizer settings.
batch_size = 256
train_data, test_data = utils.load_data_fashion_mnist(batch_size)
softmax_cross_entropy = gluon.loss.SoftmaxCrossEntropyLoss()
learning_rate = 0.2

num_epochs = 5
summary_format = 'Epoch %d, loss: %f, train acc :%f, test acc: %f'

for epoch in range(num_epochs):
    # Running sums over the batches of this epoch.
    train_loss, train_acc = 0., 0.
    for data, label in train_data:
        label = label.as_in_context(ctx)
        # Record the forward pass so backward() can compute gradients.
        with autograd.record():
            output = net(data, is_training=True)
            loss = softmax_cross_entropy(output, label)
        loss.backward()
        # NOTE(review): the step size is divided by batch_size, presumably
        # because the gradients are summed over the batch — confirm
        # against utils.SGD.
        utils.SGD(params, learning_rate/batch_size)

        batch_loss = nd.mean(loss).asscalar()
        batch_acc = utils.accuracy(output, label)
        train_loss += batch_loss
        train_acc += batch_acc

    # Evaluate with net's default is_training=False.
    test_acc = utils.evaluate_accuracy(test_data, net, ctx)
    num_batches = len(train_data)
    print(summary_format % (
        epoch, train_loss/num_batches,
        train_acc/num_batches, test_acc))

Epoch 0, loss: 2.057808, train acc :0.223105, test acc: 0.705762
Epoch 1, loss: 0.582053, train acc :0.778773, test acc: 0.833594
Epoch 2, loss: 0.407154, train acc :0.849723, test acc: 0.862695
Epoch 3, loss: 0.348530, train acc :0.870750, test acc: 0.866797
Epoch 4, loss: 0.316601, train acc :0.882724, test acc: 0.884961
