In [31]:
import torch
from torch import nn

# Run on the GPU when CUDA is available; otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")


def batch_norm(is_training, X, gamma, beta, moving_mean, moving_var, eps, momentum):
    """Batch-normalize ``X`` and return the (possibly updated) running statistics.

    Args:
        is_training: If true, normalize with the statistics of the current
            mini-batch and update the running statistics. If false, normalize
            with the supplied ``moving_mean`` / ``moving_var`` and return them
            unchanged.
        X: Input tensor. In training mode it must have rank 2 (statistics are
            taken over dim 0) or rank 4 (statistics are taken over dims 0, 2
            and 3, i.e. one value per index of dim 1).
        gamma: Scale applied to the normalized input (broadcast against ``X``).
        beta: Shift added after scaling (broadcast against ``X``).
        moving_mean: Running mean used for inference.
        moving_var: Running variance used for inference.
        eps: Constant added to the variance before the square root.
        momentum: Weight kept on the previous running statistic; the current
            batch statistic is weighted by ``1 - momentum``.

    Returns:
        Tuple ``(Y, moving_mean, moving_var)`` where ``Y = gamma * X_hat + beta``.

    Raises:
        AssertionError: In training mode, if ``X`` is neither rank 2 nor rank 4.
    """
    if not is_training:
        # Inference: use the running statistics that were passed in.
        X_hat = (X - moving_mean) / torch.sqrt(moving_var + eps)
    else:
        # Only rank-2 and rank-4 inputs are supported.
        assert len(X.shape) in (2, 4)
        if len(X.shape) == 2:
            # Rank 2: per-column statistics over dim 0.
            mean = X.mean(dim=0)
            var = ((X - mean) ** 2).mean(dim=0)
        else:
            # Rank 4: one statistic per index of dim 1. keepdim=True keeps the
            # (1, C, 1, 1) shape so the result broadcasts against X.
            mean = X.mean(dim=(0, 2, 3), keepdim=True)
            var = ((X - mean) ** 2).mean(dim=(0, 2, 3), keepdim=True)
        # Training: normalize with the statistics of the current batch.
        X_hat = (X - mean) / torch.sqrt(var + eps)
        # Exponential moving average of the batch statistics. Done without
        # gradient tracking so the running statistics never hold on to the
        # autograd graph of this (or any earlier) training step.
        with torch.no_grad():
            moving_mean = momentum * moving_mean + (1.0 - momentum) * mean
            moving_var = momentum * moving_var + (1.0 - momentum) * var
    # Scale and shift.
    Y = gamma * X_hat + beta
    return Y, moving_mean, moving_var

In [32]:
class BatchNorm(nn.Module):
    """Batch-normalization layer built on the module-level ``batch_norm`` function.

    Args:
        num_features: Number of features (``num_dims == 2``) or channels
            (any other ``num_dims``) to normalize.
        num_dims: ``2`` selects parameters of shape ``(1, num_features)``;
            any other value selects ``(1, num_features, 1, 1)``.

    Attributes:
        gamma: Learnable scale, initialised to ones.
        beta: Learnable shift, initialised to zeros.
        moving_mean: Running mean (buffer), initialised to zeros.
        moving_var: Running variance (buffer), initialised to ones.
    """

    def __init__(self, num_features, num_dims):
        super().__init__()
        if num_dims == 2:
            shape = (1, num_features)
        else:
            shape = (1, num_features, 1, 1)
        # Scale and shift take part in gradient descent; they start at 1 and 0
        # so the layer initially leaves the normalized input unchanged.
        self.gamma = nn.Parameter(torch.ones(shape))
        self.beta = nn.Parameter(torch.zeros(shape))
        # Running statistics are not trained. Registering them as buffers makes
        # them part of state_dict() and lets Module.to()/cuda()/cpu() move them.
        # The variance starts at 1 (not 0) so that inference before any
        # training step does not divide by sqrt(eps).
        self.register_buffer("moving_mean", torch.zeros(shape))
        self.register_buffer("moving_var", torch.ones(shape))

    def forward(self, x):
        """Normalize ``x``; in training mode also update the running statistics."""
        # Safety net for callers that moved the input but not the module.
        if self.moving_mean.device != x.device:
            self.moving_mean = self.moving_mean.to(x.device)
            self.moving_var = self.moving_var.to(x.device)
        # self.training is True by default and becomes False after .eval().
        # Store the running statistics returned by batch_norm.
        Y, self.moving_mean, self.moving_var = batch_norm(
            self.training, x, self.gamma, self.beta, self.moving_mean, self.moving_var, eps=1e-5, momentum=0.9)
        return Y

In [33]:
import d2lzh_pytorch as d2l

# Two convolution stages followed by three fully connected layers; every
# hidden layer is followed by the hand-written BatchNorm and a sigmoid.
net = nn.Sequential(
    # Convolution stage 1
    nn.Conv2d(in_channels=1, out_channels=6, kernel_size=5),
    BatchNorm(num_features=6, num_dims=4),
    nn.Sigmoid(),
    nn.MaxPool2d(kernel_size=2, stride=2),
    # Convolution stage 2
    nn.Conv2d(in_channels=6, out_channels=16, kernel_size=5),
    BatchNorm(num_features=16, num_dims=4),
    nn.Sigmoid(),
    nn.MaxPool2d(kernel_size=2, stride=2),
    # Flatten before the fully connected layers
    d2l.FlattenLayer(),
    nn.Linear(in_features=16 * 4 * 4, out_features=120),
    BatchNorm(num_features=120, num_dims=2),
    nn.Sigmoid(),
    nn.Linear(in_features=120, out_features=84),
    BatchNorm(num_features=84, num_dims=2),
    nn.Sigmoid(),
    # Output layer: 10 scores
    nn.Linear(in_features=84, out_features=10),
)

In [34]:
# Load Fashion-MNIST and train the network with Adam.
batch_size = 256
# Raw string: the Windows path contains backslashes (\p, \D, \c, \d, \F) that
# are invalid escape sequences in a normal string literal.
train_iter, test_iter = d2l.load_data_fashion_mnist(
    batch_size,
    root=r'F:\python_code\pycharm_project\DiveIntoDL\chapter2\data\FashionMNIST')
lr, num_epochs = 0.001, 5
optimizer = torch.optim.Adam(net.parameters(), lr=lr)
d2l.train_ch5(net, train_iter, test_iter, batch_size, optimizer, device, num_epochs)

training on  cuda
epoch 1, loss 0.9616, train acc 0.803, test acc 0.846, time 8.9 sec
epoch 2, loss 0.4449, train acc 0.867, test acc 0.788, time 8.1 sec
epoch 3, loss 0.3597, train acc 0.880, test acc 0.844, time 8.8 sec
epoch 4, loss 0.3241, train acc 0.889, test acc 0.835, time 8.3 sec
epoch 5, loss 0.3046, train acc 0.893, test acc 0.876, time 8.7 sec


In [35]:
# Show the scale (gamma) and shift (beta) of the layer at index 1 of net,
# each flattened to one dimension.
net[1].gamma.view(-1), net[1].beta.view(-1)

(tensor([0.9761, 1.0390, 1.1254, 1.0454, 1.3530, 0.9203], device='cuda:0',
        grad_fn=<ViewBackward0>),
 tensor([-0.4698, -0.3157,  0.1119,  0.4376,  0.1128, -0.1553], device='cuda:0',
        grad_fn=<ViewBackward0>))

In [36]:
# Same architecture as before, but using PyTorch's built-in batch-norm layers
# (BatchNorm2d after convolutions, BatchNorm1d after linear layers).
net = nn.Sequential(
    # Convolution stage 1
    nn.Conv2d(in_channels=1, out_channels=6, kernel_size=5),
    nn.BatchNorm2d(num_features=6),
    nn.Sigmoid(),
    nn.MaxPool2d(kernel_size=2, stride=2),
    # Convolution stage 2
    nn.Conv2d(in_channels=6, out_channels=16, kernel_size=5),
    nn.BatchNorm2d(num_features=16),
    nn.Sigmoid(),
    nn.MaxPool2d(kernel_size=2, stride=2),
    # Flatten before the fully connected layers
    d2l.FlattenLayer(),
    nn.Linear(in_features=16 * 4 * 4, out_features=120),
    nn.BatchNorm1d(num_features=120),
    nn.Sigmoid(),
    nn.Linear(in_features=120, out_features=84),
    nn.BatchNorm1d(num_features=84),
    nn.Sigmoid(),
    # Output layer: 10 scores
    nn.Linear(in_features=84, out_features=10),
)

In [37]:
# Load Fashion-MNIST and train the network with Adam.
batch_size = 256
# Raw string: the Windows path contains backslashes (\p, \D, \c, \d, \F) that
# are invalid escape sequences in a normal string literal.
train_iter, test_iter = d2l.load_data_fashion_mnist(
    batch_size,
    root=r'F:\python_code\pycharm_project\DiveIntoDL\chapter2\data\FashionMNIST')
lr, num_epochs = 0.001, 5
optimizer = torch.optim.Adam(net.parameters(), lr=lr)
d2l.train_ch5(net, train_iter, test_iter, batch_size, optimizer, device, num_epochs)

training on  cuda
epoch 1, loss 0.9909, train acc 0.792, test acc 0.786, time 8.4 sec
epoch 2, loss 0.4537, train acc 0.865, test acc 0.812, time 7.9 sec
epoch 3, loss 0.3689, train acc 0.878, test acc 0.829, time 7.8 sec
epoch 4, loss 0.3315, train acc 0.886, test acc 0.871, time 7.9 sec
epoch 5, loss 0.3108, train acc 0.892, test acc 0.876, time 8.0 sec
