In [None]:
import torch
import numpy as np
import torch.nn as nn

# Pin both random number generators to one shared seed so that reruns draw
# identical layer weights and identical input batches.
SEED = 1
torch.manual_seed(SEED)
np.random.seed(SEED)

In [8]:
class MLP(nn.Module):
    """Stack of equal-width, bias-free linear layers for watching activation scale.

    Every layer applies ``Linear -> (optional BatchNorm1d) -> ReLU``. ``forward``
    prints the standard deviation of the activations after each layer so the
    effect of different weight initializations can be compared.

    Args:
        neural_num: Number of input and output features of every linear layer.
        layers: Number of stacked layers.
        do_bn: When True, apply the matching ``BatchNorm1d`` after each linear layer.
    """

    def __init__(self, neural_num, layers=100, do_bn=False):
        super(MLP, self).__init__()
        self.linears = nn.ModuleList([nn.Linear(neural_num, neural_num, bias=False) for i in range(layers)])
        # The BatchNorm layers are created regardless of ``do_bn``; the flag only
        # decides whether ``forward`` applies them.
        self.bns = nn.ModuleList([nn.BatchNorm1d(neural_num) for i in range(layers)])
        self.neural_num = neural_num
        self.do_bn = do_bn

    def forward(self, x):
        """Run ``x`` through the stack, printing the activation std after each layer.

        Stops early (and prints a notice) at the first layer whose output std is
        NaN.

        Args:
            x: Input tensor whose last dimension is ``neural_num``.

        Returns:
            The activations of the last layer that was executed.
        """
        for i, (linear, bn) in enumerate(zip(self.linears, self.bns)):
            x = linear(x)
            if self.do_bn:
                x = bn(x)
            x = torch.relu(x)

            # Compute the std once and reuse it for both the NaN check and the log line.
            layer_std = x.std()
            if torch.isnan(layer_std):
                print("output is nan in {} layers".format(i))
                break
            print("layers:{}, std:{}".format(i, layer_std.item()))

        return x

    def initialize(self, mode, std_init=1):
        """Re-initialize the weight of every ``nn.Linear`` submodule.

        Args:
            mode: ``"normal"`` draws weights from a zero-mean normal distribution
                with standard deviation ``std_init``; ``"kaiming"`` uses
                ``nn.init.kaiming_normal_`` with its default arguments.
            std_init: Standard deviation used when ``mode == "normal"``.

        Raises:
            ValueError: If ``mode`` is not one of the supported names. The check
                runs before any weight is touched, so the model is left unchanged.
        """
        # Validate once up front instead of reporting the same problem per layer
        # and leaving the network silently un-initialized.
        if mode not in ("normal", "kaiming"):
            raise ValueError("不支持{}输入".format(mode))

        for m in self.modules():
            if isinstance(m, nn.Linear):
                if mode == "normal":
                    # method 1: normal with mean=0, std=std_init
                    nn.init.normal_(m.weight, std=std_init)
                else:
                    # method 2: kaiming
                    nn.init.kaiming_normal_(m.weight)

In [13]:
if __name__ == "__main__":
    # Experiment driver: build the MLP, optionally re-initialize its weights,
    # feed one random batch through it and print the final activations.
    # Toggle the commented lines below to reproduce each numbered case.

    neural_nums = 256
    layer_nums = 100
    batch_size = 16

    net = MLP(neural_nums, layer_nums, do_bn=False)      # cases without BN: 1. no explicit init; 2. normal_ init; 3. kaiming init
#     net = MLP(neural_nums, layer_nums, do_bn=True)        # cases with BN: 4. BN + no explicit init; 5. BN + normal; 6. BN + kaiming; 7. BN + normal with std=1000
#     net.initialize("normal", std_init=1)
#     net.initialize("normal", std_init=1000)
#     net.initialize("kaiming")

    # The model is built before the inputs are drawn; both consume the global
    # torch RNG, so this order determines the sampled values.
    inputs = torch.randn((batch_size, neural_nums))  # normal: mean=0, std=1

    output = net(inputs)
    print(output)

layers:0, std:0.5848153233528137
layers:1, std:0.5827966928482056
layers:2, std:0.5787407755851746
layers:3, std:0.583806574344635
layers:4, std:0.5799300074577332
layers:5, std:0.5819533467292786
layers:6, std:0.5800478458404541
layers:7, std:0.5818994045257568
layers:8, std:0.5750249028205872
layers:9, std:0.5772916674613953
layers:10, std:0.5853444337844849
layers:11, std:0.5787179470062256
layers:12, std:0.5728684663772583
layers:13, std:0.5785820484161377
layers:14, std:0.5855928063392639
layers:15, std:0.5862574577331543
layers:16, std:0.5794041156768799
layers:17, std:0.5808504223823547
layers:18, std:0.5883684754371643
layers:19, std:0.5889952778816223
layers:20, std:0.5744621753692627
layers:21, std:0.5760332942008972
layers:22, std:0.5747781991958618
layers:23, std:0.5878269672393799
layers:24, std:0.5815837383270264
layers:25, std:0.5916622281074524
layers:26, std:0.5802280306816101
layers:27, std:0.5706748962402344
layers:28, std:0.579828143119812
layers:29, std:0.580768883

## 观察神经网络神经元数据尺度变化

<font  size=12 face="黑体">
    
有无BN层 | 无初始化 | N(0, 1) | Kaiming初始化 | N(0, 1000²)
:-: | :-: | :-: | :-: | :-:
无BN层| 1e-19 | NaN in 35 layers | 0.4 | NaN in 8 layers| 
有BN层 | 0.57 | 0.57 | 0.57 |0.57|
    
</font>