In [1]:
import torch
import numpy as np
import torch.nn as nn

# Seed both PyTorch's and NumPy's global RNGs with the same value so that
# re-running the notebook from a fresh kernel reproduces the same numbers.
SEED = 1
torch.manual_seed(SEED)
np.random.seed(SEED)

In [2]:
class MLP(nn.Module):
    """Deep fully connected ReLU network that reports activation scale per layer.

    The network stacks ``layers`` bias-free ``nn.Linear(neural_num, neural_num)``
    layers. Each linear layer is optionally followed by ``nn.BatchNorm1d`` and
    is always followed by ReLU.

    Args:
        neural_num: Number of input and output features of every layer.
        layers: Number of linear layers to stack.
        do_bn: If True, apply batch normalization between each linear layer
            and its ReLU.
    """

    def __init__(self, neural_num, layers=100, do_bn=False):
        super().__init__()
        self.linears = nn.ModuleList([nn.Linear(neural_num, neural_num, bias=False) for _ in range(layers)])
        # The BatchNorm layers are always created, regardless of do_bn; they
        # are only applied in forward() when do_bn is True.
        self.bns = nn.ModuleList([nn.BatchNorm1d(neural_num) for _ in range(layers)])
        self.neural_num = neural_num
        self.do_bn = do_bn

    def forward(self, x):
        """Run ``x`` through the layers, printing the activation std after each.

        After every linear (+ optional BN) + ReLU step the standard deviation
        of the activations is printed. If that standard deviation is NaN, a
        message is printed and the remaining layers are skipped.

        Args:
            x: Input tensor whose last dimension is ``neural_num``.

        Returns:
            The activations produced by the last layer that was evaluated.
        """
        for i, (linear, bn) in enumerate(zip(self.linears, self.bns)):
            x = linear(x)
            if self.do_bn:
                x = bn(x)
            x = torch.relu(x)

            # Compute the statistic once and reuse it for the NaN check and the log line.
            layer_std = x.std()
            if torch.isnan(layer_std):
                print("output is nan in {} layers".format(i))
                break
            print("layers:{}, std:{}".format(i, layer_std.item()))

        return x

    def initialize(self, mode, std_init=1):
        """Re-initialize the weight of every ``nn.Linear`` submodule.

        Args:
            mode: ``"normal"`` draws the weights from a zero-mean normal
                distribution with standard deviation ``std_init``;
                ``"kaiming"`` applies ``nn.init.kaiming_normal_``.
            std_init: Standard deviation used by the ``"normal"`` mode; ignored
                by ``"kaiming"``.

        For any other ``mode`` the weights are left untouched and an
        "unsupported input" message is printed exactly once.
        """
        if mode not in ("normal", "kaiming"):
            # Validate up front: checking inside the module loop would repeat
            # this message once per linear layer.
            print("不支持{}输入".format(mode))
            return

        for m in self.modules():
            if not isinstance(m, nn.Linear):
                continue
            # nn.init functions already run without gradient tracking, so the
            # parameter can be passed directly instead of going through `.data`.
            if mode == "normal":
                nn.init.normal_(m.weight, std=std_init)
            else:
                nn.init.kaiming_normal_(m.weight)

In [23]:
if __name__ == "__main__":

    # Experiment dimensions: layer width, network depth, and batch size.
    neural_nums = 256
    layer_nums = 100
    batch_size = 16

    # Scenarios compared in this notebook (enable exactly one model line and,
    # optionally, one initialize() line below):
    #   without BN: 1. no explicit init; 2. normal_ init; 3. kaiming init
    #   with BN:    4. no explicit init; 5. normal_; 6. kaiming; 7. normal_ with a very large std
    net = MLP(neural_nums, layers=layer_nums, do_bn=False)
#     net = MLP(neural_nums, layer_nums, do_bn=True)
#     net.initialize("normal", std_init=1)
#     net.initialize("normal", std_init=10000)
#     net.initialize("kaiming")

    # Input batch sampled from a standard normal distribution (mean 0, std 1).
    inputs = torch.randn(batch_size, neural_nums)

    output = net(inputs)
    print(output)

layers:0, std:0.575767457485199
layers:1, std:0.5824317932128906
layers:2, std:0.5823703408241272
layers:3, std:0.5815761089324951
layers:4, std:0.5813344120979309
layers:5, std:0.5830932855606079
layers:6, std:0.5783577561378479
layers:7, std:0.585452675819397
layers:8, std:0.5750972032546997
layers:9, std:0.5790500640869141
layers:10, std:0.5800771713256836
layers:11, std:0.5729996562004089
layers:12, std:0.581875741481781
layers:13, std:0.5765160918235779
layers:14, std:0.5772631168365479
layers:15, std:0.5824704170227051
layers:16, std:0.575861394405365
layers:17, std:0.5708439946174622
layers:18, std:0.5818034410476685
layers:19, std:0.5798003077507019
layers:20, std:0.5792176127433777
layers:21, std:0.5766910314559937
layers:22, std:0.5747811198234558
layers:23, std:0.588204562664032
layers:24, std:0.581583559513092
layers:25, std:0.5788277387619019
layers:26, std:0.5830960869789124
layers:27, std:0.5800673961639404
layers:28, std:0.5785953402519226
layers:29, std:0.5843232274055

## 观察神经网络神经元数据尺度变化

<font  size=12 face="黑体">
    
有无BN层 | 无初始化 | N(0, 1) | Kaiming初始化 | N(0, 10000²)
:-: | :-: | :-: | :-: | :-:
无BN层 | 1e-40 | NaN in 35 layers | 0.4 | NaN in 8 layers
有BN层 | 0.57 | 0.57 | 0.57 | 0.57
    
</font>