# 勾配消失 (Vanishing gradients)

In [1]:
import torch
import torch.nn as nn

In [2]:
# Input, hidden and output layers all share the same width.
input_size = hidden_size = output_size = 64

In [3]:
# One random sample of width input_size; show_grad feeds it to the model.
x = torch.randn((1, input_size))

def show_grad(model, inputs=None):
    """Print the mean absolute weight gradient of each layer in ``model.fcs``.

    Runs one forward pass, back-propagates the sum of the outputs, and prints
    one ``fc{i}: value`` line per layer, numbered from 1.

    Args:
        model: An ``nn.Module`` that exposes its linear layers as ``model.fcs``.
        inputs: Tensor to feed to the model. When omitted, the module-level
            ``x`` is used.
    """
    if inputs is None:
        inputs = x
    # backward() accumulates into .grad, so clear any leftovers first;
    # otherwise a second call on the same model would report summed gradients.
    model.zero_grad()
    model(inputs).sum().backward()
    for i, fc in enumerate(model.fcs, 1):
        print(f"fc{i}: {fc.weight.grad.abs().mean()}")

In [4]:
class NeuralNetwork(nn.Module):
    """Four stacked linear layers, each followed by a sigmoid."""

    def __init__(self):
        super().__init__()
        widths = [input_size, hidden_size, hidden_size, hidden_size, output_size]
        # Consecutive widths form the (in, out) pair of fc1..fc4, in order.
        for idx, (n_in, n_out) in enumerate(zip(widths, widths[1:]), start=1):
            setattr(self, f"fc{idx}", nn.Linear(n_in, n_out))
        # Plain list of the layers, first to last, for per-layer inspection.
        self.fcs = [getattr(self, f"fc{idx}") for idx in range(1, 5)]
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Apply fc1..fc4 in sequence, squashing with sigmoid after each."""
        for layer in (self.fc1, self.fc2, self.fc3, self.fc4):
            x = self.sigmoid(layer(x))
        return x

In [5]:
# Instantiate the most recently defined NeuralNetwork class.
model = NeuralNetwork()

In [6]:
# Print the mean absolute weight gradient of each layer of the model.
show_grad(model)

fc1: 0.0004986495478078723
fc2: 0.0019251107005402446
fc3: 0.01376337744295597
fc4: 0.11762049794197083


In [7]:
class NeuralNetwork(nn.Module):
    """Four stacked linear layers, no activation, weights drawn with std 0.01."""

    def __init__(self):
        super().__init__()
        widths = [input_size, hidden_size, hidden_size, hidden_size, output_size]
        # Consecutive widths form the (in, out) pair of fc1..fc4, in order.
        for idx, (n_in, n_out) in enumerate(zip(widths, widths[1:]), start=1):
            setattr(self, f"fc{idx}", nn.Linear(n_in, n_out))
        # Plain list of the layers, first to last, for per-layer inspection.
        self.fcs = [getattr(self, f"fc{idx}") for idx in range(1, 5)]
        # Replace each layer's default weight init with a small normal draw;
        # biases keep their default initialisation.
        for layer in self.fcs:
            nn.init.normal_(layer.weight, std=0.01)

    def forward(self, x):
        """Apply fc1..fc4 in sequence with nothing in between."""
        for layer in (self.fc1, self.fc2, self.fc3, self.fc4):
            x = layer(x)
        return x

In [8]:
# Instantiate the most recently defined NeuralNetwork class and print the
# mean absolute weight gradient of each of its layers.
model = NeuralNetwork()
show_grad(model)

fc1: 0.0003132292767986655
fc2: 0.0003741825057659298
fc3: 0.0036067450419068336
fc4: 0.05968386307358742


In [9]:
class NeuralNetwork(nn.Module):
    """Four stacked linear layers, each followed by a ReLU."""

    def __init__(self):
        super().__init__()
        widths = [input_size, hidden_size, hidden_size, hidden_size, output_size]
        # Consecutive widths form the (in, out) pair of fc1..fc4, in order.
        for idx, (n_in, n_out) in enumerate(zip(widths, widths[1:]), start=1):
            setattr(self, f"fc{idx}", nn.Linear(n_in, n_out))
        # Plain list of the layers, first to last, for per-layer inspection.
        self.fcs = [getattr(self, f"fc{idx}") for idx in range(1, 5)]
        self.relu = nn.ReLU()

    def forward(self, x):
        """Apply fc1..fc4 in sequence, rectifying with ReLU after each."""
        for layer in (self.fc1, self.fc2, self.fc3, self.fc4):
            x = self.relu(layer(x))
        return x

In [10]:
# Instantiate the most recently defined NeuralNetwork class and print the
# mean absolute weight gradient of each of its layers.
model = NeuralNetwork()
show_grad(model)

fc1: 0.02707577496767044
fc2: 0.015823712572455406
fc3: 0.0194535069167614
fc4: 0.029730459675192833
