In [3]:
import torch
from torch import nn
from torch.nn import functional as F

In [4]:
class MLP(nn.Module):
    """Multilayer perceptron: 20 inputs -> 256 hidden units (ReLU) -> 10 outputs."""

    def __init__(self) -> None:
        super().__init__()
        # Hidden layer first, output layer second (creation order fixes RNG use)
        self.hidden = nn.Linear(20, 256)
        self.out = nn.Linear(256, 10)

    def forward(self, X) -> torch.Tensor:
        """Apply the hidden layer, a ReLU, and then the output layer to ``X``."""
        hidden_activations = F.relu(self.hidden(X))
        return self.out(hidden_activations)

In [5]:
# Random input batch: 2 samples with 20 features each
x = torch.rand(2, 20)

In [6]:
# Instantiate the MLP and run a forward pass on the sample batch
net = MLP()
net(x)

tensor([[ 0.0399, -0.0853,  0.0314, -0.0595,  0.1768,  0.0632,  0.0307, -0.0180,
          0.0867,  0.0344],
        [-0.0225, -0.1107,  0.1139,  0.0368,  0.2197, -0.0795,  0.0028, -0.0455,
          0.1751,  0.1345]], grad_fn=<AddmmBackward0>)

In [7]:
# A container class that mimics nn.Sequential()
class MySequential(nn.Module):
    """Chain the given modules, feeding each one's output into the next."""

    def __init__(self, *args) -> None:
        super().__init__()
        position = 0
        for layer in args:
            # _modules is the ordered dict on nn.Module that holds child modules;
            # children are keyed by their position, stored as a string
            self._modules[str(position)] = layer
            position += 1

    def forward(self, X) -> torch.Tensor:
        """Pass ``X`` through every stored module in insertion order."""
        result = X
        for layer in self._modules.values():
            result = layer(result)
        return result

In [8]:
# Build a 20 -> 256 -> 10 network with the custom container and run it on x
net = MySequential(
    nn.Linear(20, 256),
    nn.ReLU(),
    nn.Linear(256, 10)
)
net(x)

tensor([[-0.0077, -0.1971, -0.2177, -0.0100,  0.0585, -0.0221, -0.0309, -0.0037,
         -0.1867, -0.1681],
        [-0.1798, -0.2975, -0.1773,  0.0044,  0.0373,  0.0159,  0.0181, -0.0868,
         -0.1970, -0.2313]], grad_fn=<AddmmBackward0>)

## 参数管理

In [9]:
# Small 4 -> 8 -> 1 network and a random (2, 4) input batch for the parameter-access examples
net = nn.Sequential(nn.Linear(4, 8), nn.ReLU(), nn.Linear(8, 1))
X = torch.rand(size=(2, 4))
net(X)

tensor([[-0.2267],
        [-0.2033]], grad_fn=<AddmmBackward0>)

In [10]:
# Inspect the last linear layer's bias: its type, the Parameter itself, and its underlying tensor
print(type(net[2].bias))
print(net[2].bias)
print(net[2].bias.data)

<class 'torch.nn.parameter.Parameter'>
Parameter containing:
tensor([-0.2868], requires_grad=True)
tensor([-0.2868])


In [11]:
# No backward pass has run yet, so the gradient is still unset.
# Identity comparison is used because `==` would dispatch to the tensor's
# overloaded __eq__ once a gradient exists.
net[2].weight.grad is None

True

In [12]:
# Access all parameters: (name, shape) pairs for the first layer, then for the whole network
print(*[(name, param.shape) for name, param in net[0].named_parameters()])
print(*[(name, param.shape) for name, param in net.named_parameters()])

('weight', torch.Size([8, 4])) ('bias', torch.Size([8]))
('0.weight', torch.Size([8, 4])) ('0.bias', torch.Size([8])) ('2.weight', torch.Size([1, 8])) ('2.bias', torch.Size([1]))


In [13]:
# Look up the last layer's bias by its name in the state dict
net.state_dict()['2.bias'].data

tensor([-0.2868])

In [14]:
# Multiple nested blocks
def block1():
    """Build a 4 -> 8 -> 4 block with a ReLU after each linear layer."""
    layers = [
        nn.Linear(4, 8),
        nn.ReLU(),
        nn.Linear(8, 4),
        nn.ReLU(),
    ]
    return nn.Sequential(*layers)

def block2():
    """Stack four separately constructed ``block1`` sub-networks in one ``nn.Sequential``."""
    stacked = nn.Sequential()
    i = 0
    while i < 4:
        # Nest a freshly built block here, registered under an explicit name
        stacked.add_module(f'block {i}', block1())
        i += 1
    return stacked

# Follow the nested blocks with a final 4 -> 1 linear layer and run a forward pass
rgnet = nn.Sequential(block2(), nn.Linear(4, 1))
rgnet(X)

tensor([[-0.2879],
        [-0.2879]], grad_fn=<AddmmBackward0>)

In [15]:
# Print the nested module hierarchy
print(rgnet)

Sequential(
  (0): Sequential(
    (block 0): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block 1): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block 2): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
    (block 3): Sequential(
      (0): Linear(in_features=4, out_features=8, bias=True)
      (1): ReLU()
      (2): Linear(in_features=8, out_features=4, bias=True)
      (3): ReLU()
    )
  )
  (1): Linear(in_features=4, out_features=1, bias=True)
)


In [16]:
# Nested modules can be reached by chained indexing:
# rgnet[0] is the block container, [1] its second sub-block, [0] that sub-block's first linear layer
rgnet[0][1][0].bias.data

tensor([ 0.0540, -0.2870, -0.0034,  0.3494, -0.0116,  0.3462,  0.0441,  0.4804])

In [17]:
def init_normal(m):
    """Initialize an ``nn.Linear`` module in place; leave every other module untouched.

    Weights are drawn from a normal distribution with mean 0 and std 0.01, and
    the bias (when the layer has one) is set to zero. Intended to be passed to
    ``Module.apply``.

    Args:
        m: the module to (possibly) initialize.
    """
    # Exact type match, so subclasses of nn.Linear are deliberately skipped
    if type(m) is nn.Linear:
        nn.init.normal_(m.weight, mean=0, std=0.01)
        # Layers built with bias=False have m.bias set to None
        if m.bias is not None:
            nn.init.zeros_(m.bias)
# Run the initializer over the network's modules, then show the first layer's first weight row and first bias entry
net.apply(init_normal)
net[0].weight.data[0], net[0].bias.data[0]

(tensor([ 0.0065, -0.0069,  0.0120, -0.0034]), tensor(0.))

In [19]:
# Custom parameter initialization
def my_init(m):
    """Custom initializer for ``nn.Linear`` modules; other modules are ignored.

    Prints the name and shape of the module's first parameter, draws the weights
    uniformly from [-10, 10], and then zeroes every weight whose absolute value
    is below 5.
    """
    if type(m) != nn.Linear:
        return
    first_name, first_param = next(m.named_parameters())
    print("Init", first_name, first_param.shape)
    nn.init.uniform_(m.weight, -10, 10)
    # Boolean mask: True where the weight is kept, False where it is zeroed
    keep_mask = m.weight.data.abs() >= 5
    m.weight.data *= keep_mask
        
# Run the custom initializer over the network and show the first two weight rows of the first layer
net.apply(my_init)
net[0].weight[:2]

Init weight torch.Size([8, 4])
Init weight torch.Size([1, 8])


tensor([[ 0.0000,  7.4294, -5.9418, -8.9348],
        [-8.1131, -0.0000, -0.0000,  8.0154]], grad_fn=<SliceBackward0>)

In [21]:
# Parameter tying
# Create a single layer instance to be shared
# Below, the third and fifth modules (indices 2 and 4) are the same object, so their parameters are tied
shared = nn.Linear(8, 8)
net = nn.Sequential(nn.Linear(4, 8), nn.ReLU(),
                shared, nn.ReLU(),
                shared, nn.ReLU(),
                nn.Linear(8, 1))
net(X)

# Check that the parameters are equal
print(net[2].weight.data[0] == net[4].weight.data[0])
net[2].weight.data[0, 0] = 100

# Confirm they are actually the same object rather than just holding equal values:
# the write made through net[2] must also be visible through net[4]
print(net[2].weight.data[0] == net[4].weight.data[0])

tensor([True, True, True, True, True, True, True, True])
tensor([True, True, True, True, True, True, True, True])


In [22]:
# A layer without parameters
class CenteredLayer(nn.Module):
    """Parameter-free layer that subtracts the mean of all input elements from the input."""

    def __init__(self):
        super().__init__()

    def forward(self, X):
        """Return ``X`` shifted by its overall mean."""
        overall_mean = X.mean()
        centered = X - overall_mean
        return centered

In [23]:
# Center the vector [1, 2, 3, 4, 5]
layer = CenteredLayer()
layer(torch.FloatTensor([1, 2, 3, 4, 5]))

tensor([-2., -1.,  0.,  1.,  2.])

In [27]:
# Use the custom layer as a component of a larger network
net = nn.Sequential(nn.Linear(8, 128), CenteredLayer())
Y = net(torch.rand(4, 8))
Y.mean() # the mean should be 0, but floating-point rounding leaves a very small non-zero value

tensor(-1.8626e-09, grad_fn=<MeanBackward0>)

In [28]:
# A layer with parameters
class MyLinear(nn.Module):
    """Fully connected layer followed by a ReLU, with its own weight and bias.

    Args:
        in_units: number of input features (rows of ``weight``).
        units: number of output features (columns of ``weight``, length of ``bias``).
    """

    def __init__(self, in_units, units):
        super().__init__()
        # Both tensors are registered as Parameters and start from standard-normal samples
        self.weight = nn.Parameter(torch.randn(in_units, units))
        self.bias = nn.Parameter(torch.randn(units,))

    def forward(self, X):
        """Return ``relu(X @ weight + bias)``."""
        # Use the Parameters themselves rather than their .data: .data yields
        # tensors detached from the autograd graph, so no gradient would ever
        # reach weight or bias and the layer could not be trained.
        linear = torch.matmul(X, self.weight) + self.bias
        return F.relu(linear)

In [29]:
# Instantiate the custom layer with 5 inputs and 3 outputs, then inspect its weight parameter
linear = MyLinear(5, 3)
linear.weight

Parameter containing:
tensor([[ 0.5436,  0.5482,  0.2956],
        [-0.6875,  0.7431,  0.2294],
        [ 0.7695,  0.0509, -0.7872],
        [ 0.0419, -0.9702, -3.0568],
        [-0.8384, -0.8683,  0.2146]], requires_grad=True)

In [32]:
# Forward pass on a random (2, 5) input batch
linear(torch.rand(2, 5))

tensor([[2.2094, 0.0000, 0.0000],
        [2.2611, 0.0000, 0.0000]])