In [1]:
import torch
from torch import nn


class MLP(nn.Module):
    # 声明带有模型参数的层，这里声明了两个全连接层
    def __init__(self, **kwargs):
        # 调用MLP父类Module的构造函数来进行初始化
        # 这样在构造实例函数时还可以指定其他函数参数
        super(MLP, self).__init__(**kwargs)
        self.hidden = nn.Linear(784, 256)
        self.act = nn.ReLU()
        self.output = nn.Linear(256, 10)

    # 定义模型的前向计算，即如何根据输入x计算返回所需要的模型输出
    def forward(self, x):
        a = self.act(self.hidden(x))
        return self.output(a)

In [3]:
X = torch.randn(2, 784)
net = MLP()
print(net)
net(X)

MLP(
  (hidden): Linear(in_features=784, out_features=256, bias=True)
  (act): ReLU()
  (output): Linear(in_features=256, out_features=10, bias=True)
)


tensor([[ 0.0611,  0.1468, -0.0908,  0.1905,  0.0187, -0.4276,  0.1566,  0.0192,
          0.2118,  0.1051],
        [-0.2361, -0.2599,  0.2703, -0.6321, -0.1396, -0.1107, -0.3178, -0.0979,
          0.3192, -0.3336]], grad_fn=<AddmmBackward0>)

In [5]:
from collections import OrderedDict


class MySequential(nn.Module):

    def __init__(self, *args):
        super(MySequential, self).__init__()
        # 如果传入的是一个OrderedDict
        if len(args) == 1 and isinstance(args[0], OrderedDict):
            for key, module in args[0].items():
                # add_module 方法会将module添加
                # 进self.modules(一个OrderedDict)
                self.add_module(key, module)
        # 传入的是一些Module
        else:
            for idx, module in enumerate(args):
                self.add_module(str(idx), module)

    def forward(self, input):
        # self._module返回一个OrderedDict，保证会按照成员添加时的顺序遍历成员
        for module in self._modules.values():
            input = module(input)
        return input


In [6]:
net = MySequential(nn.Linear(784, 256), nn.ReLU(), nn.Linear(256, 10))
print(net)
net(X)

MySequential(
  (0): Linear(in_features=784, out_features=256, bias=True)
  (1): ReLU()
  (2): Linear(in_features=256, out_features=10, bias=True)
)


tensor([[ 0.0344,  0.0582,  0.0482,  0.0724,  0.0931,  0.0834, -0.1007,  0.2421,
          0.2931, -0.3046],
        [ 0.0649,  0.0370,  0.2754,  0.0789,  0.4120,  0.1577,  0.0132,  0.1492,
         -0.3027, -0.0986]], grad_fn=<AddmmBackward0>)

In [7]:
class FancyMLP(nn.Module):
    def __init__(self, **kwargs):
        super(FancyMLP, self).__init__(**kwargs)
        # 不可训练参数（常数参数）
        self.rand_weight = torch.rand((20, 20), requires_grad=False)
        self.linear = nn.Linear(20, 20)

    def forward(self, x):
        x = self.linear(x)
        # 使用创建的常数参数，以及nn.functional中的relu函数和mm函数
        x = nn.functional.relu(torch.mm(x, self.rand_weight) + 1)
        # 复用全连接层，等价于两个全连接层共享参数
        x = self.linear(x)
        # 控制流，这里我们需要调用item函数来返回标量进行比较
        while x.norm().item() > 1:
            x /= 2
        if x.norm().item() < 0.8:
            x *= 10
        return x.sum()

In [13]:
X = torch.rand(2, 20)
net = FancyMLP()
print(net)
net(X)

FancyMLP(
  (linear): Linear(in_features=20, out_features=20, bias=True)
)


tensor(0.6931, grad_fn=<SumBackward0>)

In [14]:
class NestMLP(nn.Module):
    def __init__(self, **kwargs):
        super(NestMLP, self).__init__(**kwargs)
        self.net = nn.Sequential(nn.Linear(40, 30), nn.ReLU())

    def forward(self, x):
        return self.net(x)


net = nn.Sequential(NestMLP(), nn.Linear(30, 20), FancyMLP())
X = torch.rand(2, 40)
print(net)
net(X)

Sequential(
  (0): NestMLP(
    (net): Sequential(
      (0): Linear(in_features=40, out_features=30, bias=True)
      (1): ReLU()
    )
  )
  (1): Linear(in_features=30, out_features=20, bias=True)
  (2): FancyMLP(
    (linear): Linear(in_features=20, out_features=20, bias=True)
  )
)


tensor(0.1661, grad_fn=<SumBackward0>)

In [15]:
net = nn.Sequential(nn.Linear(20, 256), nn.ReLU(), nn.Linear(256, 10))
# PyTorch默认已经初始化
print(net)
X = torch.rand(2, 20)
# 前向计算
Y = net(X)

Sequential(
  (0): Linear(in_features=20, out_features=256, bias=True)
  (1): ReLU()
  (2): Linear(in_features=256, out_features=10, bias=True)
)


In [29]:
for name, param in net[0].named_parameters():
    print(name, param.size(), type(param))

weight torch.Size([256, 20]) <class 'torch.nn.parameter.Parameter'>
bias torch.Size([256]) <class 'torch.nn.parameter.Parameter'>


In [20]:
weight_0 = list(net[0].parameters())[0]
type(weight_0.data[0][0]), weight_0.data.size()

(torch.Tensor, torch.Size([256, 20]))

In [21]:
param = dict(net[0].named_parameters())
type(param['weight']), param['weight'].size()

(torch.nn.parameter.Parameter, torch.Size([256, 20]))

In [22]:
weight_0.data

tensor([[ 0.0611, -0.0914, -0.1549,  ..., -0.1124, -0.2068,  0.0363],
        [-0.1203, -0.2004, -0.2225,  ..., -0.0670,  0.0646, -0.1968],
        [-0.1329,  0.1754,  0.2053,  ..., -0.1082, -0.2073, -0.1707],
        ...,
        [-0.1243,  0.0929, -0.1023,  ..., -0.0711, -0.1922, -0.1919],
        [-0.0549, -0.1159, -0.1828,  ...,  0.1021,  0.1844,  0.1189],
        [ 0.1791,  0.1569, -0.0976,  ..., -0.0738, -0.1164,  0.1089]])

In [24]:
print(weight_0.grad)

None


In [30]:
bias_1 = list(net[2].parameters())[1]
bias_1.data

tensor([-0.0337,  0.0343, -0.0257, -0.0213,  0.0356, -0.0034,  0.0427,  0.0229,
         0.0011, -0.0318])

In [31]:
print(net)

Sequential(
  (0): Linear(in_features=20, out_features=256, bias=True)
  (1): ReLU()
  (2): Linear(in_features=256, out_features=10, bias=True)
)


In [32]:
from torch.nn import init

for name, param in net.named_parameters():
    if 'weight' in name:
        init.normal_(param, mean=0, std=0.01)
        print(name, param.data)

0.weight tensor([[ 0.0039,  0.0105,  0.0061,  ...,  0.0088,  0.0106,  0.0171],
        [ 0.0233,  0.0053,  0.0232,  ..., -0.0074,  0.0054,  0.0060],
        [-0.0140,  0.0014,  0.0180,  ...,  0.0044,  0.0041,  0.0028],
        ...,
        [ 0.0002,  0.0093,  0.0056,  ...,  0.0187,  0.0051,  0.0024],
        [ 0.0021,  0.0161,  0.0175,  ..., -0.0157,  0.0053,  0.0109],
        [ 0.0170,  0.0060,  0.0047,  ..., -0.0010,  0.0004,  0.0074]])
2.weight tensor([[-3.8328e-03, -1.9209e-03, -2.3917e-05,  ...,  7.1773e-03,
          8.6600e-03, -9.1975e-03],
        [ 1.0817e-03, -1.2275e-02,  1.3321e-02,  ...,  1.6718e-02,
         -7.4219e-03, -1.6690e-02],
        [ 2.1717e-03, -5.5297e-05,  1.6444e-02,  ..., -3.5544e-03,
         -7.7311e-03,  6.8603e-03],
        ...,
        [ 3.9138e-03,  5.9141e-03,  2.0731e-02,  ...,  2.7182e-02,
         -1.7534e-03,  4.5096e-03],
        [ 1.6116e-03, -5.4110e-04,  3.3411e-03,  ...,  6.3330e-03,
         -9.1691e-03, -5.8241e-03],
        [ 2.0270e-02

In [33]:
for name, param in net.named_parameters():
    if 'bias' in name:
        init.constant_(param, val=0)
        print(name, param.data)

0.bias tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0

In [34]:
def xavier(m):
    if type(m) == nn.Linear:
        torch.nn.init.xavier_uniform_(m.weight)


def init_42(m):
    if type(m) == nn.Linear:
        torch.nn.init.constant_(m.weight, 42)


net[0].apply(xavier)
net[2].apply(init_42)
print(net[0].weight.data[0])
print(net[2].weight.data)

tensor([ 0.0051,  0.0073, -0.1241, -0.0274,  0.1227,  0.0636,  0.1328, -0.1228,
        -0.1235, -0.0788,  0.0281, -0.0256, -0.1021, -0.0204, -0.0384, -0.0510,
         0.1079, -0.0707, -0.0189, -0.0210])
tensor([[42., 42., 42.,  ..., 42., 42., 42.],
        [42., 42., 42.,  ..., 42., 42., 42.],
        [42., 42., 42.,  ..., 42., 42., 42.],
        ...,
        [42., 42., 42.,  ..., 42., 42., 42.],
        [42., 42., 42.,  ..., 42., 42., 42.],
        [42., 42., 42.,  ..., 42., 42., 42.]])


In [35]:
def normal_(tensor, mean=0, std=1):
    with torch.no_grad():
        return tensor.normal_(mean, std)

In [36]:
def init_weight(tensor):
    with torch.no_grad():
        tensor.uniform_(-10, 10)
        tensor *= (tensor.abs() >= 5).float()


for name, param in net.named_parameters():
    if 'weight' in name:
        init_weight(param)
        print(name, param.data)

0.weight tensor([[-0.0000,  5.1854,  9.7246,  ..., -8.5708,  0.0000, -8.2214],
        [-0.0000, -7.7485, -7.1613,  ...,  7.3610,  0.0000,  6.6981],
        [ 6.4790, -9.3837, -0.0000,  ..., -7.0435,  8.5091, -6.9319],
        ...,
        [ 8.8448,  8.5809,  6.0294,  ..., -0.0000,  0.0000,  5.0400],
        [ 5.9072,  0.0000,  0.0000,  ..., -0.0000,  7.3593, -7.7892],
        [-0.0000, -5.4473, -0.0000,  ..., -9.6593,  0.0000,  0.0000]])
2.weight tensor([[-5.7760, -0.0000,  6.2989,  ...,  0.0000, -0.0000, -0.0000],
        [-8.2869, -7.0773, -5.8406,  ..., -7.3828,  0.0000, -7.2289],
        [ 0.0000,  7.5993,  5.4324,  ...,  5.6188, -0.0000,  9.1456],
        ...,
        [-6.3275,  0.0000,  0.0000,  ..., -0.0000, -9.3381,  7.4939],
        [-0.0000,  6.8807, -8.2158,  ..., -0.0000, -0.0000,  0.0000],
        [ 0.0000, -8.8062, -6.0158,  ...,  0.0000,  6.8106,  0.0000]])


In [37]:
for name, param in net.named_parameters():
    if 'bias' in name:
        param.data += 1
        print(name, param.data)

0.bias tensor([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
        1., 1., 1., 1., 1., 1., 1

In [38]:
linear = nn.Linear(1, 1, bias=False)
net = nn.Sequential(linear, linear)
print(net)
for name, param in net.named_parameters():
    init.constant_(param, val=3)
    print(name, param.data)

Sequential(
  (0): Linear(in_features=1, out_features=1, bias=False)
  (1): Linear(in_features=1, out_features=1, bias=False)
)
0.weight tensor([[3.]])


In [39]:
print(id(net[0]) == id(net[1]))
print(id(net[0].weight) == id(net[1].weight))

True
True


In [40]:
x = torch.ones(1, 1)
y = net(x).sum()
print(y)
y.backward()
# 单次梯度是3，两次所以就是6
print(net[0].weight.grad)

tensor(9., grad_fn=<SumBackward0>)
tensor([[6.]])


In [41]:
class CenteredLayer(nn.Module):
    def __init__(self, **kwargs):
        super(CenteredLayer, self).__init__(**kwargs)

    def forward(self, x):
        return x - x.mean()

In [42]:
layer = CenteredLayer()
layer(torch.tensor([1, 2, 3, 4, 5], dtype=torch.float))

tensor([-2., -1.,  0.,  1.,  2.])

In [43]:
net = nn.Sequential(nn.Linear(8, 128), CenteredLayer())

In [46]:
y = net(torch.rand(4, 8))
y.mean().item()

-3.259629011154175e-09

In [47]:
class MyDense(nn.Module):
    def __init__(self):
        super(MyDense, self).__init__()
        self.params = nn.ParameterList([nn.Parameter(torch.randn(4, 4)) for i in range(3)])
        self.params.append(nn.Parameter(torch.randn(4, 1)))

    def forward(self, x):
        for i in range(len(self.params)):
            x = torch.mm(x, self.params[i])
        return x


net = MyDense()
print(net)


MyDense(
  (params): ParameterList(
      (0): Parameter containing: [torch.float32 of size 4x4]
      (1): Parameter containing: [torch.float32 of size 4x4]
      (2): Parameter containing: [torch.float32 of size 4x4]
      (3): Parameter containing: [torch.float32 of size 4x1]
  )
)


In [48]:
class MyDictDense(nn.Module):
    def __init__(self):
        super(MyDictDense, self).__init__()
        self.params = nn.ParameterDict({
            'linear1': nn.Parameter(torch.randn(4, 4)),
            'linear2': nn.Parameter(torch.randn(4, 1))
        })
        # 新增
        self.params.update({
            'linear3': nn.Parameter(torch.randn(4, 2))
        })

    def forward(self, x, choice='linear1'):
        return torch.mm(x, self.params[choice])


net = MyDictDense()
print(net)


MyDictDense(
  (params): ParameterDict(
      (linear1): Parameter containing: [torch.FloatTensor of size 4x4]
      (linear2): Parameter containing: [torch.FloatTensor of size 4x1]
      (linear3): Parameter containing: [torch.FloatTensor of size 4x2]
  )
)


In [49]:
x = torch.ones(1, 4)
print(net(x, choice='linear1'))
print(net(x, choice='linear2'))
print(net(x, choice='linear3'))

tensor([[ 1.7706, -0.2898,  1.3765,  0.5986]], grad_fn=<MmBackward0>)
tensor([[-3.5328]], grad_fn=<MmBackward0>)
tensor([[ 0.3834, -2.2809]], grad_fn=<MmBackward0>)


In [51]:
net = nn.Sequential(MyDictDense(), MyDense())
print(net)
print(net(x))

Sequential(
  (0): MyDictDense(
    (params): ParameterDict(
        (linear1): Parameter containing: [torch.FloatTensor of size 4x4]
        (linear2): Parameter containing: [torch.FloatTensor of size 4x1]
        (linear3): Parameter containing: [torch.FloatTensor of size 4x2]
    )
  )
  (1): MyDense(
    (params): ParameterList(
        (0): Parameter containing: [torch.float32 of size 4x4]
        (1): Parameter containing: [torch.float32 of size 4x4]
        (2): Parameter containing: [torch.float32 of size 4x4]
        (3): Parameter containing: [torch.float32 of size 4x1]
    )
  )
)
tensor([[-1.3459]], grad_fn=<MmBackward0>)


In [52]:
x = torch.ones(3)
torch.save(x, 'x.pt')

In [53]:
x2 = torch.load('x.pt')
x2

tensor([1., 1., 1.])

In [54]:
y = torch.zeros(4)
torch.save([x, y], 'xy.pt')
xy_list = torch.load('xy.pt')
xy_list

[tensor([1., 1., 1.]), tensor([0., 0., 0., 0.])]

In [55]:
torch.save({'x': x, 'y': y}, 'xy_dict.pt')
xy = torch.load('xy_dict.pt')
xy

{'x': tensor([1., 1., 1.]), 'y': tensor([0., 0., 0., 0.])}

In [56]:
class MLP(nn.Module):
    def __init__(self):
        super(MLP, self).__init__()
        self.hidden = nn.Linear(3, 2)
        self.act = nn.ReLU()
        self.output = nn.Linear(2, 1)

    def forward(self, x):
        a = self.act(self.hidden(x))
        return self.output(a)


net = MLP()

In [57]:
torch.save(net, 'model.bin')

In [58]:
net2 = torch.load('model.bin')

In [59]:
Y2 = net2(x)
y = net(x)
Y2 == y

tensor([True])

In [63]:
torch.device('cpu'), torch.cuda.device('cuda')
torch.cuda.device('cuda:1')

<class 'torch.cuda.device'>


In [64]:
x.device

device(type='cpu')

In [65]:
torch.cuda.device_count()

1

In [67]:
a = torch.tensor([1, 2, 3], device=torch.device('cuda'))
a

tensor([1, 2, 3], device='cuda:0')

In [68]:
z = x.cuda(0)
print(x)
print(z)

tensor([1., 1., 1.])
tensor([1., 1., 1.], device='cuda:0')


In [69]:
torch.exp((z + 2).float()) * x.float().cuda()

tensor([20.0855, 20.0855, 20.0855], device='cuda:0')

In [72]:
net = nn.Linear(3, 1)
net.cuda()
net.weight.data.device

device(type='cuda', index=0)

In [73]:
net(z.view(1, 3).float().cuda())

tensor([[0.4232]], device='cuda:0', grad_fn=<AddmmBackward0>)

In [74]:
net.weight.data

tensor([[ 0.4400,  0.1284, -0.3436]], device='cuda:0')