In [2]:
import torch
import numpy as np

# 单隐藏层神经网络
全连接ReLU，1*隐藏层，无bias。x预测y，用L2 Loss
- $H=XW_1$
- $A=\mathrm{ReLU}(H)=\max(0,H)$
- $\hat{y}=AW_2$

## numpy实现
全程手动

In [4]:
# Problem sizes: number of samples, input width, hidden width, output width
N, D_in, H, D_out = 64, 1000, 100, 10

# Random training inputs and targets
x = np.random.randn(N, D_in)
y = np.random.randn(N, D_out)

# Randomly initialised weights of the two bias-free linear layers
w1 = np.random.randn(D_in, H)
w2 = np.random.randn(H, D_out)

learning_rate = 1e-6

for step in range(500):
    # Forward pass: linear -> ReLU -> linear
    h = np.dot(x, w1)
    h_relu = np.maximum(h, 0)
    y_pred = np.dot(h_relu, w2)

    # Sum-of-squares (L2) loss over the whole batch
    residual = y_pred - y
    loss = np.sum(np.square(residual))
    print(step, loss)

    # Backward pass: apply the chain rule by hand, from the loss back to w1
    d_y_pred = 2.0 * residual
    d_w2 = np.dot(h_relu.T, d_y_pred)
    d_h = np.dot(d_y_pred, w2.T)
    d_h[h < 0] = 0  # ReLU passes no gradient where its input was negative
    d_w1 = np.dot(x.T, d_h)

    # Plain gradient-descent step on both weight matrices
    w1 -= learning_rate * d_w1
    w2 -= learning_rate * d_w2

0 30825387.714877978
1 26424172.39503157
2 25931791.17448231
3 25141558.065359056
4 21949806.089865733
5 16451423.916932076
6 10734886.314363547
7 6379827.862905791
8 3723791.8412019247
9 2263690.501938464
10 1489416.019937373
11 1065569.8045618618
12 817632.1548879102
13 659182.732624602
14 548804.064263481
15 466435.7835879561
16 401835.5505659876
17 349380.1800640045
18 305810.04649814736
19 269088.3264597164
20 237832.31781575707
21 211033.65360617122
22 187898.84717623558
23 167811.55971874273
24 150302.04319304804
25 135017.24626753433
26 121593.31139500231
27 109752.55041803865
28 99278.99515361668
29 89993.59371460589
30 81753.10248850787
31 74402.36224686372
32 67831.1194612197
33 61944.71655773239
34 56662.70320269985
35 51912.86110214854
36 47637.95946849718
37 43774.624575904076
38 40275.42877342538
39 37101.12441426163
40 34217.24621664119
41 31592.58237826606
42 29207.05152951455
43 27026.65363741228
44 25031.868900675265
45 23203.572783756066
46 21525.744924684495
47 199

In [None]:
# Forward pass with the trained weights; the cell displays the residuals
h = np.dot(x, w1)
h_relu = np.maximum(h, 0)
y_pred = np.dot(h_relu, w2)
y_pred - y

## pytorch实现
### 1. tensor
类似numpy， 可在GPU加速运算

In [11]:
# Problem sizes: number of samples, input width, hidden width, output width
N, D_in, H, D_out = 64, 1000, 100, 10

# Random training inputs and targets
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

# Randomly initialised weights; the gradients are derived by hand below, so
# requires_grad is not set on them
w1 = torch.randn(D_in, H)
w2 = torch.randn(H, D_out)

learning_rate = 1e-6

for it in range(500):
    # Forward pass: linear -> ReLU -> linear
    h = x.mm(w1)
    h_relu = h.clamp(min=0)
    # Matrix multiplication on tensors is `.mm`; there is no `.m` method
    y_pred = h_relu.mm(w2)

    # Sum-of-squares (L2) loss, converted to a Python float for printing
    loss = (y_pred - y).pow(2).sum().item()
    print(it, loss)

    # Backward pass: apply the chain rule by hand, from the loss back to w1
    grad_y_pred = 2.0 * (y_pred - y)
    grad_w2 = h_relu.t().mm(grad_y_pred)
    grad_h_relu = grad_y_pred.mm(w2.t())
    grad_h = grad_h_relu.clone()
    grad_h[h < 0] = 0  # ReLU passes no gradient where its input was negative
    grad_w1 = x.t().mm(grad_h)

    # Plain gradient-descent step on both weight matrices
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

0 38680648.0
1 37410248.0
2 38808292.0
3 35763896.0
4 26317700.0
5 15317278.0
6 7711322.5
7 3907832.0
8 2249604.25
9 1509048.375
10 1133082.25
11 907271.0
12 751118.75
13 633096.6875
14 539497.6875
15 463305.0
16 400235.40625
17 347500.3125
18 303122.1875
19 265530.0
20 233515.8125
21 206085.8125
22 182474.34375
23 162033.8125
24 144292.9375
25 128812.6796875
26 115281.7109375
27 103404.4375
28 92953.1875
29 83724.640625
30 75560.28125
31 68315.5
32 61873.3359375
33 56134.7421875
34 51015.83203125
35 46432.828125
36 42320.359375
37 38622.43359375
38 35293.7578125
39 32291.03515625
40 29579.720703125
41 27126.14453125
42 24901.77734375
43 22882.10546875
44 21047.58203125
45 19377.57421875
46 17854.66015625
47 16466.177734375
48 15197.48046875
49 14037.7119140625
50 12977.220703125
51 12005.77734375
52 11114.3681640625
53 10296.0693359375
54 9544.44140625
55 8853.1484375
56 8215.8896484375
57 7629.33251953125
58 7088.6865234375
59 6590.0673828125
60 6129.91796875
61 5704.8115234375
62 53

autograd

In [10]:
# Three scalar leaf tensors whose gradients autograd should track
x, w, b = (torch.tensor(value, requires_grad=True) for value in (1., 2., 3.))

y = torch.add(w * x, b)  # y = 2*1 + 3

# Back-propagate from y to fill in .grad on every leaf
y.backward()

# dy/dw = x = 1, dy/dx = w = 2, dy/db = 1
print(*(leaf.grad for leaf in (w, x, b)))


tensor(1.) tensor(2.) tensor(1.)


### 2. torch + autograd
自动求导所有参数梯度

In [13]:
# Problem sizes: number of samples, input width, hidden width, output width
N, D_in, H, D_out = 64, 1000, 100, 10

# Random training inputs and targets
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

# Weights are flagged so that autograd computes gradients for them
w1 = torch.randn(D_in, H, requires_grad=True)
w2 = torch.randn(H, D_out, requires_grad=True)

learning_rate = 1e-6

for step in range(500):
    # Forward pass: linear -> ReLU -> linear
    hidden = torch.mm(x, w1)
    y_pred = torch.mm(torch.clamp(hidden, min=0), w2)

    # Sum-of-squares loss; evaluating it builds the computation graph
    loss = torch.sum((y_pred - y).pow(2))
    print(step, loss.item())

    # Backward pass: populates w1.grad and w2.grad
    loss.backward()

    # Update under no_grad so the step itself is not added to the graph
    with torch.no_grad():
        for weight in (w1, w2):
            weight -= learning_rate * weight.grad
            # Reset to zero, otherwise the next backward() keeps accumulating
            weight.grad.zero_()

0 32329088.0
1 28430886.0
2 25814546.0
3 21692844.0
4 16014325.0
5 10525513.0
6 6430998.5
7 3913725.25
8 2488612.25
9 1702237.0
10 1251896.875
11 976648.5
12 794140.125
13 663772.8125
14 564831.0
15 486478.65625
16 422603.625
17 369450.1875
18 324602.0625
19 286418.90625
20 253704.03125
21 225448.28125
22 200947.546875
23 179607.53125
24 160919.28125
25 144592.1875
26 130190.046875
27 117451.4453125
28 106149.296875
29 96131.6171875
30 87208.25
31 79230.734375
32 72091.84375
33 65684.6171875
34 59922.2265625
35 54727.90625
36 50039.3515625
37 45798.5390625
38 41959.93359375
39 38480.50390625
40 35322.41796875
41 32453.037109375
42 29846.5546875
43 27472.35546875
44 25306.689453125
45 23327.138671875
46 21517.76953125
47 19861.38671875
48 18344.009765625
49 16953.4921875
50 15677.9326171875
51 14506.2646484375
52 13429.140625
53 12438.294921875
54 11526.7919921875
55 10686.517578125
56 9912.150390625
57 9197.7373046875
58 8538.646484375
59 7930.1162109375
60 7367.61962890625
61 6847.580

### 3. nn | neural net

In [16]:
import torch.nn as nn

# Problem sizes: number of samples, input width, hidden width, output width
N, D_in, H, D_out = 64, 1000, 100, 10

# Random training inputs and targets
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

# Linear -> ReLU -> Linear, assembled from ready-made layers
model = nn.Sequential(
    nn.Linear(D_in, H),
    nn.ReLU(),  # activation function
    nn.Linear(H, D_out),
)

# The fit was poor, so try different initial weights: redraw both weight
# matrices with nn.init.normal_
for linear in (model[0], model[2]):
    nn.init.normal_(linear.weight)

# To run on a GPU instead: model = model.cuda()

loss_fn = nn.MSELoss(reduction='sum')
learning_rate = 1e-6

for step in range(500):
    # Forward pass (calling the model runs its forward())
    y_pred = model(x)

    # Summed squared-error loss; evaluating it builds the computation graph
    loss = loss_fn(y_pred, y)
    print(step, loss.item())

    # Backward pass: populates .grad on every model parameter
    loss.backward()

    # Manual gradient-descent step, kept out of the computation graph
    with torch.no_grad():
        for param in model.parameters():
            param.sub_(learning_rate * param.grad)

        # Clear the gradients so the next backward() starts fresh
        model.zero_grad()

0 27049660.0
1 20304284.0
2 15793264.0
3 12005248.0
4 8828522.0
5 6306160.0
6 4467661.0
7 3185989.5
8 2320843.0
9 1737293.25
10 1339322.125
11 1060529.75
12 859409.5
13 710004.3125
14 595719.125
15 505926.5625
16 433675.09375
17 374515.65625
18 325361.75
19 284114.34375
20 249176.4375
21 219354.90625
22 193736.1875
23 171668.9375
24 152538.65625
25 135883.46875
26 121323.4375
27 108564.859375
28 97346.703125
29 87455.7421875
30 78698.65625
31 70940.140625
32 64061.578125
33 57936.796875
34 52478.8359375
35 47600.7109375
36 43231.421875
37 39313.0703125
38 35787.07421875
39 32615.515625
40 29759.14453125
41 27180.224609375
42 24850.9140625
43 22744.19140625
44 20836.673828125
45 19107.009765625
46 17537.1015625
47 16109.0546875
48 14809.3154296875
49 13625.1982421875
50 12545.26953125
51 11559.7607421875
52 10660.068359375
53 9837.37109375
54 9083.7509765625
55 8393.39453125
56 7760.5859375
57 7179.54638671875
58 6646.306640625
59 6156.02978515625
60 5705.11083984375
61 5289.908203125
6

### 4. optim
不用手动更新weight。含不同模型优化方法，如SGD+momentum, RMSProp, Adam...

In [20]:
# Problem sizes: number of samples, input width, hidden width, output width
N, D_in, H, D_out = 64, 1000, 100, 10

# Random training inputs and targets
x = torch.randn(N, D_in)
y = torch.randn(N, D_out)

# Linear -> ReLU -> Linear, assembled from ready-made layers
model = nn.Sequential(
    nn.Linear(D_in, H),
    nn.ReLU(),
    nn.Linear(H, D_out),
)

# The weight re-initialisation tried when the fit was poor is left disabled:
#   nn.init.normal_(model[0].weight)
#   nn.init.normal_(model[2].weight)
# To run on a GPU instead: model = model.cuda()

loss_fn = nn.MSELoss(reduction='sum')
learning_rate = 1e-4  # 1e-3 to 1e-4 works well for Adam
# The optimizer owns the parameter update; the plain-SGD alternative is
#   optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)

for step in range(500):
    # Forward pass (calling the model runs its forward())
    y_pred = model(x)

    # Summed squared-error loss; evaluating it builds the computation graph
    loss = loss_fn(y_pred, y)
    print(step, loss.item())

    # Backward pass: populates .grad on every model parameter
    loss.backward()

    # Let the optimizer apply the update, then clear the gradients
    optimizer.step()
    optimizer.zero_grad()

0 720.2042236328125
1 669.2815551757812
2 624.4517822265625
3 584.4412231445312
4 548.5148315429688
5 516.002197265625
6 486.4120178222656
7 459.094482421875
8 433.7242431640625
9 410.1972961425781
10 388.00762939453125
11 367.08099365234375
12 347.23956298828125
13 328.35699462890625
14 310.48626708984375
15 293.5306396484375
16 277.3670349121094
17 261.919189453125
18 247.19970703125
19 233.1800537109375
20 219.79916381835938
21 207.0872802734375
22 194.89376831054688
23 183.31271362304688
24 172.33128356933594
25 161.923828125
26 152.0611114501953
27 142.73854064941406
28 133.94195556640625
29 125.63080596923828
30 117.77931213378906
31 110.38900756835938
32 103.43904113769531
33 96.91029357910156
34 90.78981018066406
35 85.03186798095703
36 79.6490478515625
37 74.60723876953125
38 69.8998794555664
39 65.50548553466797
40 61.40217590332031
41 57.57604217529297
42 53.99755859375
43 50.65953063964844
44 47.53276824951172
45 44.61350631713867
46 41.89106369018555
47 39.353248596191406


## 自定义nn Modules
继承nn.Module。需要比Sequential更复杂的模型时

In [24]:
# Problem sizes: number of samples, input width, hidden width, output width
N = 64
D_in = 1000
H = 100
D_out = 10

# Random training inputs and targets (inputs are drawn first, then targets)
x, y = torch.randn(N, D_in), torch.randn(N, D_out)

class TwoLayerNet(nn.Module):
    """Bias-free network: a linear layer, a ReLU, then a second linear layer.

    Args:
        D_in: number of input features.
        H: number of hidden units.
        D_out: number of output features.
    """

    def __init__(self, D_in, H, D_out):
        super().__init__()
        # Model architecture: two linear layers without bias terms
        self.linear1 = nn.Linear(D_in, H, bias=False)
        self.linear2 = nn.Linear(H, D_out, bias=False)

    def forward(self, x):
        """Return the prediction for the input batch ``x``."""
        hidden = self.linear1(x)
        activated = hidden.clamp(min=0)  # ReLU
        return self.linear2(activated)

model = TwoLayerNet(D_in, H, D_out)

loss_fn = nn.MSELoss(reduction='sum')
learning_rate = 1e-4  # 1e-3 to 1e-4 works well for Adam
optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)

for step in range(500):
    # Forward pass (calling the model runs its forward())
    y_pred = model(x)

    # Summed squared-error loss; evaluating it builds the computation graph
    loss = loss_fn(y_pred, y)
    print(step, loss.item())

    # Backward pass: populates .grad on every model parameter
    loss.backward()

    # Let the optimizer apply the update, then clear the gradients
    optimizer.step()
    optimizer.zero_grad()

0 684.5139770507812
1 667.6268310546875
2 651.1676025390625
3 635.1614379882812
4 619.5819091796875
5 604.3728637695312
6 589.5579833984375
7 575.171875
8 561.2047729492188
9 547.6242065429688
10 534.4136352539062
11 521.5889892578125
12 509.19317626953125
13 497.1786193847656
14 485.5188903808594
15 474.1136474609375
16 463.02880859375
17 452.27691650390625
18 441.8218688964844
19 431.606689453125
20 421.6548156738281
21 411.95623779296875
22 402.47705078125
23 393.25445556640625
24 384.36181640625
25 375.6669006347656
26 367.17913818359375
27 358.9237060546875
28 350.8595886230469
29 342.94256591796875
30 335.21478271484375
31 327.6778564453125
32 320.2955322265625
33 313.05682373046875
34 305.96783447265625
35 299.0216979980469
36 292.2017822265625
37 285.52008056640625
38 278.98016357421875
39 272.5668640136719
40 266.31854248046875
41 260.1911926269531
42 254.17552185058594
43 248.2553253173828
44 242.4443359375
45 236.74142456054688
46 231.14573669433594
47 225.6807403564453
48 2

# 数据集

In [None]:
# Dataset loaders live in the `torchvision.datasets` subpackage (plural);
# `torchvision.dataset` does not exist and fails to import
import torchvision.datasets