# Time: 20190521

In [2]:
### NumPy warm-up: random data and initial weights for a two-layer network
import numpy as np

# Seed the global NumPy RNG so that the random data and the initial weights
# (and therefore any training run that starts from them) are identical after
# every "Restart Kernel -> Run All".
np.random.seed(0)

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random input and output data
x = np.random.randn(N, D_in)   # inputs,  shape (N, D_in)
y = np.random.randn(N, D_out)  # targets, shape (N, D_out)

# Randomly initialize weights
w1 = np.random.randn(D_in, H)   # input -> hidden weights,  shape (D_in, H)
w2 = np.random.randn(H, D_out)  # hidden -> output weights, shape (H, D_out)

In [5]:
# Step size used by the plain gradient-descent weight updates.
learning_rate = 1.0e-6

In [9]:
# Sanity check: the shapes must chain as x . w1 . w2 and end up matching y.
print(*(arr.shape for arr in (x, w1, w2, y)))

(64, 1000) (1000, 100) (100, 10) (64, 10)


In [10]:
for t in range(500):
    # Forward pass: linear layer -> ReLU -> linear layer gives the prediction
    h = np.dot(x, w1)  # hidden pre-activations, 64 x 100
    h_relu = np.maximum(h, 0)
    # 64 x 10: the first axis is the batch size, the second the output dimension
    y_pred = np.dot(h_relu, w2)

    # Loss is the squared error summed over every element of the batch
    residual = y_pred - y
    loss = np.square(residual).sum()
    print(t, loss)

    # Backward pass: apply the chain rule by hand, from the loss back to w1
    grad_y_pred = 2.0 * residual
    grad_w2 = np.dot(h_relu.T, grad_y_pred)
    grad_h_relu = np.dot(grad_y_pred, w2.T)
    # ReLU passes no gradient where its input was negative
    grad_h = np.where(h < 0, 0, grad_h_relu)
    grad_w1 = np.dot(x.T, grad_h)

    # Gradient-descent step. The key point is obtaining the partial derivatives
    # of the loss with respect to w1 and w2; they tell us how to update both.
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

0 29071796.83166056
1 24569664.9248389
2 23990569.539683305
3 23599752.978507653
4 21281797.298583124
5 16800051.25914912
6 11579698.948861374
7 7221074.3936439175
8 4311511.673420972
9 2615080.6007481837
10 1675560.23522867
11 1154040.3919267429
12 851663.444232638
13 664260.100171605
14 538902.6805823517
15 449162.936663236
16 381039.25097072707
17 327198.98921218
18 283437.1270618086
19 247108.79955871665
20 216519.08812808257
21 190471.2227356064
22 168179.75375379683
23 148995.39272204033
24 132404.1917407303
25 117982.39246101116
26 105410.23463013576
27 94402.83505110041
28 84734.27008618409
29 76223.44045207964
30 68704.02184766655
31 62058.90303455837
32 56170.34109058576
33 50925.974209136686
34 46245.154421304665
35 42060.63428068247
36 38310.99116094492
37 34944.954318762066
38 31916.612636830003
39 29189.413954740652
40 26727.58899776011
41 24501.532601200895
42 22485.51597616949
43 20657.08748955682
44 18998.09436637847
45 17481.95986475678
46 16102.433820643602
47 14844.

395 0.0006262286366099004
396 0.0006010720624414366
397 0.0005769324666357719
398 0.0005537678444015655
399 0.0005315541772414037
400 0.0005102230826911928
401 0.0004897512168682476
402 0.00047010693121295286
403 0.00045127046308652326
404 0.0004331772760854403
405 0.00041582549431623653
406 0.0003991748428669932
407 0.00038319310794755355
408 0.00036784647084604343
409 0.00035311848390734576
410 0.0003389811792594786
411 0.00032542427677077654
412 0.00031240437727078814
413 0.0002999061639331835
414 0.0002879132852640913
415 0.0002764100059726666
416 0.00026536130047236106
417 0.0002547562969110396
418 0.00024457773210321177
419 0.00023481350543244693
420 0.00022544312822651434
421 0.00021644504561805751
422 0.000207804200990206
423 0.0001995160625972076
424 0.0001915558653258038
425 0.00018391376414394795
426 0.00017657913173144184
427 0.00016954359523687132
428 0.00016278423577505008
429 0.0001562961433889529
430 0.00015006828977407523
431 0.00014409333916189874
432 0.00013835391291

In [20]:
### PyTorch version: same two-layer network, gradients derived by hand
import torch

dtype = torch.float
device = torch.device('cpu')

# Seed PyTorch's RNG so the random data, the initial weights and hence the
# printed loss curve are identical on every run of this cell.
torch.manual_seed(0)

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random input and target tensors
x = torch.randn(N, D_in, device=device, dtype=dtype)  # axis 0 is the batch axis
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Randomly initialize weights
w1 = torch.randn(D_in, H, device=device, dtype=dtype)
w2 = torch.randn(H, D_out, device=device, dtype=dtype)

learning_rate = 1e-6

# Define the model inline and train it with plain gradient descent
for t in range(500):
    # Forward pass: linear -> ReLU -> linear
    h = x.mm(w1)  # matrix multiply
    h_relu = h.clamp(min=0)
    y_pred = h_relu.mm(w2)

    # Sum of squared errors; item() extracts the Python scalar from the
    # 0-dimensional tensor
    loss = (y_pred - y).pow(2).sum().item()
    print(t, loss)

    # Backprop to compute gradients of w1 and w2 with respect to loss
    grad_y_pred = 2.0 * (y_pred - y)
    grad_w2 = h_relu.t().mm(grad_y_pred)
    grad_h_relu = grad_y_pred.mm(w2.t())
    grad_h = grad_h_relu.clone()
    grad_h[h < 0] = 0  # ReLU passes no gradient where its input was negative
    grad_w1 = x.t().mm(grad_h)

    # Update weights using gradient descent
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2


0 26326660.0
1 19587142.0
2 16630454.0
3 14733426.0
4 12753223.0
5 10539637.0
6 8187933.0
7 6066016.0
8 4327742.5
9 3046481.25
10 2143570.75
11 1532575.125
12 1121545.0
13 845843.125
14 657351.875
15 525864.5625
16 431368.375
17 361476.0
18 308120.34375
19 266201.1875
20 232431.75
21 204563.6875
22 181198.5625
23 161357.359375
24 144293.5625
25 129514.1015625
26 116610.0078125
27 105273.3671875
28 95269.203125
29 86399.1640625
30 78509.2421875
31 71472.53125
32 65175.87109375
33 59525.7421875
34 54444.125
35 49863.609375
36 45729.8515625
37 41994.08203125
38 38611.0859375
39 35539.4921875
40 32747.37109375
41 30207.861328125
42 27890.203125
43 25774.50390625
44 23841.552734375
45 22073.279296875
46 20453.568359375
47 18968.251953125
48 17605.169921875
49 16353.4541015625
50 15201.3515625
51 14142.017578125
52 13166.8701171875
53 12267.07421875
54 11435.560546875
55 10667.86328125
56 9957.986328125
57 9301.119140625
58 8692.6455078125
59 8128.05224609375
60 7604.12890625
61 7117.5410156

378 0.0028118910267949104
379 0.0027068164199590683
380 0.0026064799167215824
381 0.002513911109417677
382 0.0024209392722696066
383 0.0023340603802353144
384 0.0022506180685013533
385 0.002168811857700348
386 0.0020909549202769995
387 0.002014253754168749
388 0.0019451173720881343
389 0.0018773607444018126
390 0.0018104169284924865
391 0.001746603986248374
392 0.0016866527730599046
393 0.0016273710643872619
394 0.0015729484148323536
395 0.0015182936331257224
396 0.0014677481958642602
397 0.00141852255910635
398 0.0013705792371183634
399 0.0013256664387881756
400 0.0012808226747438312
401 0.0012383986031636596
402 0.001198306679725647
403 0.0011604635510593653
404 0.0011228073853999376
405 0.0010862541384994984
406 0.0010511985747143626
407 0.0010171328904107213
408 0.000985784805379808
409 0.0009534776327200234
410 0.0009245315450243652
411 0.0008970031049102545
412 0.0008703430648893118
413 0.0008434950141236186
414 0.0008169874199666083
415 0.0007930614519864321
416 0.00076965574407

In [13]:
# Scratch tensor used to compare Tensor.sum() with Tensor.sum().item().
test = torch.randn((10, 20))

In [17]:
# Summing every element yields a 0-dimensional tensor, not a Python number.
torch.sum(test)

tensor(-0.7230)

In [18]:
# item() unwraps the 0-dimensional tensor into a plain Python float.
torch.sum(test).item()

-0.7230345010757446

### 自动求导

In [21]:
import torch
dtype = torch.float
device = torch.device('cpu')
# device = torch.device('cuda:0') # run on gpu

# Seed PyTorch's RNG so the random data, the initial weights and hence the
# printed loss curve are identical on every run of this cell.
torch.manual_seed(0)

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Random input and target tensors (no gradients needed for these)
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# requires_grad=True asks autograd to track operations on the weights so that
# loss.backward() can populate w1.grad and w2.grad
w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)

learning_rate = 1e-6

for t in range(500):
    # Forward pass: linear -> ReLU -> linear; intermediates are not kept
    # because the backward pass is no longer written by hand
    y_pred = x.mm(w1).clamp(min=0).mm(w2)

    # Sum of squared errors, kept as a tensor so it can be differentiated
    loss = (y_pred - y).pow(2).sum()
    print(t, loss.item())

    # Backward pass: autograd accumulates d(loss)/d(w) into w1.grad and w2.grad
    loss.backward()

    # The update itself must not be recorded by autograd
    with torch.no_grad():
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad

        # Manually zero the gradients after updating weights; backward()
        # accumulates into .grad rather than overwriting it
        w1.grad.zero_()
        w2.grad.zero_()
        


0 34847208.0
1 30845190.0
2 30621580.0
3 28753476.0
4 23497150.0
5 16171323.0
6 9708265.0
7 5469016.5
8 3142985.75
9 1950028.5
10 1330893.75
11 987011.6875
12 776493.0625
13 635034.0
14 532048.0
15 452565.375
16 388914.875
17 336690.46875
18 293162.8125
19 256480.5
20 225362.25
21 198781.65625
22 175967.0
23 156310.953125
24 139271.421875
25 124438.578125
26 111486.234375
27 100150.515625
28 90188.453125
29 81409.0078125
30 73647.828125
31 66768.578125
32 60658.7109375
33 55210.625
34 50341.6953125
35 45982.40625
36 42070.51953125
37 38551.27734375
38 35379.11328125
39 32514.2109375
40 29923.859375
41 27576.3203125
42 25443.849609375
43 23504.46484375
44 21739.14453125
45 20130.70703125
46 18661.04296875
47 17315.939453125
48 16084.39453125
49 14954.46484375
50 13916.486328125
51 12962.83984375
52 12084.369140625
53 11274.390625
54 10527.1318359375
55 9836.7275390625
56 9197.6025390625
57 8605.7880859375
58 8057.7998046875
59 7549.03759765625
60 7076.55419921875
61 6637.7802734375
62 6

489 0.00034651922760531306
490 0.0003398341068532318
491 0.0003329093742650002
492 0.0003268508007749915
493 0.00032100355019792914
494 0.0003141887718811631
495 0.00030916588730178773
496 0.0003029830113518983
497 0.0002969988854601979
498 0.0002920021943282336
499 0.0002858381485566497
