In [2]:
import numpy as np

In [3]:
# Problem dimensions:
#   N     - batch size
#   D_in  - input dimension
#   H     - hidden-layer dimension
#   D_out - output dimension
N = 64
D_in = 1000
H = 100
D_out = 10

In [4]:
# Random input and target data, drawn from the standard normal distribution
x = np.random.standard_normal(size=(N, D_in))
y = np.random.standard_normal(size=(N, D_out))

In [5]:
# Randomly initialise the weights of both layers
w1 = np.random.standard_normal(size=(D_in, H))
w2 = np.random.standard_normal(size=(H, D_out))

In [6]:
# Step size applied to the gradients when the weights are updated
learning_rate = 1e-6

In [7]:
for t in range(500):
    # Forward pass: linear layer -> ReLU -> linear layer gives the predicted y
    h = np.dot(x, w1)
    h_relu = np.maximum(h, 0)
    y_pred = np.dot(h_relu, w2)

    # Sum-of-squares loss; the residual is reused by the backward pass below
    residual = y_pred - y
    loss = np.square(residual).sum()
    print(t, loss)

    # Backward pass: gradients of the loss with respect to w2, then w1
    grad_y_pred = 2.0 * residual
    grad_w2 = np.dot(h_relu.T, grad_y_pred)
    grad_h_relu = np.dot(grad_y_pred, w2.T)
    # ReLU passes no gradient where its input was negative
    grad_h = grad_h_relu.copy()
    grad_h[h < 0] = 0
    grad_w1 = np.dot(x.T, grad_h)

    # Gradient-descent update, performed in place
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

0 33922936.30200172
1 30620957.26118327
2 30626121.83231876
3 28887723.509839572
4 23370143.00435086
5 15749007.2523231
6 9174116.854688026
7 5024978.31716456
8 2842952.0224032905
9 1765755.2106240846
10 1221997.0426821292
11 922590.4438565008
12 737727.7864147462
13 610602.7213659992
14 515699.80599002895
15 441068.8865185587
16 380624.3759688017
17 330523.36451546015
18 288606.251357481
19 253158.39238515784
20 222956.76935083256
21 197068.49656397768
22 174767.78321630834
23 155456.88542895438
24 138682.86051651876
25 124061.44342239501
26 111269.09297344927
27 100032.21848210468
28 90137.37583864178
29 81389.04346091516
30 73647.75743775716
31 66767.63388158599
32 60642.13627416424
33 55167.505742551104
34 50264.36154876117
35 45867.017354772266
36 41918.08592340113
37 38360.97816634704
38 35150.86284078189
39 32248.40165369281
40 29620.629101338833
41 27235.90816278985
42 25070.07649369538
43 23099.82815463491
44 21306.088171855023
45 19672.12823225274
46 18179.57819061669
47 1681

391 0.0024712189957011464
392 0.0023820912176616372
393 0.002296211086066658
394 0.002213445446143252
395 0.0021336679418041387
396 0.0020567788496392726
397 0.001982711630953237
398 0.001911325303817943
399 0.0018425263339420879
400 0.0017762130729643147
401 0.0017122990922562725
402 0.0016507028713676846
403 0.0015913481615403135
404 0.001534150052484568
405 0.0014790001760532955
406 0.001425860614882615
407 0.0013746313006547453
408 0.0013252549342022708
409 0.0012776929394091093
410 0.0012318189866859844
411 0.0011876207753071044
412 0.0011450038037946425
413 0.0011039241121995956
414 0.0010643554930541265
415 0.001026194207641239
416 0.0009894089044541442
417 0.0009539511517965235
418 0.0009197654759832071
419 0.0008868255717587345
420 0.0008550762593347625
421 0.0008244578799380179
422 0.0007949424643152968
423 0.0007664911626821026
424 0.0007390702511030057
425 0.0007126420430656419
426 0.000687153890988064
427 0.0006625792286018479
428 0.000638892457544162
429 0.000616060547706

In [8]:
import torch

In [9]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [10]:
# Problem dimensions:
#   N     - batch size
#   D_in  - input dimension
#   H     - hidden-layer dimension
#   D_out - output dimension
N = 64
D_in = 1000
H = 100
D_out = 10

In [11]:
# Random input and target tensors, created directly on the chosen device
x = torch.randn((N, D_in), device=device)
y = torch.randn((N, D_out), device=device)

# Randomly initialised weights for the two layers
w1 = torch.randn((D_in, H), device=device)
w2 = torch.randn((H, D_out), device=device)

In [12]:
# Step size applied to the gradients when the weights are updated
learning_rate = 1e-6

In [13]:
for t in range(500):
    # Forward pass: linear layer -> ReLU -> linear layer gives the predicted y
    h = torch.mm(x, w1)
    h_relu = torch.clamp(h, min=0)
    y_pred = torch.mm(h_relu, w2)

    # The loss is a zero-dimensional tensor; .item() extracts the plain
    # Python number it holds so it can be printed.
    residual = y_pred - y
    loss = residual.pow(2).sum()
    print(t, loss.item())

    # Backward pass: gradients of the loss with respect to w2, then w1
    grad_y_pred = 2.0 * residual
    grad_w2 = torch.mm(h_relu.t(), grad_y_pred)
    grad_h_relu = torch.mm(grad_y_pred, w2.t())
    # ReLU passes no gradient where its input was negative
    grad_h = grad_h_relu.masked_fill(h < 0, 0)
    grad_w1 = torch.mm(x.t(), grad_h)

    # Gradient-descent update, performed in place
    w1 -= learning_rate * grad_w1
    w2 -= learning_rate * grad_w2

0 35376504.0
1 34342960.0
2 36938024.0
3 36429448.0
4 29491878.0
5 18689596.0
6 9820698.0
7 4853100.0
8 2586478.25
9 1597107.125
10 1132032.75
11 879261.5625
12 718666.5625
13 603587.75
14 514728.1875
15 443177.9375
16 384207.625
17 334811.5625
18 293089.875
19 257600.640625
20 227270.984375
21 201164.84375
22 178604.53125
23 159055.09375
24 142029.375
25 127151.203125
26 114096.0625
27 102606.828125
28 92466.1875
29 83447.8046875
30 75438.015625
31 68314.390625
32 61959.265625
33 56285.328125
34 51201.88671875
35 46641.1953125
36 42549.7578125
37 38862.03125
38 35533.0625
39 32521.64453125
40 29795.38671875
41 27323.15234375
42 25079.37890625
43 23040.7734375
44 21185.1328125
45 19494.6875
46 17955.46484375
47 16549.44921875
48 15264.39453125
49 14089.05078125
50 13012.87109375
51 12026.3623046875
52 11121.083984375
53 10290.26171875
54 9526.8291015625
55 8824.525390625
56 8178.4658203125
57 7582.9619140625
58 7034.05908203125
59 6528.02001953125
60 6061.0458984375
61 5630.0908203125


427 7.483785157091916e-05
428 7.348045619437471e-05
429 7.225052104331553e-05
430 7.084364915499464e-05
431 6.960834434721619e-05
432 6.843965093139559e-05
433 6.748258601874113e-05
434 6.603606743738055e-05
435 6.491360545624048e-05
436 6.394374941010028e-05
437 6.289623706834391e-05
438 6.181615026434883e-05
439 6.081028186599724e-05
440 5.9860016335733235e-05
441 5.885825521545485e-05
442 5.797696212539449e-05
443 5.7213033869629726e-05
444 5.620321098831482e-05
445 5.547437467612326e-05
446 5.441176472231746e-05
447 5.35636099812109e-05
448 5.303089346853085e-05
449 5.222402251092717e-05
450 5.1491915655788034e-05
451 5.102565046399832e-05
452 5.011096800444648e-05
453 4.956290649715811e-05
454 4.882616121903993e-05
455 4.7976671339711174e-05
456 4.725985854747705e-05
457 4.660899503505789e-05
458 4.617930972017348e-05
459 4.571184399537742e-05
460 4.4994962081545964e-05
461 4.451934728422202e-05
462 4.391728725749999e-05
463 4.339035149314441e-05
464 4.2866726289503276e-05
465 4.2

In [1]:
import torch

# Use the GPU when CUDA is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Problem dimensions:
#   N     - batch size
#   D_in  - input dimension
#   H     - hidden-layer dimension
#   D_out - output dimension
N = 64
D_in = 1000
H = 100
D_out = 10

In [2]:
# Random input and target tensors, created directly on the chosen device
x = torch.randn((N, D_in), device=device)
y = torch.randn((N, D_out), device=device)

# Random weight tensors. requires_grad=True asks autograd to compute
# gradients with respect to these tensors during the backward pass.
w1 = torch.randn((D_in, H), device=device, requires_grad=True)
w2 = torch.randn((H, D_out), device=device, requires_grad=True)

In [3]:
# Step size applied to the gradients when the weights are updated
learning_rate = 1e-6

In [4]:
for t in range(500):
    # Forward pass using tensor operations. Because w1 and w2 have
    # requires_grad=True, autograd records a computation graph for these
    # operations, so no intermediate values have to be kept by hand.
    y_pred = torch.mm(
        torch.clamp(torch.mm(x, w1), min=0),
        w2,
    )

    # The loss is a tensor of shape (); .item() gives its Python value
    loss = (y_pred - y).pow(2).sum()
    print(t, loss.item())

    # Let autograd run the backward pass. Afterwards w1.grad and w2.grad hold
    # the gradients of the loss with respect to w1 and w2.
    loss.backward()

    # Update the weights in place. torch.no_grad() keeps the update itself
    # out of the computation graph.
    with torch.no_grad():
        for weight in (w1, w2):
            weight -= learning_rate * weight.grad
            # Gradients accumulate across backward() calls, so reset them
            # manually once they have been applied.
            weight.grad.zero_()

0 40479408.0
1 38384120.0
2 37456552.0
3 31778112.0
4 21542916.0
5 12001392.0
6 6113673.5
7 3273590.5
8 1992737.5
9 1384663.625
10 1055103.375
11 848751.875
12 703443.4375
13 593063.75
14 505408.96875
15 434086.5625
16 375236.0625
17 326021.46875
18 284618.15625
19 249529.90625
20 219598.125
21 193905.875
22 171768.046875
23 152611.421875
24 135990.671875
25 121514.765625
26 108834.734375
27 97691.3828125
28 87872.796875
29 79190.875
30 71492.875
31 64652.11328125
32 58560.37109375
33 53113.7265625
34 48243.6484375
35 43879.4609375
36 39964.078125
37 36445.9609375
38 33280.50390625
39 30427.75
40 27851.240234375
41 25521.421875
42 23408.392578125
43 21489.74609375
44 19745.337890625
45 18157.44921875
46 16716.0703125
47 15400.7333984375
48 14199.1318359375
49 13100.4794921875
50 12094.669921875
51 11173.619140625
52 10331.50390625
53 9559.3291015625
54 8849.900390625
55 8197.455078125
56 7597.814453125
57 7045.87353515625
58 6536.80810546875
59 6067.4384765625
60 5634.27001953125
61 52

385 0.00041355934808962047
386 0.0004027709655929357
387 0.0003925684723071754
388 0.00038270934601314366
389 0.0003722218098118901
390 0.00036307855043560266
391 0.00035361485788598657
392 0.0003452679666224867
393 0.0003366434248164296
394 0.0003287450526840985
395 0.0003210802678950131
396 0.0003125945513602346
397 0.0003042323514819145
398 0.0002972735383082181
399 0.0002904593711718917
400 0.0002843136026058346
401 0.0002776563633233309
402 0.00027071149088442326
403 0.0002639194135554135
404 0.00025784893659874797
405 0.0002524365554563701
406 0.00024646081146784127
407 0.00024111477250698954
408 0.00023575520026497543
409 0.00023060457897372544
410 0.00022591272136196494
411 0.00022074636945035309
412 0.0002154983376385644
413 0.00021087794448249042
414 0.0002066966553684324
415 0.0002024151326622814
416 0.00019834090198855847
417 0.0001940483198268339
418 0.00018930432270281017
419 0.00018547495710663497
420 0.00018214277224615216
421 0.00017818956985138357
422 0.00017468680744

In [5]:
import torch

In [8]:

class MyReLU(torch.autograd.Function):
    """
    Custom autograd function implementing ReLU.

    Subclassing torch.autograd.Function and providing static forward and
    backward methods is how a user-defined differentiable operation is
    plugged into autograd.
    """

    @staticmethod
    def forward(ctx, x):
        """
        Compute the output for the input tensor ``x``.

        ``ctx`` is a context object used to stash whatever the backward pass
        will need; here the input itself is saved.
        """
        ctx.save_for_backward(x)
        return torch.clamp(x, min=0)

    @staticmethod
    def backward(ctx, grad_output):
        """
        Turn the gradient of the loss with respect to the output
        (``grad_output``) into the gradient with respect to the input.

        The input saved during forward is read back from ``ctx``; the
        gradient is zeroed wherever that input was negative and passed
        through unchanged everywhere else.
        """
        (saved_input,) = ctx.saved_tensors
        # masked_fill is out of place, so grad_output itself is left untouched
        return grad_output.masked_fill(saved_input < 0, 0)


# Use the GPU when CUDA is available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

# Problem dimensions:
#   N     - batch size
#   D_in  - input dimension
#   H     - hidden-layer dimension
#   D_out - output dimension
N = 64
D_in = 1000
H = 100
D_out = 10

# Random input and target tensors
x = torch.randn((N, D_in), device=device)
y = torch.randn((N, D_out), device=device)

# Random weight tensors that autograd should track
w1 = torch.randn((D_in, H), device=device, requires_grad=True)
w2 = torch.randn((H, D_out), device=device, requires_grad=True)

# Step size applied to the gradients when the weights are updated
learning_rate = 1e-6

In [9]:
for t in range(500):
    # Forward pass: the custom ReLU is invoked through MyReLU.apply
    h_relu = MyReLU.apply(torch.mm(x, w1))
    y_pred = torch.mm(h_relu, w2)

    # Compute the loss and print it
    loss = (y_pred - y).pow(2).sum()
    print(t, loss.item())

    # Let autograd run the backward pass
    loss.backward()

    with torch.no_grad():
        for weight in (w1, w2):
            # Gradient-descent update, performed in place
            weight -= learning_rate * weight.grad
            # Reset the gradient manually once it has been applied
            weight.grad.zero_()

0 30490908.0
1 24387528.0
2 20769012.0
3 17129984.0
4 13088678.0
5 9273996.0
6 6232811.0
7 4117984.5
8 2753325.0
9 1904299.0
10 1373370.625
11 1033339.0
12 806664.6875
13 648943.75
14 534457.625
15 448407.21875
16 381733.125
17 328469.34375
18 284937.3125
19 248802.515625
20 218445.34375
21 192653.0
22 170563.03125
23 151515.296875
24 134995.75
25 120611.71875
26 108038.4375
27 97004.0625
28 87296.7890625
29 78716.296875
30 71110.359375
31 64354.171875
32 58331.5546875
33 52949.6015625
34 48131.3046875
35 43810.765625
36 39929.1875
37 36438.11328125
38 33290.73828125
39 30448.92578125
40 27880.0078125
41 25555.90234375
42 23449.91796875
43 21537.640625
44 19800.310546875
45 18220.11328125
46 16780.77734375
47 15467.896484375
48 14270.544921875
49 13175.3779296875
50 12173.966796875
51 11257.5673828125
52 10418.001953125
53 9647.984375
54 8940.876953125
55 8290.376953125
56 7691.9140625
57 7140.9208984375
58 6633.2646484375
59 6165.271484375
60 5733.171875
61 5334.37890625
62 4965.93847

391 0.00031093222787603736
392 0.0003022081800736487
393 0.00029338820604607463
394 0.0002853916957974434
395 0.0002774990862235427
396 0.00027035720995627344
397 0.000262516550719738
398 0.0002555289538577199
399 0.00024841976119205356
400 0.00024208596732933074
401 0.00023558530665468425
402 0.00022929029364604503
403 0.00022311262728180736
404 0.00021726803970523179
405 0.00021177035523578525
406 0.00020631799998227507
407 0.00020051478350069374
408 0.00019567713025026023
409 0.00019093073206022382
410 0.00018577011360321194
411 0.0001816182630136609
412 0.0001771648821886629
413 0.0001731448428472504
414 0.0001688365009613335
415 0.00016467843670397997
416 0.00016051680722739547
417 0.0001570539316162467
418 0.0001532282622065395
419 0.0001493250165367499
420 0.00014586006000172347
421 0.00014287288649939
422 0.00013939780183136463
423 0.00013670534826815128
424 0.00013362118625082076
425 0.00013019995822105557
426 0.00012774523929692805
427 0.00012453639646992087
428 0.00012223990

In [10]:
import torch

# Use the first CUDA device when available, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

In [11]:
# Problem dimensions:
#   N     - batch size
#   D_in  - input dimension
#   H     - hidden-layer dimension
#   D_out - output dimension
N = 64
D_in = 1000
H = 100
D_out = 10

# Random input and target tensors, created directly on the chosen device
x = torch.randn((N, D_in), device=device)
y = torch.randn((N, D_out), device=device)


In [12]:
# Define the model as a sequence of layers using the nn package.
# nn.Sequential is a module that contains other modules and applies them in
# order to produce its output. Each Linear module computes its output from
# its input with a linear function and holds its own weight and bias tensors.
model = torch.nn.Sequential(
    torch.nn.Linear(in_features=D_in, out_features=H),
    torch.nn.ReLU(),
    torch.nn.Linear(in_features=H, out_features=D_out),
)
# Once built, move the model to the desired device with .to()
model = model.to(device)

# The nn package also provides common loss functions; mean squared error
# (MSE) is used here. reduction='sum' sums the squared errors rather than
# averaging them, which keeps this example consistent with the hand-written
# losses in the earlier examples. In practice averaging with
# reduction='mean' (named 'elementwise_mean' in older PyTorch releases) is
# more common.
loss_fn = torch.nn.MSELoss(reduction='sum')

# Step size applied to the gradients when the parameters are updated
learning_rate = 1e-4

In [13]:
for t in range(500):

    # Forward pass: compute the predicted y by passing x to the model.
    # Module objects override __call__, so they can be called like functions:
    # a tensor goes in and an output tensor comes back.
    y_pred = model(x)

    # Compute and print the loss. The loss function takes the predicted and
    # the true values of y and returns a tensor containing the loss.
    loss = loss_fn(y_pred, y)
    print(t, loss.item())

    # Zero the gradients before running the backward pass
    model.zero_grad()

    # Backward pass: compute the gradient of the loss with respect to every
    # learnable parameter of the model. Module parameters are stored in
    # tensors with requires_grad=True, so this call covers all of them.
    loss.backward()

    # Gradient-descent update. Each parameter is a tensor, so it is updated
    # in place exactly like the hand-managed weights were before.
    # torch.no_grad() already keeps the update out of the computation graph,
    # so the parameter is modified directly rather than through `.data`,
    # which would hide the in-place change from autograd's version tracking.
    with torch.no_grad():
        for param in model.parameters():
            param -= learning_rate * param.grad

0 731.995361328125
1 679.1925659179688
2 633.917724609375
3 594.18017578125
4 558.9620971679688
5 527.3099975585938
6 498.2781677246094
7 471.5604248046875
8 446.6444091796875
9 423.25738525390625
10 401.3170166015625
11 380.5508117675781
12 360.9151611328125
13 342.3538513183594
14 324.658935546875
15 307.81005859375
16 291.6984558105469
17 276.4001770019531
18 261.86358642578125
19 247.87969970703125
20 234.57266235351562
21 221.91665649414062
22 209.89010620117188
23 198.4191436767578
24 187.49024963378906
25 177.0889892578125
26 167.21827697753906
27 157.84808349609375
28 148.94020080566406
29 140.4915313720703
30 132.45651245117188
31 124.85338592529297
32 117.66509246826172
33 110.89397430419922
34 104.4852066040039
35 98.41480255126953
36 92.67346954345703
37 87.25593566894531
38 82.13833618164062
39 77.31446838378906
40 72.7578125
41 68.46537017822266
42 64.41796112060547
43 60.59678268432617
44 57.00053024291992
45 53.6152458190918
46 50.43129348754883
47 47.441062927246094
48

351 0.00019072591385338455
352 0.00018435687525197864
353 0.00017818754713516682
354 0.0001722334127407521
355 0.0001664798619458452
356 0.00016092271835077554
357 0.00015555178106296808
358 0.00015036696277093142
359 0.000145352678373456
360 0.00014050514437258244
361 0.0001358214212814346
362 0.00013129318540450186
363 0.00012691860320046544
364 0.00012269271246623248
365 0.00011860641825478524
366 0.00011465988063719124
367 0.0001108449578168802
368 0.00010716034739743918
369 0.00010359541192883626
370 0.00010015237057814375
371 9.682211384642869e-05
372 9.360627882415429e-05
373 9.049452637555078e-05
374 8.749161497689784e-05
375 8.458937372779474e-05
376 8.17812979221344e-05
377 7.906851533334702e-05
378 7.6444455771707e-05
379 7.391376129817218e-05
380 7.14622947270982e-05
381 6.90919769112952e-05
382 6.680560181848705e-05
383 6.459131691372022e-05
384 6.245637632673606e-05
385 6.038870196789503e-05
386 5.8390422054799274e-05
387 5.645751298288815e-05
388 5.4589083447353914e-05
3

In [14]:
# Optim

import torch

# Problem dimensions:
#   N     - batch size
#   D_in  - input dimension
#   H     - hidden-layer dimension
#   D_out - output dimension
N = 64
D_in = 1000
H = 100
D_out = 10

# Random input and target tensors
x = torch.randn((N, D_in))
y = torch.randn((N, D_out))

# Define the model and the loss function with the nn package
model = torch.nn.Sequential(
    torch.nn.Linear(in_features=D_in, out_features=H),
    torch.nn.ReLU(),
    torch.nn.Linear(in_features=H, out_features=D_out),
)
loss_fn = torch.nn.MSELoss(reduction='sum')

In [15]:
# Define an Optimizer with the optim package; it updates the model's weights
# for us. Adam is used here, and the optim package offers many other
# optimisation algorithms. The first argument tells the optimizer which
# tensors it should update.
learning_rate = 1e-4
optimizer = torch.optim.Adam(params=model.parameters(), lr=learning_rate)


In [16]:
for t in range(500):

    # Forward pass: feed x to the model to get the predicted y
    y_pred = model(x)

    # Compute the loss and print it
    loss = loss_fn(y_pred, y)
    print(f"{t} {loss.item()}")

    # Before the backward pass, have the optimizer zero the gradients of all
    # tensors it updates (the learnable weights of the model)
    optimizer.zero_grad()

    # Backward pass: gradient of the loss with respect to the parameters
    loss.backward()

    # One optimizer step updates every parameter it manages
    optimizer.step()

0 720.2864379882812
1 702.9299926757812
2 686.0489501953125
3 669.7139282226562
4 653.8800048828125
5 638.479248046875
6 623.4898681640625
7 609.0009765625
8 594.912353515625
9 581.1812744140625
10 567.7374267578125
11 554.6830444335938
12 542.0006103515625
13 529.6609497070312
14 517.6701049804688
15 506.005859375
16 494.6691589355469
17 483.6268310546875
18 472.9267272949219
19 462.4908447265625
20 452.2975769042969
21 442.3665771484375
22 432.6736145019531
23 423.1961669921875
24 413.96771240234375
25 404.96636962890625
26 396.1551208496094
27 387.502685546875
28 379.0046081542969
29 370.6878356933594
30 362.5466003417969
31 354.5804443359375
32 346.83306884765625
33 339.2843933105469
34 331.873046875
35 324.6117858886719
36 317.5010070800781
37 310.5386962890625
38 303.6908264160156
39 296.97698974609375
40 290.4043884277344
41 283.9589538574219
42 277.6510925292969
43 271.47723388671875
44 265.3930358886719
45 259.4109802246094
46 253.5265350341797
47 247.77081298828125
48 242.130

397 1.5911637092358433e-05
398 1.4991624993854202e-05
399 1.4123294931778219e-05
400 1.3306093933351804e-05
401 1.2532607797766104e-05
402 1.1805336725956295e-05
403 1.1115560482721776e-05
404 1.0465932973602321e-05
405 9.85384758678265e-06
406 9.277428034693003e-06
407 8.731622074265033e-06
408 8.21844059828436e-06
409 7.734796781733166e-06
410 7.278727480297675e-06
411 6.848836619610665e-06
412 6.443271104217274e-06
413 6.060316081857309e-06
414 5.700783276552102e-06
415 5.361946932680439e-06
416 5.041183158027707e-06
417 4.740674739878159e-06
418 4.456874194147531e-06
419 4.189482297078939e-06
420 3.938125701097306e-06
421 3.70122120330052e-06
422 3.4773222523654113e-06
423 3.2672212455508998e-06
424 3.070547563766013e-06
425 2.8840765935456147e-06
426 2.7085861802333966e-06
427 2.5442313926760107e-06
428 2.388715529377805e-06
429 2.2429958335123956e-06
430 2.1054368062323192e-06
431 1.9764029275393113e-06
432 1.8551430684965453e-06
433 1.741344021866098e-06
434 1.6338005934812827e-

In [17]:
# customize nn.

import torch

class TwoLayerNet(torch.nn.Module):
    """Two linear layers with a ReLU non-linearity between them."""

    def __init__(self, D_in, H, D_out):
        """
        Instantiate the two nn.Linear modules and keep them as member
        variables: D_in -> H and H -> D_out.
        """
        super().__init__()
        self.linear1 = torch.nn.Linear(D_in, H)
        self.linear2 = torch.nn.Linear(H, D_out)

    def forward(self, x):
        """
        Take an input tensor and return an output tensor.

        The modules created in the constructor can be used here together
        with arbitrary (differentiable) tensor operations.
        """
        hidden = torch.clamp(self.linear1(x), min=0)
        return self.linear2(hidden)
# Problem dimensions:
#   N     - batch size
#   D_in  - input dimension
#   H     - hidden-layer dimension
#   D_out - output dimension
N = 64
D_in = 1000
H = 100
D_out = 10

# Random input and target tensors
x = torch.randn((N, D_in))
y = torch.randn((N, D_out))

# Build the model by instantiating the class defined above
model = TwoLayerNet(D_in=D_in, H=H, D_out=D_out)

# Build the loss function and the optimizer. model.parameters(), passed to
# the SGD constructor, yields the learnable parameters of the two nn.Linear
# modules that are members of the model.
loss_fn = torch.nn.MSELoss(reduction='sum')
optimizer = torch.optim.SGD(params=model.parameters(), lr=1e-4)
for t in range(500):
    # Forward pass: feed x to the model to get the predicted y
    y_pred = model(x)

    # Compute the loss and print it
    loss = loss_fn(y_pred, y)
    print(f"{t} {loss.item()}")

    # Zero the gradients, run the backward pass, then update the weights
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

0 660.17431640625
1 613.322021484375
2 572.3101806640625
3 536.0904541015625
4 503.7911682128906
5 474.54205322265625
6 447.54534912109375
7 422.71185302734375
8 399.5103454589844
9 377.7994384765625
10 357.3600769042969
11 338.0434265136719
12 319.8118896484375
13 302.4556884765625
14 286.08209228515625
15 270.55499267578125
16 255.73660278320312
17 241.57582092285156
18 228.0094757080078
19 215.09295654296875
20 202.8623504638672
21 191.25196838378906
22 180.25782775878906
23 169.8169708251953
24 159.89015197753906
25 150.458251953125
26 141.51165771484375
27 133.05078125
28 125.06277465820312
29 117.52413940429688
30 110.40528106689453
31 103.69328308105469
32 97.3614273071289
33 91.4104232788086
34 85.79354095458984
35 80.51587677001953
36 75.56578063964844
37 70.91744995117188
38 66.55717468261719
39 62.46943283081055
40 58.64437484741211
41 55.058319091796875
42 51.6992301940918
43 48.552574157714844
44 45.577171325683594
45 42.78523254394531
46 40.17023849487305
47 37.7179412841

360 0.0002723940706346184
361 0.0002644466294441372
362 0.00025674342759884894
363 0.0002492624334990978
364 0.00024200444750022143
365 0.00023495731875300407
366 0.0002281306660734117
367 0.00022149665164761245
368 0.00021505748736672103
369 0.00020881282398477197
370 0.00020276228315196931
371 0.00019688358588609844
372 0.00019117540796287358
373 0.0001856314775068313
374 0.0001802625774871558
375 0.00017504353309050202
376 0.00016997069178614765
377 0.00016505482199136168
378 0.0001602813572390005
379 0.00015565547801088542
380 0.00015115899441298097
381 0.00014679774176329374
382 0.00014255876885727048
383 0.00013844002387486398
384 0.0001344518386758864
385 0.00013057125033810735
386 0.00012681267980951816
387 0.00012316147331148386
388 0.00011960935080423951
389 0.00011616948177106678
390 0.00011282659397693351
391 0.00010958416532957926
392 0.00010643055429682136
393 0.00010338186984881759
394 0.0001004104342428036
395 9.752649202710018e-05
396 9.47289154282771e-05
397 9.2010406

In [3]:
# 可以用控制流来构建中间层，方便构建深层网络、权值共享（减少参数）

import random
import torch

class DynamicNet(torch.nn.Module):
    """Network whose number of hidden layers is chosen at random per call."""

    def __init__(self, D_in, H, D_out):
        """
        Create the three nn.Linear instances used by the forward pass:
        D_in -> H, H -> H and H -> D_out.
        """
        super().__init__()
        self.input_linear = torch.nn.Linear(D_in, H)
        self.middle_linear = torch.nn.Linear(H, H)
        self.output_linear = torch.nn.Linear(H, D_out)

    def forward(self, x):
        """
        Run the input layer, then apply middle_linear a randomly chosen
        0, 1, 2 or 3 times, then run the output layer.

        Every forward pass builds a fresh dynamic computation graph, so
        ordinary Python control flow such as loops and conditionals can be
        used while defining it. Reusing the same module several times within
        one graph is perfectly safe; this is a big improvement over Lua
        Torch, where each module could be used only once.
        """
        hidden = torch.clamp(self.input_linear(x), min=0)
        extra_layers = random.randint(0, 3)
        for _ in range(extra_layers):
            # The same weights are shared by every repetition
            hidden = torch.clamp(self.middle_linear(hidden), min=0)
        return self.output_linear(hidden)


# Problem dimensions:
#   N     - batch size
#   D_in  - input dimension
#   H     - hidden-layer dimension
#   D_out - output dimension
N = 64
D_in = 1000
H = 100
D_out = 10

# Random input and target tensors
x = torch.randn((N, D_in))
y = torch.randn((N, D_out))

# Build the model by instantiating the class defined above
model = DynamicNet(D_in=D_in, H=H, D_out=D_out)

# Build the loss function and the optimizer. Training this unusual model
# with plain stochastic gradient descent is difficult, so momentum is used.
criterion = torch.nn.MSELoss(reduction='sum')
optimizer = torch.optim.SGD(params=model.parameters(), lr=1e-4, momentum=0.9)
for t in range(500):

    # Forward pass: feed x to the model to get the predicted y
    y_pred = model(x)

    # Compute the loss and print it
    loss = criterion(y_pred, y)
    print(f"{t} {loss.item()}")

    # Zero the gradients, run the backward pass, then update the weights
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

0 678.40625
1 677.8968505859375
2 705.8118896484375
3 676.80859375
4 674.8616333007812
5 673.5823974609375
6 670.7431030273438
7 533.7207641601562
8 667.0558471679688
9 644.7584838867188
10 662.161376953125
11 627.8104858398438
12 613.874755859375
13 652.74853515625
14 578.4938354492188
15 342.9739990234375
16 637.6456298828125
17 520.388671875
18 657.6228637695312
19 260.0716247558594
20 455.3699035644531
21 588.2593994140625
22 191.0424041748047
23 553.8648681640625
24 611.1359252929688
25 335.3816833496094
26 134.79811096191406
27 289.0176086425781
28 528.87744140625
29 243.4275665283203
30 123.29727935791016
31 451.66387939453125
32 424.5642395019531
33 188.45164489746094
34 372.83355712890625
35 119.07291412353516
36 105.57628631591797
37 313.840576171875
38 134.7579345703125
39 61.446937561035156
40 257.437744140625
41 252.14808654785156
42 173.5098876953125
43 87.60072326660156
44 67.57376861572266
45 649.2791748046875
46 310.70458984375
47 397.7838439941406
48 500.633056640625


432 0.4313868582248688
433 0.34600725769996643
434 0.37325790524482727
435 0.34310290217399597
436 0.5306800007820129
437 0.47933003306388855
438 0.10929522663354874
439 0.11106613278388977
440 0.10659852623939514
441 0.40075457096099854
442 0.33957958221435547
443 0.2604547441005707
444 0.6901151537895203
445 0.22539550065994263
446 0.4407001733779907
447 0.13454797863960266
448 0.41517868638038635
449 0.7609120011329651
450 0.23099371790885925
451 0.3186397850513458
452 0.6221746802330017
453 0.1028003841638565
454 0.28581365942955017
455 0.2559056878089905
456 0.22289763391017914
457 0.49493399262428284
458 0.12477445602416992
459 0.1636742800474167
460 0.5198514461517334
461 0.4421413540840149
462 0.48378753662109375
463 0.08710428327322006
464 0.47529616951942444
465 0.42590683698654175
466 0.38458409905433655
467 0.3555295169353485
468 0.3344631493091583
469 0.2918657064437866
470 0.503897488117218
471 0.10599182546138763
472 0.5111166834831238
473 0.42031264305114746
474 0.38926

In [2]:
import torch
from torch import nn
from torchviz import make_dot, make_dot_from_trace