## Numpy

In [1]:
import numpy as np

In [2]:
N,D_in,H, D_out = 64,1000,100,10
# N is batch size; 
#D_in is input dimension;
# H is hidden dimension; 
#D_out is output dimension.

In [3]:
# Create random input and output data
x = np.random.randn(N, D_in)
y = np.random.randn(N, D_out)

In [4]:
# Randomly initialize weights
w1 = np.random.randn(D_in, H)
w2 = np.random.randn(H, D_out)

In [5]:
learning_rate = 1e-6

In [10]:
for t in range(500):
  # Forward pass: compute predicted y
  h = x.dot(w1)
  h_relu = np.maximum(h, 0)
  y_pred = h_relu.dot(w2)
  
  # Compute and print loss
  loss = np.square(y_pred - y).sum()
  print(t, loss)
  
  # Backprop to compute gradients of w1 and w2 with respect to loss
  grad_y_pred = 2.0 * (y_pred - y)
  grad_w2 = h_relu.T.dot(grad_y_pred)
  grad_h_relu = grad_y_pred.dot(w2.T)
  grad_h = grad_h_relu.copy()
  grad_h[h < 0] = 0
  grad_w1 = x.T.dot(grad_h)
 
  # Update weights
  w1 -= learning_rate * grad_w1
  w2 -= learning_rate * grad_w2

0 1.1736386542566705e-23
1 1.1715175517778327e-23
2 1.1673252828421266e-23
3 1.16320841408541e-23
4 1.1603969421303725e-23
5 1.1599262607986642e-23
6 1.156441805765e-23
7 1.1576114502661166e-23
8 1.1573242782188753e-23
9 1.1543752663247305e-23
10 1.1505731891924206e-23
11 1.1467178290668058e-23
12 1.1451784496531246e-23
13 1.1435492011819312e-23
14 1.1401444091059657e-23
15 1.1420808103143078e-23
16 1.1429008038942326e-23
17 1.1431072977887867e-23
18 1.143471118204196e-23
19 1.1443895461812407e-23
20 1.144848381531936e-23
21 1.1398074816925948e-23
22 1.1409826567109916e-23
23 1.134211049975245e-23
24 1.1371928146802982e-23
25 1.1338819798092637e-23
26 1.1320602533589642e-23
27 1.13289033528223e-23
28 1.1301978319270877e-23
29 1.1235717239384435e-23
30 1.1197911194950105e-23
31 1.1179280154738394e-23
32 1.1126304146445386e-23
33 1.1143764179482304e-23
34 1.1090343315545362e-23
35 1.1072505338654122e-23
36 1.1093353898527882e-23
37 1.107037056494713e-23
38 1.1066439652760079e-23
39 1.107

463 6.046961241584928e-24
464 6.057629238609491e-24
465 6.0311301683162866e-24
466 6.025045462612188e-24
467 6.0280983504514964e-24
468 6.017422772292596e-24
469 5.999549435544245e-24
470 6.015327132242256e-24
471 6.0229466987395895e-24
472 6.01061050326673e-24
473 6.030649848309467e-24
474 6.037757974829493e-24
475 6.006481236148218e-24
476 6.020798252109624e-24
477 6.020548633099904e-24
478 6.021277439621488e-24
479 6.031223310862481e-24
480 6.015528984782867e-24
481 6.017398605157912e-24
482 6.002238709230426e-24
483 5.987437548569962e-24
484 5.9922791023894776e-24
485 5.9872758394992216e-24
486 5.941659770839612e-24
487 5.9581260027075884e-24
488 5.94951721757495e-24
489 5.94816583413306e-24
490 5.928314790402659e-24
491 5.930644009307029e-24
492 5.921907424843848e-24
493 5.938848145387958e-24
494 5.9336477154557565e-24
495 5.9475872458739814e-24
496 5.941500107961104e-24
497 5.950923076559207e-24
498 5.979960832184575e-24
499 5.976396496002201e-24


## Tensor

In [11]:
import torch

device = torch.device('cpu')
# device = torch.device('cuda') # Uncomment this to run on GPU

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random input and output data
x = torch.randn(N, D_in, device=device)
y = torch.randn(N, D_out, device=device)

# Randomly initialize weights
w1 = torch.randn(D_in, H, device=device)
w2 = torch.randn(H, D_out, device=device)

learning_rate = 1e-6
for t in range(500):
  # Forward pass: compute predicted y
  h = x.mm(w1)
  h_relu = h.clamp(min=0)
  y_pred = h_relu.mm(w2)

  # Compute and print loss; loss is a scalar, and is stored in a PyTorch Tensor
  # of shape (); we can get its value as a Python number with loss.item().
  loss = (y_pred - y).pow(2).sum()
  print(t, loss.item())

  # Backprop to compute gradients of w1 and w2 with respect to loss
  grad_y_pred = 2.0 * (y_pred - y)
  grad_w2 = h_relu.t().mm(grad_y_pred)
  grad_h_relu = grad_y_pred.mm(w2.t())
  grad_h = grad_h_relu.clone()
  grad_h[h < 0] = 0
  grad_w1 = x.t().mm(grad_h)

  # Update weights using gradient descent
  w1 -= learning_rate * grad_w1
  w2 -= learning_rate * grad_w2

0 21094304.0
1 14210768.0
2 10761236.0
3 8780712.0
4 7482153.0
5 6501708.5
6 5667946.5
7 4900743.0
8 4183143.25
9 3515075.75
10 2914003.25
11 2386322.75
12 1939144.0
13 1566869.0
14 1263856.0
15 1019513.3125
16 825139.625
17 670850.125
18 548919.6875
19 452365.65625
20 375834.8125
21 314911.375
22 266170.3125
23 226868.546875
24 194998.15625
25 168994.734375
26 147615.8125
27 129845.3125
28 114969.7421875
29 102406.046875
30 91700.125
31 82508.6171875
32 74560.140625
33 67642.8203125
34 61585.00390625
35 56246.47265625
36 51517.35546875
37 47304.7109375
38 43539.65625
39 40159.078125
40 37111.5703125
41 34353.3828125
42 31850.720703125
43 29572.498046875
44 27494.3671875
45 25592.82421875
46 23851.1328125
47 22252.470703125
48 20783.033203125
49 19430.05078125
50 18181.099609375
51 17026.78125
52 15957.970703125
53 14967.748046875
54 14050.00390625
55 13197.46484375
56 12406.1923828125
57 11669.5615234375
58 10983.0556640625
59 10343.158203125
60 9745.6572265625
61 9187.53515625
62 866

407 0.004505965393036604
408 0.0043437727726995945
409 0.004192729946225882
410 0.004046046175062656
411 0.0039033207576721907
412 0.0037705623544752598
413 0.0036409783642739058
414 0.0035183788277208805
415 0.0033952693920582533
416 0.0032781288027763367
417 0.003168439492583275
418 0.003057571593672037
419 0.0029566187877207994
420 0.0028587959241122007
421 0.0027608072850853205
422 0.0026704901829361916
423 0.002583865076303482
424 0.0024984758347272873
425 0.0024176116567105055
426 0.002339667407795787
427 0.002262029331177473
428 0.002189632970839739
429 0.0021199705079197884
430 0.0020521641708910465
431 0.0019864134956151247
432 0.0019255110528320074
433 0.001865495927631855
434 0.0018067813944071531
435 0.0017504645511507988
436 0.0016969889402389526
437 0.0016434363787993789
438 0.001594172208569944
439 0.001547179650515318
440 0.001500509912148118
441 0.0014550521736964583
442 0.0014159581623971462
443 0.0013746272306889296
444 0.0013345519546419382
445 0.001294380403123796


In [12]:
import torch

device = torch.device('cpu')
# device = torch.device('cuda') # Uncomment this to run on GPU

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold input and outputs
x = torch.randn(N, D_in, device=device)
y = torch.randn(N, D_out, device=device)

# Create random Tensors for weights; setting requires_grad=True means that we
# want to compute gradients for these Tensors during the backward pass.
w1 = torch.randn(D_in, H, device=device, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, requires_grad=True)

learning_rate = 1e-6
for t in range(500):
  # Forward pass: compute predicted y using operations on Tensors. Since w1 and
  # w2 have requires_grad=True, operations involving these Tensors will cause
  # PyTorch to build a computational graph, allowing automatic computation of
  # gradients. Since we are no longer implementing the backward pass by hand we
  # don't need to keep references to intermediate values.
  y_pred = x.mm(w1).clamp(min=0).mm(w2)
  
  # Compute and print loss. Loss is a Tensor of shape (), and loss.item()
  # is a Python number giving its value.
  loss = (y_pred - y).pow(2).sum()
  print(t, loss.item())

  # Use autograd to compute the backward pass. This call will compute the
  # gradient of loss with respect to all Tensors with requires_grad=True.
  # After this call w1.grad and w2.grad will be Tensors holding the gradient
  # of the loss with respect to w1 and w2 respectively.
  loss.backward()

  # Update weights using gradient descent. For this step we just want to mutate
  # the values of w1 and w2 in-place; we don't want to build up a computational
  # graph for the update steps, so we use the torch.no_grad() context manager
  # to prevent PyTorch from building a computational graph for the updates
  with torch.no_grad():
    w1 -= learning_rate * w1.grad
    w2 -= learning_rate * w2.grad

    # Manually zero the gradients after running the backward pass
    w1.grad.zero_()
    w2.grad.zero_()

0 30802824.0
1 23767346.0
2 19386378.0
3 15407568.0
4 11485234.0
5 8097805.0
6 5494142.0
7 3710088.5
8 2546547.75
9 1808084.75
10 1333240.125
11 1020737.6875
12 806978.9375
13 654657.875
14 541700.6875
15 454984.40625
16 386419.3125
17 331145.125
18 285804.71875
19 248026.3125
20 216258.875
21 189320.796875
22 166314.90625
23 146568.390625
24 129544.765625
25 114826.3125
26 102037.140625
27 90886.015625
28 81135.0546875
29 72573.7734375
30 65068.36328125
31 58453.015625
32 52604.1484375
33 47423.015625
34 42825.3984375
35 38734.1015625
36 35089.5
37 31833.58984375
38 28918.041015625
39 26304.41796875
40 23956.951171875
41 21847.61328125
42 19946.7265625
43 18231.458984375
44 16681.41796875
45 15279.5341796875
46 14010.2734375
47 12858.16015625
48 11811.623046875
49 10860.5791015625
50 9993.9970703125
51 9203.8857421875
52 8482.904296875
53 7824.22802734375
54 7221.93798828125
55 6670.75537109375
56 6165.771484375
57 5702.7626953125
58 5277.9482421875
59 4887.6865234375
60 4528.91113281

423 9.185117232846096e-05
424 9.007823246065527e-05
425 8.857922512106597e-05
426 8.685355714987963e-05
427 8.495117799611762e-05
428 8.351985889021307e-05
429 8.213432738557458e-05
430 8.090440678643063e-05
431 7.963907410157844e-05
432 7.819438178557903e-05
433 7.683093281229958e-05
434 7.557534263469279e-05
435 7.405767246382311e-05
436 7.286498293979093e-05
437 7.176402141340077e-05
438 7.050741260172799e-05
439 6.948476220713928e-05
440 6.842133007012308e-05
441 6.720722012687474e-05
442 6.616947939619422e-05
443 6.531378312502056e-05
444 6.443519669119269e-05
445 6.32108494755812e-05
446 6.225280958460644e-05
447 6.14011223660782e-05
448 6.03237931500189e-05
449 5.957684697932564e-05
450 5.855887502548285e-05
451 5.778543709311634e-05
452 5.69698131585028e-05
453 5.599900396191515e-05
454 5.531121860258281e-05
455 5.4530391935259104e-05
456 5.393350511440076e-05
457 5.290298940963112e-05
458 5.226896610110998e-05
459 5.1600603910628706e-05
460 5.083293217467144e-05
461 5.02437724

In [13]:
import torch
import torch.nn as nn

In [14]:
# batch_size is batch size; 
#n_in is input dimension;
# n_h is hidden dimension; 
#n_out is output dimension.
n_in, n_h, n_out, batch_size = 10, 5, 1, 10

And now, we create some dummy input data x and some dummy target data y . We use PyTorch Tensors to store this data. PyTorch Tensors can be used and manipulated just like NumPy arrays but with the added benefit that PyTorch tensors can be run on the GPUs.

In [15]:
x = torch.randn(batch_size, n_in)
y = torch.tensor([[1.0], [0.0], [0.0], [1.0], [1.0], [1.0], [0.0], [0.0], [1.0], [1.0]])

In [18]:
model = nn.Sequential(nn.Linear(n_in,n_h),nn.ReLU(),nn.Linear(n_h,n_out),nn.Sigmoid())

 input -> linear -> relu -> linear -> sigmoid. 

Mean Squared Error Loss.

In [19]:
criterion = torch.nn.MSELoss()

In [20]:
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

 We use Stochastic Gradient Descent in this one and a learning rate of 0.01. 
 model.parameters() returns an iterator over our model’s parameters (weights and biases).

we run our Gradient Descent for 50 epochs. This does the forward propagation, loss computation, backward propagation and parameter updation in that sequence.

In [22]:
for epoch in range(50):
    # Forward Propagation
    y_pred = model(x)
    # Compute and print loss
    loss = criterion(y_pred, y)
    print('epoch: ', epoch,' loss: ', loss.item())
    # Zero the gradients
    optimizer.zero_grad()
    
    # perform a backward pass (backpropagation)
    loss.backward()
    
    # Update the parameters
    optimizer.step()

epoch:  0  loss:  0.27020835876464844
epoch:  1  loss:  0.27003487944602966
epoch:  2  loss:  0.2698621451854706
epoch:  3  loss:  0.26969003677368164
epoch:  4  loss:  0.2695186138153076
epoch:  5  loss:  0.2693478763103485
epoch:  6  loss:  0.26917779445648193
epoch:  7  loss:  0.2690083980560303
epoch:  8  loss:  0.26883965730667114
epoch:  9  loss:  0.26867154240608215
epoch:  10  loss:  0.2685041129589081
epoch:  11  loss:  0.26833733916282654
epoch:  12  loss:  0.2681712210178375
epoch:  13  loss:  0.26800572872161865
epoch:  14  loss:  0.2678408920764923
epoch:  15  loss:  0.2676766812801361
epoch:  16  loss:  0.2675131559371948
epoch:  17  loss:  0.2673501968383789
epoch:  18  loss:  0.2671879231929779
epoch:  19  loss:  0.26702624559402466
epoch:  20  loss:  0.26686519384384155
epoch:  21  loss:  0.2667047679424286
epoch:  22  loss:  0.26654496788978577
epoch:  23  loss:  0.2663857638835907
epoch:  24  loss:  0.26622721552848816
epoch:  25  loss:  0.266069233417511
epoch:  26 

**y_pred** gets the predicted values from a forward pass of our model.<br/>
We pass this, along with target values y to the criterion which calculates the loss. <br/>Then, optimizer.zero_grad() zeroes out all the gradients. We need to do this so that previous gradients don’t keep on accumulating.<br/> Then, loss.backward() is the main PyTorch magic that uses PyTorch’s Autograd feature.<br/> Autograd computes all the gradients w.r.t. all the parameters automatically based on the computation graph that it creates dynamically.<br/>Basically, this does the backward pass (backpropagation)of gradient descent.<br/> Finally, we call optimizer.step() which does a single updation of all the parameters using the new gradients.

In [23]:
import torch

dtype = torch.float
device = torch.device("cpu")
# device = torch.device("cuda:0") # Uncomment this to run on GPU

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold input and outputs.
# Setting requires_grad=False indicates that we do not need to compute gradients
# with respect to these Tensors during the backward pass.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Create random Tensors for weights.
# Setting requires_grad=True indicates that we want to compute gradients with
# respect to these Tensors during the backward pass.
w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)

learning_rate = 1e-6
for t in range(500):
    # Forward pass: compute predicted y using operations on Tensors; these
    # are exactly the same operations we used to compute the forward pass using
    # Tensors, but we do not need to keep references to intermediate values since
    # we are not implementing the backward pass by hand.
    y_pred = x.mm(w1).clamp(min=0).mm(w2)

    # Compute and print loss using operations on Tensors.
    # Now loss is a Tensor of shape (1,)
    # loss.item() gets the a scalar value held in the loss.
    loss = (y_pred - y).pow(2).sum()
    print(t, loss.item())

    # Use autograd to compute the backward pass. This call will compute the
    # gradient of loss with respect to all Tensors with requires_grad=True.
    # After this call w1.grad and w2.grad will be Tensors holding the gradient
    # of the loss with respect to w1 and w2 respectively.
    loss.backward()

    # Manually update weights using gradient descent. Wrap in torch.no_grad()
    # because weights have requires_grad=True, but we don't need to track this
    # in autograd.
    # An alternative way is to operate on weight.data and weight.grad.data.
    # Recall that tensor.data gives a tensor that shares the storage with
    # tensor, but doesn't track history.
    # You can also use torch.optim.SGD to achieve this.
    with torch.no_grad():
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad

        # Manually zero the gradients after updating weights
        w1.grad.zero_()
        w2.grad.zero_()

0 31863564.0
1 28181590.0
2 28272544.0
3 27736368.0
4 24085338.0
5 17759356.0
6 11250782.0
7 6503882.5
8 3721392.25
9 2254924.5
10 1495913.875
11 1084700.0
12 842592.6875
13 685329.1875
14 573981.6875
15 489504.15625
16 422309.875
17 367263.21875
18 321261.09375
19 282359.8125
20 249157.625
21 220597.59375
22 195869.234375
23 174370.609375
24 155617.53125
25 139267.6875
26 124903.8671875
27 112245.34375
28 101066.265625
29 91158.515625
30 82360.6015625
31 74533.5625
32 67557.234375
33 61331.4375
34 55767.09375
35 50780.15234375
36 46299.97265625
37 42266.07421875
38 38628.8125
39 35343.4921875
40 32373.501953125
41 29683.095703125
42 27244.0859375
43 25029.9921875
44 23017.587890625
45 21184.857421875
46 19515.69140625
47 17992.994140625
48 16602.54296875
49 15332.1435546875
50 14170.4794921875
51 13106.2841796875
52 12131.4765625
53 11236.44921875
54 10414.419921875
55 9658.9990234375
56 8964.345703125
57 8324.6474609375
58 7734.97314453125
59 7191.33056640625
60 6689.94384765625
61 6

417 0.00031597830820828676
418 0.0003079535672441125
419 0.00029918961809016764
420 0.0002926206507254392
421 0.000284779496723786
422 0.0002773872693069279
423 0.00027053567464463413
424 0.00026436569169163704
425 0.00025839332374744117
426 0.00025157255004160106
427 0.0002453807392157614
428 0.00023928462178446352
429 0.0002339881903026253
430 0.00022902674390934408
431 0.00022356871340889484
432 0.00021826583542861044
433 0.0002129406639141962
434 0.00020743634377140552
435 0.00020344584481790662
436 0.0001992730249185115
437 0.00019530445570126176
438 0.00019068401888944209
439 0.00018704401736613363
440 0.00018226233078166842
441 0.00017888404545374215
442 0.00017515310901217163
443 0.00017094521899707615
444 0.00016757790581323206
445 0.00016397039871662855
446 0.00016044860240072012
447 0.00015707462443970144
448 0.00015379744581878185
449 0.00015085277846083045
450 0.00014791174908168614
451 0.000145281373988837
452 0.00014224309416022152
453 0.0001397120940964669
454 0.0001369

IndentationError: expected an indented block (<ipython-input-42-1ccb121e35e5>, line 13)

In [28]:
# -*- coding: utf-8 -*-
import torch


class MyReLU(torch.autograd.Function):
    """
    We can implement our own custom autograd Functions by subclassing
    torch.autograd.Function and implementing the forward and backward passes
    which operate on Tensors.
    """

    @staticmethod
    def forward(ctx, input):
        """
        In the forward pass we receive a Tensor containing the input and return
        a Tensor containing the output. ctx is a context object that can be used
        to stash information for backward computation. You can cache arbitrary
        objects for use in the backward pass using the ctx.save_for_backward method.
        """
        ctx.save_for_backward(input)
        return input.clamp(min=0)

    @staticmethod
    def backward(ctx, grad_output):
        """
        In the backward pass we receive a Tensor containing the gradient of the loss
        with respect to the output, and we need to compute the gradient of the loss
        with respect to the input.
        """
        input, = ctx.saved_tensors
        grad_input = grad_output.clone()
        grad_input[input < 0] = 0
        return grad_input


dtype = torch.float
device = torch.device("cpu")
# device = torch.device("cuda:0") # Uncomment this to run on GPU

# N is batch size; D_in is input dimension;
# H is hidden dimension; D_out is output dimension.
N, D_in, H, D_out = 64, 1000, 100, 10

# Create random Tensors to hold input and outputs.
x = torch.randn(N, D_in, device=device, dtype=dtype)
y = torch.randn(N, D_out, device=device, dtype=dtype)

# Create random Tensors for weights.
w1 = torch.randn(D_in, H, device=device, dtype=dtype, requires_grad=True)
w2 = torch.randn(H, D_out, device=device, dtype=dtype, requires_grad=True)

learning_rate = 1e-6
for t in range(500):
    # To apply our Function, we use Function.apply method. We alias this as 'relu'.
    relu = MyReLU.apply

    # Forward pass: compute predicted y using operations; we compute
    # ReLU using our custom autograd operation.
    y_pred = relu(x.mm(w1)).mm(w2)

    # Compute and print loss
    loss = (y_pred - y).pow(2).sum()
    print(t, loss.item())

    # Use autograd to compute the backward pass.
    loss.backward()

    # Update weights using gradient descent
    with torch.no_grad():
        w1 -= learning_rate * w1.grad
        w2 -= learning_rate * w2.grad

        # Manually zero the gradients after updating weights
        w1.grad.zero_()
        w2.grad.zero_()

0 36331240.0
1 38757940.0
2 44611860.0
3 44735552.0
4 34210536.0
5 18897714.0
6 8370339.0
7 3673174.5
8 1945350.875
9 1281026.0
10 971599.3125
11 788806.0
12 660836.8125
13 562513.6875
14 483572.9375
15 418931.96875
16 365166.65625
17 319889.375
18 281478.90625
19 248729.3125
20 220599.609375
21 196308.375
22 175232.875
23 156862.15625
24 140774.265625
25 126662.734375
26 114228.8359375
27 103226.3125
28 93461.4375
29 84778.2890625
30 77040.53125
31 70128.46875
32 63931.08203125
33 58373.47265625
34 53372.703125
35 48869.5234375
36 44800.2265625
37 41120.26171875
38 37788.4921875
39 34764.0
40 32015.591796875
41 29512.99609375
42 27231.46875
43 25149.50390625
44 23246.966796875
45 21506.42578125
46 19911.814453125
47 18449.09765625
48 17106.255859375
49 15874.052734375
50 14741.9931640625
51 13699.9638671875
52 12739.6484375
53 11854.1748046875
54 11036.765625
55 10281.626953125
56 9583.78125
57 8938.6328125
58 8341.23046875
59 7787.53466796875
60 7274.2353515625
61 6797.9609375
62 635

377 0.0013685583835467696
378 0.0013205413706600666
379 0.0012737854849547148
380 0.0012281436938792467
381 0.0011853264877572656
382 0.001142614521086216
383 0.0011030836030840874
384 0.0010661107953637838
385 0.0010298928245902061
386 0.0009946629870682955
387 0.0009607297251932323
388 0.0009285278501920402
389 0.000897254969459027
390 0.0008664201013743877
391 0.0008372701704502106
392 0.0008088027243502438
393 0.0007835196447558701
394 0.0007585182320326567
395 0.0007340024458244443
396 0.0007104565156623721
397 0.000686919956933707
398 0.0006665671826340258
399 0.0006455513066612184
400 0.0006254948093555868
401 0.0006058682920411229
402 0.0005881987744942307
403 0.0005696252337656915
404 0.0005519597907550633
405 0.0005350960418581963
406 0.0005198389990255237
407 0.0005039985408075154
408 0.0004888261901214719
409 0.0004739941214211285
410 0.00046160805504769087
411 0.0004466141981538385
412 0.0004337560385465622
413 0.0004218463145662099
414 0.00041027183760888875
415 0.0003998

In [43]:
import torch 

class TwoLayerModel(torch.nn.Module):
    
    def __init__(self,D_in,H,D_out):
        """
        In the constructor we instantiate two nn.Linear modules and assign them as
        member variables.
        """       
        super(TwoLayerModel,self).__init__()
        self.linear1 = torch.nn.Linear(D_in,H)
        self.linear2 = torch.nn.Linear(H,D_out)
    def forward(self,x):
        h_relu = 
        

AttributeError: module 'torch.nn' has no attribute 'module'