In [1]:
import torch
import torch.nn as nn
import torch.nn.functional as F

class Net(nn.Module):
    """Fully connected ReLU network that maps d input features to one output.

    Layout:
        input : Linear(d -> 2d+1) followed by ReLU
        hidden: K blocks of Linear(2d+1 -> 2d+1, with bias) followed by ReLU
        output: Linear(2d+1 -> 1), no activation

    Args:
        K: number of hidden Linear+ReLU blocks.
        d: number of input features.
    """

    def __init__(self, K=10, d=10):
        super().__init__()
        width = 2 * d + 1

        # Input stage: d features -> `width` units.
        self.input = nn.Sequential(nn.Linear(d, width), nn.ReLU())

        # Hidden stage: K biased width -> width layers, each followed by a ReLU.
        hidden_layers = []
        for _ in range(K):
            hidden_layers.extend((nn.Linear(width, width, bias=True), nn.ReLU()))
        self.hidden = nn.Sequential(*hidden_layers)

        # Output stage: a single linear unit producing the prediction.
        self.output = nn.Sequential(nn.Linear(width, 1))

    def forward(self, x):
        """Pass x through the input, hidden and output stages, in that order."""
        for stage in (self.input, self.hidden, self.output):
            x = stage(x)
        return x

# Build the network with the constructor defaults (K=10 hidden blocks, d=10 inputs).
# The bare `model` expression on the last line makes the notebook display the module tree.
model = Net()
model

Net(
  (input): Sequential(
    (0): Linear(in_features=10, out_features=21, bias=True)
    (1): ReLU()
  )
  (hidden): Sequential(
    (0): Linear(in_features=21, out_features=21, bias=True)
    (1): ReLU()
    (2): Linear(in_features=21, out_features=21, bias=True)
    (3): ReLU()
    (4): Linear(in_features=21, out_features=21, bias=True)
    (5): ReLU()
    (6): Linear(in_features=21, out_features=21, bias=True)
    (7): ReLU()
    (8): Linear(in_features=21, out_features=21, bias=True)
    (9): ReLU()
    (10): Linear(in_features=21, out_features=21, bias=True)
    (11): ReLU()
    (12): Linear(in_features=21, out_features=21, bias=True)
    (13): ReLU()
    (14): Linear(in_features=21, out_features=21, bias=True)
    (15): ReLU()
    (16): Linear(in_features=21, out_features=21, bias=True)
    (17): ReLU()
    (18): Linear(in_features=21, out_features=21, bias=True)
    (19): ReLU()
  )
  (output): Sequential(
    (0): Linear(in_features=21, out_features=1, bias=True)
  )
)

In [2]:
# 2. Generate the input data (x1, x2, ..., xd): n_sample rows of d features,
#    every entry drawn from a uniform random distribution on [0, 1).
n_sample = 20
d = 10
# torch.rand samples U[0, 1) directly, with no uninitialized intermediate tensor.
x = torch.rand(n_sample, d)
x

tensor([[0.2659, 0.1186, 0.0292, 0.8546, 0.1721, 0.2413, 0.5650, 0.2015, 0.5293,
         0.0882],
        [0.3239, 0.5718, 0.2037, 0.6626, 0.3914, 0.9792, 0.1123, 0.4600, 0.7393,
         0.8573],
        [0.2705, 0.9003, 0.0257, 0.3053, 0.4876, 0.3443, 0.7383, 0.6315, 0.1882,
         0.4509],
        [0.9394, 0.3523, 0.4443, 0.6567, 0.9696, 0.5028, 0.8928, 0.8165, 0.3917,
         0.3599],
        [0.9202, 0.0873, 0.9021, 0.8899, 0.3689, 0.9817, 0.9215, 0.9948, 0.0733,
         0.8841],
        [0.9664, 0.0805, 0.7113, 0.2430, 0.9132, 0.5585, 0.2394, 0.4683, 0.8358,
         0.3403],
        [0.8715, 0.4915, 0.0427, 0.9939, 0.9656, 0.1677, 0.0042, 0.5592, 0.9243,
         0.4736],
        [0.2664, 0.0666, 0.8331, 0.9408, 0.7673, 0.0162, 0.2848, 0.8458, 0.2953,
         0.9762],
        [0.6480, 0.6877, 0.7188, 0.3692, 0.6990, 0.4124, 0.5635, 0.9833, 0.3476,
         0.4854],
        [0.3594, 0.9829, 0.9270, 0.5396, 0.3106, 0.8589, 0.9491, 0.2278, 0.0850,
         0.1688],
        [0

In [3]:
# 3. Generate the labels: y = (x1^2 + x2^2 + ... + xd^2) / d,
#    i.e. square every feature, sum along each row of x, then divide by d.
y = x.pow(2).sum(dim=1).div(d)
y

tensor([0.1552, 0.3530, 0.2517, 0.4575, 0.6186, 0.3742, 0.4342, 0.4080, 0.3845,
        0.4068, 0.2887, 0.3627, 0.2776, 0.1809, 0.1951, 0.2567, 0.4530, 0.3313,
        0.3127, 0.3180])

In [4]:
# 4.Implement a loss function L = (predict-y)^2
def loss(predict, y):
    """Element-wise squared error L = (predict - y)^2.

    Args:
        predict: model output, e.g. a tensor of shape (B, 1).
        y: targets, e.g. a tensor of shape (B,).

    Returns:
        The squared difference, with the shape of `predict` whenever `predict`
        and `y` are tensors holding the same number of elements.

    When both arguments are tensors with the same number of elements but
    different shapes, `y` is reshaped to match `predict` first. Without this,
    (B, 1) - (B,) broadcasts to a (B, B) matrix for any B > 1 instead of giving
    one loss per sample. Inputs with differing element counts still broadcast
    as usual.
    """
    both_tensors = isinstance(predict, torch.Tensor) and isinstance(y, torch.Tensor)
    if both_tensors and predict.shape != y.shape and predict.numel() == y.numel():
        y = y.reshape(predict.shape)
    return (predict - y) ** 2

In [5]:
# 5. Use a batch size of 1: feed one data point into the network, run a single
#    forward propagation and compute its loss.
# Snapshot the weights by value. The tensors returned by state_dict() share
# storage with the live parameters, so keeping the returned dict itself would
# not preserve the current values if the model were modified later.
model_state_dict = {name: tensor.detach().clone() for name, tensor in model.state_dict().items()}
batch_size=1
x_batch=x[:batch_size]
y_batch=y[:batch_size]
l=loss(model(x_batch),y_batch)
print('loss:',l)

loss: tensor([[0.0783]], grad_fn=<PowBackward0>)


In [6]:
# 6. Compute the gradients using pytorch autograd:
# a. dL/dw, dL/db
# b. Print these values into a text file: torch_autograd.dat
l.backward()
# Open the file once, in write mode: appending inside the loop reopened the file
# for every parameter and duplicated the contents each time the cell was re-run.
# named_parameters() yields each parameter together with its own name, so names
# and gradients cannot get misaligned.
with open('torch_autograd.dat','w') as f:
    for layer,param in model.named_parameters():
        print(layer,param.grad)
        f.write(layer+':\n')
        f.write(str(param.grad)+'\n\n')

input.0.weight tensor([[ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00],
        [-4.8064e-07, -2.1448e-07, -5.2697e-08, -1.5449e-06, -3.1113e-07,
         -4.3615e-07, -1.0215e-06, -3.6424e-07, -9.5678e-07, -1.5952e-07],
        [ 1.6173e-06,  7.2172e-07,  1.7732e-07,  5.1984e-06,  1.0469e-06,
          1.4676e-06,  3.4372e-06,  1.2256e-06,  3.2195e-06,  5.3676e-07],
        [ 1.3310e-07,  5.9395e-08,  1.4593e-08,  4.2781e-07,  8.6159e-08,
          1.2078e-07,  2.8287e-07,  1.0087e-07,  2.6495e-07,  4.4174e-08],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00],
        [ 5.9909e-07,  2.6734e-07,  6.5684e-08,  1.9256e-06,  3.8780e-07,
          5.4363e-07,  1.2732e-06,  4.5400e-07,  1.1926e-06,  1.9883e-07],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e

hidden.18.bias tensor([-0.0370,  0.0000, -0.0504,  0.0000, -0.0400, -0.0600,  0.0693, -0.0568,
         0.0000,  0.0000,  0.0000, -0.0135,  0.0000,  0.0000,  0.0000,  0.0000,
        -0.0388,  0.0857,  0.0197,  0.0000,  0.1026])
output.0.weight tensor([[-0.0379,  0.0000, -0.0436,  0.0000, -0.0930, -0.1269, -0.0619, -0.0558,
          0.0000,  0.0000,  0.0000, -0.0909,  0.0000,  0.0000,  0.0000,  0.0000,
         -0.0464, -0.1135, -0.0264,  0.0000, -0.0688]])
output.0.bias tensor([-0.5596])


In [7]:
# 7.Implement the forward propagation and backpropagation algorithm from scratch, 
# without using pytorch autograd, compute the gradients using your implementation
# a. dL/dw, dL/db
# b. Print these values into a text file: my_autograd.dat
from torch.autograd.variable import Variable
from torch.autograd.function import Function, NestedIOFunction  
from torch.autograd.gradcheck import gradcheck, gradgradcheck  
from torch.autograd.grad_mode import no_grad, enable_grad, set_grad_enabled 
from torch.autograd.anomaly_mode import detect_anomaly, set_detect_anomaly  
from torch.autograd import profiler  

def _make_grads(outputs, grads):
    new_grads = []
    for out, grad in zip(outputs, grads):
        if isinstance(grad, torch.Tensor):
            new_grads.append(grad)
        elif grad is None:
            if out.requires_grad:
                if out.numel() != 1:
                    raise RuntimeError("grad can be implicitly created only for scalar outputs")
                new_grads.append(torch.ones_like(out))
            else:
                new_grads.append(None)
        else:
            raise TypeError("gradients can be either Tensors or None, but got " +
                            type(grad).__name__)
    return tuple(new_grads)


def my_backward(tensors, grad_tensors=None, retain_graph=None, create_graph=False, grad_variables=None):
    """Run a backward pass from `tensors` through the autograd execution engine.

    Args:
        tensors: a Tensor or an iterable of Tensors to differentiate.
        grad_tensors: a Tensor, an iterable of Tensor/None, or None. Missing
            entries are filled in by `_make_grads`.
        retain_graph: forwarded to the engine; defaults to `create_graph`
            when None.
        create_graph: forwarded to the engine.
        grad_variables: deprecated alias of `grad_tensors`.

    Raises:
        RuntimeError: both `grad_tensors` and `grad_variables` are given, or
            `_make_grads` cannot build an implicit gradient.
        TypeError: raised by `_make_grads` for a gradient that is neither a
            Tensor nor None.

    NOTE(review): the gradient computation itself is delegated to
    `Variable._execution_engine.run_backward`, so this is not an independent
    from-scratch backpropagation. That is a private PyTorch entry point whose
    signature may differ between releases; confirm against the installed version.
    """
    if grad_variables is not None:
        # Imported locally because `warnings` is not imported anywhere else in
        # the notebook; without it this branch raised NameError.
        import warnings
        warnings.warn("'grad_variables' is deprecated. Use 'grad_tensors' instead.")
        if grad_tensors is None:
            grad_tensors = grad_variables
        else:
            raise RuntimeError("'grad_tensors' and 'grad_variables' (deprecated) "
                               "arguments both passed to backward(). Please only "
                               "use 'grad_tensors'.")

    # Normalize a single Tensor to a 1-tuple so the rest handles sequences only.
    tensors = (tensors,) if isinstance(tensors, torch.Tensor) else tuple(tensors)

    # Normalize grad_tensors to a list holding one entry per output tensor.
    if grad_tensors is None:
        grad_tensors = [None] * len(tensors)
    elif isinstance(grad_tensors, torch.Tensor):
        grad_tensors = [grad_tensors]
    else:
        grad_tensors = list(grad_tensors)

    grad_tensors = _make_grads(tensors, grad_tensors)
    if retain_graph is None:
        retain_graph = create_graph

    Variable._execution_engine.run_backward(
        tensors, grad_tensors, retain_graph, create_graph,
        allow_unreachable=True)


In [8]:
# Clear the gradients left by the previous backward pass, reload the saved
# weights, and recompute the loss on the same single-sample batch.
model.zero_grad()
model.load_state_dict(model_state_dict)
my_l=loss(model(x_batch),y_batch)
my_backward(my_l)
# Open the file once, in write mode: appending inside the loop reopened the file
# for every parameter and duplicated the contents each time the cell was re-run.
# named_parameters() yields each parameter together with its own name, so names
# and gradients cannot get misaligned.
with open('my_autograd.dat','w') as f:
    for layer,param in model.named_parameters():
        print(layer,param.grad)
        f.write(layer+':\n')
        f.write(str(param.grad)+'\n\n')

input.0.weight tensor([[ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00],
        [-4.8064e-07, -2.1448e-07, -5.2697e-08, -1.5449e-06, -3.1113e-07,
         -4.3615e-07, -1.0215e-06, -3.6424e-07, -9.5678e-07, -1.5952e-07],
        [ 1.6173e-06,  7.2172e-07,  1.7732e-07,  5.1984e-06,  1.0469e-06,
          1.4676e-06,  3.4372e-06,  1.2256e-06,  3.2195e-06,  5.3676e-07],
        [ 1.3310e-07,  5.9395e-08,  1.4593e-08,  4.2781e-07,  8.6159e-08,
          1.2078e-07,  2.8287e-07,  1.0087e-07,  2.6495e-07,  4.4174e-08],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00],
        [ 5.9909e-07,  2.6734e-07,  6.5684e-08,  1.9256e-06,  3.8780e-07,
          5.4363e-07,  1.2732e-06,  4.5400e-07,  1.1926e-06,  1.9883e-07],
        [ 0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,  0.0000e+00,
          0.0000e

hidden.14.bias tensor([ 0.0000,  0.0000,  0.0000,  0.0032,  0.0000, -0.0046,  0.0000,  0.0000,
         0.0000, -0.0015,  0.0000,  0.0092, -0.0033,  0.0000, -0.0086,  0.0101,
         0.0000,  0.0066,  0.0000,  0.0000,  0.0024])
hidden.16.weight tensor([[ 0.0000,  0.0000,  0.0000, -0.0009,  0.0000, -0.0008,  0.0000,  0.0000,
          0.0000, -0.0005,  0.0000, -0.0003, -0.0013,  0.0000, -0.0002, -0.0025,
          0.0000, -0.0013,  0.0000,  0.0000, -0.0013],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
          0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
          0.0000,  0.0000,  0.0000,  0.0000,  0.0000],
        [ 0.0000,  0.0000,  0.0000, -0.0029,  0.0000, -0.0026,  0.0000,  0.0000,
          0.0000, -0.0017,  0.0000, -0.0009, -0.0043,  0.0000, -0.0007, -0.0082,
          0.0000, -0.0044,  0.0000,  0.0000, -0.0044],
        [ 0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,  0.0000,
          0.0000,  0.0