### recall from numpy
> NumPy is a general-purpose math framework, so I first use it to optimize a randomly initialized network.

$$Y=X W$$

$$\frac{\partial Loss}{\partial X}=\frac{\partial Loss}{\partial Y} W^T$$

$$\frac{\partial Loss}{\partial W}=X^T \frac{\partial Loss}{\partial Y}$$

$$Loss=\frac{1}{2}\|Y-\hat{Y}\|^2$$
$$\frac{\partial Loss}{\partial Y}=Y-\hat{Y}$$

In [1]:
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from IPython.display import clear_output
from time import time

In [35]:
# Two-layer ReLU network (X -> W1 -> ReLU -> W2) trained by plain gradient
# descent with hand-derived gradients in NumPy.
# Experiment idea: try replacing randn with rand.
N, D_in, H, D_out = 64, 1000, 100, 10
np.random.seed(0)  # fixed seed so the data and initial weights are reproducible
X=np.random.randn(N,D_in)
W1=np.random.randn(D_in,H)
W2=np.random.randn(H,D_out)
Y=np.random.randn(N,D_out)
lr=1e-6
start_time=time()
for i in range(500):
    # forward
    Z1=X.dot(W1)
    H1=np.maximum(Z1,0)
    Z2=H1.dot(W2)
    # The printed loss is averaged over the N samples, while dZ2 below is the
    # gradient of the un-averaged sum, so each step is N times larger than lr
    # alone would suggest for the printed loss.
    Loss=0.5*np.sum((Y-Z2)**2)/N
    print(Loss)
    # backward
    dZ2=Z2-Y
    dW2=H1.T.dot(dZ2)
    dH1=dZ2.dot(W2.T)
    # Copy before masking so dH1 itself is not modified through an alias.
    dZ1=dH1.copy()
    dZ1[Z1<0]=0
    dW1=X.T.dot(dZ1)

    W1=W1-lr*dW1
    W2=W2-lr*dW2
# Drop the per-iteration log and keep only the final loss and elapsed time.
clear_output()
print(Loss)
print(time()-start_time)

0.00028473321216456864
2.387462615966797


### Using torch to compute graph

In [34]:
# Same two-layer ReLU network as the NumPy version, with torch tensors and
# hand-derived gradients (autograd is not used here).
# Experiment idea: try replacing randn with rand.
N, D_in, H, D_out = 64, 1000, 100, 10
torch.manual_seed(0)  # fixed seed so the data and initial weights are reproducible
X=torch.randn(N,D_in)
W1=torch.randn(D_in,H)
W2=torch.randn(H,D_out)
Y=torch.randn(N,D_out)
lr=1e-6
start_time=time()
for i in range(500):
    # forward
    Z1=X.mm(W1)
    H1=torch.clamp(Z1,min=0)
    Z2=H1.mm(W2)
    # Printed loss is averaged over N; the gradient below is of the
    # un-averaged sum.
    Loss=0.5*torch.sum((Y-Z2)**2)/N
    print(Loss.item())
    # backward
    dZ2=Z2-Y
    dW2=H1.t().mm(dZ2)
    dH1=dZ2.mm(W2.t())
    # Clone before masking so dH1 is left untouched.
    dZ1=torch.clone(dH1)
    dZ1[Z1<0]=0
    dW1=X.t().mm(dZ1)

    W1=W1-lr*dW1
    W2=W2-lr*dW2
# Drop the per-iteration log and keep only the final loss and elapsed time.
clear_output()
print(Loss)
print(time()-start_time)

tensor(0.0016)
0.9933171272277832


### AutoGradient 

In [86]:
# Two-layer ReLU network where the gradients come from autograd instead of
# being derived by hand.
# Experiment idea: try replacing randn with rand.
device = torch.device("cpu")
N, D_in, H, D_out = 64, 1000, 100, 10
torch.manual_seed(0)  # fixed seed so the data and initial weights are reproducible
X=torch.randn(N,D_in,device=device)
W1=torch.randn(D_in,H,requires_grad=True,device=device)
W2=torch.randn(H,D_out,requires_grad=True,device=device)
Y=torch.randn(N,D_out,device=device)
lr=1e-6
start_time=time()
for i in range(500):
    # forward
    Z=X.mm(W1).clamp(min=0).mm(W2)
    # Un-normalised sum of squared errors (not divided by N).
    Loss=0.5*torch.sum((Y-Z)**2)
    print(Loss.item())
    # backward: fills W1.grad and W2.grad
    Loss.backward()
    with torch.no_grad():
        # The weight update itself must not be recorded in the graph.
        W1.sub_(lr*W1.grad)
        W2.sub_(lr*W2.grad)
        # Gradients accumulate across backward() calls, so reset them.
        W1.grad.zero_()
        W2.grad.zero_()
# Drop the per-iteration log and keep only the final loss and elapsed time.
clear_output()
print(Loss)
print(time()-start_time)

tensor(0.0472, grad_fn=<MulBackward0>)
1.0727624893188477


#### Some bugs I made

In [81]:
# Case 1 - in-place update: W and W1 are two names for one tensor, so the
# change made through W is visible through W1 as well.
W1=W=torch.Tensor([10])
W-=1 #same as W.sub_(1)
print(W,W1)

# Case 2 - out-of-place update: `W-1` builds a new tensor and W is rebound to
# it, while W1 still refers to the original value.
W1=W=torch.Tensor([10])
W=W-1
print(W,W1)

tensor([9.]) tensor([9.])
tensor([9.]) tensor([10.])


In [112]:
# Calling backward() directly on a scalar leaf: the printed gradient is 1 on
# every pass, because it is reset at the end of each iteration.
W=torch.tensor(10.0,requires_grad=True)

for i in range(3):
    W.backward()
    print(W.grad)
    with torch.no_grad():
        # Manual update step, excluded from autograd tracking.
        W.sub_(W.grad)
        # Gradients accumulate across backward() calls, so clear them here.
        W.grad.zero_()

tensor(1.)
tensor(1.)
tensor(1.)


### Now define a custom Function: it works like a graph operator in TensorFlow

In [137]:
class MyReLU(torch.autograd.Function):
    """
    ReLU written as a custom autograd Function.

    Subclassing torch.autograd.Function and providing static ``forward`` and
    ``backward`` methods lets autograd use this hand-written operator. Call it
    through ``MyReLU.apply``.
    """

    @staticmethod
    def forward(ctx, input):
        """
        Return ``input`` with every negative entry replaced by zero.

        ``ctx`` is the context object shared with ``backward``; the input is
        stashed on it with ``save_for_backward`` because the backward pass
        needs to know which entries were negative.
        """
        ctx.save_for_backward(input)
        return torch.clamp(input, min=0)

    @staticmethod
    def backward(ctx, grad_output):
        """
        Map the gradient of the loss w.r.t. the output to the gradient w.r.t.
        the input: it passes through unchanged where the saved input was not
        negative and is zero where the saved input was negative.
        """
        (saved_input,) = ctx.saved_tensors
        # masked_fill returns a new tensor, so grad_output is left untouched.
        return grad_output.masked_fill(saved_input < 0, 0)


In [138]:
# Push a small vector with one negative entry through the custom op.
W=torch.tensor([1.0,-1.0,2.0],dtype=torch.float32,requires_grad=True)
relu=MyReLU.apply
Y=relu(W)
print(Y)
# Z = sum(Y^2), so dZ/dW is 2*Y, with the gradient zeroed where W < 0.
Z=Y.pow(2).sum()
Z.backward()
print(W.grad)

tensor([1., 0., 2.], grad_fn=<MyReLUBackward>)
tensor([2., 0., 4.])


### The Sequential module can help me manage all learnable parameters

In [157]:
# Same regression task, but nn.Sequential owns the parameters and the update
# is a hand-written gradient-descent step over model.parameters().
device = torch.device("cpu")
N, D_in, H, D_out = 64, 1000, 100, 10
torch.manual_seed(0)  # fixed seed so data and initial weights are reproducible
X=torch.randn(N,D_in,device=device)
Y=torch.randn(N,D_out,device=device)
lr=1e-3

# Keep the model on the same device as the data.
model=nn.Sequential(nn.Linear(D_in,H),nn.ReLU(),nn.Linear(H,D_out)).to(device)
criterion=nn.MSELoss(reduction='sum')
for i in range(300):
    Yhat=model(X)
    loss=criterion(Yhat,Y)
    print(loss.item())
    model.zero_grad()
    loss.backward()
    # Update the parameters without recording the step in the autograd graph
    # (replaces direct manipulation of .data).
    with torch.no_grad():
        for v in model.parameters():
            v-=lr*v.grad
# Drop the per-iteration log and keep only the final loss.
clear_output()
print(loss.item())

2.0949088817834305e-12


### Why not use a predefined optimizer?

In [165]:
# Same model and loss as the manual-update version, with the parameter update
# delegated to optim.Adam.
device = torch.device("cpu")
N, D_in, H, D_out = 64, 1000, 100, 10
torch.manual_seed(0)  # fixed seed so data and initial weights are reproducible
X=torch.randn(N,D_in,device=device)
Y=torch.randn(N,D_out,device=device)
lr=1e-3

# Keep the model on the same device as the data.
model=nn.Sequential(nn.Linear(D_in,H),nn.ReLU(),nn.Linear(H,D_out)).to(device)
criterion=nn.MSELoss(reduction='sum')
solver=optim.Adam(model.parameters(),lr=lr)
for i in range(300):
    Yhat=model(X)
    loss=criterion(Yhat,Y)
    print(loss.item())
    # Clear old gradients, backpropagate, then let the optimizer step.
    solver.zero_grad()
    loss.backward()
    solver.step()
# Drop the per-iteration log and keep only the final loss.
clear_output()
print(loss.item())

1.504963167975326e-11


In [35]:
#a max pool
def maxPoolForward(X, s=3, return_indices=False):
    """
    Non-overlapping max pooling over the last two dimensions of a 4-D tensor.

    The pooling window is ``s`` x ``s`` and the stride is also ``s``. Trailing
    rows/columns that do not fill a complete window are ignored.

    Args:
        X: tensor of size (N, C, H, W).
        s: side length of the square window (and the stride). Defaults to 3.
        return_indices: when True, also return the position of each maximum
            inside its window, flattened row-major (values in 0 .. s*s-1).

    Returns:
        Y of size (N, C, H // s, W // s); or the tuple ``(Y, Yidx)`` when
        ``return_indices`` is True, with ``Yidx`` of the same size as ``Y``.

    Raises:
        ValueError: if ``s`` is not positive.
    """
    if s <= 0:
        raise ValueError("s must be positive")
    N,C,H,W=X.size()
    Hnew,Wnew=H//s,W//s

    # Crop to a whole number of windows, then expose every s x s window as the
    # last axis: (N*C, Hnew, s, Wnew, s) -> (N*C, Hnew, Wnew, s*s).
    # Covering exactly Hnew x Wnew windows also handles H or W being an exact
    # multiple of s, where a `range(0, H - s, s)` loop would skip the last one.
    windows=(
        X[:,:,:Hnew*s,:Wnew*s]
        .reshape(N*C,Hnew,s,Wnew,s)
        .permute(0,1,3,2,4)
        .reshape(N*C,Hnew,Wnew,s*s)
    )
    Y,Yidx=windows.max(dim=3)
    Y=Y.reshape(N,C,Hnew,Wnew)
    if return_indices:
        return Y,Yidx.reshape(N,C,Hnew,Wnew)
    return Y
# Smoke-test the pooling on a random batch.
X=torch.rand((32,32,64,64))
Y=maxPoolForward(X)
# 64 // 3 == 21 complete windows along each spatial dimension.
assert Y.size()==(32,32,21,21)

tensor([[[[3, 8, 8,  ..., 1, 2, 8],
          [0, 0, 6,  ..., 5, 0, 2],
          [1, 7, 2,  ..., 2, 4, 2],
          ...,
          [3, 7, 8,  ..., 4, 3, 5],
          [6, 0, 7,  ..., 2, 4, 4],
          [5, 1, 6,  ..., 4, 8, 6]],

         [[0, 1, 6,  ..., 2, 4, 1],
          [3, 7, 7,  ..., 5, 8, 7],
          [8, 8, 0,  ..., 3, 8, 1],
          ...,
          [4, 5, 1,  ..., 5, 0, 0],
          [0, 3, 0,  ..., 6, 4, 6],
          [0, 7, 1,  ..., 1, 1, 1]],

         [[5, 6, 5,  ..., 2, 3, 4],
          [7, 6, 3,  ..., 3, 4, 4],
          [7, 4, 4,  ..., 6, 6, 5],
          ...,
          [7, 3, 5,  ..., 2, 5, 6],
          [5, 4, 2,  ..., 7, 5, 3],
          [4, 8, 2,  ..., 8, 6, 2]],

         ...,

         [[4, 4, 2,  ..., 0, 7, 0],
          [0, 8, 2,  ..., 0, 2, 2],
          [3, 2, 8,  ..., 2, 7, 3],
          ...,
          [2, 5, 1,  ..., 2, 5, 8],
          [4, 2, 8,  ..., 8, 7, 5],
          [1, 5, 7,  ..., 4, 6, 0]],

         [[3, 5, 0,  ..., 6, 8, 0],
          [0, 3, 

In [27]:
X=torch.rand(2,1)
# NOTE(review): the right-hand side of this assignment was missing in the
# saved notebook, which made the cell a syntax error. clone() is assumed
# because the recorded output shows X unchanged after Y is modified --
# confirm against the original intent.
Y=X.clone()
Y[0]=11
print(X)

tensor([[0.9473],
        [0.9870]])


In [23]:
A=

tensor([[0.0085, 0.1422, 0.0841, 0.0159],
        [0.6125, 0.9052, 0.3643, 0.8606],
        [0.8837, 0.8688, 0.7011, 0.9851],
        [0.6369, 0.2326, 0.5140, 0.4784],
        [0.1734, 0.1631, 0.9880, 0.9613],
        [0.3081, 0.9198, 0.8727, 0.3004],
        [0.3395, 0.7368, 0.8652, 0.4882],
        [0.3953, 0.4048, 0.2693, 0.6423],
        [0.2239, 0.1508, 0.3040, 0.1762],
        [0.1835, 0.6734, 0.5466, 0.7143]])