In [1]:
import numpy as np
import torch 

### Pytorch gradient calculation 

* If we deine a varibale with **requires_grad=True**, then we can find a gradient of that function with respect to that variable

* If the final function is a scaler then, the gradient implimentation w.r.t. that variable is simple
    - for ex: f = average(), then f.balkward() and x.grad is enough 

* But if the final function is a vector valued function, then when we call backward function we need to insert another array to get the correct gradient out
    - ex. f = torch.sin(x), then f.backward(torch.ones_like(f)), AND  x.grad is needed 

### Example 01: Gradient of vector function 

In [2]:
x = torch.rand(3,requires_grad=True)
print(x)

tensor([0.8099, 0.9596, 0.7738], requires_grad=True)


In [3]:
y = x + 2 
print(y)

tensor([2.8099, 2.9596, 2.7738], grad_fn=<AddBackward0>)


In [4]:
y.backward(torch.ones_like(y))
x.grad

tensor([1., 1., 1.])

In [5]:
z = y*y*2
print(z)

tensor([15.7907, 17.5183, 15.3877], grad_fn=<MulBackward0>)


### Example 02: Gradient of scaler valued fucntion 

In [6]:
z = z.mean()
print(z)

tensor(16.2323, grad_fn=<MeanBackward0>)


In [7]:
z.backward()
x.grad

tensor([4.7465, 4.9461, 4.6984])

### Example 03: How to get the jacobian of a matrix? 

1. using the torch inbuilt jacobian function (Note:this is very slow in computing)

   https://stackoverflow.com/questions/54754153/autograd-grad-for-tensor-in-pytorch

2. utilize torch.autograd.grad() to get jacobian matrix elements along the batch dimension 

Toy problem to get jacobian: 

* $B_x = x^3 -y^2, B_y = y^2 + z^2, B_z = z^3 - x^2$

In [8]:
import torch 

x = torch.arange(0,10,dtype=torch.float32,requires_grad=True)
y = torch.arange(0,10,dtype=torch.float32,requires_grad=True)
z = torch.arange(0,10,dtype=torch.float32,requires_grad=True)

Bx = x**3 - y**2
By = y**2 + z**2 
Bz = z**3 - x**2

In [9]:
B = torch.stack([Bx,By,Bz],dim=1)
B

tensor([[  0.,   0.,   0.],
        [  0.,   2.,   0.],
        [  4.,   8.,   4.],
        [ 18.,  18.,  18.],
        [ 48.,  32.,  48.],
        [100.,  50., 100.],
        [180.,  72., 180.],
        [294.,  98., 294.],
        [448., 128., 448.],
        [648., 162., 648.]], grad_fn=<StackBackward0>)

In [10]:
# get the gradinet of y w.r.t. x (in our case x is along the batch-dim)
def get_grad(y,x):
    return torch.autograd.grad(y,[x],torch.ones_like(y),create_graph=True,allow_unused=True)[0]

# this find the columns of jacobian matrix 
def get_jacob_col(Y,x):
    jacob_col = []
    for ii in range(Y.shape[-1]):
        jacob_col.append(get_grad(Y[:,ii],x).unsqueeze(dim=1)) 
    return torch.cat(jacob_col,dim=1)

# note shape of Y (batch_dim,output_dim), x(batch_dim) -> returns (batch_dim,output_dim,input_dim)
# here input dim is 3 (x1,x2,x3)
def get_jacob(Y,x1,x2,x3):
    Jx = get_jacob_col(Y,x1)
    Jy = get_jacob_col(Y,x2)
    Jz = get_jacob_col(Y,x3)
    return torch.stack((Jx,Jy,Jz),dim=1).permute(0,2,1)
    

In [11]:
get_jacob(B,x,y,z)

tensor([[[  0.,   0.,   0.],
         [  0.,   0.,   0.],
         [  0.,   0.,   0.]],

        [[  3.,  -2.,   0.],
         [  0.,   2.,   2.],
         [ -2.,   0.,   3.]],

        [[ 12.,  -4.,   0.],
         [  0.,   4.,   4.],
         [ -4.,   0.,  12.]],

        [[ 27.,  -6.,   0.],
         [  0.,   6.,   6.],
         [ -6.,   0.,  27.]],

        [[ 48.,  -8.,   0.],
         [  0.,   8.,   8.],
         [ -8.,   0.,  48.]],

        [[ 75., -10.,   0.],
         [  0.,  10.,  10.],
         [-10.,   0.,  75.]],

        [[108., -12.,   0.],
         [  0.,  12.,  12.],
         [-12.,   0., 108.]],

        [[147., -14.,   0.],
         [  0.,  14.,  14.],
         [-14.,   0., 147.]],

        [[192., -16.,   0.],
         [  0.,  16.,  16.],
         [-16.,   0., 192.]],

        [[243., -18.,   0.],
         [  0.,  18.,  18.],
         [-18.,   0., 243.]]], grad_fn=<PermuteBackward0>)

* (b) Let's use pytorch inbuilt  **jacobian** function to get the jacobin for our toy problem 

In [12]:
import torch 

x = torch.arange(0,10,dtype=torch.float32,requires_grad=True)
y = torch.arange(0,10,dtype=torch.float32,requires_grad=True)
z = torch.arange(0,10,dtype=torch.float32,requires_grad=True)

def get_B(X):
    Bx = X[:,0]**3 - X[:,1]**2
    By = X[:,1]**2 + X[:,2]**2 
    Bz = X[:,2]**3 - X[:,0]**2
    B = torch.stack([Bx,By,Bz],dim=1)
    return B

X = torch.stack([x,y,z],dim=1)


In [13]:
#using torch.autograd.functional.jacobian
B_jacob = torch.autograd.functional.jacobian(get_B,X) 
#the off-diagonal elements of 0th and 2nd dimension are all zero. So we remove them
B_grad  = torch.diagonal(B_jacob,offset=0,dim1=0,dim2=2)                   # (output_dim, input_dim, batch_size)
B_grad  = B_grad.permute(2,0,1)       

In [14]:
B_grad

tensor([[[  0.,   0.,   0.],
         [  0.,   0.,   0.],
         [  0.,   0.,   0.]],

        [[  3.,  -2.,   0.],
         [  0.,   2.,   2.],
         [ -2.,   0.,   3.]],

        [[ 12.,  -4.,   0.],
         [  0.,   4.,   4.],
         [ -4.,   0.,  12.]],

        [[ 27.,  -6.,   0.],
         [  0.,   6.,   6.],
         [ -6.,   0.,  27.]],

        [[ 48.,  -8.,   0.],
         [  0.,   8.,   8.],
         [ -8.,   0.,  48.]],

        [[ 75., -10.,   0.],
         [  0.,  10.,  10.],
         [-10.,   0.,  75.]],

        [[108., -12.,   0.],
         [  0.,  12.,  12.],
         [-12.,   0., 108.]],

        [[147., -14.,   0.],
         [  0.,  14.,  14.],
         [-14.,   0., 147.]],

        [[192., -16.,   0.],
         [  0.,  16.,  16.],
         [-16.,   0., 192.]],

        [[243., -18.,   0.],
         [  0.,  18.,  18.],
         [-18.,   0., 243.]]])

### How to prevent gradient tracking of a varibale  

- x.requires_grad_(False)
- x.detach()
- or using torch.no_grad() context manager 

In [15]:
x = torch.rand(3,requires_grad=True)
print(x)

# if we want to prevent gradient tracking of varible x 
x.requires_grad_(False)
print(x)

tensor([0.7201, 0.3558, 0.4785], requires_grad=True)
tensor([0.7201, 0.3558, 0.4785])


In [16]:
x = torch.rand(3,requires_grad=True)
print(x)

# remove gradient tracking of variable x 
y = x.detach()
print(y)

tensor([0.7154, 0.2369, 0.9745], requires_grad=True)
tensor([0.7154, 0.2369, 0.9745])


In [17]:
x = torch.randn(3,requires_grad=True)

with torch.no_grad(): # don't track gradient here 
    y = x+2 
    print(y)

tensor([1.8116, 3.0144, 1.4517])


### How to stop accumulation of gradients (zero-outing gradients)

In [18]:
weights = torch.ones(4,requires_grad=True)

for eopoch in range(3):
    model_output = (weights*3).sum()
    print(model_output)
    model_output.backward()
    print(weights.grad)

tensor(12., grad_fn=<SumBackward0>)
tensor([3., 3., 3., 3.])
tensor(12., grad_fn=<SumBackward0>)
tensor([6., 6., 6., 6.])
tensor(12., grad_fn=<SumBackward0>)
tensor([9., 9., 9., 9.])


In [19]:
weights = torch.tensor([1.,2.,3.,4.],dtype=torch.float32,requires_grad=True)

for eopoch in range(3):
    model_output = (weights*3).sum()
    model_output.backward()
    print(weights.grad)
    weights.grad.zero_()

tensor([3., 3., 3., 3.])
tensor([3., 3., 3., 3.])
tensor([3., 3., 3., 3.])


### Backpropergation: chain rule to get the derivtive of function (final) w.r.t wieght parameter

* y = a(x)
* z = b(y)

* chain rule : $\frac{\partial z}{\partial x} = \frac{\partial z}{\partial y} \cdot \frac{\partial y}{\partial x}$

* at each node (**one type of computation is done here**) of the computational graph it finds the local gradient 

In [20]:
x = torch.tensor(1.0)
y = torch.tensor(2.0)

w = torch.tensor(1.5,requires_grad=True)

# forward pass
y_hat = w*x
loss = (y_hat - y)**2 
print(f'loss value = {loss}')

# backward pass 
loss.backward()
print(f'gradient of loss w.r.t. w = {w.grad}')


loss value = 0.25
gradient of loss w.r.t. w = -1.0


### Gradient descent: Way to optimize the weight to reduce the error function 

1. First we are going to update weights using **numpy** , note that we need to know the derivatives of the error function w.r.t. weights  

2. Advantage of using **Pytorch** is it find the gradient for us, we don't need to manually implement the gradient computation w.r.t. weights 

In [21]:
# f = w*x with w=2 
X = np.array([1,2,3,4],dtype=np.float32)
Y = np.array([2,4,6,8],dtype=np.float32)

# weight paramter (we want to update this)
W = 0.0 

# model prediction 
def forward(x):
    return W*x

# loss 
def loss(y,y_pred):
    return ((y-y_pred)**2).mean()

# gradient 
# MSE = 1/N * (w*x-y)**2 
# dJ/dw = 1/N* 2x * (w*x -y)

# gradinet of loss w.r.t. weight 
def gradient(x,y,y_pred):
    return np.dot(2*x, y_pred-y).mean()

print(f'Prediction before training:f(5)={forward(5):.3f}')

# training 
lr = 0.01
n_iters = 10 

for epoch in range(n_iters):
    y_pred = forward(X)             # y_pred 
    l = loss(Y,y_pred)              # loss   
    dw = gradient(X,Y,y_pred)       # dL/dw
    W-=lr*dw                        # update w 
    
    if epoch % 1 ==0:
        print(f'epoch {epoch +1}: w = {w:.3f},loss={l:.8f}')
    
    
print(f'Prediction after training:f(5)={forward(5):.3f}')

Prediction before training:f(5)=0.000
epoch 1: w = 1.500,loss=30.00000000
epoch 2: w = 1.500,loss=4.79999924
epoch 3: w = 1.500,loss=0.76800019
epoch 4: w = 1.500,loss=0.12288000
epoch 5: w = 1.500,loss=0.01966083
epoch 6: w = 1.500,loss=0.00314570
epoch 7: w = 1.500,loss=0.00050332
epoch 8: w = 1.500,loss=0.00008053
epoch 9: w = 1.500,loss=0.00001288
epoch 10: w = 1.500,loss=0.00000206
Prediction after training:f(5)=9.999


In [22]:
# f = w*x with w=2 
X = torch.tensor([1,2,3,4],dtype=torch.float32) 
Y = torch.tensor([2,4,6,8],dtype=torch.float32)
W = torch.tensor(0.0,dtype=torch.float32,requires_grad=True)  

# model prediction 
def forward(x):
    return W*x

# loss 
def loss(y,y_pred):
    return ((y-y_pred)**2).mean()

# training 
lr = 0.01
n_iters = 10 

for epoch in range(n_iters):
    y_pred = forward(X)      # y_pred 
    l = loss(Y,y_pred)       # loss 
    l.backward()             # dL/dw 
    with torch.no_grad():
        W-=lr*W.grad         # update weight 
    W.grad.zero_()           # zero-out gradient 
        
    if epoch % 2 ==0:
        print(f'epoch {epoch +1}: w = {W:.3f},loss={l:.8f}')

epoch 1: w = 0.300,loss=30.00000000
epoch 3: w = 0.772,loss=15.66018772
epoch 5: w = 1.113,loss=8.17471695
epoch 7: w = 1.359,loss=4.26725292
epoch 9: w = 1.537,loss=2.22753215


#### This example shows the usage of **pytorch** in-buit functions to make the gradient descent easy
- nn.MSELoss() to get the error 
- torch.optim.SGD() to manully update the weights s.t. minimize the error(or to do gradient decent)

In [23]:
import torch.nn as nn
import torch

X = torch.tensor([1,2,3,4],dtype=torch.float32)
Y = torch.tensor([2,4,6,8],dtype=torch.float32)

W = torch.tensor(0.0,dtype=torch.float32,requires_grad=True) 

# model prediction 
def forward(x):
    return W*x

# loss 
loss = nn.MSELoss()
optimizer = torch.optim.SGD([W],lr=0.01)

# training 
n_iters = 10 

for epoch in range(n_iters):
    y_pred = forward(X)         # y_pred 
    l = loss(Y,y_pred)          # loss 
    l.backward()                # dl/dw
    optimizer.step()            # update w 
    optimizer.zero_grad()       # zero-out gradient 
    
    if epoch % 1 ==0:
        print(f'epoch {epoch +1}: w = {W:.3f},loss={l:.8f}')
    


epoch 1: w = 0.300,loss=30.00000000
epoch 2: w = 0.555,loss=21.67499924
epoch 3: w = 0.772,loss=15.66018772
epoch 4: w = 0.956,loss=11.31448650
epoch 5: w = 1.113,loss=8.17471695
epoch 6: w = 1.246,loss=5.90623236
epoch 7: w = 1.359,loss=4.26725292
epoch 8: w = 1.455,loss=3.08308983
epoch 9: w = 1.537,loss=2.22753215
epoch 10: w = 1.606,loss=1.60939169


### Using all **Pytorch** functions 

1. Design the model (input size, output size, forward pass)

2. Construct loss and optimizer 

3. Training loop:

    - forward pass: compute prediction 
    - backward pass: gradients 
    - update the weights 

In [24]:
import torch.nn as nn
import torch

X = torch.tensor([[1],[2],[3],[4]],dtype=torch.float32)
Y = torch.tensor([[2],[4],[6],[8]],dtype=torch.float32)
X_test = torch.tensor([5],dtype=torch.float32)
num_smaples,num_features = X.shape

# model prediction 
model = nn.Linear(num_features,num_features)           # this gves y_pred 

# loss 
loss = nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(),lr=0.001)

print(f'Prediction before training:f(5)={model(X_test.detach()).item():.3f}')

# training 
n_iters = 1000

for epoch in range(n_iters):
    y_pred = model(X)              # y_pred 
    l = loss(Y,y_pred)             # loss 
    l.backward()                   # dL/dw
    optimizer.step()               # update w 
    optimizer.zero_grad()          # zero grad 
        
    if epoch % 100 ==0:
        [w,b] = model.parameters()
        print(f'epoch {epoch +1}: w = {w[0][0]:.3f},loss={l:.8f}')
    
    
print(f'Prediction after training:f(5)={model(X_test.detach()).item():.3f}')

Prediction before training:f(5)=1.973
epoch 1: w = 0.535,loss=21.27290535
epoch 101: w = 1.744,loss=0.73334575
epoch 201: w = 1.968,loss=0.02578374
epoch 301: w = 2.009,loss=0.00137958
epoch 401: w = 2.016,loss=0.00051042
epoch 501: w = 2.017,loss=0.00045357
epoch 601: w = 2.017,loss=0.00042627
epoch 701: w = 2.017,loss=0.00040146
epoch 801: w = 2.016,loss=0.00037816
epoch 901: w = 2.016,loss=0.00035621
Prediction after training:f(5)=10.031
