In [18]:
import numpy as np
import torch 

### Pytorch gradient calculation 

* If we deine a varibale with **requires_grad=True**, then we can find a gradient of that function with respect to that variable

* If the final function is a scaler then, the gradient implimentation w.r.t. that variable is simple
    - for ex: f = average(), then f.balkward() and x.grad is enough 

* But if the final function is a vector valued function, then when we call backward function we need to insert another array to get the correct gradient out
    - ex. f = torch.sin(x), then f.backward(torch.ones_like(f)), AND  x.grad is needed 

### Example 01: Gradient of vector function 

In [19]:
x = torch.rand(3,requires_grad=True)
print(x)

tensor([0.4420, 0.6609, 0.0305], requires_grad=True)


In [20]:
y = x + 2 
print(y)

tensor([2.4420, 2.6609, 2.0305], grad_fn=<AddBackward0>)


In [21]:
y.backward(torch.ones_like(y))
x.grad

tensor([1., 1., 1.])

In [22]:
z = y*y*2
print(z)

tensor([11.9269, 14.1607,  8.2458], grad_fn=<MulBackward0>)


### Example 02: Gradient of scaler valued fucntion 

In [23]:
z = z.mean()
print(z)

tensor(11.4445, grad_fn=<MeanBackward0>)


In [24]:
z.backward()
x.grad

tensor([4.2560, 4.5479, 3.7073])

### How to prevent gradient tracking of a varibale  

- x.requires_grad_(False)
- x.detach()
- or using torch.no_grad() context manager 

In [25]:
x = torch.rand(3,requires_grad=True)
print(x)

# if we want to prevent gradient tracking of varible x 
x.requires_grad_(False)
print(x)

tensor([0.7598, 0.5327, 0.4899], requires_grad=True)
tensor([0.7598, 0.5327, 0.4899])


In [26]:
x = torch.rand(3,requires_grad=True)
print(x)

# remove gradient tracking of variable x 
y = x.detach()
print(y)

tensor([0.4271, 0.2189, 0.8690], requires_grad=True)
tensor([0.4271, 0.2189, 0.8690])


In [27]:
x = torch.randn(3,requires_grad=True)

with torch.no_grad(): # don't track gradient here 
    y = x+2 
    print(y)

tensor([3.0992, 1.4607, 0.9361])


### How to stop accumulation of gradients (zero-outing gradients)

In [28]:
weights = torch.ones(4,requires_grad=True)

for eopoch in range(3):
    model_output = (weights*3).sum()
    print(model_output)
    model_output.backward()
    print(weights.grad)

tensor(12., grad_fn=<SumBackward0>)
tensor([3., 3., 3., 3.])
tensor(12., grad_fn=<SumBackward0>)
tensor([6., 6., 6., 6.])
tensor(12., grad_fn=<SumBackward0>)
tensor([9., 9., 9., 9.])


In [29]:
weights = torch.tensor([1.,2.,3.,4.],dtype=torch.float32,requires_grad=True)

for eopoch in range(3):
    model_output = (weights*3).sum()
    model_output.backward()
    print(weights.grad)
    weights.grad.zero_()

tensor([3., 3., 3., 3.])
tensor([3., 3., 3., 3.])
tensor([3., 3., 3., 3.])


### Backpropergation: chain rule to get the derivtive of function (final) w.r.t wieght parameter

* y = a(x)
* z = b(y)

* chain rule : $\frac{\partial z}{\partial x} = \frac{\partial z}{\partial y} \cdot \frac{\partial y}{\partial x}$

* at each node (**one type of computation is done here**) of the computational graph it finds the local gradient 

In [35]:
x = torch.tensor(1.0)
y = torch.tensor(2.0)

w = torch.tensor(1.5,requires_grad=True)

# forward pass
y_hat = w*x
loss = (y_hat - y)**2 
print(f'loss value = {loss}')

# backward pass 
loss.backward()
print(f'gradient of loss w.r.t. w = {w.grad}')


loss value = 0.25
gradient of loss w.r.t. w = -1.0


### Gradient descent: Way to optimize the weight to reduce the error function 

1. First we are going to update weights using **numpy** , note that we need to know the derivatives of the error function w.r.t. weights  

2. Advantage of using **Pytorch** is it find the gradient for us, we don't need to manually implement the gradient computation w.r.t. weights 

In [31]:
# f = w*x with w=2 
X = np.array([1,2,3,4],dtype=np.float32)
Y = np.array([2,4,6,8],dtype=np.float32)

# weight paramter (we want to update this)
W = 0.0 

# model prediction 
def forward(x):
    return W*x

# loss 
def loss(y,y_pred):
    return ((y-y_pred)**2).mean()

# gradient 
# MSE = 1/N * (w*x-y)**2 
# dJ/dw = 1/N* 2x * (w*x -y)

# gradinet of loss w.r.t. weight 
def gradient(x,y,y_pred):
    return np.dot(2*x, y_pred-y).mean()

print(f'Prediction before training:f(5)={forward(5):.3f}')

# training 
lr = 0.01
n_iters = 10 

for epoch in range(n_iters):
    y_pred = forward(X)             # y_pred 
    l = loss(Y,y_pred)              # loss   
    dw = gradient(X,Y,y_pred)       # dL/dw
    W-=lr*dw                        # update w 
    
    if epoch % 1 ==0:
        print(f'epoch {epoch +1}: w = {w:.3f},loss={l:.8f}')
    
    
print(f'Prediction after training:f(5)={forward(5):.3f}')

Prediction before training:f(5)=0.000
epoch 1: w = 1.500,loss=30.00000000
epoch 2: w = 1.500,loss=4.79999924
epoch 3: w = 1.500,loss=0.76800019
epoch 4: w = 1.500,loss=0.12288000
epoch 5: w = 1.500,loss=0.01966083
epoch 6: w = 1.500,loss=0.00314570
epoch 7: w = 1.500,loss=0.00050332
epoch 8: w = 1.500,loss=0.00008053
epoch 9: w = 1.500,loss=0.00001288
epoch 10: w = 1.500,loss=0.00000206
Prediction after training:f(5)=9.999


In [59]:
# f = w*x with w=2 
X = torch.tensor([1,2,3,4],dtype=torch.float32) 
Y = torch.tensor([2,4,6,8],dtype=torch.float32)
W = torch.tensor(0.0,dtype=torch.float32,requires_grad=True)  

# model prediction 
def forward(x):
    return W*x

# loss 
def loss(y,y_pred):
    return ((y-y_pred)**2).mean()

# training 
lr = 0.01
n_iters = 10 

for epoch in range(n_iters):
    y_pred = forward(X)      # y_pred 
    l = loss(Y,y_pred)       # loss 
    l.backward()             # dL/dw 
    with torch.no_grad():
        W-=lr*W.grad         # update weight 
    W.grad.zero_()           # zero-out gradient 
        
    if epoch % 2 ==0:
        print(f'epoch {epoch +1}: w = {W:.3f},loss={l:.8f}')

epoch 1: w = 0.300,loss=30.00000000
epoch 3: w = 0.772,loss=15.66018772
epoch 5: w = 1.113,loss=8.17471695
epoch 7: w = 1.359,loss=4.26725292
epoch 9: w = 1.537,loss=2.22753215


#### This example shows the usage of **pytorch** in-buit functions to make the gradient descent easy
- nn.MSELoss() to get the error 
- torch.optim.SGD() to manully update the weights s.t. minimize the error(or to do gradient decent)

In [60]:
import torch.nn as nn
import torch

X = torch.tensor([1,2,3,4],dtype=torch.float32)
Y = torch.tensor([2,4,6,8],dtype=torch.float32)

W = torch.tensor(0.0,dtype=torch.float32,requires_grad=True) 

# model prediction 
def forward(x):
    return W*x

# loss 
loss = nn.MSELoss()
optimizer = torch.optim.SGD([W],lr=0.01)

# training 
n_iters = 10 

for epoch in range(n_iters):
    y_pred = forward(X)         # y_pred 
    l = loss(Y,y_pred)          # loss 
    l.backward()                # dl/dw
    optimizer.step()            # update w 
    optimizer.zero_grad()       # zero-out gradient 
    
    if epoch % 1 ==0:
        print(f'epoch {epoch +1}: w = {W:.3f},loss={l:.8f}')
    


epoch 1: w = 0.300,loss=30.00000000
epoch 2: w = 0.555,loss=21.67499924
epoch 3: w = 0.772,loss=15.66018772
epoch 4: w = 0.956,loss=11.31448650
epoch 5: w = 1.113,loss=8.17471695
epoch 6: w = 1.246,loss=5.90623236
epoch 7: w = 1.359,loss=4.26725292
epoch 8: w = 1.455,loss=3.08308983
epoch 9: w = 1.537,loss=2.22753215
epoch 10: w = 1.606,loss=1.60939169


### Using all **Pytorch** functions 

1. Design the model (input size, output size, forward pass)

2. Construct loss and optimizer 

3. Training loop:

    - forward pass: compute prediction 
    - backward pass: gradients 
    - update the weights 

In [61]:
import torch.nn as nn
import torch

X = torch.tensor([[1],[2],[3],[4]],dtype=torch.float32)
Y = torch.tensor([[2],[4],[6],[8]],dtype=torch.float32)
X_test = torch.tensor([5],dtype=torch.float32)
num_smaples,num_features = X.shape

# model prediction 
model = nn.Linear(num_features,num_features)           # this gves y_pred 

# loss 
loss = nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(),lr=0.001)

print(f'Prediction before training:f(5)={model(X_test.detach()).item():.3f}')

# training 
n_iters = 1000

for epoch in range(n_iters):
    y_pred = model(X)              # y_pred 
    l = loss(Y,y_pred)             # loss 
    l.backward()                   # dL/dw
    optimizer.step()               # update w 
    optimizer.zero_grad()          # zero grad 
        
    if epoch % 100 ==0:
        [w,b] = model.parameters()
        print(f'epoch {epoch +1}: w = {w[0][0]:.3f},loss={l:.8f}')
    
    
print(f'Prediction after training:f(5)={model(X_test.detach()).item():.3f}')

Prediction before training:f(5)=2.831
epoch 1: w = 0.763,loss=18.12822723
epoch 101: w = 1.875,loss=0.64779180
epoch 201: w = 2.078,loss=0.04428379
epoch 301: w = 2.113,loss=0.02221767
epoch 401: w = 2.117,loss=0.02025468
epoch 501: w = 2.115,loss=0.01905417
epoch 601: w = 2.111,loss=0.01794574
epoch 701: w = 2.108,loss=0.01690265
epoch 801: w = 2.105,loss=0.01592013
epoch 901: w = 2.102,loss=0.01499469
Prediction after training:f(5)=10.204
