# **Gradient Descent Algorithm used to Optimize the loss function**

In [1]:
import torch
import numpy as np

## **Part 1 : Gradient Descent From Strach (Using Numpy Only)**

In [68]:
'''
Our Function approximation is simple linear regressor which has the relationship,
                Y = 2 * X
'''

X = np.array(
            [1,2,3,4,5],
            dtype = np.float32
            )

Y = np.array(
            [2,4,6,8,10],
            dtype = np.float32
            )

In [69]:
# Step 1 : Forward Pass

'''
    Function Approximation : Y_hat = W * X
    
    dim(X) -> (5,) --> vector
    dim(Y) -> (5,) --> vector
    dim(W) -> ()   --> Scalar
'''

W = 0.0 #initialize weights to zero

def forward_propogation(X):
    return W * X

In [70]:
# Step 2 : Loss Function

'''
    Loss Function approximated by,
    
          J = ((Y_hat - Y) ** 2).mean()
'''

def MSE(W , X , Y):
    return ((W * X - Y) ** 2).mean()

In [71]:
# Step 3 : Backward Pass

'''
        J = ((Y_hat - Y) ** 2).mean()
        J = ((W * X - Y) ** 2) / N     ; Here N=5
        dJ/dW = (2/N) * (Y_hat - Y) * X
'''

def gradients(X, Y):
    Y_hat = forward_propogation(X)
    return 2* np.dot(Y_hat - Y, X).mean()
    
def gradient_descent(W, dW, lr):
    W = W - lr * dW
    return W
    

In [72]:
# Step 4 : Training Loop

lr = 0.01
epoches = 10

for epoch in range(epoches):
    Y_hat = forward_propogation(X)
    mse = MSE(W , X , Y)
    dW = gradients(X, Y)
    W = gradient_descent(W, dW, lr)
    
    print("Epoch : {} -> Loss : {} -> W : {}".format(epoch, round(mse, 3), round(W, 3)))

Epoch : 0 -> Loss : 44.0 -> W : 2.2
Epoch : 1 -> Loss : 0.4399999976158142 -> W : 1.98
Epoch : 2 -> Loss : 0.004000000189989805 -> W : 2.002
Epoch : 3 -> Loss : 0.0 -> W : 2.0
Epoch : 4 -> Loss : 0.0 -> W : 2.0
Epoch : 5 -> Loss : 0.0 -> W : 2.0
Epoch : 6 -> Loss : 0.0 -> W : 2.0
Epoch : 7 -> Loss : 0.0 -> W : 2.0
Epoch : 8 -> Loss : 0.0 -> W : 2.0
Epoch : 9 -> Loss : 0.0 -> W : 2.0


# **Part 2 : Gradient Descent From AutoGrad (Using Torch Only)**

In [89]:
X = torch.tensor(
            [1,2,3,4,5],
            dtype = torch.float32
            )

Y = torch.tensor(
            [2,4,6,8,10],
            dtype = torch.float32
            )

W = torch.tensor(0.0, dtype = torch.float32, requires_grad=True)

In [90]:
lr = 0.01
epoches = 10

for epoch in range(epoches):
    
    Y_hat = forward_propogation(X)
    mse = MSE(W , X , Y)
    mse.backward()

    #updating weights shouldn't track the gradients (This isn't a part of the computational graph)
    with torch.no_grad():
        W.sub_(W.grad*lr)
        W.grad.zero_() # avoid gradient accumilation
        
    print("Epoch : {} -> Loss : {} -> W : {}".format(epoch, round(mse.item(), 3), round(W.item(), 3)))

Epoch : 0 -> Loss : 44.0 -> W : 0.44
Epoch : 1 -> Loss : 26.77 -> W : 0.783
Epoch : 2 -> Loss : 16.287 -> W : 1.051
Epoch : 3 -> Loss : 9.909 -> W : 1.26
Epoch : 4 -> Loss : 6.029 -> W : 1.423
Epoch : 5 -> Loss : 3.668 -> W : 1.55
Epoch : 6 -> Loss : 2.231 -> W : 1.649
Epoch : 7 -> Loss : 1.358 -> W : 1.726
Epoch : 8 -> Loss : 0.826 -> W : 1.786
Epoch : 9 -> Loss : 0.503 -> W : 1.833
