## Gradient Descent with Autograd and Backpropogation

Implement toy example for gradient descent

In [1]:
import numpy as np 
import torch 

In [2]:
#Doing everything manually using numpy -- using for-loops |

#f = w * x 
#f = 2 * x

X = np.array([1,2,3,4], dtype=np.float32)
Y = 2 * X

W = 0.0

#model prediction 
def forward(x):
    return W * x 

#loss function -- Mean squared error 
def loss(y,y_predicted):
    return ((y_predicted - y)**2).mean()

#gradient 
#MSE = 1/N * (w*x -y)**2
#dJ/dW = 1/N * 2x (w*x - y)
def gradient(x, y, y_predicted):
    return np.dot(2*x, y_predicted-y).mean()

print(f'Prediction before training f(5) = {forward(5):.3f}')

#Training 
learning_rate = 0.01 
n_iters = 20

for epoch in range(n_iters):
    # prediction = forward pass
    y_pred = forward(X)
    
    #loss 
    l = loss(Y, y_pred)
    
    #gradient 
    dw = gradient(X, Y, y_pred)
    
    #update weights
    W -= learning_rate * dw 
    
    if epoch % 4 == 0: 
        print(f'epoch {epoch+1}: W = {W:.3f}, loss = {l:.8f}')

print(f'Prediction before training f(5) = {forward(5):.3f}')        

Prediction before training f(5) = 0.000
epoch 1: W = 1.200, loss = 30.00000000
epoch 5: W = 1.980, loss = 0.01966083
epoch 9: W = 1.999, loss = 0.00001288
epoch 13: W = 2.000, loss = 0.00000001
epoch 17: W = 2.000, loss = 0.00000000
Prediction before training f(5) = 10.000


### Torch implementation 
Let's do the same example using PyTorch 

In [3]:
#Doing everything manually using numpy -- using for-loops |

#f = w * x 
#f = 2 * x

X = torch.tensor([1,2,3,4], dtype=torch.float32)
Y = torch.mul(2.0, X)
W = torch.tensor(0.0, dtype=torch.float32, requires_grad=True)

#model prediction 
def forward(x):
    return W * x 

#loss function -- Mean squared error 
def loss(y,y_predicted):
    return ((y_predicted - y)**2).mean()

#gradient 
#MSE = 1/N * (w*x -y)**2
#dJ/dW = 1/N * 2x (w*x - y)
def gradient(x, y, y_predicted):
    return np.dot(2*x, y_predicted-y).mean()

print(f'Prediction before training f(5) = {forward(5):.3f}')

#Training 
learning_rate = 0.01 
n_iters = 100

for epoch in range(n_iters):
    # prediction = forward pass
    y_pred = forward(X)
    
    #loss 
    l = loss(Y, y_pred)
    
    #gradient = backward - pass 
    l.backward() #dL/dW done this automatically 
    
    #update weights -- before updating the weight we have detach the gradients and make them zero 
    with torch.no_grad():
        W -= learning_rate * W.grad 
    
    W.grad.zero_()
    
    if epoch % 20 == 0: 
        print(f'epoch {epoch+1}: W = {W:.3f}, loss = {l:.8f}')

print(f'Prediction before training f(5) = {forward(5):.3f}')      

Prediction before training f(5) = 0.000
epoch 1: W = 0.300, loss = 30.00000000
epoch 21: W = 1.934, loss = 0.04506890
epoch 41: W = 1.997, loss = 0.00006770
epoch 61: W = 2.000, loss = 0.00000010
epoch 81: W = 2.000, loss = 0.00000000
Prediction before training f(5) = 10.000


It can be seen that the weight value we get using the `backward` function for the PyTorch routine is not as precise as the numerical gradient descent this is due to in-built method being slightly different than the vanilla gradient descent algorithm. 

## Using PyTorch's optimizers and loss criterion

1. Design model (input, output size, forward pass) 
2. Construct loss function and optimizer 
3. Training loop 
    - forward pass: compute prediction 
    - backward pass: gradients 
    - update weights using gradient descent 

In [4]:
#PyTorch Implementation 
#f = w * x 
#f = 2 * x

import torch 
import torch.nn as nn 

#Defining the input vector with new shape 
X = torch.tensor([[1],[2],[3],[4]], dtype=torch.float32)
Y = torch.mul(2.0, X)
W = torch.tensor(0.0, dtype=torch.float32, requires_grad=True)
X_test = torch.tensor([5], dtype=torch.float32)

n_samples, n_features = X.shape

#model prediction 
#def forward(x):
#    return W * x 
# Replace with PyTorch model 

input_size = n_features
output_size = n_features

class LinearRegression(nn.Module):
    def __init__(self, input_dim, output_dim):
        super(LinearRegression, self).__init__()
        self.lin = nn.Linear(input_dim, output_dim)
    def forward(self, x):
        return self.lin(x)

#model = nn.Linear(input_size, output_size)
model = LinearRegression(input_size, output_size)

#loss function -- Mean squared error 
#def loss(y,y_predicted):
#    return ((y_predicted - y)**2).mean()
# NOW: Defined from Pytorch

loss = nn.MSELoss()
learning_rate = 0.01
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)

#gradient 
#MSE = 1/N * (w*x -y)**2
#dJ/dW = 1/N * 2x (w*x - y)
#def gradient(x, y, y_predicted):
#    return np.dot(2*x, y_predicted-y).mean()

print(f'Prediction before training f(5) = {model(X_test).item():.3f}')

#Training 
learning_rate = 0.08
n_iters = 300

for epoch in range(n_iters):     

Prediction before training f(5) = -3.913
epoch 1: W = -0.480, loss = 55.24893951
epoch 21: W = 1.517, loss = 0.30566832
epoch 41: W = 1.593, loss = 0.23852304
epoch 61: W = 1.618, loss = 0.21154271
epoch 81: W = 1.641, loss = 0.18763362
epoch 101: W = 1.661, loss = 0.16642678
epoch 121: W = 1.681, loss = 0.14761679
epoch 141: W = 1.700, loss = 0.13093269
epoch 161: W = 1.717, loss = 0.11613436
epoch 181: W = 1.734, loss = 0.10300860
epoch 201: W = 1.749, loss = 0.09136640
epoch 221: W = 1.764, loss = 0.08103992
epoch 241: W = 1.778, loss = 0.07188058
epoch 261: W = 1.790, loss = 0.06375644
epoch 281: W = 1.803, loss = 0.05655050
Prediction before training f(5) = 9.616


Again, as seen here, the GradientDescent implementation here is not as accurate as the numerical implementation but it much faster and scalable. 