# Training Pipeline

## 1. Manual Calculations of Gradient Descent

### - Prediction: Manually
### - Gradients Computation: Manually
### - Loss Computation: Manually
### - Parameter updates: Manually

In [1]:
import numpy as np

In [32]:
# Logistic regression ?

In [19]:
# f = w*x
# f = 2*x  # a function that multiplies any given number by 2 (we don't know the function yet)

# Usually the function is not known, only the inputs (observations) and outputs. The task is to find weights to accomodate this relationship. 
# Then any other number, like 5, will also be churned through the function in the same fashion.

X = np.array([1,2,3,4], dtype=np.float32) # observations
Y = np.array([2,4,6,8], dtype=np.float32) # results for each observation
w = 0.0

In [20]:
# model prediction
def forward(x):
    return w * x

In [21]:
# loss = MSE
# loss -> 0
def loss(y, y_predicted):
    return ((y_predicted-y)**2).mean()

In [22]:
# gradient : denotes the direction of greatest change of a scalar function.
# We can choose the error method. 
# If we choose MSE, then the derivative of MSE, with the help of the "chain rule" is:

# MSE = 1/N * (w*x - y)**2
# chain rule (see 02_Backpropagation.ipynb) dloss/dw = 2s * x = 2x * (y_hat - y) . To find the mean, divide by N:
# dJ/dw = 1/N * 2*(w*x - y) * x       = 1/N 2x (w*x - y)

def gradient(x,y,y_predicted):
    return np.dot(2*x, (y_predicted - y)).mean()

print(f'Prediction before training: f(5) = {forward(5):.3f}')

Prediction before training: f(5) = 0.000


In [23]:
# Training

learning_rate = 0.01
n_iters = 10

for epoch in range(n_iters):
    # prediction: forward pass
    y_pred = forward(X)
    
    # loss
    l = loss(Y, y_pred)
    
    # calculate derivative of the loss function
    dw = gradient(X, Y, y_pred)
    
    # update weights (negative direction wrt the gradient)
    w -= learning_rate * dw
    
    if epoch % 1 == 0:
        print(f'epoch {epoch+1}: w = {w:.3f}, loss = {l:.8f}')
        
print(f'Prediction after training: f(5) = {forward(5):.3f}')      

epoch 1: w = 1.200, loss = 30.00000000
epoch 2: w = 1.680, loss = 4.79999924
epoch 3: w = 1.872, loss = 0.76800019
epoch 4: w = 1.949, loss = 0.12288000
epoch 5: w = 1.980, loss = 0.01966083
epoch 6: w = 1.992, loss = 0.00314570
epoch 7: w = 1.997, loss = 0.00050332
epoch 8: w = 1.999, loss = 0.00008053
epoch 9: w = 1.999, loss = 0.00001288
epoch 10: w = 2.000, loss = 0.00000206
Prediction after training: f(5) = 9.999


In [25]:
# More iterations

X = np.array([1,2,3,4], dtype=np.float32)
Y = np.array([2,4,6,8], dtype=np.float32)
w = 0.0

print(f'Prediction before training: f(5) = {forward(5):.3f}')

learning_rate = 0.01
n_iters = 20

for epoch in range(n_iters):
    # prediction: forward pass
    y_pred = forward(X)
    
    # loss
    l = loss(Y, y_pred)
    
    # gradients
    dw = gradient(X, Y, y_pred)
    
    # update weights (negative direction wrt the gradient)
    w -= learning_rate * dw
    
    if epoch % 2 == 0:
        print(f'epoch {epoch+1}: w = {w:.3f}, loss = {l:.8f}')
        
print(f'Prediction after training: f(5) = {forward(5):.3f}')      

epoch 1: w = 1.200, loss = 30.00000000
epoch 3: w = 1.872, loss = 0.76800019
epoch 5: w = 1.980, loss = 0.01966083
epoch 7: w = 1.997, loss = 0.00050332
epoch 9: w = 1.999, loss = 0.00001288
epoch 11: w = 2.000, loss = 0.00000033
epoch 13: w = 2.000, loss = 0.00000001
epoch 15: w = 2.000, loss = 0.00000000
epoch 17: w = 2.000, loss = 0.00000000
epoch 19: w = 2.000, loss = 0.00000000
Prediction after training: f(5) = 10.000


## 2. Manually, with Autograd for Gradient Descent

### - Prediction: Manually
### - Gradients Computation: Autograd
### - Loss Computation: Manually
### - Parameter updates: Manually

In [27]:
import torch

X = torch.tensor([1,2,3,4], dtype=torch.float32)
Y = torch.tensor([2,4,6,8], dtype=torch.float32)
w = torch.tensor(0.0, dtype=torch.float32, requires_grad=True) # interested in derivative wrt weights w. Turning on tracking of nodes history.

In [30]:
# model prediction
def forward(x):
    return w * x

# loss = MSE
def loss(y, y_predicted):
    return ((y_predicted-y)**2).mean()

print(f'Prediction before training: f(5) = {forward(5):.3f}')

learning_rate = 0.01
n_iters = 20

for epoch in range(n_iters):
    # prediction: forward pass
    y_pred = forward(X)  # history: node1 (multiplied two tensors)
    
    # calculate loss
    l = loss(Y, y_pred) # history: node2 (subtraction of 2 tensors) and node3 (power of 2)
    
    # calculate gradient of the loss function (autograd) - the node graph is by now in memory
    l.backward() # dl/dw
    
    # update weights (dl/dw = w.grad - calculates automatically since the history of nodes is tracked)
    # we don't want to create a graph and track the nodes history on the backward pass => no_grad
    with torch.no_grad():
        w -= learning_rate * w.grad
        
    # prevent gradient accumulation
    w.grad.zero_()
    
    if epoch % 2 == 0:
        print(f'epoch {epoch+1}: w = {w:.3f}, loss = {l:.8f}')
        
print(f'Prediction after training: f(5) = {forward(5):.3f}')      

epoch 1: w = 0.300, loss = 30.00000000
epoch 3: w = 0.772, loss = 15.66018772
epoch 5: w = 1.113, loss = 8.17471695
epoch 7: w = 1.359, loss = 4.26725292
epoch 9: w = 1.537, loss = 2.22753215
epoch 11: w = 1.665, loss = 1.16278565
epoch 13: w = 1.758, loss = 0.60698116
epoch 15: w = 1.825, loss = 0.31684780
epoch 17: w = 1.874, loss = 0.16539653
epoch 19: w = 1.909, loss = 0.08633806
Prediction after training: f(5) = 9.612


In [31]:
# not as accurate as manual -> let's increase the number of iterations

X = torch.tensor([1,2,3,4], dtype=torch.float32)
Y = torch.tensor([2,4,6,8], dtype=torch.float32)
w = torch.tensor(0.0, dtype=torch.float32, requires_grad=True)

# model prediction
def forward(x):
    return w * x

# loss = MSE
def loss(y, y_predicted):
    return ((y_predicted-y)**2).mean()

learning_rate = 0.01
n_iters = 100

print(f'Prediction before training: f(5) = {forward(5):.3f}')

for epoch in range(n_iters):
    # prediction: forward pass
    y_pred = forward(X)
    
    # loss
    l = loss(Y, y_pred)
    
    # gradients
    l.backward() # dl/dw
    
    # update weights (negative direction wrt the gradient)
    with torch.no_grad():
        w -= learning_rate * w.grad
        
    # zero gradients
    w.grad.zero_()
    
    if epoch % 10 == 0:
        print(f'epoch {epoch+1}: w = {w:.3f}, loss = {l:.8f}')
        
print(f'Prediction after training: f(5) = {forward(5):.3f}')   

epoch 1: w = 0.300, loss = 30.00000000
epoch 11: w = 1.665, loss = 1.16278565
epoch 21: w = 1.934, loss = 0.04506890
epoch 31: w = 1.987, loss = 0.00174685
epoch 41: w = 1.997, loss = 0.00006770
epoch 51: w = 1.999, loss = 0.00000262
epoch 61: w = 2.000, loss = 0.00000010
epoch 71: w = 2.000, loss = 0.00000000
epoch 81: w = 2.000, loss = 0.00000000
epoch 91: w = 2.000, loss = 0.00000000
Prediction after training: f(5) = 10.000


## 3. Gradient, Loss and Parameter updates by PyTorch

### - Prediction: Manually
### - Gradients Computation: Autograd
### - Loss Computation: PyTorch Loss
### - Parameter updates: PyTorch Optimizer

In [34]:
# The steps of a design:

# 1) Design model (input, output size, forward pass)
# 2) Construct loss and optimizer
# 3) Training loop
#  - forward pass: compute prediction
#  - backward pass: gradients
#  - update weights

import torch
import torch.nn as nn

X = torch.tensor([1,2,3,4], dtype=torch.float32)
Y = torch.tensor([2,4,6,8], dtype=torch.float32)
w = torch.tensor(0.0, dtype=torch.float32, requires_grad=True)

# model prediction
def forward(x):
    return w * x

print(f'Prediction before training: f(5) = {forward(5):.3f}')

learning_rate = 0.01
n_iters = 100

loss = nn.MSELoss()  # replace manual loss function with PyTorch function
optimizer = torch.optim.SGD([w], lr=learning_rate) # parameter are the weights (to optimize)

for epoch in range(n_iters):
    # prediction: forward pass
    y_pred = forward(X)
    
    # loss
    l = loss(Y, y_pred)
    
    # gradients
    l.backward() # dl/dw
    
    # update weights (negative direction wrt the gradient)
    optimizer.step()
        
    # zero gradients
    optimizer.zero_grad()
    
    if epoch % 10 == 0:
        print(f'epoch {epoch+1}: w = {w:.3f}, loss = {l:.8f}')
        
print(f'Prediction after training: f(5) = {forward(5):.3f}')   

Prediction before training: f(5) = 0.000
epoch 1: w = 0.300, loss = 30.00000000
epoch 11: w = 1.665, loss = 1.16278565
epoch 21: w = 1.934, loss = 0.04506890
epoch 31: w = 1.987, loss = 0.00174685
epoch 41: w = 1.997, loss = 0.00006770
epoch 51: w = 1.999, loss = 0.00000262
epoch 61: w = 2.000, loss = 0.00000010
epoch 71: w = 2.000, loss = 0.00000000
epoch 81: w = 2.000, loss = 0.00000000
epoch 91: w = 2.000, loss = 0.00000000
Prediction after training: f(5) = 10.000


## 4. Gradient, Loss and Parameter updates by PyTorch

### - Prediction: PyTorch Model
### - Gradients Computation: Autograd
### - Loss Computation: PyTorch Loss
### - Parameter updates: PyTorch Optimizer

In [14]:
# The steps of a design:

# 1) Design model (input, output size, forward pass)
# 2) Construct loss and optimizer
# 3) Training loop
#  - forward pass: compute prediction
#  - backward pass: gradients
#  - update weights

import torch
import torch.nn as nn

X = torch.tensor([[1],[2],[3],[4]], dtype=torch.float32)
Y = torch.tensor([[2],[4],[6],[8]], dtype=torch.float32)

X_test = torch.tensor([5], dtype=torch.float32)

n_samples, n_features = X.shape
print(n_samples, n_features)  # 4 samples, 1 feature

input_size = n_features
output_size = n_features

model = nn.Linear(input_size, output_size) # The model now knows taht we have inputs, outputs, and want to optimize weights (model parameters)
print(f"Model input size: {input_size}")
print(f"Model ouput size: {output_size}")
[w,b] = model.parameters()
print(f"Model weights before training: {w[0]}") # a generator
print(f"Model biases before training: {b[0]}") # a generator

print(f'Prediction before training: f(5) = {model(X_test).item():.3f}')

learning_rate = 0.01
n_iters = 200

loss = nn.MSELoss()
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate) # The weights (to optimize) are the model parameters now

for epoch in range(n_iters):
    # prediction: forward pass
    y_pred = model(X)
    
    # loss
    l = loss(y_pred, Y)
    
    # gradients
    l.backward() # dl/dw
    
    # update weights (the optimizer takes care of the negative direction wrt the gradient)
    optimizer.step()
        
    # zero gradients
    optimizer.zero_grad()
    
    if epoch % 10 == 0:
        [w,b] = model.parameters()
        print(f'epoch {epoch+1}: w = {w[0][0].item():.3f}, bias: {b[0]:.3f}, loss = {l:.8f}')
        # print(f'epoch {epoch+1}: w = {w[0][0].item():.3f}, loss = {l:.8f}')
        
print(f'Prediction after training: f(5) = {model(X_test).item():.3f}')   

# still less perfect - experiments with learning rate and n_iter
# The overall improvement is visible, though

4 1
Model input size: 1
Model ouput size: 1
Model weights before training: tensor([-0.2777], grad_fn=<SelectBackward0>)
Model biases before training: -0.2588077783584595
Prediction before training: f(5) = -1.647
epoch 1: w = 0.077, bias: -0.140, loss = 41.92353439
epoch 11: w = 1.564, bias: 0.351, loss = 1.11736858
epoch 21: w = 1.807, bias: 0.419, loss = 0.05971167
epoch 31: w = 1.849, bias: 0.419, loss = 0.03055468
epoch 41: w = 1.860, bias: 0.409, loss = 0.02811180
epoch 51: w = 1.865, bias: 0.397, loss = 0.02645837
epoch 61: w = 1.869, bias: 0.385, loss = 0.02491792
epoch 71: w = 1.873, bias: 0.374, loss = 0.02346754
epoch 81: w = 1.877, bias: 0.363, loss = 0.02210162
epoch 91: w = 1.880, bias: 0.352, loss = 0.02081517
epoch 101: w = 1.884, bias: 0.342, loss = 0.01960364
epoch 111: w = 1.887, bias: 0.331, loss = 0.01846258
epoch 121: w = 1.891, bias: 0.322, loss = 0.01738797
epoch 131: w = 1.894, bias: 0.312, loss = 0.01637589
epoch 141: w = 1.897, bias: 0.303, loss = 0.01542274
ep

In [None]:
# One can create a custom linear regression model

class LinearRegression(nn.Module):
    
    def __init__(self, input_dim, output_dim):
        super(LinearRegression, self).__init__()
        # define layers
        self.lin = nn.Linear(input_dim, output_dim)
        
    def forward(self, x):
        return self.lin(x)
    
model = LinearRegression(input_size, output_size) 
        