## 5.5 Pytorch's autograd: Backpropagating all things

In [2]:
import torch
import numpy as np

import matplotlib.pyplot as plt


### Data

In [3]:
#input
t_u =  [35.7, 55.9, 58.2, 81.9, 56.3, 48.9, 33.9, 21.8, 48.4, 60.4, 68.4]

#labels temp in C
t_c =  [0.5, 14.0, 15.0, 28.0, 11.0, 8.0, 3.0, -4.0, 6.0, 13.0, 21.0]

#convert into tensor
t_c = torch.tensor(t_c)
t_u = torch.tensor(t_u)

- Rescale inputs

In [4]:
t_un = t_u * 0.1

### Model

In [5]:
def model(w,b,inputs=t_u):
    out = w * inputs + b
    return out

### Loss function

In [6]:
def loss_fn(preds, targets=t_c):
    ''' return Mean Square Error as MSE
    '''
    squared_diffs = (preds - targets)**2
    mean_sq_diffs = squared_diffs.mean()
    return mean_sq_diffs

## 5.5.2 Optimize a la carte

- Different Optimization algorithms

In [7]:
import torch.optim as optim

In [8]:
dir(optim)

['ASGD',
 'Adadelta',
 'Adagrad',
 'Adam',
 'AdamW',
 'Adamax',
 'LBFGS',
 'NAdam',
 'Optimizer',
 'RAdam',
 'RMSprop',
 'Rprop',
 'SGD',
 'SparseAdam',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__path__',
 '__spec__',
 '_functional',
 '_multi_tensor',
 'lr_scheduler',
 'swa_utils']

- Using Gradient Descent Optimizer via optim
    - SGD : stochastic gradient descent
        - stochastic: from random mini-batch instead of the whole dataset

In [9]:
params = torch.tensor([1.0,0.0],requires_grad=True)
learning_rate = 1e-5
optimizer = optim.SGD([params], lr=learning_rate)

In [10]:
preds = model(*params, inputs=t_c)
loss = loss_fn(preds, targets=t_u)
#calculate gradient
loss.backward()
#update params 
optimizer.step()

print(params)

tensor([1.0099e+00, 8.2600e-04], requires_grad=True)


- Next iteration: 
    - Adding zero_grad

In [11]:
preds = model(*params, inputs=t_u)
loss = loss_fn(preds, targets=t_c)
#zero grad
optimizer.zero_grad()
#calculate gradient
loss.backward()
#update params 
optimizer.step()

print(params)

tensor([ 9.6414e-01, -1.0268e-05], requires_grad=True)


- Updated Version of Training Loop with Hyper Parameters:
    - Num epochs
    - Optimizer will include learning_rate

In [12]:
params = torch.tensor([1.0,0.0],requires_grad=True)
learning_rate = 1e-2
optimizer = optim.SGD(params=[params],lr=learning_rate)

In [13]:
def training_loop(n_epochs,params, optimizer, inputs = t_u, targets=t_c):
    for epoch in range(1,n_epochs+1):
        w,b = params
        preds = model(w,b,inputs)
        loss = loss_fn(preds, targets)

        #zero grad
        optimizer.zero_grad()
        #calculate gradients
        loss.backward()
        #update parameters
        optimizer.step()

        #log 
        if epoch % 500 == 0:
            print(f"Epoch: {epoch} - loss:{loss:.4f}")
            
    return params
            
    

In [14]:
training_loop(
    n_epochs=5000,
    params=params,
    optimizer=optimizer,
    inputs=t_un,
    targets=t_c
)

Epoch: 500 - loss:7.8601
Epoch: 1000 - loss:3.8285
Epoch: 1500 - loss:3.0922
Epoch: 2000 - loss:2.9577
Epoch: 2500 - loss:2.9331
Epoch: 3000 - loss:2.9286
Epoch: 3500 - loss:2.9278
Epoch: 4000 - loss:2.9277
Epoch: 4500 - loss:2.9277
Epoch: 5000 - loss:2.9276


tensor([  5.3671, -17.3012], requires_grad=True)

- Testing other optimizers
    - Adam

In [15]:
params = torch.tensor([1.0,0.0],requires_grad=True)
learning_rate = 1e-1
optimizer = optim.Adam([params], lr=learning_rate)

training_loop(
    n_epochs = 5000,
    params = params,
    optimizer=optimizer,
    inputs=t_u,
    targets=t_c
)


Epoch: 500 - loss:7.6129
Epoch: 1000 - loss:3.0867
Epoch: 1500 - loss:2.9286
Epoch: 2000 - loss:2.9276
Epoch: 2500 - loss:2.9276
Epoch: 3000 - loss:2.9276
Epoch: 3500 - loss:2.9276
Epoch: 4000 - loss:2.9276
Epoch: 4500 - loss:2.9276
Epoch: 5000 - loss:2.9276


tensor([  0.5368, -17.3048], requires_grad=True)

### 5.5.3 Training, validation and overfitting
- Avoid overfitting: 
    - Adding penalization terms to the loss function
    - Or adding noise to the inputsamples  
- Evaluating the loss on training
    - If the loss does not decrease : model is so simple for the data.
- Evalutaing the loss on validation:
    - if the loss on training and validation diverge, the model is overfitting

- Splitting A Dataset

In [36]:
t_u.shape , t_u.shape[0]

(torch.Size([11]), 11)

In [18]:
#shuffle data
n_samples = t_u.shape[0] # get iteger out of the tensor 
#to split data 80% , 20%
n_val = int(0.2 * n_samples)
n_val

2

In [23]:
#shuffle indices
shuffled_indices = torch.randperm(n_samples)

In [24]:
train_indices = shuffled_indices[:-n_val] # from n-val count back to the beginning
val_indices = shuffled_indices[-n_val:] # from n-val to the end
train_indices, val_indices

(tensor([ 6,  8,  1,  9,  3,  4, 10,  2,  7]), tensor([0, 5]))

In [33]:
# training set 80%
train_t_u = t_u[train_indices]
train_t_c = t_c[train_indices]

#validation set 20%
val_t_u = t_u[val_indices]
val_t_c = t_c[val_indices]

#re-scale
train_t_un = 0.1 * train_t_u
val_t_un = 0.1 * val_t_u

In [34]:
def training_loop(n_epochs,params, optimizer, train_t_u, train_t_c ,val_t_u, val_t_c ):
    for epoch in range(1,n_epochs+1):
        w,b = params

        #train
        train_preds = model(w,b, train_t_u) 
        train_loss = loss_fn(train_preds, train_t_c)
        #validation
        val_preds = model(w,b,val_t_u)
        val_loss = loss_fn(val_preds, val_t_c)

        #Apply optimizer for train stage only
        optimizer.zero_grad()
        train_loss.backward()
        optimizer.step()

        if epoch < 3 or epoch % 500 == 0:
            print(f" Epoch : {epoch}, Training loss : {train_loss.item():.4f}," 
                  f" Validation loss: {val_loss.item():.4f}")

    return params

In [35]:
params = torch.tensor([1.0,0.0],requires_grad=True)
learning_rate = 1e-2
optimizer = optim.SGD([params],lr=learning_rate)

In [32]:
training_loop(
    n_epochs = 5000,
    optimizer = optimizer,
    params = params,
    train_t_u = train_t_un,
    val_t_u  = val_t_un,
    train_t_c = train_t_c,
    val_t_c = val_t_c
)

 Epoch : 0, Training loss : 96.1012, Validation loss: 9.5485
 Epoch : 1, Training loss : 35.6511, Validation loss: 22.7698
 Epoch : 2, Training loss : 28.4114, Validation loss: 35.2234
 Epoch : 500, Training loss : 7.5778, Validation loss: 13.6940
 Epoch : 1000, Training loss : 3.9653, Validation loss: 6.0994
 Epoch : 1500, Training loss : 3.3141, Validation loss: 3.8005
 Epoch : 2000, Training loss : 3.1967, Validation loss: 2.9913
 Epoch : 2500, Training loss : 3.1755, Validation loss: 2.6778
 Epoch : 3000, Training loss : 3.1717, Validation loss: 2.5501
 Epoch : 3500, Training loss : 3.1710, Validation loss: 2.4969
 Epoch : 4000, Training loss : 3.1709, Validation loss: 2.4745
 Epoch : 4500, Training loss : 3.1709, Validation loss: 2.4650


tensor([  5.2505, -16.4170], requires_grad=True)

### 5.5.4 Autograd nits and switching it off
- Seperated Computation graph will be created in train stage and validation stage
- Turn off Auto grad by torch.no_grad
- Use torch.set_grad_enabled to turn on/off gradient tracking

In [38]:
def training_loop(n_epochs, params, optimizer, train_t_u, train_t_c, val_t_u, val_t_c):
    for epoch in range(1,n_epochs+1):
        #training
        train_preds = model(*params,inputs=train_t_u)
        train_loss = loss_fn(train_preds, targets=train_t_c)
        #validating
        with torch. no_grad(): # turn off gradient tracking
            val_preds = model(*params, inputs=val_t_u)
            val_loss = loss_fn(val_preds,tartgets=val_t_c)
            assert val_loss.requires_grad == False
        #backward
        optimizer.zero_grad()
        train_loss.backward()
        optimizer.step()
    return params

In [39]:
# Using torch.set_grad_enabled on train or inference mode
def calc_forward(is_train:bool,params, inputs=t_u, targets=t_c):
    with torch.set_grad_enabled(is_train):
        preds = model(*params, inputs)
        loss = loss_fn(preds, targets)
    return loss 

## Exercise
- Use model as : w2 * t_u ** 2 + w1 * t_u + b

In [None]:
# debug import pdb; pdb.set_trace()

data

In [92]:
#input
inputs =  [35.7, 55.9, 58.2, 81.9, 56.3, 48.9, 33.9, 21.8, 48.4, 60.4, 68.4]

#labels temp in C
targets =  [0.5, 14.0, 15.0, 28.0, 11.0, 8.0, 3.0, -4.0, 6.0, 13.0, 21.0]

#convert into tensor
inputs = torch.tensor(inputs)
targets = torch.tensor(targets)

Split data

In [93]:
n_samples = inputs.shape[0]
n_split = int(0.2 * n_samples)
n_split

2

In [94]:
#shuffle indicies 
shuffled_idx = torch.randperm(n_samples)

train_idx = shuffled_idx[:-n_split]
val_idx = shuffled_idx[-n_split:]

train_inputs = inputs[train_idx]
train_targets = targets[train_idx]

val_inputs = inputs[val_idx]
val_targets = targets[val_idx]

#scale inputs
train_inputs_sc = 0.1 * train_inputs
train_targets_sc = 0.1 * train_targets
train_inputs, val_inputs

(tensor([35.7000, 81.9000, 33.9000, 58.2000, 48.4000, 55.9000, 60.4000, 21.8000,
         56.3000]),
 tensor([48.9000, 68.4000]))

model

In [95]:
#Use 3 variable
def model(w1, w2, b, inputs=None):
    out = w2 * inputs**2 + w1 * inputs + b
    return out

loss function

In [96]:
def loss_fn(preds, targets):
    sqr_diff = (preds - targets)**2
    mean_sqr_diff = sqr_diff.mean()
    return mean_sqr_diff
    

In [97]:
def calc_forward(inputs,targets,params,is_train:bool=True):
    with torch.set_grad_enabled(is_train): #keep track gradient or not
        preds = model(*params,inputs)
        loss = loss_fn(preds, targets)
        return loss

In [134]:
def training_loop(n_epochs,optimizer,params,fn_forward,train_inputs,val_inputs,train_targets,val_targets):
    for epoch in range(1,n_epochs+1):
        #training stage
        # train_loss = fn_forward(train_inputs,train_targets,params)
        with torch.set_grad_enabled(True): #keep track gradient or not
            preds = model(*params,inputs)
            train_loss = loss_fn(preds, targets)
        #validating stage
        val_loss = fn_forward(val_inputs,val_targets,params,is_train=False)
        
        #backward
        optimizer.zero_grad()
        train_loss.backward()
        optimizer.step()
        
        #log_file
        # if epoch in {1,2,5,10,11,20,100,1000,4000,5000}:
        #     print(f"epoch: {epoch}")
        #     print(f"Train loss: {train_loss.item():.4f}")
        #     print(f"    params: {params}")
        #     print(f"  gradient: {params.grad}")
        #     print("===" *100)
        #log
        if epoch < 5 or epoch % 500 == 0:
            print(f"Epoch: {epoch}, Train loss: {train_loss.item():.4f}, Val loss: {val_loss.item():.4f}") 
        
    return params

In [135]:
params = torch.tensor([0.5,1.0,0.0],requires_grad=True)
learning_rate = 1e-3
optimizer = optim.Adam([params],lr = learning_rate)

In [136]:
training_loop(
    n_epochs= 5000,
    optimizer = optimizer,
    params = params,
    fn_forward = calc_forward,
    train_inputs= train_inputs_sc,
    val_inputs = val_inputs,
    train_targets = train_targets_sc,
    val_targets = val_targets 
)

Epoch: 1, Train loss: 11529303.0000, Val loss: 13904721.0000
Epoch: 2, Train loss: 11505981.0000, Val loss: 13876582.0000
Epoch: 3, Train loss: 11482681.0000, Val loss: 13848470.0000
Epoch: 4, Train loss: 11459407.0000, Val loss: 13820390.0000
Epoch: 500, Train loss: 3574321.0000, Val loss: 4308052.0000
Epoch: 1000, Train loss: 736202.5625, Val loss: 885813.9375
Epoch: 1500, Train loss: 83884.9219, Val loss: 100301.9531
Epoch: 2000, Train loss: 4306.6470, Val loss: 4970.8213
Epoch: 2500, Train loss: 100.8723, Val loss: 75.9647
Epoch: 3000, Train loss: 23.3180, Val loss: 8.7107
Epoch: 3500, Train loss: 22.8300, Val loss: 10.1078
Epoch: 4000, Train loss: 22.6845, Val loss: 10.0948
Epoch: 4500, Train loss: 22.4989, Val loss: 10.0065
Epoch: 5000, Train loss: 22.2625, Val loss: 9.8934


tensor([-0.4803,  0.0117, -0.9707], requires_grad=True)