In [1]:
%matplotlib inline
import numpy as np
import torch
# Keep printed tensors compact: summarize with 2 edge items per dimension
# and wrap the printed output at 75 characters per line.
torch.set_printoptions(edgeitems=2, linewidth=75)

In [2]:
# Eleven paired measurements: t_c is the target the model is fitted to and
# t_u is the raw input reading (units are not stated here -- presumably
# t_c is in degrees Celsius; confirm against the data source).
t_c = torch.tensor([
    0.5, 14.0, 15.0, 28.0, 11.0, 8.0, 3.0, -4.0, 6.0, 13.0, 21.0,
])
t_u = torch.tensor([
    35.7, 55.9, 58.2, 81.9, 56.3, 48.9, 33.9, 21.8, 48.4, 60.4, 68.4,
])

# Rescaled copy of the input (factor 0.1), used for the SGD runs below.
t_un = t_u * 0.1

In [3]:
def model(t_u, w, b):
    """Linear model: scale the input ``t_u`` by weight ``w`` and add bias ``b``.

    Returns ``w * t_u + b``; works element-wise when ``t_u`` is a tensor.
    """
    scaled = w * t_u
    return scaled + b

In [4]:
def loss_fn(t_p, t_c):
    """Mean squared error between predictions ``t_p`` and targets ``t_c``.

    Both arguments must support subtraction, ``** 2`` and ``.mean()``
    (e.g. tensors); the mean of the squared differences is returned.
    """
    residuals = t_p - t_c
    return (residuals ** 2).mean()

## 5.5.2 Optimizers a la carte


In [6]:
# NOTE(review): notebook imports usually live in the first cell; this one is
# kept here so the cell stays runnable on its own.
import torch.optim as optim
# List the names exposed by torch.optim to see which optimizers are available.
dir(optim)

['ASGD',
 'Adadelta',
 'Adagrad',
 'Adam',
 'AdamW',
 'Adamax',
 'LBFGS',
 'NAdam',
 'Optimizer',
 'RAdam',
 'RMSprop',
 'Rprop',
 'SGD',
 'SparseAdam',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__path__',
 '__spec__',
 '_functional',
 '_multi_tensor',
 'lr_scheduler',
 'swa_utils']

Every optimizer constructor takes a list of parameters (aka PyTorch tensors, typically
with requires_grad set to True) as the first input.

NOTE: page 158 has a very good explanation of how optimizers work, together with the backward pass they rely on.

Let’s create params and instantiate a gradient descent optimizer:

In [15]:
learning_rate = 1e-2
# Initial (w, b); requires_grad=True so autograd tracks operations on them.
params = torch.tensor([1.0, 0.0], requires_grad=True)
# SGD stands for stochastic gradient descent ("stochastic" = a random approach).
optimizer = optim.SGD([params], lr=learning_rate)

In [17]:
# One manual optimization step on the rescaled input t_un.
optimizer.zero_grad()  # clear gradients left over from earlier backward() calls
t_p = model(t_un, *params)
loss = loss_fn(t_p, t_c)
loss.backward()   # populate params.grad
optimizer.step()  # update params in place using params.grad
params

tensor([2.0848, 0.1303], requires_grad=True)

Had we called the previous code in a loop, gradients would have accumulated in the leaves at every call to backward, and our gradient
descent would have been all over the place!<br> So don't forget to ZERO THE GRADIENTS before each backward pass.

#### Now let's put this in a loop

In [18]:
def training_loop(n_epochs, optimizer, params, t_u, t_c):
    """Fit ``params`` to the data by running ``n_epochs`` optimizer steps.

    Each epoch does a forward pass with ``model``, computes ``loss_fn``
    against ``t_c``, zeroes the gradients, back-propagates and lets
    ``optimizer`` update the parameters. The loss is printed every 500
    epochs.

    Returns:
        The ``params`` object that was passed in (updated by the optimizer).
    """
    for epoch in range(1, n_epochs + 1):
        loss = loss_fn(model(t_u, *params), t_c)

        # Gradients accumulate across backward() calls, so reset them first.
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        if epoch % 500:
            continue
        print('Epoch %d, Loss %f' % (epoch, float(loss)))
    return params

In [19]:
# Fresh starting point (w=1, b=0) and a new SGD optimizer bound to it.
learning_rate = 1e-2
params = torch.tensor([1.0, 0.0], requires_grad=True)
optimizer = optim.SGD([params], lr=learning_rate)

In [20]:
# Train with SGD for 5000 epochs on the rescaled input t_un.
training_loop(
    n_epochs=5000,
    optimizer=optimizer,
    params=params,
    t_u=t_un,
    t_c=t_c,
)

Epoch 500, Loss 7.860115
Epoch 1000, Loss 3.828538
Epoch 1500, Loss 3.092191
Epoch 2000, Loss 2.957698
Epoch 2500, Loss 2.933134
Epoch 3000, Loss 2.928648
Epoch 3500, Loss 2.927830
Epoch 4000, Loss 2.927679
Epoch 4500, Loss 2.927652
Epoch 5000, Loss 2.927647


tensor([  5.3671, -17.3012], requires_grad=True)

### A new optimizer: Adam instead of SGD

Adam is less sensitive to the scaling of the variables and to the learning rate, so we no longer have to multiply the input t_u by 0.1 and can train on the raw t_u.

In [21]:
# Same starting point as before, but with a larger learning rate and the
# Adam optimizer instead of SGD.
learning_rate = 1e-1
params = torch.tensor([1.0, 0.0], requires_grad=True)
optimizer = optim.Adam([params], lr=learning_rate)

In [22]:
# Train with Adam for 2000 epochs.
training_loop(
    n_epochs=2000,
    optimizer=optimizer,
    params=params,
    t_u=t_u,  # the original, unscaled t_u
    t_c=t_c,
)

Epoch 500, Loss 7.612898
Epoch 1000, Loss 3.086700
Epoch 1500, Loss 2.928579
Epoch 2000, Loss 2.927644


tensor([  0.5367, -17.3021], requires_grad=True)

## 5.5.3 Training, validation, and overfitting


A simpler model may not fit the training data as perfectly as a
more complicated model would, but it will likely behave more regularly in between
data points.

Rule of thumb: increase the size of the model until it fits the training data,
and then scale it down until it stops overfitting.

SPLITTING A DATASET<br>
Shuffling the elements of a tensor amounts to finding a permutation of its indices.
The randperm function does exactly this:

In [24]:
n_samples = t_u.shape[0]  # number of data points
n_val = int(0.2 * n_samples)  # hold out 20% of the samples for validation
n_train = n_samples - n_val

# Use a dedicated, seeded generator so the split is the same after a kernel
# restart, without touching PyTorch's global random state.
split_rng = torch.Generator().manual_seed(0)
shuffled_indices = torch.randperm(n_samples, generator=split_rng)

# Slice by n_train counted from the front. The `[:-n_val]` / `[-n_val:]` form
# silently inverts the split when n_val == 0 (fewer than 5 samples): `[:-0]`
# is empty and `[-0:]` is the whole tensor.
train_indices = shuffled_indices[:n_train]
val_indices = shuffled_indices[n_train:]

train_indices, val_indices



(tensor([ 9,  0, 10,  7,  1,  8,  2,  5,  4]), tensor([6, 3]))

In [26]:
# Select the training and validation rows using the shuffled index tensors.
train_t_u, train_t_c = t_u[train_indices], t_c[train_indices]
val_t_u, val_t_c = t_u[val_indices], t_c[val_indices]

# Rescaled inputs (factor 0.1), matching how t_un was derived from t_u.
train_t_un, val_t_un = 0.1 * train_t_u, 0.1 * val_t_u

Our training loop doesn’t really change. We just want to additionally evaluate the validation loss at every epoch, to have a chance to recognize whether we’re overfitting

In [28]:
def training_loop(n_epochs, optimizer, params, train_t_u, val_t_u, train_t_c, val_t_c):
    """Train on the training split while also reporting the validation loss.

    Every epoch evaluates ``model`` / ``loss_fn`` on both splits, but only
    the training loss is back-propagated. Losses are printed for the first
    three epochs and then every 500 epochs.

    Returns:
        The ``params`` object that was passed in (updated by the optimizer).
    """
    for epoch in range(1, n_epochs + 1):
        train_loss = loss_fn(model(train_t_u, *params), train_t_c)
        val_loss = loss_fn(model(val_t_u, *params), val_t_c)

        optimizer.zero_grad()
        # There is no val_loss.backward(): we do not want to train the model
        # on the validation set.
        train_loss.backward()
        optimizer.step()

        should_log = epoch <= 3 or epoch % 500 == 0
        if should_log:
            print(
                f"Epoch {epoch}, Training loss {train_loss.item():.4f},"
                f" Validation loss {val_loss.item():.4f}"
            )
    return params

In [30]:
# Reset the parameters to (w=1, b=0) and create a fresh SGD optimizer.
learning_rate = 1e-2
params = torch.tensor([1.0, 0.0], requires_grad=True)
optimizer = optim.SGD([params], lr=learning_rate)

We are using SGD again, so we train on the normalized inputs (train_t_un / val_t_un).

In [32]:
# Train with SGD for 3000 epochs on the rescaled training/validation inputs.
training_loop(
    n_epochs=3000,
    optimizer=optimizer,
    params=params,
    train_t_u=train_t_un,
    val_t_u=val_t_un,
    train_t_c=train_t_c,
    val_t_c=val_t_c,
)

Epoch 1, Training loss 2.7439, Validation loss 4.7369
Epoch 2, Training loss 2.7438, Validation loss 4.7368
Epoch 3, Training loss 2.7438, Validation loss 4.7367
Epoch 500, Training loss 2.7362, Validation loss 4.6969
Epoch 1000, Training loss 2.7340, Validation loss 4.6816
Epoch 1500, Training loss 2.7333, Validation loss 4.6751
Epoch 2000, Training loss 2.7331, Validation loss 4.6722
Epoch 2500, Training loss 2.7331, Validation loss 4.6707
Epoch 3000, Training loss 2.7331, Validation loss 4.6700


tensor([  5.3196, -17.4450], requires_grad=True)

## 5.5.4 Autograd nits and switching it off

There’s another element for discussion here. Since we’re not ever calling backward on val_loss, why are we building the graph in the first place? We could in fact
just call model and loss_fn as plain functions, without tracking the computation.
However optimized, building the autograd graph comes with additional costs that we
could totally forgo during the validation pass, especially when the model has millions
of parameters.

In [33]:
def training_loop(n_epochs, optimizer, params, train_t_u, val_t_u, train_t_c, val_t_c):
    """Train on the training split; evaluate the validation split without autograd.

    Same contract as the earlier train/validation loop, except that the
    validation forward pass runs under ``torch.no_grad()`` so no autograd
    graph is built for it. Losses are printed for the first three epochs and
    then every 500 epochs.

    Args:
        n_epochs: Number of optimization steps to run.
        optimizer: Optimizer already constructed over ``params``.
        params: Tensor unpacked as ``(w, b)`` into ``model``.
        train_t_u, train_t_c: Training inputs and targets.
        val_t_u, val_t_c: Validation inputs and targets.

    Returns:
        The ``params`` object that was passed in (updated by the optimizer).
    """
    for epoch in range(1, n_epochs + 1):
        train_t_p = model(train_t_u, *params)
        train_loss = loss_fn(train_t_p, train_t_c)

        # Switch autograd off for the validation pass: backward() is never
        # called on val_loss, so tracking it would be wasted work.
        with torch.no_grad():
            val_t_p = model(val_t_u, *params)
            val_loss = loss_fn(val_t_p, val_t_c)
            assert not val_loss.requires_grad

        optimizer.zero_grad()
        train_loss.backward()
        optimizer.step()

        # Report progress and return the parameters, as the previous
        # definition of this function (which this one shadows) did.
        if epoch <= 3 or epoch % 500 == 0:
            print(f"Epoch {epoch}, Training loss {train_loss.item():.4f},"
                  f" Validation loss {val_loss.item():.4f}")
    return params

We could, for instance,
define a calc_forward function that takes data as input and runs model and loss_fn
with or without autograd according to a Boolean is_train argument:

In [34]:
# Presumably this would then be called from inside the training loop?
def calc_forward(t_u, t_c, is_train):
    """Run the forward pass and return the loss, with autograd on or off.

    Args:
        t_u: Model inputs.
        t_c: Targets the predictions are compared against.
        is_train: When true, autograd is enabled for the forward pass;
            when false, it is disabled.

    NOTE(review): ``params`` is read from the enclosing (notebook) scope
    rather than passed in, so it must be defined before this is called.
    """
    with torch.set_grad_enabled(is_train):
        return loss_fn(model(t_u, *params), t_c)