In [1]:
import numpy as np
import torch

import metrics
import utils

# Gradient Descent

In [2]:
## Model for training

# Network dimensions: flattened input size, the two hidden-layer widths,
# and the number of output logits.
IN_SIZE = 28 * 28
HIDDEN1_SIZE = 500
HIDDEN2_SIZE = 256
OUT_SIZE = 10

# Training schedule shared by the optimizer cells: number of passes over the
# training loader and the batch size handed to the loader helpers.
NEPOCHS = 5
BATCH_SIZE = 64

# Loaders are built by the project-local `utils` module.
# NOTE(review): train_loader_01 / test_loader_01 are not referenced by the
# visible cells -- confirm whether they are still needed.
train_loader, test_loader = utils.load_mnist(BATCH_SIZE)
train_loader_01, test_loader_01 = utils.load_mnist_01(BATCH_SIZE)

def compute_accuracy(y_preds, y):
    """Count how many predictions agree with the reference labels.

    Args:
        y_preds: sequence/array of predicted labels.
        y: sequence/array of reference labels, compared element-wise
            against `y_preds`.

    Returns:
        A `(correct, total)` pair: the number of positions where both
        inputs are equal, and `len(y_preds)`.
    """
    n_samples = len(y_preds)
    # Element-wise equality mask; summing it counts the matches.
    hits = np.equal(y_preds, y)
    return hits.sum(), n_samples

def run_tests(net, test_loader):
    """Evaluate `net` on `test_loader` and print its accuracy.

    Predictions and reference labels are obtained from
    `utils.get_class_output`; the accuracy is printed as a fraction
    together with the raw correct/total counts. Nothing is returned.
    """
    predicted, expected = utils.get_class_output(net, test_loader)
    n_correct, n_total = compute_accuracy(predicted, expected)
    accuracy = float(n_correct) / n_total
    print('Test accuracy = {} ({}/{})'.format(accuracy, n_correct, n_total))

class Net(torch.nn.Module):
    """Three-layer fully connected classifier.

    Layer widths are IN_SIZE -> HIDDEN1_SIZE -> HIDDEN2_SIZE -> OUT_SIZE.
    A ReLU follows each of the first two linear layers; the output of the
    last layer is returned as raw logits.
    """

    def __init__(self):
        super().__init__()
        self.l1 = torch.nn.Linear(in_features=IN_SIZE, out_features=HIDDEN1_SIZE)
        self.l2 = torch.nn.Linear(in_features=HIDDEN1_SIZE, out_features=HIDDEN2_SIZE)
        self.l3 = torch.nn.Linear(in_features=HIDDEN2_SIZE, out_features=OUT_SIZE)

    def forward(self, x):
        """Return the logits for `x` after reshaping it to rows of IN_SIZE values."""
        flat = x.view(-1, IN_SIZE)
        hidden1 = self.l1(flat).relu()
        hidden2 = self.l2(hidden1).relu()
        return self.l3(hidden2)


# Cross-entropy computed from raw logits. With reduction='sum' the per-sample
# losses of a batch are added rather than averaged, so the loss value (and the
# gradient magnitude) grows with the batch size.
criterion = torch.nn.CrossEntropyLoss(reduction='sum')

## Stochastic gradient descent

- Go over the whole training set for several passes; each pass is called an epoch.
- For each epoch, divide the training set into random groups (batches) of a fixed batch size, and iterate over all the batches of data.
- For each batch, compute the gradient of the loss with respect to the weights, and update the weights according to the learning rate $\epsilon$:

$$\theta \leftarrow \theta - \epsilon \nabla_\theta J$$

In [3]:
# Plain SGD with a constant step size.
LEARNING_RATE = 0.001

net = Net()

k = 1  # running batch counter (the constant-rate update does not read it)
for epoch in range(NEPOCHS):

    for batch_idx, (X, y) in enumerate(train_loader):
        # Forward pass and summed loss for this mini-batch.
        logits = net(X)
        loss = criterion(logits, y)

        # Drop stale gradients, then backpropagate.
        net.zero_grad()
        loss.backward()

        # theta <- theta - lr * grad, applied outside of autograd tracking.
        lr_k = LEARNING_RATE
        with torch.no_grad():
            for param in net.parameters():
                param.sub_(lr_k * param.grad)

        k += 1
        if batch_idx % 300 == 0:
            print('{}[{}] Loss = {}'.format(epoch + 1, batch_idx + 1, loss.item()))

    # Report test accuracy once per epoch.
    run_tests(net, test_loader)

1[1] Loss = 147.0416259765625
1[301] Loss = 33.40361404418945
1[601] Loss = 28.80887794494629
1[901] Loss = 21.72443389892578
Test accuracy = 0.9229 (9229/10000)
2[1] Loss = 15.224136352539062
2[301] Loss = 13.496562004089355
2[601] Loss = 16.52216339111328
2[901] Loss = 7.9787516593933105
Test accuracy = 0.9389 (9389/10000)
3[1] Loss = 12.116883277893066
3[301] Loss = 11.854864120483398
3[601] Loss = 9.45172119140625
3[901] Loss = 16.465801239013672
Test accuracy = 0.9564 (9564/10000)
4[1] Loss = 12.34631061553955
4[301] Loss = 7.600798606872559
4[601] Loss = 7.513490200042725
4[901] Loss = 7.896974086761475
Test accuracy = 0.9603 (9603/10000)
5[1] Loss = 2.8604655265808105
5[301] Loss = 12.922957420349121
5[601] Loss = 12.881921768188477
5[901] Loss = 6.0581512451171875
Test accuracy = 0.965 (9650/10000)


## SGD with linearly decaying learning rate

The learning rate $\epsilon$ decreases linearly over the course of training.  
$\epsilon_0$: initial learning rate.  
$\epsilon_\tau$: final learning rate.  
$\tau$: number of iterations to reach $\epsilon_\tau$.

$$\alpha = \frac{k}{\tau}$$

Learning rate at iteration $k$:

$$
\epsilon_k = 
\begin{cases}
    (1 - \alpha)\epsilon_0 + \alpha \epsilon_\tau & \text{if } k \leq \tau\\
    \epsilon_\tau & \text{if } k > \tau
\end{cases}
$$

In [4]:
# SGD whose step size is interpolated linearly from LR_0 down to LR_T
# over T iterations (here: every batch of every epoch).
LR_0 = 0.005
LR_T = 0.0001
T = len(train_loader) * NEPOCHS

net = Net()

k = 1  # iteration counter driving the decay schedule
for epoch in range(NEPOCHS):

    for batch_idx, (X, y) in enumerate(train_loader):
        logits = net(X)
        loss = criterion(logits, y)

        net.zero_grad()
        loss.backward()

        # The schedule only depends on k, so evaluate it once per batch.
        # max() keeps the rate at LR_T once k exceeds T.
        progress = k / T
        lr_k = max(LR_T, (1 - progress) * LR_0 + progress * LR_T)

        with torch.no_grad():
            for param in net.parameters():
                param.sub_(lr_k * param.grad)

        k += 1
        if batch_idx % 300 == 0:
            print('{}[{}] Loss = {}'.format(epoch + 1, batch_idx + 1, loss.item()))

    run_tests(net, test_loader)

1[1] Loss = 147.7835693359375
1[301] Loss = 19.255510330200195
1[601] Loss = 15.425236701965332
1[901] Loss = 14.171222686767578
Test accuracy = 0.9362 (9362/10000)
2[1] Loss = 6.965445041656494
2[301] Loss = 10.846342086791992
2[601] Loss = 7.963724613189697
2[901] Loss = 9.435616493225098
Test accuracy = 0.9576 (9576/10000)
3[1] Loss = 3.6737990379333496
3[301] Loss = 6.052556991577148
3[601] Loss = 1.3734350204467773
3[901] Loss = 16.566186904907227
Test accuracy = 0.9685 (9685/10000)
4[1] Loss = 3.8570051193237305
4[301] Loss = 4.3883771896362305
4[601] Loss = 5.736208438873291
4[901] Loss = 9.423709869384766
Test accuracy = 0.9733 (9733/10000)
5[1] Loss = 5.546458721160889
5[301] Loss = 9.521862030029297
5[601] Loss = 0.7484488487243652
5[901] Loss = 2.3971590995788574
Test accuracy = 0.9759 (9759/10000)


### SGD with momentum

Keep track of an exponentially decaying moving average $v$ of all past gradients.  
Update the weights in the direction of $v$.  

$$v \leftarrow \alpha v - \epsilon \nabla_\theta J$$
$$\theta \leftarrow \theta + v$$  

$\epsilon$: learning rate.  
$\alpha$: momentum coefficient.  
The initial value of $v$ influences convergence; it is usually initialized to 0.

Common values for $\alpha$ are $0.5$, $0.9$ and $0.99$.

In [5]:
# SGD with classical momentum.
LR = 0.001
ALPHA = 0.9

net = Net()

# One zero-initialised velocity buffer per parameter tensor.
vs = [torch.zeros(p.shape, dtype=p.dtype) for p in net.parameters()]

k = 1
for epoch in range(NEPOCHS):

    for batch_idx, (X, y) in enumerate(train_loader):
        logits = net(X)
        loss = criterion(logits, y)

        net.zero_grad()
        loss.backward()

        params = list(net.parameters())
        with torch.no_grad():
            for param, velocity in zip(params, vs):
                # v <- alpha * v - lr * grad ; theta <- theta + v
                velocity.mul_(ALPHA).sub_(LR * param.grad)
                param.add_(velocity)

        k += 1
        if batch_idx % 300 == 0:
            print('{}[{}] Loss = {}'.format(epoch + 1, batch_idx + 1, loss.item()))

    run_tests(net, test_loader)

1[1] Loss = 147.8729248046875
1[301] Loss = 14.689855575561523
1[601] Loss = 5.42473030090332
1[901] Loss = 20.71593475341797
Test accuracy = 0.9366 (9366/10000)
2[1] Loss = 7.896501541137695
2[301] Loss = 2.8708600997924805
2[601] Loss = 12.732126235961914
2[901] Loss = 5.810671806335449
Test accuracy = 0.9523 (9523/10000)
3[1] Loss = 13.077378273010254
3[301] Loss = 8.24414348602295
3[601] Loss = 6.599118709564209
3[901] Loss = 4.978887557983398
Test accuracy = 0.9643 (9643/10000)
4[1] Loss = 8.29452133178711
4[301] Loss = 3.3276140689849854
4[601] Loss = 5.042981147766113
4[901] Loss = 9.86808967590332
Test accuracy = 0.9615 (9615/10000)
5[1] Loss = 12.975824356079102
5[301] Loss = 1.485975980758667
5[601] Loss = 5.177299976348877
5[901] Loss = 2.8250112533569336
Test accuracy = 0.9689 (9689/10000)


### SGD with Nesterov momentum

Evaluate the gradient after the current velocity has been applied.  

$$v \leftarrow \alpha v - \epsilon \nabla_\theta J(\theta + \alpha v)$$
$$\theta \leftarrow \theta + v$$

In [18]:
# SGD with Nesterov momentum: the gradient is taken at the look-ahead point
# theta + alpha * v instead of at theta.
LR = 0.001
ALPHA = 0.9

net = Net()

# One zero-initialised velocity buffer per parameter tensor.
vs = [torch.zeros(p.shape, dtype=p.dtype) for p in net.parameters()]

k = 1
for epoch in range(NEPOCHS):

    for batch_idx, (X, y) in enumerate(train_loader):
        params = list(net.parameters())

        # Temporarily move the weights to the look-ahead position.
        with torch.no_grad():
            for param, velocity in zip(params, vs):
                param.add_(ALPHA * velocity)

        logits = net(X)
        loss = criterion(logits, y)

        net.zero_grad()
        loss.backward()

        with torch.no_grad():
            for param, velocity in zip(params, vs):
                # Undo the look-ahead shift before applying the real update.
                param.sub_(ALPHA * velocity)
                # v <- alpha * v - lr * grad(look-ahead) ; theta <- theta + v
                velocity.mul_(ALPHA).sub_(LR * param.grad)
                param.add_(velocity)

        k += 1
        if batch_idx % 300 == 0:
            print('{}[{}] Loss = {}'.format(epoch + 1, batch_idx + 1, loss.item()))

    run_tests(net, test_loader)

1[1] Loss = 148.1569366455078
1[301] Loss = 22.189186096191406
1[601] Loss = 25.577085494995117
1[901] Loss = 17.550888061523438
Test accuracy = 0.9499 (9499/10000)
2[1] Loss = 11.429716110229492
2[301] Loss = 11.429800033569336
2[601] Loss = 14.804403305053711
2[901] Loss = 12.393882751464844
Test accuracy = 0.9615 (9615/10000)
3[1] Loss = 9.213675498962402
3[301] Loss = 3.9947948455810547
3[601] Loss = 9.342105865478516
3[901] Loss = 2.3476529121398926
Test accuracy = 0.9697 (9697/10000)
4[1] Loss = 0.6105036735534668
4[301] Loss = 1.2539119720458984
4[601] Loss = 3.8320446014404297
4[901] Loss = 3.2328386306762695
Test accuracy = 0.968 (9680/10000)
5[1] Loss = 3.1399521827697754
5[301] Loss = 2.3626508712768555
5[601] Loss = 12.396028518676758
5[901] Loss = 3.1604554653167725
Test accuracy = 0.9726 (9726/10000)


### Adagrad 

Scale gradients by the inverse square root of the sum of all previous squared gradients.  

$$r\leftarrow r + \nabla_\theta J \odot \nabla_\theta J$$
$$\theta \leftarrow \theta - \frac{\epsilon}{\sqrt{\delta + r}} \odot \nabla_\theta J$$  

$\epsilon$: learning rate.  
$\delta$: small constant ($10^{-7}$) to avoid division by $0$.  
$r$ is initialized at $0$

Large accumulated gradients result in a small effective learning rate.  
Small accumulated gradients result in a large effective learning rate.  
This yields greater progress along the more gently sloped directions of parameter space.  
Fast for convex optimization, but might also work for deep learning.

In [19]:
# Adagrad: per-weight step sizes scaled by the accumulated squared gradients.
LR = 0.001
DELTA = 1e-7

NEPOCHS = 5

net = Net()

# One zero-initialised squared-gradient accumulator per parameter tensor.
rs = [torch.zeros(p.shape, dtype=p.dtype) for p in net.parameters()]

k = 1
for epoch in range(NEPOCHS):

    for batch_idx, (X, y) in enumerate(train_loader):
        logits = net(X)
        loss = criterion(logits, y)

        net.zero_grad()
        loss.backward()

        params = list(net.parameters())
        with torch.no_grad():
            for param, accum in zip(params, rs):
                grad = param.grad
                # r <- r + grad * grad
                accum.add_(grad ** 2)
                # theta <- theta - lr / sqrt(delta + r) * grad
                param.sub_(LR / torch.sqrt(DELTA + accum) * grad)

        k += 1
        if batch_idx % 300 == 0:
            print('{}[{}] Loss = {}'.format(epoch + 1, batch_idx + 1, loss.item()))

    run_tests(net, test_loader)

1[1] Loss = 147.806884765625
1[301] Loss = 38.82220458984375
1[601] Loss = 30.04961395263672
1[901] Loss = 24.438278198242188
Test accuracy = 0.8982 (8982/10000)
2[1] Loss = 23.91636085510254
2[301] Loss = 33.47201919555664
2[601] Loss = 17.666114807128906
2[901] Loss = 20.431621551513672
Test accuracy = 0.9089 (9089/10000)
3[1] Loss = 11.867258071899414
3[301] Loss = 26.43012237548828
3[601] Loss = 15.245670318603516
3[901] Loss = 14.666797637939453
Test accuracy = 0.9136 (9136/10000)
4[1] Loss = 16.747650146484375
4[301] Loss = 16.740951538085938
4[601] Loss = 17.106903076171875
4[901] Loss = 21.785045623779297
Test accuracy = 0.9181 (9181/10000)
5[1] Loss = 29.080963134765625
5[301] Loss = 12.126216888427734
5[601] Loss = 12.868086814880371
5[901] Loss = 9.228414535522461
Test accuracy = 0.9211 (9211/10000)


### RMSProp

It is a modified version of Adagrad.  
$r$ is now an exponentially decaying average.  

$$r\leftarrow \rho r + (1 - \rho) \nabla_\theta J \odot \nabla_\theta J$$
$$\theta \leftarrow \theta - \frac{\epsilon}{\sqrt{\delta + r}} \odot \nabla_\theta J$$  

$\rho$: decay rate.  
$\epsilon$: learning rate.  
$\delta$: small constant ($10^{-6}$) to avoid division by $0$.

It gives better results than Adagrad for deep models

In [17]:
# RMSProp: Adagrad with an exponentially decaying squared-gradient average.
LR = 0.001
DELTA = 1e-6
DECAY = 0.9

NEPOCHS = 5

net = Net()

# One zero-initialised squared-gradient average per parameter tensor.
rs = [torch.zeros(p.shape, dtype=p.dtype) for p in net.parameters()]

k = 1
for epoch in range(NEPOCHS):

    for batch_idx, (X, y) in enumerate(train_loader):
        logits = net(X)
        loss = criterion(logits, y)

        net.zero_grad()
        loss.backward()

        params = list(net.parameters())
        with torch.no_grad():
            for param, avg_sq in zip(params, rs):
                grad = param.grad
                # r <- rho * r + (1 - rho) * grad * grad
                avg_sq.mul_(DECAY).add_((1 - DECAY) * grad ** 2)
                # theta <- theta - lr / sqrt(delta + r) * grad
                param.sub_(LR / torch.sqrt(DELTA + avg_sq) * grad)

        k += 1
        if batch_idx % 300 == 0:
            print('{}[{}] Loss = {}'.format(epoch + 1, batch_idx + 1, loss.item()))

    run_tests(net, test_loader)

1[1] Loss = 147.15213012695312
1[301] Loss = 16.530723571777344
1[601] Loss = 13.429508209228516
1[901] Loss = 7.354430675506592
Test accuracy = 0.9501 (9501/10000)
2[1] Loss = 11.361360549926758
2[301] Loss = 13.25605297088623
2[601] Loss = 8.930535316467285
2[901] Loss = 2.5876593589782715
Test accuracy = 0.9653 (9653/10000)
3[1] Loss = 1.5092804431915283
3[301] Loss = 2.7342429161071777
3[601] Loss = 1.9135382175445557
3[901] Loss = 3.1017608642578125
Test accuracy = 0.9663 (9663/10000)
4[1] Loss = 2.0715930461883545
4[301] Loss = 2.3610613346099854
4[601] Loss = 0.3909754753112793
4[901] Loss = 5.248087406158447
Test accuracy = 0.9693 (9693/10000)
5[1] Loss = 3.4261670112609863
5[301] Loss = 1.4230625629425049
5[601] Loss = 3.818699836730957
5[901] Loss = 8.033636093139648
Test accuracy = 0.9736 (9736/10000)


## RMSProp with Nesterov momentum

Add Nesterov momentum to RMSProp: 

$$g \leftarrow \nabla_\theta J(\theta + \alpha v)$$
$$r\leftarrow \rho r + (1 - \rho)\, g \odot g$$
$$v \leftarrow \alpha v - \frac{\epsilon}{\sqrt{\delta + r}} \odot g$$  
$$\theta \leftarrow \theta + v$$  

$v$ is a moving average of the scaled gradients

In [20]:
# RMSProp combined with Nesterov momentum: gradients are taken at the
# look-ahead point and scaled by the decaying squared-gradient average.
LR = 0.001
DELTA = 1e-6
DECAY = 0.9
ALPHA = 0.9

NEPOCHS = 5

net = Net()

# Per-parameter state: squared-gradient averages (rs) and velocities (vs).
rs = [torch.zeros(p.shape, dtype=p.dtype) for p in net.parameters()]
vs = [torch.zeros(p.shape, dtype=p.dtype) for p in net.parameters()]

k = 1
for epoch in range(NEPOCHS):

    for batch_idx, (X, y) in enumerate(train_loader):
        params = list(net.parameters())

        # Temporarily move the weights to the look-ahead position.
        with torch.no_grad():
            for param, velocity in zip(params, vs):
                param.add_(ALPHA * velocity)

        logits = net(X)
        loss = criterion(logits, y)

        net.zero_grad()
        loss.backward()

        with torch.no_grad():
            for param, avg_sq, velocity in zip(params, rs, vs):
                grad = param.grad
                # Undo the look-ahead shift before applying the real update.
                param.sub_(ALPHA * velocity)
                # r <- rho * r + (1 - rho) * grad * grad
                avg_sq.mul_(DECAY).add_((1 - DECAY) * grad ** 2)
                # v <- alpha * v - lr / sqrt(delta + r) * grad
                velocity.mul_(ALPHA).sub_(LR / torch.sqrt(DELTA + avg_sq) * grad)
                param.add_(velocity)

        k += 1
        if batch_idx % 300 == 0:
            print('{}[{}] Loss = {}'.format(epoch + 1, batch_idx + 1, loss.item()))

    run_tests(net, test_loader)

1[1] Loss = 147.29151916503906
1[301] Loss = 9.260927200317383
1[601] Loss = 19.946502685546875
1[901] Loss = 25.767475128173828
Test accuracy = 0.927 (9270/10000)
2[1] Loss = 13.981639862060547
2[301] Loss = 19.20871925354004
2[601] Loss = 8.406815528869629
2[901] Loss = 8.474459648132324
Test accuracy = 0.9257 (9257/10000)
3[1] Loss = 9.32218074798584
3[301] Loss = 11.439027786254883
3[601] Loss = 3.137418031692505
3[901] Loss = 32.93860626220703
Test accuracy = 0.9354 (9354/10000)
4[1] Loss = 11.794574737548828
4[301] Loss = 9.430360794067383
4[601] Loss = 12.386016845703125
4[901] Loss = 19.25663948059082
Test accuracy = 0.9275 (9275/10000)
5[1] Loss = 1.8568296432495117
5[301] Loss = 32.5167121887207
5[601] Loss = 8.295333862304688
5[901] Loss = 26.667999267578125
Test accuracy = 0.9375 (9375/10000)


### Adam

Compute an estimate of the first-order and second-order moments.  
Then correct the bias on these estimates.  
These corrected estimates are then used to compute the weight updates.

$$s \leftarrow \rho_1 s + (1 - \rho_1) \nabla_\theta J$$
$$r \leftarrow \rho_2 r + (1 - \rho_2) \nabla_\theta J \odot \nabla_\theta J$$
$$\hat{s} \leftarrow \frac{s}{1 - \rho_1^t}$$
$$\hat{r} \leftarrow \frac{r}{1 - \rho_2^t}$$
$$\theta \leftarrow \theta - \epsilon \frac{\hat{s}}{\sqrt{\hat{r}} + \delta}$$  

$t$: iteration counter.  
$s$ and $r$: first- and second-order moment estimates, respectively.  
$\hat{s}$ and $\hat{r}$: bias-corrected first- and second-order moment estimates, respectively.  

$\epsilon$: learning rate (default: $0.001$)  
$\rho_1$: decay rate for first-order moments estimates (default: $0.9$)   
$\rho_2$: decay rate for second-order moments estimates (default: $0.999$)   
$\delta$: small constant to avoid division by zero (default: $10^{-8}$)

In [27]:
# Adam: bias-corrected first- and second-moment estimates of the gradient
# drive a per-weight update.
LR = 0.001
DELTA = 1e-8
DEC1 = 0.9
DEC2 = 0.999

NEPOCHS = 5

net = Net()

# Per-parameter state, zero-initialised: first moments (s) and second moments (r).
s = [torch.zeros(p.shape, dtype=p.dtype) for p in net.parameters()]
r = [torch.zeros(p.shape, dtype=p.dtype) for p in net.parameters()]

t = 1  # optimisation step counter used by the bias correction
for epoch in range(NEPOCHS):

    for batch_idx, (X, y) in enumerate(train_loader):
        y_logits = net(X)
        loss = criterion(y_logits, y)
        net.zero_grad()
        loss.backward()

        # Bias-correction denominators depend only on the step count, so they
        # are shared by every parameter tensor in this iteration.
        s_correction = 1 - DEC1 ** t
        r_correction = 1 - DEC2 ** t

        params = list(net.parameters())
        with torch.no_grad():
            for w, s_i, r_i in zip(params, s, r):
                grad = w.grad
                # s <- rho1 * s + (1 - rho1) * grad
                s_i.mul_(DEC1).add_((1 - DEC1) * grad)
                # r <- rho2 * r + (1 - rho2) * grad * grad
                r_i.mul_(DEC2).add_((1 - DEC2) * grad ** 2)
                sc = s_i / s_correction
                rc = r_i / r_correction
                # theta <- theta - lr * s_hat / (sqrt(r_hat) + delta)
                w.sub_(LR * sc / (torch.sqrt(rc) + DELTA))

        # Advance the step counter once per batch, not once per parameter
        # tensor, so that every tensor sees the same bias correction.
        t += 1

        if batch_idx % 300 == 0:
            print('{}[{}] Loss = {}'.format(epoch + 1, 
                                            batch_idx+1, loss.data))
    
    run_tests(net, test_loader)

1[1] Loss = 146.7119140625
1[301] Loss = 9.362897872924805
1[601] Loss = 18.847803115844727
1[901] Loss = 9.547917366027832
Test accuracy = 0.9514 (9514/10000)
2[1] Loss = 11.213214874267578
2[301] Loss = 5.74330472946167
2[601] Loss = 13.582456588745117
2[901] Loss = 19.507999420166016
Test accuracy = 0.9652 (9652/10000)
3[1] Loss = 1.296182632446289
3[301] Loss = 2.2929320335388184
3[601] Loss = 7.244189262390137
3[901] Loss = 3.0191147327423096
Test accuracy = 0.9683 (9683/10000)
4[1] Loss = 3.5898241996765137
4[301] Loss = 0.941504716873169
4[601] Loss = 7.777242660522461
4[901] Loss = 3.063556671142578
Test accuracy = 0.9743 (9743/10000)
5[1] Loss = 1.0956976413726807
5[301] Loss = 3.640176773071289
5[601] Loss = 0.3000774383544922
5[901] Loss = 3.1564254760742188
Test accuracy = 0.9765 (9765/10000)
