In [1]:
import sys
sys.path.append('../../pyutils')

import numpy as np
import torch

import metrics
import utils

# Gradient Descent

In [2]:
## Model for training

IN_SIZE = 28 * 28
HIDDEN1_SIZE = 500
HIDDEN2_SIZE = 256
OUT_SIZE = 10

NEPOCHS = 5
BATCH_SIZE = 64

train_loader, test_loader = utils.load_mnist(BATCH_SIZE)
train_loader_01, test_loader_01 = utils.load_mnist_01(BATCH_SIZE)

def compute_accuracy(y_preds, y):
    total = len(y_preds)
    correct = np.equal(y_preds, y).sum()
    return correct, total

def run_tests(net, test_loader):
    test_y_preds, test_y_true = utils.get_class_output(net, test_loader)
    correct, total = compute_accuracy(test_y_preds, test_y_true)
    acc = float(correct) / total
    print('Test accuracy = {} ({}/{})'.format(acc, correct, total))

class Net(torch.nn.Module):

    def __init__(self):
        super(Net, self).__init__()
        self.l1 = torch.nn.Linear(IN_SIZE , HIDDEN1_SIZE)
        self.l2 = torch.nn.Linear(HIDDEN1_SIZE, HIDDEN2_SIZE)
        self.l3 = torch.nn.Linear(HIDDEN2_SIZE, OUT_SIZE)

    def forward(self, x):
        x = x.view(-1, IN_SIZE)
        x = torch.relu(self.l1(x))
        x = torch.relu(self.l2(x))
        y_logits = self.l3(x)
        return y_logits


criterion = torch.nn.CrossEntropyLoss(reduction='sum')

## Stochastic gradient descent

- Go over the whole training set for several passes, each pass is called an epoch
- For each epoch, divide the training set into x random grousps with a fixed bath size, and iterative over all the batchs of data
- For each batch, compute the gradient of the weights, and update them accordingly to the learning rate $\epsilon$:

$$\theta \leftarrow \theta - \epsilon \nabla_\theta J$$

In [3]:
LEARNING_RATE = 0.001

net = Net()

k = 1
for epoch in range(NEPOCHS):

    for batch_idx, (X, y) in enumerate(train_loader):
        y_logits = net(X)
        loss = criterion(y_logits, y)
    
        net.zero_grad()
        loss.backward()
        for w in net.parameters():
            lr_k = LEARNING_RATE
            w.data.sub_(w.grad.data * lr_k)
    
    
        k += 1
        if batch_idx % 300 == 0:
            print('{}[{}] Loss = {}'.format(epoch + 1, 
                                            batch_idx+1, loss.data))
    
    run_tests(net, test_loader)

1[1] Loss = 147.12203979492188
1[301] Loss = 35.03900909423828
1[601] Loss = 37.909244537353516
1[901] Loss = 10.858194351196289
Test accuracy = 0.9222 (9222/10000)
2[1] Loss = 10.571956634521484
2[301] Loss = 9.005818367004395
2[601] Loss = 11.808006286621094
2[901] Loss = 15.839620590209961
Test accuracy = 0.9414 (9414/10000)
3[1] Loss = 8.340900421142578
3[301] Loss = 9.040695190429688
3[601] Loss = 16.13271141052246
3[901] Loss = 11.940797805786133
Test accuracy = 0.9545 (9545/10000)
4[1] Loss = 13.607768058776855
4[301] Loss = 5.404769420623779
4[601] Loss = 6.2489190101623535
4[901] Loss = 7.608024597167969
Test accuracy = 0.9563 (9563/10000)
5[1] Loss = 5.324291706085205
5[301] Loss = 8.201653480529785
5[601] Loss = 4.807262420654297
5[901] Loss = 1.8956794738769531
Test accuracy = 0.9662 (9662/10000)


## SGD with linearly decaying learning rate

The learning rate $\epsilon$ linearly decrease over the whole training.  
$\epsilon_0$: initial learning rate.  
$\epsilon_\tau$: final learning rate.  
$\tau$: number of iterations to reach $\epsilon_\tau$.

$$\alpha = \frac{k}{\tau}$$

Learning rate at iteration $k$:

$$
\epsilon_k = 
\begin{cases}
    (1 - \alpha)\epsilon_0 + \alpha \epsilon_\tau & \text{if } k \leq \tau\\
    \epsilon_\tau & \text{if } k > \tau
\end{cases}
$$

In [4]:
LR_0 = 0.005
LR_T = 0.0001
T = len(train_loader) * NEPOCHS

net = Net()

k = 1
for epoch in range(NEPOCHS):

    for batch_idx, (X, y) in enumerate(train_loader):
        y_logits = net(X)
        loss = criterion(y_logits, y)
    
        net.zero_grad()
        loss.backward()
        for w in net.parameters():
            lr_k = max(LR_T, (1 - k/T) * LR_0 + k/T * LR_T)
            w.data.sub_(w.grad.data * lr_k)
    
    
        k += 1
        if batch_idx % 300 == 0:
            print('{}[{}] Loss = {}'.format(epoch + 1, 
                                            batch_idx+1, loss.data))
    
    run_tests(net, test_loader)

1[1] Loss = 147.43655395507812
1[301] Loss = 31.766691207885742
1[601] Loss = 26.692150115966797
1[901] Loss = 17.659358978271484
Test accuracy = 0.9371 (9371/10000)
2[1] Loss = 6.750217914581299
2[301] Loss = 2.5656964778900146
2[601] Loss = 3.859900712966919
2[901] Loss = 16.458829879760742
Test accuracy = 0.9605 (9605/10000)
3[1] Loss = 4.471502780914307
3[301] Loss = 1.9399080276489258
3[601] Loss = 4.323775768280029
3[901] Loss = 9.659144401550293
Test accuracy = 0.9718 (9718/10000)
4[1] Loss = 1.2469983100891113
4[301] Loss = 2.9788525104522705
4[601] Loss = 6.375918388366699
4[901] Loss = 4.068788051605225
Test accuracy = 0.976 (9760/10000)
5[1] Loss = 2.2555437088012695
5[301] Loss = 5.581851482391357
5[601] Loss = 2.3077499866485596
5[901] Loss = 4.521398544311523
Test accuracy = 0.9775 (9775/10000)


### SGD with momemtum

Keep tracks of an exponentially decaying moving average of all gradients $v$.  
Update the wieghts in the direction of v.  

$$v \leftarrow \alpha v - \epsilon \nabla_\theta J$$
$$\theta \leftarrow \theta + v$$  

$\epsilon$: learning rate.  
$\alpha$: momemtum coefficient.  
The initial value of $v$ influence the converge, it usually starts at 0.

Commom values for $\alpha$ are $0.5$, $0.9$ and $0.99$

In [5]:
LR = 0.001
ALPHA = 0.9

net = Net()

vs = list()
for w in net.parameters():
    v = torch.zeros(w.shape, dtype=w.dtype)
    vs.append(v)

k = 1
for epoch in range(NEPOCHS):

    for batch_idx, (X, y) in enumerate(train_loader):
        y_logits = net(X)
        loss = criterion(y_logits, y)

        net.zero_grad()
        loss.backward()
        params = list(net.parameters())
        for i in range(len(params)):
            w = params[i]
            vs[i].data = ALPHA * vs[i].data - LR * w.grad.data
            w.data.add_(vs[i].data)
    
    
        k += 1
        if batch_idx % 300 == 0:
            print('{}[{}] Loss = {}'.format(epoch + 1, 
                                            batch_idx+1, loss.data))
    
    run_tests(net, test_loader)

1[1] Loss = 147.76861572265625
1[301] Loss = 10.775410652160645
1[601] Loss = 13.397440910339355
1[901] Loss = 15.353017807006836
Test accuracy = 0.9441 (9441/10000)
2[1] Loss = 8.902735710144043
2[301] Loss = 6.238028526306152
2[601] Loss = 7.800531387329102
2[901] Loss = 18.204063415527344
Test accuracy = 0.9566 (9566/10000)
3[1] Loss = 8.70993423461914
3[301] Loss = 13.134697914123535
3[601] Loss = 3.8357884883880615
3[901] Loss = 11.406211853027344
Test accuracy = 0.9635 (9635/10000)
4[1] Loss = 1.6417672634124756
4[301] Loss = 1.8813347816467285
4[601] Loss = 4.965744495391846
4[901] Loss = 8.417074203491211
Test accuracy = 0.9665 (9665/10000)
5[1] Loss = 7.727075099945068
5[301] Loss = 6.806301593780518
5[601] Loss = 14.669434547424316
5[901] Loss = 4.21833610534668
Test accuracy = 0.9715 (9715/10000)


### SGD with Nestverov momentum

Evaluates gradient after the current velocity is applied.  

$$v \leftarrow \alpha v - \epsilon \nabla_\theta J(\theta + \alpha v)$$
$$\theta \leftarrow \theta + v$$

In [6]:
LR = 0.001
ALPHA = 0.9

net = Net()

vs = list()
for w in net.parameters():
    v = torch.zeros(w.shape, dtype=w.dtype)
    vs.append(v)

k = 1
for epoch in range(NEPOCHS):

    for batch_idx, (X, y) in enumerate(train_loader):
        params = list(net.parameters())
        for i in range(len(params)):
            params[i].data.add_(ALPHA * vs[i])
        
        y_logits = net(X)       
        loss = criterion(y_logits, y)

        net.zero_grad()
        loss.backward()

        for i in range(len(params)):
            w = params[i]
            w.data.sub_(ALPHA * vs[i])
            vs[i].data = ALPHA * vs[i].data - LR * w.grad.data
            w.data.add_(vs[i].data)
    
    
        k += 1
        if batch_idx % 300 == 0:
            print('{}[{}] Loss = {}'.format(epoch + 1, 
                                            batch_idx+1, loss.data))
    
    run_tests(net, test_loader)

1[1] Loss = 147.99777221679688
1[301] Loss = 24.395904541015625
1[601] Loss = 4.777647018432617
1[901] Loss = 3.5287046432495117
Test accuracy = 0.9523 (9523/10000)
2[1] Loss = 4.8280181884765625
2[301] Loss = 19.450077056884766
2[601] Loss = 6.486504554748535
2[901] Loss = 11.65528392791748
Test accuracy = 0.962 (9620/10000)
3[1] Loss = 4.934237003326416
3[301] Loss = 14.634001731872559
3[601] Loss = 15.99342155456543
3[901] Loss = 2.7389774322509766
Test accuracy = 0.9628 (9628/10000)
4[1] Loss = 12.419322967529297
4[301] Loss = 8.72746753692627
4[601] Loss = 3.07930588722229
4[901] Loss = 5.15720272064209
Test accuracy = 0.9688 (9688/10000)
5[1] Loss = 3.1968774795532227
5[301] Loss = 22.001794815063477
5[601] Loss = 2.966553211212158
5[901] Loss = 3.1347455978393555
Test accuracy = 0.97 (9700/10000)


### Adagrad 

Scale gradients by the inverse of the sum of all squared previous gradients.  

$$r\leftarrow r + \nabla_\theta J \odot \nabla_\theta J$$
$$\theta \leftarrow \theta - \frac{\epsilon}{\sqrt{\delta + r}} \odot \nabla_\theta J$$  

$\epsilon$: learning rate.  
$\delta$: small constant ($10^{-7}$) to avoid division by $0$.  
$r$ is initialized at $0$

large gradient results in small learning rate.  
small gradients results in large learning rate. 
It makes progress on more gently slopes directions of the parameters space.  
Fast for convex optimization, but might also work for deep learning

In [7]:
LR = 0.001
DELTA = 1e-7

NEPOCHS = 5

net = Net()

rs = list()
for w in net.parameters():
    r = torch.zeros(w.shape, dtype=w.dtype)
    rs.append(r)

k = 1
for epoch in range(NEPOCHS):

    for batch_idx, (X, y) in enumerate(train_loader):        
        y_logits = net(X)
        
        loss = criterion(y_logits, y)
        net.zero_grad()
        loss.backward()

        params = list(net.parameters())
        for i in range(len(params)):
            w = params[i]
            rs[i].data.add_(w.grad.data ** 2)
            w.data.sub_(LR / torch.sqrt(DELTA+rs[i].data) * w.grad.data)
    
        k += 1
        if batch_idx % 300 == 0:
            print('{}[{}] Loss = {}'.format(epoch + 1, 
                                            batch_idx+1, loss.data))
    
    run_tests(net, test_loader)

1[1] Loss = 148.15203857421875
1[301] Loss = 23.832956314086914
1[601] Loss = 32.8941650390625
1[901] Loss = 25.956357955932617
Test accuracy = 0.8966 (8966/10000)
2[1] Loss = 18.474563598632812
2[301] Loss = 21.697593688964844
2[601] Loss = 31.860767364501953
2[901] Loss = 18.733186721801758
Test accuracy = 0.9075 (9075/10000)
3[1] Loss = 21.09713363647461
3[301] Loss = 14.383761405944824
3[601] Loss = 26.49991226196289
3[901] Loss = 16.354877471923828
Test accuracy = 0.9145 (9145/10000)
4[1] Loss = 24.2746524810791
4[301] Loss = 25.61415672302246
4[601] Loss = 12.282295227050781
4[901] Loss = 21.130104064941406
Test accuracy = 0.9197 (9197/10000)
5[1] Loss = 27.237180709838867
5[301] Loss = 22.39988136291504
5[601] Loss = 14.564952850341797
5[901] Loss = 13.574392318725586
Test accuracy = 0.923 (9230/10000)


### RMSProp

It's a modified version of Adagrad.  
$r$ is now an exponentially dacying average.  

$$r\leftarrow \rho r + (1 - \rho) \nabla_\theta J \odot \nabla_\theta J$$
$$\theta \leftarrow \theta - \frac{\epsilon}{\sqrt{\delta + r}} \odot \nabla_\theta J$$  

$\rho$: day rate.  
$\epsilon$: learning rate.  
$\delta$: small constant ($10^{-6}$) to avoid division by $0$.

It gives better results than Adagrad for deep models

In [8]:
LR = 0.001
DELTA = 1e-6
DECAY = 0.9

NEPOCHS = 5

net = Net()

rs = list()
for w in net.parameters():
    r = torch.zeros(w.shape, dtype=w.dtype)
    rs.append(r)

k = 1
for epoch in range(NEPOCHS):

    for batch_idx, (X, y) in enumerate(train_loader):
        y_logits = net(X)
        
        loss = criterion(y_logits, y)
        net.zero_grad()
        loss.backward()

        params = list(net.parameters())
        for i in range(len(params)):
            w = params[i]
            rs[i].data = DECAY * rs[i].data + (1-DECAY) * w.grad.data**2
            w.data.sub_(LR / torch.sqrt(DELTA+rs[i].data) * w.grad.data)
    
        k += 1
        if batch_idx % 300 == 0:
            print('{}[{}] Loss = {}'.format(epoch + 1, 
                                            batch_idx+1, loss.data))
    
    run_tests(net, test_loader)

1[1] Loss = 146.45887756347656
1[301] Loss = 31.71043586730957
1[601] Loss = 6.81320333480835
1[901] Loss = 25.885921478271484
Test accuracy = 0.9478 (9478/10000)
2[1] Loss = 6.4659600257873535
2[301] Loss = 1.7969741821289062
2[601] Loss = 17.53740692138672
2[901] Loss = 9.817222595214844
Test accuracy = 0.9523 (9523/10000)
3[1] Loss = 4.522251129150391
3[301] Loss = 1.0729795694351196
3[601] Loss = 2.737246513366699
3[901] Loss = 2.9322943687438965
Test accuracy = 0.964 (9640/10000)
4[1] Loss = 3.486811399459839
4[301] Loss = 7.018847942352295
4[601] Loss = 11.26497745513916
4[901] Loss = 2.9154114723205566
Test accuracy = 0.9675 (9675/10000)
5[1] Loss = 10.053081512451172
5[301] Loss = 6.813046932220459
5[601] Loss = 9.22724437713623
5[901] Loss = 11.63058853149414
Test accuracy = 0.9796 (9796/10000)


## RMSProp with Neskerov momentum

Add neskerov momemtum to RMSProp: 

$$r\leftarrow \rho r + (1 - \rho) \nabla_\theta J \odot \nabla_\theta J$$
$$v \leftarrow v - \frac{\epsilon}{\sqrt{\delta + r}} \odot \nabla_\theta J$$  
$$\theta \leftarrow \theta + v$$  

$v$ is a moving average of the scaled gradients

In [9]:
LR = 0.001
DELTA = 1e-6
DECAY = 0.9
ALPHA = 0.9

NEPOCHS = 5

net = Net()

rs = list()
vs = list()
for w in net.parameters():
    r = torch.zeros(w.shape, dtype=w.dtype)
    v = torch.zeros(w.shape, dtype=w.dtype)
    rs.append(r)
    vs.append(v)
    
k = 1
for epoch in range(NEPOCHS):

    for batch_idx, (X, y) in enumerate(train_loader):
        params = list(net.parameters())
        for i in range(len(params)):
            params[i].data.add_(ALPHA * vs[i])
        
        y_logits = net(X)
        
        loss = criterion(y_logits, y)
        net.zero_grad()
        loss.backward()

        for i in range(len(params)):
            w = params[i]
            w.data.sub_(ALPHA * vs[i])
            rs[i].data = DECAY * rs[i].data + (1-DECAY) * w.grad.data**2
            vs[i].data = ALPHA * vs[i].data - LR / torch.sqrt(DELTA+rs[i].data) * w.grad.data
            w.data.add_(vs[i].data)
    
        k += 1
        if batch_idx % 300 == 0:
            print('{}[{}] Loss = {}'.format(epoch + 1, 
                                            batch_idx+1, loss.data))
    
    run_tests(net, test_loader)

1[1] Loss = 147.57423400878906
1[301] Loss = 19.61400604248047
1[601] Loss = 18.796306610107422
1[901] Loss = 6.967098236083984
Test accuracy = 0.9329 (9329/10000)
2[1] Loss = 20.245744705200195
2[301] Loss = 17.187490463256836
2[601] Loss = 23.345001220703125
2[901] Loss = 53.19303894042969
Test accuracy = 0.9312 (9312/10000)
3[1] Loss = 23.377840042114258
3[301] Loss = 20.069438934326172
3[601] Loss = 18.485942840576172
3[901] Loss = 21.879501342773438
Test accuracy = 0.9371 (9371/10000)
4[1] Loss = 12.797813415527344
4[301] Loss = 8.611550331115723
4[601] Loss = 17.89862060546875
4[901] Loss = 11.844609260559082
Test accuracy = 0.9357 (9357/10000)
5[1] Loss = 8.426780700683594
5[301] Loss = 7.616188049316406
5[601] Loss = 9.39261245727539
5[901] Loss = 12.512612342834473
Test accuracy = 0.9318 (9318/10000)


### Adam

Compute an estimate of the first-order and second-order moments.  
Then correct the bias on these estimates.  
These corrected estimates are then used to compute the weight updates.

$$s \leftarrow \rho_1 s + (1 - \rho_1) \nabla_\theta J$$
$$r \leftarrow \rho_2 r + (1 - \rho_2) \nabla_\theta J \odot \nabla_\theta J$$
$$\hat{s} \leftarrow \frac{s}{1 - \rho_1^t}$$
$$\hat{r} \leftarrow \frac{r}{1 - \rho_2^t}$$
$$\theta \leftarrow \theta - \epsilon \frac{\hat{s}}{\sqrt{\hat{r}} + \delta}$$  

$t$: iteration counter.  
$s$ et $r$: respectively first et second-order moments.  
$\hat{s}$ et $\hat{r}$: respectively corrected first et second-order moments.  

$\epsilon$: learning rate (default: $0.001$)  
$\rho_1$: decay rate for first-order moments estimates (default: $0.9$)   
$\rho_2$: decay rate for second-order moments estimates (default: $0.999$)   
$\delta$: small contant to avoid division by zero (default: $10^{-8}$)

In [10]:
LR = 0.001
DELTA = 1e-8
DEC1 = 0.9
DEC2 = 0.999

NEPOCHS = 5

net = Net()

s = list()
r = list()
for w in net.parameters():
    v = torch.zeros(w.shape, dtype=w.dtype)
    s.append(torch.zeros(w.shape, dtype=w.dtype))
    r.append(torch.zeros(w.shape, dtype=w.dtype))
    
t = 1
for epoch in range(NEPOCHS):

    for batch_idx, (X, y) in enumerate(train_loader):
        y_logits = net(X)
        loss = criterion(y_logits, y)
        net.zero_grad()
        loss.backward()

        params = list(net.parameters())
        for i in range(len(params)):
            w = params[i]
            s[i].data = DEC1 * s[i].data + (1-DEC1) * w.grad.data
            r[i].data = DEC2 * r[i].data + (1-DEC2) * w.grad.data**2
            sc = s[i].data / (1 - DEC1**t)
            rc = r[i].data / (1 - DEC2**t)

            vs[i].data = ALPHA * vs[i].data - LR / torch.sqrt(DELTA+rs[i].data) * w.grad.data
            w.data.sub_(LR * sc.data / (torch.sqrt(rc.data) + DELTA))
            t += 1
            
            
        if batch_idx % 300 == 0:
            print('{}[{}] Loss = {}'.format(epoch + 1, 
                                            batch_idx+1, loss.data))
    
    run_tests(net, test_loader)

1[1] Loss = 147.97776794433594
1[301] Loss = 10.19133186340332
1[601] Loss = 15.101129531860352
1[901] Loss = 9.226604461669922
Test accuracy = 0.9548 (9548/10000)
2[1] Loss = 8.047613143920898
2[301] Loss = 8.893031120300293
2[601] Loss = 5.449779510498047
2[901] Loss = 5.434549808502197
Test accuracy = 0.9713 (9713/10000)
3[1] Loss = 3.4808223247528076
3[301] Loss = 8.432893753051758
3[601] Loss = 12.131757736206055
3[901] Loss = 4.407227039337158
Test accuracy = 0.9667 (9667/10000)
4[1] Loss = 10.379000663757324
4[301] Loss = 2.228745937347412
4[601] Loss = 0.6432175636291504
4[901] Loss = 6.346026420593262
Test accuracy = 0.9686 (9686/10000)
5[1] Loss = 2.1128761768341064
5[301] Loss = 1.1721291542053223
5[601] Loss = 5.869318008422852
5[901] Loss = 6.652130126953125
Test accuracy = 0.9679 (9679/10000)
