In [1]:
import sys
sys.path.append('..')

import numpy as np
import torch

import metrics
import utils

# Gradient Descent

In [2]:
## Model for training

# Layer widths of the fully connected network defined in Net:
# inputs are flattened to IN_SIZE features and mapped to OUT_SIZE logits.
IN_SIZE = 28 * 28
HIDDEN1_SIZE = 500
HIDDEN2_SIZE = 256
OUT_SIZE = 10

# Number of passes over the training set and mini-batch size used by every
# optimizer cell in this notebook.
NEPOCHS = 5
BATCH_SIZE = 64

# Each helper is unpacked into a (train, test) pair of loaders built with BATCH_SIZE.
# NOTE(review): load_mnist_01 presumably yields a two-class (digits 0/1) subset;
# confirm in utils. The *_01 loaders are not used by the cells visible here.
train_loader, test_loader = utils.load_mnist(BATCH_SIZE)
train_loader_01, test_loader_01 = utils.load_mnist_01(BATCH_SIZE)

def compute_accuracy(y_preds, y):
    """Count how many predictions agree with the reference labels.

    Args:
        y_preds: sequence or array of predicted class labels.
        y: reference labels, compared element-wise against ``y_preds``.

    Returns:
        A ``(correct, total)`` pair: the number of positions where the two
        inputs are equal, and ``len(y_preds)``.
    """
    n_samples = len(y_preds)
    hits = np.equal(y_preds, y)
    return np.sum(hits), n_samples

def run_tests(net, test_loader):
    """Evaluate ``net`` on ``test_loader`` and print the resulting accuracy.

    Predicted and reference labels are obtained from
    ``utils.get_class_output``; the printed line shows the accuracy ratio
    followed by the raw ``correct/total`` counts.
    """
    predictions, targets = utils.get_class_output(net, test_loader)
    n_correct, n_total = compute_accuracy(predictions, targets)
    accuracy = float(n_correct) / n_total
    print('Test accuracy = {} ({}/{})'.format(accuracy, n_correct, n_total))

class Net(torch.nn.Module):
    """Three-layer fully connected classifier.

    Layer widths are IN_SIZE -> HIDDEN1_SIZE -> HIDDEN2_SIZE -> OUT_SIZE,
    with a ReLU after each of the first two linear layers.
    """

    def __init__(self):
        super(Net, self).__init__()
        # Layers are created in forward order so that parameter initialisation
        # consumes the random generator in the same sequence.
        self.l1 = torch.nn.Linear(IN_SIZE, HIDDEN1_SIZE)
        self.l2 = torch.nn.Linear(HIDDEN1_SIZE, HIDDEN2_SIZE)
        self.l3 = torch.nn.Linear(HIDDEN2_SIZE, OUT_SIZE)

    def forward(self, x):
        """Return the unnormalised class scores (logits) for a batch ``x``."""
        # Flatten every sample to a vector of IN_SIZE features.
        flat = x.view(-1, IN_SIZE)
        hidden1 = torch.relu(self.l1(flat))
        hidden2 = torch.relu(self.l2(hidden1))
        return self.l3(hidden2)


# Cross-entropy computed from raw logits and summed (not averaged) over the
# mini-batch, so both the printed loss and the gradient magnitude scale with
# the batch size.
criterion = torch.nn.CrossEntropyLoss(reduction='sum')

Downloading http://yann.lecun.com/exdb/mnist/train-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/train-labels-idx1-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-images-idx3-ubyte.gz
Downloading http://yann.lecun.com/exdb/mnist/t10k-labels-idx1-ubyte.gz
Processing...
Done!


## Stochastic gradient descent

- Go over the whole training set for several passes; each pass is called an epoch.
- For each epoch, divide the training set into random groups (mini-batches) of a fixed batch size, and iterate over all the batches of data.
- For each batch, compute the gradient of the loss $J$ with respect to the weights $\theta$, and update them according to the learning rate $\epsilon$:

$$\theta \leftarrow \theta - \epsilon \nabla_\theta J$$

In [3]:
LEARNING_RATE = 0.001

net = Net()

k = 1  # 1-based index of the update step about to be performed
for epoch in range(NEPOCHS):

    for batch_idx, (X, y) in enumerate(train_loader):
        # Forward pass and summed cross-entropy for this mini-batch.
        loss = criterion(net(X), y)

        # Discard stale gradients, then back-propagate the current loss.
        net.zero_grad()
        loss.backward()

        # theta <- theta - epsilon * grad, applied outside of autograd tracking.
        with torch.no_grad():
            for w in net.parameters():
                w.sub_(w.grad * LEARNING_RATE)

        k += 1
        if batch_idx % 300 == 0:
            print('{}[{}] Loss = {}'.format(epoch + 1, batch_idx + 1, loss.item()))

    # Report test accuracy once per epoch.
    run_tests(net, test_loader)

1[1] Loss = 147.5270233154297
1[301] Loss = 36.75208282470703
1[601] Loss = 17.57419204711914
1[901] Loss = 15.867212295532227
Test accuracy = 0.9056 (9056/10000)
2[1] Loss = 24.019798278808594
2[301] Loss = 20.23111343383789
2[601] Loss = 30.22503662109375
2[901] Loss = 8.016792297363281
Test accuracy = 0.9382 (9382/10000)
3[1] Loss = 11.811263084411621
3[301] Loss = 5.030627727508545
3[601] Loss = 6.9431562423706055
3[901] Loss = 6.288186550140381
Test accuracy = 0.957 (9570/10000)
4[1] Loss = 3.70430326461792
4[301] Loss = 7.358327865600586
4[601] Loss = 1.2877092361450195
4[901] Loss = 6.556319713592529
Test accuracy = 0.9618 (9618/10000)
5[1] Loss = 9.979596138000488
5[301] Loss = 10.257913589477539
5[601] Loss = 8.09604263305664
5[901] Loss = 10.197946548461914
Test accuracy = 0.9692 (9692/10000)


## SGD with linearly decaying learning rate

The learning rate $\epsilon$ decreases linearly over the course of training.  
$\epsilon_0$: initial learning rate.  
$\epsilon_\tau$: final learning rate.  
$\tau$: number of iterations to reach $\epsilon_\tau$.

$$\alpha = \frac{k}{\tau}$$

Learning rate at iteration $k$:

$$
\epsilon_k = 
\begin{cases}
    (1 - \alpha)\epsilon_0 + \alpha \epsilon_\tau & \text{if } k \leq \tau\\
    \epsilon_\tau & \text{if } k > \tau
\end{cases}
$$

In [4]:
LR_0 = 0.005
LR_T = 0.0001
# The decay spans the whole run: one step per mini-batch, NEPOCHS passes.
T = len(train_loader) * NEPOCHS

net = Net()

k = 1  # 1-based index of the update step about to be performed
for epoch in range(NEPOCHS):

    for batch_idx, (X, y) in enumerate(train_loader):
        loss = criterion(net(X), y)

        net.zero_grad()
        loss.backward()

        # Linear interpolation between LR_0 and LR_T; max() keeps the rate at
        # LR_T once the interpolated value would drop below it (k > T).
        lr_k = max(LR_T, (1 - k / T) * LR_0 + k / T * LR_T)

        with torch.no_grad():
            for w in net.parameters():
                w.sub_(w.grad * lr_k)

        k += 1
        if batch_idx % 300 == 0:
            print('{}[{}] Loss = {}'.format(epoch + 1, batch_idx + 1, loss.item()))

    run_tests(net, test_loader)

1[1] Loss = 147.2858123779297
1[301] Loss = 27.170085906982422
1[601] Loss = 11.047721862792969
1[901] Loss = 13.711531639099121
Test accuracy = 0.928 (9280/10000)
2[1] Loss = 23.381946563720703
2[301] Loss = 11.159976959228516
2[601] Loss = 6.116436958312988
2[901] Loss = 9.599953651428223
Test accuracy = 0.9603 (9603/10000)
3[1] Loss = 5.340740203857422
3[301] Loss = 10.129582405090332
3[601] Loss = 5.469518661499023
3[901] Loss = 3.2249715328216553
Test accuracy = 0.9686 (9686/10000)
4[1] Loss = 6.069091796875
4[301] Loss = 3.748361587524414
4[601] Loss = 1.6760823726654053
4[901] Loss = 6.5092453956604
Test accuracy = 0.9728 (9728/10000)
5[1] Loss = 15.878865242004395
5[301] Loss = 7.71419620513916
5[601] Loss = 8.514182090759277
5[901] Loss = 1.474599838256836
Test accuracy = 0.9742 (9742/10000)


### SGD with momentum

Keep track of an exponentially decaying moving average $v$ of all past gradients.  
Update the weights in the direction of $v$.  

$$v \leftarrow \alpha v - \epsilon \nabla_\theta J$$
$$\theta \leftarrow \theta + v$$  

$\epsilon$: learning rate.  
$\alpha$: momentum coefficient.  
The initial value of $v$ influences the convergence; it usually starts at 0.

Common values for $\alpha$ are $0.5$, $0.9$ and $0.99$

In [5]:
LR = 0.001
ALPHA = 0.9

net = Net()

# One zero-initialised velocity tensor per parameter tensor.
vs = [torch.zeros(w.shape, dtype=w.dtype) for w in net.parameters()]

k = 1  # 1-based index of the update step about to be performed
for epoch in range(NEPOCHS):

    for batch_idx, (X, y) in enumerate(train_loader):
        loss = criterion(net(X), y)

        net.zero_grad()
        loss.backward()

        with torch.no_grad():
            for w, v in zip(net.parameters(), vs):
                # v <- alpha * v - epsilon * grad
                v.mul_(ALPHA).sub_(LR * w.grad)
                # theta <- theta + v
                w.add_(v)

        k += 1
        if batch_idx % 300 == 0:
            print('{}[{}] Loss = {}'.format(epoch + 1, batch_idx + 1, loss.item()))

    run_tests(net, test_loader)

1[1] Loss = 147.82102966308594
1[301] Loss = 19.815383911132812
1[601] Loss = 15.203666687011719
1[901] Loss = 6.132991313934326
Test accuracy = 0.9295 (9295/10000)
2[1] Loss = 8.83996868133545
2[301] Loss = 28.776630401611328
2[601] Loss = 18.577226638793945
2[901] Loss = 7.589788913726807
Test accuracy = 0.9556 (9556/10000)
3[1] Loss = 7.083834648132324
3[301] Loss = 13.001118659973145
3[601] Loss = 7.388951301574707
3[901] Loss = 21.047073364257812
Test accuracy = 0.9477 (9477/10000)
4[1] Loss = 16.090076446533203
4[301] Loss = 16.447124481201172
4[601] Loss = 2.34926700592041
4[901] Loss = 5.055661678314209
Test accuracy = 0.9599 (9599/10000)
5[1] Loss = 3.2772674560546875
5[301] Loss = 22.849519729614258
5[601] Loss = 1.2642607688903809
5[901] Loss = 0.819697380065918
Test accuracy = 0.9701 (9701/10000)


### SGD with Nesterov momentum

Evaluate the gradient after the current velocity has been applied.  

$$v \leftarrow \alpha v - \epsilon \nabla_\theta J(\theta + \alpha v)$$
$$\theta \leftarrow \theta + v$$

In [6]:
LR = 0.001
ALPHA = 0.9

net = Net()

# One zero-initialised velocity tensor per parameter tensor.
vs = [torch.zeros(w.shape, dtype=w.dtype) for w in net.parameters()]

k = 1  # 1-based index of the update step about to be performed
for epoch in range(NEPOCHS):

    for batch_idx, (X, y) in enumerate(train_loader):
        params = list(net.parameters())

        # Temporarily move to the look-ahead point theta + alpha * v.
        with torch.no_grad():
            for w, v in zip(params, vs):
                w.add_(ALPHA * v)

        # Gradient is evaluated at the look-ahead point.
        loss = criterion(net(X), y)

        net.zero_grad()
        loss.backward()

        with torch.no_grad():
            for w, v in zip(params, vs):
                # Undo the look-ahead shift before applying the real update.
                w.sub_(ALPHA * v)
                # v <- alpha * v - epsilon * grad
                v.mul_(ALPHA).sub_(LR * w.grad)
                # theta <- theta + v
                w.add_(v)

        k += 1
        if batch_idx % 300 == 0:
            print('{}[{}] Loss = {}'.format(epoch + 1, batch_idx + 1, loss.item()))

    run_tests(net, test_loader)

1[1] Loss = 146.84693908691406
1[301] Loss = 20.71490478515625
1[601] Loss = 9.90667724609375
1[901] Loss = 14.843843460083008
Test accuracy = 0.9478 (9478/10000)
2[1] Loss = 6.96317720413208
2[301] Loss = 24.63340950012207
2[601] Loss = 12.019695281982422
2[901] Loss = 1.158607006072998
Test accuracy = 0.9607 (9607/10000)
3[1] Loss = 2.092935800552368
3[301] Loss = 7.6326775550842285
3[601] Loss = 6.785384178161621
3[901] Loss = 12.72946834564209
Test accuracy = 0.9586 (9586/10000)
4[1] Loss = 9.662019729614258
4[301] Loss = 8.608166694641113
4[601] Loss = 3.50508975982666
4[901] Loss = 18.84221839904785
Test accuracy = 0.9614 (9614/10000)
5[1] Loss = 5.554802894592285
5[301] Loss = 1.706033706665039
5[601] Loss = 0.5673184394836426
5[901] Loss = 4.645683288574219
Test accuracy = 0.9733 (9733/10000)


### Adagrad 

Scale gradients by the inverse square root of the sum of all previous squared gradients.  

$$r\leftarrow r + \nabla_\theta J \odot \nabla_\theta J$$
$$\theta \leftarrow \theta - \frac{\epsilon}{\sqrt{\delta + r}} \odot \nabla_\theta J$$  

$\epsilon$: learning rate.  
$\delta$: small constant ($10^{-7}$) to avoid division by $0$.  
$r$ is initialized at $0$

Large accumulated gradients result in a small effective learning rate.  
Small accumulated gradients result in a large effective learning rate.  
It makes more progress along the gently sloped directions of the parameter space.  
Fast for convex optimization, but might also work for deep learning

In [7]:
LR = 0.001
DELTA = 1e-7

NEPOCHS = 5

net = Net()

# Running sum of squared gradients, one zero-initialised tensor per parameter.
rs = [torch.zeros(w.shape, dtype=w.dtype) for w in net.parameters()]

k = 1  # 1-based index of the update step about to be performed
for epoch in range(NEPOCHS):

    for batch_idx, (X, y) in enumerate(train_loader):
        loss = criterion(net(X), y)

        net.zero_grad()
        loss.backward()

        with torch.no_grad():
            for w, r_acc in zip(net.parameters(), rs):
                grad = w.grad
                # r <- r + grad * grad (element-wise)
                r_acc.add_(grad ** 2)
                # theta <- theta - epsilon / sqrt(delta + r) * grad
                w.sub_(LR / torch.sqrt(DELTA + r_acc) * grad)

        k += 1
        if batch_idx % 300 == 0:
            print('{}[{}] Loss = {}'.format(epoch + 1, batch_idx + 1, loss.item()))

    run_tests(net, test_loader)

1[1] Loss = 147.76698303222656
1[301] Loss = 25.988157272338867
1[601] Loss = 22.155385971069336
1[901] Loss = 17.08797264099121
Test accuracy = 0.8992 (8992/10000)
2[1] Loss = 23.854522705078125
2[301] Loss = 23.128379821777344
2[601] Loss = 21.91646957397461
2[901] Loss = 17.013355255126953
Test accuracy = 0.9097 (9097/10000)
3[1] Loss = 20.268390655517578
3[301] Loss = 18.043489456176758
3[601] Loss = 29.65180206298828
3[901] Loss = 14.703795433044434
Test accuracy = 0.9146 (9146/10000)
4[1] Loss = 16.749221801757812
4[301] Loss = 21.041324615478516
4[601] Loss = 10.169624328613281
4[901] Loss = 11.265661239624023
Test accuracy = 0.9189 (9189/10000)
5[1] Loss = 25.697978973388672
5[301] Loss = 15.206506729125977
5[601] Loss = 18.25164031982422
5[901] Loss = 30.931976318359375
Test accuracy = 0.9226 (9226/10000)


### RMSProp

It's a modified version of Adagrad.  
$r$ is now an exponentially decaying average.  

$$r\leftarrow \rho r + (1 - \rho) \nabla_\theta J \odot \nabla_\theta J$$
$$\theta \leftarrow \theta - \frac{\epsilon}{\sqrt{\delta + r}} \odot \nabla_\theta J$$  

$\rho$: decay rate.  
$\epsilon$: learning rate.  
$\delta$: small constant ($10^{-6}$) to avoid division by $0$.

It gives better results than Adagrad for deep models

In [8]:
LR = 0.001
DELTA = 1e-6
DECAY = 0.9

NEPOCHS = 5

net = Net()

# Exponentially decaying average of squared gradients, one tensor per parameter.
rs = [torch.zeros(w.shape, dtype=w.dtype) for w in net.parameters()]

k = 1  # 1-based index of the update step about to be performed
for epoch in range(NEPOCHS):

    for batch_idx, (X, y) in enumerate(train_loader):
        loss = criterion(net(X), y)

        net.zero_grad()
        loss.backward()

        with torch.no_grad():
            for w, r_avg in zip(net.parameters(), rs):
                grad = w.grad
                # r <- rho * r + (1 - rho) * grad * grad
                r_avg.mul_(DECAY).add_((1 - DECAY) * grad ** 2)
                # theta <- theta - epsilon / sqrt(delta + r) * grad
                w.sub_(LR / torch.sqrt(DELTA + r_avg) * grad)

        k += 1
        if batch_idx % 300 == 0:
            print('{}[{}] Loss = {}'.format(epoch + 1, batch_idx + 1, loss.item()))

    run_tests(net, test_loader)

1[1] Loss = 147.26217651367188
1[301] Loss = 17.58557891845703
1[601] Loss = 11.66818904876709
1[901] Loss = 13.68702507019043
Test accuracy = 0.9501 (9501/10000)
2[1] Loss = 8.958868980407715
2[301] Loss = 5.09785270690918
2[601] Loss = 8.713766098022461
2[901] Loss = 17.310115814208984
Test accuracy = 0.9627 (9627/10000)
3[1] Loss = 4.96223258972168
3[301] Loss = 19.404726028442383
3[601] Loss = 11.642419815063477
3[901] Loss = 7.8435821533203125
Test accuracy = 0.9691 (9691/10000)
4[1] Loss = 9.57171630859375
4[301] Loss = 3.6938445568084717
4[601] Loss = 2.975255250930786
4[901] Loss = 12.813283920288086
Test accuracy = 0.9683 (9683/10000)
5[1] Loss = 2.0546493530273438
5[301] Loss = 7.876154899597168
5[601] Loss = 1.3764848709106445
5[901] Loss = 4.71381950378418
Test accuracy = 0.9707 (9707/10000)


## RMSProp with Nesterov momentum

Add Nesterov momentum to RMSProp; the gradient is evaluated at the look-ahead point $\theta + \alpha v$:

$$g \leftarrow \nabla_\theta J(\theta + \alpha v)$$
$$r\leftarrow \rho r + (1 - \rho) g \odot g$$
$$v \leftarrow \alpha v - \frac{\epsilon}{\sqrt{\delta + r}} \odot g$$  
$$\theta \leftarrow \theta + v$$  

$\alpha$: momentum coefficient.  
$v$ is a moving average of the scaled gradients

In [9]:
LR = 0.001
DELTA = 1e-6
DECAY = 0.9
ALPHA = 0.9

NEPOCHS = 5

net = Net()

# Per-parameter state, all starting at zero:
#   rs: decaying average of squared gradients, vs: velocity.
rs = [torch.zeros(w.shape, dtype=w.dtype) for w in net.parameters()]
vs = [torch.zeros(w.shape, dtype=w.dtype) for w in net.parameters()]

k = 1  # 1-based index of the update step about to be performed
for epoch in range(NEPOCHS):

    for batch_idx, (X, y) in enumerate(train_loader):
        params = list(net.parameters())

        # Temporarily move to the look-ahead point theta + alpha * v.
        with torch.no_grad():
            for w, v in zip(params, vs):
                w.add_(ALPHA * v)

        # Gradient is evaluated at the look-ahead point.
        loss = criterion(net(X), y)

        net.zero_grad()
        loss.backward()

        with torch.no_grad():
            for w, r_avg, v in zip(params, rs, vs):
                grad = w.grad
                # Undo the look-ahead shift before applying the real update.
                w.sub_(ALPHA * v)
                # r <- rho * r + (1 - rho) * grad * grad
                r_avg.mul_(DECAY).add_((1 - DECAY) * grad ** 2)
                # v <- alpha * v - epsilon / sqrt(delta + r) * grad
                v.mul_(ALPHA).sub_(LR / torch.sqrt(DELTA + r_avg) * grad)
                # theta <- theta + v
                w.add_(v)

        k += 1
        if batch_idx % 300 == 0:
            print('{}[{}] Loss = {}'.format(epoch + 1, batch_idx + 1, loss.item()))

    run_tests(net, test_loader)

1[1] Loss = 147.88803100585938
1[301] Loss = 10.70921802520752
1[601] Loss = 16.994647979736328
1[901] Loss = 9.389078140258789
Test accuracy = 0.9364 (9364/10000)
2[1] Loss = 9.92849349975586
2[301] Loss = 8.535120964050293
2[601] Loss = 10.694400787353516
2[901] Loss = 49.448326110839844
Test accuracy = 0.9374 (9374/10000)
3[1] Loss = 22.262073516845703
3[301] Loss = 28.935487747192383
3[601] Loss = 24.975175857543945
3[901] Loss = 23.655744552612305
Test accuracy = 0.9204 (9204/10000)
4[1] Loss = 27.42522430419922
4[301] Loss = 3.075234889984131
4[601] Loss = 9.480464935302734
4[901] Loss = 21.016372680664062
Test accuracy = 0.943 (9430/10000)
5[1] Loss = 22.950645446777344
5[301] Loss = 10.202150344848633
5[601] Loss = 19.90564727783203
5[901] Loss = 19.354583740234375
Test accuracy = 0.9327 (9327/10000)


### Adam

Compute an estimate of the first-order and second-order moments.  
Then correct the bias on these estimates.  
These corrected estimates are then used to compute the weight updates.

$$s \leftarrow \rho_1 s + (1 - \rho_1) \nabla_\theta J$$
$$r \leftarrow \rho_2 r + (1 - \rho_2) \nabla_\theta J \odot \nabla_\theta J$$
$$\hat{s} \leftarrow \frac{s}{1 - \rho_1^t}$$
$$\hat{r} \leftarrow \frac{r}{1 - \rho_2^t}$$
$$\theta \leftarrow \theta - \epsilon \frac{\hat{s}}{\sqrt{\hat{r}} + \delta}$$  

$t$: iteration counter.  
$s$ and $r$: respectively first and second-order moments.  
$\hat{s}$ and $\hat{r}$: respectively corrected first and second-order moments.  

$\epsilon$: learning rate (default: $0.001$)  
$\rho_1$: decay rate for first-order moments estimates (default: $0.9$)   
$\rho_2$: decay rate for second-order moments estimates (default: $0.999$)   
$\delta$: small constant to avoid division by zero (default: $10^{-8}$)

In [10]:
LR = 0.001
DELTA = 1e-8
DEC1 = 0.9
DEC2 = 0.999

NEPOCHS = 5

net = Net()

# Per-parameter moment estimates, all starting at zero:
#   s: first-order moment (decaying average of gradients)
#   r: second-order moment (decaying average of squared gradients)
s = [torch.zeros(w.shape, dtype=w.dtype) for w in net.parameters()]
r = [torch.zeros(w.shape, dtype=w.dtype) for w in net.parameters()]

t = 1  # 1-based iteration counter used by the bias correction
for epoch in range(NEPOCHS):

    for batch_idx, (X, y) in enumerate(train_loader):
        y_logits = net(X)
        loss = criterion(y_logits, y)
        net.zero_grad()
        loss.backward()

        with torch.no_grad():
            for w, s_i, r_i in zip(net.parameters(), s, r):
                grad = w.grad
                # s <- rho1 * s + (1 - rho1) * grad
                s_i.mul_(DEC1).add_((1 - DEC1) * grad)
                # r <- rho2 * r + (1 - rho2) * grad * grad
                r_i.mul_(DEC2).add_((1 - DEC2) * grad ** 2)

                # Bias-corrected estimates; every parameter uses the same t.
                sc = s_i / (1 - DEC1 ** t)
                rc = r_i / (1 - DEC2 ** t)

                # theta <- theta - epsilon * s_hat / (sqrt(r_hat) + delta)
                w.sub_(LR * sc / (torch.sqrt(rc) + DELTA))

        # Advance once per optimisation step, not once per parameter tensor;
        # otherwise the bias correction decays too quickly and differs between
        # parameters updated in the same step.
        t += 1

        if batch_idx % 300 == 0:
            print('{}[{}] Loss = {}'.format(epoch + 1,
                                            batch_idx+1, loss.data))

    run_tests(net, test_loader)

1[1] Loss = 146.546630859375
1[301] Loss = 20.987897872924805
1[601] Loss = 19.754451751708984
1[901] Loss = 25.493762969970703
Test accuracy = 0.9553 (9553/10000)
2[1] Loss = 12.02663803100586
2[301] Loss = 5.827609539031982
2[601] Loss = 10.173932075500488
2[901] Loss = 14.954172134399414
Test accuracy = 0.9683 (9683/10000)
3[1] Loss = 2.941495895385742
3[301] Loss = 8.274127960205078
3[601] Loss = 3.27241849899292
3[901] Loss = 9.198446273803711
Test accuracy = 0.9714 (9714/10000)
4[1] Loss = 2.223482370376587
4[301] Loss = 1.31793212890625
4[601] Loss = 11.347515106201172
4[901] Loss = 1.6574327945709229
Test accuracy = 0.9706 (9706/10000)
5[1] Loss = 3.419306516647339
5[301] Loss = 0.4741554260253906
5[601] Loss = 0.8854548931121826
5[901] Loss = 5.873857021331787
Test accuracy = 0.9731 (9731/10000)
