## Practical Session 3

In [1]:
import torch
from torch import Tensor
import dlc_practical_prologue as prologue

class Args:
    full = True
    tiny = True
    seed = 42
    cifar = False
    data_dir = './data'
    file = None

args = Args()

In [2]:
train_input, train_target, test_input, test_target = prologue.load_data(
     one_hot_labels=False, normalize=True, flatten=True
)

* Using MNIST
** Reduce the data-set (use --full for the full thing)
** Use 1000 train and 1000 test samples


### 1- Activation Function

In [25]:
def sigma(x):
    return torch.tanh(x)

def dsigma(x):
    return 1-torch.tanh(x).pow(2)

def alt_dsigma(x):
    return 4 * (x.exp() + x.mul(-1).exp()).pow(-2)

In [29]:
x = train_input
y = train_target

In [34]:
y_test = test_target

In [32]:
x.size()

torch.Size([1000, 784])

In [35]:
y_test.size()

torch.Size([1000])

### 2 - Loss

In [131]:
def loss(v, t):
    return (v - t).pow(2).sum()


def dloss(v, t):
    return 2 * (v - t)

<h3><span style="color:green">Correction</span></h3>

<span style="color:green">The gradient with respect to v gets one gradient per sample! Even in batch gradient descent.</span>

In [41]:
def loss(v, t):
    return (v - t).sum().pow(2)


def dloss(v, t):
    return 2 * (v - t)

In [47]:
loss(y_test, y)

tensor(16789)

In [50]:
dloss(y_test, y)[0]

tensor(4)

### 3 - Forward and backward passes

In [59]:
def forward_pass(w1, b1, w2, b2, x):
    x0 = x
    s1 = w1 @ x0 + b1
    x1 = sigma(s1)
    s2 =  w2 @ x1 + b2
    x2 = sigma(s2)
    return x0, s1, x1, s2, x2

def backward_pass(w1, b1, w2, b2, t, x, s1, x1, s2, x2, dl_w1, dl_db1, dl_dw2, dl_db2):
    pass




## Correction

In [123]:
def forward_pass(w1, b1, w2, b2, x):
    x0 = x
    s1 = w1.mv(x0) + b1
    x1 = sigma(s1)
    s2 = w2.mv(x1) + b2
    x2 = sigma(s2)
    return x0, s1, x1, s2, x2

In [115]:
x.size()

torch.Size([784])

In [108]:
x = train_input[0].view(-1)
x.size()

torch.Size([784])

In [109]:
w1 = torch.randn(1, 784)
b1 = torch.randn(784, 1)
b1
w2 = torch.randn(1, 784)
b2 = torch.randn(784, 1)

### Correction: it's not great to add this second dimension to the bias.. it makes .mv() return an error.

In [141]:
w1 = torch.randn(1, 784)
b1 = torch.randn(784)
w2 = torch.randn(10, 784)
b2 = torch.randn(784, 10)

In [142]:
x0, s1, x1, s2, x2 = forward_pass(w1, b1, w2, b2, x)

In [128]:
t = train_target[0]
t

tensor(5)

In [143]:
x2.size()

torch.Size([784, 10])

In [132]:
dl_w1 = 0
dl_db1 = 0 
dl_dw2 = 0 
dl_db2 = 0

def backward_pass(w1, b1, w2, b2, 
                  t, 
                  x, s1, x1, s2, x2, 
                  dl_w1, dl_db1, dl_dw2, dl_db2):
    
    loss = loss(t, x2)
    dl_dw2 += dloss(t, x2) * dsigma(x2).t() * x1.t() 
    dl_db2 += dloss(t, x2) * dsigma(x2) 
    dl_dw1 += w1 * dloss(t, x2) * dsigma(x2)  * dsigma(x1) * x0 
    dl_db1 += w1 * dloss(t, x2) * dsigma(x2)  * dsigma(x1)
    
    return dl_w1, dl_db1, dl_dw2, dl_db2


Fist improvement is to make a less nested function + be aware dsigma is applied on the state s, not on x ! (don't skip a node in the graph ! )

In [146]:
def backward_pass(w1, b1, w2, b2, 
                  t, 
                  x, s1, x1, s2, x2, 
                  dl_w1, dl_db1, dl_dw2, dl_db2):
    
    dl_dx2 = dloss(t, x2)
    dl_ds2 = dl_dx2 * dsigma(s2).t()

    dl_dx1 = w2 * dl_ds2
    dl_ds1 = dl_dx2 * dsigma(s1).t()
    
    dl_dw2 += dl_ds2 * x1.t() 
    dl_db2 += dl_ds2
    
    dl_dw1 += dl_ds1 * x0 
    dl_db1 += dl_ds1
    
    return dl_w1, dl_db1, dl_dw2, dl_db2


Then be aware of the order of computation... 

In [147]:
dl_w1, dl_db1, dl_dw2, dl_db2 = backward_pass(w1, b1, w2, b2, t, x, s1, x1, s2, x2, dl_w1, dl_db1, dl_dw2, dl_db2)

RuntimeError: The size of tensor a (10) must match the size of tensor b (784) at non-singleton dimension 1

### Correction

In [None]:
def backward_pass(w1, b1, w2, b2,
                  t,
                  x, s1, x1, s2, x2,
                  dl_dw1, dl_db1, dl_dw2, dl_db2):
    x0 = x
    dl_dx2 = dloss(x2, t)
    dl_ds2 = dsigma(s2) * dl_dx2
    dl_dx1 = w2.t().mv(dl_ds2)
    dl_ds1 = dsigma(s1) * dl_dx1

    dl_dw2.add_(dl_ds2.view(-1, 1).mm(x1.view(1, -1)))
    dl_db2.add_(dl_ds2)
    dl_dw1.add_(dl_ds1.view(-1, 1).mm(x0.view(1, -1)))
    dl_db1.add_(dl_ds1)

    return dl_dw1, dl_db1, dl_dw2, dl_db2

### 4 - Training the network

1. Load the data using the provided prologue.load˙data function, with one-hot label vectors
and normalized inputs. Multiply the target label vectors by ζ = 0.9 (so that they are strictly in
the value range of tanh).

In [150]:
train_input, train_target, test_input, test_target = prologue.load_data(
     one_hot_labels=False, normalize=True, flatten=True
)

train_target = train_target * 0.9
test_target = test_target * 0.9

* Using MNIST
** Reduce the data-set (use --full for the full thing)
** Use 1000 train and 1000 test samples


2. Create the four weight and bias tensors, and fill them with random values sampled according to
𝒩 (0, ϵ) with ϵ = 1e − 6.

In [172]:
import math

w1 = torch.empty(50, 784).normal_(0, math.exp(-6))
b1 = torch.empty(50).normal_(0, math.exp(-6))
w2 = torch.empty(10, 50).normal_(0, math.exp(-6))
b2 = torch.empty(10).normal_(0, math.exp(-6))

3. Create the four tensors to sum up the gradients on individual samples, with respect to the
weights and biases.

In [173]:
def backward_pass(w1, b1, w2, b2,
                  t,
                  x, s1, x1, s2, x2,
                  dl_dw1, dl_db1, dl_dw2, dl_db2):
    x0 = x
    dl_dx2 = dloss(x2, t)
    dl_ds2 = dsigma(s2) * dl_dx2
    dl_dx1 = w2.t().mv(dl_ds2)
    dl_ds1 = dsigma(s1) * dl_dx1

    dl_dw2.add_(dl_ds2.view(-1, 1).mm(x1.view(1, -1)))
    dl_db2.add_(dl_ds2)
    dl_dw1.add_(dl_ds1.view(-1, 1).mm(x0.view(1, -1)))
    dl_db1.add_(dl_ds1)

    return dl_dw1, dl_db1, dl_dw2, dl_db2

4. Perform 1, 000 gradient steps with a step size η equal to 0.1 divided by the number of training
samples.
Each of these steps requires to reset to zero the tensors for summing up the gradients, and
doing a forward and a backward pass for each training example.
Compute and print the training loss, training error and test error after every step using the class
of maximum response as the predicted one.

In [174]:
torch.full((1,), 0)

tensor([0])

In [179]:
eta = 0.1

i = 0 

dl_dw1 = torch.empty(w1.size())
dl_db1 = torch.empty(b1.size())
dl_dw2 = torch.empty(w2.size())
dl_db2 = torch.empty(b2.size())

for i, row in enumerate(train_input):
    dl_dw1.zero_()
    dl_db1.zero_()
    dl_dw2.zero_()
    dl_db2.zero_()


    x = row
    t = train_target[i]
    
    x0, s1, x1, s2, x2 = forward_pass(w1, b1, w2, b2, x)
    
    dl_w1, dl_db1, dl_dw2, dl_db2 = backward_pass(w1, b1, w2, b2, t, x, s1, x1, s2, x2, dl_w1, dl_db1, dl_dw2, dl_db2)
    w1.substract(eta * dl_dw1)
    b1.substract(eta * dl_db1)
    w2.substract(eta * dl_dw2)
    b2.substract(eta * dl_db2)
    

    i += 1

    

RuntimeError: output with shape [1] doesn't match the broadcast shape [50, 784]

In [171]:
dl_ds1.size()

NameError: name 'dl_ds1' is not defined