In [1]:
import numpy as np
import matplotlib.pyplot as plt
import time

# Exercise 4

## Contents
- Stochastic Gradient Descent
- PyTorch: Coding Example
- Training a Network: Coding Example

## Stochastic Gradient Descent

**Gradient Descent:** $\theta(k+1) = \theta(k) - \tau * \nabla E(\theta(k))$

**Problem:** Expensive for a lot of training examples e.g. $E \in \mathbb{R}^{1,000,000 \times 10}$

**Solution:** Approximate gradient vector by only using a few samples at a time (mini-batches)
- This is what we call **Stochastic Gradient Descent**!

**Practical Implementation**

In [2]:
for e in range(epochs):
    # shuffle training data
    for i in range(0, total_number_of_training_examples, minibatch_size):
        # take chunk i:i+minibatch_size out of (shuffeled) training data
        # do gradient descent step with suitable step size only using loss computed on sampled in minibatch

IndentationError: expected an indented block (851538047.py, line 5)

## PyTorch: Coding Example


Deep Learning framework providing powerful tools for simple and efficient implementations of neural networks

In [3]:
import torch

### Tensors
- Basic building block
- Similar to what you did in `toolbox.py`

In [4]:
a = torch.tensor(np.array([[1., 2.], [3., 4.]]))
b = torch.ones(2, 2)
c = torch.empty(5)

### Backpropagation in PyTorch

- Uses gradient taping (same as you did in the last exercise)
- Same attributes and functions (`grad`, `grad_fn`, `.backward()`, ...)

In [5]:
a = torch.tensor(1., requires_grad=True)
b = torch.tensor(2.)

c = a + b
d = c + a
d = d * d

d.backward()

print(a.grad, b.grad)
print(c.grad_fn, d.grad_fn)

tensor(16.) None
<AddBackward0 object at 0x7fa13819d5b0> <MulBackward0 object at 0x7fa138193340>


  Variable._execution_engine.run_backward(  # Calls into the C++ engine to run the backward pass


### `torch.no_grad()` 

- Used during inference (feeding data through network to get prediction)
- Causes computation graph not to be saved

### GPU Support (CUDA)
- GPUs offer high amount of parallelization & efficient calculation of simple (mathmatical) operations
- All major Deep Learning frameworks offer possibility of utilizing GPUs
- PyTorch: 
    - `.cuda()` moves Tensor to GPU
    - `.device` tells which device Tensor is on
    
**Disclaimer:** If you don't have a GPU, try Google Colab 

In [6]:
a = torch.tensor(2.)
print(a.device)

a = a.cuda()
print(a.device)

cpu


RuntimeError: The NVIDIA driver on your system is too old (found version 10010). Please update your GPU driver by downloading and installing a new version from the URL: http://www.nvidia.com/Download/index.aspx Alternatively, go to: https://pytorch.org to install a PyTorch version that has been compiled with your version of the CUDA driver.

### `torch.utils.data.Dataset`
- Provides some handy tools to make treatment of data very easy
- Base class overloads functions ``__len__`` and ``__getitem__`` (allow easy access to data contained in the set)
- `torchvision.datasets` provides popular public datasets (e.g. MNIST dataset)

In [None]:
from torchvision.datasets import MNIST
from torchvision import transforms

train_set = MNIST('data', download=True, transform=transforms.ToTensor())
val_set = MNIST('data', train=False, transform=transforms.ToTensor())

print(train_set)
print(val_set)

In [7]:
el = train_set[0]
print('Element type: {}; Shape of element: {}; Label: {}'.format(type(el) ,el[0].shape, el[1]))

plt.imshow(el[0][0], cmap='gray', vmin=0., vmax=1.)
plt.title('Label: {}'.format(el[1]))

NameError: name 'train_set' is not defined

### `torch.utils.data.DataLoader`
- Efficiently load data to processing unit.
- Takes care of minibatching (only have to provide dataset & define batch size)
- Additionally, may specify further arguments (`shuffeling`, `num_workers`, ...)


In [8]:
from torch.utils.data import DataLoader

train_loader = DataLoader(train_set, batch_size=4, num_workers=4, shuffle=True)
val_loader = DataLoader(val_set, batch_size=4, num_workers=4, shuffle=True)

batch = iter(train_loader).next()

data, labels = batch[0], batch[1]

for i in range(4):
    plt.subplot(1, 4, i+1)
    plt.imshow(data[i, 0], cmap='gray', vmin=0., vmax=1.)
    plt.title('Label: {}'.format(labels[i]))

NameError: name 'train_set' is not defined

### `torch.nn`

- Provides a broad variety of pre-implemented tools & layers you can use in neural networks
- ``Module`` in ``torch.nn`` serves as base class for neural networks & their layers
- For defining network, have to inherit from ``torch.nn.Module`` and define the ``forward`` function
- `torch.nn.Sequential` = condense multiple layers

**NOTE:** not sufficient to store layers in list - will be not registered as submodules of the network.

In [9]:
from torch import nn

class FullyConnectedNet(nn.Module):
    
    def __init__(self):
        super().__init__()
        self.layer1 = nn.Linear(in_features=28*28, out_features=32)
        self.layer2 = nn.ReLU()
        self.layer3 = nn.Linear(in_features=32, out_features=32)
        self.layer4 = nn.ReLU()
        self.layer5 = nn.Linear(32, 10)
    
    def forward(self, x):
        x = x.reshape(-1, 28*28)
        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)
        x = self.layer5(x)
        return x
    
network = FullyConnectedNet()

### Losses & Optimizers
- PyTorch offers a lot of pre-implemented losses and optimizers
- Optimizers are directly linked with the weights of a network.
- Have a look at the documentation and see what you already know!

In [10]:
learning_rate = 1e-2

loss_fun = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(network.parameters(), learning_rate)

### Parameters
- Can access learnable parameters of a network by calling its `parameters` function
- `state_dict` function: also returns buffers etc.  (used when saving models)


In [11]:
list(network.parameters())

[Parameter containing:
 tensor([[ 0.0256,  0.0100,  0.0276,  ...,  0.0113, -0.0242,  0.0037],
         [ 0.0259,  0.0284, -0.0182,  ..., -0.0300,  0.0345,  0.0276],
         [-0.0272,  0.0260,  0.0024,  ...,  0.0119, -0.0282, -0.0070],
         ...,
         [-0.0224, -0.0237,  0.0078,  ...,  0.0120, -0.0120,  0.0315],
         [-0.0100, -0.0035, -0.0057,  ...,  0.0164,  0.0320,  0.0219],
         [-0.0080,  0.0014, -0.0079,  ...,  0.0094,  0.0277, -0.0294]],
        requires_grad=True),
 Parameter containing:
 tensor([-0.0101, -0.0337, -0.0162,  0.0327,  0.0194, -0.0230, -0.0057,  0.0243,
          0.0219, -0.0137, -0.0185, -0.0338,  0.0176,  0.0338,  0.0080,  0.0281,
          0.0175, -0.0252,  0.0034,  0.0079, -0.0212,  0.0150,  0.0261, -0.0084,
         -0.0111, -0.0178,  0.0324, -0.0014,  0.0191, -0.0295,  0.0350,  0.0245],
        requires_grad=True),
 Parameter containing:
 tensor([[-0.0237,  0.0200, -0.1440,  ...,  0.1325, -0.1747,  0.0891],
         [ 0.0877, -0.0413,  0.0057,

In [12]:
network.state_dict()

OrderedDict([('layer1.weight',
              tensor([[ 0.0256,  0.0100,  0.0276,  ...,  0.0113, -0.0242,  0.0037],
                      [ 0.0259,  0.0284, -0.0182,  ..., -0.0300,  0.0345,  0.0276],
                      [-0.0272,  0.0260,  0.0024,  ...,  0.0119, -0.0282, -0.0070],
                      ...,
                      [-0.0224, -0.0237,  0.0078,  ...,  0.0120, -0.0120,  0.0315],
                      [-0.0100, -0.0035, -0.0057,  ...,  0.0164,  0.0320,  0.0219],
                      [-0.0080,  0.0014, -0.0079,  ...,  0.0094,  0.0277, -0.0294]])),
             ('layer1.bias',
              tensor([-0.0101, -0.0337, -0.0162,  0.0327,  0.0194, -0.0230, -0.0057,  0.0243,
                       0.0219, -0.0137, -0.0185, -0.0338,  0.0176,  0.0338,  0.0080,  0.0281,
                       0.0175, -0.0252,  0.0034,  0.0079, -0.0212,  0.0150,  0.0261, -0.0084,
                      -0.0111, -0.0178,  0.0324, -0.0014,  0.0191, -0.0295,  0.0350,  0.0245])),
             ('layer3.weigh

## Training a Network: Coding Example

In [13]:
epochs = 5
train_loader = DataLoader(train_set, batch_size=100, shuffle=True)
val_loader = DataLoader(val_set, batch_size=100, shuffle=False)

start = time.time()

for e in range(epochs):
    print('Epoch {} of {}'.format(e, epochs))

    # Training
    train_loss = 0.
    for i, (x, y) in enumerate(train_loader):
        y_pred = network.forward(x)
        batch_loss = loss_fun(y_pred, y)
        train_loss += batch_loss
        network.zero_grad()
        batch_loss.backward()
        optimizer.step()
        if i % 100 == 0:
            print("Batch: {}/{}; Loss: {}".format(i, len(train_loader), batch_loss))

    # Inference
    val_loss = 0.
    with torch.no_grad():
        for i, (x, y) in enumerate(val_loader):
            y_pred = network.forward(x)
            val_loss += loss_fun(y_pred, y)

    print("Epoch: {}/{}; Training loss: {}; Validation loss {}".format(e, epochs, train_loss, val_loss))

end = time.time()
hours, rem = divmod(end - start, 3600)
minutes, seconds = divmod(rem, 60)
print("\nFinal time elapsed: {:0>2}:{:0>2}:{:05.2f}\n".format(int(hours), int(minutes), seconds))

NameError: name 'train_set' is not defined