# Exercise 5

In [None]:
%load_ext autoreload
%autoreload 2

import numpy as np
import torch
import torch.nn as nn
import matplotlib.pyplot as plt

## Contents

- Custom DataLoaders
- Learning Rate Schedules
- Weight Initializations
- BatchNormalization

## Custom DataLoaders

### Task 1: Defining your own DataLoader with z-score normalisation
- Define a new class `WineDataset` which inherits from `torch.utils.data.Dataset` and loads the `wine.csv` file correctly. To do so, override the `__init__`, `__len__` and `__getitem__` methods of the class you created.
- Within the `__init__` method, make sure that your data is z-score normalised, i.e. each record is zero-centered and normalised to unit variance.

In [None]:
from torch.utils.data import Dataset

class WineDataset(Dataset):
    """Dataset over the records of a wine CSV file (exercise stub).

    All three methods are still placeholders. Task 1 asks for them to be
    overridden so that the CSV file is loaded and the data is z-score
    normalised inside ``__init__``.
    """

    def __init__(self, path, range_data):
        """Placeholder constructor; currently stores nothing.

        Args:
            path: Location of the CSV file (the notebook passes ``'wine.csv'``).
            range_data: presumably the row indices that belong to this split;
                the notebook passes ``range`` objects -- TODO confirm.
        """
        pass
    
    def __len__(self):
        """Placeholder; should return the number of records once implemented."""
        pass
    
    def __getitem__(self, idx):
        """Placeholder; should return the record at position ``idx`` once implemented.

        NOTE(review): the training loop in this notebook unpacks every batch
        as ``(x, y)``, so this presumably has to return a (features, label)
        pair -- confirm.
        """
        pass
    
# Two splits of the same CSV file; range_data presumably selects the rows that
# belong to each split (0-139 and 140-169) -- confirm against WineDataset.
training_set = WineDataset(path='wine.csv', range_data=range(0, 140))
validation_set = WineDataset(path='wine.csv', range_data=range(140, 170))

# Only the training batches are shuffled; validation keeps the dataset order.
train_loader = torch.utils.data.DataLoader(
    dataset=training_set,
    batch_size=4,
    shuffle=True,
)
val_loader = torch.utils.data.DataLoader(
    dataset=validation_set,
    batch_size=4,
    shuffle=False,
)

### Take-Home Assignment 1
- Write a similar DataLoader for the Wine Quality dataset from Exercise 1. This time, apply z-score normalization per column(!).

### Task 2: Learning Rate Schedules

- Extend the train loop as you know it from previous exercises such that it lowers the learning rate after 10 epochs by a factor of 10.

In [None]:
class LinearClassificationNet(nn.Module):
    """Single linear layer mapping 13 input features to 3 output scores."""

    def __init__(self):
        super().__init__()
        self.layer1 = nn.Linear(13, 3)

    def forward(self, x):
        """Return the output of ``layer1`` for the batch ``x``.

        Args:
            x: Tensor whose last dimension has size 13.

        Returns:
            Tensor with the same leading dimensions as ``x`` and a last
            dimension of size 3.
        """
        # Call the module instead of its ``.forward`` method so that hooks
        # registered on ``layer1`` are executed as well.
        return self.layer1(x)
    

def train_loop_schedule(train_loader, val_loader, network, loss_fun, optimizer, epochs, print_freq=1):
    """Train ``network`` for ``epochs`` epochs and print the mean losses.

    Every epoch consists of one optimisation pass over ``train_loader``
    followed by a gradient-free pass over ``val_loader``.

    Args:
        train_loader: Iterable of ``(x, y)`` training batches; must support ``len()``.
        val_loader: Iterable of ``(x, y)`` validation batches; must support ``len()``.
        network: Module that maps a float batch ``x`` to the predictions
            expected by ``loss_fun``.
        loss_fun: Callable ``loss_fun(y_pred, y)`` returning a scalar loss tensor.
        optimizer: Optimizer that updates the parameters of ``network``.
        epochs: Number of passes over ``train_loader``.
        print_freq: The epoch summary is printed every ``print_freq`` epochs.
    """
    for e in range(epochs):
        train_loss = 0.

        # Training
        for x, y in train_loader:
            x, y = x.float(), y.long()

            # Prediction (call the module so that registered hooks run)
            y_pred = network(x)
            batch_loss = loss_fun(y_pred, y)
            # Accumulate a plain float: summing the loss tensors themselves
            # would keep the autograd history of every batch alive for the
            # whole epoch and print as ``tensor(..., grad_fn=...)``.
            train_loss += batch_loss.item()

            # Optimization
            network.zero_grad()
            batch_loss.backward()
            optimizer.step()

        # Inference
        val_loss = 0.
        with torch.no_grad():
            for x, y in val_loader:
                x, y = x.float(), y.long()
                y_pred = network(x)
                val_loss += loss_fun(y_pred, y).item()

        # Print epoch results only every print_freq epochs
        if (e + 1) % print_freq == 0:
            print("Epoch: {}/{}; Training loss: {}; Validation loss {}"
                  .format(e+1, epochs, train_loss / len(train_loader), val_loss / len(val_loader)))

        ### YOUR CODE HERE

            

Test your implementation!

In [None]:
# Hyperparameters of the test run.
lr = 0.1
epochs = 30

# Model, objective and optimizer (SGD with momentum).
network = LinearClassificationNet()
loss_fun = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(params=network.parameters(), lr=lr, momentum=0.9)

train_loop_schedule(
    train_loader=train_loader,
    val_loader=val_loader,
    network=network,
    loss_fun=loss_fun,
    optimizer=optimizer,
    epochs=epochs,
)

### Take-Home Assignment 2

- Now, extend the train loop such that it lowers the learning rate if the network has not improved in 5 epochs. To do so, you need to track the best loss achieved so far as well as the epoch in which that loss occurred.

## Weight Initialization

During the lecture you saw an undesirable property of networks: activations either drift apart or converge to zero the deeper we go into the network. This is undesirable as it complicates training. In general, we want the activations to follow the same distribution across layers, which makes training more efficient. To demonstrate this, consider the example below:

In [None]:
class Network(nn.Module):
    """Four bias-free linear layers (10 -> 100 -> 100 -> 100 -> 1) with a ReLU
    after every layer except the last one."""

    def __init__(self):
        super().__init__()
        widths = [10, 100, 100, 100, 1]
        modules = []
        for fan_in, fan_out in zip(widths[:-1], widths[1:]):
            modules.append(nn.Linear(fan_in, fan_out, bias=False))
            modules.append(nn.ReLU())
        # The output layer is not followed by an activation.
        modules.pop()
        self.layers = nn.Sequential(*modules)

    def forward(self, x):
        """Pass ``x`` through all layers in order and return the result."""
        out = self.layers(x)
        return out

def track_activations(activations):
    """Build a forward hook that collects module outputs in ``activations``.

    Args:
        activations: List that receives one detached output tensor per hook call.

    Returns:
        A callable taking ``(module, inputs, output)`` that appends
        ``output.detach()`` to ``activations``.
    """
    def _record(_module, _inputs, output):
        detached = output.detach()
        activations.append(detached)

    return _record

In [None]:
# Push a batch of standard-normal inputs through a newly created network and
# record the output of every layer.
x = torch.randn(1000, 10)
net = Network()

activations = []
hook_fun = track_activations(activations)

# One forward hook per layer; the handles are kept so the hooks can be removed.
hooks = [module.register_forward_hook(hook_fun) for module in net.layers]

with torch.no_grad():
    res = net(x)

for handle in hooks:
    handle.remove()

# One histogram per recorded activation, laid out on a 4x2 grid.
plt.figure(figsize=(15, 15))
for position, act in enumerate(activations, start=1):
    axes = plt.subplot(4, 2, position)
    axes.hist(act.flatten().numpy(), bins=9, density=True, range=(-2.1, 2.1))
    axes.set_ylim(0, 1)
plt.show()

As you can see, the activations get smaller and smaller the deeper we go into the network. To dampen this effect we can use weight initialization schemes. In the lecture you got to know Kaiming He's initialization scheme. With this scheme, the activations across layers approximately follow the same normal distribution.

### Task 3: Using Weight Initialization

Now, initialize the weights of your network before starting training! Use Kaiming He's initialization, which you know from the lecture (see https://pytorch.org/docs/stable/nn.init.html). Write the `init_weights` function, which checks each module it is applied to and initializes the weights if the module is of type `nn.Linear`.

In [None]:
def init_weights(m):
    """Initialise the weights of module ``m`` (exercise stub, currently a no-op).

    Task 3 asks for Kaiming He initialisation to be applied here whenever
    ``m`` is an ``nn.Linear``.
    """
    pass
        
# Module.apply calls init_weights on every submodule of net and on net itself.
net.apply(init_weights)

In [None]:
# Record and plot the per-layer activations of the existing `net` for the
# existing inputs `x` (both come from the cells above).
hooks = []
activations = []
hook_fun = track_activations(activations)

for layer in net.layers:
    hooks.append(layer.register_forward_hook(hook_fun))

# `net` is reused rather than re-created here, so the hooks must come off even
# when the forward pass raises; otherwise they would stay attached and every
# later run would record each activation more than once.
try:
    with torch.no_grad():
        res = net(x)
finally:
    for hook in hooks:
        hook.remove()

plt.figure(figsize=(15, 15))

for i, act in enumerate(activations):
    plt.subplot(4, 2, i+1)
    plt.hist(act.reshape(-1).numpy(), bins=9, density=True, range=(-2.1,2.1))
    plt.gca().set_ylim(0, 1)
# Show the figure explicitly, as the other plotting cells of this notebook do.
plt.show()

## BatchNormalization

Batch Normalization is another way to ensure that activations across layers stay normally distributed. It is more aggressive in that it re-normalizes the data at intermediate points in the network.

### Task 4: Using BatchNorm Layers

- Extend the network below by adding BatchNorm layers after each Linear layer. Use PyTorch's `BatchNorm1d` class (https://pytorch.org/docs/stable/generated/torch.nn.BatchNorm1d.html) to do so. See how the BatchNorm layers change your data along the flow of your network.

In [None]:
class Network(nn.Module):
    """Stack of four bias-free linear layers (10 -> 100 -> 100 -> 100 -> 1)
    with ReLU activations between them."""

    def __init__(self):
        super().__init__()
        hidden = 100
        # Input layer, two hidden layers, then the output layer without activation.
        stack = [nn.Linear(10, hidden, bias=False), nn.ReLU()]
        for _ in range(2):
            stack += [nn.Linear(hidden, hidden, bias=False), nn.ReLU()]
        stack.append(nn.Linear(hidden, 1, bias=False))
        self.layers = nn.Sequential(*stack)

    def forward(self, x):
        """Apply the layer stack to ``x`` and return its output."""
        result = self.layers(x)
        return result

In [None]:
# Record and plot the per-layer activations of a new Network instance for a
# fresh batch of standard-normal inputs.
x = torch.randn(1000, 10)
net = Network()

activations = []
hook_fun = track_activations(activations)

# Attach the recording hook to every layer and keep the handles for removal.
hooks = []
for module in net.layers:
    handle = module.register_forward_hook(hook_fun)
    hooks.append(handle)

with torch.no_grad():
    res = net(x)

for handle in hooks:
    handle.remove()

# One histogram per recorded activation, laid out on a 4x3 grid.
plt.figure(figsize=(15, 15))
for slot, act in enumerate(activations, start=1):
    axes = plt.subplot(4, 3, slot)
    axes.hist(act.flatten().numpy(), bins=9, density=True, range=(-2.1, 2.1))
    axes.set_ylim(0, 1)
plt.show()

## Pros and Cons for both methods:
- **Weight Initialization:**
    - Pro: Minimal computation overhead; Con: Effect wears off the deeper you go
- **BatchNorm:**
    - Pro: Can be applied even at deep layers; Con: Expensive operation

### Take-Home Assignment 3:
- Try incorporating BatchNorm layers and different weight initializations in your MNIST training code. Play around with deep networks with and without these techniques and observe how the loss changes.