In [None]:
# Enable IPython's autoreload extension in mode 2 so that imported modules are
# re-imported before code is executed (useful while editing local .py files).
%load_ext autoreload
%autoreload 2

In [None]:
import numpy as np
import os
import torch
import torch.nn as nn
import torchvision as vision
from torchvision.datasets import MNIST
from torchvision import transforms
import matplotlib.pyplot as plt

# Exercise 7

## Contents
- Convolutions & Cross-correlations
- Pooling layers
- Handwritten Digit Classifier

## Convolutions & Cross-correlations

**Question:** How to apply what we learned on images?

**Bad Solution:** Treat each pixel as an input for a fully-connected net (**vectorize image**)
- E.g. a single fully-connected layer mapping one-megapixel color images to 1000 categories requires **3 billion parameters!**

**Better Solution:** Convolutions

**Basic Idea:** 
- Shift convolution kernels over image computing local pointwise product between kernel and local pixels
- Summation of these values = Activation of convolutional kernel at said point in image
- Special cases:
    - **Padding:** How to treat outer values kernel does not reach? (ignore, zero, circular or replicate)
    - **Stride:** How many pixels to shift the kernel after each computation? (default: 1)

<img src="conv.jpg" alt="Drawing" style="width: 700px;"/>

**Cross-Correlations**:
- Definition of convolution would flip the kernel. Cross-correlation = same as convolution, but no flipping
- **Convolution is cross-correlation with a kernel rotated by 180 degrees!**

**NOTE:** In Deep Learning, the operation commonly called "convolution" is actually a cross-correlation. Whenever we talk about convolutions in this lecture, we refer to the cross-correlation computation.

**3D-Case:**
- Typically convolve with a 3D-Filter extending in the third dimension = number of channels of data
- Thus, we do not move in the third dimension

<img src="2d_conv.gif" alt="Drawing" style="width: 500px;"/>

## Pooling layers

**Goal:** Reduce the dimensionality of your data (e.g. size of image)

**Similar to convolution:** slide kernel over image (usually in non-overlapping fashion); at each stride respective "pixels" get reduced to one value.

**Variants:**
- **Max-Pooling:** Only keep the maximum pixel value among all pixel values at each stride
- **Avg-Pooling:** Only keep the average value of all pixel values at each stride
- **$L^p$-Pooling (Power-Average Pooling):** Same as Avg-Pooling, but taking the $l^p$-norm of all pixel values at each stride

## Handwritten Digit Classifier

In [None]:
# Mini-batch sizes used by the two data loaders below.
batch_size_train = 64
batch_size_test = 100

# MNIST images are converted to tensors on access. The training split is
# downloaded into 'data' when missing; the test split is read from the same
# directory.
train_set = MNIST(root='data', train=True, download=True, transform=transforms.ToTensor())
test_set = MNIST(root='data', train=False, transform=transforms.ToTensor())

# Both loaders shuffle their data and use four worker processes.
train_loader = torch.utils.data.DataLoader(
    dataset=train_set,
    batch_size=batch_size_train,
    shuffle=True,
    num_workers=4,
)
test_loader = torch.utils.data.DataLoader(
    dataset=test_set,
    batch_size=batch_size_test,
    shuffle=True,
    num_workers=4,
)

### Task 1: Define your network
Write a network architecture which is structured as follows:
1. 5x5 convolution with 10 output channels
2. Max-Pooling with kernel size 2
3. 5x5 convolution with 20 output channels
4. Max-Pooling with kernel size 2
5. Fully connected layer with 50 output channels
6. Dropout (p = 50%)
7. Fully connected layer with 10 output channels (for the number of digits)

**Note:** Every layer except the last two is followed by a ReLU. 

In [None]:
class MNISTClassifier(nn.Module):
    """Skeleton of the convolutional MNIST digit classifier (Task 1).

    The layers are intentionally left out: they are to be defined in
    ``__init__`` and applied in ``forward`` as part of the exercise. As
    written, the module registers no layers and ``forward`` returns its input
    unchanged.
    """
    def __init__(self):
        """Initialise the base module; the network layers are to be created here."""
        super().__init__()
        
        
    def forward(self, x):
        """Apply the network to ``x`` and return the result.

        The placeholder comments below outline the intended processing steps.
        Until they are implemented, ``x`` is returned as-is.
        """
        # first convolution block: conv - max_pool - relu
        
        # dropout
        # NOTE(review): the task description lists dropout after the first
        # fully connected layer, not between the convolution blocks - confirm
        # the intended position.

        # second convolution block: conv - max_pool - relu
        
        # converting convolution output for linear layers
        
        # linear classifier
        
        return x

In [None]:
def train(train_loader, network, loss_fun, optimizer):
    """Train ``network`` for one pass over ``train_loader`` and report metrics.

    Args:
        train_loader: Iterable of ``(x, y)`` batches that supports ``len()``
            and exposes a ``dataset`` attribute supporting ``len()``.
        network: Module to optimise; it is switched to training mode.
        loss_fun: Callable mapping ``(output, y)`` to a scalar loss tensor.
        optimizer: Optimizer whose ``step()`` updates the network parameters.

    Returns:
        Tuple ``(train_loss, train_acc)``: the loss averaged over batches and
        the accuracy in percent, both as tensors that are detached from the
        autograd graph.
    """
    train_loss = 0.
    train_correct = 0

    # Training
    network.train()
    for x, y in train_loader:
        x, y = x.float(), y.long()

        # Prediction: call the module itself (not .forward) so that any
        # registered forward hooks are executed.
        output = network(x)
        batch_loss = loss_fun(output, y)

        # Bookkeeping on detached values only. Accumulating the raw loss would
        # keep every batch's autograd graph reachable for the whole epoch and
        # return a loss that still requires grad.
        train_loss += batch_loss.detach()
        _, pred = output.detach().max(1, keepdim=True)
        train_correct += pred.eq(y.view_as(pred)).sum()

        # Optimization
        network.zero_grad()
        batch_loss.backward()
        optimizer.step()

    train_loss /= len(train_loader)
    train_acc = 100. * train_correct / len(train_loader.dataset)
    print("Avg. Batch-Training loss: {:.4f} & accuracy: {}/{} ({:.2f}%)".format(train_loss, train_correct, len(train_loader.dataset), train_acc))

    return train_loss, train_acc

In [None]:
def test(test_loader, network, loss_fun):
    test_loss = 0.
    test_correct = 0

    # Initialize the prediction and label lists(tensors)
    pred_list = torch.zeros(0, dtype=torch.long)
    gt_list = torch.zeros(0, dtype=torch.long)
    
    # Inference
    network.eval()
    with torch.no_grad():
        for i, (x, y) in enumerate(test_loader):
            x, y = x.float(), y.long()
            
            # Prediction
            output = network.forward(x)
            test_loss += loss_fun(output, y)
            _, pred = output.data.max(1, keepdim=True)
            
            # Eval
            test_correct += pred.eq(y.data.view_as(pred)).sum()
            pred_list = torch.cat([pred_list, pred.view(-1)])
            gt_list = torch.cat([gt_list, y.view(-1)])
            
    test_loss /= len(test_loader)
    test_acc = 100. * test_correct / len(test_loader.dataset)
    print('Avg. Batch-Test Loss: {:.4f} & accuracy: {}/{} ({:.2f}%)'.format(test_loss, test_correct, len(test_loader.dataset), test_acc))
    
    return test_loss, test_acc, pred_list, gt_list

In [None]:
# Seed PyTorch's global RNG so that repeated runs of this cell start from the
# same random state.
torch.manual_seed(0)

net = MNISTClassifier()
opt = torch.optim.SGD(net.parameters(), lr=0.01, momentum=0.5)
loss = nn.CrossEntropyLoss()
epochs = 5

# Per-epoch history of the metrics, stored as plain Python floats.
conv_train_losses, conv_train_accs = [], []
conv_test_losses, conv_test_accs = [], []

for e in range(epochs):
    print("Epoch: {}/{}".format(e+1, epochs))

    train_loss, train_acc = train(train_loader, net, loss, opt)
    # conv_pred_list / conv_gt_list are overwritten every epoch, so after the
    # loop they hold the results of the final epoch.
    test_loss, test_acc, conv_pred_list, conv_gt_list = test(test_loader, net, loss)

    # float() accepts 0-dim tensors as well as plain numbers, so all four
    # histories end up with the same element type regardless of what
    # train()/test() return.
    conv_train_losses.append(float(train_loss))
    conv_train_accs.append(float(train_acc))

    conv_test_losses.append(float(test_loss))
    conv_test_accs.append(float(test_acc))

In [None]:
# Left panel: loss per epoch, right panel: accuracy per epoch.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))
fig.suptitle('MNIST Loss & Accuracy Development')

epoch_axis = range(1, epochs+1)

# Convert the stored metrics to plain floats so plotting does not depend on
# whether the history holds tensors, numpy scalars or Python numbers.
ax1.plot(epoch_axis, [float(v) for v in conv_train_losses], color='blue')
ax1.plot(epoch_axis, [float(v) for v in conv_test_losses], color='red')
ax1.legend(['Train Loss', 'Test Loss'], loc='upper right')
ax1.set_xlabel('number of epochs')
ax1.set_ylabel('negative log likelihood loss')
ax1.title.set_text('Loss curve')

ax2.plot(epoch_axis, [float(v) for v in conv_train_accs], color='blue')
ax2.plot(epoch_axis, [float(v) for v in conv_test_accs], color='red')
# The accuracy panel previously reused the loss legend labels.
ax2.legend(['Train Accuracy', 'Test Accuracy'], loc='upper left')
ax2.set_xlabel('number of epochs')
ax2.set_ylabel('accuracy (%)')
ax2.title.set_text('Accuracy curve')

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay

# Confusion matrix of the most recent test() call: ground-truth labels are
# passed first, predicted labels second.
conf_mat = confusion_matrix(conv_gt_list.numpy(), conv_pred_list.numpy())
# The keyword must be spelled `confusion_matrix`; the previous misspelling
# raised a TypeError. The trailing semicolon hides the object repr.
ConfusionMatrixDisplay(confusion_matrix=conf_mat).plot();

### Task 2: Where to go from here?
- Discuss: how can we improve the performance of our network? What comes to your mind?

### Take Home Assignment:
- Compare our convolutional network to our fully-connected solution, also using the additional evaluation tools provided in this notebook. Which one produces better results? Which one is faster? 