## Framework

In the previous workshop I proposed a simple framework:

 * An intro section
 * A repeated block
 * A classification section

We should use that again now, so that each of us can try out different techniques and see what works best.

In [None]:
from typing import *

import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision.models as models
import torch.optim as optim

import numpy as np
from torchvision import datasets, transforms

from tqdm import tqdm

import PIL

In [None]:
# Detect once whether a CUDA device can be used; the training and scoring
# helpers read this flag to decide whether to move each batch onto the GPU.
CUDA_AVAILABLE = torch.cuda.is_available()

if not CUDA_AVAILABLE:
    # No GPU found: tell Colab users how to switch one on.
    print(
        "If you are running this on Google Colab then",
        "Menu -> Runtime -> Change runtime type -> Hardware Accelerator -> GPU",
        "Then try this again...",
        sep="\n",
    )

In [None]:
def to_image(image: torch.Tensor) -> "PIL.Image.Image":
    """Convert an image tensor into an RGB PIL image for display.

    The values are shifted and scaled so that they span [0, 1]. The rescaling
    also reverses the dataset normalization (close enough for viewing).
    The caller's tensor is left untouched.

    Args:
        image: the tensor to display. It is handed to ``to_pil_image`` with
            mode 'RGB' (presumably a (3, H, W) float tensor - TODO confirm
            against the callers).

    Returns:
        The rescaled image as a PIL image.
    """
    # Out-of-place arithmetic: in-place ``-=`` / ``/=`` would silently rewrite
    # the tensor the caller passed in (for example a slice of a batch).
    image = image - image.min()
    peak = image.max()
    if peak > 0:
        # A constant image has peak == 0; dividing would turn it into NaNs,
        # so it is left as all zeros instead.
        image = image / peak
    return transforms.functional.to_pil_image(image.cpu(), 'RGB')

In [None]:
def train(
    model: nn.Module,
    train: torch.utils.data.DataLoader,
    valid: torch.utils.data.DataLoader,
    epochs: int = 4,
    lr: float = 0.001
) -> None:
    """Train ``model`` with AdamW and cross-entropy, validating after every epoch.

    After each epoch the mean per-batch training and validation losses are
    printed. The model is put in training mode for the optimisation pass and
    in evaluation mode for the validation pass; the mode it had on entry is
    restored before returning.

    Args:
        model: the network to optimise (updated in place).
        train: loader yielding ``(inputs, targets)`` training batches.
            Note that inside this function the name shadows the function itself.
        valid: loader yielding ``(inputs, targets)`` validation batches.
        epochs: number of full passes over ``train``.
        lr: learning rate given to AdamW.
    """
    optimizer = optim.AdamW(params=model.parameters(), lr=lr)
    loss = nn.CrossEntropyLoss()

    # max(..., 1) keeps the averages defined when a loader has no batches.
    train_batches = max(len(train), 1)
    valid_batches = max(len(valid), 1)

    was_training = model.training
    try:
        for epoch in range(epochs):
            # Both accumulators restart every epoch. Resetting only the
            # training one let the previous epoch's validation average leak
            # into the next epoch's sum.
            train_loss = valid_loss = 0.

            # Layers such as Dropout and BatchNorm behave differently in
            # training mode, so select it explicitly.
            model.train()
            for inputs, targets in tqdm(train, desc=f"train {epoch}"):
                if CUDA_AVAILABLE:
                    inputs, targets = inputs.cuda(), targets.cuda()

                optimizer.zero_grad()
                outputs = model(inputs)
                loss_value = loss(outputs, targets)
                loss_value.backward()
                optimizer.step()

                train_loss += loss_value.item()

            # Validation: evaluation mode and no gradient bookkeeping.
            model.eval()
            with torch.no_grad():
                for inputs, targets in tqdm(valid, desc=f"valid {epoch}"):
                    if CUDA_AVAILABLE:
                        inputs, targets = inputs.cuda(), targets.cuda()

                    outputs = model(inputs)
                    loss_value = loss(outputs, targets)
                    valid_loss += loss_value.item()

            # remember tensorboardx makes pretty graphs
            train_loss /= train_batches
            valid_loss /= valid_batches
            print()
            print(f"train loss: {train_loss:.2e}")
            print(f"valid loss: {valid_loss:.2e}")
    finally:
        # Hand the model back in the mode the caller had it in, even if
        # training was interrupted.
        model.train(was_training)

In [None]:
def score(
    model: nn.Module,
    valid: torch.utils.data.DataLoader,
) -> float:
    """Return the accuracy of ``model`` over the batches of ``valid``.

    The prediction for each sample is the index of the largest output. The
    model is evaluated in evaluation mode without gradients, and the mode it
    had on entry is restored afterwards.

    Args:
        model: the network to evaluate.
        valid: loader yielding ``(inputs, targets)`` batches.

    Returns:
        The fraction of samples classified correctly, in [0, 1]; 0.0 when
        ``valid`` yields no samples.
    """
    correct, total = 0, 0

    was_training = model.training
    # Dropout / BatchNorm must run in evaluation mode for a meaningful score.
    model.eval()
    try:
        with torch.no_grad():
            for inputs, targets in tqdm(valid, desc="validation"):
                if CUDA_AVAILABLE:
                    inputs, targets = inputs.cuda(), targets.cuda()

                outputs = model(inputs)
                correct += torch.eq(targets, outputs.argmax(dim=1)).sum().item()
                total += inputs.shape[0]
    finally:
        model.train(was_training)

    # An empty loader would otherwise divide by zero.
    return correct / total if total else 0.0

---

### Data

Maybe you want to change the augmentation?
You can just alter this bit...

In [None]:
# Training split of CIFAR-10, stored under ./data (downloaded when missing),
# with light augmentation applied to every sample as it is read.
train_ds = datasets.CIFAR10(
    root='data',
    train=True,
    download=True,
    transform=transforms.Compose([
        # RandomApply is all-or-nothing: one coin toss decides whether the
        # whole list of transforms runs, not one toss per transform.
        transforms.RandomApply(
            [transforms.ColorJitter(brightness=0.1, contrast=0.1, saturation=0.1, hue=0.1)],
            p=0.5,
        ),

        # Mirrors the image left-to-right half of the time.
        transforms.RandomHorizontalFlip(p=0.5),

        # The model only understands tensors, so convert the PIL image.
        transforms.ToTensor(),

        # Normalizing is well worth doing: with the per-channel statistics of
        # the dataset the tensors end up with a mean of 0 and a standard
        # deviation of 1. For well-known datasets the values can be looked up,
        # otherwise compute them from the training data.
        transforms.Normalize(
            mean=(0.49139968, 0.48215841, 0.44653091),
            std=(0.24703223, 0.24348513, 0.26158784),
        ),
    ]),
)

In [None]:
# Held-out split of CIFAR-10 (train=False). No augmentation here: the images
# are only converted to tensors and normalized.
valid_ds = datasets.CIFAR10(
    root='data',
    train=False,
    download=True,
    transform=transforms.Compose([
        transforms.ToTensor(),
        # Must be exactly the normalization used for the training split!
        transforms.Normalize(
            mean=(0.49139968, 0.48215841, 0.44653091),
            std=(0.24703223, 0.24348513, 0.26158784),
        ),
    ]),
)

In [None]:
BATCH_SIZE = 128

# Training batches are drawn in a fresh random order every epoch.
train_dl = torch.utils.data.DataLoader(
    train_ds,
    batch_size=BATCH_SIZE,
    shuffle=True,
    num_workers=4
)
# Validation only measures the model, so the sample order is irrelevant and
# shuffling is switched off. The batch is twice as large as for training.
valid_dl = torch.utils.data.DataLoader(
    valid_ds,
    batch_size=BATCH_SIZE * 2,
    shuffle=False,
    num_workers=4,
)

---

### Framework

Here is the framework.
The preparation and classification layers will be the same as the previous workshop.
This is because the CIFAR-10 dataset has only 10 classes.

In [None]:
def how_it_shrinks(
    train_dl: torch.utils.data.DataLoader,
    layer: Optional[nn.Module] = None,
    steps: int = 5,
) -> None:
    """Print how the shape of one batch changes as ``layer`` is applied repeatedly.

    Args:
        train_dl: loader whose first batch supplies the images; each batch is
            unpacked as ``(images, _)``.
        layer: module applied ``steps`` times in a row. When omitted, a fresh
            1x1 convolution with stride 2 (3 -> 3 channels) is created, which
            halves the height and width on every application.
        steps: how many times ``layer`` is applied.
    """
    if layer is None:
        # Built per call: a module used as a default argument is created once
        # at definition time and then shared by every call.
        layer = nn.Conv2d(3, 3, kernel_size=(1, 1), stride=(2, 2))

    image, _ = next(iter(train_dl))
    print(f"Start: {image.shape}")

    # Only the shapes matter here, so skip the autograd bookkeeping.
    with torch.no_grad():
        for _ in range(steps):
            image = layer(image)
            print(image.shape)

# Print the shape of one training batch before and after each repeated layer application.
how_it_shrinks(train_dl)

As you can see above, we can fit in 4 reductions before the image is down to 2x2 (a fifth would leave a single pixel), so if your repeated block contains a single stride-2 layer then you can repeat it 4 times.

---

#### Intro

I'm going to copy the intro block that was used before.
Feel free to change this!

 * Maybe you want more convolutions?
 * Maybe change the activation?
 * Maybe add some more exotic layers?

In [None]:
def intro() -> nn.Module:
    """Build the intro section: a single 7x7 convolution followed by Tanh.

    The convolution lifts the 3 input channels to 8 feature channels. Its
    padding is 3 (half the kernel size, rounded down), so the height and
    width of the image are unchanged; a padding of 4 made every image
    2 pixels larger in each dimension.

    Returns:
        An ``nn.Sequential`` holding the convolution and the activation.
    """
    # resnet18 also used a stride at this point, you could try that out
    return nn.Sequential(
        nn.Conv2d(
            in_channels=3,
            out_channels=8,
            kernel_size=(7, 7),
            padding=(3, 3),
            padding_mode='reflect',
        ),
        nn.Tanh(),
    )

---

#### Classification

I'm going to copy the classification block that was used before. Feel free to change this!

 * Maybe change AdaptiveMaxPool to something else?
 * Maybe more linear layers with activations?

Remember that the maximum output of this is the chosen class, so adding something to the end that just scales will not change the results.
It might change the training though.

In [None]:
def classification(in_features: int = 64, out_features: int = 10) -> nn.Module:
    """Build the classification section: global max pool, flatten, linear.

    Args:
        in_features: number of channels produced by the last repeating block.
        out_features: number of classes to score.

    Returns:
        An ``nn.Sequential`` that maps a feature map with ``in_features``
        channels to one score per class.
    """
    return nn.Sequential(
        # Pools every channel down to a single value whatever the image size
        # is (could try AdaptiveAvg).
        nn.AdaptiveMaxPool2d(output_size=(1, 1)),
        # SNEAKY: drops the two 1x1 spatial dimensions so Linear gets one
        # vector per sample.
        nn.Flatten(),
        nn.Linear(in_features=in_features, out_features=out_features),
    )

---

#### Repeating Block

This is the best place to change. YOU MUST CHANGE THIS!

 * More layers?
 * Fewer layers?
 * Different layers?
 * More parameters to control the innards?

Halving the size of the output is nice but not necessary.

When applying a convolution, remember to use padding.
For a stride of 1 and an odd kernel size, the padding is half the kernel size, rounded down (for example, a padding of 1 for a 3x3 kernel).
This keeps the image the same size.

In [None]:
def repeating_block(in_channels: int, out_channels: int) -> nn.Module:
    """Build one repeated block: three 3x3 convolutions, each followed by Tanh.

    The first convolution changes the channel count from ``in_channels`` to
    ``out_channels``; the other two keep it. Only the last convolution has a
    stride of 2, so the block shrinks the image once.

    Args:
        in_channels: channels of the incoming feature map.
        out_channels: channels of the feature map produced by the block.

    Returns:
        An ``nn.Sequential`` with the six layers in order.
    """
    def conv3x3(channels: int, stride: int) -> nn.Conv2d:
        # Every convolution of the block shares kernel, padding and output width.
        return nn.Conv2d(
            in_channels=channels,
            out_channels=out_channels,
            kernel_size=(3, 3),
            stride=(stride, stride),
            padding=(1, 1),
            padding_mode='reflect',
        )

    layers = []
    for channels, stride in ((in_channels, 1), (out_channels, 1), (out_channels, 2)):
        layers += [conv3x3(channels, stride), nn.Tanh()]
    return nn.Sequential(*layers)

---

### Define Your Model!

Again I'm copying the old model, feel free to change this (change it in the Train section though).

 * More features?
 * Fewer features?
 * etc

In [None]:
# this is redefined in the train and evaluate so you can just copy that block
# you can use this as a reference, change the one below

# Deliberately aborts this cell: nothing below this line is executed, it is
# only here to be read.
raise ValueError("Change it in the cell below")

# Reference architecture: intro, four repeating blocks, then the classifier.
# NOTE(review): the image sizes quoted below assume intro() leaves the image
# at 32x32 - confirm against the padding used in intro().
model = nn.Sequential(
    intro(),
    repeating_block(8, 8), # intro increased to 8 already
    repeating_block(8, 16), # image now 8x8
    repeating_block(16, 32), # image now 4x4
    repeating_block(32, 64), # image now 2x2
    classification()
)

# Move the model to the GPU when one was detected.
if CUDA_AVAILABLE:
    model = model.cuda()

---

## Train and Evaluate!

Run this to see how well you did.
Good luck!

You can just copy the cell lots of times to try out different approaches and compare scores.

In [None]:
# model is here so that it is created fresh every time you evaluate it

def intro() -> nn.Module:
    """Build the intro section: a single 7x7 convolution followed by Tanh.

    The convolution lifts the 3 input channels to 8 feature channels. Its
    padding is 3 (half the kernel size, rounded down), so the height and
    width of the image are unchanged; a padding of 4 made every image
    2 pixels larger in each dimension.

    Returns:
        An ``nn.Sequential`` holding the convolution and the activation.
    """
    # resnet18 also used a stride at this point, you could try that out
    return nn.Sequential(
        nn.Conv2d(
            in_channels=3,
            out_channels=8,
            kernel_size=(7, 7),
            padding=(3, 3),
            padding_mode='reflect',
        ),
        nn.Tanh(),
    )

def classification(in_features: int = 64, out_features: int = 10) -> nn.Module:
    """Build the classification section: global max pool, flatten, linear.

    Args:
        in_features: number of channels produced by the last repeating block.
        out_features: number of classes to score.

    Returns:
        An ``nn.Sequential`` that maps a feature map with ``in_features``
        channels to one score per class.
    """
    return nn.Sequential(
        # Pools every channel down to a single value whatever the image size
        # is (could try AdaptiveAvg).
        nn.AdaptiveMaxPool2d(output_size=(1, 1)),
        # SNEAKY: drops the two 1x1 spatial dimensions so Linear gets one
        # vector per sample.
        nn.Flatten(),
        nn.Linear(in_features=in_features, out_features=out_features),
    )

def repeating_block(in_channels: int, out_channels: int) -> nn.Module:
    """Build one repeated block: three 3x3 convolutions, each followed by Tanh.

    The first convolution changes the channel count from ``in_channels`` to
    ``out_channels``; the other two keep it. Only the last convolution has a
    stride of 2, so the block shrinks the image once.

    Args:
        in_channels: channels of the incoming feature map.
        out_channels: channels of the feature map produced by the block.

    Returns:
        An ``nn.Sequential`` with the six layers in order.
    """
    def conv3x3(channels: int, stride: int) -> nn.Conv2d:
        # Every convolution of the block shares kernel, padding and output width.
        return nn.Conv2d(
            in_channels=channels,
            out_channels=out_channels,
            kernel_size=(3, 3),
            stride=(stride, stride),
            padding=(1, 1),
            padding_mode='reflect',
        )

    layers = []
    for channels, stride in ((in_channels, 1), (out_channels, 1), (out_channels, 2)):
        layers += [conv3x3(channels, stride), nn.Tanh()]
    return nn.Sequential(*layers)

# Assemble a brand-new network: intro, four repeating blocks, classifier.
# Each pair below is (in_channels, out_channels) of one repeating block; the
# first block keeps the 8 channels that intro already produced.
model = nn.Sequential(
    intro(),
    *(
        repeating_block(fan_in, fan_out)
        for fan_in, fan_out in ((8, 8), (8, 16), (16, 32), (32, 64))
    ),
    classification(),
)

# Move the model to the GPU when one was detected.
model = model.cuda() if CUDA_AVAILABLE else model

train(model, train_dl, valid_dl, epochs=4, lr=0.001)
# Left as the last expression so the notebook displays the returned accuracy.
score(model, valid_dl)