# Name: Nalet Meinen¶

# Assignment 2 ATML 2020
## Classification with limited data
ImageNet is a well known dataset with 1000 image classes. We will be working on a subset of the dataset (60k images, 100 classes, 600 images per class 80$\times$80 pixels, RGB) and train a model to classify an image into one of the 100 classes. The dataset is located under the "data" directory. Training and validation data splits are under "data/train" and "data/val" directories respectively. Both splits consist of 100 directories, each representing an object category.

## Task 1. Implement ImageNetLimited class for data loading in datasets.py file

In [1]:
import numpy as np
import torch
from torchvision.datasets import ImageFolder
from torchvision.transforms import Resize, ToTensor, Normalize, Compose, CenterCrop
from datasets import ImageNetLimited
from torch.utils.data import DataLoader
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
if device.type == 'cuda':
    print("This nootebook ran on",device.type,"With number of GPU:",torch.cuda.device_count())

train_dir = 'data/train'
validation_dir = 'data/val'
len_classes = 100

# write your code
transforms = Compose(   [Resize(64),
                        CenterCrop(56),
                        ToTensor(),
                        Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])])

_train_dataset = ImageFolder(train_dir, transform=transforms)
_val_dataset = ImageFolder(validation_dir, transform=transforms)

train_dataset = ImageNetLimited(_train_dataset)
val_dataset = ImageNetLimited(_val_dataset)

batch_size = 64
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=0) #set num_workers = 0, to fix BrokenPipeError in Windows
val_dataloader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=0)

This nootebook ran on cuda With number of GPU: 1


HBox(children=(FloatProgress(value=0.0, max=60000.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=29855.0), HTML(value='')))




## Task 2. CNN Architecture
Design and implement a Convolutional Neural Network architecture for image classification in a **ConvNet** class in the notebook. Some examples of popular classification models are: AlexNet, VGG, ResNet, ... Justify your design choices in the report. The input to your model must be an image of size $64 \times 64$ pixels.

In [2]:
import torch.nn as nn

def conv3x3(in_planes, out_planes, stride=1, groups=1, dilation=1):
    """3x3 convolution with padding"""
    return nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
                     padding=dilation, groups=groups, bias=False, dilation=dilation)


def conv1x1(in_planes, out_planes, stride=1):
    """1x1 convolution"""
    return nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False)


class BasicBlock(nn.Module):
    expansion = 1

    def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1,
                 base_width=64, dilation=1, norm_layer=None):
        super(BasicBlock, self).__init__()
        if norm_layer is None:
            norm_layer = nn.BatchNorm2d
        if groups != 1 or base_width != 64:
            raise ValueError('BasicBlock only supports groups=1 and base_width=64')
        if dilation > 1:
            raise NotImplementedError("Dilation > 1 not supported in BasicBlock")
        # Both self.conv1 and self.downsample layers downsample the input when stride != 1
        self.conv1 = conv3x3(inplanes, planes, stride)
        self.bn1 = norm_layer(planes)
        self.relu = nn.ReLU(inplace=True)
        self.conv2 = conv3x3(planes, planes)
        self.bn2 = norm_layer(planes)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        identity = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)

        if self.downsample is not None:
            identity = self.downsample(x)

        out += identity
        out = self.relu(out)

        return out


class Bottleneck(nn.Module):
    # Bottleneck in torchvision places the stride for downsampling at 3x3 convolution(self.conv2)
    # while original implementation places the stride at the first 1x1 convolution(self.conv1)
    # according to "Deep residual learning for image recognition"https://arxiv.org/abs/1512.03385.
    # This variant is also known as ResNet V1.5 and improves accuracy according to
    # https://ngc.nvidia.com/catalog/model-scripts/nvidia:resnet_50_v1_5_for_pytorch.

    expansion = 4

    def __init__(self, inplanes, planes, stride=1, downsample=None, groups=1,
                 base_width=64, dilation=1, norm_layer=None):
        super(Bottleneck, self).__init__()
        if norm_layer is None:
            norm_layer = nn.BatchNorm2d
        width = int(planes * (base_width / 64.)) * groups
        # Both self.conv2 and self.downsample layers downsample the input when stride != 1
        self.conv1 = conv1x1(inplanes, width)
        self.bn1 = norm_layer(width)
        self.conv2 = conv3x3(width, width, stride, groups, dilation)
        self.bn2 = norm_layer(width)
        self.conv3 = conv1x1(width, planes * self.expansion)
        self.bn3 = norm_layer(planes * self.expansion)
        self.relu = nn.ReLU(inplace=True)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x):
        identity = x

        out = self.conv1(x)
        out = self.bn1(out)
        out = self.relu(out)

        out = self.conv2(out)
        out = self.bn2(out)
        out = self.relu(out)

        out = self.conv3(out)
        out = self.bn3(out)

        if self.downsample is not None:
            identity = self.downsample(x)

        out += identity
        out = self.relu(out)

        return out

class ConvNet(nn.Module):
    def __init__(self, block, layers, num_classes=1000, zero_init_residual=False,
                 groups=1, width_per_group=64, replace_stride_with_dilation=None,
                 norm_layer=None):
        super(ConvNet, self).__init__()
        if norm_layer is None:
            norm_layer = nn.BatchNorm2d
        self._norm_layer = norm_layer

        self.inplanes = 64
        self.dilation = 1
        if replace_stride_with_dilation is None:
            # each element in the tuple indicates if we should replace
            # the 2x2 stride with a dilated convolution instead
            replace_stride_with_dilation = [False, False, False]
        if len(replace_stride_with_dilation) != 3:
            raise ValueError("replace_stride_with_dilation should be None "
                             "or a 3-element tuple, got {}".format(replace_stride_with_dilation))
        self.groups = groups
        self.base_width = width_per_group
        self.conv1 = nn.Conv2d(3, self.inplanes, kernel_size=7, stride=2, padding=3,
                               bias=False)
        self.bn1 = norm_layer(self.inplanes)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        self.layer1 = self._make_layer(block, 64, layers[0])
        self.layer2 = self._make_layer(block, 128, layers[1], stride=2,
                                       dilate=replace_stride_with_dilation[0])
        self.layer3 = self._make_layer(block, 256, layers[2], stride=2,
                                       dilate=replace_stride_with_dilation[1])
        self.layer4 = self._make_layer(block, 512, layers[3], stride=2,
                                       dilate=replace_stride_with_dilation[2])
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(512 * block.expansion, num_classes)

        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
            elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)

        # Zero-initialize the last BN in each residual branch,
        # so that the residual branch starts with zeros, and each residual block behaves like an identity.
        # This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677
        if zero_init_residual:
            for m in self.modules():
                if isinstance(m, Bottleneck):
                    nn.init.constant_(m.bn3.weight, 0)
                elif isinstance(m, BasicBlock):
                    nn.init.constant_(m.bn2.weight, 0)

    def _make_layer(self, block, planes, blocks, stride=1, dilate=False):
        norm_layer = self._norm_layer
        downsample = None
        previous_dilation = self.dilation
        if dilate:
            self.dilation *= stride
            stride = 1
        if stride != 1 or self.inplanes != planes * block.expansion:
            downsample = nn.Sequential(
                conv1x1(self.inplanes, planes * block.expansion, stride),
                norm_layer(planes * block.expansion),
            )

        layers = []
        layers.append(block(self.inplanes, planes, stride, downsample, self.groups,
                            self.base_width, previous_dilation, norm_layer))
        self.inplanes = planes * block.expansion
        for _ in range(1, blocks):
            layers.append(block(self.inplanes, planes, groups=self.groups,
                                base_width=self.base_width, dilation=self.dilation,
                                norm_layer=norm_layer))

        return nn.Sequential(*layers)

    def _forward_impl(self, x):
        # See note [TorchScript super()]
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)

        x = self.avgpool(x)
        x = torch.flatten(x, 1)
        x = self.fc(x)

        return x

    def forward(self, x):
        return self._forward_impl(x)



## Task 3. Train Model
Implement training and evaluation code for your model. Choose an appropriate loss function and evaluate the model on the validation set using classification accuracy. You are not allowed to use a pre-trained model (must train from scratch on the provided data).<br>
<font color='red'>Your model should achieve an accuracy of at least 40.0% on the validation set (Model with performance smaller than 40.0% will result in 0 points for this task).</font><br>

In [3]:
def train_epoch(model, train_dataloader, optimizer, loss_fn):
    losses = []
    correct_predictions = 0
    # Iterate mini batches over training dataset
    for images, labels in tqdm(train_dataloader):
        images = images.to(device)
        labels = labels.to(device)
        # Run predictions
        output = model(images)
        # Set gradients to zero
        optimizer.zero_grad()
        # Compute loss
        loss = loss_fn(output, labels)
        # Backpropagate (compute gradients)
        loss.backward()
        # Make an optimization step (update parameters)
        optimizer.step()
        # Log metrics
        losses.append(loss.item())
        predicted_labels = output.argmax(dim=1)
        correct_predictions += (predicted_labels == labels).sum().item()
    accuracy = 100.0 * correct_predictions / len(train_dataloader.dataset)
    # Return loss values for each iteration and accuracy
    mean_loss = np.array(losses).mean()
    return mean_loss, accuracy

def evaluate(model, dataloader, loss_fn):
    losses = []
    correct_predictions = 0
    with torch.no_grad():
        for images, labels in dataloader:
            images = images.to(device)
            labels = labels.to(device)
            # Run predictions
            output = model(images)
            # Compute loss
            loss = loss_fn(output, labels)
            # Save metrics
            predicted_labels = output.argmax(dim=1)
            correct_predictions += (predicted_labels == labels).sum().item()
            losses.append(loss.item())
    mean_loss = np.array(losses).mean()
    accuracy = 100.0 * correct_predictions / len(dataloader.dataset)
    # Return mean loss and accuracy
    return mean_loss, accuracy

def train(model, train_dataloader, val_dataloader, optimizer, n_epochs, loss_fn):
    # We will monitor loss functions as the training progresses
    train_losses = []
    val_losses = []
    train_accuracies = []
    val_accuracies = []

    for epoch in tqdm(range(n_epochs)):
        model.train()
        train_loss, train_accuracy = train_epoch(model, train_dataloader, optimizer, loss_fn)
        model.eval()
        val_loss, val_accuracy = evaluate(model, val_dataloader, loss_fn)
        train_losses.append(train_loss)
        val_losses.append(val_loss)
        train_accuracies.append(train_accuracy)
        val_accuracies.append(val_accuracy)
        print('Epoch {:02d}/{:02d}: train_loss: {:.4f}, train_accuracy: {:08.4f}, val_loss: {:.4f}, val_accuracy: {:.4f}'.format(epoch+1, n_epochs,
                                                                                                      train_losses[-1],
                                                                                                      train_accuracies[-1],
                                                                                                      val_losses[-1],
                                                                                                      val_accuracies[-1]))
    return train_losses, val_losses, train_accuracies, val_accuracies

def train_model(learning_rate,n_epochs,train_dataloader, val_dataloader, model, size=64*64*3):
    print('Run with ' + str(learning_rate) + ' learning rate and number of epochs: ' + str(n_epochs))
    model = model.to(device)
    loss_fn = torch.nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate)
    train_losses, val_losses, train_accuracies, val_accuracies = train(model, train_dataloader, val_dataloader, optimizer, n_epochs, loss_fn)

    return learning_rate,n_epochs, train_losses, val_losses, train_accuracies, val_accuracies, model, loss_fn

In [4]:
model = ConvNet(Bottleneck, [3, 4, 23, 3])
result_test_01_10 = train_model(0.1, 5, train_dataloader, val_dataloader, model)
result_test_001_20 = train_model(0.01, 5, train_dataloader, val_dataloader, model)
result_test_0001_30 = train_model(0.001, 5, train_dataloader, val_dataloader, model)

Run with 0.1 learning rate and number of epochs: 5


HBox(children=(FloatProgress(value=0.0, max=5.0), HTML(value='')))

HBox(children=(FloatProgress(value=0.0, max=938.0), HTML(value='')))


Epoch 01/05: train_loss: 4.5637, train_accuracy: 003.1850, val_loss: 8.3985, val_accuracy: 3.0179


HBox(children=(FloatProgress(value=0.0, max=938.0), HTML(value='')))


Epoch 02/05: train_loss: 4.0174, train_accuracy: 007.2217, val_loss: 3.9528, val_accuracy: 8.5212


HBox(children=(FloatProgress(value=0.0, max=938.0), HTML(value='')))


Epoch 03/05: train_loss: 3.8137, train_accuracy: 009.8717, val_loss: 4.1418, val_accuracy: 8.5346


HBox(children=(FloatProgress(value=0.0, max=938.0), HTML(value='')))

KeyboardInterrupt: 

In [None]:
def plot_results(train_results, image_size=(64,64,3), previous_results=None):

    learning_rate,n_epochs, train_losses, val_losses, train_accuracies, val_accuracies, model, loss_fn = train_results

    print('Results from ' + str(learning_rate) + ' learning rate and number of epochs: ' + str(n_epochs))

    parameters_t = list(model.parameters()).copy()

    maxVal = parameters_t[0].max()
    minVal = abs(parameters_t[0].min())
    maxVal = max(maxVal,minVal)
    parameters_t[0] = parameters_t[0] / maxVal
    parameters_t[0] = parameters_t[0] / 2
    parameters_t[0] = parameters_t[0] + 0.5

    fig, axs = plt.subplots(1, 10, figsize=(30, 3))
    for ax, w_index in zip(axs, range(0,10)):
        raw_weights = parameters_t[0][w_index].cpu().detach().numpy()
        image_size_c = image_size[0]*image_size[1]
        image_weights = np.empty(image_size)
        image_weights[..., 0] = raw_weights[:image_size_c*1].reshape((image_size[0],image_size[1]))
        image_weights[..., 1] = raw_weights[image_size_c*1:image_size_c*2].reshape((image_size[0],image_size[1]))
        image_weights[..., 2] = raw_weights[image_size_c*2:image_size_c*3].reshape((image_size[0],image_size[1]))
        ax.imshow(image_weights)
        ax.axis('off')
        ax.set_title(w_index + 1)

    fig.suptitle("Weights of each layer")
    fig.subplots_adjust(top=0.85)
    fig.patch.set_facecolor('xkcd:gray')      
    plt.show()
    
    fig, axs = plt.subplots(1, 2,figsize=(15, 4))
    if previous_results != None:
        p_learning_rate,p_n_epochs, p_train_losses, p_val_losses, p_train_accuracies, p_val_accuracies, p_model, p_loss_fn = previous_results
    
    axs[0].plot(np.arange(n_epochs), train_losses)
    axs[0].plot(np.arange(n_epochs), val_losses)
    if previous_results != None:
        axs[0].plot(np.arange(p_n_epochs), p_train_losses)
        axs[0].plot(np.arange(p_n_epochs), p_val_losses)
        axs[0].legend(['train_loss', 'val_loss', 'previous train_loss', 'previous val_loss'])
    else:
        axs[0].legend(['train_loss', 'val_loss'])
    axs[0].set_xlabel('epoch')
    axs[0].set_ylabel('loss value')
    axs[0].set_title('Train/val loss')

    axs[1].plot(np.arange(n_epochs), train_accuracies)
    axs[1].plot(np.arange(n_epochs), val_accuracies)
    if previous_results != None:
        axs[1].plot(np.arange(p_n_epochs), p_train_accuracies)
        axs[1].plot(np.arange(p_n_epochs), p_val_accuracies)
        axs[1].legend(['train_acc', 'val_acc','previous train_acc', 'previous val_acc'])
    else:
        axs[1].legend(['train_acc', 'val_acc'])
    axs[1].set_xlabel('epoch')
    axs[1].set_ylabel('accuracy')
    axs[1].set_title('Train/val accuracy')

    plt.show()

In [None]:
plot_results(result_test_01_10)
plot_results(result_test_001_20)
plot_results(result_test_0001_30)

## Task 4. Ablations
Try to find the best performing model by tuning the model design and hyper-parameters on the validation set. Perform ablation experiments to illustrate the effect of the most important hyper-parameters. Some examples of ablations: training parameters (e.g., optimizer, learning rates, batch size), network architecture (e.g., number of layers, number of units, activation function, normalization layers), model regularization (e.g., data augmentation, dropout, weight decay, early stopping), test-time augmentation, etc...  <br>**Perform at least 5 ablations and report the performance of each on the validation set.**

In [None]:
# write your code

## Task 5. Model Errors
Evaluate the trained model on the validation set and plot 10 random mistakes that your model made.

In [None]:
# write your code

## Task 6. Competition time!
Read the images from "data/test" folder. There are no labels for these images. Run your best model on these images and save the image IDs (names) and predicted label in a file LastName.csv. You will receive a link via email to upload the CSV file to  an online system which will give you the score of your model on the held-out test set. Top 5 students with at least 40% classification accuracy will obtain bonus points.

In [None]:
# write your code