# Tutorial 07 - Convolutional Neural Networks

## Cats vs Dogs
Dataset can be downloaded from http://files.fast.ai/data/dogscats.zip  
Classification to 2 classes  
Basic setup as before:

In [1]:
import numpy as np
import torch
import torch.nn as nn
# Pick the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
# Report CUDA availability and the number of visible CUDA devices.
print("Device: ", torch.cuda.is_available(), " Count: ", torch.cuda.device_count(), sep="")

Device: True Count: 1


In [2]:
from torchvision.datasets import ImageFolder
from torchvision.transforms import Resize, ToTensor, Normalize, Compose
# Location of the training split and the spatial size every image is resized to.
root_dir = '../data/dogscats/train'
target_size = (32, 32)

# Preprocessing pipeline applied to each image as it is loaded.
transforms = Compose([
    Resize(target_size),  # resize the image to target_size
    ToTensor(),           # to float Tensor, scaled from [0, 255] int to [0, 1]
    Normalize(mean=(0.5, 0.5, 0.5), std=(0.5, 0.5, 0.5)),  # (x - 0.5) / 0.5 -> [-1.0, 1.0]
])

train_dataset = ImageFolder(root=root_dir, transform=transforms)

In [3]:
# Display the number of samples in the training dataset.
len(train_dataset)

23000

In [4]:
from torch.utils.data import DataLoader
# Mini-batch size; the validation loader reuses this value.
batch_size = 32
# Shuffled loader over the training set.
# NOTE(review): 32 worker processes is hard-coded here while the validation
# loader uses 4 — confirm this suits the machine the notebook runs on.
train_dataloader = DataLoader(
    dataset=train_dataset,
    batch_size=batch_size,
    shuffle=True,
    num_workers=32,
)

In [5]:
# Validation split: same preprocessing and batch size as training, no shuffling.
val_root_dir = '../data/dogscats/valid'
val_dataset = ImageFolder(root=val_root_dir, transform=transforms)
val_dataloader = DataLoader(
    dataset=val_dataset,
    batch_size=batch_size,
    shuffle=False,
    num_workers=4,
)
print(len(val_dataset))

2000


print(len(val_dataset))

# Baseline - MLP model
### We will compare MLP and ConvNets in terms of accuracy, number of parameters, etc.

from utils_train import fit

In [6]:
from utils_train import fit
class MLPModel(nn.Module):
    """Fully connected baseline classifier with three hidden ReLU layers.

    Args:
        input_dim: Number of features per sample after flattening
            (e.g. 32*32*3 for a 32x32 RGB image).
        hidden_dim: Width of each of the three hidden layers.

    The final linear layer produces 2 outputs (one logit per class).
    """

    def __init__(self, input_dim, hidden_dim):
        super(MLPModel, self).__init__()
        self.layers = nn.Sequential(
            nn.Linear(input_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, hidden_dim),
            nn.ReLU(),
            nn.Linear(hidden_dim, 2)
        )

    def forward(self, input):
        """Flatten each sample to a vector and run it through the MLP.

        Args:
            input: Tensor whose first dimension is the batch; all remaining
                dimensions are flattened and must multiply to ``input_dim``.

        Returns:
            Tensor of shape (batch_size, 2).
        """
        # flatten() gives the same result as view(N, -1) on contiguous input,
        # but also accepts non-contiguous tensors (e.g. permuted or
        # channels_last), for which view() raises a RuntimeError.
        input = input.flatten(start_dim=1)
        return self.layers(input)

In [7]:
# Hyperparameters for the MLP baseline run.
learning_rate = 0.001
n_epochs = 25

# 3 * 32 * 32 flattened input features, 128 hidden units; placed on the selected device.
model_mlp = MLPModel(3 * 32 * 32, 128).to(device)

# Cross-entropy objective, optimised with Adam over the MLP's parameters.
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model_mlp.parameters(), lr=learning_rate)

In [8]:
# Train the MLP baseline for n_epochs using the imported `fit` helper and keep
# its return value (presumably the training/validation curves — confirm in utils_train).
curves_mlp = fit(train_dataloader, val_dataloader, model_mlp, optimizer, loss_fn, n_epochs)

Epoch 1/25: train_loss: 0.6539, train_accuracy: 61.0435, val_loss: 0.6403, val_accuracy: 61.7500
Epoch 2/25: train_loss: 0.6190, train_accuracy: 65.3783, val_loss: 0.6265, val_accuracy: 64.9500
Epoch 3/25: train_loss: 0.5992, train_accuracy: 66.8826, val_loss: 0.6253, val_accuracy: 64.4000
Epoch 4/25: train_loss: 0.5770, train_accuracy: 69.3000, val_loss: 0.6311, val_accuracy: 63.7500
Epoch 5/25: train_loss: 0.5556, train_accuracy: 70.6783, val_loss: 0.6149, val_accuracy: 65.6000
Epoch 6/25: train_loss: 0.5344, train_accuracy: 72.5087, val_loss: 0.6336, val_accuracy: 66.8000
Epoch 7/25: train_loss: 0.5106, train_accuracy: 74.0478, val_loss: 0.6381, val_accuracy: 66.3500
Epoch 8/25: train_loss: 0.4819, train_accuracy: 76.0565, val_loss: 0.6669, val_accuracy: 66.7000
Epoch 9/25: train_loss: 0.4553, train_accuracy: 77.4478, val_loss: 0.6743, val_accuracy: 65.4500
Epoch 10/25: train_loss: 0.4271, train_accuracy: 79.1739, val_loss: 0.6525, val_accuracy: 67.1000
Epoch 11/25: train_loss: 0.39

### With some regularization the best accuracy we got before for this MLP model was ~68%

## Convolutional Neural Network
### Two convolutions with Max Pooling, followed by linear layer on flattened feature maps

In [9]:
import torch.nn as nn
    
class ConvModel(nn.Module):
    """Small ConvNet: two conv + ReLU + max-pool stages, then one linear classifier.

    Built for 3x32x32 inputs: the linear layer expects the 32x5x5 feature map
    that the convolutional stack produces for that input size, and maps it to
    2 class logits.
    """

    def __init__(self):
        super(ConvModel, self).__init__()
        self.conv_layers = nn.Sequential(
            # input size: 3x32x32
            nn.Conv2d(in_channels=3, out_channels=16, kernel_size=5, stride=1, padding=0),
            # output: 16x28x28 (because kernel_size=5 and no padding)
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            # output: 16x14x14
            nn.Conv2d(in_channels=16, out_channels=32, kernel_size=5, stride=1, padding=0),
            nn.ReLU(),
            # output: 32x10x10
            nn.MaxPool2d(kernel_size=2, stride=2),
            # output: 32x5x5
        )
        # map output of conv layers to number of outputs (2 classes in this case)
        self.linear_layer = nn.Linear(32*5*5, 2)

    def forward(self, input):
        """Compute class logits for a batch of images.

        Args:
            input: Tensor of shape batch_size x n_channels x im_height x im_width
                (3x32x32 per sample for the layer sizes used here).

        Returns:
            Tensor of shape (batch_size, 2).
        """
        # No flattening before the convolutions: they operate on the 4-D image batch.
        output = self.conv_layers(input)
        # Flatten to one vector per sample before the linear layer. flatten()
        # also handles feature maps that are not contiguous in memory (e.g. a
        # channels_last input), where view() raises a RuntimeError.
        output = output.flatten(start_dim=1)
        output = self.linear_layer(output)
        return output

In [10]:
# Hyperparameters for the first ConvNet run (same values as the MLP baseline).
learning_rate = 0.001
n_epochs = 25

# Fresh ConvModel placed on the selected device.
model_conv = ConvModel().to(device)

# Cross-entropy objective, optimised with Adam over the ConvNet's parameters.
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model_conv.parameters(), lr=learning_rate)

In [11]:
# Train the first ConvNet with the same `fit` helper and settings as the MLP,
# keeping the return value for later comparison.
# NOTE(review): the recorded output of this cell ends in KeyboardInterrupt, so
# curves_conv1 was not assigned in that run.
curves_conv1 = fit(train_dataloader, val_dataloader, model_conv, optimizer, loss_fn, n_epochs)

Epoch 1/25: train_loss: 0.6104, train_accuracy: 66.4652, val_loss: 0.5894, val_accuracy: 68.9000
Epoch 2/25: train_loss: 0.5417, train_accuracy: 72.8870, val_loss: 0.5232, val_accuracy: 74.4500
Epoch 3/25: train_loss: 0.5057, train_accuracy: 75.3348, val_loss: 0.5073, val_accuracy: 75.2500
Epoch 4/25: train_loss: 0.4715, train_accuracy: 77.4087, val_loss: 0.4845, val_accuracy: 77.0500
Epoch 5/25: train_loss: 0.4431, train_accuracy: 79.2696, val_loss: 0.4544, val_accuracy: 78.1000
Epoch 6/25: train_loss: 0.4210, train_accuracy: 80.6261, val_loss: 0.4382, val_accuracy: 78.8500
Epoch 7/25: train_loss: 0.4060, train_accuracy: 81.2565, val_loss: 0.4220, val_accuracy: 80.7000
Epoch 8/25: train_loss: 0.3896, train_accuracy: 82.1217, val_loss: 0.4485, val_accuracy: 78.4000
Epoch 9/25: train_loss: 0.3743, train_accuracy: 83.2174, val_loss: 0.4571, val_accuracy: 77.1000
Epoch 10/25: train_loss: 0.3646, train_accuracy: 83.4870, val_loss: 0.4361, val_accuracy: 79.8000
Epoch 11/25: train_loss: 0.35

KeyboardInterrupt: 

In [None]:
import torch.nn as nn
    
class ConvModel2(nn.Module):
    """ConvNet with global average pooling in front of the linear classifier.

    Same convolutional stack as ConvModel, but each of the 32 final feature
    maps is averaged down to a single value, so the linear layer only needs
    32 inputs. It maps them to 2 class logits.
    """

    def __init__(self):
        super(ConvModel2, self).__init__()
        self.conv_layers = nn.Sequential(
            # input size: 3x32x32
            nn.Conv2d(in_channels=3, out_channels=16, kernel_size=5, stride=1, padding=0),
            # output: 16x28x28 (because kernel_size=5 and no padding)
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2),
            # output: 16x14x14
            nn.Conv2d(in_channels=16, out_channels=32, kernel_size=5, stride=1, padding=0),
            nn.ReLU(),
            # output: 32x10x10
            nn.MaxPool2d(kernel_size=2, stride=2),
            # output: 32x5x5
            # Global average pooling to 32x1x1. For a 5x5 map this equals
            # AvgPool2d(kernel_size=5, stride=5), but the adaptive layer also
            # works for other input sizes without retuning the kernel size.
            nn.AdaptiveAvgPool2d(1)
        )
        self.linear_layer = nn.Linear(32, 2)

    def forward(self, input):
        """Compute class logits for a batch of images.

        Args:
            input: Tensor of shape batch_size x 3 x im_height x im_width; the
                image must be large enough to survive both conv/pool stages.

        Returns:
            Tensor of shape (batch_size, 2).
        """
        output = self.conv_layers(input)
        # (batch_size, 32, 1, 1) -> (batch_size, 32)
        output = output.flatten(start_dim=1)
        output = self.linear_layer(output)
        return output

In [None]:
# Hyperparameters for the second ConvNet run (same values as the earlier runs).
learning_rate = 0.001
n_epochs = 25

# Fresh ConvModel2 placed on the selected device.
model_conv2 = ConvModel2().to(device)

# Cross-entropy objective, optimised with Adam over this model's parameters.
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model_conv2.parameters(), lr=learning_rate)

In [None]:
# Train the average-pooling ConvNet with the same `fit` helper and settings,
# keeping the return value for later comparison.
curves_conv2 = fit(train_dataloader, val_dataloader, model_conv2, optimizer, loss_fn, n_epochs)

# Compare number of parameters

In [None]:
def count_parameters(model):
    """Return the total number of scalar values in the model's trainable parameters.

    Parameters with ``requires_grad`` set to False are not counted.
    """
    total = 0
    for param in model.parameters():
        if not param.requires_grad:
            continue
        total += param.numel()
    return total

In [None]:
# Report the trainable-parameter count of each of the three models.
print('MLP: {} parameters'.format(count_parameters(model=model_mlp)))
print('ConvNet1: {} parameters'.format(count_parameters(model=model_conv)))
print('ConvNet2: {} parameters'.format(count_parameters(model=model_conv2)))

__Model MLP__:
- 1st layer: (32 * 32 * 3)*128 + 128 (biases) __// input size dependent, 92% of parameters!__
- 2nd layer: 128 * 128 + 128
- 3rd layer: 128 * 128 + 128
- 4th layer: 128 * 2 + 2

__Model ConvNet1__:
- 1st layer (conv): 16 * 3 * 5 * 5 + 16 (biases)
- 2nd layer (conv): 32 * 16 * 5 * 5 + 32
- 3rd layer (linear): (32 * 5 * 5) * 2 + 2 __// input size dependent__ <- can grow very large for bigger input sizes and a large number of filters

__Model ConvNet2__:
- 1st layer (conv): 16 * 3 * 5 * 5 + 16 (biases)
- 2nd layer (conv): 32 * 16 * 5 * 5 + 32
- 3rd layer (linear): 32 * 2 + 2  
^ Independent of input size (average pool must be set accordingly)

# First conv layer filter visualization
## What does the network look for in the raw image?

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

### Example: AlexNet, trained on ImageNet dataset (>1 million images, 1000 classes) https://github.com/pytorch/vision/blob/master/torchvision/models/alexnet.py

In [None]:
import torchvision
# Build torchvision's AlexNet and request its pretrained weights.
# NOTE(review): newer torchvision releases deprecate `pretrained=` in favour of
# a `weights=` argument — confirm against the installed version.
alexnet = torchvision.models.alexnet(pretrained=True)

In [None]:
# Display the module structure of the loaded network.
alexnet

In [None]:
# Weights of the first module in alexnet.features, detached from autograd,
# moved to the CPU and converted to a NumPy array.
# Presumably laid out as (out_channels, in_channels, height, width) — the next cell shows the shape.
filters = alexnet.features[0].weight.detach().cpu().numpy()

In [None]:
# Display the shape of the extracted weight array.
filters.shape

In [None]:
# Move the channel axis last (axes 0, 2, 3, 1) so each filter can be passed to imshow.
# The array is re-derived from the model weights instead of transposing
# `filters` itself, so re-running this cell does not permute the axes again.
filters = alexnet.features[0].weight.detach().cpu().numpy().transpose(0, 2, 3, 1)

In [None]:
# Work on a copy so the in-place rescaling in the next cell leaves `filters` untouched.
filters_ = filters.copy()

In [None]:
# Min-max rescale each filter in place: shift its minimum to 0, then divide by
# its (shifted) maximum. Iterating the array yields views, so the in-place
# operations write straight into filters_.
for single_filter in filters_:
    single_filter -= single_filter.min()
    single_filter /= single_filter.max()

In [None]:

# Show the first 64 filters on an 8x8 grid, filled row by row, with axes hidden.
fig, axes = plt.subplots(8, 8, figsize=(10, 10))
for idx, ax in enumerate(axes.flat):
    ax.imshow(filters_[idx], cmap='gray')
    ax.axis('off')