In [1]:
import torch
import torchvision
import matplotlib.pyplot as plt
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
import numpy as np
from tqdm import tqdm

In this exercise we will create a simple convolutional network and manipulate it to discover the effect of
different hyper-parameters. We will implement different versions of the following network:

<p align = "center">
    <img src = "img/cnn.png">
</p>


On the diagram the sizes of the outputs of each layer are represented as follows: `d@hxw`. Where d, h and w
represent depth, height and width respectively. We will then use the function
`nn.Conv2d` to apply convolutions.

**Question 4.** Implement and train this network on the FashionMNIST dataset by
using a 5x5 sized kernel. Input images must be normalized. We
train with a learning rate of 1e-3 out of 5 epoch with the cost function
`nn.CrossEntropyLoss()`. Manipulate the padding and stride to get exactly the
output image sizes displayed on the diagram.
Note that to apply a dense (linear) layer on the output of a layer
convolutional it is necessary to change the dimension of the data. You can use
.view() for that.

In [2]:
fashion_train = torchvision\
    .datasets\
    .FashionMNIST(
        '.',
        train = True,
        transform= transforms.ToTensor(),
        download= True,
    )

fashion_test =  torchvision\
    .datasets\
    .FashionMNIST(
        '.',
        train = False,
        transform= transforms.ToTensor(),
        download= True,
    )


In [3]:
fashion_train

Dataset FashionMNIST
    Number of datapoints: 60000
    Root location: .
    Split: Train
    StandardTransform
Transform: ToTensor()

In [4]:
train_loader = DataLoader(
    fashion_train,
    batch_size = 64    
)

test_loader = DataLoader(
    fashion_test,
    batch_size=64
)

In [5]:
class MyFirstCNN(torch.nn.Module):
    def __init__(self,) -> None:
        super().__init__()

        self.layer1 = torch.nn.Sequential(
            torch.nn.Conv2d(
                in_channels=1,
                out_channels=4,
                kernel_size=5,
                stride= 1,
                padding=5
            ),
            torch.nn.MaxPool2d(
                kernel_size=7,
                stride=2
            )
        )

        self.layer2 = torch.nn.Sequential(
            torch.nn.Conv2d(
                in_channels=4,
                out_channels=8,
                kernel_size=5,
                stride= 1,
                padding=5
            ),
            torch.nn.MaxPool2d(
                kernel_size=2,
            )
        )

        self.layer3 = torch.nn.Conv2d(
            8, 
            16,
            4
        )

        self.dense = torch.nn.Linear(
            16 * 7 * 7,
            10
        )

    def forward(self, 
                x :torch.Tensor):
        x = self.layer1(x)
        # print(f"Layer 1: {x.shape}")
        x = self.layer2(x)
        # print(f"Layer 2: {x.shape}")
        x = self.layer3(x)
        # print(f"Layer 3: {x.shape}")
        x = x.view(x.size(0), -1)
        output = self.dense(x)

        return output    

In [6]:
cnn = MyFirstCNN()
loss_fn = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(
    cnn.parameters(),
    lr = 1e-3
)

In [137]:
n_epochs= 5
for epoch in tqdm(range(n_epochs)):
    for X_batch, y_batch in train_loader:

        y_pred = cnn(X_batch)

        loss = loss_fn(y_pred, y_batch)
        
        optimizer.zero_grad()
        
        loss.backward()
        optimizer.step()

100%|██████████| 5/5 [01:44<00:00, 20.97s/it]


In [150]:
success_rate = torch.tensor([])

with torch.no_grad():
    for X_batch, y_batch in test_loader:

        y_pred = cnn(X_batch)

        success_rate = torch\
            .cat([
                success_rate, 
                y_pred.argmax(dim = 1) == y_batch
            ])

In [151]:
success_rate.sum()/success_rate.shape[0]

tensor(0.8614)

In [28]:
def train_model(model : torch.nn.Module,
                loader : DataLoader,
                optimizer : torch.optim,
                loss_fn: torch.Callable,
                n_epochs : int = 5):
    
    errors = torch.tensor([])
    
    for epoch in tqdm(range(n_epochs)):
        batch_error = torch.tensor([])
        for X_batch, y_batch in loader:

            y_pred = model(X_batch)

            loss = loss_fn(y_pred, y_batch)
            batch_error = torch.cat([batch_error, torch.tensor([loss.item()])])
            
            optimizer.zero_grad()
            
            loss.backward()
            optimizer.step()

        errors = torch.cat([errors, torch.tensor([batch_error.mean()])])


    return errors

In [29]:
def model_accuracy(loader : DataLoader,
                   model : torch.nn.Module)->float:
    success_rate = torch.tensor([])

    with torch.no_grad():
        for X_batch, y_batch in loader:

            y_pred = model(X_batch)

            success_rate = torch\
                .cat([
                    success_rate, 
                    y_pred.argmax(dim = 1) == y_batch
                ])
            
    return success_rate.sum()/success_rate.shape[0]

**Question 5.** Implement and train the same network using filters of
convolution half as deep (depth 2, 4, and 8 instead of 4, 8 and 16). What can you conclude?

In [173]:
class LessDimensionalCNN(torch.nn.Module):
    def __init__(self,) -> None:
        super().__init__()

        self.layer1 = torch.nn.Sequential(
            torch.nn.Conv2d(
                in_channels=1,
                out_channels=2,
                kernel_size=5,
                stride= 1,
                padding=5
            ),
            torch.nn.MaxPool2d(
                kernel_size=7,
                stride=2
            )
        )

        self.layer2 = torch.nn.Sequential(
            torch.nn.Conv2d(
                in_channels=2,
                out_channels=4,
                kernel_size=5,
                stride= 1,
                padding=5
            ),
            torch.nn.MaxPool2d(
                kernel_size=2,
            )
        )

        self.layer3 = torch.nn.Conv2d(
            4, 
            8,
            4
        )

        self.dense = torch.nn.Linear(
            8 * 7 * 7,
            10
        )

    def forward(self, 
                x :torch.Tensor):
        x = self.layer1(x)
        # print(f"Layer 1: {x.shape}")
        x = self.layer2(x)
        # print(f"Layer 2: {x.shape}")
        x = self.layer3(x)
        # print(f"Layer 3: {x.shape}")
        x = x.view(x.size(0), -1)
        output = self.dense(x)

        return output    

In [174]:
less_dim_cnn = LessDimensionalCNN()
loss_fn = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(
    less_dim_cnn.parameters(),
    lr = 1e-3
)

In [168]:
cost_train = train_model(
    less_dim_cnn, 
    train_loader, 
    optimizer,
    loss_fn
)

100%|██████████| 5/5 [01:30<00:00, 18.20s/it]


In [167]:
model_accuracy(test_loader, less_dim_cnn)

tensor(0.8443)

We can conclude that since we had less parameters to train, our performance was worst

**Question 6.** Implement and train the same network using filters of
narrower convolutions (of size 3x3 instead of 5x5). What can you conclude?

In [202]:
class Filter3CNN(torch.nn.Module):
    def __init__(self,) -> None:
        super().__init__()

        self.layer1 = torch.nn.Sequential(
            torch.nn.Conv2d(
                in_channels=1,
                out_channels=4,
                kernel_size=3,
                stride= 1,
                padding=4
            ),
            torch.nn.MaxPool2d(
                kernel_size=7,
                stride=2
            )
        )

        self.layer2 = torch.nn.Sequential(
            torch.nn.Conv2d(
                in_channels=4,
                out_channels=8,
                kernel_size=3,
                stride= 1,
                padding=1
            ),
            torch.nn.MaxPool2d(
                kernel_size=2,
            )
        )

        self.layer3 = torch.nn.Conv2d(
            in_channels=8, 
            out_channels=16,
            kernel_size=3,
            padding=1
        )

        self.dense = torch.nn.Linear(
            16 * 7 * 7,
            10
        )

    def forward(self, 
                x :torch.Tensor):
        x = self.layer1(x)
        # print(f"Layer 1: {x.shape}")
        x = self.layer2(x)
        # print(f"Layer 2: {x.shape}")
        x = self.layer3(x)
        # print(f"Layer 3: {x.shape}")
        x = x.view(x.size(0), -1)
        output = self.dense(x)

        return output  

In [203]:
cnn_3_filter = Filter3CNN()
loss_fn = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(
    cnn_3_filter.parameters(),
    lr = 1e-4
)

In [209]:
history = train_model(
    cnn_3_filter,
    train_loader,
    optimizer,
    loss_fn
)

  0%|          | 0/5 [00:00<?, ?it/s]

 20%|██        | 1/5 [00:17<01:11, 17.81s/it]

Epoch: 0, loss : 0.5130930542945862


 40%|████      | 2/5 [00:38<00:59, 19.73s/it]

Epoch: 1, loss : 0.5016865730285645


 60%|██████    | 3/5 [01:00<00:40, 20.41s/it]

Epoch: 2, loss : 0.49180662631988525


 80%|████████  | 4/5 [01:20<00:20, 20.41s/it]

Epoch: 3, loss : 0.4831910729408264


100%|██████████| 5/5 [01:40<00:00, 20.19s/it]

Epoch: 4, loss : 0.47563838958740234





In [210]:
model_accuracy(
    test_loader,
    cnn_3_filter
)

tensor(0.8211)

## Influence of network depth
Increasing the depth of a network allows it to create features, intermediate outputs,
more and more abstract: thus we can increase the capacity of the network to recognize images while maintaining
the number of parameters in O(1) relative to the size of the input image.

**Question 7.** Calculate the number of parameters of the first network of the exercise and the
number of network settings below.

<p align = "center">
    <img src = "img/second_net.png">
</p>

In [22]:
def get_num_parameters(model : torch.nn.Module):
    model_parameters = filter(lambda p: p.requires_grad, model.parameters())
    params = sum([np.prod(p.size()) for p in model_parameters])

    return params

In [10]:
get_num_parameters(cnn)

10826

In [31]:
class OneHiddenLayerCNN(torch.nn.Module):
    def __init__(self, ) -> None:
        super().__init__()
        self.conv = torch.nn.Conv2d(
            in_channels=1,
            out_channels=4,
            kernel_size=1
        )

        self.dense = torch.nn.Linear(
            in_features=4 * 28* 28,
            out_features=10
        )

    def forward(self, x : torch.Tensor):
        x = self.conv(x)
        x = x.view(x.size(0), -1)
        output = self.dense(x)

        return output


In [24]:
one_hidden_layer_cnn = OneHiddenLayerCNN()

In [25]:
get_num_parameters(one_hidden_layer_cnn)

31378

**Question 8.** Implement and train this network.

In [36]:
one_hidden_layer_cnn = OneHiddenLayerCNN()
loss_fn = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(
    one_hidden_layer_cnn.parameters(),
    lr = 1e-4
)

In [38]:
history = train_model(one_hidden_layer_cnn,
                      train_loader,
                      optimizer,
                      loss_fn)

100%|██████████| 5/5 [00:56<00:00, 11.20s/it]


In [39]:
model_accuracy(test_loader,
               one_hidden_layer_cnn)

tensor(0.8324)