# Building deep networks, step by step

Includes steps for building deep networks as well as normalization.

## Setup

In [1]:
import matplotlib.pyplot as plt
import torchvision
import torch
from PIL import Image

In [2]:
# target size that every image is resized to by the transform below
size = (128, 128)

In [3]:
# Preprocessing applied to every image as it is loaded: resize it to `size`,
# then convert it to a PyTorch tensor.
transform = torchvision.transforms.Compose(
    [torchvision.transforms.Resize(size), torchvision.transforms.ToTensor()]
)

In [7]:
# Download both splits into ./flowers and keep them as torchvision dataset
# objects (their native format) rather than converting them to lists.
train_dataset = torchvision.datasets.Flowers102(
    root="./flowers", split="train", transform=transform, download=True
)
test_dataset = torchvision.datasets.Flowers102(
    root="./flowers", split="test", transform=transform, download=True
)

First, let's look at these datasets

In [8]:
# show the dataset summary (number of datapoints, root, split, transform)
train_dataset

Dataset Flowers102
    Number of datapoints: 1020
    Root location: ./flowers
    split=train
    StandardTransform
Transform: Compose(
               Resize(size=(128, 128), interpolation=bilinear, max_size=None, antialias=True)
               ToTensor()
           )

These are "container" objects that implement `__len__()` as well as `__getitem__()`. When we call `__getitem__()` (i.e., if we index it) the image will be loaded from disk and converted to a PyTorch tensor.

## Creating a dataloader

We will now group the images into "minibatches" using PyTorch's `DataLoader` class. That way, we can run stochastic gradient descent on batches of images instead of individual images.

In [11]:
# number of images per minibatch
batch_size = 64

In [12]:
# Training data is reshuffled every epoch so that each pass of stochastic
# gradient descent sees the minibatches in a different order.
train_loader = torch.utils.data.DataLoader(
    train_dataset, batch_size=batch_size, shuffle=True
)
# Evaluation does not depend on sample order, so the test split is read in a
# fixed order; this keeps per-batch results reproducible between runs.
test_loader = torch.utils.data.DataLoader(
    test_dataset, batch_size=batch_size, shuffle=False
)

We can iterate over the `train_loader` to get each batch.

In [13]:
# start a pass over the loader and pull out only its first minibatch
single_batch = next(iter(train_loader))

Each batch in `train_loader` contains both the images and the labels.

In [17]:
# a batch is a 2-element container: the stacked images and their labels
images, labels = single_batch

# container type and length first ...
print(type(single_batch))
print(len(single_batch))
# ... then the shape of each of the two tensors
print(images.shape)
print(labels.shape)

<class 'list'>
2
torch.Size([64, 3, 128, 128])
torch.Size([64])


In [18]:
# re-instantiate the train_loader
train_loader = torch.utils.data.DataLoader(
    train_dataset, batch_size=batch_size, shuffle=True
)

## Defining the model

First, we set up our model

In [21]:
class MyModel(torch.nn.Module):
    """Fully connected classifier: Flatten -> (Linear -> ReLU) per hidden layer -> Linear.

    Args:
        layer_size: Widths of the hidden layers, in order. An empty sequence
            gives a single linear map from the flattened input to the logits.
        in_features: Number of values per input once it has been flattened.
            Defaults to 128 * 128 * 3 (a 128 x 128 image with 3 channels).
        num_classes: Number of output logits. Defaults to 102, the number of
            classes in the dataset used here.
    """

    # The default for `layer_size` is a tuple, not a list: a default value is
    # created once and shared by every call, so it should not be mutable.
    def __init__(
        self,
        layer_size=(512, 512, 512),
        *,
        in_features: int = 128 * 128 * 3,
        num_classes: int = 102,
    ) -> None:
        super().__init__()
        layers = []
        # for images, we have 3 channels (RGB) and so we need to flatten the
        # image to feed it to the first layer. This is done using the Flatten
        # layer.
        layers.append(torch.nn.Flatten())
        c = in_features
        for s in layer_size:
            # for each layer, we add a Linear layer followed by a ReLU activation
            # We map the initial dimensions (c) to the size of the layer (s)
            # and then update c to s for the next iteration.

            # for example, if we have layer_size = [512, 512, 512], then the
            # layers will be as follows:
            # Linear(128*128*3, 512)
            # ReLU()
            # Linear(512, 512)
            # ReLU()
            # Linear(512, 512)
            # ReLU()

            layers.append(torch.nn.Linear(c, s))
            layers.append(torch.nn.ReLU())
            c = s
        # finally, we add the output layer which maps the last layer to the
        # number of classes in the dataset.
        layers.append(torch.nn.Linear(c, num_classes))

        # we use Sequential to stack the layers in order.
        self.model = torch.nn.Sequential(*layers)

    def forward(self, x):
        """Return the class logits for a batch `x`; output is (batch, num_classes)."""
        return self.model(x)

In [22]:
# instantiate the network with the default hidden sizes (three layers of 512)
model = MyModel()

We also need to set up the other components of our model:

In [31]:
# hyperparameters
epochs = 100
lr = 1e-4
momentum = 0.9

# objective: cross-entropy between the predicted logits and the integer labels
loss_fn = torch.nn.CrossEntropyLoss()
# optimizer: SGD with momentum over all of the model's parameters
optim = torch.optim.SGD(params=model.parameters(), lr=lr, momentum=momentum)

## Setting up our training loop

First, let's see if our logic works for 1 batch:

In [26]:
# Smoke test: push a single minibatch through the model and print the shapes.
# Nothing is trained here, so autograd tracking is switched off to avoid
# building a graph that would never be used.
with torch.no_grad():
    for imgs, labels in train_loader:
        pred = model(imgs)
        print(f"imgs shape: {imgs.shape}\n labels shape: {labels.shape}\n pred shape: {pred.shape}")
        # one batch is enough to check the shapes
        break

imgs shape: torch.Size([64, 3, 128, 128])
 labels shape: torch.Size([64])
 pred shape: torch.Size([64, 102])


We can break down the shapes as follows:
- We have 64 images, each (3 x 128 x 128)
- We have 64 labels
- We have 64 preds, each with 102 classes.

## Running the training loop across all minibatches (one epoch)

Now let's set up our training loop. First, let's do just one epoch's worth.

In [27]:
# one pass over the training set: one SGD step per minibatch
for imgs, labels in train_loader:
    # clear the gradients left over from the previous step
    optim.zero_grad()

    # forward pass and loss for this minibatch
    pred = model(imgs)
    loss_value = loss_fn(pred, labels)

    # backward pass, then update the parameters
    loss_value.backward()
    optim.step()

    print(f"Loss: {loss_value.item()}")

Loss: 4.625837802886963
Loss: 4.6275835037231445
Loss: 4.633312702178955
Loss: 4.629975318908691
Loss: 4.627366065979004
Loss: 4.621416091918945
Loss: 4.623439788818359
Loss: 4.6265106201171875
Loss: 4.621596336364746
Loss: 4.6210479736328125
Loss: 4.631072044372559
Loss: 4.6187543869018555
Loss: 4.625478267669678
Loss: 4.624572277069092
Loss: 4.628574371337891
Loss: 4.6273674964904785


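Within a single epoch the loss barely moves: it stays close to $\ln(102) \approx 4.625$, which is what a classifier guessing uniformly among the 102 classes would score. Any downward trend is still hidden by batch-to-batch noise, so we need to train for more epochs.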

## Running the training loop across epochs.

Let's now run this for multiple epochs. For this case, let's encapsulate the previous logic in an outer loop across epochs.

In [30]:
# number of full passes over the training set
epochs = 10

In [32]:
# outer loop: epochs; inner loop: one SGD step per minibatch
for epoch in range(epochs):
    # per-batch losses of this epoch, averaged once the epoch is done
    losses = []
    for imgs, labels in train_loader:
        # clear the gradients left over from the previous step
        optim.zero_grad()

        pred = model(imgs)
        loss_value = loss_fn(pred, labels)

        loss_value.backward()
        optim.step()

        losses.append(loss_value.item())

    # unweighted mean of the batch losses
    average_loss = sum(losses) / len(losses)
    print(f"Epoch {epoch}, Loss: {average_loss}")

Epoch 0, Loss: 4.603039115667343
Epoch 1, Loss: 4.602484464645386
Epoch 2, Loss: 4.601881712675095
Epoch 3, Loss: 4.6011567413806915
Epoch 4, Loss: 4.600630074739456
Epoch 5, Loss: 4.599941849708557
Epoch 6, Loss: 4.5993668138980865
Epoch 7, Loss: 4.598799079656601
Epoch 8, Loss: 4.598156690597534
Epoch 9, Loss: 4.597478151321411
Epoch 10, Loss: 4.596847593784332
Epoch 11, Loss: 4.59621199965477
Epoch 12, Loss: 4.595578491687775
Epoch 13, Loss: 4.594901472330093
Epoch 14, Loss: 4.5943571627140045
Epoch 15, Loss: 4.593642294406891
Epoch 16, Loss: 4.5928977727890015
Epoch 17, Loss: 4.592243075370789
Epoch 18, Loss: 4.591604918241501
Epoch 19, Loss: 4.590942770242691
Epoch 20, Loss: 4.5902363657951355
Epoch 21, Loss: 4.589542418718338
Epoch 22, Loss: 4.588779300451279
Epoch 23, Loss: 4.58808708190918
Epoch 24, Loss: 4.587442874908447
Epoch 25, Loss: 4.586704194545746
Epoch 26, Loss: 4.585962802171707
Epoch 27, Loss: 4.58517724275589
Epoch 28, Loss: 4.584380745887756
Epoch 29, Loss: 4.5836

This would run faster if we moved the model and each batch of images and labels onto a GPU.

## Generic form of a network

Below is an example of a generic form of a network. We'll use a simple example, independent of the image sample above.

In [33]:
class Net(torch.nn.Module):
    """Minimal two-layer network: 1 input -> 10 hidden units (ReLU) -> 1 output."""

    def __init__(self):
        super().__init__()
        # two affine maps, registered in the order in which forward() applies them
        self.fc1 = torch.nn.Linear(1, 10)
        self.fc2 = torch.nn.Linear(10, 1)
        # nonlinearity applied between the two
        self.act = torch.nn.ReLU()

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Apply fc1, the ReLU and fc2 to `x`, whose last dimension must be 1."""
        hidden = self.act(self.fc1(x))
        return self.fc2(hidden)

In [34]:
# build the small example network
net = Net()

In [35]:
# forward pass on a single one-feature input; the result is a one-element tensor
net(torch.tensor([1.0]))

tensor([-0.7369], grad_fn=<ViewBackward0>)

## Normalizations

We can now take this and build on it to add normalization.

We consider three main types of normalization:
- **Layer normalization**: normalizing across all the features of a single input. For an image, this means collapsing across height, width, and all channels. This means that for a batch of size $n$, there are $n$ means and $n$ standard deviations to normalize by.
- **Group normalization**: Similar to layer normalization, but you normalize across a group of channels instead of all channels. This should result in a mean tensor of shape $n \times (c / g)$, where $c$ = # of channels (e.g., 3 in images) and $g$ = group size (the number of channels per group), so $c / g$ is the number of groups.
- **Batch normalization**: normalizing across each channel in a batch. For color images (which have 3 channels), this would result in a $3 \times 1$ tensor for the mean and another for the standard deviation, with one entry for each channel.

We can add the normalization either before the nonlinearity or after the nonlinearity:
- **Before the nonlinearity**: a little tougher since you need to add in a learnable bias and weight after the normalization step. This is because the nonlinearity can remove any signal from the Gaussian output of the normalization centered around 0. This is most evident in a ReLU, where anything less than 0 is filtered out, so therefore any negative values (of which 50% are negative in a Gaussian) will be removed. However, empirically it does have marginally better results than doing it after and so in practice, a lot of people add it before the nonlinearity.
- **After the nonlinearity**: easier to implement, since no additional learned weight and bias is needed.

In practice, layer normalization is what should be used, unless you're looking at images (in which case, batch normalization can be used).

Let's take the same model as before. We have the model below, which has a total of 4 layers (3 hidden, 1 output).

In [36]:
class MyModel(torch.nn.Module):
    """Fully connected classifier without normalization.

    The stack is Flatten, then Linear -> ReLU for every hidden layer, then a
    final Linear layer that produces the logits.

    Args:
        layer_size: Widths of the hidden layers, in order. An empty sequence
            gives a single linear map from the flattened input to the logits.
        in_features: Number of values per input once it has been flattened.
            Defaults to 128 * 128 * 3.
        num_classes: Number of output logits. Defaults to 102.
    """

    # The default for `layer_size` is a tuple, not a list: a default value is
    # created once and shared by every call, so it should not be mutable.
    def __init__(
        self,
        layer_size=(512, 512, 512),
        *,
        in_features: int = 128 * 128 * 3,
        num_classes: int = 102,
    ) -> None:
        super().__init__()
        layers = []
        layers.append(torch.nn.Flatten())
        # c tracks the input width of the next Linear layer
        c = in_features
        for s in layer_size:
            layers.append(torch.nn.Linear(c, s))
            layers.append(torch.nn.ReLU())
            c = s
        layers.append(torch.nn.Linear(c, num_classes))

        # we use Sequential to stack the layers in order.
        self.model = torch.nn.Sequential(*layers)

    def forward(self, x):
        """Return the class logits for a batch `x`; output is (batch, num_classes)."""
        return self.model(x)

In [37]:
# instantiate with the default hidden sizes (three layers of 512)
net = MyModel()

Let's try this for various sizes of the network.

In [40]:
# one random batch of 10 image-shaped inputs, reused for every network below
x = torch.randn(10, 3, 128, 128)
# Build freshly initialized networks with 0 to 4 hidden layers of width 512
# and print the norm of each network's output on that same batch.
net0 = MyModel([])
print(f"{net0(x).norm()=}")
net1 = MyModel([512])
print(f"{net1(x).norm()=}")
net2 = MyModel([512, 512])
print(f"{net2(x).norm()=}")
net3 = MyModel([512, 512, 512])
print(f"{net3(x).norm()=}")
net4 = MyModel([512, 512, 512, 512])
print(f"{net4(x).norm()=}")

net0(x).norm()=tensor(19.1381, grad_fn=<LinalgVectorNormBackward0>)
net1(x).norm()=tensor(7.6016, grad_fn=<LinalgVectorNormBackward0>)
net2(x).norm()=tensor(3.2654, grad_fn=<LinalgVectorNormBackward0>)
net3(x).norm()=tensor(1.4725, grad_fn=<LinalgVectorNormBackward0>)
net4(x).norm()=tensor(1.0034, grad_fn=<LinalgVectorNormBackward0>)


We observe that the norm of the output shrinks as we add layers: the activations are vanishing, which goes hand in hand with the vanishing-gradient problem, since gradients have to flow back through the same layers. The norm eventually levels off instead of going to zero, but only because the bias terms keep some values nonzero. We can check this empirically by setting `torch.nn.Linear(c, s, bias=False)` and `torch.nn.Linear(c, 102, bias=False)` in order to remove the bias terms and then rerunning our experiment.

### Adding batch normalization

Let's now update our model definition to add in batch normalization

In [41]:
class MyModel(torch.nn.Module):
    """Fully connected classifier with batch normalization before each ReLU.

    The stack is Flatten, then Linear -> BatchNorm1d -> ReLU for every hidden
    layer, then a final Linear layer that produces the logits.

    Args:
        layer_size: Widths of the hidden layers, in order. An empty sequence
            gives a single linear map from the flattened input to the logits.
        in_features: Number of values per input once it has been flattened.
            Defaults to 128 * 128 * 3.
        num_classes: Number of output logits. Defaults to 102.
    """

    # The default for `layer_size` is a tuple, not a list: a default value is
    # created once and shared by every call, so it should not be mutable.
    def __init__(
        self,
        layer_size=(512, 512, 512),
        *,
        in_features: int = 128 * 128 * 3,
        num_classes: int = 102,
    ) -> None:
        super().__init__()
        layers = []
        layers.append(torch.nn.Flatten())
        # c tracks the input width of the next Linear layer
        c = in_features
        for s in layer_size:
            layers.append(torch.nn.Linear(c, s))
            # normalize each of the s features across the batch, before the
            # nonlinearity
            layers.append(torch.nn.BatchNorm1d(s))
            layers.append(torch.nn.ReLU())
            c = s
        layers.append(torch.nn.Linear(c, num_classes))

        # we use Sequential to stack the layers in order.
        self.model = torch.nn.Sequential(*layers)

    def forward(self, x):
        """Return the class logits for a batch `x`; output is (batch, num_classes)."""
        return self.model(x)

Let's now create an arbitrarily large series of networks.

In [42]:
# one random batch of 10 image-shaped inputs, reused for every depth
x = torch.randn(10, 3, 128, 128)
# depths 0-9: build a fresh model with n hidden layers of width 512 and print
# the norm of its output.
# NOTE: a newly constructed module is in training mode, so BatchNorm1d
# normalizes with the statistics of this 10-sample batch.
for n in range(10):
    netn = MyModel([512] * n)
    print(f"{netn(x).norm()=}")

netn(x).norm()=tensor(18.3021, grad_fn=<LinalgVectorNormBackward0>)
netn(x).norm()=tensor(12.8984, grad_fn=<LinalgVectorNormBackward0>)
netn(x).norm()=tensor(13.4140, grad_fn=<LinalgVectorNormBackward0>)
netn(x).norm()=tensor(13.6285, grad_fn=<LinalgVectorNormBackward0>)
netn(x).norm()=tensor(12.9244, grad_fn=<LinalgVectorNormBackward0>)
netn(x).norm()=tensor(13.7696, grad_fn=<LinalgVectorNormBackward0>)
netn(x).norm()=tensor(13.3852, grad_fn=<LinalgVectorNormBackward0>)
netn(x).norm()=tensor(13.0314, grad_fn=<LinalgVectorNormBackward0>)
netn(x).norm()=tensor(13.3189, grad_fn=<LinalgVectorNormBackward0>)
netn(x).norm()=tensor(13.1343, grad_fn=<LinalgVectorNormBackward0>)


As we see, the norms of the outputs stay roughly constant as the network gets deeper, thanks to our normalization.

### Adding layer normalization

We can also do this with layer normalization

In [43]:
class MyModel(torch.nn.Module):
    """Fully connected classifier with layer normalization before each ReLU.

    The stack is Flatten, then Linear -> LayerNorm -> ReLU for every hidden
    layer, then a final Linear layer that produces the logits.

    Args:
        layer_size: Widths of the hidden layers, in order. An empty sequence
            gives a single linear map from the flattened input to the logits.
        in_features: Number of values per input once it has been flattened.
            Defaults to 128 * 128 * 3.
        num_classes: Number of output logits. Defaults to 102.
    """

    # The default for `layer_size` is a tuple, not a list: a default value is
    # created once and shared by every call, so it should not be mutable.
    def __init__(
        self,
        layer_size=(512, 512, 512),
        *,
        in_features: int = 128 * 128 * 3,
        num_classes: int = 102,
    ) -> None:
        super().__init__()
        layers = []
        layers.append(torch.nn.Flatten())
        # c tracks the input width of the next Linear layer
        c = in_features
        for s in layer_size:
            layers.append(torch.nn.Linear(c, s))
            # normalize the s features of each sample, before the nonlinearity
            layers.append(torch.nn.LayerNorm(s))
            layers.append(torch.nn.ReLU())
            c = s
        layers.append(torch.nn.Linear(c, num_classes))

        # we use Sequential to stack the layers in order.
        self.model = torch.nn.Sequential(*layers)

    def forward(self, x):
        """Return the class logits for a batch `x`; output is (batch, num_classes)."""
        return self.model(x)

In [44]:
# one random batch of 10 image-shaped inputs, reused for every depth
x = torch.randn(10, 3, 128, 128)
# depths 0-9: build a fresh model with n hidden layers of width 512 and print
# the norm of its output
for n in range(10):
    netn = MyModel([512] * n)
    print(f"{netn(x).norm()=}")

netn(x).norm()=tensor(18.6853, grad_fn=<LinalgVectorNormBackward0>)
netn(x).norm()=tensor(12.6818, grad_fn=<LinalgVectorNormBackward0>)
netn(x).norm()=tensor(12.6936, grad_fn=<LinalgVectorNormBackward0>)
netn(x).norm()=tensor(12.8435, grad_fn=<LinalgVectorNormBackward0>)
netn(x).norm()=tensor(13.4358, grad_fn=<LinalgVectorNormBackward0>)
netn(x).norm()=tensor(12.7709, grad_fn=<LinalgVectorNormBackward0>)
netn(x).norm()=tensor(13.6983, grad_fn=<LinalgVectorNormBackward0>)
netn(x).norm()=tensor(12.0943, grad_fn=<LinalgVectorNormBackward0>)
netn(x).norm()=tensor(12.0792, grad_fn=<LinalgVectorNormBackward0>)
netn(x).norm()=tensor(12.8844, grad_fn=<LinalgVectorNormBackward0>)


We see a similar effect with layer normalization.