In [1]:
from fastai.vision.all import *

  from .autonotebook import tqdm as notebook_tqdm


### Continue with some actual data

In [2]:
path = untar_data(URLs.MNIST_SAMPLE)

In [3]:
(path).ls()

(#3) [Path('/Users/lucasvanwalstijn/.fastai/data/mnist_sample/valid'),Path('/Users/lucasvanwalstijn/.fastai/data/mnist_sample/labels.csv'),Path('/Users/lucasvanwalstijn/.fastai/data/mnist_sample/train')]

In [4]:
# Recipe for turning the files under `path` into (image, label) pairs.
dblock = DataBlock(
    # inputs are opened via PILImageBW, targets are treated as categories
    blocks=[ImageBlock(cls=PILImageBW), CategoryBlock],
    get_items=get_image_files,
    # 'train' / 'valid' are the folder names shown by `path.ls()`
    splitter=GrandparentSplitter('train','valid'),
    # presumably the label is the name of the folder containing each image -- confirm in fastai docs
    get_y=parent_label,
    batch_tfms=Normalize()
)

In [5]:
dls = dblock.dataloaders(path, bs=64)

In [7]:
xb, yb = dls.one_batch()
xb.shape

torch.Size([64, 1, 28, 28])

In [10]:
xb.mean((2,3)).shape

torch.Size([64, 1])

The problem with our previous approach was that if we have bigger images, we need a lot of stride 2 convolutions to arrive finally at a set of activations in which the grid size collapsed to (1,1):

In [21]:
def conv_series(inp):
    """Print the grid sizes produced by repeatedly halving `inp` (rounding up).

    Starting from `inp`, the size is replaced by ceil(size / 2) until it
    reaches 1. The list of sizes and its length are printed on two lines;
    nothing is returned.

    Raises:
        ValueError: if `inp` is not positive, because halving would then
            never reach 1 and the loop would not terminate.
    """
    if inp <= 0:
        raise ValueError(f'inp must be positive, got {inp!r}')
    # Keep the name `s`: the f-string below echoes the expression text `len(s)=`.
    s = [inp]
    while inp != 1:
        # -(-a // 2) is ceil(a / 2) without going through float division,
        # so it stays exact for arbitrarily large integers.
        inp = int(-(-inp // 2))
        s.append(inp)
    print(s, f'{len(s)=}', sep='\n')

In [22]:
conv_series(28)

[28, 14, 7, 4, 2, 1]
len(s)=6


In [25]:
conv_series(256)

[256, 128, 64, 32, 16, 8, 4, 2, 1]
len(s)=9


In [26]:
conv_series(512)

[512, 256, 128, 64, 32, 16, 8, 4, 2, 1]
len(s)=10


From looking at the above outputs, that doesn't seem like a huge problem, but it's still a reason given by JH to not use this approach, perhaps also because of the following:

Also, the network wouldn't be able to deal with images of arbitrary input size, since a different image size would require a different number of conv layers to reduce the grid size to (1,1).

So what to do then? Well, we could just do a fixed amount of convs and end up with whatever we then have, flatten it out and stick it into a fully connected layer. 

This is what VGG did. But the problem there is that these last FC layers are enormous. For example, the first FC layer has around 100 million parameters:

![vgg](vgg.png)

In [28]:
# Number of weights (biases not counted) in a fully connected layer n_in -> n_out.
# 7*7*512 is presumably the flattened final feature map from the VGG figure -- confirm against vgg.png
n_in = 7*7*512
n_out = 4096

print('params: ', n_in*n_out)

params:  102760448


So instead let's use fully convolutional networks (no linear layers at the end) and instead use some feature reduction in the end. For example by using a simple:

In [31]:
def avg_pool(x):
    """Average `x` over its last two dimensions, dropping them from the result."""
    return x.mean(dim=(-2, -1))

In [32]:
avg_pool(xb).shape

torch.Size([64, 1])

So this just averages over the last 2 dimensions (the size of the "image") and thereby removes them. PyTorch has `AdaptiveAvgPool2d` for that purpose (it keeps the two dimensions, but with size 1):

In [37]:
ap = nn.AdaptiveAvgPool2d(1)

ap(xb).shape

torch.Size([64, 1, 1, 1])

In [39]:
# Or we specify some custom output size
ap = nn.AdaptiveAvgPool2d((4,2))

ap(xb).shape

torch.Size([64, 1, 4, 2])

In [40]:
# We saw a different Pooling layer before, the MaxPool2d, the "normal" one and the adaptive one do exactly the same thing, they just differ in the API:
# With AdaptivePooling you specify the output you want, with "normal" pooling you specify the stride and kernel size

In [41]:
xb.shape

torch.Size([64, 1, 28, 28])

In [42]:
mp = nn.MaxPool2d((2,2))

mp(xb).shape

torch.Size([64, 1, 14, 14])

In [43]:
amp = nn.AdaptiveMaxPool2d((14,14))

amp(xb).shape

torch.Size([64, 1, 14, 14])

In [44]:
torch.allclose(mp(xb), amp(xb))

True

What's of course nice about the adaptive one is that you can specify just the output size you need, regardless of the input size, which makes it great for working with input images of different sizes.

*Consider this question: would this approach makes sense for an optical character recognition (OCR) problem such as MNIST? The vast majority of practitioners tackling OCR and similar problems tend to use fully convolutional networks, because that's what nearly everybody learns nowadays. But it really doesn't make any sense! You can't decide, for instance, whether a number is a 3 or an 8 by slicing it into small pieces, jumbling them up, and deciding whether on average each piece looks like a 3 or an 8. But that's what adaptive average pooling effectively does! Fully convolutional networks are only really a good choice for objects that don't have a single correct orientation or size (e.g., like most natural photos).*

Link to [thread](https://forums.fast.ai/t/lesson-8-official-topic/97159/51?u=lucasvw) on the forums concerning conv sizes and VGG

### Resnets

Observation: a deeper model was training worse than a smaller model. The intuition that leads to ResNets:

- Let's wrap the smaller model inside a deeper model
- Let's start in a configuration in which the deeper model is exactly the same as the smaller model
- The model itself can learn to use the additional layers

But instead of first training a smaller model, then a larger:

- Create ResBlocks, with skip connections
- since now we have `y = x + block(x)` the block is kind of like learning the residual (difference) between y and x

In [197]:
# So this is the basic conv layer in FastAI:
ConvLayer(3, 64)

ConvLayer(
  (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
  (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (2): ReLU()
)

Order:

1. conv
2. batchnorm
3. activation

In [198]:
# This doesn't really work in general: the skip connection `x + self.convs(x)` needs matching shapes, so ni must equal no

class ResBlock(nn.Module):
    """Naive residual block: two stacked `ConvLayer`s plus a plain identity skip.

    `forward` adds the untouched input to the output of the conv path; there is
    no projection on the skip path.
    """

    def __init__(self, ni, no):
        super().__init__()
        first, second = ConvLayer(ni, no), ConvLayer(no, no)
        self.convs = nn.Sequential(first, second)

    def forward(self, x):
        residual = self.convs(x)
        return x + residual
        

In [199]:
def conv_block(ni, no, stride):
    """Build the conv path of a residual block as an `nn.Sequential`.

    The first `ConvLayer` maps `ni -> no` with the given `stride`; the second
    maps `no -> no` and is created with `act_cls=None`.
    """
    strided = ConvLayer(ni, no, stride=stride)
    plain = ConvLayer(no, no, act_cls=None)
    return nn.Sequential(strided, plain)

class ResBlock(nn.Module):
    """Residual block whose skip path can change channel count and grid size.

    Args:
        ni: number of input channels.
        no: number of output channels.
        stride: stride handed to `conv_block`; any value other than 1 also
            enables a 2x2 average pool on the skip path.

    Attributes:
        convs: the conv path built by `conv_block(ni, no, stride)`.
        idconv: `noop` when `ni == no`, otherwise a kernel-size-1 `ConvLayer`
            without activation that maps `ni -> no` channels.
        pool: `noop` when `stride == 1`, otherwise `nn.AvgPool2d(2, ceil_mode=True)`.
    """

    def __init__(self, ni, no, stride=1):
        super().__init__()

        self.convs = conv_block(ni, no, stride)
        self.idconv = noop if ni == no else ConvLayer(ni, no, 1, act_cls=None)
        self.pool = noop if stride==1 else nn.AvgPool2d(2, ceil_mode=True)

    def forward(self, x):
        # Pool first, then project: the 1x1 conv only has to run on the
        # already-downsampled grid instead of the full-resolution input.
        shortcut = self.idconv(self.pool(x))
        return F.relu(shortcut + self.convs(x))
        

In [200]:
rb = ResBlock(1, 4)
xb.shape, rb(xb).shape

(torch.Size([64, 1, 28, 28]), torch.Size([64, 4, 28, 28]))

In [201]:
rb

ResBlock(
  (convs): Sequential(
    (0): ConvLayer(
      (0): Conv2d(1, 4, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (1): BatchNorm2d(4, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU()
    )
    (1): ConvLayer(
      (0): Conv2d(4, 4, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (1): BatchNorm2d(4, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
  )
  (idconv): ConvLayer(
    (0): Conv2d(1, 4, kernel_size=(1, 1), stride=(1, 1), bias=False)
    (1): BatchNorm2d(4, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  )
)

In [202]:
rb = ResBlock(1, 4, stride=2)
xb.shape, rb(xb).shape

(torch.Size([64, 1, 28, 28]), torch.Size([64, 4, 14, 14]))

In [203]:
rb.idconv[0].weight.shape

torch.Size([4, 1, 1, 1])

In [204]:
# With 3 channel input:
rb = ResBlock(3, 8, stride=2)

In [205]:
rb.idconv[0].weight.shape

torch.Size([8, 3, 1, 1])

Passage from book:

*Changing the number of channels can be done by using a convolution. We want this skip connection to be as close to an identity map as possible, however, which means making this convolution as simple as possible. The simplest possible convolution is one where the kernel size is 1. That means that the kernel is size `ni*nf*1*1`, so it's only doing a dot product over the channels of each input pixel—it's not combining across pixels at all. This kind of *1x1 convolution* is very widely used in modern CNNs, so take a moment to think about how it works.*

Perhaps we should help it be as close as possible to an identity map by putting an identity matrix in the upper-left corner of the weights?!

In [206]:
# Overwrite the upper-left 3x3 corner of the 1x1 conv's weights with an identity
# matrix; done under no_grad so the in-place write is not tracked by autograd.
with torch.no_grad():
    rb.idconv[0].weight[0:3, 0:3, 0, 0] = torch.eye(3,3)

In [207]:
# Same identity initialisation on a fresh block, this time writing through
# `.data.copy_` on the sliced weight rather than using `torch.no_grad()`.
rb1 = ResBlock(3, 4, stride=2)
rb1.idconv[0].weight[0:3, 0:3, 0, 0].data.copy_(torch.eye(3,3))

tensor([[1., 0., 0.],
        [0., 1., 0.],
        [0., 0., 1.]])

In [236]:
rb.idconv[0].weight[:,:,0,0]

tensor([[ 1.0000,  0.0000,  0.0000],
        [ 0.0000,  1.0000,  0.0000],
        [ 0.0000,  0.0000,  1.0000],
        [-0.1280, -0.4397, -0.5567],
        [ 0.2435, -0.2283, -0.4996],
        [ 0.4334,  0.3092,  0.2609],
        [-0.5149,  0.3920, -0.3241],
        [ 0.5083,  0.0669,  0.0929]], grad_fn=<SelectBackward0>)

This is kind of interesting, a convolution with ks 1x1 is drilling through the depth of the grid-size. E.g. per location in the grid-size, it's multiplying this very matrix above with the inputs:

So let's say we have for a pixel location (0,0) 3 channels with values :

```
[0.4, 0.3, 0.5]
```

Then the output would be:


In [238]:
rb.idconv[0].weight[:,:,0,0] @ torch.tensor([0.4, 0.3, 0.5])

tensor([ 0.4000,  0.3000,  0.5000, -0.4614, -0.2209,  0.3965, -0.2504,  0.2698],
       grad_fn=<MvBackward0>)

So the first 3 channels are just "copied" over and the other 5 are mixes of the input channels according to the weights

Perhaps try out whether this actually helps convergence?!

In [209]:
def _resnet_stem(*sizes):
    """Return the stem as a list of layers.

    One `ConvLayer` is created per consecutive pair in `sizes`; only the first
    one uses stride 2, the rest use stride 1. A `MaxPool2d(3, stride=2,
    padding=1)` is appended at the end.
    """
    channel_pairs = zip(sizes[:-1], sizes[1:])
    layers = [
        ConvLayer(n_in, n_out, stride=2 if pos == 0 else 1)
        for pos, (n_in, n_out) in enumerate(channel_pairs)
    ]
    layers.append(nn.MaxPool2d(3, stride=2, padding=1))
    return layers
    

In [210]:
_resnet_stem(3, 32, 32, 64)

[ConvLayer(
   (0): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
   (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
   (2): ReLU()
 ),
 ConvLayer(
   (0): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
   (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
   (2): ReLU()
 ),
 ConvLayer(
   (0): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
   (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
   (2): ReLU()
 ),
 MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)]

`3*3*3 = 27` input values are getting mapped to `32` output values. This is a smaller increase than what VGG is doing (an increase to 64 channels), but it's still quite a large increase in my opinion, and according to the earlier passage in the book, 16 output channels would perhaps be better?

In [211]:
# Show the consecutive pairs obtained by slicing two elements at each index.
# Note that the slice taken at the last index contains a single element.
block_szs = [64, 64, 128, 256, 512]

for i,_ in enumerate(block_szs):
    print(block_szs[i:i+2])

[64, 64]
[64, 128]
[128, 256]
[256, 512]
[512]


In [212]:
class ResNet(nn.Sequential):
    """Sequential ResNet: stem, one stage per entry of `layers`, then a pooling + linear head.

    Args:
        n_out: number of outputs of the final `nn.Linear`.
        layers: number of `ResBlock`s in each stage.
        expansion: multiplier applied to every stage width except the first
            entry of `block_szs`.
    """

    def __init__(self, n_out, layers, expansion=1):
        base_szs = [64, 64, 128, 256, 512]
        # Index 0 stays unscaled; the remaining four widths are multiplied by `expansion`.
        self.block_szs = base_szs[:1] + [sz * expansion for sz in base_szs[1:]]

        stem = _resnet_stem(3, 32, 32, 64)
        blocks = [self._make_resblock(idx, n_layers) for idx, n_layers in enumerate(layers)]
        head = [
            nn.AdaptiveAvgPool2d(1),
            Flatten(),
            nn.Linear(self.block_szs[-1], n_out),
        ]

        super().__init__(*stem, *blocks, *head)

    def _make_resblock(self, idx, n_layers):
        """Build stage `idx` as an `nn.Sequential` of `n_layers` `ResBlock`s."""
        ch_in, ch_out = self.block_szs[idx:idx+2]
        # Only stages after the first downsample, and only in their first block.
        first_stride = 1 if idx == 0 else 2
        stage = []
        for i in range(n_layers):
            is_first = i == 0
            stage.append(ResBlock(ch_in if is_first else ch_out,
                                  ch_out,
                                  first_stride if is_first else 1))
        return nn.Sequential(*stage)
        
        

In [213]:
rn = ResNet(10, [2,2,2,2])

In [214]:
rn(torch.randn(64, 3, 128, 128)).shape

torch.Size([64, 10])

In [215]:
rn(torch.randn(64, 3, 8, 8)).shape

torch.Size([64, 10])

So that seems to all work, but how ?!?!

In [216]:
rn

ResNet(
  (0): ConvLayer(
    (0): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
    (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
  )
  (1): ConvLayer(
    (0): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
  )
  (2): ConvLayer(
    (0): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
  )
  (3): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  (4): Sequential(
    (0): ResBlock(
      (convs): Sequential(
        (0): ConvLayer(
          (0): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
          (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stat

Let's start with the stem. It contains one stride-2 conv layer (the other two convs have stride 1) and one maxpool, so the grid size goes 28 -> 14 -> 7:

In [217]:
nn.Sequential(*_resnet_stem(3, 32, 32, 64))(torch.randn(64,3,28,28)).shape

torch.Size([64, 64, 7, 7])

In [218]:
# Push a random batch through `rn` one top-level child at a time, printing each
# child followed by the shape going in and the shape coming out.
inp = torch.randn(64,3,128,128)

for i in rn:
    print(i)
    out = i(inp)
    print(inp.shape, ' --> ', out.shape)
    inp = out  # this child's output becomes the next child's input
    print('-------------------------------------------------------')

ConvLayer(
  (0): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
  (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (2): ReLU()
)
torch.Size([64, 3, 128, 128])  -->  torch.Size([64, 32, 64, 64])
-------------------------------------------------------
ConvLayer(
  (0): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
  (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (2): ReLU()
)
torch.Size([64, 32, 64, 64])  -->  torch.Size([64, 32, 64, 64])
-------------------------------------------------------
ConvLayer(
  (0): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
  (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (2): ReLU()
)
torch.Size([64, 32, 64, 64])  -->  torch.Size([64, 64, 64, 64])
-------------------------------------------------------
MaxPool2d(kernel_size=3, stride=2, padding=

In [231]:
def conv_block(ni, nf, stride):
    """Build a three-layer conv path with a narrowed middle.

    A kernel-size-1 `ConvLayer` reduces `ni` to `nf // 4` channels, a second
    `ConvLayer` keeps `nf // 4` channels and applies `stride`, and a final
    kernel-size-1 `ConvLayer` (created with `act_cls=None`) expands to `nf`.
    """
    mid = nf // 4
    reduce = ConvLayer(ni, mid, 1)
    spatial = ConvLayer(mid, mid, stride=stride)
    expand = ConvLayer(mid, nf, 1, act_cls=None)
    return nn.Sequential(reduce, spatial, expand)

In [232]:
rn1 = ResNet(10, [3, 4, 6, 3], 4)

In [234]:
# Push a random batch through `rn1` one top-level child at a time, printing each
# child followed by the shape going in and the shape coming out.
inp = torch.randn(64,3,128,128)

for i in rn1:
    print(i)
    out = i(inp)
    print(inp.shape, ' --> ', out.shape)
    inp = out  # this child's output becomes the next child's input
    print('-------------------------------------------------------')

ConvLayer(
  (0): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
  (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (2): ReLU()
)
torch.Size([64, 3, 128, 128])  -->  torch.Size([64, 32, 64, 64])
-------------------------------------------------------
ConvLayer(
  (0): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
  (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (2): ReLU()
)
torch.Size([64, 32, 64, 64])  -->  torch.Size([64, 32, 64, 64])
-------------------------------------------------------
ConvLayer(
  (0): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
  (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (2): ReLU()
)
torch.Size([64, 32, 64, 64])  -->  torch.Size([64, 64, 64, 64])
-------------------------------------------------------
MaxPool2d(kernel_size=3, stride=2, padding=

In [245]:
def repeated_identity_matrix(n_rows, n_cols):
    """Return an `(n_rows, n_cols)` tensor made of vertically tiled identity matrices.

    An `n_cols x n_cols` identity is repeated along the row dimension often
    enough to cover `n_rows`, and the result is cut to the first `n_rows` rows.
    For example `(4, 3)` gives the 3x3 identity followed by its first row again.

    Raises:
        ValueError: if `n_cols` is not positive (the repeat count divides by it).
    """
    if n_cols <= 0:
        raise ValueError(f'n_cols must be positive, got {n_cols!r}')
    identity_matrix = torch.eye(n_cols)
    # One extra repetition so the tiled matrix has at least `n_rows` rows
    # before it is trimmed below.
    repeated_matrix = identity_matrix.repeat((n_rows // n_cols) + 1, 1)
    return repeated_matrix[:n_rows, :]

# Demo with more rows than columns: the identity wraps around, so the fourth
# row printed below equals the first one.
n_rows = 4
n_cols = 3
repeated_identity = repeated_identity_matrix(n_rows, n_cols)

print(repeated_identity)

tensor([[1., 0., 0.],
        [0., 1., 0.],
        [0., 0., 1.],
        [1., 0., 0.]])


### Resnets repeated

It all starts with a simple ConvLayer:

In [246]:
ConvLayer(1, 3)

ConvLayer(
  (0): Conv2d(1, 3, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
  (1): BatchNorm2d(3, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  (2): ReLU()
)

Then we create a ResBlock:

In [357]:
class ConvBlock(nn.Sequential):
    """Conv path of a residual block: two `ConvLayer`s in sequence.

    The first maps `n_in -> n_out` with the given `stride`. The second maps
    `n_out -> n_out`, has no activation (`act_cls=None`) and uses
    `NormType.BatchZero`.
    """

    def __init__(self, n_in, n_out, stride):
        strided = ConvLayer(n_in, n_out, stride=stride)
        # NOTE(review): BatchZero presumably zero-initialises the batchnorm
        # weight so the block starts out as the skip path alone -- confirm in fastai docs.
        final = ConvLayer(n_out, n_out, act_cls=None, norm_type=NormType.BatchZero)
        super().__init__(strided, final)

        
class ResBlock(nn.Module):
    """Residual block: `relu(conv_block(x) + id_conv(pool(x)))`.

    Attributes:
        conv_block: `ConvBlock(n_in, n_out, stride)`.
        id_conv: kernel-size-1 `ConvLayer` without activation when the channel
            count changes, otherwise `noop`.
        pool: `nn.AvgPool2d(2, ceil_mode=True)` when `stride` is not 1,
            otherwise `noop`.
    """

    def __init__(self, n_in, n_out, stride):
        super().__init__()
        changes_channels = n_in != n_out
        downsamples = stride != 1
        self.conv_block = ConvBlock(n_in, n_out, stride)
        self.id_conv = ConvLayer(n_in, n_out, 1, act_cls=None) if changes_channels else noop
        self.pool = nn.AvgPool2d(2, ceil_mode=True) if downsamples else noop

    def forward(self, x):
        main = self.conv_block(x)
        # Skip path: downsample first, then project to the new channel count.
        shortcut = self.id_conv(self.pool(x))
        return F.relu(main + shortcut)


In [276]:
cv = ConvBlock(1, 3, 1)
cv

ConvBlock(
  (0): ConvLayer(
    (0): Conv2d(1, 3, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (1): BatchNorm2d(3, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
  )
  (1): ConvLayer(
    (0): Conv2d(3, 3, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (1): BatchNorm2d(3, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  )
)

In [277]:
cv(torch.randn(64,1,28,28)).shape

torch.Size([64, 3, 28, 28])

In [278]:
cv = ConvBlock(1, 3, 2)
cv(torch.randn(64,1,28,28)).shape

torch.Size([64, 3, 14, 14])

In [279]:
rb = ResBlock(1, 3, 2)
rb

ResBlock(
  (conv_block): ConvBlock(
    (0): ConvLayer(
      (0): Conv2d(1, 3, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
      (1): BatchNorm2d(3, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU()
    )
    (1): ConvLayer(
      (0): Conv2d(3, 3, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (1): BatchNorm2d(3, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    )
  )
  (id_conv): ConvLayer(
    (0): Conv2d(1, 3, kernel_size=(1, 1), stride=(1, 1), bias=False)
    (1): BatchNorm2d(3, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
  )
  (pool): AvgPool2d(kernel_size=2, stride=2, padding=0)
)

In [274]:
rb(torch.randn(64,1,28,28)).shape

torch.Size([64, 3, 14, 14])

In [284]:
class ResnetStem(nn.Sequential):
    """Stem: one `ConvLayer` per `[n_in, n_out]` entry of `sizes`, then a max pool.

    Only the first `ConvLayer` uses stride 2; the others use stride 1. The
    trailing layer is `nn.MaxPool2d(3, 2, 1)`.
    """

    def __init__(self, sizes):
        layers = []
        for idx, size in enumerate(sizes):
            conv_stride = 2 if idx == 0 else 1
            layers.append(ConvLayer(size[0], size[1], stride=conv_stride))
        layers.append(nn.MaxPool2d(3, 2, 1))
        super().__init__(*layers)

In [286]:
stem = ResnetStem([[3,32], [32,32], [32,64]])
stem

ResnetStem(
  (0): ConvLayer(
    (0): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
    (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
  )
  (1): ConvLayer(
    (0): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
  )
  (2): ConvLayer(
    (0): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU()
  )
  (3): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
)

In [288]:
stem(torch.randn(64,3,28,28)).shape

torch.Size([64, 64, 7, 7])

In [359]:
class ResNet(nn.Module):
    """ResNet assembled from a `ResnetStem`, stages of `ResBlock`s and a linear head.

    Args:
        n_out: number of outputs of the final `nn.Linear`.
        sizes: one `[n_in, n_out, n_blocks]` entry per stage. The first block
            of a stage maps `n_in -> n_out` channels; the remaining
            `n_blocks - 1` blocks map `n_out -> n_out`.

    Attributes:
        stem: the `ResnetStem`.
        resblock_layers: plain Python list of the per-stage `nn.Sequential`s
            (the same modules that are registered through `layers`).
        layers: `nn.Sequential` of all stages.
        head: `AdaptiveAvgPool2d(1)` -> `Flatten()` -> `Linear(sizes[-1][1], n_out)`.
    """

    def __init__(self, n_out, sizes):
        super().__init__()
        self.stem = ResnetStem([[3,32], [32,32], [32,64]])
        self.resblock_layers = []
        for idx, size in enumerate(sizes):
            grouped_rb = []
            for i in range(size[2]):
                first_in_stage = i == 0
                # Downsample once per stage: only the first block of every stage
                # after the first gets stride 2. Giving stride 2 to every block
                # would halve the grid `n_blocks` times per stage, which the
                # adaptive pooling in the head would silently hide.
                stride = 2 if (first_in_stage and idx != 0) else 1
                grouped_rb.append(ResBlock(size[0] if first_in_stage else size[1],
                                           size[1],
                                           stride=stride))
            grouped_rb = nn.Sequential(*grouped_rb)
            self.resblock_layers.append(grouped_rb)
        self.layers = nn.Sequential(*self.resblock_layers)
        self.head = nn.Sequential(nn.AdaptiveAvgPool2d(1),
                                  Flatten(),
                                  nn.Linear(sizes[-1][1], n_out))

    def forward(self, x):
        return self.head(self.layers(self.stem(x)))
    

In [360]:
rn = ResNet(10, [[64, 64, 2], [64, 128, 2], [128, 256, 2], [256, 512, 2]])

In [362]:
rn(torch.randn(64,3,128,128)).shape

torch.Size([64, 10])

In [354]:
rn

ResNet(
  (stem): ResnetStem(
    (0): ConvLayer(
      (0): Conv2d(3, 32, kernel_size=(3, 3), stride=(2, 2), padding=(1, 1), bias=False)
      (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU()
    )
    (1): ConvLayer(
      (0): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU()
    )
    (2): ConvLayer(
      (0): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
      (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (2): ReLU()
    )
    (3): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
  )
  (layers): Sequential(
    (0): Sequential(
      (0): ResBlock(
        (conv_block): ConvBlock(
          (0): ConvLayer(
            (0): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bia