In [4]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
import torchvision.transforms as transforms
import torchvision.models as models

ModuleNotFoundError: No module named 'torchvision'

In [2]:
class DenseBlock(nn.Module):
    """Stack of densely connected BN -> ReLU -> 3x3 conv layers.

    Layer ``i`` (0-based) takes ``in_channels + i * growth_rate`` channels,
    i.e. the block input concatenated with the outputs of every earlier
    layer, and contributes ``growth_rate`` new channels.

    Args:
        in_channels: Channel count of the tensor entering the block.
        growth_rate: Channels produced by each layer.
        num_layers: Number of layers in the block.
    """

    def __init__(self, in_channels, growth_rate, num_layers):
        super().__init__()
        # Each layer's input width grows by `growth_rate` per preceding layer.
        self.layers = nn.ModuleList(
            [
                self._make_conv_block(in_channels + idx * growth_rate, growth_rate)
                for idx in range(num_layers)
            ]
        )

    def _make_conv_block(self, in_channels, out_channels):
        """Return one BatchNorm -> ReLU -> bias-free 3x3 conv (padding 1) unit."""
        norm = nn.BatchNorm2d(in_channels)
        activation = nn.ReLU(inplace=True)
        conv = nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1, bias=False)
        return nn.Sequential(norm, activation, conv)

    def forward(self, x):
        """Run all layers and return the input concatenated with every layer output.

        Concatenation happens along dim 1, so the result has
        ``in_channels + num_layers * growth_rate`` channels.
        """
        collected = [x]
        for conv_block in self.layers:
            stacked = torch.cat(collected, 1)
            collected.append(conv_block(stacked))
        return torch.cat(collected, 1)


class Transition(nn.Module):
    """Bridge between dense blocks: a 1x1 convolution, then 2x2 average pooling.

    No normalisation or activation is applied inside this module.

    Args:
        in_channels: Channel count of the incoming feature map.
        out_channels: Channel count produced by the bias-free 1x1 convolution.
    """

    def __init__(self, in_channels, out_channels):
        super().__init__()
        self.conv = nn.Conv2d(in_channels, out_channels, kernel_size=1, bias=False)
        self.pool = nn.AvgPool2d(kernel_size=2, stride=2)

    def forward(self, x):
        """Project the channels with the 1x1 conv, then pool with window 2, stride 2."""
        return self.pool(self.conv(x))


class DenseNet(nn.Module):
    """DenseNet-style classifier for 3-channel images.

    The network is a stem (7x7 stride-2 conv, BatchNorm, ReLU, 3x3 stride-2
    max-pool) followed by one ``DenseBlock`` per entry of ``block_config``.
    Every dense block except the last is followed by a ``Transition`` that
    halves the channel count (integer division). A final BatchNorm + ReLU,
    global average pooling and a linear layer produce the output.

    Args:
        num_classes: Size of the output of the final linear layer.
        growth_rate: Channels added by each layer inside a dense block.
        block_config: Number of layers in each dense block, in order.
    """

    def __init__(self, num_classes, growth_rate=12, block_config=(6, 12, 24, 16)):
        super().__init__()
        # Stem: the conv emits 64 channels, which is the width entering block 1.
        self.features = nn.Sequential(
            nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3, bias=False),
            nn.BatchNorm2d(64),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(kernel_size=3, stride=2, padding=1),
        )
        in_channels = 64
        for i, num_layers in enumerate(block_config):
            block = DenseBlock(in_channels, growth_rate, num_layers)
            self.features.add_module(f"denseblock{i + 1}", block)
            # A dense block outputs its input plus growth_rate channels per layer.
            in_channels += num_layers * growth_rate
            if i != len(block_config) - 1:
                # Halve the channels between blocks; no transition after the last one.
                trans = Transition(in_channels, in_channels // 2)
                self.features.add_module(f"transition{i + 1}", trans)
                in_channels = in_channels // 2
        self.features.add_module("norm5", nn.BatchNorm2d(in_channels))
        self.features.add_module("relu", nn.ReLU(inplace=True))
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.classifier = nn.Linear(in_channels, num_classes)

    def forward(self, x):
        """Return the classifier output, one row of ``num_classes`` scores per sample."""
        # `self.features` already ends with the "relu" module, so applying
        # F.relu again here would be a redundant second pass over the tensor.
        out = self.features(x)
        out = self.avgpool(out)
        out = torch.flatten(out, 1)
        return self.classifier(out)

In [3]:
# Smoke check: build the network for a 10-class problem with the default
# growth rate and block layout, then print its module hierarchy.
model = DenseNet(10)
print(model)

DenseNet(
  (features): Sequential(
    (0): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
    (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
    (3): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
    (denseblock1): DenseBlock(
      (layers): ModuleList(
        (0): Sequential(
          (0): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (1): ReLU(inplace=True)
          (2): Conv2d(64, 12, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        )
        (1): Sequential(
          (0): BatchNorm2d(76, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (1): ReLU(inplace=True)
          (2): Conv2d(76, 12, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        )
        (2): Sequential(
          (0): BatchNorm2d(88, eps=1e-05, momentum=0.1, affine=True, track_runn

1. **DenseBlock**:
   - This is the fundamental building block of the DenseNet architecture.
   - It consists of a series of densely connected convolutional layers.
   - Each convolutional layer produces k feature maps, where k is the growth rate.
   - The input to each convolutional layer is the concatenation of the feature maps from all preceding layers within the same block.

2. **Transition Layer**:
   - Transition layers are used to reduce the number of feature maps between dense blocks.
   - They consist of a 1x1 convolutional layer followed by average pooling.
   - The 1x1 convolutional layer reduces the number of feature maps, while average pooling reduces the spatial dimensions of the feature maps.

3. **DenseNet**:
   - The main architecture consists of multiple DenseBlocks interleaved with Transition layers.
   - The first layer of the DenseNet is a standard convolutional layer with 7x7 kernel size and 64 output channels, followed by batch normalization and ReLU activation.
   - After the initial convolutional layer, there is a max-pooling layer to downsample the spatial dimensions of the feature maps.
   - Subsequent layers consist of a sequence of DenseBlocks and Transition layers.
   - Each DenseBlock is followed by a Transition layer (except for the last DenseBlock).
   - After all dense blocks, there's a final batch normalization layer and ReLU activation.
   - The output of the final ReLU activation is passed through an adaptive average pooling layer to reduce the spatial dimensions to 1x1.
   - Finally, the output is flattened and fed into a fully connected layer (linear layer) to produce the final output logits.

4. **Batch Normalization**:
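   - Batch normalization is applied before every convolution inside the dense blocks, directly after the initial 7x7 convolution, and once more after the last dense block (before pooling and the classifier); the 1x1 convolution in the transition layers is not preceded by batch normalization in this implementation.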
   - It normalizes the activations of the previous layer, helping to stabilize and accelerate the training process.

5. **ReLU Activation**:
   - Rectified Linear Unit (ReLU) activation function is applied after each batch normalization layer, introducing non-linearity to the network.

6. **Max Pooling**:
   - Max pooling is applied after the initial convolutional layer to downsample the spatial dimensions of the feature maps.

7. **Adaptive Average Pooling**:
   - Adaptive average pooling is used to reduce the spatial dimensions of the feature maps to a fixed size (1x1) regardless of the input size.
   - This ensures that the network can accept inputs of varying sizes.

8. **Linear Layer (Classifier)**:
   - The output of the adaptive average pooling layer is flattened and fed into a linear layer.
   - This linear layer acts as a classifier, mapping the feature representation to the final output logits, i.e. unnormalized class scores; applying a softmax to the logits yields the predicted class probabilities.