Importing required libraries

In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import datasets, transforms
from torch.utils.data import DataLoader

In [2]:
# Run configuration: every tunable value lives here so a reader can change it in one place.
SEED = 42  # fixed seed for weight initialisation and batch shuffling
torch.manual_seed(SEED)  # seeds PyTorch's RNGs; some GPU kernels can still be non-deterministic
NUM_CLASSES = 10  # CIFAR-10 has ten target classes
BATCH_SIZE = 64  # samples per mini-batch
EPOCHS = 1  # full passes over the training set
LEARNING_RATE = 0.1  # initial step size handed to the optimizer

In [3]:
# CIFAR-10 preprocessing: convert each image to a tensor, then standardise every RGB channel.
transform = transforms.Compose([
    transforms.ToTensor(),
    # Per-channel (R, G, B) statistics widely used for CIFAR-10 normalisation.
    transforms.Normalize(
        mean=(0.4914, 0.4822, 0.4465),
        std=(0.2023, 0.1994, 0.2010),
    ),
])

In [4]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

Custom DenseNet building blocks

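The core purpose of a Bottleneck layer is to improve computational efficiency in deep networks: a cheap 1×1 convolution first maps the concatenated input to a fixed 4×growth_rate feature maps, so the more computationally expensive 3×3 convolution never has to process more channels than that, no matter how many have accumulated inside the block. (In the first few layers of a block the input may have fewer than 4×growth_rate channels, in which case the 1×1 convolution widens the input rather than reducing it.)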

In [5]:
class _Bottleneck(nn.Module):
  """DenseNet bottleneck layer: BN -> ReLU -> 1x1 Conv -> BN -> ReLU -> 3x3 Conv.

  Args:
    in_channels: number of feature maps entering the layer.
    growth_rate: number of feature maps the layer produces.

  The output has the same height and width as the input (1x1 conv, then 3x3 conv
  with padding=1, both stride 1), which is what lets a dense block concatenate it
  with earlier feature maps along the channel axis.
  """

  def __init__(self, in_channels, growth_rate):
    super().__init__()  # required so nn.Module can register the sub-modules below
    inter_channels = 4 * growth_rate  # width of the intermediate 1x1 projection

    # Stage 1: normalise, activate, project to inter_channels with a 1x1 conv.
    # The convolutions carry no bias because each is preceded/followed by BatchNorm.
    self.norm1 = nn.BatchNorm2d(in_channels)
    self.relu1 = nn.ReLU(inplace=True)  # in-place to avoid allocating another activation tensor
    self.conv1 = nn.Conv2d(in_channels, inter_channels, kernel_size=1, stride=1, bias=False)

    # Stage 2: normalise, activate, 3x3 conv down to growth_rate channels.
    self.norm2 = nn.BatchNorm2d(inter_channels)
    self.relu2 = nn.ReLU(inplace=True)
    self.conv2 = nn.Conv2d(inter_channels, growth_rate, kernel_size=3, stride=1, padding=1, bias=False)

  def forward(self, x):
    """Return the growth_rate new feature maps computed from x.

    Only the new maps are returned; concatenation with the input is done by the caller.
    """
    projected = self.conv1(self.relu1(self.norm1(x)))
    new_maps = self.conv2(self.relu2(self.norm2(projected)))
    return new_maps

A Dense Block is a core component of the Dense Convolutional Network (DenseNet) architecture. Its key idea is dense connectivity, where every layer within the block is connected to every other layer in a feed-forward fashion.

Specifically, for each layer l within a Dense Block, the feature maps of all preceding layers (0,1,…,l−1) are concatenated along the channel dimension and used as input for layer l. The output of layer l is then passed on as input to all subsequent layers (l+1,…).

In [6]:
class _DenseBlock(nn.Module):
  """A stack of bottleneck layers with dense (concatenative) connectivity.

  Args:
    num_layers: how many _Bottleneck layers the block contains.
    in_channels: number of feature maps entering the block.
    growth_rate: number of feature maps each layer adds.

  Layer idx receives the block input plus the outputs of the idx layers before it,
  i.e. in_channels + idx * growth_rate channels.
  """

  def __init__(self, num_layers, in_channels, growth_rate):
    super().__init__()
    # nn.ModuleList (rather than a plain list) so the layers' parameters are registered.
    self.layers = nn.ModuleList([
        _Bottleneck(in_channels + idx * growth_rate, growth_rate)
        for idx in range(num_layers)
    ])

  def forward(self, init_features):
    """Return init_features concatenated with every layer's output along dim 1 (channels)."""
    collected = [init_features]  # block input followed by each layer's new feature maps
    for bottleneck in self.layers:
      # Each layer sees everything produced so far, stacked along the channel axis.
      stacked = torch.cat(collected, 1)
      collected.append(bottleneck(stacked))

    # in_channels + num_layers * growth_rate channels in total.
    return torch.cat(collected, 1)

The transition layers are placed between Dense Blocks to control the model's complexity by reducing the spatial size and reducing the number of feature channels, while still allowing the DenseNet to benefit from its core feature reuse and strong gradient flow properties.

In [7]:
class _Transition(nn.Module):
  """Transition layer: BN -> ReLU -> 1x1 Conv -> 2x2 average pooling.

  Args:
    in_channels: number of feature maps entering the layer.
    out_channels: number of feature maps after the 1x1 convolution.

  The pooling (kernel 2, stride 2) halves the height and width.
  """

  def __init__(self, in_channels, out_channels):
    super().__init__()
    self.norm = nn.BatchNorm2d(in_channels)
    self.relu = nn.ReLU(inplace=True)
    # 1x1 convolution changes the channel count only (no bias: it follows BatchNorm).
    self.conv = nn.Conv2d(in_channels, out_channels, kernel_size=1, stride=1, bias=False)
    # Non-overlapping 2x2 average pooling for spatial downsampling.
    self.pool = nn.AvgPool2d(kernel_size=2, stride=2)

  def forward(self, x):
    """Normalise and activate x, change its channel count, then downsample it."""
    activated = self.relu(self.norm(x))
    compressed = self.conv(activated)
    return self.pool(compressed)

Assembling the DenseNet model

In [8]:
# Simplified DenseNet for small RGB images such as CIFAR-10
class DenseNet_custom(nn.Module):
  """Stem conv -> dense blocks joined by transition layers -> BN/ReLU -> global average pool -> linear classifier.

  Args:
    growth_rate: number of feature maps every bottleneck layer adds.
    block_config: number of bottleneck layers in each dense block. A transition
      layer is inserted after every block except the last one.
    num_init_features: output channels of the initial 3x3 convolution.
    num_classes: number of outputs of the final linear layer.
    compression: fraction of channels each transition layer keeps (theta in the
      DenseNet paper). Must lie in (0, 1]. The default 0.5 reproduces the value
      that used to be hard-coded.

  Raises:
    ValueError: if compression is outside (0, 1].
  """

  def __init__(self, growth_rate=12, block_config=(6,12), num_init_features=24, num_classes=10, compression=0.5):
    super().__init__()
    if not 0 < compression <= 1:
      raise ValueError(f"compression must be in (0, 1], got {compression!r}")

    # Stem: a stride-1 3x3 convolution keeps the full input resolution, which suits
    # small images (no 7x7 / stride-2 stem as used for ImageNet-sized inputs).
    self.features = nn.Sequential(
        nn.Conv2d(3, num_init_features, kernel_size=3, stride=1, padding=1, bias=False),
        nn.BatchNorm2d(num_init_features),
        nn.ReLU(inplace=True)
    )

    # Append dense blocks, tracking the running channel count as we go.
    num_features = num_init_features
    last_block = len(block_config) - 1
    for i, num_layers in enumerate(block_config):
      block = _DenseBlock(num_layers=num_layers, in_channels=num_features, growth_rate=growth_rate)
      self.features.add_module(f'denseblock{i+1}', block)
      num_features += num_layers * growth_rate  # each layer contributes growth_rate channels

      # A transition (channel compression + 2x spatial downsampling) follows every block but the last.
      if i != last_block:
        out_channels = int(num_features * compression)
        trans = _Transition(num_features, out_channels)
        self.features.add_module(f'transition{i+1}', trans)
        num_features = out_channels

    # Final batch norm + ReLU; the global average pooling itself happens in forward().
    self.features.add_module('norm5', nn.BatchNorm2d(num_features))
    self.features.add_module('relu5', nn.ReLU(inplace=True))

    # Linear classifier over the pooled feature vector.
    self.classifier = nn.Linear(num_features, num_classes)

  def forward(self, x):
    """Return class logits of shape (batch, num_classes) for a batch of 3-channel images."""
    features = self.features(x)
    # Global average pooling to 1x1 makes the classifier independent of the spatial size.
    out = nn.functional.adaptive_avg_pool2d(features, (1, 1))
    out = torch.flatten(out, 1)
    out = self.classifier(out)
    return out

In [9]:
# Instantiate the simplified model and move it to the selected device.
# The default block_config=(6, 12) gives two dense blocks with 6 and 12 bottleneck layers.
# Weighted layers: 1 (initial conv) + 6*2 + 12*2 (two convs per bottleneck) + 1 (transition conv) + 1 (classifier) = 39
model = DenseNet_custom(num_classes=NUM_CLASSES).to(device)

In [10]:
# Model summary: print the nested module tree (each layer's type and constructor arguments)
print(model)

DenseNet_custom(
  (features): Sequential(
    (0): Conv2d(3, 24, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (1): BatchNorm2d(24, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
    (denseblock1): _DenseBlock(
      (layers): ModuleList(
        (0): _Bottleneck(
          (norm1): BatchNorm2d(24, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (relu1): ReLU(inplace=True)
          (conv1): Conv2d(24, 48, kernel_size=(1, 1), stride=(1, 1), bias=False)
          (norm2): BatchNorm2d(48, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (relu2): ReLU(inplace=True)
          (conv2): Conv2d(48, 12, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        )
        (1): _Bottleneck(
          (norm1): BatchNorm2d(36, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
          (relu1): ReLU(inplace=True)
          (conv1): Conv2d(36, 48, kernel_si

Data loading & training

In [11]:
# Load the CIFAR-10 train and test splits (downloaded into ./data if needed);
# both splits go through the same preprocessing pipeline.
train_dataset, test_dataset = (
    datasets.CIFAR10(root='./data', train=is_train, download=True, transform=transform)
    for is_train in (True, False)
)

In [12]:
# Mini-batch iterators: the training data is reshuffled every epoch, the test data keeps its order.
train_loader = DataLoader(
    dataset=train_dataset,
    batch_size=BATCH_SIZE,
    shuffle=True,
)
test_loader = DataLoader(
    dataset=test_dataset,
    batch_size=BATCH_SIZE,
    shuffle=False,
)

In [13]:
# Loss function and optimizer
criterion = nn.CrossEntropyLoss()
# SGD with momentum is a common choice when training from scratch.
optimizer = optim.SGD(
    params=model.parameters(),
    lr=LEARNING_RATE,
    momentum=0.9,       # accumulates past gradients for faster, smoother convergence
    weight_decay=1e-4,  # L2 regularisation strength (1e-4) to reduce overfitting
)

In [14]:
#training loop
def training(model, criterion, optimizer, train_loader, epochs, device=None):
  """Train model for the given number of epochs, printing loss and accuracy after each one.

  Args:
    model: network mapping a batch of inputs to per-class logits.
    criterion: loss called as criterion(outputs, labels). The running average below
      assumes it returns the mean loss over the batch (the nn.CrossEntropyLoss default).
    optimizer: optimizer already bound to model's parameters.
    train_loader: iterable yielding (inputs, labels) batches.
    epochs: number of full passes over train_loader.
    device: device the batches are moved to. Defaults to the device of the model's
      first parameter, so inputs always land where the model lives.

  Returns:
    None. One summary line is printed per epoch.

  Raises:
    ValueError: if train_loader yields no samples during an epoch.
  """
  if device is None:
    first_param = next(model.parameters(), None)
    device = first_param.device if first_param is not None else torch.device("cpu")

  for epoch in range(epochs):
    model.train() #sets the model to training mode
    running_loss = 0.0 #sum of per-sample losses seen in this epoch
    correct_preds = 0 #number of correctly classified samples in this epoch
    total_samples = 0 #number of samples actually processed in this epoch

    for inputs, labels in train_loader:
      inputs, labels = inputs.to(device), labels.to(device)
      batch_size = labels.size(0)

      optimizer.zero_grad() #clear gradients left over from the previous batch
      outputs = model(inputs) #raw class scores (logits)
      loss = criterion(outputs, labels)
      loss.backward() #back-propagate: fills .grad on every trainable parameter
      optimizer.step() #apply the parameter update

      # Undo the batch mean so batches of different sizes are weighted correctly.
      running_loss += loss.item() * batch_size
      # detach() (instead of the discouraged .data) keeps the prediction out of the autograd graph.
      predicted = outputs.detach().argmax(dim=1)
      total_samples += batch_size
      correct_preds += (predicted == labels).sum().item()

    if total_samples == 0:
      raise ValueError("train_loader yielded no samples; cannot compute epoch metrics")

    # Normalise by the samples actually seen, which stays correct with drop_last=True,
    # a custom sampler, or a dataset that has no len().
    epoch_loss = running_loss/total_samples
    epoch_acc = correct_preds/total_samples

    print(f"Epoch {epoch+1}/{epochs} | Loss: {epoch_loss:.4f} | Accuracy: {epoch_acc*100:.2f}%")

In [15]:
# Train the model on the CIFAR-10 training set for the configured number of epochs.
training(
    model=model,
    criterion=criterion,
    optimizer=optimizer,
    train_loader=train_loader,
    epochs=EPOCHS,
)

Epoch 1/1 | Loss: 1.5456 | Accuracy: 42.38%
