# 稠密连接网络（DenseNet）

ResNet中的跨层连接设计引申出了数个后续工作。本节我们介绍其中的一个：稠密连接网络（DenseNet）。

<div align=center>
<img width="400" src="imgs/dense.jpg"/>
</div>
<div align=center> ResNet（左）与DenseNet（右）在跨层连接上的主要区别：使用相加和使用连结</div>

上图中将部分前后相邻的运算抽象为模块$A$和模块$B$。与ResNet的主要区别在于，DenseNet里模块$B$的输出不是像ResNet那样和模块$A$的输出相加，而是在通道维上连结。

DenseNet的主要构建模块是
1. 稠密块（dense block）：定义了输入和输出是如何连结的。
2. 过渡层（transition layer）：用来控制通道数，通过$1\times1$卷积压缩通道。

In [1]:
#DenseNet使用了ResNet改良版的“批量归一化、激活和卷积”结构，我们首先在`conv_block`函数里实现这个结构。

import time
import torch
from torch import nn, optim
import torch.nn.functional as F


import dl_utils
# Select the CUDA device when torch reports one is available, otherwise the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

def conv_block(in_channels, out_channels):
    """Build a "batch norm -> ReLU -> 3x3 convolution" unit.

    Args:
        in_channels: Number of channels of the incoming feature map.
        out_channels: Number of channels produced by the convolution.

    Returns:
        An ``nn.Sequential`` holding the three layers in that order. The
        convolution uses ``kernel_size=3`` with ``padding=1``, so height and
        width are unchanged.
    """
    norm = nn.BatchNorm2d(in_channels)
    activation = nn.ReLU()
    conv = nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1)
    return nn.Sequential(norm, activation, conv)

In [2]:
# Dense Block 由多个 conv_block 构成：每个 conv_block 的输出（out_channels 个通道）与其输入在通道维上连结，因此通道数每次增加 out_channels（并非翻倍）

class DenseBlock(nn.Module):
    """A stack of conv blocks whose outputs are concatenated along channels.

    Each conv block receives the block input together with the outputs of all
    earlier conv blocks, and contributes ``out_channels`` new channels.

    Args:
        num_convs: Number of conv blocks in the stack.
        in_channels: Channel count of the tensor fed to the dense block.
        out_channels: Channels added by every conv block.

    Attributes:
        net: ``nn.ModuleList`` of the conv blocks, in application order.
        out_channels: Channel count of the dense block's output, i.e.
            ``in_channels + num_convs * out_channels``.
    """

    def __init__(self, num_convs, in_channels, out_channels):
        super().__init__()

        # The k-th conv block sees the original input plus k earlier outputs,
        # so its input width grows linearly (it does not double).
        self.net = nn.ModuleList([
            conv_block(in_channels + k * out_channels, out_channels)
            for k in range(num_convs)
        ])

        # Total channels after all concatenations.
        self.out_channels = in_channels + num_convs * out_channels

    def forward(self, X):
        """Run every conv block, concatenating its output onto its input.

        Args:
            X: Input tensor; concatenation happens on ``dim=1``.

        Returns:
            The input with every conv block's output appended on ``dim=1``.
        """
        features = X
        for layer in self.net:
            # Append the new feature maps so the next layer sees all of them.
            features = torch.cat((features, layer(features)), dim=1)
        return features


<div align=center>
<img width="600" src="imgs/densenet1.png"/>
</div>
<div align=center>DenseBlock 示意图</div>


<div align=center>
<img width="600" src="imgs/densenet2.png"/>
</div>
<div align=center>DenseNet</div>

In [3]:
# Demo: a dense block with 4 conv blocks, 16 input channels and 32 channels
# added per conv block.
net = DenseBlock(4,16,32)
print(net)
x = torch.randn(1,16,32,32)
x  = net(x)
# Output channels are 16 + 4 * 32 = 144.
print(x.shape)

DenseBlock(
  (net): ModuleList(
    (0): Sequential(
      (0): BatchNorm2d(16, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (1): ReLU()
      (2): Conv2d(16, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    )
    (1): Sequential(
      (0): BatchNorm2d(48, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (1): ReLU()
      (2): Conv2d(48, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    )
    (2): Sequential(
      (0): BatchNorm2d(80, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (1): ReLU()
      (2): Conv2d(80, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    )
    (3): Sequential(
      (0): BatchNorm2d(112, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      (1): ReLU()
      (2): Conv2d(112, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    )
  )
)
torch.Size([1, 144, 32, 32])


## 过渡层

由于每个稠密块都会带来通道数的增加，使用过多则会带来过于复杂的模型。过渡层用来控制模型复杂度。它通过$1\times1$卷积层来减小通道数，并使用步幅为2的平均池化层减半高和宽，从而进一步降低模型复杂度。

In [4]:
def transition_block(in_channels, out_channels):
    """Build a transition layer that shrinks channels and spatial size.

    Args:
        in_channels: Number of channels of the incoming feature map.
        out_channels: Number of channels after the 1x1 convolution.

    Returns:
        An ``nn.Sequential`` of batch norm, ReLU, a 1x1 convolution that maps
        ``in_channels`` to ``out_channels``, and a 2x2 average pool with
        stride 2 that halves height and width.
    """
    layers = [
        nn.BatchNorm2d(in_channels),
        nn.ReLU(),
        nn.Conv2d(in_channels, out_channels, kernel_size=1),
        nn.AvgPool2d(kernel_size=2, stride=2),
    ]
    return nn.Sequential(*layers)


In [5]:
# Dense block with 2 conv blocks, 3 input channels, 10 channels per conv block:
# output channels are 3 + 2 * 10 = 23.
blk = DenseBlock(2, 3, 10)
X = torch.rand(4, 3, 8, 8)
Y = blk(X)
print(Y.shape) # torch.Size([4, 23, 8, 8])

# Apply a transition layer with 10 output channels to the dense block output
# from the previous example: channels drop to 10, height and width are halved.
blk = transition_block(23, 10)
blk(Y).shape # torch.Size([4, 10, 4, 4])

torch.Size([4, 23, 8, 8])


torch.Size([4, 10, 4, 4])

## DenseNet模型

In [7]:
# Stem: 7x7 stride-2 convolution on a 1-channel input, batch norm, ReLU, then
# a 3x3 stride-2 max pool.
net = nn.Sequential(
        nn.Conv2d(1, 64, kernel_size=7, stride=2, padding=3),
        nn.BatchNorm2d(64), 
        nn.ReLU(),
        nn.MaxPool2d(kernel_size=3, stride=2, padding=1))

num_channels, growth_rate = 64, 32  # num_channels tracks the current channel count
# One entry per dense block: how many conv blocks it contains.
num_convs_in_dense_blocks = [4, 4, 4, 4]

for i, num_convs in enumerate(num_convs_in_dense_blocks):
    
    # growth_rate is passed as the per-conv-block channel increment.
    DB = DenseBlock(num_convs, num_channels, growth_rate)
    
    # NOTE(review): "DenseBlosk" looks like a typo for "DenseBlock"; left as-is
    # because this string is the registered child-module name.
    net.add_module("DenseBlosk_%d" % i, DB)
    
    # Output channel count of the dense block that was just added
    num_channels = DB.out_channels
    
    # Between dense blocks, insert a transition layer that halves the channel
    # count; it is skipped after the last dense block.
    if i != len(num_convs_in_dense_blocks) - 1:
        net.add_module("transition_block_%d" % i, transition_block(num_channels, num_channels // 2))
        num_channels = num_channels // 2
        

In [8]:
# Head appended to net: batch norm, ReLU, global pooling, then a flatten
# followed by a linear layer with 10 outputs.
net.add_module("BN", nn.BatchNorm2d(num_channels))
net.add_module("relu", nn.ReLU())
net.add_module("global_avg_pool", dl_utils.GlobalAvgPool2d()) # GlobalAvgPool2d output: (Batch, num_channels, 1, 1)
net.add_module("fc", nn.Sequential(dl_utils.FlattenLayer(), nn.Linear(num_channels, 10))) 

In [9]:
# Sample 4-D tensor; shape[2:] selects the last two dimensions (8, 8), which
# is the value later used as a pooling kernel size.
x = torch.randn(16,3,8,8)
x.shape[2:]

torch.Size([8, 8])

In [10]:
class GlobalAvgPool2d(nn.Module):
    """Global average pooling over the two trailing (spatial) dimensions.

    The pooling window is set to the input's full height and width, so each
    channel of each sample is reduced to a single value: its mean.
    """

    def __init__(self):
        super(GlobalAvgPool2d, self).__init__()

    def forward(self, x):
        """Average every feature map down to a 1x1 map.

        Args:
            x: 4-D tensor; ``x.shape[2:]`` is used as the pooling window.

        Returns:
            Tensor with the same first two dimensions as ``x`` and a 1x1
            spatial size, holding the per-channel mean.
        """
        # Fix: this previously called max_pool2d, which returned the per-channel
        # maximum and contradicted the class name; average pooling is intended.
        return nn.functional.avg_pool2d(x, kernel_size=x.shape[2:])
    

In [11]:
# Apply the locally defined GlobalAvgPool2d to x: the pooling window spans the
# full spatial extent, so the two trailing dimensions collapse to 1x1.
print(x.shape)
pool = GlobalAvgPool2d()
pool(x).shape

torch.Size([16, 3, 8, 8])


torch.Size([16, 3, 1, 1])

In [12]:
# Push one 1-channel 96x96 sample through net one child module at a time and
# print the output shape after each child.
X = torch.rand((1, 1, 96, 96))
for name, layer in net.named_children():
    X = layer(X)
    print(name, ' output shape:\t', X.shape)

0  output shape:	 torch.Size([1, 64, 48, 48])
1  output shape:	 torch.Size([1, 64, 48, 48])
2  output shape:	 torch.Size([1, 64, 48, 48])
3  output shape:	 torch.Size([1, 64, 24, 24])
DenseBlosk_0  output shape:	 torch.Size([1, 192, 24, 24])
transition_block_0  output shape:	 torch.Size([1, 96, 12, 12])
DenseBlosk_1  output shape:	 torch.Size([1, 224, 12, 12])
transition_block_1  output shape:	 torch.Size([1, 112, 6, 6])
DenseBlosk_2  output shape:	 torch.Size([1, 240, 6, 6])
transition_block_2  output shape:	 torch.Size([1, 120, 3, 3])
DenseBlosk_3  output shape:	 torch.Size([1, 248, 3, 3])
BN  output shape:	 torch.Size([1, 248, 3, 3])
relu  output shape:	 torch.Size([1, 248, 3, 3])
global_avg_pool  output shape:	 torch.Size([1, 248, 1, 1])
fc  output shape:	 torch.Size([1, 10])


In [14]:
batch_size = 256
# If an "out of memory" error occurs, reduce batch_size or resize
# NOTE(review): presumably loads Fashion-MNIST resized to 96x96 - confirm in dl_utils.
train_iter, test_iter = dl_utils.load_data_fashion_mnist(batch_size, resize=96)

lr, num_epochs = 0.001, 5
# Adam over all parameters of net with learning rate lr.
optimizer = torch.optim.Adam(net.parameters(), lr=lr)
# NOTE(review): the training loop lives in dl_utils.train_ch5; its behavior is not visible here.
dl_utils.train_ch5(net, train_iter, test_iter, batch_size, optimizer, device, num_epochs)

training on  cuda
epoch 1, loss 0.4050, train acc 0.856, test acc 0.853, time 42.6 sec
epoch 2, loss 0.1243, train acc 0.909, test acc 0.903, time 42.7 sec
epoch 3, loss 0.0709, train acc 0.922, test acc 0.911, time 41.9 sec
epoch 4, loss 0.0486, train acc 0.929, test acc 0.909, time 41.5 sec
epoch 5, loss 0.0348, train acc 0.937, test acc 0.922, time 41.2 sec
