In [1]:
import torch
from torch import nn
import torch.nn.functional as F
from utils.train_utils import train_classify
from utils.fashion_mnist import load_data_fashion_mnist
from utils.gpu_mem_maneger import GPUMemoryManager

# Run on the GPU when PyTorch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

# Project helper whose clear_cache() / print_memory_stats() methods are called
# later in this notebook.
# NOTE(review): the argument 0 is presumably the CUDA device index, and this is
# constructed even when no GPU is available — confirm against utils.gpu_mem_maneger.
gpu_memory_manager = GPUMemoryManager(0)

In [2]:
# Data-loading settings, named so they can be tuned in one place.
# NOTE(review): the first positional argument of load_data_fashion_mnist is
# presumably the batch size (the GoogLeNet log's 938 iterations per epoch is
# consistent with 60000 / 64) — confirm against utils.fashion_mnist.
BATCH_SIZE = 64
# Passed as `resize`; presumably the side length images are resized to — confirm.
IMAGE_SIZE = 224

# Fashion-MNIST loaders shared by both the NiN and the GoogLeNet training cells.
train_loader, test_loader = load_data_fashion_mnist(BATCH_SIZE, resize=IMAGE_SIZE)

## NiN

在每个像素的通道上分别使用多层感知机。

NiN块以一个普通卷积层开始，后面是两个1 × 1的卷积层。这两个1 × 1卷积层充当带有ReLU激活函数的逐像素全连接层。第一层的卷积窗口形状通常由用户设置。随后的卷积窗口形状固定为1 × 1。

NiN和AlexNet之间的一个显著区别是NiN完全取消了全连接层。相反，NiN使用一个NiN块，其输出通道数等于标签类别的数量。最后放一个全局平均汇聚层（global average pooling layer），生成一个对数几率（logits）。NiN设计的一个优点是，移除全连接层可减少过拟合，同时显著减少NiN的参数。然而，在实践中，这种设计有时会增加训练模型的时间。

In [3]:
def nin_block(in_channels: int, out_channels: int, kernel_size: int,
              strides: int, padding: int) -> nn.Module:
    """Build one NiN block: a spatial convolution followed by two 1x1 convolutions.

    Each of the three convolutions is followed by a ReLU.

    Args:
        in_channels: Number of channels of the input feature map.
        out_channels: Number of channels produced by every convolution in the block.
        kernel_size: Window size of the first (spatial) convolution.
        strides: Stride of the first convolution.
        padding: Zero padding of the first convolution.

    Returns:
        An ``nn.Sequential`` with six layers:
        Conv2d, ReLU, 1x1 Conv2d, ReLU, 1x1 Conv2d, ReLU.
    """
    # The leading convolution is the only layer with a configurable window.
    layers = [
        nn.Conv2d(in_channels, out_channels, kernel_size=kernel_size,
                  stride=strides, padding=padding),
        nn.ReLU(),
    ]

    # Two 1x1 convolutions that keep the channel count unchanged.
    for _ in range(2):
        layers.append(nn.Conv2d(out_channels, out_channels, kernel_size=1))
        layers.append(nn.ReLU())

    return nn.Sequential(*layers)

In [4]:
# NiN for single-channel images and 10 classes: three NiN blocks, each followed
# by 3x3 / stride-2 max pooling, then dropout, a 10-channel NiN block and
# global average pooling in place of fully connected layers.
nin_net = nn.Sequential(
    nin_block(in_channels=1, out_channels=96, kernel_size=11, strides=4, padding=0),
    nn.MaxPool2d(kernel_size=3, stride=2),
    nin_block(in_channels=96, out_channels=256, kernel_size=5, strides=1, padding=2),
    nn.MaxPool2d(kernel_size=3, stride=2),
    nin_block(in_channels=256, out_channels=384, kernel_size=3, strides=1, padding=1),
    nn.MaxPool2d(kernel_size=3, stride=2),
    nn.Dropout(p=0.5),
    # There are 10 label classes, so the last block emits 10 channels.
    nin_block(in_channels=384, out_channels=10, kernel_size=3, strides=1, padding=1),
    # Average each channel over its whole spatial extent -> (batch size, 10, 1, 1).
    nn.AdaptiveAvgPool2d(output_size=(1, 1)),
    # Turn the 4-D output into a 2-D output of shape (batch size, 10).
    nn.Flatten(),
)

In [6]:
# Shape probe: feed one random 1x224x224 image through the network one
# top-level layer at a time and print the tensor shape after each layer.
X = torch.rand(1, 1, 224, 224)
for layer in nin_net:
    X = layer(X)
    layer_name = type(layer).__name__
    print(layer_name, 'output shape:\t', X.shape)

Sequential output shape:	 torch.Size([1, 96, 54, 54])
MaxPool2d output shape:	 torch.Size([1, 96, 26, 26])
Sequential output shape:	 torch.Size([1, 256, 26, 26])
MaxPool2d output shape:	 torch.Size([1, 256, 12, 12])
Sequential output shape:	 torch.Size([1, 384, 12, 12])
MaxPool2d output shape:	 torch.Size([1, 384, 5, 5])
Dropout output shape:	 torch.Size([1, 384, 5, 5])
Sequential output shape:	 torch.Size([1, 10, 5, 5])
AdaptiveAvgPool2d output shape:	 torch.Size([1, 10, 1, 1])
Flatten output shape:	 torch.Size([1, 10])


In [7]:
# Move the model to the target device *before* building the optimizer, so the
# optimizer is constructed over parameters that already live where training
# happens. Module.to() returns the same module, so the rebinding is a no-op
# for the name itself.
nin_net = nin_net.to(device)

# Multi-class classification loss on the 10 logits produced by the network.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(nin_net.parameters(), lr=0.001, weight_decay=0.01)

# Project training helper; the recorded output shows it logging train and test
# loss/accuracy once per epoch.
# NOTE(review): the logged test loss (~0.005) is on a very different scale from
# the train loss (~1.16 in epoch 1) — presumably the two are normalised
# differently inside train_classify; verify before comparing them.
train_classify(nin_net, train_loader, test_loader, optimizer, criterion, num_epochs=10)

Epoch 1: 100%|██████████| 469/469 [00:34<00:00, 13.46it/s, accuracy=58.4, loss=1.16]

Epoch: 1, loss: 1.162602079067149, acc: 58.365





Epoch: 1, test loss: 0.005268312713503837, test acc: 76.27000000000001


Epoch 2: 100%|██████████| 469/469 [00:32<00:00, 14.25it/s, accuracy=78.6, loss=0.587]

Epoch: 2, loss: 0.587463128350691, acc: 78.55666666666667





Epoch: 2, test loss: 0.004188328275084496, test acc: 80.76


Epoch 3: 100%|██████████| 469/469 [00:32<00:00, 14.24it/s, accuracy=81.3, loss=0.501]

Epoch: 3, loss: 0.5014112691508174, acc: 81.285





Epoch: 3, test loss: 0.003985622897744179, test acc: 81.81


Epoch 4: 100%|██████████| 469/469 [00:32<00:00, 14.25it/s, accuracy=83.6, loss=0.442]

Epoch: 4, loss: 0.4416831992328294, acc: 83.57833333333333





Epoch: 4, test loss: 0.0033349765121936797, test acc: 84.2


Epoch 5: 100%|██████████| 469/469 [00:32<00:00, 14.24it/s, accuracy=84.8, loss=0.407]

Epoch: 5, loss: 0.40685071823185187, acc: 84.75166666666667





Epoch: 5, test loss: 0.0031823406457901003, test acc: 84.53


Epoch 6: 100%|██████████| 469/469 [00:32<00:00, 14.25it/s, accuracy=85.8, loss=0.38] 

Epoch: 6, loss: 0.38040326163967025, acc: 85.79333333333334





Epoch: 6, test loss: 0.002981862010061741, test acc: 85.86


Epoch 7: 100%|██████████| 469/469 [00:32<00:00, 14.23it/s, accuracy=86.6, loss=0.362]

Epoch: 7, loss: 0.36157581470668443, acc: 86.56333333333333





Epoch: 7, test loss: 0.003056580322980881, test acc: 85.42


Epoch 8: 100%|██████████| 469/469 [00:32<00:00, 14.24it/s, accuracy=87.1, loss=0.344]

Epoch: 8, loss: 0.34404675694289744, acc: 87.12666666666667





Epoch: 8, test loss: 0.0029074276953935624, test acc: 86.26


Epoch 9: 100%|██████████| 469/469 [00:32<00:00, 14.23it/s, accuracy=87.7, loss=0.327]

Epoch: 9, loss: 0.3271451795152001, acc: 87.70333333333333





Epoch: 9, test loss: 0.0027069897502660753, test acc: 87.07000000000001


Epoch 10: 100%|██████████| 469/469 [00:32<00:00, 14.22it/s, accuracy=88.2, loss=0.316]

Epoch: 10, loss: 0.31554009970317265, acc: 88.2





Epoch: 10, test loss: 0.0024538601294159888, test acc: 88.53


In [13]:
# After NiN training, call the project GPU helper: clear_cache() first, then
# print_memory_stats(), whose recorded output lists total / reserved /
# allocated / free memory.
# NOTE(review): both methods come from utils.gpu_mem_maneger and presumably wrap
# torch.cuda cache and statistics calls — confirm. nin_net is still referenced
# at this point, so its weights are not expected to be released here.
gpu_memory_manager.clear_cache()
gpu_memory_manager.print_memory_stats()

Exception ignored in: <function _MultiProcessingDataLoaderIter.__del__ at 0x7f7762c38180>
Traceback (most recent call last):
  File "/home/jyl/anaconda3/envs/py3.11/lib/python3.11/site-packages/torch/utils/data/dataloader.py", line 1478, in __del__
    self._shutdown_workers()
  File "/home/jyl/anaconda3/envs/py3.11/lib/python3.11/site-packages/torch/utils/data/dataloader.py", line 1442, in _shutdown_workers
    w.join(timeout=_utils.MP_STATUS_CHECK_INTERVAL)
  File "/home/jyl/anaconda3/envs/py3.11/lib/python3.11/multiprocessing/process.py", line 149, in join
    res = self._popen.wait(timeout)
          ^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/jyl/anaconda3/envs/py3.11/lib/python3.11/multiprocessing/popen_fork.py", line 40, in wait
    if not wait([self.sentinel], timeout):
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "/home/jyl/anaconda3/envs/py3.11/lib/python3.11/multiprocessing/connection.py", line 930, in wait
    ready = selector.select(timeout)
            ^^^^^^^^^^^^^^^^^^^

GPU Memory Status:
Total Memory: 8187.5 MB
Reserved Memory: 1158.0 MB
Allocated Memory: 88.103 MB
Free Memory: 1069.897 MB


## GoogLeNet

该网络由Inception块构成。

**每个Inception块包含四条路径**：
* 前三条路径使用窗口大小为1×1、3×3和5×5的卷积层，从不同空间大小中提取信息。中间的两条路径在输入上执行1 × 1卷积，以减少通道数，从而降低模型的复杂性。
* 第四条路径使用3 × 3最大汇聚层，然后使用1 × 1卷积层来改变通道数。
* 这四条路径都使用合适的填充来使输入与输出的高和宽一致，最后我们将每条线路的输出在通道维度上连结，并构成Inception块的输出。

在Inception块中，通常调整的超参数是每层输出通道数。

In [3]:
class Inception(nn.Module):
    """Inception block: four parallel branches concatenated along the channel axis.

    Branches (all keep the input height and width):
      1. 1x1 convolution.
      2. 1x1 convolution followed by a 3x3 convolution (padding 1).
      3. 1x1 convolution followed by a 5x5 convolution (padding 2).
      4. 3x3 max pooling (stride 1, padding 1) followed by a 1x1 convolution.

    Every convolution output passes through a ReLU.

    Args:
        in_channels: Number of channels of the input tensor.
        c1: Output channels of branch 1.
        c2: Indexable pair; ``c2[0]`` is the 1x1 output width and ``c2[1]`` the
            3x3 output width of branch 2.
        c3: Indexable pair; ``c3[0]`` is the 1x1 output width and ``c3[1]`` the
            5x5 output width of branch 3.
        c4: Output channels of branch 4.
        **kwargs: Forwarded unchanged to ``nn.Module.__init__``.

    The output has ``c1 + c2[1] + c3[1] + c4`` channels.
    """

    def __init__(self, in_channels, c1, c2, c3, c4, **kwargs) -> None:
        super().__init__(**kwargs)

        # Branch 1: a single 1x1 convolution.
        self.p1_1 = nn.Conv2d(in_channels, c1, kernel_size=1)

        # Branch 2: 1x1 convolution, then a 3x3 convolution.
        mid_3x3, out_3x3 = c2[0], c2[1]
        self.p2_1 = nn.Conv2d(in_channels, mid_3x3, kernel_size=1)
        self.p2_2 = nn.Conv2d(mid_3x3, out_3x3, kernel_size=3, padding=1)

        # Branch 3: 1x1 convolution, then a 5x5 convolution.
        mid_5x5, out_5x5 = c3[0], c3[1]
        self.p3_1 = nn.Conv2d(in_channels, mid_5x5, kernel_size=1)
        self.p3_2 = nn.Conv2d(mid_5x5, out_5x5, kernel_size=5, padding=2)

        # Branch 4: size-preserving max pooling, then a 1x1 convolution.
        self.p4_1 = nn.MaxPool2d(kernel_size=3, stride=1, padding=1)
        self.p4_2 = nn.Conv2d(in_channels, c4, kernel_size=1)

    def forward(self, x):
        """Run the four branches on ``x`` and concatenate them on dim 1."""
        branch_1x1 = F.relu(self.p1_1(x))

        reduced_3x3 = F.relu(self.p2_1(x))
        branch_3x3 = F.relu(self.p2_2(reduced_3x3))

        reduced_5x5 = F.relu(self.p3_1(x))
        branch_5x5 = F.relu(self.p3_2(reduced_5x5))

        pooled = self.p4_1(x)
        branch_pool = F.relu(self.p4_2(pooled))

        branches = (branch_1x1, branch_3x3, branch_5x5, branch_pool)
        return torch.cat(branches, dim=1)

In [4]:
# Stage 1: 7x7 / stride-2 convolution to 64 channels, ReLU, 3x3 / stride-2 max pooling.
b1 = nn.Sequential(
    nn.Conv2d(in_channels=1, out_channels=64, kernel_size=7, stride=2, padding=3),
    nn.ReLU(),
    nn.MaxPool2d(kernel_size=3, stride=2, padding=1),
)

# Stage 2: 1x1 convolution (64 channels), 3x3 convolution (192 channels), max pooling.
b2 = nn.Sequential(
    nn.Conv2d(in_channels=64, out_channels=64, kernel_size=1),
    nn.ReLU(),
    nn.Conv2d(in_channels=64, out_channels=192, kernel_size=3, padding=1),
    nn.ReLU(),
    nn.MaxPool2d(kernel_size=3, stride=2, padding=1),
)

# Stage 3: two Inception blocks, then max pooling.
# Each block's output width is c1 + c2[1] + c3[1] + c4.
b3 = nn.Sequential(
    Inception(in_channels=192, c1=64, c2=(96, 128), c3=(16, 32), c4=32),    # 64+128+32+32  = 256
    Inception(in_channels=256, c1=128, c2=(128, 192), c3=(32, 96), c4=64),  # 128+192+96+64 = 480
    nn.MaxPool2d(kernel_size=3, stride=2, padding=1),
)

# Stage 4: five Inception blocks, then max pooling.
b4 = nn.Sequential(
    Inception(in_channels=480, c1=192, c2=(96, 208), c3=(16, 48), c4=64),     # 192+208+48+64   = 512
    Inception(in_channels=512, c1=160, c2=(112, 224), c3=(24, 64), c4=64),    # 160+224+64+64   = 512
    Inception(in_channels=512, c1=128, c2=(128, 256), c3=(24, 64), c4=64),    # 128+256+64+64   = 512
    Inception(in_channels=512, c1=112, c2=(144, 288), c3=(32, 64), c4=64),    # 112+288+64+64   = 528
    Inception(in_channels=528, c1=256, c2=(160, 320), c3=(32, 128), c4=128),  # 256+320+128+128 = 832
    nn.MaxPool2d(kernel_size=3, stride=2, padding=1),
)

In [5]:
# Stage 5: two Inception blocks, global average pooling, then flatten to
# a (batch size, 1024) feature matrix.
b5 = nn.Sequential(
    Inception(in_channels=832, c1=256, c2=(160, 320), c3=(32, 128), c4=128),  # 256+320+128+128 = 832
    Inception(in_channels=832, c1=384, c2=(192, 384), c3=(48, 128), c4=128),  # 384+384+128+128 = 1024
    nn.AdaptiveAvgPool2d(output_size=(1, 1)),
    nn.Flatten(),
)

# Full network: the five stages followed by a linear layer mapping the
# 1024 features to 10 class logits.
GoogLeNet = nn.Sequential(
    b1, b2, b3, b4, b5,
    nn.Linear(in_features=1024, out_features=10),
)

In [6]:
# Shape probe: feed one random 1x96x96 image through the network one stage at a
# time and print the tensor shape after each stage.
# NOTE(review): the loader cell requests resize=224, so training presumably
# runs at a different resolution than this 96x96 probe; the adaptive average
# pooling in b5 yields a 1x1 map for either size.
X = torch.rand(1, 1, 96, 96)
for layer in GoogLeNet:
    X = layer(X)
    layer_name = type(layer).__name__
    print(layer_name, 'output shape:\t', X.shape)

Sequential output shape:	 torch.Size([1, 64, 24, 24])
Sequential output shape:	 torch.Size([1, 192, 12, 12])
Sequential output shape:	 torch.Size([1, 480, 6, 6])
Sequential output shape:	 torch.Size([1, 832, 3, 3])
Sequential output shape:	 torch.Size([1, 1024])
Linear output shape:	 torch.Size([1, 10])


In [7]:
# Move the model to the target device *before* building the optimizer, so the
# optimizer is constructed over parameters that already live where training
# happens. Module.to() returns the same module, so the rebinding is a no-op
# for the name itself.
GoogLeNet = GoogLeNet.to(device)

# Multi-class classification loss on the 10 logits produced by the network.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(GoogLeNet.parameters(), lr=0.001, weight_decay=0.01)

# Project training helper; the recorded output shows it logging train and test
# loss/accuracy once per epoch.
train_classify(GoogLeNet, train_loader, test_loader, optimizer, criterion, num_epochs=10)

Epoch 1: 100%|██████████| 938/938 [01:48<00:00,  8.62it/s, accuracy=62, loss=0.991]  

Epoch: 1, loss: 0.9911365643906187, acc: 62.03333333333333





Epoch: 1, test loss: 0.007464704896509648, test acc: 82.23


Epoch 2: 100%|██████████| 938/938 [01:46<00:00,  8.79it/s, accuracy=84.8, loss=0.407]

Epoch: 2, loss: 0.40687744789667474, acc: 84.83333333333333





Epoch: 2, test loss: 0.0060160385414958, test acc: 85.59


Epoch 3: 100%|██████████| 938/938 [01:46<00:00,  8.80it/s, accuracy=87.4, loss=0.336]

Epoch: 3, loss: 0.33558323793510386, acc: 87.43833333333333





Epoch: 3, test loss: 0.005399196973443031, test acc: 87.52


Epoch 4: 100%|██████████| 938/938 [01:46<00:00,  8.80it/s, accuracy=88.9, loss=0.296]

Epoch: 4, loss: 0.2958082041776638, acc: 88.92666666666666





Epoch: 4, test loss: 0.005425486113131046, test acc: 87.16000000000001


Epoch 5: 100%|██████████| 938/938 [01:46<00:00,  8.80it/s, accuracy=90, loss=0.269]  

Epoch: 5, loss: 0.2686186593049752, acc: 89.99





Epoch: 5, test loss: 0.004328479720652104, test acc: 90.2


Epoch 6: 100%|██████████| 938/938 [01:46<00:00,  8.80it/s, accuracy=91, loss=0.243]  

Epoch: 6, loss: 0.24317706288543464, acc: 91.03833333333333





Epoch: 6, test loss: 0.004134831077605486, test acc: 90.59


Epoch 7: 100%|██████████| 938/938 [01:46<00:00,  8.80it/s, accuracy=91.5, loss=0.23] 

Epoch: 7, loss: 0.2295446720546179, acc: 91.45666666666666





Epoch: 7, test loss: 0.003912851490452886, test acc: 90.82000000000001


Epoch 8: 100%|██████████| 938/938 [01:46<00:00,  8.80it/s, accuracy=92, loss=0.212]  

Epoch: 8, loss: 0.21241672967534775, acc: 92.01833333333333





Epoch: 8, test loss: 0.003709228190779686, test acc: 91.24


Epoch 9: 100%|██████████| 938/938 [01:46<00:00,  8.81it/s, accuracy=92.6, loss=0.201]

Epoch: 9, loss: 0.20071496609955836, acc: 92.575





Epoch: 9, test loss: 0.0036470425840467217, test acc: 91.61


Epoch 10: 100%|██████████| 938/938 [01:46<00:00,  8.80it/s, accuracy=92.9, loss=0.189]

Epoch: 10, loss: 0.1886555487270167, acc: 92.93333333333334





Epoch: 10, test loss: 0.004049591913819313, test acc: 91.01
