In [147]:
# import

import numpy as np
from datetime import datetime
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torchvision import transforms
from torchvision import datasets
import matplotlib.pyplot as plt
import torchinfo

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"
from pytz import timezone

In [148]:
# Training hyperparameters
RANDOM_SEED = 4242
LEARNING_RATE = 1e-2
BATCH_SIZE = 32
EPOCHS = 90
IMG_SIZE = 227        # height and width of the input passed to torchinfo.summary
NUM_CLASSES = 1000    # output width of the final fully connected layer

# Regularisation parameters
DROPOUT = 0.5         # argument to nn.Dropout
LRN_K = 2.0           # nn.LocalResponseNorm: k
LRN_ALPHA = 1e-4      # nn.LocalResponseNorm: alpha
LRN_BETA = 0.75       # nn.LocalResponseNorm: beta
LRN_N = 5             # nn.LocalResponseNorm: size

In [None]:
# Modernised reinterpretation of AlexNet as presented in the book
# (no multi-GPU split; borrows batch normalisation).
class ModernAlexNet(nn.Module):
    """Single-branch AlexNet variant with batch normalisation.

    Five convolution stages, each followed by ReLU and ``BatchNorm2d``, with a
    3x3 / stride-2 max pool in stages 1, 2 and 5, feed three fully connected
    layers. Dropout with probability ``DROPOUT`` follows FC6 and FC7, and FC8
    produces ``NUM_CLASSES`` logits.

    FC6 takes ``6 * 6 * 256`` features, which is what the convolution stack
    yields for a 227x227 input; other spatial sizes raise a shape error in FC6.
    """

    def __init__(self):
        super(ModernAlexNet, self).__init__()
        # Parameter-free layers shared by every stage that needs them.
        self.pool = nn.MaxPool2d(kernel_size=3, stride=2)
        # forward() applies dropout after FC6 and FC7, so the layer has to exist.
        self.dropout = nn.Dropout(DROPOUT)

        self.conv1 = nn.Conv2d(in_channels=3, out_channels=96, kernel_size=11, stride=4, padding=0)
        self.batchnorm1 = nn.BatchNorm2d(96)
        self.conv2 = nn.Conv2d(in_channels=96, out_channels=256, kernel_size=5, stride=1, padding=2)
        self.batchnorm2 = nn.BatchNorm2d(256)
        self.conv3 = nn.Conv2d(in_channels=256, out_channels=384, kernel_size=3, stride=1, padding=1)
        self.batchnorm3 = nn.BatchNorm2d(384)
        self.conv4 = nn.Conv2d(in_channels=384, out_channels=384, kernel_size=3, stride=1, padding=1)
        self.batchnorm4 = nn.BatchNorm2d(384)
        self.conv5 = nn.Conv2d(in_channels=384, out_channels=256, kernel_size=3, stride=1, padding=1)
        self.batchnorm5 = nn.BatchNorm2d(256)
        self.fc6 = nn.Linear(6 * 6 * 256, 4096)
        self.fc7 = nn.Linear(4096, 4096)
        self.fc8 = nn.Linear(4096, NUM_CLASSES)

    def forward(self, x):
        """Return the class logits for a batch of images.

        Args:
            x: image batch laid out as (batch, 3, height, width).

        Returns:
            Tensor of shape (batch, out_features of ``fc8``).
        """
        # Conv 1
        x = F.relu(self.conv1(x))
        x = self.pool(x)
        x = self.batchnorm1(x)
        # Conv 2
        x = F.relu(self.conv2(x))
        x = self.pool(x)
        x = self.batchnorm2(x)
        # Conv 3
        x = F.relu(self.conv3(x))
        x = self.batchnorm3(x)
        # Conv 4
        x = F.relu(self.conv4(x))
        x = self.batchnorm4(x)
        # Conv 5
        # NOTE(review): this stage normalises before pooling, whereas stages 1-2
        # pool first - confirm the ordering is intentional.
        x = F.relu(self.conv5(x))
        x = self.batchnorm5(x)
        x = self.pool(x)
        # FC 6
        # Flatten everything except the batch dimension. A fixed
        # view(-1, 6 * 6 * 256) would silently fold a mismatched spatial size
        # into the batch dimension instead of failing.
        x = torch.flatten(x, start_dim=1)
        x = F.relu(self.fc6(x))
        x = self.dropout(x)
        # FC 7
        x = F.relu(self.fc7(x))
        x = self.dropout(x)
        # FC 8
        logits = self.fc8(x)
        return logits

# Show a per-layer summary (input/output shapes, parameter counts, kernel
# sizes) of a freshly built ModernAlexNet for one 3 x IMG_SIZE x IMG_SIZE input.
torchinfo.summary(
    ModernAlexNet(),
    input_size=(1, 3, IMG_SIZE, IMG_SIZE),
    col_names=["input_size", "output_size", "num_params", "kernel_size"],
    row_settings=["depth", "var_names"],
)

In [149]:
# Implemented as close to the AlexNet paper as possible (the multi-GPU split
# is only mimicked with paired "_a"/"_b" layers, not actually distributed).
class PseudoAlexNet(nn.Module):
    """Two-branch AlexNet that mirrors a dual-GPU layout on a single device.

    Every stage has an ``_a`` and a ``_b`` half. Conv 2, 4 and 5 read only
    their own branch; Conv 3, FC 6, FC 7 and FC 8 read the channel-wise
    concatenation of both branches. Local response normalisation follows the
    ReLU of Conv 1 and Conv 2, and dropout follows FC 6 and FC 7.
    """

    def __init__(self):
        super().__init__()
        # Parameter-free layers shared by both branches.
        # NOTE(review): PyTorch's LocalResponseNorm divides alpha by `size`;
        # confirm whether LRN_ALPHA should be scaled by LRN_N to match the paper.
        self.lrn = nn.LocalResponseNorm(size=LRN_N, alpha=LRN_ALPHA, beta=LRN_BETA, k=LRN_K)
        self.dropout = nn.Dropout(p=DROPOUT)

        # Conv 1: both halves look at the full 3-channel image.
        self.conv1_a = nn.Conv2d(in_channels=3, out_channels=48, kernel_size=11, stride=4)
        self.conv1_b = nn.Conv2d(in_channels=3, out_channels=48, kernel_size=11, stride=4)
        self.pool1 = nn.MaxPool2d(kernel_size=3, stride=2)

        # Conv 2: each half reads only its own 48 maps.
        self.conv2_a = nn.Conv2d(in_channels=48, out_channels=128, kernel_size=5, padding=2)
        self.conv2_b = nn.Conv2d(in_channels=48, out_channels=128, kernel_size=5, padding=2)
        self.pool2 = nn.MaxPool2d(kernel_size=3, stride=2)

        # Conv 3: each half reads the maps of both halves (2 * 128).
        self.conv3_a = nn.Conv2d(in_channels=2 * 128, out_channels=192, kernel_size=3, padding=1)
        self.conv3_b = nn.Conv2d(in_channels=2 * 128, out_channels=192, kernel_size=3, padding=1)

        # Conv 4
        self.conv4_a = nn.Conv2d(in_channels=192, out_channels=192, kernel_size=3, padding=1)
        self.conv4_b = nn.Conv2d(in_channels=192, out_channels=192, kernel_size=3, padding=1)

        # Conv 5
        self.conv5_a = nn.Conv2d(in_channels=192, out_channels=128, kernel_size=3, padding=1)
        self.conv5_b = nn.Conv2d(in_channels=192, out_channels=128, kernel_size=3, padding=1)
        self.pool5 = nn.MaxPool2d(kernel_size=3, stride=2)

        # FC 6: input is both halves' 128 x 6 x 6 maps, flattened.
        self.fc6_a = nn.Linear(in_features=2 * 128 * 6 * 6, out_features=2048)
        self.fc6_b = nn.Linear(in_features=2 * 128 * 6 * 6, out_features=2048)

        # FC 7: input is both halves' FC 6 outputs.
        self.fc7_a = nn.Linear(in_features=2 * 2048, out_features=2048)
        self.fc7_b = nn.Linear(in_features=2 * 2048, out_features=2048)

        # FC 8: single classifier over both halves.
        self.fc8 = nn.Linear(in_features=2 * 2048, out_features=NUM_CLASSES)

    def forward(self, x):
        """Map an image batch (batch, 3, H, W) to the logits produced by ``fc8``."""
        # Conv 1: conv -> ReLU -> LRN per branch, then max pool.
        top = self.lrn(F.relu(self.conv1_a(x)))
        bottom = self.lrn(F.relu(self.conv1_b(x)))
        top, bottom = self.pool1(top), self.pool1(bottom)
        # Conv 2
        top = self.lrn(F.relu(self.conv2_a(top)))
        bottom = self.lrn(F.relu(self.conv2_b(bottom)))
        top, bottom = self.pool2(top), self.pool2(bottom)
        # Conv 3: merge the branches, then let each half process the merged maps.
        both = torch.cat((top, bottom), dim=1)
        top = F.relu(self.conv3_a(both))
        bottom = F.relu(self.conv3_b(both))
        # Conv 4
        top = F.relu(self.conv4_a(top))
        bottom = F.relu(self.conv4_b(bottom))
        # Conv 5
        top = F.relu(self.conv5_a(top))
        bottom = F.relu(self.conv5_b(bottom))
        top, bottom = self.pool5(top), self.pool5(bottom)
        # FC 6: merge the branches, flatten, then process per half.
        both = torch.cat((top, bottom), dim=1)
        both = both.view(both.size(0), -1)
        top = self.dropout(F.relu(self.fc6_a(both)))
        bottom = self.dropout(F.relu(self.fc6_b(both)))
        # FC 7: merge the branches, then process per half.
        both = torch.cat((top, bottom), dim=1)
        top = self.dropout(F.relu(self.fc7_a(both)))
        bottom = self.dropout(F.relu(self.fc7_b(both)))
        # FC 8: merge the branches for the final classifier.
        both = torch.cat((top, bottom), dim=1)
        return self.fc8(both)


# Show a per-layer summary (input/output shapes, parameter counts, kernel
# sizes) of a freshly built PseudoAlexNet for one 3 x IMG_SIZE x IMG_SIZE input.
torchinfo.summary(
    PseudoAlexNet(),
    input_size=(1, 3, IMG_SIZE, IMG_SIZE),
    col_names=["input_size", "output_size", "num_params", "kernel_size"],
    row_settings=["depth", "var_names"],
)

Layer (type (var_name):depth-idx)        Input Shape               Output Shape              Param #                   Kernel Shape
AlexNet (AlexNet)                        [1, 3, 227, 227]          [1, 1000]                 --                        --
├─Conv2d (conv1_a): 1-1                  [1, 3, 227, 227]          [1, 48, 55, 55]           17,472                    [11, 11]
├─LocalResponseNorm (lrn): 1-2           [1, 48, 55, 55]           [1, 48, 55, 55]           --                        --
├─Conv2d (conv1_b): 1-3                  [1, 3, 227, 227]          [1, 48, 55, 55]           17,472                    [11, 11]
├─LocalResponseNorm (lrn): 1-4           [1, 48, 55, 55]           [1, 48, 55, 55]           --                        --
├─MaxPool2d (pool1): 1-5                 [1, 48, 55, 55]           [1, 48, 27, 27]           --                        3
├─MaxPool2d (pool1): 1-6                 [1, 48, 55, 55]           [1, 48, 27, 27]           --                        3
├─Co

In [150]:
# # Practice: splitting AlexNet across two GPUs
# class AlexNet(nn.Module):
#     def __init__(self):
#         super(AlexNet, self).__init__()
        
#         self.conv1_a = nn.Conv2d(3, 48, kernel_size=11, stride=4, padding=0).to('cuda:0')
#         self.conv1_b = nn.Conv2d(3, 48, kernel_size=11, stride=4, padding=0).to('cuda:1')
#         self.pool1_a = nn.MaxPool2d(kernel_size=4).to('cuda:0')
#         self.pool1_b = nn.MaxPool2d(kernel_size=4).to('cuda:1')
        
#         self.fc2 = nn.Linear(16224, NUM_CLASSES).to('cuda:0')

#     def forward(self, x):

#         x_a = x.to('cuda:0')
#         x_b = x.to('cuda:1')
#         x_a = F.relu(self.conv1_a(x_a))
#         x_b = F.relu(self.conv1_b(x_b))
#         x_a = self.pool1_a(x_a)
#         x_b = self.pool1_b(x_b)
        
#         print("2nd ", x_a.device)
#         print("2nd ", x_b.device)
        
#         x_b = x_b.to('cuda:0')

#         x = torch.cat((x_a, x_b), dim=1)
#         x = x.view(x.size(0), -1)
#         logits = self.fc2(x)

#         return logits

In [151]:
!nvidia-smi

import os
os.environ["CUDA_DEVICE_ORDER"]="PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"]= "0,1"

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

print('Device:', device)
print('Current cuda device:', torch.cuda.current_device())
print('Count of using GPUs:', torch.cuda.device_count())

Thu Jul  4 05:44:32 2024       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.90.07              Driver Version: 550.90.07      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   73C    P0             30W /   70W |    3009MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
|   1  Tesla T4                       Off |   00

In [None]:
# AlexNet that really uses two GPUs when two are available
# (model-parallel, multi-GPU implementation).
class AlexNet(nn.Module):
    """Two-branch AlexNet whose ``_a`` and ``_b`` halves live on separate devices.

    Conv 1, 2, 4 and 5 run independently on each device. Before Conv 3, FC 6
    and FC 7 the branches exchange activations, so each device sees the
    channel-wise concatenation ``[a, b]``. FC 8 runs on the first device.

    No explicit CUDA streams are used. Kernels are queued asynchronously on
    each device's current stream, so the two branches can still overlap, and
    PyTorch orders cross-device ``.to()`` copies against those current streams.
    Consuming such copies inside separately created side streams would need
    manual ``wait_stream`` / ``record_stream`` bookkeeping to be race-free.

    Args:
        devices: pair ``(device_a, device_b)`` of anything ``torch.device``
            accepts. Defaults to ``('cuda:0', 'cuda:1')``.
    """

    def __init__(self, devices=('cuda:0', 'cuda:1')):
        super(AlexNet, self).__init__()
        device_a, device_b = devices
        self.device_a = torch.device(device_a)
        self.device_b = torch.device(device_b)

        # Parameter-free layers: nothing to place on a device.
        self.lrn_a = nn.LocalResponseNorm(LRN_N, alpha=LRN_ALPHA, beta=LRN_BETA, k=LRN_K)
        self.lrn_b = nn.LocalResponseNorm(LRN_N, alpha=LRN_ALPHA, beta=LRN_BETA, k=LRN_K)
        self.dropout_a = nn.Dropout(DROPOUT)
        self.dropout_b = nn.Dropout(DROPOUT)
        self.pool_a = nn.MaxPool2d(kernel_size=3, stride=2)
        self.pool_b = nn.MaxPool2d(kernel_size=3, stride=2)

        # Conv 1
        self.conv1_a = nn.Conv2d(3, 48, kernel_size=11, stride=4, padding=0).to(self.device_a)
        self.conv1_b = nn.Conv2d(3, 48, kernel_size=11, stride=4, padding=0).to(self.device_b)

        # Conv 2
        self.conv2_a = nn.Conv2d(48, 128, kernel_size=5, stride=1, padding=2).to(self.device_a)
        self.conv2_b = nn.Conv2d(48, 128, kernel_size=5, stride=1, padding=2).to(self.device_b)

        # Conv 3 (reads both branches, hence 128 * 2 input channels)
        self.conv3_a = nn.Conv2d(128 * 2, 192, kernel_size=3, stride=1, padding=1).to(self.device_a)
        self.conv3_b = nn.Conv2d(128 * 2, 192, kernel_size=3, stride=1, padding=1).to(self.device_b)

        # Conv 4
        self.conv4_a = nn.Conv2d(192, 192, kernel_size=3, stride=1, padding=1).to(self.device_a)
        self.conv4_b = nn.Conv2d(192, 192, kernel_size=3, stride=1, padding=1).to(self.device_b)

        # Conv 5
        self.conv5_a = nn.Conv2d(192, 128, kernel_size=3, stride=1, padding=1).to(self.device_a)
        self.conv5_b = nn.Conv2d(192, 128, kernel_size=3, stride=1, padding=1).to(self.device_b)

        # FC 6 (reads both branches' flattened 128 x 6 x 6 maps)
        self.fc6_a = nn.Linear(128 * 6 * 6 * 2, 2048).to(self.device_a)
        self.fc6_b = nn.Linear(128 * 6 * 6 * 2, 2048).to(self.device_b)

        # FC 7 (reads both branches' FC 6 outputs)
        self.fc7_a = nn.Linear(2048 * 2, 2048).to(self.device_a)
        self.fc7_b = nn.Linear(2048 * 2, 2048).to(self.device_b)

        # FC 8
        self.fc8 = nn.Linear(2048 * 2, NUM_CLASSES).to(self.device_a)

    def _exchange(self, x_a, x_b):
        """Return ``[a, b]`` concatenated along dim 1, once on each branch's device."""
        merged_a = torch.cat((x_a, x_b.to(self.device_a)), dim=1)
        merged_b = torch.cat((x_a.to(self.device_b), x_b), dim=1)
        return merged_a, merged_b

    def forward(self, x):
        """Return the class logits for an image batch.

        Args:
            x: image batch (batch, 3, H, W) on any device; it is copied to both
                branch devices.

        Returns:
            Logits from ``fc8``, located on the first branch device.
        """
        x_a = x.to(self.device_a)
        x_b = x.to(self.device_b)

        # Work for the two devices is interleaved so both have kernels queued
        # early; launches are asynchronous, so the host does not wait in between.
        # Conv 1
        x_a = self.pool_a(self.lrn_a(F.relu(self.conv1_a(x_a))))
        x_b = self.pool_b(self.lrn_b(F.relu(self.conv1_b(x_b))))
        # Conv 2
        x_a = self.pool_a(self.lrn_a(F.relu(self.conv2_a(x_a))))
        x_b = self.pool_b(self.lrn_b(F.relu(self.conv2_b(x_b))))

        # GPU communication: Conv 3 reads the feature maps of both branches.
        x_a, x_b = self._exchange(x_a, x_b)
        # Conv 3
        x_a = F.relu(self.conv3_a(x_a))
        x_b = F.relu(self.conv3_b(x_b))
        # Conv 4
        x_a = F.relu(self.conv4_a(x_a))
        x_b = F.relu(self.conv4_b(x_b))
        # Conv 5
        x_a = self.pool_a(F.relu(self.conv5_a(x_a)))
        x_b = self.pool_b(F.relu(self.conv5_b(x_b)))
        x_a = x_a.view(x_a.size(0), -1)
        x_b = x_b.view(x_b.size(0), -1)

        # GPU communication: FC 6 reads both branches.
        x_a, x_b = self._exchange(x_a, x_b)
        x_a = self.dropout_a(F.relu(self.fc6_a(x_a)))
        x_b = self.dropout_b(F.relu(self.fc6_b(x_b)))

        # GPU communication: FC 7 reads both branches.
        x_a, x_b = self._exchange(x_a, x_b)
        x_a = self.dropout_a(F.relu(self.fc7_a(x_a)))
        x_b = self.dropout_b(F.relu(self.fc7_b(x_b)))

        # FC 8: gather both branches on the first device for the final layer.
        x = torch.cat((x_a, x_b.to(self.device_a)), dim=1)
        logits = self.fc8(x)

        return logits

# Smoke test: build the two-GPU model and push one random input through it.
model = AlexNet()

# Example input: one 3-channel 227 x 227 tensor of standard-normal noise,
# moved to the current CUDA device.
input_size = (1, 3, 227, 227)
x = torch.randn(*input_size).to("cuda")

# Forward pass
output = model(x)
print(output)

Conv1  cuda:0
Conv1  cuda:1
Conv2  cuda:0
Conv2  cuda:1
Conv3  cuda:0
Conv3  cuda:1
Conv4  cuda:0
Conv4  cuda:1
Conv5  cuda:0
Conv5  cuda:1
FC6  cuda:0
FC6  cuda:1
FC7  cuda:0
FC7  cuda:1
FC8  cuda:0
tensor([[-1.3970e-02, -1.3960e-02,  6.6987e-03,  4.7104e-03, -1.1408e-02,
         -6.2351e-03, -5.7788e-03,  1.1038e-02,  4.0766e-03,  1.1278e-02,
          1.8552e-02, -6.8858e-03,  1.2358e-02, -2.5827e-02, -7.5963e-03,
         -1.6843e-03,  6.3736e-04,  8.5741e-03,  9.3615e-03, -2.6996e-03,
         -3.6860e-03, -1.5061e-02, -1.0182e-02,  1.4288e-02,  7.3025e-03,
         -6.8980e-05, -1.6342e-02, -9.9617e-03,  1.4559e-02, -1.1932e-02,
         -3.9111e-03, -1.8996e-02,  1.5565e-02, -5.2459e-03, -1.8285e-02,
         -3.8742e-03,  6.8766e-04, -1.4916e-02, -1.4892e-02, -1.8956e-02,
          1.7696e-02, -6.9104e-03, -2.3624e-03,  1.0591e-02,  1.3595e-02,
          7.1004e-03, -1.1271e-02,  1.5810e-02,  5.0691e-03, -8.0459e-03,
         -3.4708e-03,  8.3902e-03,  8.6477e-03, -1.1970e-02,