In [1]:
import torch

# Report the installed torch build and the device it will run on.
# The name of CUDA device 0 is shown when CUDA is available, otherwise 'CPU'.
if torch.cuda.is_available():
    device_name = torch.cuda.get_device_properties(0).name
else:
    device_name = 'CPU'
print(f"Setup complete. Using torch {torch.__version__} ({device_name})")

Setup complete. Using torch 2.4.1+cu121 (NVIDIA GeForce RTX 4070)


In [22]:
# !pip install matplotlib
# ! pip install einops
# !pip install torchsummary
!pip install opencv-python

Collecting opencv-python
  Using cached opencv_python-4.10.0.84-cp37-abi3-win_amd64.whl.metadata (20 kB)
Using cached opencv_python-4.10.0.84-cp37-abi3-win_amd64.whl (38.8 MB)
Installing collected packages: opencv-python
Successfully installed opencv-python-4.10.0.84


In [40]:
import torch
import torch.nn.functional as F
# matplotlib keeps its own name so that `F` stays bound to torch.nn.functional.
import matplotlib

from torch import nn
from torch import Tensor
from einops import rearrange, repeat



## Step 1. Project input to patches
### 방법1. 입력 이미지를 패치로 나누어주기 (linear)

In [37]:
# Stand-in input: one 3-channel 224 x 224 "image" sampled from a standard normal.
x = torch.randn(1, 3, 224, 224)
print('x :', x.shape)

#-------------------------------------------------
# Optional: load a real image instead of random data (kept disabled).
# import cv2

# batch_size = 1
# img_path = "C:/Users/kimin/st_defense_lab/st-defense-lab/Vision Transformer/test_images.jpg"
# x = cv2.imread(img_path)
# x = cv2.cvtColor(x, cv2.COLOR_BGR2RGB)
# x = repeat(x, 'h w c -> b c h w', b=batch_size)
# print('x :', x.shape)
#-------------------------------------------------

patch_size = 16   # side length of one square patch (16 x 16 pixels)

# Cut the image into a grid of non-overlapping patches and flatten every patch
# into a single vector.
#   h, w   : number of patches along height / width
#   s1, s2 : pixels per patch along height / width (both equal patch_size)
#   c      : channels
patches = rearrange(
    x,
    'b c (h s1) (w s2) -> b (h w) (s1 s2 c)',
    s1=patch_size,
    s2=patch_size,
)
print('patches :', patches.shape)


x : torch.Size([1, 3, 224, 224])
patches : torch.Size([1, 196, 768])


### 방법2. Conv layer를 활용하여 패치 나누기 


이 방법은 논문의 Hybrid Architecture로 언급

In [35]:
from einops.layers.torch import Rearrange
from torchsummary import summary

patch_size = 16
input_channels = 3
embedding_size = 768   # equals input_channels * patch_size * patch_size

# Patch extraction with a convolution: kernel size and stride both equal the
# patch size, so each output position covers exactly one non-overlapping patch.
# The trailing Rearrange flattens the patch grid into a sequence.
partition = nn.Sequential(
    nn.Conv2d(
        in_channels=input_channels,
        out_channels=embedding_size,
        kernel_size=patch_size,
        stride=patch_size,
    ),   # -> torch.Size([1, 768, 14, 14]) for the 224 x 224 input above
    Rearrange('b e (h) (w) -> b (h w) e'),
)

# Layer-by-layer summary for a single (channels, height, width) input.
summary(partition, x.shape[1:], device='cpu')

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1          [-1, 768, 14, 14]         590,592
         Rearrange-2             [-1, 196, 768]               0
Total params: 590,592
Trainable params: 590,592
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.57
Forward/backward pass size (MB): 2.30
Params size (MB): 2.25
Estimated Total Size (MB): 5.12
----------------------------------------------------------------


## Step2. Patches embedding
Trainable linear projection을 통해 x의 각 패치를 flatten한 벡터를 D차원으로 변환한 후, 이를 패치 임베딩으로 사용

Learnable class 임베딩(cls token)을 패치 임베딩 앞에 붙이고, 여기에 learnable position 임베딩을 더함

In [38]:
patch_size = 16
embedding_size = 768
img_size = 224

# Run the image through the conv-based partition to get one embedding per patch.
projected_x = partition(x)
print('Projected X shape :', projected_x.shape)   # (batch size, number of patches, embedding size)

# Learnable class token (the vector placed in front of the patch sequence) and
# learnable position embeddings: one row per patch plus one row for the class token.
cls_token = nn.Parameter(torch.randn(1, 1, embedding_size))
positions = nn.Parameter(torch.randn((img_size // patch_size) ** 2 + 1, embedding_size))
print('Cls Shape :', cls_token.shape, ', Pos Shape :', positions.shape) 

# Replicate the class token along the batch axis so it can be concatenated.
batch_size = 1
cls_token = repeat(cls_token, '() n e -> b n e', b=batch_size)
print('Repeated Cls shape :', cls_token.shape)   # (batch size, number of class tokens, embedding size)

# Prepend the class token to the patch sequence (dim=1 is the sequence axis).
concat = torch.cat((cls_token, projected_x), dim=1)   # ([1, 196 + 1, 768])

# Add the position embeddings; they broadcast over the batch axis.
concat = concat + positions
print('output :', concat.shape)

Projected X shape : torch.Size([1, 196, 768])
Cls Shape : torch.Size([1, 1, 768]) , Pos Shape : torch.Size([197, 768])
Repeated Cls shape : torch.Size([1, 1, 768])
tensor([[[ 0.4201, -0.6858, -0.4151,  ..., -1.1514,  0.7832, -1.1986],
         [ 0.2705,  0.1865, -0.2750,  ...,  1.0042,  1.1362,  0.0793],
         [ 0.1894,  0.5829,  0.4339,  ..., -0.4575, -0.3327, -0.3140],
         ...,
         [-0.3102, -0.2404, -0.6107,  ...,  0.5053, -0.8523, -0.5888],
         [-0.1326,  0.3110,  0.0939,  ..., -0.2882,  0.4311,  1.4987],
         [-0.3215, -0.1747, -0.4515,  ..., -0.7191, -0.3624,  0.6005]]],
       grad_fn=<CatBackward0>)
output : torch.Size([1, 197, 768])


In [41]:
class PatchEmbedding(nn.Module):
    """Convert a batch of images into a sequence of patch embeddings.

    The image is split into non-overlapping ``patch_size`` x ``patch_size``
    patches by a strided convolution, a learnable class token is prepended to
    the resulting sequence, and learnable position embeddings are added.

    Args:
        input_channels: Number of channels of the input images.
        patch_size: Side length of a square patch; used as both kernel size
            and stride of the convolution.
        embedding_size: Size of each patch embedding (conv output channels).
        img_size: Side length of the images the position table is sized for;
            the table has ``(img_size // patch_size) ** 2 + 1`` rows.
    """

    def __init__(self, input_channels: int=3, patch_size: int=16,
                 embedding_size: int=768, img_size: int=224):
        # nn.Module must be initialised before any attribute is assigned on
        # the subclass, so the parent constructor runs first.
        super().__init__()
        self.patch_size = patch_size
        self.partition = nn.Sequential(
            nn.Conv2d(input_channels, embedding_size,
                      kernel_size=patch_size, stride=patch_size),   # e.g. torch.Size([1, 768, 14, 14])
            Rearrange('b e (h) (w) -> b (h w) e')
        )
        self.cls_token = nn.Parameter(torch.randn(1, 1, embedding_size))
        # One position vector per patch plus one for the class token.
        self.positions = nn.Parameter(torch.randn((img_size // patch_size) ** 2 + 1, embedding_size))

    def forward(self, x: Tensor) -> Tensor:
        """Embed ``x`` of shape (batch, channels, height, width).

        Returns:
            Tensor of shape (batch, number of patches + 1, embedding_size).
        """
        b, _, _, _ = x.shape
        x = self.partition(x)
        # Replicate the single class token for every sample in the batch.
        cls_token = repeat(self.cls_token, '() n e -> b n e', b=b)
        x = torch.cat([cls_token, x], dim=1)
        # Position embeddings broadcast over the batch axis.
        return x + self.positions

# Build the embedding layer with its default arguments and print a per-layer
# summary for a single 3 x 224 x 224 input.
PE = PatchEmbedding()
summary(PE, input_size=(3, 224, 224), device='cpu')

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1          [-1, 768, 14, 14]         590,592
         Rearrange-2             [-1, 196, 768]               0
Total params: 590,592
Trainable params: 590,592
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.57
Forward/backward pass size (MB): 2.30
Params size (MB): 2.25
Estimated Total Size (MB): 5.12
----------------------------------------------------------------


## Step3. Transformer encoder
임베딩을 Transformer encoder에 input으로 넣어 마지막 layer에서 class embedding에 대한 output인 image representation을 도출

### Multi-Head Attention
패치에 대해 self-attention 메커니즘 적용

In [42]:
embedding_size = 768
num_heads = 8

# Three separate square linear projections, instantiated in the order
# keys -> queries -> values.
keys, queries, values = (
    nn.Linear(embedding_size, embedding_size) for _ in range(3)
)
print(keys, queries, values)

# NOTE(review): this rebinds `x` from the raw image batch to its patch
# embedding, so running the cell a second time passes the already-embedded
# tensor back into PE.
x = PE(x)
print(queries)

Linear(in_features=768, out_features=768, bias=True) Linear(in_features=768, out_features=768, bias=True) Linear(in_features=768, out_features=768, bias=True)


### Transformer Encoder block 

In [None]:
class ResidualAdd(nn.Module):
    """Wrap a callable with a skip connection: ``forward(x) = fn(x) + x``.

    Args:
        fn: Module (or any callable) applied to the input; its output is
            summed with the unmodified input.
    """

    def __init__(self, fn):
        super().__init__()
        self.fn = fn

    def forward(self, x, **kwargs):
        """Return ``self.fn(x, **kwargs) + x``; keyword arguments go to ``fn``."""
        # The sum is computed out of place. An in-place add on fn's output
        # would corrupt tensors autograd saved for backward (e.g. when fn ends
        # in Tanh) and would overwrite the caller's input whenever fn returns
        # its argument unchanged.
        return self.fn(x, **kwargs) + x

class FeedForwardBlock(nn.Sequential):
    """Two-layer MLP: Linear -> GELU -> Dropout -> Linear.

    Args:
        embedding_size: Input and output feature size.
        expansion: Multiplier giving the hidden size
            (``expansion * embedding_size``).
        drop_p: Dropout probability applied after the activation.
    """

    def __init__(self, embedding_size: int, expansion: int=4, drop_p: float=0.):
        hidden_size = expansion * embedding_size
        # Layers are created in forward order so parameter initialisation
        # draws from the RNG in the same sequence as they are applied.
        layers = [
            nn.Linear(embedding_size, hidden_size),
            nn.GELU(),
            nn.Dropout(drop_p),
            nn.Linear(hidden_size, embedding_size),
        ]
        super().__init__(*layers)

class TransformerEncoderBlock(nn.Sequential):
    """One encoder layer: an attention branch followed by a feed-forward branch.

    Each branch is wrapped in ``ResidualAdd`` and consists of a LayerNorm, the
    sub-layer itself, and a Dropout.

    Args:
        embedding_size: Feature size of the tokens.
        drop_p: Dropout probability used at the end of both branches.
        forward_expansion: ``expansion`` passed to ``FeedForwardBlock``.
        forward_drop_p: ``drop_p`` passed to ``FeedForwardBlock``.
        **kwargs: Forwarded unchanged to ``MultiHeadAttention``
            (defined outside this block — accepted options not verified here).
    """

    def __init__(self, embedding_size: int=768,
                 drop_p: float=0.,
                 forward_expansion: int=4,
                 forward_drop_p: float=0.,
                 **kwargs):
        # Attention branch is built first, then the feed-forward branch, so
        # parameters are initialised in the order the layers are applied.
        attention_branch = nn.Sequential(
            nn.LayerNorm(embedding_size),
            MultiHeadAttention(embedding_size, **kwargs),
            nn.Dropout(drop_p),
        )
        feed_forward_branch = nn.Sequential(
            nn.LayerNorm(embedding_size),
            FeedForwardBlock(embedding_size, expansion=forward_expansion, drop_p=forward_drop_p),
            nn.Dropout(drop_p),
        )
        super().__init__(
            ResidualAdd(attention_branch),
            ResidualAdd(feed_forward_branch),
        )
