In [1]:
import torch

print(f"Setup complete. Using torch {torch.__version__} ({torch.cuda.get_device_properties(0).name if torch.cuda.is_available() else 'CPU'})")

Setup complete. Using torch 2.4.1+cu121 (NVIDIA GeForce RTX 4070 Ti SUPER)


In [189]:
# !pip install matplotlib
# ! pip install einops
# !pip install torchsummary
# !pip install opencv-python
!pip install tqdm

Collecting tqdm
  Using cached tqdm-4.66.5-py3-none-any.whl.metadata (57 kB)
Using cached tqdm-4.66.5-py3-none-any.whl (78 kB)
Installing collected packages: tqdm
Successfully installed tqdm-4.66.5


In [178]:
import torch
import torch.nn.functional as F
import matplotlib as plt

from torch import nn
from torch import Tensor
from einops import rearrange, repeat



## Step 1. Project input to patches
### 방법1. 입력 이미지를 패치로 나누어주기 (linear)

In [179]:
x = torch.randn(1, 3, 224, 224) 
print('x :', x.shape)

#-------------------------------------------------
# import cv2

# batch_size = 1
# img_path = "C:/Users/kimin/st_defense_lab/st-defense-lab/Vision Transformer/test_images.jpg"
# x = cv2.imread(img_path)
# x = cv2.cvtColor(x, cv2.COLOR_BGR2RGB)
# x = repeat(x, 'h w c -> b c h w', b=batch_size)
# print('x :', x.shape)
#-------------------------------------------------

patch_size = 16   # 16 x 16 사이즈 패치 
#print(x[0][0][0])
patches = rearrange(x, 'b c (h s1) (w s2) -> b (h w) (s1 s2 c)',   # h: height, w: width, c: channel, s1,s2: patch_size
                    s1=patch_size, s2=patch_size)
print('patches :', patches.shape)


x : torch.Size([1, 3, 224, 224])
patches : torch.Size([1, 196, 768])


### 방법2. Conv layer를 활용하여 패치 나누기 


이 방법은 논문의 Hybrid Architecture로 언급

In [180]:
from einops.layers.torch import Rearrange
from torchsummary import summary

patch_size = 16
input_channels = 3
embedding_size = 768   # channel * patch_size * patch_size

partition = nn.Sequential(
    nn.Conv2d(input_channels, embedding_size,
              kernel_size=patch_size, stride=patch_size),   # torch.Size([1, 768, 14, 14])
    Rearrange('b e (h) (w) -> b (h w) e'))

summary(partition, x.shape[1:], device='cpu')

----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1          [-1, 768, 14, 14]         590,592
         Rearrange-2             [-1, 196, 768]               0
Total params: 590,592
Trainable params: 590,592
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.57
Forward/backward pass size (MB): 2.30
Params size (MB): 2.25
Estimated Total Size (MB): 5.12
----------------------------------------------------------------


## Step2. Patches embedding
Trainable linear projection을 통해 x의 각 패치를 flatten한 벡터를 D차원으로 변환한 후, 이를 패치 임베딩으로 사용

Learnable 임베딩과 패치 임베딩에 learnable position 임베딩을 더함

In [181]:
patch_size = 16
embedding_size = 768
img_size = 224

# 이미지를 패치사이즈로 나누고 flatten
projected_x = partition(x)   # flatten 과정
print('Projected X shape :', projected_x.shape)   # (배치 크기, 패치 수, 임베딩 크기)

# cls_token과 position embedding parameter 정의
cls_token = nn.Parameter(torch.randn(1, 1, embedding_size))   # class token으로 Transformer 기반 모델에서 입력 시퀀스 앞에 추가되는 벡터 (논문 이미지에서 *에 해당하는 부분)
positions = nn.Parameter(torch.randn((img_size // patch_size) ** 2 + 1, embedding_size))   # 각 패치와 클래스 토큰에 대한 positional embedding을 위한 trainable한 parameter (14 x 14는 패치의 개수, +1은 CLS token)
print('Cls Shape :', cls_token.shape, ', Pos Shape :', positions.shape) 

# cls_token을 반복하여 batch_size의 크기와 맞춰줌
batch_size = 1
cls_token = repeat(cls_token, '() n e -> b n e', b=batch_size)   # cls_token을 배치 크기에 맞춰 복제
print('Repeated Cls shape :', cls_token.shape)   # (배치 크기, cls_token 개수, 임베딩 크기)

# cls_token과 projected_x를 concatenate
concat = torch.cat([cls_token, projected_x], dim=1)   # ([1, 196 + 1, 768])  dim=1로 설정해서 cls_token을 패치 앞에 추가

# position embedding을 더해줌
concat += positions   # 패치와 cls_token에 대한 위치 정보 더함 
print('output :', concat.shape)

Projected X shape : torch.Size([1, 196, 768])
Cls Shape : torch.Size([1, 1, 768]) , Pos Shape : torch.Size([197, 768])
Repeated Cls shape : torch.Size([1, 1, 768])
output : torch.Size([1, 197, 768])


In [182]:
class PatchEmbedding(nn.Module):
    def __init__(self, input_channels: int=3, patch_size: int=16,
                 embedding_size: int=768, img_size: int=224):
        self.patch_size = patch_size
        super().__init__()
        self.partition = nn.Sequential(
                nn.Conv2d(input_channels, embedding_size,
              kernel_size=patch_size, stride=patch_size),   # torch.Size([1, 768, 14, 14])
                 Rearrange('b e (h) (w) -> b (h w) e')
        )
        self.cls_token = nn.Parameter(torch.randn(1,1, embedding_size))
        self.positions = nn.Parameter(torch.randn((img_size // patch_size) **2 + 1, embedding_size))
    
    def forward(self, x: Tensor) -> Tensor:
        b, _, _, _ = x.shape
        x = self.partition(x)   # flatten 과정
        cls_token = repeat(self.cls_token, '() n e -> b n e', b=b )   # cls_token을 배치 크기에 맞춰 복제
        print(cls_token.shape)
        x = torch.cat([cls_token, x], dim=1)   # ([1, 196 + 1, 768])  dim=1로 설정해서 cls_token을 패치 앞에 추가
        print(x.shape)
        x += self.positions   # position embedding 더하기

        return x

PE = PatchEmbedding()
summary(PE, (3, 224, 224), device='cpu')

torch.Size([2, 1, 768])
torch.Size([2, 197, 768])
----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1          [-1, 768, 14, 14]         590,592
         Rearrange-2             [-1, 196, 768]               0
Total params: 590,592
Trainable params: 590,592
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.57
Forward/backward pass size (MB): 2.30
Params size (MB): 2.25
Estimated Total Size (MB): 5.12
----------------------------------------------------------------


## Step3. Transformer encoder
임베딩을 Transformer encode에 input으로 넣어 마지막 layer에서 class embedding에 대한 output인 image representation을 도출

### Multi-Head Attention
패치에 대해 self-attention 메커니즘 적용

In [183]:
embedding_size = 768
num_heads = 8   # 8개의 head로 구성

keys = nn.Linear(embedding_size, embedding_size)
queries = nn.Linear(embedding_size, embedding_size)
values = nn.Linear(embedding_size, embedding_size)
#print(keys, queries, values)

x = PE(x)   # position embedding 적용 
print(queries(x).shape)   # 배치 크기, n(시퀀스 길이 -> 패치 수), 임베딩 크기
queries = rearrange(queries(x), "b n (h d) -> b h n d", h=num_heads)   # batch_size(b), num_head(h), n(시퀀스 길이), head_dim(d) = emb_size/head(각 head의 차원)
                                                                       # rearrange에서 "b n (h d)" 여기서 h d는 embedding_size를 두 개의 차원인 h와 d로 나누라는 의미
                                                                       # embedding_size = h * d가 되므로, rearrange는 자동으로 embedding/num_heads를 d로 정의
keys = rearrange(keys(x), "b n (h d) -> b h n d", h=num_heads)
values = rearrange(values(x), "b n (h d) -> b h n d", h=num_heads)

print('shape :', queries.shape, keys.shape, values.shape)

torch.Size([1, 1, 768])
torch.Size([1, 197, 768])
torch.Size([1, 197, 768])
shape : torch.Size([1, 8, 197, 96]) torch.Size([1, 8, 197, 96]) torch.Size([1, 8, 197, 96])


In [184]:
score = torch.einsum('bhqd, bhkd -> bhqk', queries, keys)   # Queries * Keys(transpose) 내적 => 'bhqd, bhkd -> bhqk' queries와 key 간의 내적을 계산하는 부분
                                                            # b: batch_size, h: num_head, q: query_len(query의 시퀀스 길이), d: head_dim(각 헤드에서의 차원) => 'bhqd'
                                                            # b: batch_size, h: num_head, q: key_len(key의 시퀀스 길이), d: head_dim(각 헤드에서의 차원)     => 'bhkd'
                                                            # 'qd' * 'kd' 내적 -> 'bhqk'가 됨
print('score :', score.shape)

scaling = embedding_size ** (1/2)
attention = F.softmax(score / scaling, dim=-1)
print('attention :', attention.shape)
print('values :', values.shape)

output = torch.einsum('bhal, bhlv -> bhav', attention, values)   # attention * values 내적 
print('output :', output.shape)

output = rearrange(output, "b h n d -> b n (h d)")
print('output2 :', output.shape)

score : torch.Size([1, 8, 197, 197])
attention : torch.Size([1, 8, 197, 197])
values : torch.Size([1, 8, 197, 96])
output : torch.Size([1, 8, 197, 96])
output2 : torch.Size([1, 197, 768])


In [188]:
class MultiHeadAttention(nn.Module):
    def __init__(self, embedding_size: int=768,
                 num_heads: int=8,
                 dropout: float=0):
        super().__init__()
        self.embedding_size = embedding_size
        self.num_heads = num_heads
        # fuse the queries, keys and values in one matrix
        self.qkv = nn.Linear(embedding_size, embedding_size * 3)
        self.att_drop = nn.Dropout(dropout)
        self.projection = nn.Linear(embedding_size, embedding_size)

    def forward(self, x: Tensor, mask: Tensor = None) -> Tensor:
        # split keys, queries and values in num_heads
        qkv = rearrange(self.qkv(x), "b n (h d qkv) -> (qkv) b h n d", h=num_heads, qkv=3)
        queries, keys, values = qkv[0], qkv[1], qkv[2]
        # sum up over the last axis
        score = torch.einsum('bhqd, bhkd -> bhqk', queries, keys)
        
        if mask is not None:
            fill_value = torch.finfo(torch.float32),min
            score.mask_fill(~mask, fill_value)
        
        scaling = self.embedding_size ** (1/2)
        attention = F.softmax(score / scaling, dim=-1)
        attention = self.att_drop(attention)
        
        output = torch.einsum('bhal, bhlv -> bhav', attention, values)
        output = rearrange(output, "b h n d -> b n (h d)")
        output = self.projection(output)

        return output
    
x = torch.randn(8, 3, 224, 224)
PE = PatchEmbedding()
x = PE(x)
print(x.shape)
MHA = MultiHeadAttention()
summary(MHA, x.shape[1:], device='cpu')

torch.Size([8, 1, 768])
torch.Size([8, 197, 768])
torch.Size([8, 197, 768])
----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Linear-1            [-1, 197, 2304]       1,771,776
           Dropout-2          [-1, 8, 197, 197]               0
            Linear-3             [-1, 197, 768]         590,592
Total params: 2,362,368
Trainable params: 2,362,368
Non-trainable params: 0
----------------------------------------------------------------
Input size (MB): 0.58
Forward/backward pass size (MB): 6.99
Params size (MB): 9.01
Estimated Total Size (MB): 16.57
----------------------------------------------------------------


### Transformer Encoder block 

In [173]:
class ResidualAdd(nn.Module):
    def __init__(self, fn):
        super().__init__()
        self.fn = fn
    
    def forward(self, x, **kwargs):
        res = x
        x = self.fn(x, **kwargs)
        x += res
        return x

class FeedForwardBlock(nn.Sequential):
    def __init__(self, embedding_size: int, expansion: int=4, drop_p: float=0.):
        super().__init__(
            nn.Linear(embedding_size, expansion * embedding_size),
            nn.GELU(),
            nn.Dropout(drop_p),
            nn.Linear(expansion * embedding_size, embedding_size)
        )

class TransformerEncoderBlock(nn.Sequential):
    def __init__(self, embedding_size: int=768,
                 drop_p: float=0.,
                 forward_expansion: int=4,
                 forward_drop_p: float=0.,
                 **kwargs):
        super().__init__(
            ResidualAdd(nn.Sequential(
                nn.LayerNorm(embedding_size),
                MultiHeadAttention(embedding_size, **kwargs),
                nn.Dropout(drop_p)
            )),
            ResidualAdd(nn.Sequential(
                nn.LayerNorm(embedding_size),
                FeedForwardBlock(embedding_size, expansion=forward_expansion, drop_p=forward_drop_p),
                nn.Dropout(drop_p)
            ))
        )


## Step4. Classification Head
classification을 위한 MLP Head 부분

In [187]:
from einops.layers.torch import Reduce

class TransformerEncoder(nn.Sequential):
    def __init__(self, depth: int=12, **kwargs):
        super().__init__(*[TransformerEncoderBlock(**kwargs) for _ in range(depth)])

class ClassificationHead(nn.Sequential):
    def __init__(self, embedding_size: int=768,
                 n_classes: int=1000):
        super().__init__(
            Reduce('b n e -> b e', reduction='mean'),
            nn.LayerNorm(embedding_size),
            nn.Linear(embedding_size, n_classes)
        )

class ViT(nn.Sequential):
    def __init__(self,
                 input_channels: int=3,
                 patch_size: int=16,
                 embedding_size: int=768,
                 img_size: int=224,
                 depth: int=12,
                 n_classes: int=1000, **kwargs):
        super().__init__(
            PatchEmbedding(input_channels, patch_size, embedding_size, img_size),
            TransformerEncoder(depth, embedding_size=embedding_size, **kwargs),
            ClassificationHead(embedding_size, n_classes)
        )

summary(ViT(), (3, 224, 224), device='cpu')

torch.Size([2, 1, 768])
torch.Size([2, 197, 768])
----------------------------------------------------------------
        Layer (type)               Output Shape         Param #
            Conv2d-1          [-1, 768, 14, 14]         590,592
         Rearrange-2             [-1, 196, 768]               0
    PatchEmbedding-3             [-1, 197, 768]               0
         LayerNorm-4             [-1, 197, 768]           1,536
            Linear-5            [-1, 197, 2304]       1,771,776
           Dropout-6          [-1, 8, 197, 197]               0
            Linear-7             [-1, 197, 768]         590,592
MultiHeadAttention-8             [-1, 197, 768]               0
           Dropout-9             [-1, 197, 768]               0
      ResidualAdd-10             [-1, 197, 768]               0
        LayerNorm-11             [-1, 197, 768]           1,536
           Linear-12            [-1, 197, 3072]       2,362,368
             GELU-13            [-1, 197, 3072]      