#### model 분석하기

In [22]:
import torch
import torch.nn as nn
import torch.nn.init as init
import timm
import sys

class PromptInput(nn.Module):
    """Holds one set of learnable prompt tokens per transformer layer.

    Args:
        num_prompts: Number of prompt tokens kept for each layer.
        embed_dim: Size of the last dimension of every prompt token.
        num_layers: Number of layers that get their own prompt set.
    """

    def __init__(self, num_prompts, embed_dim = 768, num_layers = 12):
        super().__init__()
        self.num_prompts = num_prompts
        self.embed_dim = embed_dim

        # One (num_prompts, embed_dim) table per layer, stacked along dim 0.
        prompt_table = torch.zeros(num_layers, num_prompts, embed_dim)
        self.prompts = nn.Parameter(prompt_table)

        # Overwrite the zeros in place with Kaiming-uniform values.
        init.kaiming_uniform_(self.prompts)

    def prepend_prompt(self, x, layer_idx):
        """Place the prompts of ``layer_idx`` right after the first token of ``x``.

        For ``layer_idx == 0`` the prompts are inserted between the first
        token and the remaining tokens. For any other index the
        ``num_prompts`` tokens that follow the first token are dropped and
        replaced by this layer's prompts, so the sequence length is unchanged.

        Args:
            x: Rank-3 tensor indexed as (batch, token, feature).
            layer_idx: Index into the first dimension of ``self.prompts``.

        Returns:
            The tensor concatenated along the token dimension (dim 1).
        """
        first_token = x[:, :1, :]

        # Layer 0 keeps every token after the first one; later layers skip
        # the prompt tokens that an earlier call already put there.
        keep_from = 1 if layer_idx == 0 else 1 + self.num_prompts
        remaining_tokens = x[:, keep_from:, :]

        # Broadcast this layer's prompts over the batch without copying.
        layer_prompts = self.prompts[layer_idx, :, :].expand(x.shape[0], -1, -1)

        return torch.cat((first_token, layer_prompts, remaining_tokens), dim=1)
    

# ViT with prompt tokens added (VPT)

class Vpt_ViT(nn.Module):
    """timm Vision Transformer that receives learnable prompt tokens before every block.

    Args:
        pretrained_model: Model name passed to ``timm.create_model``.
        img_size: Input image size passed to ``timm.create_model``.
        patch_size: Patch size passed to ``timm.create_model``.
        num_classes: Number of output classes passed to ``timm.create_model``.
        embed_dim: Kept for backward compatibility. The prompt width is taken
            from the loaded backbone so the two can never disagree.
        depth: Kept for backward compatibility. The number of prompt sets is
            taken from the number of backbone blocks.
        num_heads, mlp_ratio, qkv_bias, drop_rate, attn_drop_rate,
        drop_path_rate: Accepted so existing callers keep working; not used.
    """

    def __init__(self, pretrained_model= 'vit_base_patch16_224',img_size=32, patch_size=4, num_classes=10, embed_dim=768, depth=12, num_heads=12, mlp_ratio=4., qkv_bias=True, drop_rate=0., attn_drop_rate=0., drop_path_rate=0.):
        super().__init__()

        # Build the pretrained backbone first: its token width and block count
        # decide the shape of the prompt table.
        backbone = timm.create_model(pretrained_model, pretrained = True, img_size = img_size, patch_size = patch_size, num_classes = num_classes)

        # The last dimension of cls_token is the token width used by every
        # block; one prompt set is needed per block iterated in forward().
        token_dim = backbone.cls_token.shape[-1]
        num_blocks = len(backbone.blocks)

        # Assign the prompt module before the backbone so the order of
        # named_parameters() / state_dict() stays "prompt_embedding" first.
        self.prompt_embedding = PromptInput(num_prompts=100, embed_dim=token_dim, num_layers=num_blocks)
        self.model = backbone

    def forward(self, x):
        """Run the backbone on ``x``, swapping in layer-specific prompts before each block.

        Args:
            x: Image batch accepted by the backbone's ``patch_embed``.

        Returns:
            The output of the backbone's ``forward_head``.
        """
        x = self.model.patch_embed(x)

        # Put the class token in front of the patch tokens.
        cls_tokens = self.model.cls_token.expand(x.shape[0], -1, -1)
        x = torch.cat((cls_tokens, x), dim=1)

        # Position embedding is added before any prompt token is inserted,
        # so prompts themselves carry no position embedding.
        x = x + self.model.pos_embed
        x = self.model.pos_drop(x)

        for idx, block in enumerate(self.model.blocks):
            # Block 0 inserts its prompts; later blocks replace the previous ones.
            x = self.prompt_embedding.prepend_prompt(x, idx)
            x = block(x)

        x = self.model.norm(x)  # final layer normalization
        # NOTE(review): the sequence still contains the prompt tokens here —
        # confirm the backbone's head pooling only reads the class token.
        x = self.model.forward_head(x)
        return x

In [5]:
# Use the first CUDA device when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

# Prompt-tuned ViT configured for CIFAR-10-sized inputs, moved to the chosen device.
model = Vpt_ViT(pretrained_model='vit_base_patch16_224', img_size=32, patch_size=4, num_classes=10).to(device=device)
input_tensor = torch.randn(8, 3, 32, 32).to(device)

# Freeze every parameter whose name mentions the transformer blocks or the
# class token, and report the trainable state of each parameter.
for name, param in model.named_parameters():
    is_frozen = any(tag in name for tag in ('blocks', 'cls_token'))
    if is_frozen:
        param.requires_grad = False
    print(f"Layer: {name} | Requires Grad: {param.requires_grad} | Shape: {param.shape}")


Layer: prompt_embedding.prompts | Requires Grad: True | Shape: torch.Size([12, 100, 768])
Layer: model.cls_token | Requires Grad: False | Shape: torch.Size([1, 1, 768])
Layer: model.pos_embed | Requires Grad: True | Shape: torch.Size([1, 65, 768])
Layer: model.patch_embed.proj.weight | Requires Grad: True | Shape: torch.Size([768, 3, 4, 4])
Layer: model.patch_embed.proj.bias | Requires Grad: True | Shape: torch.Size([768])
Layer: model.blocks.0.norm1.weight | Requires Grad: False | Shape: torch.Size([768])
Layer: model.blocks.0.norm1.bias | Requires Grad: False | Shape: torch.Size([768])
Layer: model.blocks.0.attn.qkv.weight | Requires Grad: False | Shape: torch.Size([2304, 768])
Layer: model.blocks.0.attn.qkv.bias | Requires Grad: False | Shape: torch.Size([2304])
Layer: model.blocks.0.attn.proj.weight | Requires Grad: False | Shape: torch.Size([768, 768])
Layer: model.blocks.0.attn.proj.bias | Requires Grad: False | Shape: torch.Size([768])
Layer: model.blocks.0.norm2.weight | Requir

In [11]:
import torch
import torch.nn as nn
import torch.nn.init as init
import timm

class PromptInput(nn.Module):
    """Container for per-layer learnable prompt tokens.

    Args:
        num_prompts: How many prompt tokens each layer owns.
        embed_dim: Feature size of a single prompt token.
        num_layers: How many independent prompt sets to allocate.
    """

    def __init__(self, num_prompts, embed_dim=768, num_layers=12):
        super().__init__()
        self.num_prompts = num_prompts
        self.embed_dim = embed_dim

        # Stacked prompt sets: (num_layers, num_prompts, embed_dim).
        prompt_shape = (num_layers, num_prompts, embed_dim)
        self.prompts = nn.Parameter(torch.zeros(*prompt_shape))

        # Replace the zero fill with Kaiming-uniform values, in place.
        init.kaiming_uniform_(self.prompts)

    def prepend_prompt(self, x, layer_idx):
        """Return ``x`` with the prompts of ``layer_idx`` placed after its first token.

        When ``layer_idx`` is 0 the prompts are inserted and the sequence
        grows by ``num_prompts``. For every other index the ``num_prompts``
        tokens following the first token are replaced, keeping the length.

        Args:
            x: Rank-3 tensor indexed as (batch, token, feature).
            layer_idx: Which prompt set of ``self.prompts`` to use.

        Returns:
            The concatenation along dim 1 of the first token, the prompts,
            and the kept tail of ``x``.
        """
        # Number of leading tokens (first token plus any earlier prompts)
        # that must not be carried over into the tail.
        if layer_idx == 0:
            tail_start = 1
        else:
            tail_start = 1 + self.num_prompts

        # Expand shares storage across the batch instead of copying.
        batched_prompts = self.prompts[layer_idx, :, :].expand(x.shape[0], -1, -1)
        pieces = (x[:, :1, :], batched_prompts, x[:, tail_start:, :])
        return torch.cat(pieces, dim=1)

# VPT with ViT model
class Vpt_ViT(nn.Module):
    """timm Vision Transformer that receives learnable prompt tokens before every block.

    Args:
        pretrained_model: Model name passed to ``timm.create_model``.
        img_size: Input image size passed to ``timm.create_model``.
        patch_size: Patch size passed to ``timm.create_model``.
        num_classes: Number of output classes passed to ``timm.create_model``.
        embed_dim: Kept for backward compatibility. The prompt width is taken
            from the loaded backbone so the two can never disagree.
        depth: Kept for backward compatibility. The number of prompt sets is
            taken from the number of backbone blocks.
    """

    def __init__(self, pretrained_model='vit_base_patch16_224', img_size=32, patch_size=4, num_classes=10, embed_dim=768, depth=12):
        super().__init__()

        # Load the pretrained model from timm first: its token width and
        # block count decide the shape of the prompt table.
        backbone = timm.create_model(pretrained_model, pretrained=True, img_size=img_size, patch_size=patch_size, num_classes=num_classes)

        # The last dimension of cls_token is the token width used by every
        # block; one prompt set is needed per block iterated in forward().
        token_dim = backbone.cls_token.shape[-1]
        num_blocks = len(backbone.blocks)

        # Assign the prompt module before the backbone so the order of
        # named_parameters() / state_dict() stays "prompt_embedding" first.
        self.prompt_embedding = PromptInput(num_prompts=100, embed_dim=token_dim, num_layers=num_blocks)
        self.model = backbone

    def forward(self, x):
        """Run the backbone on ``x``, swapping in layer-specific prompts before each block.

        Args:
            x: Image batch accepted by the backbone's ``patch_embed``.

        Returns:
            The output of the backbone's ``forward_head``.
        """
        x = self.model.patch_embed(x)

        # Put the class token in front of the patch tokens.
        cls_tokens = self.model.cls_token.expand(x.shape[0], -1, -1)
        x = torch.cat((cls_tokens, x), dim=1)

        # Position embedding is added before any prompt token is inserted,
        # so prompts themselves carry no position embedding.
        x = x + self.model.pos_embed
        x = self.model.pos_drop(x)

        for idx, block in enumerate(self.model.blocks):
            # Block 0 inserts its prompts; later blocks replace the previous ones.
            x = self.prompt_embedding.prepend_prompt(x, idx)
            x = block(x)

        x = self.model.norm(x)  # Final layer normalization
        # NOTE(review): the sequence still contains the prompt tokens here —
        # confirm the backbone's head pooling only reads the class token.
        x = self.model.forward_head(x)
        return x

# Use the first CUDA device when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda:0')
else:
    device = torch.device('cpu')

# Build the prompt-tuned ViT and move it onto the selected device in one step.
model = Vpt_ViT(pretrained_model='vit_base_patch16_224', img_size=32, patch_size=4, num_classes=10).to(device=device)

# Random stand-in for a CIFAR-10 batch: 8 images, 3 channels, 32x32 pixels.
input_tensor = torch.randn(8, 3, 32, 32).to(device)

# Single forward pass through the model; the result is kept in `output`.
output = model(input_tensor)


RuntimeError: The size of tensor a (64) must match the size of tensor b (65) at non-singleton dimension 1