In [None]:
import cv2
import torch
import numpy as np
from PIL import Image
from torchvision import transforms
from transformers import ViTFeatureExtractor, ViTForImageClassification

ModuleNotFoundError: No module named 'transformers'

In [None]:
# Use the GPU when one is present; otherwise fall back to the CPU so the
# notebook still runs on machines without CUDA.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# One name shared by the feature extractor and the classifier so the two
# cannot drift apart.
model_name = 'google/vit-base-patch16-224-in21k'
feature_extractor = ViTFeatureExtractor.from_pretrained(model_name)
model = ViTForImageClassification.from_pretrained(model_name, num_labels=5).to(device)

# map_location remaps the stored tensors onto `device`, so weights saved from
# a GPU session also load on a CPU-only host.
# NOTE(review): torch.load unpickles the file - only load trusted checkpoints.
model.load_state_dict(torch.load('vit_trained.pth', map_location=device))

# Switch to evaluation mode before running inference.
model.eval()

NameError: name 'ViTForImageClassification' is not defined

In [None]:
def rollout(attentions, discard_ratio, head_fusion):
    """Compute an attention-rollout mask from per-layer attention tensors.

    For each layer the heads are fused into one matrix, the weakest
    ``discard_ratio`` fraction of its entries is zeroed (flat entry 0, the
    class-token-to-class-token weight, is never dropped), the identity is
    averaged in, every row is normalised to sum to one, and the matrix is
    multiplied onto the running product of the previous layers.

    Args:
        attentions: Non-empty sequence of tensors, one per layer, in forward
            order. Heads are fused over dim 1 and the last two dims are
            treated as a square token-by-token matrix, i.e. the layout is
            assumed to be ``(batch, heads, tokens, tokens)``. Token 0 is
            treated as the class token. Only batch item 0 contributes to the
            returned mask.
        discard_ratio: Fraction (0..1) of the fused attention entries with the
            lowest values to zero out in each layer.
        head_fusion: How to combine heads: ``"mean"``, ``"max"`` or ``"min"``.

    Returns:
        ``numpy.ndarray`` of shape ``(width, width)`` with
        ``width = int(sqrt(tokens - 1))``, scaled so its maximum is 1.

    Raises:
        ValueError: If ``head_fusion`` is not one of the supported modes.
    """
    fusers = {
        "mean": lambda attn: attn.mean(dim=1),
        "max": lambda attn: attn.max(dim=1)[0],
        "min": lambda attn: attn.min(dim=1)[0],
    }
    if head_fusion not in fusers:
        # Raising a bare string is itself a TypeError in Python 3.
        raise ValueError("Attention head fusion type Not supported")
    fuse = fusers[head_fusion]

    first = attentions[0]
    num_tokens = first.size(-1)
    # Build the identity on the same device/dtype as the attention maps so the
    # function also works when the caller did not move them to the CPU.
    identity = torch.eye(num_tokens, dtype=first.dtype, device=first.device)
    result = identity

    with torch.no_grad():
        for attention in attentions:
            attention_heads_fused = fuse(attention)

            # Drop the lowest attentions, but don't drop the class token.
            # `flat` is a view, so zeroing it edits `attention_heads_fused`.
            flat = attention_heads_fused.view(attention_heads_fused.size(0), -1)
            num_discard = int(flat.size(-1) * discard_ratio)
            if num_discard > 0:
                _, indices = flat.topk(num_discard, dim=-1, largest=False)
                # Use batch item 0's own indices only; mixing in the indices
                # of other batch items would discard extra entries.
                drop = indices[0]
                drop = drop[drop != 0]
                flat[0, drop] = 0

            # Average in the identity, then make every row sum to one.
            a = (attention_heads_fused + 1.0 * identity) / 2
            # keepdim=True divides row i by its own sum; without it the
            # broadcast divides column j by the sum of row j instead.
            a = a / a.sum(dim=-1, keepdim=True)

            result = torch.matmul(a, result)  # [batch, tokens, tokens]

    # Look at the total attention between the class token
    # and the image patches.
    mask = result[0, 0, 1:]  # [tokens - 1]
    # In case of a 224x224 image with 16x16 patches, this brings us from 196 to 14.
    width = int(mask.size(-1) ** 0.5)
    mask = mask.reshape(width, width).cpu().numpy()
    mask = mask / np.max(mask)

    return mask  # [width, width]

In [10]:
class VITAttentionRollout:
    """Collect attention maps through forward hooks and turn them into a rollout mask.

    A forward hook is registered on every sub-module of ``model`` whose
    qualified name contains ``attention_layer_name``. Each hooked module's
    output is stored (on the CPU) during a forward pass and the collected
    list is handed to ``rollout``.

    Args:
        model: A ``torch.nn.Module`` to inspect.
        attention_layer_name: Substring selecting the modules to hook. The
            hooked modules' outputs are assumed to be the attention tensors
            that ``rollout`` expects - check ``model.named_modules()`` to find
            the right name for a given architecture.
        head_fusion: Head fusion mode forwarded to ``rollout``.
        discard_ratio: Discard ratio forwarded to ``rollout``.
    """

    def __init__(self, model, attention_layer_name='attn_drop', head_fusion='mean', discard_ratio=0.9):
        self.model = model
        self.head_fusion = head_fusion
        self.discard_ratio = discard_ratio
        self.attentions = []
        # Keep the handles so the hooks can be detached again; otherwise every
        # new instance leaves another permanent hook on the model.
        self._hook_handles = [
            module.register_forward_hook(self.get_attention)
            for name, module in self.model.named_modules()
            if attention_layer_name in name
        ]

    def get_attention(self, module, input, output):
        """Forward-hook callback: store a detached CPU copy of ``output``."""
        self.attentions.append(output.detach().cpu())

    def remove_hooks(self):
        """Detach every hook this instance registered; safe to call repeatedly."""
        for handle in self._hook_handles:
            handle.remove()
        self._hook_handles = []

    def __call__(self, input_tensor):
        """Run ``model`` on ``input_tensor`` and return the rollout mask.

        Raises:
            RuntimeError: If no hooked module produced an output during the
                forward pass (for example because no module name matched
                ``attention_layer_name``).
        """
        self.attentions = []
        with torch.no_grad():
            self.model(input_tensor)

        if not self.attentions:
            raise RuntimeError(
                "No attention maps were captured; check that attention_layer_name "
                "matches a module listed by model.named_modules()."
            )

        return rollout(self.attentions, self.discard_ratio, self.head_fusion)

In [6]:
def show_mask_on_image(img, mask):
    """Overlay a JET-coloured rendering of ``mask`` on ``img``.

    Args:
        img: Image array; it is divided by 255, so 0-255 pixel values are
            presumably expected - confirm against the caller.
        mask: Array scaled by 255 and cast to uint8 before colour mapping, so
            values are presumably expected in [0, 1]. It must have the same
            height and width as ``img`` for the addition to work.

    Returns:
        uint8 array: colour map plus image, rescaled so the brightest value
        becomes 255.
    """
    base = np.float32(img) / 255
    colour_map = cv2.applyColorMap(np.uint8(255 * mask), cv2.COLORMAP_JET)
    # Bring the colour map to the same 0..1 float scale as the image, then add.
    blended = np.float32(colour_map) / 255 + base
    blended = blended / np.max(blended)
    return np.uint8(255 * blended)

In [None]:
# NOTE(review): this value looks like a directory - point it at the image file to explain.
img_path = "./attention_map/"
image = Image.open(img_path).convert("RGB")

# Image preprocessing: the feature extractor returns a mapping; its
# "pixel_values" entry is the batched tensor the model consumes.
inputs = feature_extractor(images=image, return_tensors="pt")
# Move only the pixel tensor to the model's device (the mapping returned by
# the extractor has no .cuda() method).
input_tensor = inputs["pixel_values"].to(device)

print("Doing Attention Rollout")
# NOTE(review): the default layer name 'attn_drop' does not occur in Hugging
# Face ViT module names. "attention.attention.dropout" is assumed to select the
# attention-probability dropout of each encoder layer - confirm with
# model.named_modules() for the installed transformers version.
attention_rollout = VITAttentionRollout(
    model,
    attention_layer_name="attention.attention.dropout",
    head_fusion="max",
    discard_ratio=0.9,
)
mask = attention_rollout(input_tensor)
name = "attention_map_{:.3f}_{}.png".format(attention_rollout.discard_ratio, attention_rollout.head_fusion)

NameError: name 'CustomDataset' is not defined

In [None]:
# The PIL image is RGB while OpenCV works in BGR, so reverse the channel axis;
# ascontiguousarray turns the reversed view into a plain array for OpenCV.
np_img = np.ascontiguousarray(np.array(image)[:, :, ::-1])

# cv2.resize takes the target size as (width, height).
# New names keep `mask` untouched so this cell can be re-run safely.
mask_resized = cv2.resize(mask, (np_img.shape[1], np_img.shape[0]))
overlay = show_mask_on_image(np_img, mask_resized)

cv2.imshow("Input Image", np_img)
cv2.imshow(name, overlay)
cv2.imwrite("input.png", np_img)
cv2.imwrite(name, overlay)

# Blocks until a key is pressed in one of the windows, then closes them.
cv2.waitKey(-1)
cv2.destroyAllWindows()