<a href="https://colab.research.google.com/github/mobarakol/tutorial_notebooks/blob/main/Transformers_All_VQA.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [6]:
!pip install -q transformers

In [1]:
import torch
from torch import nn
from transformers import VisualBertModel, VisualBertConfig, BertTokenizerFast
from PIL import Image
import requests
from torchvision.models import resnet34, resnet101
from torchvision import transforms

img_url = 'https://www.animalfunfacts.net/images/stories/pets/dogs/pembroke_welsh_corgi_l.jpg'
img_raw = Image.open(requests.get(img_url, stream=True).raw)
mean, std = torch.tensor([0.485, 0.456, 0.406]), torch.tensor([0.229, 0.224, 0.225])
transform = transforms.Compose([transforms.Resize((224, 224)), 
                                transforms.ToTensor(),
                                transforms.Normalize(mean=mean, std=std)])
img = transform(img_raw)[None]

test_question = ["Where is the dog?"]
bert_tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
inputs = bert_tokenizer(test_question, return_tensors="pt", padding="max_length",max_length=20,)


VisualBERT (ResNet101)

In [3]:
class VisualBERT_VQA(nn.Module):
    def __init__(self, num_labels=2):
        super(VisualBERT_VQA, self).__init__()
        self.visualbert = VisualBertModel.from_pretrained("uclanlp/visualbert-vqa-coco-pre")
        self.cls = nn.Linear(768, num_labels)

    def forward(self, inputs):
        last_hidden_state = self.visualbert(**inputs).last_hidden_state #[1, 56, 768]

        # Get the index of the last text token
        index_to_gather = inputs['attention_mask'].sum(1) - 2  # as in original code 5
        index_to_gather = (
            index_to_gather.unsqueeze(-1).unsqueeze(-1).expand(index_to_gather.size(0), 1, last_hidden_state.size(-1))
        ) # [b c hw]=[1, 1, 768]

        pooled_output = torch.gather(last_hidden_state, 1, index_to_gather) # [1, 1, 768]
        logits = self.cls(pooled_output).squeeze(1)
        return logits

model_visual_feat = resnet101(pretrained=True)
model_visual_feat.avgpool = nn.Identity()
model_visual_feat.fc = nn.Identity()
model_visual_feat.eval()
visual_embeds = model_visual_feat(img).view(-1, 49, 2048)
visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long)
visual_attention_mask = torch.ones(visual_embeds.shape[:-1], dtype=torch.float)

inputs.update(
    {
        "visual_embeds": visual_embeds,
        "visual_token_type_ids": visual_token_type_ids,
        "visual_attention_mask": visual_attention_mask,
    }
)

print('visual_embeds', visual_embeds.shape, 'Text:', inputs['input_ids'].shape)
model = VisualBERT_VQA()
model.eval()
logits = model(inputs)
pred_vqa = logits.argmax(-1)
print('Logits:',logits, 'Prediction:', pred_vqa)  

visual_embeds torch.Size([1, 49, 2048]) Text: torch.Size([1, 20])


Some weights of the model checkpoint at uclanlp/visualbert-vqa-coco-pre were not used when initializing VisualBertModel: ['cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.bias', 'cls.seq_relationship.bias', 'cls.predictions.decoder.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing VisualBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing VisualBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


Logits: tensor([[-0.4455,  0.3589]], grad_fn=<SqueezeBackward1>) Prediction: tensor([1])


VisualBERT (ResNet34)

In [4]:
class VisualBERT_VQA(nn.Module):
    def __init__(self, num_labels=2):
        super(VisualBERT_VQA, self).__init__()
        self.config = VisualBertConfig.from_pretrained("uclanlp/visualbert-vqa-coco-pre")
        self.config.visual_embedding_dim = 512
        self.visualbert = VisualBertModel(config=self.config)#.from_pretrained("uclanlp/visualbert-vqa-coco-pre")
        #self.embeddings = self.visual_bert.embeddings
        self.cls = nn.Linear(768, num_labels)

    def forward(self, inputs):
        last_hidden_state = self.visualbert(**inputs).last_hidden_state #[1, 56, 768]

        # Get the index of the last text token
        index_to_gather = inputs['attention_mask'].sum(1) - 2  # as in original code 5
        index_to_gather = (
            index_to_gather.unsqueeze(-1).unsqueeze(-1).expand(index_to_gather.size(0), 1, last_hidden_state.size(-1))
        ) # [b c hw]=[1, 1, 768]
        pooled_output = torch.gather(last_hidden_state, 1, index_to_gather) # [1, 1, 768]
        logits = self.cls(pooled_output).squeeze(1)
        return logits

model_visual_feat = resnet34(pretrained=True)
model_visual_feat.avgpool = nn.Identity()
model_visual_feat.fc = nn.Identity()
model_visual_feat.eval()
visual_embeds = model_visual_feat(img).view(-1, 49, 512)
visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long)
visual_attention_mask = torch.ones(visual_embeds.shape[:-1], dtype=torch.float)
inputs.update(
    {
        "visual_embeds": visual_embeds,
        "visual_token_type_ids": visual_token_type_ids,
        "visual_attention_mask": visual_attention_mask,
    }
)

print('visual_embeds', visual_embeds.shape, 'Text:', inputs['input_ids'].shape)

model = VisualBERT_VQA()
model.eval()
logits = model(inputs)
pred_vqa = logits.argmax(-1)
print('Logits:',logits, 'Prediction:', pred_vqa)        




visual_embeds torch.Size([1, 49, 512]) Text: torch.Size([1, 20])
self.visualbert.config.visual_embedding_dim: 512
tensor([5]) 1 768
torch.Size([1, 1, 768])
Logits: tensor([[ 0.5161, -0.5943]], grad_fn=<SqueezeBackward1>) Prediction: tensor([0])


# ViT_VQA(ResNet34)

(BertTokenizerFast = AutoTokenizer)

In [5]:
!pip -q install timm

[?25l[K     |▋                               | 10 kB 40.4 MB/s eta 0:00:01[K     |█▎                              | 20 kB 51.2 MB/s eta 0:00:01[K     |██                              | 30 kB 58.7 MB/s eta 0:00:01[K     |██▋                             | 40 kB 34.3 MB/s eta 0:00:01[K     |███▏                            | 51 kB 39.0 MB/s eta 0:00:01[K     |███▉                            | 61 kB 44.2 MB/s eta 0:00:01[K     |████▌                           | 71 kB 33.7 MB/s eta 0:00:01[K     |█████▏                          | 81 kB 35.0 MB/s eta 0:00:01[K     |█████▉                          | 92 kB 37.8 MB/s eta 0:00:01[K     |██████▍                         | 102 kB 37.2 MB/s eta 0:00:01[K     |███████                         | 112 kB 37.2 MB/s eta 0:00:01[K     |███████▊                        | 122 kB 37.2 MB/s eta 0:00:01[K     |████████▍                       | 133 kB 37.2 MB/s eta 0:00:01[K     |█████████                       | 143 kB 37.2 MB/s eta 0:

In [18]:
class ViT_VQA(nn.Module):
    def __init__(self, num_labels=2):
        super(ViT_VQA, self).__init__()
        self.config = VisualBertConfig.from_pretrained("uclanlp/visualbert-vqa-coco-pre")
        self.config.visual_embedding_dim = 512
        self.visualbert = VisualBertModel(config=self.config)
        self.embeddings = self.visualbert.embeddings

        self.vit = create_model("vit_base_patch16_224", pretrained=True)
        self.cls = nn.Linear(768, num_labels)

    def forward(self, inputs):
        embedding_output = self.embeddings(
            input_ids=inputs['input_ids'],
            token_type_ids=inputs['token_type_ids'],
            position_ids=None,
            inputs_embeds=None,
            visual_embeds=inputs['visual_embeds'],
            visual_token_type_ids=inputs['visual_token_type_ids'],
            image_text_alignment=None,
        ) #[1, 56, 768]
        x = self.vit.blocks(embedding_output)
        x = self.vit.norm(x)
        x = x.mean(dim=1)
        logits = self.cls(x)
        return logits

model_visual_feat = resnet34(pretrained=True)
model_visual_feat.avgpool = nn.Identity()
model_visual_feat.fc = nn.Identity()
model_visual_feat.eval()
visual_embeds = model_visual_feat(img).view(-1, 49, 512)
visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long)
visual_attention_mask = torch.ones(visual_embeds.shape[:-1], dtype=torch.float)
inputs.update(
    {
        "visual_embeds": visual_embeds,
        "visual_token_type_ids": visual_token_type_ids,
        "visual_attention_mask": visual_attention_mask,
    }
)

print('visual_embeds', visual_embeds.shape, 'Text:', inputs['input_ids'].shape)

model = ViT_VQA(num_labels=2)
model.eval()
logits = model(inputs)
pred_vqa = logits.argmax(-1)
print('Logits:',logits, 'Prediction:', pred_vqa) 

visual_embeds torch.Size([1, 49, 512]) Text: torch.Size([1, 20])
Logits: tensor([[0.7204, 0.4291]], grad_fn=<AddmmBackward0>) Prediction: tensor([0])


# Swin-Transformer_VQA(ResNet34)

In [45]:
class SwinTranformer_VQA(nn.Module):
    def __init__(self, num_labels=2):
        super(SwinTranformer_VQA, self).__init__()
        self.config = VisualBertConfig.from_pretrained("uclanlp/visualbert-vqa-coco-pre")
        self.config.visual_embedding_dim = 512
        self.visualbert = VisualBertModel(config=self.config)
        self.embeddings = self.visualbert.embeddings

        self.swintran = create_model("swin_base_patch4_window7_224", pretrained=True)
        self.cls = nn.Linear(768, num_labels)

    def forward(self, inputs):
        embedding_output = self.embeddings(
            input_ids=inputs['input_ids'],
            token_type_ids=inputs['token_type_ids'],
            position_ids=None,
            inputs_embeds=None,
            visual_embeds=inputs['visual_embeds'],
            visual_token_type_ids=inputs['visual_token_type_ids'],
            image_text_alignment=None,
        ) #[1, 56, 768]
        #x = self.swintran.patch_embed(x)
        x = self.swintran.layers(embedding_output)
        x = self.swintran.norm(x)
        x = x.mean(dim=1)
        logits = self.cls(x)
        return logits

model_visual_feat = resnet34(pretrained=True)
model_visual_feat.avgpool = nn.Identity()
model_visual_feat.fc = nn.Identity()
model_visual_feat.eval()
visual_embeds = model_visual_feat(img).view(-1, 49, 512)
visual_token_type_ids = torch.ones(visual_embeds.shape[:-1], dtype=torch.long)
visual_attention_mask = torch.ones(visual_embeds.shape[:-1], dtype=torch.float)
inputs.update(
    {
        "visual_embeds": visual_embeds,
        "visual_token_type_ids": visual_token_type_ids,
        "visual_attention_mask": visual_attention_mask,
    }
)

print('visual_embeds', visual_embeds.shape, 'Text:', inputs['input_ids'].shape)

model = SwinTranformer_VQA(num_labels=2)
model.eval()
logits = model(inputs)
pred_vqa = logits.argmax(-1)
print('Logits:',logits, 'Prediction:', pred_vqa) 

visual_embeds torch.Size([1, 49, 512]) Text: torch.Size([1, 20])


AssertionError: ignored

# Transformer Tokenizer (Text)

In [34]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
model_name = "nlptown/bert-base-multilingual-uncased-sentiment" #"distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
test_question = ["We are very happy to show you the Transformers library"]
inputs_a = tokenizer(
    test_question,  truncation=True, return_tensors="pt", padding="max_length", max_length=20,
)

bert_tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
inputs = bert_tokenizer(test_question, truncation=True, return_tensors="pt", padding="max_length", max_length=20,)
inputs_a['input_ids'], inputs['input_ids']

(tensor([[  101,  2057,  2024,  2200,  3407,  2000,  2265,  2017,  1996, 19081,
           3075,   102,     0,     0,     0,     0,     0,     0,     0,     0]]),
 tensor([[  101,  2057,  2024,  2200,  3407,  2000,  2265,  2017,  1996, 19081,
           3075,   102,     0,     0,     0,     0,     0,     0,     0,     0]]))

In [35]:
import torch
import torch.nn.functional as F
import torchvision
import torchvision.transforms as T
from timm import create_model


class SwinTranformer_Features(nn.Module):
    def __init__(self):
        super(SwinTranformer_Features, self).__init__()
        self.swintran = create_model("swin_base_patch4_window7_224", pretrained=True)

    def forward(self, x):
        x = self.swintran.patch_embed(x) # [1, 3136, 128]
        print(x.shape)
        x = self.swintran.layers(x)#[1, 49, 1024]
        print(x.shape)
        x = self.swintran.norm(x)#[1, 49, 1024]
        print(x.shape)
        x = x.mean(dim=1)#[1, 1024]
        print(x.shape)
        logits = self.swintran.head(x)#
        return logits

model = SwinTranformer_Features()
model.eval()
logits = model(img)
pred = logits.argmax(dim=1).item()
print('prediction:', int(torch.argmax(logits)))

  return _VF.meshgrid(tensors, **kwargs)  # type: ignore[attr-defined]
Downloading: "https://github.com/SwinTransformer/storage/releases/download/v1.0.0/swin_base_patch4_window7_224_22kto1k.pth" to /root/.cache/torch/hub/checkpoints/swin_base_patch4_window7_224_22kto1k.pth


torch.Size([1, 3136, 128])
torch.Size([1, 49, 1024])
torch.Size([1, 49, 1024])
torch.Size([1, 1024])
prediction: 263


In [43]:
from transformers import AutoFeatureExtractor, SwinForImageClassification
feature_extractor = AutoFeatureExtractor.from_pretrained("microsoft/swin-tiny-patch4-window7-224")
model = SwinForImageClassification.from_pretrained("microsoft/swin-tiny-patch4-window7-224")
model.eval()
img2 = feature_extractor(img[0], return_tensors="pt")
print(img2['pixel_values'].shape)
logits = model(**img2).logits
pred = logits.argmax(dim=1).item()
print('prediction:', int(torch.argmax(logits)))

torch.Size([1, 3, 224, 224])
prediction: 264


In [12]:
import torch
import torch.nn.functional as F
import torchvision
import torchvision.transforms as T
from timm import create_model


class ViT_Features(nn.Module):
    def __init__(self):
        super(ViT_Features, self).__init__()
        model_name = "vit_base_patch16_224"
        device = 'cuda' if torch.cuda.is_available() else 'cpu'
        self.vit = create_model(model_name, pretrained=True).to(device)

    def forward(self, x):
        patches = self.vit.patch_embed(x) # [1, 196, 768]
        pos_embed = self.vit.pos_embed # [1, 197, 768]
        x = torch.cat((self.vit.cls_token, patches), dim=1) + pos_embed #[1, 197, 768]
        x = self.vit.blocks(x)
        # for i, blk in enumerate(self.vit.blocks):
        #     x = blk(x)
        x = self.vit.norm(x)
        x = x.mean(dim=1)
        logits = self.vit.head(x)
        return logits

model = ViT_Features()
model.eval()
logits = model(img)
pred = logits.argmax(dim=1).item()
print('prediction:', int(torch.argmax(logits)))

patches torch.Size([1, 196, 768])
pos_embed torch.Size([1, 197, 768])
prediction: 263
