# Multi-Modality Inference
- BLIP
- BLIP-2
- OFA
- Flamingo (SKIP: Hugging-Face API)
- Mini-GPT4 (SKIP: Hugging-Face API)
- LLaVA (SKIP: Need to prepare multiple weights)
- Otter (SKIP: Hugging-Face API)

In [1]:
from otx.v2.adapters.torch.mmengine.mmpretrain.engine import MMPTEngine

# Engine for demo
engine  = MMPTEngine(
    work_dir="/tmp/test-multi-modal-infer",
)

# Sample for demo (single image)
sample = "../../../../tests/assets/car_tree_bug/images/train/Slide4.PNG"

  from .autonotebook import tqdm as notebook_tqdm


## BLIP
- https://github.com/open-mmlab/mmpretrain/tree/main/configs/blip

In [2]:
# BLIP
from otx.v2.adapters.torch.mmengine.mmpretrain.model import get_model
blip_model = get_model("blip-base_3rdparty_caption")
print(blip_model)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'BertTokenizer'. 
The class this function is called from is 'BlipTokenizer'.


BlipCaption(
  (data_preprocessor): MultiModalDataPreprocessor()
  (visual_encoder): VisionTransformer(
    (patch_embed): PatchEmbed(
      (adaptive_padding): AdaptivePadding()
      (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
    )
    (drop_after_pos): Dropout(p=0.0, inplace=False)
    (layers): ModuleList(
      (0): TransformerEncoderLayer(
        (ln1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
        (attn): MultiheadAttention(
          (qkv): Linear(in_features=768, out_features=2304, bias=True)
          (proj): Linear(in_features=768, out_features=768, bias=True)
          (proj_drop): Dropout(p=0.0, inplace=False)
          (out_drop): DropPath()
          (gamma1): Identity()
        )
        (ln2): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
        (ffn): FFN(
          (layers): Sequential(
            (0): Sequential(
              (0): Linear(in_features=768, out_features=3072, bias=True)
              (1): GELU(approx

In [3]:
pred_result = engine.predict(
    model=blip_model,
    img=sample
)
print(pred_result)

[{'pred_caption': 'oneself punched gardiner oneselfこ duration applications hunched licenses duration duration wat disappearance duration +こ'}]


## BLIP-2
- https://github.com/open-mmlab/mmpretrain/tree/main/configs/blip2

In [4]:
# BLIP-2
from otx.v2.adapters.torch.mmengine.mmpretrain.model import get_model
blip_2_model = get_model("blip2-opt2.7b_3rdparty-zeroshot_caption")
print(blip_2_model)

Blip2Caption(
  (data_preprocessor): MultiModalDataPreprocessor()
  (vision_backbone): BEiTViT(
    (patch_embed): PatchEmbed(
      (adaptive_padding): AdaptivePadding()
      (projection): Conv2d(3, 1408, kernel_size=(14, 14), stride=(14, 14))
    )
    (drop_after_pos): Dropout(p=0, inplace=False)
    (layers): ModuleList(
      (0): BEiTTransformerEncoderLayer(
        (ln1): LayerNorm((1408,), eps=1e-06, elementwise_affine=True)
        (attn): BEiTAttention(
          (qkv): Linear(in_features=1408, out_features=4224, bias=False)
          (attn_drop): Dropout(p=0.0, inplace=False)
          (proj): Linear(in_features=1408, out_features=1408, bias=True)
          (proj_drop): Dropout(p=0.0, inplace=False)
        )
        (ln2): LayerNorm((1408,), eps=1e-06, elementwise_affine=True)
        (ffn): FFN(
          (layers): Sequential(
            (0): Sequential(
              (0): Linear(in_features=1408, out_features=6144, bias=True)
              (1): GELU(approximate='none')


In [5]:
pred_result = engine.predict(
    model=blip_2_model,
    img=sample
)
print(pred_result)

[{'pred_caption': 'of'}]


## OFA
- https://github.com/open-mmlab/mmpretrain/tree/main/configs/ofa

In [6]:
# OFA
from otx.v2.adapters.torch.mmengine.mmpretrain.model import get_model
ofa_model = get_model("ofa-base_3rdparty-finetuned_caption")
print(ofa_model)

OFA(
  (data_preprocessor): MultiModalDataPreprocessor()
  (model): OFAEncoderDecoder(
    (encoder): OFAEncoder(
      (embed_tokens): Embedding(59457, 768, padding_idx=1)
      (embed_images): OFAResNet(
        (conv1): Conv2d(3, 64, kernel_size=(7, 7), stride=(2, 2), padding=(3, 3), bias=False)
        (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (relu): ReLU(inplace=True)
        (maxpool): MaxPool2d(kernel_size=3, stride=2, padding=1, dilation=1, ceil_mode=False)
        (layer1): ResLayer(
          (0): Bottleneck(
            (conv1): Conv2d(64, 64, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (bn1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
            (bn2): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
            (conv3): Conv2d(64, 256, kern

In [7]:
pred_result = engine.predict(
    model=ofa_model,
    img=sample
)
print(pred_result)

[{'pred_caption': 'Downloads Downloads Downloads vul vul vul Downloads Downloads KKK KKK KKK wonderfully wonderfully wonderfully Indonesian Indonesian Indonesian'}]


## Flamingo
- https://github.com/open-mmlab/mmpretrain/tree/main/configs/flamingo
- SKIP: Need to Hugging-Face model settings

## Mini-GPT4
- https://github.com/open-mmlab/mmpretrain/tree/main/configs/minigpt4
- SKIP: Need to Hugging-Face token settings

## LLaVA
- https://github.com/open-mmlab/mmpretrain/tree/main/configs/llava
- SKIP: Need to download model weights separately

## Otter
- https://github.com/open-mmlab/mmpretrain/tree/main/configs/otter
- SKIP: Need to Hugging-Face token settings