# Multimodal Language Model:

Use the VisionTextDualEncoder as the encoder and for the decoder use GPT2 and ImageGPT.

# References:
# https://huggingface.co/docs/transformers/model_doc/vision-text-dual-encoder

In [20]:
from PIL import Image
import requests
from transformers import (
    EncoderDecoderModel,
    VisionTextDualEncoderModel,
    VisionTextDualEncoderProcessor,
    AutoImageProcessor,
    AutoTokenizer,
)

In [17]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
image_processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224")
processor = VisionTextDualEncoderProcessor(image_processor, tokenizer)
encoder_model = VisionTextDualEncoderModel.from_vision_text_pretrained(
    "google/vit-base-patch16-224", "bert-base-uncased"
)

Some weights of ViTModel were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized: ['vit.pooler.dense.weight', 'vit.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The projection layer and logit scale weights `['visual_projection.weight', 'text_projection.weight', 'logit_scale']` are newly initialized. You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [18]:
# contrastive training
urls = [
    "http://images.cocodataset.org/val2017/000000039769.jpg",
    "https://farm3.staticflickr.com/2674/5850229113_4fe05d5265_z.jpg",
]
images = [Image.open(requests.get(url, stream=True).raw) for url in urls]
inputs = processor(
    text=["a photo of a cat", "a photo of a dog"], images=images, return_tensors="pt", padding=True
)
outputs = encoder_model(
    input_ids=inputs.input_ids,
    attention_mask=inputs.attention_mask,
    pixel_values=inputs.pixel_values,
    return_loss=True,
)
loss, logits_per_image = outputs.loss, outputs.logits_per_image  # this is the image-text similarity score

In [12]:
# save and load from pretrained
encoder_model.save_pretrained("vit-bert")
encoder_model = VisionTextDualEncoderModel.from_pretrained("./vit-bert")

In [13]:
# inference
outputs = encoder_model(**inputs)
logits_per_image = outputs.logits_per_image  # this is the image-text similarity score
probs = logits_per_image.softmax(dim=1)  # we can take the softmax to get the label probabilities

In [22]:
model = EncoderDecoderModel.from_encoder_decoder_pretrained("./vit-bert", "gpt2")

Some weights of GPT2LMHeadModel were not initialized from the model checkpoint at gpt2 and are newly initialized: ['h.4.crossattention.q_attn.weight', 'h.9.crossattention.c_proj.weight', 'h.10.crossattention.q_attn.weight', 'h.9.crossattention.c_attn.bias', 'h.4.crossattention.c_attn.bias', 'h.9.crossattention.q_attn.weight', 'h.11.crossattention.c_attn.weight', 'h.7.ln_cross_attn.weight', 'h.10.crossattention.c_proj.weight', 'h.6.crossattention.q_attn.bias', 'h.11.crossattention.c_attn.bias', 'h.11.crossattention.c_proj.weight', 'h.9.ln_cross_attn.bias', 'h.2.crossattention.c_proj.weight', 'h.5.crossattention.c_attn.bias', 'h.3.ln_cross_attn.weight', 'h.0.crossattention.c_attn.weight', 'h.10.crossattention.c_proj.bias', 'h.2.crossattention.c_attn.bias', 'h.4.crossattention.c_proj.weight', 'h.2.ln_cross_attn.weight', 'h.3.crossattention.c_attn.weight', 'h.6.crossattention.c_proj.bias', 'h.0.crossattention.q_attn.bias', 'h.11.ln_cross_attn.bias', 'h.9.crossattention.c_attn.weight', 'h.7

AttributeError: 'VisionTextDualEncoderConfig' object has no attribute 'hidden_size'