LOAD CLIP's Visual Encoder

In [1]:
import torch
from transformers import CLIPVisionModel, CLIPImageProcessor
from PIL import Image

model_name = 'openai/clip-vit-base-patch32'

# Image Processor (for resizing, normalization)
processor = CLIPImageProcessor.from_pretrained(model_name)

# Visual Encoder (Patch-level outputs)
clip_vision_encoder = CLIPVisionModel.from_pretrained(model_name)

Extract Image Embeddings

In [7]:
image = Image.open('my_image.jpg')
inputs = processor(images=image, return_tensors="pt")

with torch.no_grad():
    outputs = clip_vision_encoder(**inputs)

# Extract patch-level embeddings (not pooled!)
image_embeds = outputs.last_hidden_state  # shape: [1, num_patches+1, hidden_dim]
print(image_embeds.shape)
print(image_embeds)


torch.Size([1, 50, 768])
tensor([[[-0.6340,  0.3656, -0.2445,  ..., -0.0313,  0.3453,  0.0162],
         [-0.1868,  0.2712, -0.6086,  ...,  0.0164,  0.0588,  0.1814],
         [-0.0984,  0.2395, -0.6235,  ..., -0.0471,  0.3991,  0.2777],
         ...,
         [-0.3309, -0.3027, -0.2398,  ...,  0.4375,  0.6725, -0.0038],
         [-0.2847,  0.0317, -0.2260,  ...,  0.2764,  0.4962, -0.0597],
         [ 0.4559,  0.5215, -1.1170,  ...,  0.2762,  0.4585, -0.2910]]])


Define Transformer Decoder

In [3]:
from transformers import GPT2Tokenizer, GPT2LMHeadModel, GPT2Config

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token

# Modify GPT-2 to accept Cross-Attention
config = GPT2Config.from_pretrained('gpt2')
config.add_cross_attention = True

decoder = GPT2LMHeadModel(config)

print(decoder)


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D(nf=2304, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (crossattention): GPT2Attention(
          (c_attn): Conv1D(nf=1536, nx=768)
          (q_attn): Conv1D(nf=768, nx=768)
          (c_proj): Conv1D(nf=768, nx=768)
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_cross_attn): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): C

In [None]:
from transformers import GenerationConfig


generation_config = GenerationConfig(
    max_length=20,
    num_beams=5,
    early_stopping=True,
    pad_token_id=tokenizer.eos_token_id
)

input_ids = tokenizer("A photo of", return_tensors="pt").input_ids

# Generate caption conditioned on image embeddings
outputs = decoder(input_ids=input_ids, encoder_hidden_states=image_embeds)

# Get logits to predict next token
# logits = outputs.logits
# predicted_next_token = torch.argmax(logits[:, -1, :], dim=-1)
# print(predicted_next_token.)

with torch.no_grad():
    generated_ids = decoder.generate(
        input_ids=input_ids,
        encoder_hidden_states=image_embeds,
        generation_config=generation_config
        
    )

caption = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
print("Generated caption:", caption)



`generation_config` default values have been modified to match model-specific defaults: {'bos_token_id': 50256, 'eos_token_id': 50256}. If this is not desired, please set these values explicitly.
The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Generated caption: A photo of Hawai Hawai reperc reperc reperc reperc protecting protecting protecting reperc repercituituitu accommodate accommodate accommodate Sega Sega Sega


Training the Decoder

In [None]:
optimizer = torch.optim.AdamW(decoder.parameters(), lr=1e-4)



for images, captions in dataloader:
    optimizer.zero_grad()
    
    inputs = processor(images=images, return_tensors="pt")
    image_embeds = clip_vision_encoder(**inputs).last_hidden_state

    caption_inputs = tokenizer(captions, padding=True, return_tensors="pt")

    outputs = decoder(
        input_ids=caption_inputs.input_ids,
        attention_mask=caption_inputs.attention_mask,
        encoder_hidden_states=image_embeds,
        labels=caption_inputs.input_ids
    )

    loss = outputs.loss
    loss.backward()
    optimizer.step()


Caption Generation (Inference)

In [None]:
from transformers import GenerationConfig

generation_config = GenerationConfig(
    max_length=20,
    num_beams=5,
    early_stopping=True
)

generated_ids = decoder.generate(
    input_ids=tokenizer("A photo of", return_tensors="pt").input_ids,
    encoder_hidden_states=image_embeds,
    generation_config=generation_config
)

caption = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
print(caption)


In [12]:
import torch
from transformers import CLIPVisionModel, CLIPImageProcessor, GPT2Tokenizer, GPT2LMHeadModel, GPT2Config, GenerationConfig
from PIL import Image

# 1. Load models and processor
model_name = 'openai/clip-vit-base-patch32'
processor = CLIPImageProcessor.from_pretrained(model_name)
clip_vision_encoder = CLIPVisionModel.from_pretrained(model_name)

tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
tokenizer.pad_token = tokenizer.eos_token
config = GPT2Config.from_pretrained('gpt2')
config.add_cross_attention = True
decoder = GPT2LMHeadModel(config)

# 2. Load and preprocess the image
image = Image.open("my_image.jpg")
inputs = processor(images=image, return_tensors="pt")

# 3. Get image embeddings
with torch.no_grad():
    outputs = clip_vision_encoder(**inputs)
image_embeds = outputs.last_hidden_state  # shape: [1, num_patches+1, hidden_dim]

# 4. Generate caption
generation_config = GenerationConfig(
    max_length=20,
    num_beams=5,
    early_stopping=True,
    pad_token_id=tokenizer.eos_token_id
)
prompt = "A photo of"
input_ids = tokenizer(prompt, return_tensors="pt").input_ids

with torch.no_grad():
    generated_ids = decoder.generate(
        input_ids=input_ids,
        encoder_hidden_states=image_embeds,
        generation_config=generation_config
    )

caption = tokenizer.decode(generated_ids[0], skip_special_tokens=True)
print("Generated caption:", caption)

Generated caption: A photo of restoration restoration restoration restoration deposit deposit deposit KNOW KNOW KNOW KNOW KNOW KNOW KNOW Counsel Counsel Counsel Counsel Counsel Counsel


In [13]:
from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer

# Load pre-trained image captioning model
model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
feature_extractor = ViTImageProcessor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")

# Generate caption
image = Image.open("my_image.jpg")
pixel_values = feature_extractor(images=image, return_tensors="pt").pixel_values

output_ids = model.generate(pixel_values, max_length=50, num_beams=4)
caption = tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0]
print(caption)

config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/982M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/982M [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/228 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/241 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/120 [00:00<?, ?B/s]

We strongly recommend passing in an `attention_mask` since your input_ids may be padded. See https://huggingface.co/docs/transformers/troubleshooting#incorrect-output-when-padding-tokens-arent-masked.


NotImplementedError: Make sure that a `_reorder_cache` function is correctly implemented in transformers.models.gpt2.modeling_gpt2 to enable beam search for <class 'transformers.models.gpt2.modeling_gpt2.GPT2LMHeadModel'>