In [1]:
!pip install transformers
!pip install open_clip_torch



In [10]:
# from transformers import pipeline
from transformers import AutoProcessor
from transformers import BlipForConditionalGeneration
from transformers import CLIPProcessor, CLIPModel
from open_clip import create_model_from_pretrained, get_tokenizer
from PIL import Image
import torch.nn.functional as F
import torch
import os

In [3]:
# Run on the GPU when PyTorch can see one, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Checkpoint identifiers for the captioning model and the two scoring models.
blip = "Salesforce/blip-image-captioning-base"
clip = "openai/clip-vit-base-patch32"
clips = "hf-hub:UCSC-VLAA/ViT-L-14-CLIPS-224-Recap-DataComp-1B"

In [4]:
# Collect the sample image paths in a stable (sorted) order.
# Hidden entries (e.g. ".DS_Store", ".ipynb_checkpoints") and sub-directories are
# skipped: every path in this list is later handed straight to Image.open, which
# would raise on anything that is not an image file.
img_list = sorted(
    os.path.join("samples", name)
    for name in os.listdir("samples")
    if not name.startswith(".") and os.path.isfile(os.path.join("samples", name))
)

In [5]:
# Caption every sample image with BLIP and start one result record per image.
results = []
blip_proc = AutoProcessor.from_pretrained(blip)
blip_model = BlipForConditionalGeneration.from_pretrained(blip).to(device)

for image_path in img_list:
    pil_image = Image.open(image_path)
    blip_inputs = blip_proc(images=pil_image, return_tensors="pt").to(device)

    # Inference only, so no autograd graph is needed.
    with torch.no_grad():
        generated_ids = blip_model.generate(**blip_inputs)

    decoded = blip_proc.batch_decode(generated_ids, skip_special_tokens=True)
    record = {
        "image": pil_image,
        "blip caption": decoded[0].strip(),
        # Placeholders; the scoring cells fill these in.
        "clip score": None,
        "clips score": None,
    }
    results.append(record)

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [6]:
# Score each BLIP caption against its own image with the CLIP checkpoint.
clip_proc = CLIPProcessor.from_pretrained(clip)
clip_model = CLIPModel.from_pretrained(clip).to(device)

for record in results:
    clip_inputs = clip_proc(
        text=[record["blip caption"]],
        images=record["image"],
        return_tensors="pt",
        padding=True,
    ).to(device)

    with torch.no_grad():
        clip_outputs = clip_model(**clip_inputs)

    # One image paired with one caption yields a single logit, so .item() applies.
    record["clip score"] = clip_outputs.logits_per_image.item()


In [11]:
# Score each BLIP caption against its own image with the CLIPS checkpoint (open_clip).
clips_model, clips_preprocess = create_model_from_pretrained(clips)
# The image and text tensors below are moved to `device`, so the model must live
# there too; otherwise a CUDA run fails with a device-mismatch error.
# eval() switches off training-only behaviour (e.g. dropout) for inference.
clips_model = clips_model.to(device).eval()
clips_tokenizer = get_tokenizer(clips)

# The logit scale does not depend on the sample, so read it once up front.
with torch.no_grad():
    logit_scale = clips_model.logit_scale.exp().item()

for record in results:
    # unsqueeze(0) adds the leading batch dimension before encoding.
    image = clips_preprocess(record["image"]).unsqueeze(0).to(device)
    text = clips_tokenizer([record["blip caption"]]).to(device)

    with torch.no_grad():
        image_features = clips_model.encode_image(image)
        text_features = clips_model.encode_text(text)

        # L2-normalise both embeddings so their dot product is a cosine similarity.
        image_features_norm = F.normalize(image_features, dim=-1)
        text_features_norm = F.normalize(text_features, dim=-1)
        cosine_similarity = (image_features_norm @ text_features_norm.T).item()

    # Stored score is the cosine similarity multiplied by exp(logit_scale).
    record["clips score"] = cosine_similarity * logit_scale

In [12]:
# Display every record: the PIL image object, its BLIP caption, and the
# "clip score" / "clips score" values written by the two scoring cells.
results

[{'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=400x302>,
  'blip caption': 'a small dog walking on a green carpet',
  'clip score': 31.566017150878906,
  'clips score': 24.999535889562594},
 {'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=400x366>,
  'blip caption': 'a small dog running across a green field',
  'clip score': 32.70260238647461,
  'clips score': 26.338922534023368},
 {'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=400x300>,
  'blip caption': 'a family sitting in a pool with a towel',
  'clip score': 31.336462020874023,
  'clips score': 20.4160130104367},
 {'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=400x320>,
  'blip caption': 'a small bird sitting on a plant',
  'clip score': 28.938268661499023,
  'clips score': 23.344033408776568},
 {'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=400x300>,
  'blip caption': 'a small dog standing on a stone ledge',
  'clip score': 31.03472900390625