In [57]:
from torchmetrics.multimodal.clip_score import CLIPScore
from PIL import Image
import torch
import os
from datasets import load_dataset
from datasets.utils.file_utils import get_datasets_user_agent
import requests
import numpy as np
from natsort import natsorted
from transformers import Blip2Processor, Blip2Model

# User-agent string obtained from the `datasets` library helper.
# NOTE(review): presumably meant to be sent as the User-Agent header when
# downloading the caption images over HTTP — confirm it is actually used.
USER_AGENT = get_datasets_user_agent()


In [58]:


# Cache of CLIPScore metric objects keyed by checkpoint name, so the CLIP
# weights are loaded once per kernel instead of once per scored image.
_CLIP_METRIC_CACHE = {}


def _get_clip_metric(model_name="openai/clip-vit-base-patch16"):
    """Return the cached CLIPScore metric for `model_name`, creating it on first use."""
    if model_name not in _CLIP_METRIC_CACHE:
        _CLIP_METRIC_CACHE[model_name] = CLIPScore(model_name_or_path=model_name)
    return _CLIP_METRIC_CACHE[model_name]


def calculate_clip_score(images, prompts):
    """Compute the CLIP score between one image and its prompt.

    Args:
        images: A single image as an array-like of floats scaled to [0, 1].
            Accepted layouts are (H, W, 3), (H, W, 4) (alpha is dropped),
            (H, W, 1) and (H, W) (grayscale is replicated to three channels).
        prompts: The caption(s) to compare against the image, passed through
            unchanged to the CLIPScore metric.

    Returns:
        The score as a Python float rounded to 4 decimal places.

    Raises:
        ValueError: If `images` cannot be interpreted as a single
            height x width x channel image.
    """
    pixels = np.asarray(images, dtype=np.float64)

    # Normalise the channel layout to (H, W, 3); the permute below and the
    # CLIP preprocessing both assume a three-channel image.
    if pixels.ndim == 2:
        pixels = np.stack([pixels] * 3, axis=-1)
    elif pixels.ndim == 3 and pixels.shape[-1] == 1:
        pixels = np.repeat(pixels, 3, axis=-1)
    elif pixels.ndim == 3 and pixels.shape[-1] == 4:
        pixels = pixels[..., :3]
    if pixels.ndim != 3 or pixels.shape[-1] != 3:
        raise ValueError(
            f"Expected a single image shaped (H, W), (H, W, 1), (H, W, 3) or (H, W, 4); "
            f"got shape {pixels.shape}"
        )

    # Round (rather than truncate) when going back to 8-bit so that a
    # uint8 -> /255.0 -> *255 round trip reproduces the original pixel values.
    images_int = np.clip(np.rint(pixels * 255), 0, 255).astype("uint8")
    images_int = np.expand_dims(images_int, axis=0)  # Add batch dimension
    batch = torch.from_numpy(images_int).permute(0, 3, 1, 2)  # NHWC -> NCHW

    metric = _get_clip_metric()
    # The metric object is shared between calls: clear any accumulated state so
    # every call scores exactly one image, as a freshly built metric would.
    metric.reset()
    with torch.no_grad():
        score = metric(batch, prompts).detach()
    return round(float(score), 4)


In [72]:
def calculate_blip_2(image, prompt, processor, model, device):
    """Compute the cosine similarity between BLIP-2 image and text embeddings.

    Args:
        image: The image handed to `processor` (a PIL image in this notebook).
        prompt: The caption text to compare against the image.
        processor: Callable that builds the model inputs; it must also expose a
            `tokenizer` attribute used to build `decoder_input_ids`.
        model: The model to run; its output is expected to expose
            `image_embeds` and `text_embeds`.
        device: Device the inputs are moved to before the forward pass.

    Returns:
        The tensor returned by `torch.nn.functional.cosine_similarity`, or
        `None` when anything fails (the error is printed, not raised).

    NOTE(review): the documented forward output of `Blip2Model` does not appear
    to include `image_embeds` / `text_embeds`; an image-text retrieval variant
    of BLIP-2 may be what is needed here — confirm against the transformers
    documentation before relying on the returned score.
    """
    try:
        inputs = processor(images=image, text=prompt, return_tensors="pt").to(device)
        decoder_input_ids = processor.tokenizer(prompt, return_tensors="pt").input_ids.to(device)
        # Inference only: disabling autograd avoids building a graph and keeping
        # activations alive, which matters a lot for a model of this size.
        with torch.no_grad():
            outputs = model(**inputs, decoder_input_ids=decoder_input_ids)
        # Extract image and text embeddings, reporting clearly when the model
        # output does not provide them instead of surfacing a bare AttributeError.
        image_embeds = getattr(outputs, "image_embeds", None)
        text_embeds = getattr(outputs, "text_embeds", None)
        if image_embeds is None or text_embeds is None:
            print(
                "An error occurred: model output has no image_embeds/text_embeds "
                f"(output type: {type(outputs).__name__})"
            )
            return None
        print(f"Image Embeds Shape: {image_embeds.shape}")
        print(f"Text Embeds Shape: {text_embeds.shape}")
        # Calculate cosine similarity
        similarity = torch.nn.functional.cosine_similarity(image_embeds, text_embeds)
        return similarity
    except Exception as e:
        # Best-effort scoring: a failure on one sample must not stop the caller's loop.
        print(f"An error occurred: {e}")
        return None

Dataset & captions


In [8]:
# NOTE(review): num_threads is not referenced by any code visible in this
# notebook — confirm whether it is still needed.
num_threads = 20
# Load the Conceptual Captions dataset with streaming=True.
# (presumably so records are fetched lazily instead of downloading everything — confirm)
dset = load_dataset("google-research-datasets/conceptual_captions",streaming=True)


In [9]:
# One shared iterator over the 'train' split. conceptual_captioning_stream_to_dir
# pulls items from it with next(), so every call continues where the previous
# call stopped; re-run this cell to start again from the beginning of the split.
dset_iter = iter(dset['train'])

In [42]:

def conceptual_captioning_stream_to_dir (save_dir, image_num, timeout=10):
    """Download the next `image_num` dataset items into `save_dir`.

    Items are taken from the module-level `dset_iter`, so repeated calls keep
    advancing through the stream. Item `i` is stored as ``image_{i}.jpg``; its
    caption is recorded only when the download succeeded, so the returned
    captions line up, in order, with the image files written by this call.

    Args:
        save_dir: Directory the images are written to (created if missing).
        image_num: Number of dataset items to try to download.
        timeout: Seconds to wait for each HTTP request before giving up.

    Returns:
        A tuple ``(captions, save_dir)`` where `captions` lists the captions of
        the successfully downloaded images.
    """
    # exist_ok avoids the check-then-create race of os.path.exists + makedirs.
    os.makedirs(save_dir, exist_ok=True)
    captions = []
    for i in range(image_num):
        try:
            item = next(dset_iter)
        except StopIteration:
            print(f"Dataset stream exhausted after {i} items")
            break
        image_url = item['image_url']
        caption = item['caption']
        image_path = os.path.join(save_dir, f"image_{i}.jpg")
        downloaded = False
        try:
            # The request itself lives inside the try block: a dead host or a
            # timeout must skip this one image, not abort the whole download.
            # The context manager releases the streamed connection.
            with requests.get(
                image_url,
                stream=True,
                timeout=timeout,
                headers={"user-agent": USER_AGENT},
            ) as response:
                if response.status_code == 200:
                    with open(image_path, 'wb') as f:
                        for chunk in response.iter_content(1024):
                            f.write(chunk)
                    downloaded = True
                else:
                    print(f"Failed to download image {i} from {image_url}")
        except Exception as e:
            print(f"Error downloading image {i} from {image_url}: {e}")

        if downloaded:
            captions.append(caption)
        elif os.path.exists(image_path):
            # Drop a partially written file (or a stale file left at this index
            # by an earlier run) so images on disk stay aligned with captions.
            os.remove(image_path)
    return captions , save_dir

In [75]:
def evaluation(save_dir, captions , metric= "clip"):
    """Score every image in `save_dir` against its caption and print the result.

    Image files (``.jpg`` / ``.png``) are taken in natural sort order and paired
    positionally with `captions`.

    Args:
        save_dir: Directory containing the downloaded images.
        captions: Captions in the same order as the naturally sorted images.
        metric: ``"clip"`` for CLIP score or ``"blip"`` for BLIP-2 similarity.

    Raises:
        ValueError: If `metric` is neither ``"clip"`` nor ``"blip"``.
    """
    if metric not in ("clip", "blip"):
        raise ValueError(f"Unknown metric {metric!r}; expected 'clip' or 'blip'")

    # Filter to image files BEFORE pairing with captions. Filtering inside the
    # zip loop lets any other directory entry consume a caption, which shifts
    # every later image onto the wrong caption.
    image_files = [
        name
        for name in natsorted(os.listdir(save_dir))  # Ensure the images are processed in order
        if name.endswith((".jpg", ".png"))
    ]
    if len(image_files) != len(captions):
        print(
            f"Warning: found {len(image_files)} images but {len(captions)} captions; "
            "pairs may be misaligned"
        )

    if metric == "clip":
        print(f"Metric used is CLIP Score")
        for img_file, caption in zip(image_files, captions):
            img_path = os.path.join(save_dir, img_file)
            try:
                # Force 3-channel RGB: grayscale, palette and RGBA files would
                # otherwise produce arrays the scorer cannot handle.
                with Image.open(img_path) as img:
                    img_array = np.array(img.convert("RGB")) / 255.0
            except OSError as e:
                # e.g. an HTML error page that was saved with a .jpg extension
                print(f"Skipping unreadable image {img_path}: {e}")
                continue
            score = calculate_clip_score(img_array, caption)
            print(f"Image: {img_path}, Caption: {caption}, CLIP Score: {score}")
    else:
        print(f"Metric used is BLIP Score")
        processor = Blip2Processor.from_pretrained("Salesforce/blip2-flan-t5-xl")
        model = Blip2Model.from_pretrained("Salesforce/blip2-flan-t5-xl")
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print(f"Using {device}")
        model.to(device)
        for img_file, caption in zip(image_files, captions):
            img_path = os.path.join(save_dir, img_file)
            try:
                # convert() loads the pixel data, so the file can be closed
                # before the image is handed to the model.
                with Image.open(img_path) as img:
                    rgb_image = img.convert("RGB")
            except OSError as e:
                print(f"Skipping unreadable image {img_path}: {e}")
                continue
            print(f"Image {img_path}")
            similarity = calculate_blip_2(rgb_image, caption, processor, model, device)
            print(f"Image: {img_path}, Caption: {caption}, BLIP2 Score: {similarity}")



In [51]:
# Placeholder; overwritten by the tuple unpacking two lines below.
captions = []
# Images are stored in a "conceptual_captioning" folder under the current working directory.
path_dir = os.path.join(os.getcwd(), "conceptual_captioning")
# Try to download 20 items; captions holds only the captions whose image was saved.
captions, save_dir = conceptual_captioning_stream_to_dir(path_dir,image_num=20)


Failed to download image 0 from http://oneindiaonepeople.com/wp-content/uploads/2014/02/13.jpg
Failed to download image 9 from https://lynismael.com/wp-content/uploads/2015/05/Holland-Marsh-Wedding-Photographer-Katelyn-Rick_0047(pp_w768_h534).jpg
Failed to download image 13 from http://media.gettyimages.com/photos/entertainer-beyonce-performs-on-stage-during-the-formation-world-tour-picture-id538724332
Failed to download image 18 from http://www.fourintravels.com/wp-content/uploads/2013/12/The-villas-from-the-front1.jpg


CLIP Score Evaluation


In [52]:
# Score the downloaded images with the default metric ("clip").
evaluation(save_dir,captions)

Metric used is CLIP Score
Image: /home/reutsalman/repo/Stable-Diffusion-Attribute-Binding/conceptual_captioning/image_1.jpg, Caption: farm tractor is moving on the field , cultivating land, CLIP Score: 26.5389
Image: /home/reutsalman/repo/Stable-Diffusion-Attribute-Binding/conceptual_captioning/image_2.jpg, Caption: master playing with his little golden retriever dog on the lawn, CLIP Score: 30.9777
Image: /home/reutsalman/repo/Stable-Diffusion-Attribute-Binding/conceptual_captioning/image_3.jpg, Caption: year later the small shrubs doubled in size ., CLIP Score: 26.462
Image: /home/reutsalman/repo/Stable-Diffusion-Attribute-Binding/conceptual_captioning/image_4.jpg, Caption: according to the model , she regularly gets told it looks, CLIP Score: 26.0662
Image: /home/reutsalman/repo/Stable-Diffusion-Attribute-Binding/conceptual_captioning/image_5.jpg, Caption: person and the fiancee at their engagement party ., CLIP Score: 28.7006
Image: /home/reutsalman/repo/Stable-Diffusion-Attribute-

BLIP2 evaluation

In [76]:
# Score the same images and captions with the BLIP-2 branch of evaluation().
evaluation(save_dir,captions, metric='blip')

Metric used is BLIP Score


Loading checkpoint shards: 100%|██████████| 2/2 [00:06<00:00,  3.15s/it]


Using cpu
Image /home/reutsalman/repo/Stable-Diffusion-Attribute-Binding/conceptual_captioning/image_1.jpg


KeyboardInterrupt: 