In [2]:
pip install torchmetrics



In [57]:
pip install sentence_transformers

Collecting sentence_transformers
  Downloading sentence_transformers-3.1.0-py3-none-any.whl.metadata (23 kB)
Downloading sentence_transformers-3.1.0-py3-none-any.whl (249 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m249.1/249.1 kB[0m [31m8.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: sentence_transformers
Successfully installed sentence_transformers-3.1.0


In [3]:
pip install datasets

Collecting datasets
  Downloading datasets-3.0.0-py3-none-any.whl.metadata (19 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Downloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Downloading datasets-3.0.0-py3-none-any.whl (474 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m474.3/474.3 kB[0m [31m10.1 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m4.4 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl (39.9 MB)
[2K 

In [58]:
from torchmetrics.multimodal.clip_score import CLIPScore
from PIL import Image
import torch
import os
from datasets import load_dataset
from datasets.utils.file_utils import get_datasets_user_agent
import requests
import numpy as np
from natsort import natsorted
from transformers import Blip2Processor, Blip2ForConditionalGeneration
from sklearn.metrics.pairwise import cosine_similarity
from sentence_transformers import SentenceTransformer, util

USER_AGENT = get_datasets_user_agent()


In [5]:


# CLIPScore metrics keyed by checkpoint name, so the CLIP weights are loaded
# once per kernel session instead of once per scored image.
_CLIP_METRICS = {}


def calculate_clip_score(images, prompts, model_name_or_path="openai/clip-vit-base-patch16"):
    """Return the CLIP score between one image and its caption.

    Args:
        images: A single image as a float array scaled to [0, 1]. Accepted
            layouts are (H, W) grayscale, (H, W, 1), (H, W, 3) RGB and
            (H, W, 4) RGBA (the alpha channel is dropped).
        prompts: Caption text passed straight to the CLIPScore metric.
        model_name_or_path: CLIP checkpoint to score with.

    Returns:
        The score rounded to 4 decimal places, as a Python float.

    Raises:
        ValueError: If ``images`` does not have one of the accepted layouts.
    """
    pixels = np.asarray(images, dtype=np.float64)

    # Normalise every accepted layout to (H, W, 3) before touching the model,
    # so malformed input fails fast without loading any weights.
    if pixels.ndim == 2:
        pixels = np.stack([pixels] * 3, axis=-1)
    elif pixels.ndim == 3 and pixels.shape[-1] == 1:
        pixels = np.repeat(pixels, 3, axis=-1)
    elif pixels.ndim == 3 and pixels.shape[-1] == 4:
        pixels = pixels[..., :3]
    if pixels.ndim != 3 or pixels.shape[-1] != 3:
        raise ValueError(
            f"Expected an image of shape (H, W), (H, W, 1), (H, W, 3) or (H, W, 4); "
            f"got {pixels.shape}"
        )

    # Round instead of truncating: (k / 255.0) * 255 can land just below k and a
    # plain astype("uint8") would then store k - 1. Clip so out-of-range values
    # saturate rather than wrap around.
    images_int = np.clip(np.rint(pixels * 255), 0, 255).astype("uint8")
    batch = torch.from_numpy(images_int).permute(2, 0, 1).unsqueeze(0)  # (1, 3, H, W)

    metric = _CLIP_METRICS.get(model_name_or_path)
    if metric is None:
        metric = CLIPScore(model_name_or_path=model_name_or_path)
        _CLIP_METRICS[model_name_or_path] = metric

    score = metric(batch, prompts).detach()
    # The metric object is reused across calls; clear its accumulated state so
    # each call stays independent of the previous ones.
    metric.reset()
    return round(float(score), 4)


In [67]:
# Sentence-embedding models keyed by checkpoint name, so the model is loaded
# once per kernel session instead of once per image.
_SIMILARITY_MODELS = {}


def calculate_blip_2(image, prompt, processor, model, device,
                     similarity_model_name='all-MiniLM-L6-v2'):
    """Caption an image with BLIP-2 and compare the caption to a reference.

    The image is captioned with ``model.generate``; the generated caption and
    ``prompt`` are then embedded with a SentenceTransformer and compared by
    cosine similarity.

    Args:
        image: Image passed to ``processor(images=...)``.
        prompt: Reference caption to compare against.
        processor: Callable processor that also provides ``batch_decode``.
        model: Captioning model providing ``generate``.
        device: Device the processed inputs are moved to.
        similarity_model_name: SentenceTransformer checkpoint used to embed
            both captions.

    Returns:
        ``(similarity_score, generated_text)`` on success. On any failure the
        error is printed and ``(None, None)`` is returned, so callers can
        always unpack two values.
    """
    try:
        img_inputs = processor(images=image, return_tensors="pt").to(device)

        # Inference only: no gradients are needed for caption generation.
        with torch.no_grad():
            generated_ids = model.generate(**img_inputs)

        generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()

        similarity_model = _SIMILARITY_MODELS.get(similarity_model_name)
        if similarity_model is None:
            similarity_model = SentenceTransformer(similarity_model_name)
            _SIMILARITY_MODELS[similarity_model_name] = similarity_model

        generated_text_embedding = similarity_model.encode(generated_text)
        caption_embedding = similarity_model.encode(prompt)

        similarity_score = util.pytorch_cos_sim(caption_embedding, generated_text_embedding).item()
        return similarity_score, generated_text
    except Exception as e:
        # Best effort: report the problem and let the caller move on to the
        # next image. A pair is returned (not a bare None) because callers
        # unpack the result into two names.
        print(f"An error occurred: {e}")
        return None, None

Dataset & captions


In [8]:
# NOTE(review): num_threads is not referenced by any cell visible in this
# notebook export — confirm whether it is still needed.
num_threads = 20
# Load Conceptual Captions with streaming=True; the 'train' split is iterated
# lazily by the cells below. Presumably this avoids downloading the full
# dataset up front — confirm against the datasets documentation.
dset = load_dataset("google-research-datasets/conceptual_captions",streaming=True)


README.md:   0%|          | 0.00/14.2k [00:00<?, ?B/s]

In [9]:
# Module-level iterator over the 'train' split. conceptual_captioning_stream_to_dir
# pulls items from it with next(), so successive calls continue where the previous
# call stopped; re-run this cell to start again from the first item.
dset_iter = iter(dset['train'])

In [19]:

def conceptual_captioning_stream_to_dir(save_dir, image_num, timeout=10):
    """Download the next ``image_num`` dataset images into ``save_dir``.

    Items are pulled from the module-level ``dset_iter``. Each image that
    downloads successfully is written to ``image_{i}.jpg`` (``i`` is the
    position within this call) and its caption is appended to the returned
    list. Failed downloads are reported and skipped, and leave no file behind,
    so the saved files in natural-sort order line up one-to-one with the
    returned captions.

    Args:
        save_dir: Directory to write images into; created if missing.
        image_num: Number of dataset items to attempt.
        timeout: Seconds to wait on each HTTP request before giving up.

    Returns:
        ``(captions, save_dir)`` where ``captions`` holds the captions of the
        images that were saved, in download order.
    """
    os.makedirs(save_dir, exist_ok=True)
    captions = []
    for i in range(image_num):
        try:
            item = next(dset_iter)
        except StopIteration:
            print(f"Dataset exhausted after {i} items")
            break

        image_url = item['image_url']
        caption = item['caption']
        image_path = os.path.join(save_dir, f"image_{i}.jpg")
        try:
            # The request is inside the try so one unreachable host does not
            # abort the whole run; the context manager releases the streamed
            # connection, and the timeout keeps a stalled server from hanging.
            with requests.get(
                image_url,
                stream=True,
                timeout=timeout,
                headers={"User-Agent": USER_AGENT},
            ) as response:
                if response.status_code != 200:
                    print(f"Failed to download image {i} from {image_url}")
                    continue
                with open(image_path, 'wb') as f:
                    for chunk in response.iter_content(1024):
                        f.write(chunk)
            captions.append(caption)
        except Exception as e:
            # Remove a partially written file: a stray image without a caption
            # would shift every later image/caption pair during evaluation.
            if os.path.exists(image_path):
                os.remove(image_path)
            print(f"Error downloading image {i} from {image_url}: {e}")
    return captions, save_dir

In [68]:
def evaluation(save_dir, captions, metric="clip"):
    """Score every saved image against its caption and print the results.

    Image files (``.jpg`` / ``.png``) in ``save_dir`` are taken in natural-sort
    order and paired positionally with ``captions``.

    Args:
        save_dir: Directory containing the downloaded images.
        captions: Reference captions, in the same order as the images.
        metric: ``"clip"`` for CLIP score, or ``"blip"`` for similarity between
            the reference caption and a BLIP-2 generated caption.

    Raises:
        ValueError: If ``metric`` is neither ``"clip"`` nor ``"blip"``.
    """
    if metric not in ("clip", "blip"):
        raise ValueError(f"Unknown metric {metric!r}; expected 'clip' or 'blip'")

    # Filter to image files BEFORE pairing with captions: filtering after the
    # zip would let any stray file in the directory consume a caption and
    # shift every later pair.
    image_files = [
        name for name in natsorted(os.listdir(save_dir))
        if name.endswith((".jpg", ".png"))
    ]
    if len(image_files) != len(captions):
        print(
            f"Warning: found {len(image_files)} images but {len(captions)} captions; "
            "pairs beyond the shorter list are ignored"
        )

    if metric == "clip":
        print("Metric used is CLIP Score")
        for img_file, caption in zip(image_files, captions):
            img_path = os.path.join(save_dir, img_file)
            try:
                # Force RGB so grayscale / RGBA / palette images all reach the
                # scorer as (H, W, 3).
                with Image.open(img_path) as img:
                    img_array = np.array(img.convert("RGB")) / 255.0
            except OSError as e:
                # A URL can answer 200 with something that is not an image.
                print(f"Skipping unreadable image {img_path}: {e}")
                continue
            score = calculate_clip_score(img_array, caption)
            print(f"Image: {img_path}, Caption: {caption}, CLIP Score: {score}")
    else:
        print("Metric used is BLIP Score")
        processor = Blip2Processor.from_pretrained("Salesforce/blip2-flan-t5-xl")
        model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-flan-t5-xl")
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print(f"Using {device}")
        model.to(device)
        for img_file, caption in zip(image_files, captions):
            img_path = os.path.join(save_dir, img_file)
            try:
                with Image.open(img_path) as img:
                    rgb_img = img.convert("RGB")
            except OSError as e:
                print(f"Skipping unreadable image {img_path}: {e}")
                continue
            result = calculate_blip_2(rgb_img, caption, processor, model, device)
            # The helper may signal failure with None; normalise so the unpack
            # below never raises.
            similarity, generated_caption = result if result is not None else (None, None)
            print(f"Image: {img_path}, Caption: {caption}, Generated Caption {generated_caption} BLIP2 Score: {similarity}")



Download Images to directory

In [24]:
# remove before merge

# Attempt to download 20 images into ./conceptual_captioning and keep the
# captions of the ones that were saved. `save_dir` and `captions` are consumed
# by the evaluation cells further down.
# The empty-list initialisation is overwritten by the call two lines below.
captions = []
path_dir = os.path.join(os.getcwd(), "conceptual_captioning")
captions, save_dir = conceptual_captioning_stream_to_dir(path_dir,image_num=20)


CLIP Score Evaluation


In [23]:
# remove before merge
# Wipe the local image directory so the next download starts from a clean slate.
# NOTE(review): this cell's execution count (23) is lower than the download
# cell's (24) although it sits below it; run top to bottom it would delete the
# images right before they are evaluated.
import shutil

image_path = os.path.join(os.getcwd(), "conceptual_captioning")

if not os.path.exists(image_path):
    print(f"{image_path} does not exist.")
else:
    shutil.rmtree(image_path)
    print(f"{image_path} has been deleted.")

/content/conceptual_captioning has been deleted.


In [26]:
# Score the downloaded images with the default metric ("clip").
evaluation(save_dir,captions)

Metric used is CLIP Score




Image: /content/conceptual_captioning/image_0.jpg, Caption: actor and daughters uk premiere held, CLIP Score: 30.4291
Image: /content/conceptual_captioning/image_1.jpg, Caption: a fine , grainy vector pattern in black and white ., CLIP Score: 29.4877
Image: /content/conceptual_captioning/image_2.jpg, Caption: seamless border of orange roses and paisley , pattern on a white background ., CLIP Score: 29.311
Image: /content/conceptual_captioning/image_3.jpg, Caption: students in front of a school, CLIP Score: 28.599
Image: /content/conceptual_captioning/image_4.jpg, Caption: museum , opened is among the 20th - century 's most significant buildings ., CLIP Score: 27.0254
Image: /content/conceptual_captioning/image_5.jpg, Caption: river as it meets bodies of water where airline meets, CLIP Score: 25.7665
Image: /content/conceptual_captioning/image_6.jpg, Caption: young rock star jamming on a guitar, CLIP Score: 27.6241
Image: /content/conceptual_captioning/image_7.jpg, Caption: a moored fis

BLIP2 evaluation

In [69]:
# Score the same images by comparing each reference caption with a BLIP-2
# generated caption.
evaluation(save_dir,captions, metric='blip')

Metric used is BLIP Score




Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Using cpu




Image: /content/conceptual_captioning/image_0.jpg, Caption: actor and daughters uk premiere held, Generated Caption samuel l. jackson and his family at the uk BLIP2 Score: 0.28897207975387573
Image: /content/conceptual_captioning/image_1.jpg, Caption: a fine , grainy vector pattern in black and white ., Generated Caption black and white seamless pattern with a pattern of leaves BLIP2 Score: 0.5404555201530457
Image: /content/conceptual_captioning/image_2.jpg, Caption: seamless border of orange roses and paisley , pattern on a white background ., Generated Caption watercolor floral border with roses and paisley BLIP2 Score: 0.8097856640815735
Image: /content/conceptual_captioning/image_3.jpg, Caption: students in front of a school, Generated Caption a group of school children posing for a photo in a school - BLIP2 Score: 0.6507741212844849
Image: /content/conceptual_captioning/image_4.jpg, Caption: museum , opened is among the 20th - century 's most significant buildings ., Generated Ca