In [None]:
!pip install torchmetrics --upgrade
!pip install pandas
!pip install sentence_transformers
!pip install datasets
!pip install transformers
!pip install sklearn

In [25]:

import os
import re

import numpy as np
import pandas as pd
import requests
import torch
from datasets import load_dataset
from datasets.utils.file_utils import get_datasets_user_agent
# from natsort import natsorted
from PIL import Image
from sentence_transformers import SentenceTransformer, util
from sklearn.metrics.pairwise import cosine_similarity
from torchmetrics.multimodal.clip_score import CLIPScore
from transformers import Blip2Processor, Blip2ForConditionalGeneration
# User-Agent string supplied by the `datasets` library.
# NOTE(review): presumably intended for the image-download HTTP requests — confirm.
USER_AGENT = get_datasets_user_agent()


ImportError: cannot import name 'Tensor' from 'torch' (unknown location)

In [None]:


def calculate_clip_score(images, prompts):
    """Compute the CLIP score between one image and its text prompt.

    Args:
        images: A single image as a NumPy array with values scaled to [0, 1].
            Accepted layouts are (H, W), (H, W, 1), (H, W, 3) and (H, W, 4);
            grayscale input is replicated to three channels and an alpha
            channel is dropped.
        prompts: The text handed to the CLIPScore metric together with the image.

    Returns:
        The score as a Python float rounded to 4 decimal places.

    Raises:
        ValueError: If `images` cannot be interpreted as a single image.
    """
    # Loading the CLIP checkpoint is expensive, so build the metric once and
    # reuse it across calls instead of re-instantiating it per image.
    metric = getattr(calculate_clip_score, "_metric", None)
    if metric is None:
        metric = CLIPScore(model_name_or_path="openai/clip-vit-base-patch16")
        calculate_clip_score._metric = metric

    pixels = np.asarray(images, dtype=np.float64)
    if pixels.ndim == 2:
        pixels = pixels[..., np.newaxis]
    if pixels.ndim != 3:
        raise ValueError(f"Expected a single image of shape (H, W[, C]), got shape {pixels.shape}")
    if pixels.shape[-1] == 1:
        pixels = np.repeat(pixels, 3, axis=-1)  # grayscale -> 3 identical channels
    elif pixels.shape[-1] == 4:
        pixels = pixels[..., :3]  # drop the alpha channel
    elif pixels.shape[-1] != 3:
        raise ValueError(f"Unsupported channel count {pixels.shape[-1]} for image of shape {pixels.shape}")

    # Round (instead of truncating) so a value such as 253.9999 maps back to 254,
    # and clip so slightly out-of-range input cannot wrap around in uint8.
    images_int = np.clip(np.rint(pixels * 255), 0, 255).astype("uint8")
    # (H, W, C) -> (1, C, H, W), the channel-first batch layout passed to the metric.
    batch = torch.from_numpy(images_int).permute(2, 0, 1).unsqueeze(0)

    score = metric(batch, prompts).detach()
    # Clear accumulated state so reusing the metric keeps every call independent.
    metric.reset()
    return round(float(score), 4)


In [None]:
def calculate_blip_2(image, prompt, processor, model, device):
    """Caption `image` with a BLIP-2 model and score the caption against `prompt`.

    The generated caption and `prompt` are both embedded with the
    'all-MiniLM-L6-v2' sentence-transformer, and their cosine similarity is
    returned as the score.

    Args:
        image: The image handed to `processor` (passed as `images=`).
        prompt: Reference caption to compare the generated caption against.
        processor: Callable preprocessor that also provides `batch_decode`.
        model: Model exposing `generate(**inputs)`.
        device: Device the processed inputs are moved to.

    Returns:
        A `(similarity_score, generated_text)` tuple. On any failure the error
        is printed and `(None, None)` is returned, so callers can always
        unpack two values.
    """
    try:
        img_inputs = processor(images=image, return_tensors="pt").to(device)

        # Inference only: no gradients are needed for caption generation.
        with torch.no_grad():
            generated_ids = model.generate(**img_inputs)

        generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()

        # Load the sentence-embedding model once and reuse it across calls;
        # re-creating it for every image is needlessly slow.
        similarity_model = getattr(calculate_blip_2, "_similarity_model", None)
        if similarity_model is None:
            similarity_model = SentenceTransformer('all-MiniLM-L6-v2')
            calculate_blip_2._similarity_model = similarity_model

        generated_text_embedding = similarity_model.encode(generated_text)
        caption_embedding = similarity_model.encode(prompt)

        similarity_score = util.pytorch_cos_sim(caption_embedding, generated_text_embedding).item()
        return similarity_score, generated_text
    except Exception as e:
        print(f"An error occurred: {e}")
        # Keep the return shape identical to the success path; a bare None
        # would make `score, text = calculate_blip_2(...)` raise TypeError.
        return None, None

Data set & captions


In [None]:
# NOTE(review): num_threads is not referenced by any other visible cell — confirm before removing.
num_threads = 20
# Open Conceptual Captions with streaming=True; items are pulled one at a time
# by the download helper further down.
dset = load_dataset("google-research-datasets/conceptual_captions",streaming=True)


In [None]:
# Shared module-level iterator over the 'train' split. The download helper
# advances it with next(), so successive calls continue where the previous one
# stopped; re-run this cell to create a fresh iterator.
dset_iter = iter(dset['train'])

In [None]:

def conceptual_captioning_stream_to_dir(save_dir, image_num, timeout=10):
    """Download the next `image_num` dataset items into `save_dir`.

    Items are pulled from the module-level `dset_iter`. Each image is written
    to `image_{i}.jpg`, where `i` is the position within this call; the caption
    is recorded only when the download fully succeeds.

    Args:
        save_dir: Target directory; created if missing.
        image_num: Number of dataset items to attempt.
        timeout: Seconds to wait on the HTTP request before giving up.

    Returns:
        `(captions, save_dir)`, where `captions` holds the captions of the
        successfully downloaded images, in download order.
    """
    os.makedirs(save_dir, exist_ok=True)
    captions = []
    for i in range(image_num):
        try:
            item = next(dset_iter)
        except StopIteration:
            print(f"Dataset exhausted after {i} items")
            break

        image_url = item['image_url']
        caption = item['caption']
        image_path = os.path.join(save_dir, f"image_{i}.jpg")

        downloaded = False
        try:
            # The request lives inside the try block so a connection error or
            # timeout skips this image instead of aborting the whole batch;
            # the context manager releases the streamed connection.
            with requests.get(
                image_url,
                stream=True,
                timeout=timeout,
                headers={"user-agent": USER_AGENT},
            ) as response:
                if response.status_code == 200:
                    with open(image_path, 'wb') as f:
                        for chunk in response.iter_content(1024):
                            f.write(chunk)
                    downloaded = True
                else:
                    print(f"Failed to download image {i} from {image_url}")
        except Exception as e:
            print(f"Error downloading image {i} from {image_url}: {e}")

        if downloaded:
            captions.append(caption)
        elif os.path.exists(image_path):
            # Remove a partial (or stale) file: an image on disk without a
            # recorded caption would shift every later image/caption pair.
            os.remove(image_path)
    return captions, save_dir

In [None]:
def evaluation(save_dir, captions , metric= "clip"):
    image_files = natsorted(os.listdir(save_dir))  # Ensure the images are processed in order
    if (metric == "clip"):
        print(f"Metric used is CLIP Score")
        for img_file, caption in zip(image_files, captions):
            img_path = os.path.join(save_dir, img_file)
            if img_path.endswith(".jpg") or img_path.endswith(".png"):
                img = Image.open(img_path)
                img_array = np.array(img) / 255.0
                score = calculate_clip_score(img_array, caption)
                print(f"Image: {img_path}, Caption: {caption}, CLIP Score: {score}")
    elif(metric == "blip"):
        print(f"Metric used is BLIP Score")
        processor = Blip2Processor.from_pretrained("Salesforce/blip2-flan-t5-xl")
        model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-flan-t5-xl")
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        print(f"Using {device}")
        model.to(device)
        for img_file, caption in zip(image_files, captions):
            img_path = os.path.join(save_dir, img_file)
            if img_path.endswith(".jpg") or img_path.endswith(".png"):
                img = Image.open(img_path)
                simularity, generated_caption = calculate_blip_2(img,caption,processor,model,device)
                print(f"Image: {img_path}, Caption: {caption}, Generated Caption {generated_caption} BLIP2 Score: {simularity}")



In [None]:

def csv_to_json(csv_file_path, json_file_path):
    """Convert a CSV file to JSON Lines (one JSON record per row).

    Relies on `pandas` being imported as `pd` in the notebook's import cell.

    Args:
        csv_file_path: Path of the CSV file to read.
        json_file_path: Path the JSON Lines output is written to.
    """
    df = pd.read_csv(csv_file_path)
    # orient='records' with lines=True writes one JSON object per line.
    df.to_json(json_file_path, orient='records', lines=True)
    print(f"CSV file has been converted to JSON and saved at {json_file_path}")


In [None]:
# Show the current working directory; the download and cleanup cells build their paths from it.
os.getcwd()

Download Images to directory

In [None]:
# remove before merge

# Download 20 items into ./conceptual_captioning and keep the captions of the
# images that were fetched successfully.
captions = []
path_dir = os.path.join(os.getcwd(), "conceptual_captioning")
captions, save_dir = conceptual_captioning_stream_to_dir(path_dir,image_num=20)


CLIP Score Evaluation


In [None]:
# remove before merge
# NOTE(review): this cell deletes the ./conceptual_captioning directory that the
# download cell fills. On a top-to-bottom run it executes BEFORE the evaluation
# cells below, which read images from that same directory — confirm the intended
# order (cleanup presumably belongs after evaluation).
import shutil

image_path = os.path.join(os.getcwd(), "conceptual_captioning")

if os.path.exists(image_path):
    shutil.rmtree(image_path)
    print(f"{image_path} has been deleted.")
else:
    print(f"{image_path} does not exist.")

In [None]:
# Score the downloaded images against their captions with the default metric ("clip").
evaluation(save_dir,captions)

BLIP-2 evaluation

In [None]:
# Same images and captions, scored via BLIP-2 caption generation plus sentence similarity.
evaluation(save_dir,captions, metric='blip')

In [None]:
# Upload the local folder ./datasets/filtered_imsitu/dataset to a Hugging Face
# dataset repository.
# NOTE(review): this folder is not produced by any visible cell, and no token is
# passed here — authentication presumably comes from a cached login; confirm.
from huggingface_hub import HfApi
import os
api = HfApi()
api.upload_folder(
    folder_path= os.path.join(os.getcwd(),"datasets/filtered_imsitu/dataset"),
    repo_id="Reutsalman/IMSITUE_FILTER-FOR_PROJECT",
    repo_type="dataset"
)