In [10]:
!pip install transformers
!pip install open_clip_torch



In [11]:
from transformers import CLIPProcessor, CLIPModel
from open_clip import create_model_from_pretrained, get_tokenizer
import torch
import torch.nn.functional as F
from PIL import Image

In [12]:
# Run on the GPU when torch can see one, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Captions that will be scored against the sample image.
texts = [
    "man holding a dog",
    "man holding a black coloured dog",
    "a man wearing a white shirt and gray pants",
    "a man wearing a white shirt and gray pants and holding a dog",
    "man wearing a white shirt",
    "a bookshelf in the background",
    "a running dog",
    "a running man",
    "a man with a dog face",
    "a dog with a man face",
]

# Checkpoint name later handed to clip_text_scorer.
clip = "openai/clip-vit-base-patch32"

# Relative path: resolved against the notebook's current working directory.
image = Image.open("sample_image.jpg")

In [13]:
def clip_text_scorer(model_name, texts, image, device):
    """Score captions against an image with a ``transformers`` CLIP checkpoint.

    Args:
        model_name: Checkpoint identifier passed to ``from_pretrained`` for
            both the model and its processor.
        texts: Caption(s) forwarded to the processor as ``text``.
        image: Image forwarded to the processor as ``images``.
        device: Device the model and the processed inputs are moved to.

    Returns:
        list: The model's ``logits_per_image`` output with size-1 dimensions
        squeezed out, converted to plain Python floats. A single caption
        yields a one-element list (never a bare float), so callers can always
        index or iterate the result.
    """
    model = CLIPModel.from_pretrained(model_name).to(device)
    processor = CLIPProcessor.from_pretrained(model_name)

    inputs = processor(text=texts, images=image, return_tensors="pt", padding=True).to(device)

    # Inference only: no autograd bookkeeping needed.
    with torch.no_grad():
        outputs = model(**inputs)

    scores = outputs.logits_per_image.squeeze()
    if scores.dim() == 0:
        # One image x one caption squeezes down to a scalar; restore the list
        # shape so `.tolist()` does not return a bare float.
        scores = scores.unsqueeze(0)

    return scores.cpu().tolist()

def clips_text_scorer(model_name, texts, image, device):
    """Score captions against an image with an ``open_clip`` checkpoint.

    Image and text embeddings are L2-normalised, their cosine similarity is
    computed, and the result is multiplied by ``exp(model.logit_scale)``.

    Args:
        model_name: Identifier passed to ``create_model_from_pretrained`` and
            ``get_tokenizer``.
        texts: Caption(s) passed to the tokenizer.
        image: Image passed to the preprocessing transform returned with the
            model.
        device: Device the model, the image tensor and the text tokens are
            moved to.

    Returns:
        list: Scaled cosine similarities with size-1 dimensions squeezed out,
        as plain Python floats. A single caption yields a one-element list
        (never a bare float).
    """
    model, preprocess = create_model_from_pretrained(model_name)
    # The inputs below are moved to `device`; the model has to live there too,
    # otherwise running on "cuda" fails with a device-mismatch error.
    model = model.to(device)
    # Make sure layers such as dropout behave deterministically for scoring
    # (harmless if the factory already returned the model in eval mode).
    model.eval()
    tokenizer = get_tokenizer(model_name)

    image_tensor = preprocess(image).unsqueeze(0).to(device)  # add a batch dimension
    text = tokenizer(texts, context_length=model.context_length).to(device)

    # Inference only: no autograd bookkeeping needed.
    with torch.no_grad():
        image_features_norm = F.normalize(model.encode_image(image_tensor), dim=-1)
        text_features_norm = F.normalize(model.encode_text(text), dim=-1)

        cosine_similarity = image_features_norm @ text_features_norm.T
        scaled_similarity = cosine_similarity * model.logit_scale.exp()

    scores = scaled_similarity.squeeze()
    if scores.dim() == 0:
        # One image x one caption squeezes down to a scalar; restore the list
        # shape so `.tolist()` does not return a bare float.
        scores = scores.unsqueeze(0)

    return scores.cpu().tolist()
    

In [14]:
# Score every caption with the baseline CLIP checkpoint.
scores = clip_text_scorer(model_name=clip, texts=texts, image=image, device=device)

In [15]:
# Walk captions and scores together rather than assuming exactly 10 entries,
# so the cell keeps working when `texts` is edited.
for text, score in zip(texts, scores):
    print(f"{text} | Similarity Score: {score}")

man holding a dog | Similarity Score: 30.558883666992188
man holding a black coloured dog | Similarity Score: 29.479265213012695
a man wearing a white shirt and gray pants | Similarity Score: 24.179716110229492
a man wearing a white shirt and gray pants and holding a dog | Similarity Score: 32.02540588378906
man wearing a white shirt | Similarity Score: 23.536558151245117
a bookshelf in the background | Similarity Score: 23.917461395263672
a running dog | Similarity Score: 23.639495849609375
a running man | Similarity Score: 22.64459228515625
a man with a dog face | Similarity Score: 25.942140579223633
a dog with a man face | Similarity Score: 25.717147827148438


In [16]:
# Model identifier handed to clips_text_scorer.
clips = "hf-hub:UCSC-VLAA/ViT-L-14-CLIPS-224-Recap-DataComp-1B"

# Score the same captions and image with the CLIPS checkpoint.
clips_scores = clips_text_scorer(model_name=clips, texts=texts, image=image, device=device)

In [17]:
# Walk captions and scores together rather than assuming exactly 10 entries,
# so the cell keeps working when `texts` is edited.
for text, score in zip(texts, clips_scores):
    print(f"{text} | Similarity Score: {score}")

man holding a dog | Similarity Score: 24.29550552368164
man holding a black coloured dog | Similarity Score: 23.524459838867188
a man wearing a white shirt and gray pants | Similarity Score: 4.223145008087158
a man wearing a white shirt and gray pants and holding a dog | Similarity Score: 24.82670021057129
man wearing a white shirt | Similarity Score: 11.843993186950684
a bookshelf in the background | Similarity Score: 17.921937942504883
a running dog | Similarity Score: 13.983269691467285
a running man | Similarity Score: 5.167665958404541
a man with a dog face | Similarity Score: 22.434938430786133
a dog with a man face | Similarity Score: 21.930673599243164
