### Trial for OpenAI's CLIP Multimodal Embedding

In [1]:
import torch
from transformers import CLIPProcessor, CLIPModel
from PIL import Image


# Load the pre-trained CLIP model and its matching processor.
# Both are created from the same checkpoint id, kept in one constant so the
# model weights and the preprocessing configuration cannot drift apart when
# the checkpoint is changed.
MODEL_NAME = "openai/clip-vit-base-patch32"
model = CLIPModel.from_pretrained(MODEL_NAME)
processor = CLIPProcessor.from_pretrained(MODEL_NAME)

  from .autonotebook import tqdm as notebook_tqdm


In [19]:

# Example captions and the images they will be compared against.
# Judging by the file names, the image order is the reverse of the caption
# order, so a correct match is index 1 for the first caption and index 0 for
# the second.
text = [
    "A diagram for a large language model application.",
    "An easy mathematical equation",
]
images = [Image.open(image_file) for image_file in ("equation.png", "diagram.png")]

In [20]:
# Run the captions and images through the CLIP processor in a single call so
# that both modalities end up in one batch of model inputs.
inputs = processor(
    text=text,
    images=images,
    padding=True,  # captions differ in length, so pad them to a common length
    return_tensors="pt",  # request PyTorch tensors
)

In [21]:
# Forward pass through CLIP. This notebook only performs inference, so autograd
# is disabled: without it every downstream tensor carries a grad_fn and the
# computation graph is kept alive for no benefit.
with torch.no_grad():
    outputs = model(**inputs)
text_embeddings = outputs.text_embeds  # Embeddings for text
image_embeddings = outputs.image_embeds  # Embeddings for images

In [22]:
# Scale every embedding vector to unit L2 length along the last dimension, so
# that a plain dot product between two of them equals their cosine similarity.
# keepdim=True keeps the norm broadcastable against the embedding tensor.
text_embeddings = text_embeddings.div(text_embeddings.norm(dim=-1, keepdim=True))
image_embeddings = image_embeddings.div(image_embeddings.norm(dim=-1, keepdim=True))

In [23]:
# Pairwise similarity matrix: entry [i, j] is the dot product of caption i's
# embedding with image j's embedding (rows = captions, columns = images).
similarity = text_embeddings @ image_embeddings.T
print(similarity)

tensor([[0.2067, 0.3026],
        [0.3033, 0.2047]], grad_fn=<MmBackward0>)


In [24]:
# For each caption (row of the similarity matrix) pick the column index with
# the highest score, i.e. the position in `images` of the best-matching image.
top_matches = similarity.argmax(dim=1)
for caption, match in zip(text, top_matches):
    print(f"Text: {caption}")
    print(f"Best Match Image: {match}")


Text: A diagram for a large language model application.
Best Match Image: 1
Text: An easy mathematical equation
Best Match Image: 0
