In [2]:
import torch
from PIL import Image
from transformers import CLIPModel, CLIPProcessor

clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
def get_text_embedding(text: str):
    inputs = clip_processor(text=[text], return_tensors="pt", padding=True)
    text_embeddings = clip_model.get_text_features(**inputs)
    return text_embeddings

In [19]:
def get_image_embedding(image_path: str):
    image = Image.open(image_path)
    inputs = clip_processor(images=image, return_tensors="pt")
    image_embeddings = clip_model.get_image_features(**inputs)
    return image_embeddings

In [20]:
def get_joint_embedding(image_path: str, text: str):
    # Preprocessing
    image = Image.open(image_path)
    image_inputs = clip_processor(images=image, return_tensors="pt")
    text_inputs = clip_processor(text=[text], return_tensors="pt", padding=True)

    # Generate embeddings
    with torch.no_grad():
        image_embedding = clip_model.get_image_features(**image_inputs)
        text_embedding = clip_model.get_text_features(**text_inputs)
    joint_embedding = (image_embedding + text_embedding) / 2

    return joint_embedding

In [24]:
# Verify length to be 2 x 512
get_joint_embedding("../data/images/motorcycle_1.jpg", "a bike")

tensor([[-6.2834e-02,  1.0380e-01,  9.6675e-02,  3.0001e-02, -2.9249e-01,
          7.1380e-02, -6.1374e-02, -4.2095e-01,  2.8316e-01,  2.0372e-01,
         -7.6836e-02, -2.6146e-01,  2.4978e-01, -2.1236e-01,  4.1645e-01,
          1.3331e-01,  3.1808e-01, -3.2554e-02, -3.4612e-01, -1.6126e-01,
          3.0429e-01, -2.3719e-01,  2.1889e-01, -3.8791e-01, -3.4716e-01,
          7.9866e-02, -1.8136e-01, -1.0697e-01, -1.3817e-01, -1.0560e-01,
         -1.8034e-01, -1.9573e-01, -2.0430e-01,  1.5146e-01, -1.8781e-01,
         -1.5047e-02, -1.4425e-01,  5.4641e-01,  1.1652e-01,  6.1591e-01,
          3.2179e-01, -2.2858e-01, -4.2895e-02, -1.9234e-01,  1.2435e-01,
          3.8528e-02,  1.7254e-01, -2.4025e-02, -9.7716e-02,  3.0564e-01,
          2.6720e-01,  5.9142e-02,  5.0896e-01, -3.4095e-01, -3.4978e-01,
          1.4924e-01,  3.8415e-01,  4.2192e-02, -6.1538e-02, -2.7015e-01,
         -1.5351e-01, -1.0578e-01, -2.4631e-01,  1.6987e-01, -2.9511e-01,
          1.1361e-01,  1.3852e-01,  7.

In [7]:
# Examples
ex1 = get_joint_embedding("../data/images/motorcycle_1.jpg", "a bike outside")[0]
ex2 = get_joint_embedding("../data/images/motorcycle_2.jpg", "a bike outside")[0]
ex3 = get_joint_embedding("../data/images/cat_2.jpeg", "a bike outside")[0]

# Cosine Similarity

In [101]:
import numpy as np


def cosine_similarity(vec1, vec2):
    similarity = np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))
    return similarity

In [102]:
ex1_embed = np.array(ex1)
ex2_embed = np.array(ex2)
ex3_embed = np.array(ex3)
sim_ex1_ex2 = cosine_similarity(ex1_embed, ex2_embed)
sim_ex1_ex3 = cosine_similarity(ex1_embed, ex3_embed)

  ex1_embed = np.array(ex1)
  ex2_embed = np.array(ex2)
  ex3_embed = np.array(ex3)


In [103]:
print("Cosine similarity between ex1_embeded and ex2_embeded is:")
display(sim_ex1_ex2)
print("Cosine similarity between ex1_embeded and ex3_embeded is:")
display(sim_ex1_ex3)

Cosine similarity between ex1_embeded and ex2_embeded is:


np.float32(0.8063979)

Cosine similarity between ex1_embeded and ex3_embeded is:


np.float32(0.7158405)