In [None]:
pip install "torch>=2.2" "transformers>=4.45" pillow faiss-cpu numpy


In [None]:
import numpy as np, torch
from PIL import Image
from transformers import CLIPProcessor, CLIPModel

# Identifier of the pretrained CLIP checkpoint handed to from_pretrained below.
MODEL_ID = "openai/clip-vit-base-patch32"

# Run on the GPU when torch reports one as available, otherwise on the CPU.
if torch.cuda.is_available():
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

# The processor prepares raw images as model inputs (see embed_image).
processor = CLIPProcessor.from_pretrained(MODEL_ID)

# Load the model, move it to the chosen device and switch it to eval mode.
model = CLIPModel.from_pretrained(MODEL_ID)
model = model.to(DEVICE)
model = model.eval()

@torch.inference_mode()
def embed_image(path: str) -> np.ndarray:
    """Embed a single image file with CLIP and return its normalized features.

    Args:
        path: Location of an image file that Pillow can open.

    Returns:
        NumPy array with the L2-normalized image features; the leading batch
        axis produced by the model is squeezed out.
    """
    # Open inside a context manager so the file handle is released
    # deterministically; convert() returns an independent in-memory image
    # that stays valid after the file is closed.
    with Image.open(path) as raw:
        img = raw.convert("RGB")

    inputs = processor(images=img, return_tensors="pt").to(DEVICE)
    out = model.get_image_features(**inputs)

    # NOTE(review): some transformers releases appear to return a ModelOutput
    # wrapper here instead of a bare tensor — confirm against the installed
    # version. A plain tensor has no `pooler_output`, so it passes through.
    feats = getattr(out, "pooler_output", out)

    vec = torch.nn.functional.normalize(feats.squeeze(0), dim=0)
    return vec.cpu().numpy()


In [None]:
# Reference images that query images are compared against.
ref_paths = [
    "images/7up_bottle.jpg",
    "images/coke_zero.jpg",
    "images/coke.jpg",
    "images/pepsi_bottle.jpg",
    "images/pepsi.jpg",
    "images/sprite.jpg",
]

# Embed every reference image and stack the vectors, one row per path
# (row i of ref_vecs corresponds to ref_paths[i]).
ref_vecs = np.stack([embed_image(p) for p in ref_paths], axis=0)

# Persist paths and vectors together so they can be reloaded as a pair.
np.savez("clip_refs.npz", paths=np.array(ref_paths), vecs=ref_vecs)

# Report the actual count instead of a hard-coded number that would go stale
# as soon as ref_paths is edited.
print(f"Saved {len(ref_paths)} embeddings → clip_refs.npz")


In [None]:

# Embed the query image with the same function used for the references.
q = embed_image("images/pepsi_test.jpg")

# embed_image L2-normalizes its output, so this matrix-vector product yields
# one cosine similarity per reference row.
sims = ref_vecs @ q

# Index of the single best match, and indices of the three highest scores
# (negating the scores makes argsort order them from high to low).
best = int(np.argmax(sims))
top3_idx = np.argsort(-sims)[:3]

top3 = [(ref_paths[idx], float(sims[idx])) for idx in top3_idx]
best_path = ref_paths[best]
best_score = float(sims[best])

print("Most similar:", best_path, "score:", best_score)
print("Top-3:", top3)
