In [2]:
pip install "torch>=2.2" "transformers>=4.45" pillow faiss-cpu numpy


Collecting torch>=2.2
  Downloading torch-2.8.0-cp311-none-macosx_11_0_arm64.whl.metadata (30 kB)
Collecting transformers>=4.45
  Downloading transformers-4.57.0-py3-none-any.whl.metadata (41 kB)
Collecting pillow
  Using cached pillow-11.3.0-cp311-cp311-macosx_11_0_arm64.whl.metadata (9.0 kB)
Collecting faiss-cpu
  Downloading faiss_cpu-1.12.0-cp311-cp311-macosx_14_0_arm64.whl.metadata (5.1 kB)
Collecting numpy
  Downloading numpy-2.3.3-cp311-cp311-macosx_14_0_arm64.whl.metadata (62 kB)
Collecting filelock (from torch>=2.2)
  Downloading filelock-3.20.0-py3-none-any.whl.metadata (2.1 kB)
Collecting sympy>=1.13.3 (from torch>=2.2)
  Downloading sympy-1.14.0-py3-none-any.whl.metadata (12 kB)
Collecting networkx (from torch>=2.2)
  Downloading networkx-3.5-py3-none-any.whl.metadata (6.3 kB)
Collecting jinja2 (from torch>=2.2)
  Downloading jinja2-3.1.6-py3-none-any.whl.metadata (2.9 kB)
Collecting fsspec (from torch>=2.2)
  Downloading fsspec-2025.9.0-py3-none-any.whl.metadata (10 kB)
Co

In [2]:
import numpy as np, torch
from PIL import Image
from transformers import AutoImageProcessor, AutoModel

# Checkpoint to load from the Hugging Face Hub (open model).
MODEL_ID = "facebook/dinov2-base"

# Default to the CPU and switch to the GPU only when CUDA is usable.
DEVICE = "cpu"
if torch.cuda.is_available():
    DEVICE = "cuda"

# Preprocessor that turns PIL images into model-ready tensors.
processor = AutoImageProcessor.from_pretrained(MODEL_ID)

# Load the weights, move them to the chosen device, and switch to eval mode.
model = AutoModel.from_pretrained(MODEL_ID)
model = model.to(DEVICE)
model.eval()

@torch.inference_mode()
def embed_image(path: str) -> np.ndarray:
    """Embed a single image file as an L2-normalized feature vector.

    The image is converted to RGB, preprocessed with the module-level
    ``processor``, and passed through the module-level ``model`` on ``DEVICE``.
    The model's ``pooler_output`` is normalized to unit L2 norm, so the dot
    product of two returned vectors is their cosine similarity.

    Args:
        path: Location of the image file to embed.

    Returns:
        The normalized embedding as a NumPy array on the CPU. Presumably 1-D
        with the model's hidden size; this assumes ``pooler_output`` has shape
        ``(1, hidden)`` for a single image (TODO confirm).
    """
    # Pillow opens files lazily; the context manager releases the file handle
    # as soon as convert() has produced an independent, fully loaded copy.
    with Image.open(path) as raw:
        img = raw.convert("RGB")
    inputs = processor(images=img, return_tensors="pt").to(DEVICE)
    out = model(**inputs)
    # Drop the leading batch axis, then scale the vector to unit length.
    vec = torch.nn.functional.normalize(out.pooler_output.squeeze(0), dim=0)
    return vec.cpu().numpy()


  from .autonotebook import tqdm as notebook_tqdm
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


In [3]:
# Reference images that queries will be compared against.
ref_paths = [
    "images/7up_bottle.jpg",
    "images/coke_zero.jpg",
    "images/coke.jpg",
    "images/pepsi_bottle.jpg",
    "images/pepsi.jpg",
    "images/sprite.jpg",
]

# One embedding per reference image, stacked row-wise in the order of ref_paths.
ref_vecs = np.stack([embed_image(p) for p in ref_paths], axis=0)

# Persist the paths and vectors together so they can be reloaded without re-embedding.
np.savez("dinov2_refs.npz", paths=np.array(ref_paths), vecs=ref_vecs)

# Report the real number of embeddings instead of a hard-coded count.
print(f"Saved {len(ref_paths)} embeddings → dinov2_refs.npz")


Saved 10 embeddings → dinov2_refs.npz


In [6]:

# Embed the query image with the same pipeline used for the references.
q = embed_image("images/pepsi_test.jpg")

# Reference rows and the query are unit-length, so this product is the cosine similarity.
sims = np.matmul(ref_vecs, q)

# Index of the single best match, and the three highest-scoring indices in descending order.
best = int(sims.argmax())
top3_idx = (-sims).argsort()[:3]

print(f"Most similar: {ref_paths[best]} score: {float(sims[best])}")
print(f"Top-3: {[(ref_paths[i], float(sims[i])) for i in top3_idx]}")


Most similar: images/pepsi.jpg score: 0.832436203956604
Top-3: [('images/pepsi.jpg', 0.832436203956604), ('images/coke_zero.jpg', 0.7735756635665894), ('images/coke.jpg', 0.7493051886558533)]
