In [None]:
import os
import cv2
import torch
import numpy as np
from transformers import CLIPModel, CLIPTokenizer
from google.colab.patches import cv2_imshow

In [None]:
# ---------- Config ----------
IMAGE_FOLDER = "/content/Pics"   # put a few images here (jpg/png)
TOP_K = 3
MODEL_NAME = "openai/clip-vit-base-patch32"
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
IMG_SIZE = 224

In [None]:
# Load model + tokenizer
model = CLIPModel.from_pretrained(MODEL_NAME).to(DEVICE)
tokenizer = CLIPTokenizer.from_pretrained(MODEL_NAME)

# CLIP pixel normalization constants (from official CLIP preprocessing)
CLIP_MEAN = np.array([0.48145466, 0.4578275, 0.40821073], dtype=np.float32)
CLIP_STD  = np.array([0.26862954, 0.26130258, 0.27577711], dtype=np.float32)

def preprocess_cv2(img_bgr, size=IMG_SIZE):
    """
    img_bgr: np.ndarray HxWx3 (OpenCV BGR uint8)
    returns: torch.FloatTensor shaped (3, size, size), normalized for CLIP
    """
    img_rgb = cv2.cvtColor(img_bgr, cv2.COLOR_BGR2RGB)
    img_resized = cv2.resize(img_rgb, (size, size), interpolation=cv2.INTER_AREA)
    img_float = img_resized.astype(np.float32) / 255.0  # H W C in [0,1]
    # normalize per-channel
    img_norm = (img_float - CLIP_MEAN) / CLIP_STD
    # HWC -> CHW
    img_chw = np.transpose(img_norm, (2, 0, 1)).astype(np.float32)
    tensor = torch.from_numpy(img_chw)
    return tensor

# Load small dataset of images
image_paths = []
image_tensors = []
for fname in sorted(os.listdir(IMAGE_FOLDER)):
    path = os.path.join(IMAGE_FOLDER, fname)
    if not (fname.lower().endswith(".jpg") or fname.lower().endswith(".png") or fname.lower().endswith(".jpeg")):
        continue
    img = cv2.imread(path)
    if img is None:
        continue
    image_paths.append(path)
    image_tensors.append(preprocess_cv2(img))

if len(image_tensors) == 0:
    raise SystemExit(f"No images found in folder '{IMAGE_FOLDER}'. Put a few images there and retry.")

# Batchify and move to device
image_batch = torch.stack(image_tensors).to(DEVICE)  # shape: (N, 3, H, W)

In [1]:
#  Get image embeddings
with torch.no_grad():
    img_feats = model.get_image_features(pixel_values=image_batch)   # (N, D)
    img_feats = img_feats / img_feats.norm(dim=-1, keepdim=True)    # L2-normalize

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/605M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/605M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/592 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

In [6]:
# --- Query loop ---
print(f"Loaded {len(image_paths)} images. Enter text queries (empty to exit).")
while True:
    query = input("\nEnter search query (or empty to quit): ").strip()
    if query == "":
        break

    # Tokenize + encode text
    tokens = tokenizer([query], padding=True, return_tensors="pt").to(DEVICE)
    with torch.no_grad():
        txt_feats = model.get_text_features(**tokens)  # (1, D)
        txt_feats = txt_feats / txt_feats.norm(dim=-1, keepdim=True) # L2-normalize

    # Cosine similarity: (1, D) @ (N, D).T -> (1, N)
    sims = (txt_feats @ img_feats.T).squeeze(0).cpu().numpy()  # shape (N,)

    # Top-k
    topk_idx = np.argsort(-sims)[:TOP_K]
    print("\nTop matches:")
    for rank, idx in enumerate(topk_idx, start=1):
        print(f"{rank}. {os.path.basename(image_paths[idx])}  (score={sims[idx]:.4f})")

    # Show top match (OpenCV window)
    best_path = image_paths[topk_idx[0]]
    best_img = cv2.imread(best_path)
    if best_img is not None:
        # annotate filename and score

        display = best_img.copy()
        text_line = f"{os.path.basename(best_path)}  score={sims[topk_idx[0]]:.3f}"
        cv2.putText(display, text_line, (10,30), cv2.FONT_HERSHEY_SIMPLEX, 0.8, (0,255,0), 2)
        cv2_imshow(display)  # works in Colab


Loaded 4 images. Enter text queries (empty to exit).


KeyboardInterrupt: Interrupted by user