# Extract embedding vectors for images and text from a pre-trained CLIP model
The model is available at https://huggingface.co/openai/clip-vit-base-patch32



In [None]:
#!pip install transformers torch pillow requests
#!pip install SimpleITK

In [5]:
from PIL import Image
import requests

from transformers import CLIPProcessor, CLIPModel
import numpy as np
import matplotlib.pyplot as plt
import os 
import torch



In [None]:
"""
Script: Load an image, normalize pixel values, and extract CLIP image embeddings.

Steps:
1. Load the image from disk and convert to RGB.
2. (Optional) Convert to a NumPy array and normalize intensities to [0, 255].
3. Convert normalized array back to PIL (RGB).
4. Load the CLIP model and processor.
5. Preprocess the image and extract its 512-dimensional embedding.
"""


def load_image(image_path: str) -> "tuple[Image.Image, np.ndarray]":
    """
    Load an image as RGB and return it together with its pixel array.

    The pixel values are not rescaled: the image is converted to 8-bit RGB,
    and the array is a plain copy of those pixels.

    Args:
        image_path (str): Path to the image file.

    Returns:
        pil_img (PIL.Image.Image): RGB image for CLIP processing.
        image_array (np.ndarray): Pixel data of ``pil_img``, shape (H, W, 3),
            dtype uint8 (intensities in [0, 255]).
    """
    # Open inside a context manager so the file handle is released as soon as
    # the pixels have been read; convert() returns a new image object that is
    # still usable after the source image is closed.
    with Image.open(image_path) as source_img:
        pil_img = source_img.convert("RGB")

    # np.array (rather than np.asarray) gives a writable copy that does not
    # share memory with the PIL image.
    image_array = np.array(pil_img, dtype=np.uint8)

    return pil_img, image_array


def extract_clip_image_embedding(pil_img: Image.Image,
                                 model_name: str = "openai/clip-vit-base-patch32") -> torch.Tensor:
    """
    Compute a unit-length CLIP embedding for a single PIL image.

    The model and its processor are loaded from ``model_name`` on every call.

    Args:
        pil_img (PIL.Image.Image): Input RGB image.
        model_name (str): Hugging Face identifier for the CLIP variant.

    Returns:
        image_embeds (torch.Tensor): Image features divided by their L2 norm
            along the last dimension. Expected shape is (1, 512) for the
            default checkpoint.
    """
    clip_model = CLIPModel.from_pretrained(model_name)
    clip_processor = CLIPProcessor.from_pretrained(model_name)

    # The processor turns the PIL image into the PyTorch tensors the model expects.
    batch = clip_processor(images=pil_img, return_tensors="pt")

    # Inference only: no autograd graph is needed for the forward pass or
    # for the normalisation that follows.
    with torch.no_grad():
        features = clip_model.get_image_features(**batch)
        # keepdim=True keeps a trailing axis of size 1 so the division
        # broadcasts across the feature dimension.
        lengths = features.norm(p=2, dim=-1, keepdim=True)
        unit_features = features / lengths

    return unit_features





In [None]:
# Input image and destination file for the serialized embedding.
IMAGE_PATH = "path/to/your/image.jpg"
output_file = "clip_image_embedding.pt"

# Read the image; report the shape and dtype of its pixel array.
pil_image, normalized_array = load_image(IMAGE_PATH)
print(f"Image loaded: {normalized_array.shape}, dtype: {normalized_array.dtype}")

# Run the image through CLIP and report the embedding's shape.
embeds = extract_clip_image_embedding(pil_image)
print(f"CLIP image embedding: {embeds.shape}")

# Persist the embedding tensor to disk.
torch.save(embeds, output_file)
