In [3]:
import torch
from PIL import Image
import requests
from transformers import AutoImageProcessor, AutoModel



In [None]:
def extract_dinov3_features():
    """Demonstrate image feature extraction with a pre-trained DINOv3 model.

    Loads the ``facebook/dinov3-vits16`` checkpoint and its image processor
    from the Hugging Face Hub, downloads a sample image, runs one forward
    pass without gradients and prints the shapes of the global ([CLS]) and
    patch-level features.

    Returns:
        None. Results are reported via ``print``. If the sample image cannot
        be downloaded or decoded, an error message is printed and the
        function returns early.
    """
    # --- 1. Set up the model and image processor ---
    # We'll use a smaller, efficient version of DINOv3 for this demo.
    model_name = "facebook/dinov3-vits16"
    print(f"Loading model: {model_name}")

    # The AutoImageProcessor handles all the necessary transformations for the model,
    # such as resizing, normalization, and converting the image to a tensor.
    processor = AutoImageProcessor.from_pretrained(model_name)

    # The AutoModel class loads the pre-trained DINOv3 model weights.
    model = AutoModel.from_pretrained(model_name)

    # Move the model to the GPU if available, for faster processing.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    model.to(device)
    print(f"Using device: {device}")

    # --- 2. Load and prepare the image ---
    # We'll use a sample image from the web.
    url = "http://images.cocodataset.org/val2017/000000039769.jpg"
    print(f"\nLoading image from: {url}")
    try:
        # A timeout keeps the demo from hanging forever on a stalled
        # connection; the context manager releases the streamed connection
        # once the image has been decoded.
        with requests.get(url, stream=True, timeout=30) as response:
            # Fail on HTTP errors here instead of handing an error page to PIL.
            response.raise_for_status()
            # `convert` forces the full decode while the stream is still open.
            image = Image.open(response.raw).convert("RGB")
        print("Image loaded successfully.")
    except Exception as e:
        # Deliberately broad: this is the demo's best-effort boundary, and
        # network, HTTP and decoding failures raise unrelated exception types.
        print(f"Error loading image: {e}")
        return

    # --- 3. Process the image and extract features ---
    print("\nProcessing image and extracting features...")

    # The image processor prepares the image in the format the model expects.
    # `return_tensors="pt"` ensures the output is a PyTorch tensor.
    inputs = processor(images=image, return_tensors="pt").to(device)

    # We run the model in inference mode, without calculating gradients.
    with torch.no_grad():
        outputs = model(**inputs)

    # The primary output is the `last_hidden_state`.
    last_hidden_state = outputs.last_hidden_state

    # --- 4. Understand the output features ---
    print("\n--- Feature Extraction Results ---")

    # The output tensor has the shape: [batch_size, num_tokens, embedding_dim]
    print(f"Shape of the output tensor: {last_hidden_state.shape}")

    # The first token is the [CLS] token, which represents the global features
    # of the entire image. This is often used for image classification tasks.
    cls_token = last_hidden_state[:, 0]
    print(f"Shape of the [CLS] (global) token: {cls_token.shape}")

    # DINOv3 ViT checkpoints place register tokens between the [CLS] token and
    # the patch tokens. They are not tied to any image location, so they must
    # be skipped to obtain the real patch grid. Fall back to 0 for configs
    # that do not define the attribute.
    num_register_tokens = getattr(model.config, "num_register_tokens", 0) or 0
    print(f"Number of register tokens skipped: {num_register_tokens}")

    # The remaining tokens are the patch tokens. DINOv3 divides the image into
    # a grid of patches (e.g., 16x16 pixels) and creates a feature vector for each.
    # These are useful for dense prediction tasks like segmentation.
    patch_tokens = last_hidden_state[:, 1 + num_register_tokens:]
    print(f"Shape of the patch tokens: {patch_tokens.shape}")
    print("\nThis script has successfully extracted both global and patch-level features from the image using DINOv3.")


