In [None]:
from IPython.core.interactiveshell import InteractiveShell

# Display the value of every top-level expression in a cell instead of only
# the last one. Later cells rely on this to show several results at once
# (e.g. the dataset inspection cell with three bare expressions).
InteractiveShell.ast_node_interactivity = "all"

# Foundation Models

## Goals

- Download and set up a foundation model
- Perform zero-shot image classification

## Google Colab Check

In [None]:
import sys

# The presence of `google.colab` among the already-loaded modules is used as
# the signal that this notebook runs on Google Colab.
IN_COLAB = "google.colab" in sys.modules
print("In Colab:", IN_COLAB)

# On Colab, remind the reader to save a personal copy of the notebook.
if IN_COLAB:
    try:
        from IPython.display import Markdown, display

        _drive_hint = Markdown(
            """
> 💾 **Optionally:**  
> Save this notebook to your **personal Google Drive** to persist any changes.
>
> *Go to `File ▸ Save a copy in Drive` before editing.*
            """
        )
        display(_drive_hint)
    except Exception:
        # Best-effort fallback: show the same hint as plain text if the rich
        # Markdown rendering is not available for any reason.
        print(
            "\n💾 Optionally: Save the notebook to your personal Google Drive to persist changes.\n"
        )

We mount Google Drive to store data.

In [None]:
# Outside Colab there is no Drive to mount, so this cell does nothing.
if IN_COLAB:
    # Imported inside the guard; presumably `google.colab` is only importable
    # on the Colab runtime.
    from google.colab import drive

    # The Colab DATA_PATH defined in the next section lives below this mount point.
    drive.mount("/content/drive")

## Specify Data Path

**Modify the following paths if necessary.**

That is where your data will be stored.

In [None]:
from pathlib import Path

# Drive-backed folder on Colab; otherwise a folder relative to the notebook's
# working directory.
DATA_PATH = (
    Path("/content/drive/MyDrive/cas-dl-module-compvis-part1")
    if IN_COLAB
    else Path("../../data")
)
# Fail early: later cells derive their cache and dataset locations from DATA_PATH.
assert DATA_PATH.exists(), f"PATH: {DATA_PATH} does not exist."

## Install Lectures Package

Install `dl_cv_lectures` package with all necessary dependencies.

This package provides the environment of the exercises repository, as well as helper and utility modules: [Link](https://github.com/marco-willi/cas-dl-compvis-exercises-hs2025)

The following code installs the package from a local repository (if available), otherwise it installs it from the exercise repository.

In [None]:
import importlib
import subprocess
import sys
from pathlib import Path

from rich.console import Console

console = Console()


def ensure_dl_cv_lectures():
    """Ensure the `dl_cv_lectures` package is installed for the running kernel.

    If the package can already be imported, nothing is installed. Otherwise it
    is pip-installed with the kernel's own interpreter (`sys.executable`):
    in editable mode from `/workspace` when `/workspace/pyproject.toml` exists,
    else from the exercise repository on GitHub.

    A failed installation is reported on the console and not re-raised, so
    this cell never errors; a later import of the package fails instead.
    """
    try:
        import dl_cv_lectures  # noqa: F401 - imported only to probe availability

        console.print(
            "[bold green]✅ dl_cv_lectures installed — all good![/bold green]"
        )
        return
    except ImportError:
        console.print("[bold yellow]⚠️ dl_cv_lectures not found.[/bold yellow]")

    # Prefer the local checkout (editable install) when it is present.
    repo_path = Path("/workspace/pyproject.toml")
    if repo_path.exists():
        console.print("[cyan]📦 Installing from local repository...[/cyan]")
        cmd = [sys.executable, "-m", "pip", "install", "-e", "/workspace"]
    else:
        console.print("[cyan]🌐 Installing from GitHub repository...[/cyan]")
        cmd = [
            sys.executable,
            "-m",
            "pip",
            "install",
            "git+https://github.com/marco-willi/cas-dl-compvis-exercises-hs2025",
        ]

    try:
        subprocess.run(cmd, check=True)
    except subprocess.CalledProcessError as e:
        console.print(f"[bold red]❌ Installation failed ({e}).[/bold red]")
        return

    # The import system caches directory listings. Refresh them so the package
    # that was just installed can be found by imports in this session.
    importlib.invalidate_caches()
    console.print("[bold green]✅ Installation successful![/bold green]")


ensure_dl_cv_lectures()

### Load Libraries

Load all libraries and packages used in this exercise.

In [None]:
import torch
from PIL import Image

Define a default device for your computations.

**GPU is strongly recommended!** (otherwise the images have to be restricted in size).

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using: {device}")

## 1)  The CLIP Model

The CLIP model [Link](https://arxiv.org/abs/2103.00020) has had a profound impact in the deep learning community and in practical applications.

We are going to use it for zero-shot image classification.


In [None]:
import requests
from transformers import CLIPModel, CLIPProcessor

# The model weights and the matching preprocessing configuration are loaded
# from the same checkpoint, using a shared cache folder below DATA_PATH.
CLIP_CHECKPOINT = "openai/clip-vit-base-patch32"
HF_CACHE_DIR = DATA_PATH / "hf_cache"

model = CLIPModel.from_pretrained(CLIP_CHECKPOINT, cache_dir=HF_CACHE_DIR)
processor = CLIPProcessor.from_pretrained(CLIP_CHECKPOINT, cache_dir=HF_CACHE_DIR)

We download an image.

In [None]:
from io import BytesIO

url = "http://images.cocodataset.org/val2017/000000039769.jpg"

# The timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() turns an HTTP error into a clear exception instead of
# letting PIL fail on an error page.
response = requests.get(url, timeout=30)
response.raise_for_status()

# Decode from the fully downloaded bytes, so no streamed connection stays open.
image = Image.open(BytesIO(response.content))
image

Now we define a prompt for each class that we are interested in. In this example the classes `cat` and `dog`.

In [None]:
# One text prompt per candidate class. The processor receives the prompts and
# the image together and returns PyTorch tensors ("pt"); padding=True pads the
# prompts to a common length.
class_prompts = ["a photo of a cat", "a photo of a dog"]
inputs = processor(
    text=class_prompts, images=image, return_tensors="pt", padding=True
)

Now we evaluate the similarities of the image with respect to each prompt.

In [None]:
# Inference only: disable gradient tracking so no autograd graph is built and
# the resulting tensors carry no grad_fn (same pattern as the DINOv3 loop below).
with torch.no_grad():
    outputs = model(**inputs)

# Image-text similarity scores; per the CLIP output docs there is one row per
# image and one column per text prompt.
logits_per_image = outputs.logits_per_image

We can now evaluate the relative similarities and produce a probability distribution (softmax) over all classes.

In [None]:
# Softmax along dim=1 normalizes the scores of each image so they sum to 1;
# dim 1 should index the text prompts for `logits_per_image`, which makes each
# row a probability distribution over the classes.
probs = logits_per_image.softmax(dim=1)
probs

**Task**: Play around with the prompts. Can you also classify / detect other objects in the images?  How about a different image?

## 2) DINOv3: Self-Supervised Image Embeddings

DINOv3 is a state-of-the-art self-supervised vision transformer that learns powerful visual representations without requiring labels. 

In this exercise, we'll:
1. **Extract and compare image embeddings** - Compare global features between different images
2. **Analyze patch-based embeddings** - Explore local features within images to understand spatial relationships

**What makes DINOv3 special?**
- Trained using self-distillation (DINO = **D**istillation with **NO** labels)
- Produces both global (image-level) and local (patch-level) features
- Excellent transfer learning capabilities for downstream tasks

### Load DINOv3 Model with timm

We'll use the `timm` library which provides easy access to the DINOv3 model trained on the LVD-1689M dataset (1.689 billion images).

In [None]:
import matplotlib.pyplot as plt
import requests
import timm
import torch
from PIL import Image

# Identifier of the DINOv3 ViT-Base checkpoint in timm
model_name = "vit_base_patch16_dinov3.lvd1689m"

# num_classes=0 removes the classification head so the model returns embeddings
dinov3 = timm.create_model(model_name, pretrained=True, num_classes=0).to(device)
dinov3.eval()

# Build the evaluation-time preprocessing from the model's own data configuration
data_config = timm.data.resolve_model_data_config(dinov3)
transforms = timm.data.create_transform(**data_config, is_training=False)

# Total number of parameter elements, reported in millions below
param_count = sum(p.numel() for p in dinov3.parameters())

print(f"✅ Model loaded: {model_name}")
print("📊 Model stats:")
print(f"   - Parameters: {param_count / 1e6:.1f}M")
print(f"   - Input size: {data_config['input_size']}")
print(f"   - Device: {device}")

### Get Data

Now we get some images!

In [None]:
from dl_cv_lectures.data import (
    cats_vs_dogs,
)
from dl_cv_lectures.data.image_folder import ImageFolder

In [None]:
# Fetch the cats-vs-dogs dataset. Presumably it is stored below DATA_PATH (the
# next cell reads DATA_PATH/cats_vs_dogs/PetImages); whether an existing copy
# is reused is not visible here — TODO confirm in dl_cv_lectures.data.
cats_vs_dogs.download(DATA_PATH)

In [None]:
image_root_path = DATA_PATH / "cats_vs_dogs" / "PetImages"
ds = ImageFolder(image_root_path)

# Each bare expression below is shown in the cell output because
# ast_node_interactivity was set to "all" at the top of the notebook:
# the class names, the first sample, and the first sample's image.
ds.classes
ds[0]
ds[0]["image"]

### Let's calculate Image Embeddings


Let's select a random sample of images.

In [None]:
import random

# Draw a reproducible random subset of the dataset
n_samples = 200  # upper bound on the number of images to draw
random.seed(123)  # fixed seed so every run draws the same subset

# Never ask for more images than the dataset holds
sample_size = min(n_samples, len(ds))
random_indices = random.sample(range(len(ds)), sample_size)

# Materialize the drawn samples and keep their labels alongside
sampled_images = [ds[idx] for idx in random_indices]
sampled_labels = [sample["label"] for sample in sampled_images]

print(f"Selected {len(sampled_images)} random images")
print(f"Classes: {sampled_labels[:5]}")  # labels of the first 5 samples

Now we calculate the embeddings.

In [None]:
# Compute one global DINOv3 embedding per sampled image
embedding_chunks = []
label_values = []

with torch.no_grad():  # inference only, no gradients needed
    for sample in sampled_images:
        # Apply the model's preprocessing, add a batch dimension, move to device
        img_tensor = transforms(sample["image"]).unsqueeze(0).to(device)

        # Forward pass of the head-less model (created with num_classes=0)
        embedding = dinov3(img_tensor)

        # Collect results on the CPU
        embedding_chunks.append(embedding.cpu())
        label_values.append(sample["label"])

# Concatenate the per-image results along dim 0 into a single tensor
embeddings = torch.cat(embedding_chunks, dim=0)
labels = torch.tensor(label_values)

print("✅ Embeddings calculated!")
print(f"   - Shape: {embeddings.shape}")
print(f"   - Labels shape: {labels.shape}")

### Analyse Embeddings

In [None]:
from sklearn.manifold import TSNE

# Project the embeddings to 2 dimensions for plotting; the fixed random_state
# makes the layout repeatable across runs.
tsne = TSNE(
    n_components=2,
    perplexity=30,
    max_iter=1000,
    random_state=123,
)
embeddings_2d = tsne.fit_transform(embeddings.numpy())

print("✅ t-SNE calculated!")
print(f"   - Original shape: {embeddings.shape}")
print(f"   - Reduced shape: {embeddings_2d.shape}")

We can now inspect how the images cluster. Let's plot the 2D t-SNE projection computed above.

In [None]:
import seaborn as sns

fig, ax = plt.subplots(figsize=(10, 8))

# Unlabeled view: every sampled image is one point of the 2D t-SNE projection
ax = sns.scatterplot(
    x=embeddings_2d[:, 0],
    y=embeddings_2d[:, 1],
    alpha=0.7,
    s=50,
    ax=ax,
)

# Assigning to `_` keeps the return value of `set` out of the cell output
_ = ax.set(
    title="t-SNE visualization of DINOv3 Image Embeddings",
    xlabel="t-SNE dimension 1",
    ylabel="t-SNE dimension 2",
)

**Question**: What do you observe?

In [None]:
fig, ax = plt.subplots(figsize=(10, 8))

# Same projection as before, now colored by the label of each sampled image
ax = sns.scatterplot(
    x=embeddings_2d[:, 0],
    y=embeddings_2d[:, 1],
    hue=sampled_labels,
    alpha=0.7,
    s=50,
    ax=ax,
)

# Assigning to `_` keeps the return value of `set` out of the cell output
_ = ax.set(
    title="t-SNE visualization of DINOv3 Image Embeddings",
    xlabel="t-SNE dimension 1",
    ylabel="t-SNE dimension 2",
)