<a href="https://colab.research.google.com/github/obete/ClassifierDINOv3/blob/main/ClassifierDINOv3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **DINOv3 as base with a classifier head**


In [None]:
# Mount Google Drive into the Colab runtime and record the project folder
# under My Drive that the dataset cells read from.
import os
from google.colab import drive
drive.mount('/content/drive')  # Drive contents are then reachable below /content/drive
# Project root on Drive. The value ends with a trailing slash, so build
# sub-paths with os.path.join (or plain concatenation) rather than adding
# another "/" separator.
DRIVE_ROOT = '/content/drive/MyDrive/Colab Notebooks/DINOv3_Classifier/'

In [None]:
from torchvision import transforms

# Side length (pixels) of the square images produced by both pipelines.
# Original note: 224 for ViT-Small/Base, 384 for ViT-Large -- confirm against
# the checkpoint that is actually loaded.
img_size = 224

# Per-channel statistics used by Normalize in both pipelines
# (these look like the commonly used ImageNet RGB mean/std values).
_NORM_MEAN = (0.485, 0.456, 0.406)
_NORM_STD = (0.229, 0.224, 0.225)

# Training pipeline: random augmentation, then tensor conversion + normalization.
_train_steps = [
    transforms.RandomResizedCrop(img_size),   # random crop, resized to img_size
    transforms.RandomHorizontalFlip(),        # left-right flip
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
    transforms.ToTensor(),
    transforms.Normalize(mean=_NORM_MEAN, std=_NORM_STD),
]
train_transform = transforms.Compose(_train_steps)

# Validation pipeline: deterministic resize only, same normalization.
_val_steps = [
    transforms.Resize((img_size, img_size)),
    transforms.ToTensor(),
    transforms.Normalize(mean=_NORM_MEAN, std=_NORM_STD),
]
val_transform = transforms.Compose(_val_steps)



In [None]:
import os

from torchvision.datasets import ImageFolder
from torch.utils.data import DataLoader

# os.path.join works whether or not DRIVE_ROOT ends with a slash, so the
# resulting paths never contain a doubled "//" separator.
train_dir = os.path.join(DRIVE_ROOT, 'train_set')
val_dir = os.path.join(DRIVE_ROOT, 'val_set')

# ImageFolder assigns integer labels from the sub-folder names of each root.
train_dataset = ImageFolder(root=train_dir, transform=train_transform)
val_dataset = ImageFolder(root=val_dir, transform=val_transform)

# Each split derives its own folder -> index mapping. If the two splits do not
# contain the same class folders, validation labels would no longer line up
# with the classifier outputs, so fail loudly instead of reporting a
# meaningless validation accuracy.
if val_dataset.class_to_idx != train_dataset.class_to_idx:
    raise ValueError(
        "train_set and val_set must contain the same class folders; "
        f"train classes: {train_dataset.classes}, val classes: {val_dataset.classes}"
    )

# Shuffle only the training data; validation order stays fixed.
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=32, shuffle=False)

# Class names and the number of output classes used to size the classifier head
print(train_dataset.classes)
num_classes = len(train_dataset.classes)
print(num_classes)

In [None]:
#@title Using DINOv3

!pip install -U transformers

Local Inference on GPU
Model page: https://huggingface.co/facebook/dinov3-vith16plus-pretrain-lvd1689m

⚠️ If the generated code snippets do not work, please open an issue on the model repo and/or on huggingface.js 🙏

The model you are trying to use is gated. Please make sure you have access to it by visiting the model page. To run inference, either set HF_TOKEN in your environment variables / Secrets or run the following cell to log in. 🤗

In [None]:
# Authenticate with the Hugging Face Hub; the checkpoint used by this notebook
# is gated, so downloading it requires an account that has been granted access.
from huggingface_hub import login
# NOTE(review): new_session=False presumably reuses an already stored token
# instead of prompting again -- confirm against the huggingface_hub docs.
login(new_session=False)

In [None]:
# @title Load model directly
from transformers import AutoImageProcessor, AutoModel

# Single source of truth for the checkpoint so the image processor and the
# backbone can never be loaded from two different model ids by accident.
MODEL_ID = "facebook/dinov3-vith16plus-pretrain-lvd1689m"

# NOTE(review): `processor` is not referenced by the other cells visible in
# this notebook (preprocessing is done with torchvision transforms); it is
# still loaded here so existing references keep working.
processor = AutoImageProcessor.from_pretrained(MODEL_ID)
backbone = AutoModel.from_pretrained(MODEL_ID)

In [None]:
#@title Freeze backbone → use DINOv3 as a fixed feature extractor.
# Turn off gradient tracking for every backbone weight in one call, so only
# modules added on top of it can be trained.
backbone.requires_grad_(False)


In [None]:
#@title Custom Classification head
import torch
import torch.nn as nn

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

class DinoClassifier(nn.Module):
    """Backbone followed by a LayerNorm + Linear classification head.

    Args:
        backbone: Module exposing ``config.hidden_size`` and whose call result
            has a ``last_hidden_state`` attribute.
        num_classes: Number of output logits produced by the head.
    """

    def __init__(self, backbone, num_classes):
        super().__init__()
        self.backbone = backbone
        embed_dim = backbone.config.hidden_size
        # Keep the Sequential layout (index 0 = norm, index 1 = linear) so the
        # state_dict keys stay "head.0.*" / "head.1.*".
        norm_layer = nn.LayerNorm(embed_dim)
        logits_layer = nn.Linear(embed_dim, num_classes)
        self.head = nn.Sequential(norm_layer, logits_layer)

    def forward(self, x):
        """Return class logits for the batch ``x``.

        The token at position 0 of the backbone's ``last_hidden_state``
        (the CLS token, per the original comment) is fed to the head.
        """
        backbone_out = self.backbone(x)
        cls_embedding = backbone_out.last_hidden_state[:, 0]
        logits = self.head(cls_embedding)
        return logits

# Attach the classification head to the backbone, then move the combined
# module onto the selected device.
model = DinoClassifier(backbone, num_classes)
model = model.to(device)

In [None]:
!pip install torchmetrics

In [None]:
import torchmetrics

# ---- Hyperparameters ----
num_epochs = 150
lr = 1e-4
best_val_acc = 0.0  # best validation accuracy seen so far

# -------------------------
# Loss, optimizer, metrics
# -------------------------
criterion = nn.CrossEntropyLoss()
# Only hand the optimizer parameters that can actually receive gradients;
# frozen weights never get a .grad and do not need optimizer bookkeeping.
trainable_params = [p for p in model.parameters() if p.requires_grad]
optimizer = torch.optim.AdamW(trainable_params, lr=lr, weight_decay=1e-4)
# Multiply the LR by 0.1 every 30 epochs
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=30, gamma=0.1)

train_acc_metric = torchmetrics.Accuracy(task="multiclass", num_classes=num_classes).to(device)
val_acc_metric   = torchmetrics.Accuracy(task="multiclass", num_classes=num_classes).to(device)

# True when no backbone weight is trainable, i.e. the backbone is used as a
# fixed feature extractor.
backbone_frozen = not any(p.requires_grad for p in model.backbone.parameters())

for epoch in range(num_epochs):  # was range(150), which ignored num_epochs
    # ---- Training ----
    model.train()
    if backbone_frozen:
        # model.train() also switches the backbone to train mode; a frozen
        # feature extractor should stay in eval mode so any train-only layers
        # it may contain remain deterministic.
        model.backbone.eval()
    train_loss = 0.0
    train_acc_metric.reset()
    for imgs, labels in train_loader:
        imgs, labels = imgs.to(device), labels.to(device)

        outputs = model(imgs)
        loss = criterion(outputs, labels)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # criterion returns the batch mean, so weight it by the batch size to
        # get a correct per-sample average over the epoch.
        train_loss += loss.item() * imgs.size(0)
        train_acc_metric.update(outputs, labels)

    train_loss /= len(train_dataset)
    train_acc = train_acc_metric.compute()

    # ---- Validation ----
    model.eval()
    val_loss = 0.0
    val_acc_metric.reset()
    with torch.no_grad():
        for imgs, labels in val_loader:
            imgs, labels = imgs.to(device), labels.to(device)
            outputs = model(imgs)
            loss = criterion(outputs, labels)

            val_loss += loss.item() * imgs.size(0)
            val_acc_metric.update(outputs, labels)

    val_loss /= len(val_dataset)
    val_acc = val_acc_metric.compute()

    # ---- Save best checkpoint ----
    # NOTE(review): the file is written to the current working directory and
    # contains the full state_dict (backbone included) -- confirm that this
    # location persists for as long as the checkpoint is needed.
    if val_acc > best_val_acc:
        best_val_acc = val_acc
        torch.save(model.state_dict(), "best_dinov3_classifier.pth")

    # ---- Scheduler step ----
    # Read the LR *before* stepping so the log shows the rate that was
    # actually used during this epoch, not the one for the next epoch.
    epoch_lr = scheduler.get_last_lr()[0]
    scheduler.step()

    # ---- Log progress ----
    print(f"Epoch {epoch+1}/{num_epochs} "
          f"| Train Loss: {train_loss:.4f}, Acc: {train_acc:.4f} "
          f"| Val Loss: {val_loss:.4f}, Acc: {val_acc:.4f} "
          f"| LR: {epoch_lr:.6f}")

# float() accepts both the initial Python float and a 0-dim tensor, so this no
# longer fails when validation accuracy never improved on the initial 0.0.
print("Training complete. Best Val Acc:", float(best_val_acc))
