<a href="https://colab.research.google.com/github/pgurazada/explore-dinov2/blob/main/snacks_classification_dinov2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Introduction

In this exercise, we train a classification model that uses the feature extraction capabilities of [Dino V2](https://arxiv.org/pdf/2304.07193.pdf).

# Imports

In [1]:
!pip install -q datasets flaml

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m3.2 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m295.2/295.2 kB[0m [31m20.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m11.8 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m15.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import os
import torch

import numpy as np
import torchvision.transforms as T

from PIL import Image
from tqdm import tqdm
from datasets import load_dataset
from flaml import AutoML

In [3]:
!pip show torch torchvision

Name: torch
Version: 2.1.0+cu118
Summary: Tensors and Dynamic neural networks in Python with strong GPU acceleration
Home-page: https://pytorch.org/
Author: PyTorch Team
Author-email: packages@pytorch.org
License: BSD-3
Location: /usr/local/lib/python3.10/dist-packages
Requires: filelock, fsspec, jinja2, networkx, sympy, triton, typing-extensions
Required-by: fastai, torchaudio, torchdata, torchtext, torchvision
---
Name: torchvision
Version: 0.16.0+cu118
Summary: image and video datasets and models for torch deep learning
Home-page: https://github.com/pytorch/vision
Author: PyTorch Core Team
Author-email: soumith@pytorch.org
License: BSD
Location: /usr/local/lib/python3.10/dist-packages
Requires: numpy, pillow, requests, torch
Required-by: fastai


# Data

In [4]:
snacks_ds = load_dataset("Matthijs/snacks")

Downloading builder script:   0%|          | 0.00/3.50k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/1.69k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/110M [00:00<?, ?B/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

# Model for Embeddings

In [5]:
dinov2_vit14 = torch.hub.load("facebookresearch/dinov2", "dinov2_vitb14")

Downloading: "https://github.com/facebookresearch/dinov2/zipball/main" to /root/.cache/torch/hub/main.zip
Downloading: "https://dl.fbaipublicfiles.com/dinov2/dinov2_vitb14/dinov2_vitb14_pretrain.pth" to /root/.cache/torch/hub/checkpoints/dinov2_vitb14_pretrain.pth
100%|██████████| 330M/330M [00:01<00:00, 238MB/s]


In [6]:
device = torch.device('cuda' if torch.cuda.is_available() else "cpu")
dinov2_vit14.to(device)

DinoVisionTransformer(
  (patch_embed): PatchEmbed(
    (proj): Conv2d(3, 768, kernel_size=(14, 14), stride=(14, 14))
    (norm): Identity()
  )
  (blocks): ModuleList(
    (0-11): 12 x NestedTensorBlock(
      (norm1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
      (attn): MemEffAttention(
        (qkv): Linear(in_features=768, out_features=2304, bias=True)
        (attn_drop): Dropout(p=0.0, inplace=False)
        (proj): Linear(in_features=768, out_features=768, bias=True)
        (proj_drop): Dropout(p=0.0, inplace=False)
      )
      (ls1): LayerScale()
      (drop_path1): Identity()
      (norm2): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
      (mlp): Mlp(
        (fc1): Linear(in_features=768, out_features=3072, bias=True)
        (act): GELU(approximate='none')
        (fc2): Linear(in_features=3072, out_features=768, bias=True)
        (drop): Dropout(p=0.0, inplace=False)
      )
      (ls2): LayerScale()
      (drop_path2): Identity()
    )
  )
  (n

In [7]:
transform_image = T.Compose(
    [
        T.ToTensor(),
        T.Resize(244, antialias=True),
        T.CenterCrop(224),
        T.Normalize([0.5], [0.5])
    ]
)

In [8]:
def load_image(img: str) -> torch.Tensor:
    """
    Load an image and return a tensor that can be used as an input to DINOv2.
    """

    transformed_img = transform_image(img)[:3].unsqueeze(0)

    return transformed_img

In [9]:
def compute_embeddings(files) -> dict:
    """
    Create an index that contains all of the images in the specified list of files.
    """
    all_embeddings = {}
    bad_image_indices = []

    with torch.no_grad():
      for i, file in enumerate(tqdm(files)):
        try:
            embeddings = dinov2_vit14(load_image(file['image']).to(device))
            all_embeddings[i] = np.array(embeddings[0].cpu().numpy()).reshape(1, -1).tolist()
        except Exception as e:
            bad_image_indices.append(i)
            continue

    return all_embeddings, bad_image_indices

In [10]:
train_embeddings, bad_train_image_indices = compute_embeddings(snacks_ds['train'])

100%|██████████| 4838/4838 [01:57<00:00, 41.25it/s]


In [11]:
len(train_embeddings)

4838

In [12]:
len(bad_train_image_indices)

0

In [14]:
test_embeddings, bad_test_image_indices = compute_embeddings(snacks_ds['test'])

100%|██████████| 952/952 [00:24<00:00, 39.34it/s]


In [15]:
len(test_embeddings)

952

In [16]:
len(bad_test_image_indices)

0

In [19]:
ytrain = np.array([file['label'] for file in snacks_ds['train']])
train_embedding_list = list(train_embeddings.values())
Xtrain = np.array(train_embedding_list).reshape(-1, dinov2_vit14.embed_dim)

In [20]:
ytest = np.array([file['label'] for file in snacks_ds['test']])
test_embedding_list = list(test_embeddings.values())
Xtest = np.array(test_embedding_list).reshape(-1, dinov2_vit14.embed_dim)

# Model for Classification

In [21]:
automl = AutoML()

In [22]:
automl.fit(
    X_train=Xtrain, y_train=ytrain,
    time_budget=240,
    log_file_name='snacks.log',
    task='classification',
    metric='accuracy',
    split_ratio=.3
)

[flaml.automl.logger: 12-13 16:00:14] {1679} INFO - task = classification
[flaml.automl.logger: 12-13 16:00:14] {1690} INFO - Evaluation method: holdout
[flaml.automl.logger: 12-13 16:00:14] {1788} INFO - Minimizing error metric: 1-accuracy
[flaml.automl.logger: 12-13 16:00:14] {1900} INFO - List of ML learners in AutoML Run: ['lgbm', 'rf', 'xgboost', 'extra_tree', 'xgb_limitdepth', 'lrl1']
[flaml.automl.logger: 12-13 16:00:14] {2218} INFO - iteration 0, current learner lgbm
[flaml.automl.logger: 12-13 16:00:21] {2344} INFO - Estimated sufficient time budget=67026s. Estimated necessary time budget=1545s.
[flaml.automl.logger: 12-13 16:00:21] {2391} INFO -  at 6.9s,	estimator lgbm's best error=0.2217,	best estimator lgbm's best error=0.2217
[flaml.automl.logger: 12-13 16:00:21] {2218} INFO - iteration 1, current learner lgbm
[flaml.automl.logger: 12-13 16:00:24] {2391} INFO -  at 10.0s,	estimator lgbm's best error=0.2217,	best estimator lgbm's best error=0.2217
[flaml.automl.logger: 12-

In [23]:
print('Best ML leaner:', automl.best_estimator)
print('Best hyperparmeter config:', automl.best_config)
print(f'Best accuracy on validation data: {(1-automl.best_loss):.3f}')
print(f'Training duration of best run: {(automl.best_config_train_time):.4f} s')

Best ML leaner: lgbm
Best hyperparmeter config: {'n_estimators': 57, 'num_leaves': 13, 'min_child_samples': 9, 'learning_rate': 0.15922418945050276, 'log_max_bin': 10, 'colsample_bytree': 0.8345075630938922, 'reg_alpha': 0.0009765625, 'reg_lambda': 1.6702305601383631}
Best accuracy on validation data: 0.926
Training duration of best run: 168.2140 s


In [24]:
automl.model.estimator

In [25]:
ypred = automl.predict(Xtest)

In [26]:
(ypred == ytest).mean()

0.9117647058823529