<a href="https://colab.research.google.com/github/pgurazada/explore-dinov2/blob/main/painting_style_classification_dinov2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Introduction

In this exercise, we train a classification model that uses the feature extraction capabilities of [Dino V2](https://arxiv.org/pdf/2304.07193.pdf).

# Imports

In [1]:
!pip install -q datasets flaml

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m3.5 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m295.2/295.2 kB[0m [31m25.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m15.4 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m17.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import os
import torch

import numpy as np
import pandas as pd

import torchvision.transforms as T

from PIL import Image

from tqdm import tqdm

from datasets import load_dataset

from flaml import AutoML

In [3]:
!pip show torch torchvision

Name: torch
Version: 2.1.0+cu118
Summary: Tensors and Dynamic neural networks in Python with strong GPU acceleration
Home-page: https://pytorch.org/
Author: PyTorch Team
Author-email: packages@pytorch.org
License: BSD-3
Location: /usr/local/lib/python3.10/dist-packages
Requires: filelock, fsspec, jinja2, networkx, sympy, triton, typing-extensions
Required-by: fastai, torchaudio, torchdata, torchtext, torchvision
---
Name: torchvision
Version: 0.16.0+cu118
Summary: image and video datasets and models for torch deep learning
Home-page: https://github.com/pytorch/vision
Author: PyTorch Core Team
Author-email: soumith@pytorch.org
License: BSD
Location: /usr/local/lib/python3.10/dist-packages
Requires: numpy, pillow, requests, torch
Required-by: fastai


# Data

In [4]:
painting_style_ds = load_dataset(
    "keremberke/painting-style-classification",
    name="full"
)

Downloading builder script:   0%|          | 0.00/4.82k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/2.17k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/3 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/150M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/42.8M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/21.0M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/3 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [5]:
train_data = {}

for file in painting_style_ds['train']:
    train_data[file['image_file_path']] = file['labels']

In [7]:
test_data = {}

for file in painting_style_ds['test']:
    test_data[file['image_file_path']] = file['labels']

# Model for Embeddings

In [8]:
dinov2_vit14 = torch.hub.load("facebookresearch/dinov2", "dinov2_vitb14")

Downloading: "https://github.com/facebookresearch/dinov2/zipball/main" to /root/.cache/torch/hub/main.zip
Downloading: "https://dl.fbaipublicfiles.com/dinov2/dinov2_vitb14/dinov2_vitb14_pretrain.pth" to /root/.cache/torch/hub/checkpoints/dinov2_vitb14_pretrain.pth
100%|██████████| 330M/330M [00:03<00:00, 101MB/s]


In [9]:
device = torch.device('cuda' if torch.cuda.is_available() else "cpu")
dinov2_vit14.to(device)

DinoVisionTransformer(
  (patch_embed): PatchEmbed(
    (proj): Conv2d(3, 768, kernel_size=(14, 14), stride=(14, 14))
    (norm): Identity()
  )
  (blocks): ModuleList(
    (0-11): 12 x NestedTensorBlock(
      (norm1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
      (attn): MemEffAttention(
        (qkv): Linear(in_features=768, out_features=2304, bias=True)
        (attn_drop): Dropout(p=0.0, inplace=False)
        (proj): Linear(in_features=768, out_features=768, bias=True)
        (proj_drop): Dropout(p=0.0, inplace=False)
      )
      (ls1): LayerScale()
      (drop_path1): Identity()
      (norm2): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
      (mlp): Mlp(
        (fc1): Linear(in_features=768, out_features=3072, bias=True)
        (act): GELU(approximate='none')
        (fc2): Linear(in_features=3072, out_features=768, bias=True)
        (drop): Dropout(p=0.0, inplace=False)
      )
      (ls2): LayerScale()
      (drop_path2): Identity()
    )
  )
  (n

In [10]:
transform_image = T.Compose(
    [
        T.ToTensor(),
        T.Resize(244, antialias=True),
        T.CenterCrop(224),
        T.Normalize([0.5], [0.5])
    ]
)

In [11]:
def load_image(img: str) -> torch.Tensor:
    """
    Load an image and return a tensor that can be used as an input to DINOv2.
    """
    img = Image.open(img)

    transformed_img = transform_image(img)[:3].unsqueeze(0)

    return transformed_img

In [12]:
def compute_embeddings(files: list) -> dict:
    """
    Create an index that contains all of the images in the specified list of files.
    """
    all_embeddings = {}

    with torch.no_grad():
      for i, file in enumerate(tqdm(files)):
        embeddings = dinov2_vit14(load_image(file).to(device))

        all_embeddings[file] = np.array(embeddings[0].cpu().numpy()).reshape(1, -1).tolist()

    return all_embeddings

In [13]:
train_embeddings = compute_embeddings(train_data.keys())

100%|██████████| 4493/4493 [02:02<00:00, 36.55it/s]


In [15]:
test_embeddings = compute_embeddings(test_data.keys())

100%|██████████| 629/629 [00:15<00:00, 39.97it/s]


In [16]:
ytrain = np.array([train_data[file] for file in train_data.keys()])
train_embedding_list = list(train_embeddings.values())
Xtrain = np.array(train_embedding_list).reshape(-1, dinov2_vit14.embed_dim)

ytest = np.array([test_data[file] for file in test_data.keys()])
test_embedding_list = list(test_embeddings.values())
Xtest = np.array(test_embedding_list).reshape(-1, dinov2_vit14.embed_dim)

# Model for Classification

In [17]:
automl = AutoML()

In [18]:
automl.fit(
    X_train=Xtrain, y_train=ytrain,
    time_budget=240,
    log_file_name='painting_style.log',
    task='classification',
    metric='accuracy',
    split_ratio=.3
)

[flaml.automl.logger: 12-13 12:06:48] {1679} INFO - task = classification
[flaml.automl.logger: 12-13 12:06:48] {1690} INFO - Evaluation method: holdout


INFO:flaml.automl.task.generic_task:class 2 augmented from 5 to 20
INFO:flaml.automl.task.generic_task:class 5 augmented from 6 to 24
INFO:flaml.automl.task.generic_task:class 6 augmented from 13 to 26
INFO:flaml.automl.task.generic_task:class 22 augmented from 14 to 28


[flaml.automl.logger: 12-13 12:06:48] {1788} INFO - Minimizing error metric: 1-accuracy
[flaml.automl.logger: 12-13 12:06:48] {1900} INFO - List of ML learners in AutoML Run: ['lgbm', 'rf', 'xgboost', 'extra_tree', 'xgb_limitdepth', 'lrl1']
[flaml.automl.logger: 12-13 12:06:48] {2218} INFO - iteration 0, current learner lgbm
[flaml.automl.logger: 12-13 12:06:52] {2344} INFO - Estimated sufficient time budget=33567s. Estimated necessary time budget=774s.
[flaml.automl.logger: 12-13 12:06:52] {2391} INFO -  at 3.5s,	estimator lgbm's best error=0.7300,	best estimator lgbm's best error=0.7300
[flaml.automl.logger: 12-13 12:06:52] {2218} INFO - iteration 1, current learner lgbm
[flaml.automl.logger: 12-13 12:06:55] {2391} INFO -  at 7.1s,	estimator lgbm's best error=0.7300,	best estimator lgbm's best error=0.7300
[flaml.automl.logger: 12-13 12:06:55] {2218} INFO - iteration 2, current learner lgbm
[flaml.automl.logger: 12-13 12:06:57] {2391} INFO -  at 8.6s,	estimator lgbm's best error=0.66

In [19]:
print('Best ML leaner:', automl.best_estimator)
print('Best hyperparmeter config:', automl.best_config)
print(f'Best accuracy on validation data: {(1-automl.best_loss):.3f}')
print(f'Training duration of best run: {(automl.best_config_train_time):.4f} s')

Best ML leaner: lgbm
Best hyperparmeter config: {'n_estimators': 28, 'num_leaves': 13, 'min_child_samples': 9, 'learning_rate': 0.1111072858208537, 'log_max_bin': 7, 'colsample_bytree': 0.8345075630938922, 'reg_alpha': 0.006958608037974516, 'reg_lambda': 1.0522050334575102}
Best accuracy on validation data: 0.439
Training duration of best run: 23.8047 s


In [20]:
automl.model.estimator

In [21]:
ypred = automl.predict(Xtest)

In [22]:
(ypred == ytest).mean()

0.4117647058823529