<a href="https://colab.research.google.com/github/pgurazada/explore-dinov2/blob/main/indian_food_classification_dinov2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Introduction

In this exercise, we train a classification model that uses the feature extraction capabilities of [Dino V2](https://arxiv.org/pdf/2304.07193.pdf).

# Imports

In [1]:
!pip install -q datasets flaml

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m521.2/521.2 kB[0m [31m7.9 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m295.2/295.2 kB[0m [31m13.3 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m115.3/115.3 kB[0m [31m10.6 MB/s[0m eta [36m0:00:00[0m
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m11.2 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import os
import torch
import io

import numpy as np
import torchvision.transforms as T

from PIL import Image
from tqdm import tqdm
from datasets import load_dataset
from flaml import AutoML

In [3]:
!pip show torch torchvision

Name: torch
Version: 2.1.0+cu118
Summary: Tensors and Dynamic neural networks in Python with strong GPU acceleration
Home-page: https://pytorch.org/
Author: PyTorch Team
Author-email: packages@pytorch.org
License: BSD-3
Location: /usr/local/lib/python3.10/dist-packages
Requires: filelock, fsspec, jinja2, networkx, sympy, triton, typing-extensions
Required-by: fastai, torchaudio, torchdata, torchtext, torchvision
---
Name: torchvision
Version: 0.16.0+cu118
Summary: image and video datasets and models for torch deep learning
Home-page: https://github.com/pytorch/vision
Author: PyTorch Core Team
Author-email: soumith@pytorch.org
License: BSD
Location: /usr/local/lib/python3.10/dist-packages
Requires: numpy, pillow, requests, torch
Required-by: fastai


# Data

In [4]:
indian_foods_ds = load_dataset("bharat-raghunathan/indian-foods-dataset")

Downloading readme:   0%|          | 0.00/2.79k [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/281M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/261M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/147M [00:00<?, ?B/s]

Extracting data files:   0%|          | 0/2 [00:00<?, ?it/s]

Generating train split:   0%|          | 0/3809 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/961 [00:00<?, ? examples/s]

In [5]:
train_data = list(indian_foods_ds['train'])

In [6]:
test_data = list(indian_foods_ds['test'])

# Model for Embeddings

In [7]:
dinov2_vit14 = torch.hub.load("facebookresearch/dinov2", "dinov2_vitb14")

Downloading: "https://github.com/facebookresearch/dinov2/zipball/main" to /root/.cache/torch/hub/main.zip
Downloading: "https://dl.fbaipublicfiles.com/dinov2/dinov2_vitb14/dinov2_vitb14_pretrain.pth" to /root/.cache/torch/hub/checkpoints/dinov2_vitb14_pretrain.pth
100%|██████████| 330M/330M [00:01<00:00, 193MB/s]


In [8]:
device = torch.device('cuda' if torch.cuda.is_available() else "cpu")
dinov2_vit14.to(device)

DinoVisionTransformer(
  (patch_embed): PatchEmbed(
    (proj): Conv2d(3, 768, kernel_size=(14, 14), stride=(14, 14))
    (norm): Identity()
  )
  (blocks): ModuleList(
    (0-11): 12 x NestedTensorBlock(
      (norm1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
      (attn): MemEffAttention(
        (qkv): Linear(in_features=768, out_features=2304, bias=True)
        (attn_drop): Dropout(p=0.0, inplace=False)
        (proj): Linear(in_features=768, out_features=768, bias=True)
        (proj_drop): Dropout(p=0.0, inplace=False)
      )
      (ls1): LayerScale()
      (drop_path1): Identity()
      (norm2): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
      (mlp): Mlp(
        (fc1): Linear(in_features=768, out_features=3072, bias=True)
        (act): GELU(approximate='none')
        (fc2): Linear(in_features=3072, out_features=768, bias=True)
        (drop): Dropout(p=0.0, inplace=False)
      )
      (ls2): LayerScale()
      (drop_path2): Identity()
    )
  )
  (n

In [9]:
transform_image = T.Compose(
    [
        T.ToTensor(),
        T.Resize(244, antialias=True),
        T.CenterCrop(224),
        T.Normalize([0.5], [0.5])
    ]
)

In [10]:
def load_image(img: str) -> torch.Tensor:
    """
    Load an image and return a tensor that can be used as an input to DINOv2.
    """

    transformed_img = transform_image(img)[:3].unsqueeze(0)

    return transformed_img

In [18]:
def compute_embeddings(files: list) -> dict:
    """
    Create an index that contains all of the images in the specified list of files.
    """
    all_embeddings = {}
    bad_image_indices = []

    with torch.no_grad():
      for i, file in enumerate(tqdm(files)):
        try:
            embeddings = dinov2_vit14(load_image(file['image']).to(device))
            all_embeddings[i] = np.array(embeddings[0].cpu().numpy()).reshape(1, -1).tolist()
        except Exception as e:
            bad_image_indices.append(i)
            continue

    return all_embeddings, bad_image_indices

In [19]:
train_embeddings, bad_train_image_indices = compute_embeddings(train_data)

100%|██████████| 3809/3809 [01:29<00:00, 42.35it/s]


In [20]:
len(train_embeddings)

3774

In [21]:
test_embeddings, bad_test_image_indices = compute_embeddings(test_data)

100%|██████████| 961/961 [00:30<00:00, 32.00it/s]


In [22]:
len(test_embeddings)

955

In [23]:
ytrain = np.array([file['label'] for file in train_data])
ytrain_cleaned = np.delete(ytrain, bad_train_image_indices)

train_embedding_list = list(train_embeddings.values())
Xtrain = np.array(train_embedding_list).reshape(-1, dinov2_vit14.embed_dim)

In [26]:
ytest = np.array([file['label'] for file in test_data])
ytest_cleaned = np.delete(ytest, bad_test_image_indices)

test_embedding_list = list(test_embeddings.values())
Xtest = np.array(test_embedding_list).reshape(-1, dinov2_vit14.embed_dim)

# Model for Classification

In [27]:
automl = AutoML()

In [28]:
automl.fit(
    X_train=Xtrain, y_train=ytrain_cleaned,
    time_budget=240,
    log_file_name='indian-food.log',
    task='classification',
    metric='accuracy',
    split_ratio=.3
)

[flaml.automl.logger: 12-13 14:29:58] {1679} INFO - task = classification
[flaml.automl.logger: 12-13 14:29:58] {1690} INFO - Evaluation method: holdout
[flaml.automl.logger: 12-13 14:29:58] {1788} INFO - Minimizing error metric: 1-accuracy
[flaml.automl.logger: 12-13 14:29:58] {1900} INFO - List of ML learners in AutoML Run: ['lgbm', 'rf', 'xgboost', 'extra_tree', 'xgb_limitdepth', 'lrl1']
[flaml.automl.logger: 12-13 14:29:58] {2218} INFO - iteration 0, current learner lgbm
[flaml.automl.logger: 12-13 14:30:03] {2344} INFO - Estimated sufficient time budget=50068s. Estimated necessary time budget=1154s.
[flaml.automl.logger: 12-13 14:30:03] {2391} INFO -  at 5.3s,	estimator lgbm's best error=0.1400,	best estimator lgbm's best error=0.1400
[flaml.automl.logger: 12-13 14:30:03] {2218} INFO - iteration 1, current learner lgbm
[flaml.automl.logger: 12-13 14:30:05] {2391} INFO -  at 7.2s,	estimator lgbm's best error=0.1400,	best estimator lgbm's best error=0.1400
[flaml.automl.logger: 12-1

In [33]:
print('Best ML leaner:', automl.best_estimator)
print('Best hyperparmeter config:', automl.best_config)
print(f'Best accuracy on validation data: {(1-automl.best_loss):.3f}')
print(f'Training duration of best run: {(automl.best_config_train_time):.4f} s')

Best ML leaner: extra_tree
Best hyperparmeter config: {'n_estimators': 124, 'max_features': 0.12458198341734236, 'max_leaves': 999, 'criterion': 'entropy'}
Best accuracy on validation data: 0.972
Training duration of best run: 4.6179 s


In [34]:
automl.model.estimator

In [35]:
ypred = automl.predict(Xtest)

In [36]:
(ypred == ytest_cleaned).mean()

0.9727748691099476