<a href="https://colab.research.google.com/github/mystakhs/test-Self-Supervised-Learning/blob/main/DINOv2_CIFAR10.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# DINOv2の動作確認
## 1. 必要パッケージインストール

In [None]:
!pip install timm



## 2. インポート

In [None]:
# 基本パッケージ
import torch
import torch.nn as nn
import torch.optim as optim
import numpy as np

# 画像処理・データセット関連
from torchvision import datasets, transforms
from torch.utils.data import Dataset, DataLoader

# 評価用
from sklearn.metrics import accuracy_score, confusion_matrix, ConfusionMatrixDisplay
from sklearn.manifold import TSNE
from sklearn.metrics.pairwise import cosine_similarity as sklearn_cosine_similarity
from collections import Counter

# 可視化・ログ
import matplotlib.pyplot as plt
import pandas as pd

# DINOv2のBackbone用
import timm

# デバイス設定
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')


## 3. DINOv2に合わせたモデル定義
※DINOv2の事前学習済みBackboneの出力をそのまま特徴量として使うため、SimCLRのときのような別途のProjectorクラスは不要。

ここではtimmから**vit_small_patch14_dinov2.lvd142m**（torch.hubでの名称は dinov2_vits14）をロードして特徴抽出専用に使用。

In [None]:
# DINOv2 Feature Extractor
class DINOv2Extractor(nn.Module):
    """Wrap a pretrained timm DINOv2 ViT so that calling it yields backbone features.

    Args:
        model_name: Either a short alias listed in ``_ALIASES`` (for example
            ``"vit_small"``) or any model name accepted by
            ``timm.create_model``. The default resolves to
            ``vit_small_patch14_dinov2.lvd142m``, the checkpoint this class
            previously hard-coded regardless of the argument.
        out_dim: Stored as ``self.out_dim`` for callers that want to read it.
            It is not checked against the backbone's real feature width.
    """

    # Short alias -> timm model name. Unknown names are handed to timm as-is.
    _ALIASES = {
        "vit_small": "vit_small_patch14_dinov2.lvd142m",
        "vit_base": "vit_base_patch14_dinov2.lvd142m",
        "vit_large": "vit_large_patch14_dinov2.lvd142m",
        "vit_giant": "vit_giant_patch14_dinov2.lvd142m",
    }

    def __init__(self, model_name="vit_small", out_dim=384):
        super().__init__()
        timm_name = self._ALIASES.get(model_name, model_name)
        self.backbone = timm.create_model(timm_name, pretrained=True)
        self.out_dim = out_dim

    def forward(self, x):
        """Return ``self.backbone.forward_features(x)`` unchanged.

        NOTE(review): for timm ViTs this is presumably the full token
        sequence (batch, tokens, dim) rather than a pooled vector — confirm
        before relying on the shape.
        """
        return self.backbone.forward_features(x)


## 4. データセット
※SimCLR時とほぼ同様。ただし、DINOv2は解像度が最低でも224x224推奨（このノートブックでは、下のデータ拡張セルで518x518にリサイズしている）。

In [None]:
# DINOv2用データセット（CIFAR10 224x224リサイズ）
class DINOv2Dataset(Dataset):
    """CIFAR-10 wrapper that applies ``transform`` to each image on access.

    Args:
        root: Directory passed to ``datasets.CIFAR10`` (data is downloaded
            there because ``download=True``).
        transform: Callable applied to every image in ``__getitem__``.
        train: Forwarded to ``datasets.CIFAR10`` to select the split.
    """

    def __init__(self, root, transform, train=True):
        self.transform = transform
        self.dataset = datasets.CIFAR10(root=root, train=train, download=True)

    def __len__(self):
        """Return the number of samples in the wrapped dataset."""
        return len(self.dataset)

    def __getitem__(self, idx):
        """Return ``(transformed_image, label)`` for sample ``idx``."""
        sample, target = self.dataset[idx]
        return self.transform(sample), target


## 5. データ拡張・ローダー


In [None]:
# Preprocessing for feature extraction / k-NN evaluation.
# The features of a frozen, pretrained backbone are compared directly, so the
# pipeline is kept deterministic: random flips, colour jitter and grayscale
# would only add noise to both the reference (train) and query (val) features.
transform = transforms.Compose([
    transforms.Resize(518),
    transforms.CenterCrop(518),
    transforms.ToTensor(),
    # ImageNet mean/std, which is what the timm DINOv2 checkpoints declare in
    # their pretrained config (NOTE(review): confirm against the model card).
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225]),
])

# A 518x518 batch of 128 images is large on both the host (worker prefetch
# queues) and the accelerator; a smaller batch lowers peak memory without
# changing the extracted features.
FEATURE_BATCH_SIZE = 32

# Data loaders
train_dataset = DINOv2Dataset(root='./data', transform=transform, train=True)
train_loader = DataLoader(train_dataset, batch_size=FEATURE_BATCH_SIZE, shuffle=True, num_workers=2)

val_dataset = DINOv2Dataset(root='./data', transform=transform, train=False)
val_loader = DataLoader(val_dataset, batch_size=FEATURE_BATCH_SIZE, shuffle=False, num_workers=2)


## 6. 事前定義された損失関数（DINOv2ではContrastive不要）
今回は、DINOv2の特徴を直接k-NN評価。
そのため、自己教師あり事前学習済みモデルをそのまま使い、損失関数不要。

## 7. 特徴量抽出・k-NN

In [None]:
# 特徴量抽出
def feature_extraction(model, data_loader):
    """Run ``model`` over ``data_loader`` and collect one feature row per sample.

    Args:
        model: ``torch.nn.Module`` called as ``model(x)``. It is switched to
            eval mode. Inputs are moved to the device of its first parameter;
            a parameter-free model receives the batches where they already are.
        data_loader: Iterable yielding ``(x, y)`` pairs of tensors.

    Returns:
        ``(features, labels)`` as NumPy arrays, concatenated over all batches.
        When the model output is 3-D, only index 0 along dimension 1 is kept
        (treated as the CLS token).

    Raises:
        RuntimeError: From ``torch.cat`` when ``data_loader`` yields nothing.
    """
    model.eval()

    # Follow the model's own placement instead of a module-level `device`
    # global, so the function works wherever the model happens to live.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else None

    all_features = []
    all_labels = []

    with torch.no_grad():
        for x, y in data_loader:
            if target_device is not None:
                x = x.to(target_device)

            features = model(x)

            if features.ndim == 3:
                # Keep only the first token. `.contiguous()` copies the slice,
                # so the full token tensor can be freed; on CPU `.cpu()` is a
                # no-op and the bare view would keep every batch's tokens alive.
                features = features[:, 0].contiguous()

            all_features.append(features.cpu())
            # Labels are never used on the device, so they are not sent there.
            all_labels.append(y.cpu())

    features = torch.cat(all_features, dim=0).numpy()
    labels = torch.cat(all_labels, dim=0).numpy()

    return features, labels


# k-NN分類（コサイン類似度）
def knn_cosine(train_features, train_labels, val_features, k=200, chunk_size=256):
    """Classify validation rows by majority vote among cosine nearest neighbours.

    Args:
        train_features: 2-D array-like, one reference feature vector per row.
        train_labels: 1-D array-like with one label per training row.
        val_features: 2-D array-like of query feature vectors.
        k: Number of neighbours that vote; clamped to the training-set size.
        chunk_size: Number of validation rows scored at a time. Only a
            ``chunk_size x n_train`` similarity block is held in memory
            instead of the full ``n_val x n_train`` matrix.

    Returns:
        1-D NumPy array with one predicted label per validation row. When
        several labels tie on votes, the one whose closest neighbour ranks
        first wins.

    Raises:
        ValueError: If ``k`` or ``chunk_size`` is below 1, the training set
            is empty, or features and labels disagree in length.
    """
    if k < 1:
        raise ValueError("k must be at least 1")
    if chunk_size < 1:
        raise ValueError("chunk_size must be at least 1")

    train_features = np.asarray(train_features)
    train_labels = np.asarray(train_labels)
    val_features = np.asarray(val_features)

    n_train = train_features.shape[0]
    if n_train == 0:
        raise ValueError("train_features must contain at least one row")
    if train_labels.shape[0] != n_train:
        raise ValueError("train_features and train_labels differ in length")

    k = min(k, n_train)
    train_unit = _unit_rows(train_features)

    val_label_pred = []
    for start in range(0, val_features.shape[0], chunk_size):
        val_unit = _unit_rows(val_features[start:start + chunk_size])
        # Dot products of unit vectors are cosine similarities.
        sims = val_unit @ train_unit.T

        if k < n_train:
            # Partial selection of the k best columns; avoids a full sort per row.
            candidates = np.argpartition(-sims, k - 1, axis=1)[:, :k]
        else:
            candidates = np.tile(np.arange(n_train), (sims.shape[0], 1))

        # Order only the k candidates by descending similarity so that the
        # vote tie-break (first label encountered) favours the nearest one.
        cand_sims = np.take_along_axis(sims, candidates, axis=1)
        order = np.argsort(-cand_sims, axis=1)
        top_k = np.take_along_axis(candidates, order, axis=1)

        for neighbour_labels in train_labels[top_k]:
            val_label_pred.append(Counter(neighbour_labels).most_common(1)[0][0])

    return np.array(val_label_pred, dtype=train_labels.dtype)


def _unit_rows(matrix):
    """Return ``matrix`` with each row scaled to unit L2 norm (zero rows stay zero)."""
    norms = np.linalg.norm(matrix, axis=1, keepdims=True)
    norms[norms == 0] = 1
    return matrix / norms


## 8. モデル・評価・可視化

In [None]:
# Model: pretrained DINOv2 backbone used as a frozen feature extractor.
model = DINOv2Extractor().to(device)

# Feature extraction for the reference (train) and query (val) sets.
train_features, train_labels = feature_extraction(model, train_loader)
val_features, val_labels = feature_extraction(model, val_loader)

# k-NN classification on the extracted features.
predictions = knn_cosine(train_features, train_labels, val_features, k=200)
knn_acc = accuracy_score(val_labels, predictions)
print(f'k-NN Validation Accuracy: {knn_acc:.2f}')

# Confusion matrix. `labels=` pins the matrix to one row/column per class so
# it always matches `display_labels`, even if a class never shows up.
class_names = val_dataset.dataset.classes
cm = confusion_matrix(val_labels, predictions, labels=np.arange(len(class_names)))
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_names)
disp.plot(xticks_rotation='vertical')
plt.title("Confusion Matrix")
plt.show()

# t-SNE visualisation of the validation features.
tsne = TSNE(n_components=2, random_state=0, perplexity=30)
val_features_2d = tsne.fit_transform(val_features)

plt.figure(figsize=(8, 8))
scatter = plt.scatter(val_features_2d[:, 0], val_features_2d[:, 1], c=val_labels, cmap='tab10', alpha=0.7)
# num=None asks for one legend entry per distinct label value rather than
# letting matplotlib pick tick-like bins; the entries are then named with the
# class names instead of bare integers.
legend_handles, _ = scatter.legend_elements(num=None)
legend_names = [class_names[int(label)] for label in np.unique(val_labels)]
plt.legend(legend_handles, legend_names, title="Classes", bbox_to_anchor=(1.05, 1))
plt.title("t-SNE Visualization")
plt.grid(True)
plt.show()


上記は、518x518入力・バッチサイズ128の設定で実行したところ、feature_extraction()の途中でメモリ不足になりクラッシュした。対策としては、バッチサイズや入力解像度を下げることが考えられる。

# roboflowでの実装例

In [None]:
!pip install torch numpy opencv-python scikit-learn roboflow

Collecting roboflow
  Downloading roboflow-1.1.61-py3-none-any.whl.metadata (9.7 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collectin

In [None]:
import numpy as np
import torch
import torchvision.transforms as T
from PIL import Image


In [None]:
# Load the DINOv2 ViT-S/14 backbone through torch.hub (downloads on first use).
dinov2_vits14 = torch.hub.load("facebookresearch/dinov2", "dinov2_vits14")
device = torch.device('cuda' if torch.cuda.is_available() else "cpu")
dinov2_vits14.to(device)
# The model is only used for inference below, so put it in eval mode
# explicitly rather than relying on the mode torch.hub returns it in.
dinov2_vits14.eval()

Downloading: "https://github.com/facebookresearch/dinov2/zipball/main" to /root/.cache/torch/hub/main.zip
Downloading: "https://dl.fbaipublicfiles.com/dinov2/dinov2_vits14/dinov2_vits14_pretrain.pth" to /root/.cache/torch/hub/checkpoints/dinov2_vits14_pretrain.pth
100%|██████████| 84.2M/84.2M [00:00<00:00, 92.2MB/s]


DinoVisionTransformer(
  (patch_embed): PatchEmbed(
    (proj): Conv2d(3, 384, kernel_size=(14, 14), stride=(14, 14))
    (norm): Identity()
  )
  (blocks): ModuleList(
    (0-11): 12 x NestedTensorBlock(
      (norm1): LayerNorm((384,), eps=1e-06, elementwise_affine=True)
      (attn): MemEffAttention(
        (qkv): Linear(in_features=384, out_features=1152, bias=True)
        (attn_drop): Dropout(p=0.0, inplace=False)
        (proj): Linear(in_features=384, out_features=384, bias=True)
        (proj_drop): Dropout(p=0.0, inplace=False)
      )
      (ls1): LayerScale()
      (drop_path1): Identity()
      (norm2): LayerNorm((384,), eps=1e-06, elementwise_affine=True)
      (mlp): Mlp(
        (fc1): Linear(in_features=384, out_features=1536, bias=True)
        (act): GELU(approximate='none')
        (fc2): Linear(in_features=1536, out_features=384, bias=True)
        (drop): Dropout(p=0.0, inplace=False)
      )
      (ls2): LayerScale()
      (drop_path2): Identity()
    )
  )
  (n

In [None]:
# Preprocessing applied to every image before it is fed to the model:
# tensor conversion, resize, centre crop to 224, then normalisation.
# NOTE(review): 244 may be a typo for the more common 256 — confirm.
# NOTE(review): the single-value mean/std applies 0.5 to every channel;
# ImageNet statistics may be the better match for DINOv2 — confirm.
_preprocess_steps = [
    T.ToTensor(),
    T.Resize(244),
    T.CenterCrop(224),
    T.Normalize([0.5], [0.5]),
]
transform_image = T.Compose(_preprocess_steps)


In [None]:
def load_image(img_path):
    """Load an image file and return it as a preprocessed batch of one.

    The image is converted to RGB before preprocessing, so grayscale,
    palette and alpha-channel files all produce three channels. The file
    handle is closed as soon as the pixel data has been read.

    Args:
        img_path: Path (or file object) accepted by ``PIL.Image.open``.

    Returns:
        The result of ``transform_image`` with a leading batch dimension
        added via ``unsqueeze(0)``.
    """
    with Image.open(img_path) as img:
        # convert() reads the pixel data and returns an independent image,
        # so it stays usable after the file is closed.
        rgb_img = img.convert("RGB")
    return transform_image(rgb_img).unsqueeze(0)

def compute_embeddings(files):
    """Embed every file in ``files`` with the module-level ``dinov2_vits14``.

    Args:
        files: Iterable of image paths; each is loaded with ``load_image``.

    Returns:
        Dict mapping each entry of ``files`` to its model output, flattened
        to a single row and converted to a nested Python list.
    """
    def _embed(path):
        batch = load_image(path).to(device)
        output = dinov2_vits14(batch)
        return output.cpu().numpy().reshape(1, -1).tolist()

    # Gradients are not needed for inference.
    with torch.no_grad():
        return {path: _embed(path) for path in files}


In [None]:
import torch
from transformers import AutoImageProcessor, AutoModel
from PIL import Image
import requests

# Fetch the example image. A timeout keeps the cell from hanging forever and
# raise_for_status() surfaces HTTP errors instead of handing an error page to PIL.
url = 'http://images.cocodataset.org/val2017/000000039769.jpg'
response = requests.get(url, stream=True, timeout=30)
response.raise_for_status()
image = Image.open(response.raw)

# Load the image processor and the model.
processor = AutoImageProcessor.from_pretrained('facebook/dinov2-base')
model = AutoModel.from_pretrained('facebook/dinov2-base')

# Preprocess and run inference. No gradients are needed, so skip building
# the autograd graph.
inputs = processor(images=image, return_tensors="pt")
with torch.no_grad():
    outputs = model(**inputs)
# Positional access is used so this keeps working when the model returns a
# plain tuple (presumably the same tensor as `outputs.last_hidden_state`).
last_hidden_states = outputs[0]


preprocessor_config.json:   0%|          | 0.00/436 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


config.json:   0%|          | 0.00/548 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/346M [00:00<?, ?B/s]

In [None]:
# We have to force return_dict=False for tracing
model.config.return_dict = False
last_hidden_states = outputs[0]

# Trace the model on the example pixel values, then run the traced module on
# the same input so its output can be compared with the eager result.
pixel_values = inputs.pixel_values
with torch.no_grad():
    traced_model = torch.jit.trace(model, [pixel_values])
    traced_outputs = traced_model(pixel_values)

# Largest absolute element-wise difference between eager and traced outputs.
max_abs_diff = (last_hidden_states - traced_outputs[0]).abs().max()
print(max_abs_diff)

`loss_type=None` was set in the config but it is unrecognised.Using the default loss: `ForCausalLMLoss`.
  if num_channels != self.num_channels:


tensor(0., grad_fn=<MaxBackward1>)


In [None]:
# Largest absolute element-wise difference between eager and traced outputs.
eager_minus_traced = last_hidden_states - traced_outputs[0]
print(eager_minus_traced.abs().max())

tensor(0., grad_fn=<MaxBackward1>)


In [None]:
!pip install transformers datasets

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.5.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.2/491.2 kB[0m [31m12.9 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.2 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2024.12.0-py3-none-any.w

In [None]:
from transformers import AutoImageProcessor, Dinov2ForImageClassification
import torch
from datasets import load_dataset

# NOTE(review): trust_remote_code=True runs the loading script shipped with
# the dataset repository; keep it only for sources you trust.
dataset = load_dataset("huggingface/cats-image", trust_remote_code=True)
# Select the row first, then the column: this decodes just the one image,
# whereas dataset["test"]["image"][0] decodes the whole image column first.
image = dataset["test"][0]["image"]

image_processor = AutoImageProcessor.from_pretrained("facebook/dinov2-small-imagenet1k-1-layer")
model = Dinov2ForImageClassification.from_pretrained("facebook/dinov2-small-imagenet1k-1-layer")

inputs = image_processor(image, return_tensors="pt")

# Inference only, so no gradients are tracked.
with torch.no_grad():
    logits = model(**inputs).logits

# model predicts one of the 1000 ImageNet classes
predicted_label = logits.argmax(-1).item()
print(model.config.id2label[predicted_label])

cats-image.py:   0%|          | 0.00/2.56k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/173k [00:00<?, ?B/s]

Generating test split: 0 examples [00:00, ? examples/s]

preprocessor_config.json:   0%|          | 0.00/436 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/58.3k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/91.3M [00:00<?, ?B/s]

tabby, tabby cat
