In [1]:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
from transformers import ViTImageProcessor


In [2]:
import os
from PIL import Image
from torchvision import transforms
from torchvision.transforms import Compose, Normalize, ToTensor, Resize

In [4]:
from app.slow_classificator import ResClassifier, Classificator, VitClassifier

In [None]:
# from torchvision.models import resnet34, ResNet, resnext101_64x4d

In [5]:
class CustomImageDataset(Dataset):
    def __init__(self, root_dir, transform=None):
        self.root_dir = root_dir
        self.transform = transform
        
        self.id2label = {k: v for k, v in enumerate(sorted(os.listdir(root_dir)))}
        self.label2id = {v: k for k, v in self.id2label.items()}
        
        self.image_paths = []
        self.labels = []

        self.improcessor = ViTImageProcessor.from_pretrained('google/vit-base-patch16-224')
        
        self.size = self.improcessor.size["height"]
        self.normalize = Normalize(
            mean=self.improcessor.image_mean,
            std=self.improcessor.image_std
        )

        self._transforms = Compose([
            Resize((self.size, self.size)),
            ToTensor(),
            self.normalize
        ])

        for cls in self.id2label.values():
            cls_folder = os.path.join(root_dir, cls)
            if os.path.isdir(cls_folder):
                for img_name in os.listdir(cls_folder):
                    img_path = os.path.join(cls_folder, img_name)
                    self.image_paths.append(img_path)
                    self.labels.append(cls)
    
    def __len__(self):
        return len(self.image_paths)
    
    def __getitem__(self, idx):
        return {
            "pixel_values": self.improcessor(
                images=Image.open(self.image_paths[idx]).convert("RGB")).pixel_values[0].squeeze(), # .squeeze()
            "labels": self.label2id[self.labels[idx]]
        }


In [6]:
dataset = CustomImageDataset(root_dir="/home/user1/hack/train_data_rkn/dataset")
# train_dataloader = DataLoader(dataset, batch_size=128, shuffle=True,num_workers=4)

In [7]:
dataset[0]

{'pixel_values': array([[[-0.69411767, -0.7019608 , -0.6784314 , ...,  0.21568632,
          -0.12941176, -0.27843136],
         [-0.7254902 , -0.7254902 , -0.7176471 , ...,  0.0196079 ,
          -0.03529412, -0.1372549 ],
         [-0.7411765 , -0.7647059 , -0.77254903, ..., -0.17647058,
          -0.04313725, -0.02745098],
         ...,
         [-0.69411767, -0.84313726, -0.94509804, ..., -0.90588236,
          -0.8980392 , -0.8980392 ],
         [-0.69411767, -0.78039217, -0.92156863, ..., -0.92941177,
          -0.9372549 , -0.92941177],
         [-0.78039217, -0.79607844, -0.9137255 , ..., -0.9137255 ,
          -0.92156863, -0.92156863]],
 
        [[-0.60784316, -0.60784316, -0.5921569 , ...,  0.30196083,
          -0.03529412, -0.12941176],
         [-0.64705884, -0.64705884, -0.64705884, ...,  0.13725495,
           0.09019613,  0.01176476],
         [-0.6627451 , -0.69411767, -0.7019608 , ..., -0.03529412,
           0.12156868,  0.12156868],
         ...,
         [-0.6784

In [8]:
classifier = VitClassifier(id2label=dataset.id2label, label2id=dataset.label2id)

Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([1000]) in the checkpoint and torch.Size([106]) in the model instantiated
- classifier.weight: found shape torch.Size([1000, 768]) in the checkpoint and torch.Size([106, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [10]:
classifier.tune(dataset,
                device="cuda",
                epochs=12,
                batch_size=256,
                lr=2e-5, test_split=True,
                output_dir="./vit_v5_results")

Epoch,Training Loss,Validation Loss,Mean Average Precision
1,No log,3.654415,0.344189
2,No log,2.69404,0.608644
3,No log,1.978977,0.717972
4,No log,1.538348,0.769273
5,No log,1.274908,0.79582
6,No log,1.111773,0.811154
7,No log,1.011399,0.819799
8,No log,0.946715,0.825168
9,1.740100,0.908505,0.827045
10,1.740100,0.886341,0.82819




Epoch 8.19672131147541 - Step 500 - Loss: 1.7401




Fine-tuning complete.


In [12]:
classifier.model.config

ViTConfig {
  "_name_or_path": "google/vit-base-patch16-224",
  "architectures": [
    "ViTForImageClassification"
  ],
  "attention_probs_dropout_prob": 0.0,
  "encoder_stride": 16,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "id2label": {
    "0": ".DS_Store",
    "1": "Accordion",
    "2": "Adhesive tape",
    "3": "Aircraft",
    "4": "Airplane",
    "5": "Alarm clock",
    "6": "Alpaca",
    "7": "Ambulance",
    "8": "Animal",
    "9": "Ant",
    "10": "Apple",
    "11": "Artichoke",
    "12": "Banana",
    "13": "Barge",
    "14": "Bathtub",
    "15": "Belt",
    "16": "Binoculars",
    "17": "Bottle",
    "18": "Bow and arrow",
    "19": "Bread",
    "20": "Briefcase",
    "21": "Broccoli",
    "22": "Camera",
    "23": "Cannon",
    "24": "Cassette deck",
    "25": "Cat",
    "26": "Cello",
    "27": "Christmas tree",
    "28": "Coin",
    "29": "Common fig",
    "30": "Cosmetics",
    "31": "Cucumber",
    "32": "Cutting board",
    "33": "Ea

In [15]:
import numpy as np
from transformers import AutoModel

def load_and_save_model_to_onnx(output_path, dataset, device="cpu"):
    input_shape = np.expand_dims(dataset[0]["pixel_values"], axis=0).shape

    model.to(device)
    model.eval()

    torch.onnx.export(
        model, # model = AutoModel.from_pretrain(checkpoint)
        torch.randn(*input_shape).to(device),
        output_path,
        export_params=True,
        opset_version=14,
        do_constant_folding=True,
        input_names=["pixel_values"],
        output_names=["logits"],
        dynamic_axes={"pixel_values": {0: "batch_size"}, "logits": {0: "batch_size"}},
    )

    print(f"Model successfully saved to {output_path}")

load_and_save_model_to_onnx("/home/user1/solve/vit_overfit_last_results/checkpoint-684", "vit_v4.onnx", dataset)


Some weights of ViTModel were not initialized from the model checkpoint at /home/user1/solve/vit_overfit_last_results/checkpoint-684 and are newly initialized: ['vit.pooler.dense.bias', 'vit.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
  if num_channels != self.num_channels:
  if height != self.image_size[0] or width != self.image_size[1]:


Model successfully saved to vit_v4.onnx
