In [5]:
%load_ext autoreload
%autoreload 2

import logging
import os
import sys
from dataclasses import dataclass, field
from typing import Optional

import numpy as np
from datasets import load_dataset
from PIL import Image
import evaluate

import torch
from torch.utils.data import DataLoader
from torchvision.transforms import (ToTensor,Lambda ,Resize,RandomResizedCrop,CenterCrop ,Compose, Normalize,RandomHorizontalFlip,RandomVerticalFlip,RandomRotation)
from accelerate import Accelerator

import transformers
from transformers import (
    MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING,
    AutoConfig,
    AutoImageProcessor,
    AutoModelForImageClassification,
    HfArgumentParser,
    Trainer,
    TrainingArguments,
    set_seed,
    get_scheduler
)
import math
from tqdm import tqdm
from transformers.trainer_utils import get_last_checkpoint
from transformers.utils import check_min_version, send_example_telemetry
from transformers.utils.versions import require_version

from transformers import ViTForImageClassification, ViTImageProcessor, TrainingArguments, Trainer


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [6]:
# Create the Accelerator first and take the device from it, so tensors moved by
# hand with `.to(device)` end up on the same device the Accelerator itself uses.
# (Previously `device` was chosen independently from CUDA availability, which
# can disagree with the Accelerator's choice.)
accelerator = Accelerator()
device = accelerator.device

In [16]:
# Config classes registered in MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING and their
# `model_type` strings.
MODEL_CONFIG_CLASSES = list(MODEL_FOR_IMAGE_CLASSIFICATION_MAPPING.keys())
MODEL_TYPES = tuple(conf.model_type for conf in MODEL_CONFIG_CLASSES)

# --- Dataset columns ---
label_column_name = 'label' # "label" or "labels"
image_column_name = 'image'

# --- Checkpoint ---
# Assigned exactly once. The earlier first assignment was a dead store because
# it was overwritten on the very next line; the alternative is kept as a note.
model_name_or_path = "apple/mobilevit-small"  # alternative: "google/vit-base-patch16-224"
trust_remote_code = True
ignore_mismatched_sizes = True

# --- Optimisation ---
weight_decay = 0.0
learning_rate = 0.001
lr_scheduler_type = "linear"  # "linear" or "cosine_with_restarts"
max_train_steps = 20000
num_warmup_steps = 0
overrode_max_train_steps = False
per_device_batch_size = 32

In [8]:
# Directory handed to the `imagefolder` loader. The original absolute path is
# kept as the default so existing runs behave the same; set FOOD101_DATA_DIR to
# run the notebook on another machine without editing this cell.
data_dir = os.environ.get(
    "FOOD101_DATA_DIR",
    "/home/tuvis/Work/ml_works/mlkit/data/food-101/images",
)
dataset = load_dataset("imagefolder", data_dir=data_dir)

In [9]:
# Hold out 20% of the "train" split as a "test" split.
# * The fixed seed makes the split identical across kernel restarts.
# * The guard makes the cell idempotent: `dataset` is rebound here, so without
#   it a second run would split the already-reduced "train" split again.
if "test" not in dataset:
    dataset = dataset["train"].train_test_split(test_size=0.2, seed=42)
label_column_name = 'label'

In [10]:
#dataset = load_dataset("beans")

# Class names in index order, taken from the label feature of the train split.
labels = dataset["train"].features[label_column_name].names

# Lookup tables between class name and class index; the index is stored as a
# string in both directions.
id2label = {str(index): name for index, name in enumerate(labels)}
label2id = {name: index for index, name in id2label.items()}

In [11]:
# Model configuration for fine-tuning on this dataset: the number of classes
# and both label lookup tables are overridden on top of the checkpoint config.
config = AutoConfig.from_pretrained(
        model_name_or_path,
        num_labels=len(labels),
        # Was misspelled `i2label`, so the id -> name table was never passed
        # under the keyword the config expects.
        id2label=id2label,
        label2id=label2id,
        finetuning_task="image-classification",
        trust_remote_code=trust_remote_code,
    )

#config

In [12]:
# Image processor of the checkpoint; its attributes are read later when the
# torchvision transforms are built.
image_processor = AutoImageProcessor.from_pretrained(
    model_name_or_path, trust_remote_code=trust_remote_code
)

# Classification model built from `config`. `from_tf` is true only when the
# checkpoint name contains ".ckpt".
model = AutoModelForImageClassification.from_pretrained(
    model_name_or_path,
    config=config,
    from_tf=".ckpt" in model_name_or_path,
    ignore_mismatched_sizes=ignore_mismatched_sizes,
    trust_remote_code=trust_remote_code,
)
model.to(device)

print("done")

Some weights of MobileViTForImageClassification were not initialized from the model checkpoint at apple/mobilevit-small and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([1000, 640]) in the checkpoint and torch.Size([101, 640]) in the model instantiated
- classifier.bias: found shape torch.Size([1000]) in the checkpoint and torch.Size([101]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


done


In [13]:
# Fixed square resolution shared by both pipelines.
# (Alternative: derive it from image_processor.size["height"] / ["width"].)
size = (256,256)

# Normalise with the processor's statistics when it exposes both of them;
# otherwise fall back to an identity transform.
if hasattr(image_processor, "image_mean") and hasattr(image_processor, "image_std"):
    normalize = Normalize(mean=image_processor.image_mean, std=image_processor.image_std)
else:
    normalize = Lambda(lambda x: x)

# Training pipeline: random crop + horizontal flip, then tensor conversion.
train_transforms = Compose([
    RandomResizedCrop(size),
    RandomHorizontalFlip(),
    ToTensor(),
    normalize,
])

# Validation pipeline: resize + centre crop, then tensor conversion.
val_transforms = Compose([
    Resize(size),
    CenterCrop(size),
    ToTensor(),
    normalize,
])

def preprocess_train(example_batch):
    """Add a "pixel_values" entry built with `train_transforms`.

    Every image in the `image_column_name` column is converted to RGB and run
    through `train_transforms`. The batch dict is modified in place and also
    returned.
    """
    rgb_images = (img.convert("RGB") for img in example_batch[image_column_name])
    example_batch["pixel_values"] = list(map(train_transforms, rgb_images))
    return example_batch


def preprocess_val(example_batch):
    """Add a "pixel_values" entry built with `val_transforms`.

    Every image in the `image_column_name` column is converted to RGB and run
    through `val_transforms`. The batch dict is modified in place and also
    returned.
    """
    rgb_images = (img.convert("RGB") for img in example_batch[image_column_name])
    example_batch["pixel_values"] = list(map(val_transforms, rgb_images))
    return example_batch

In [14]:
# Attach the preprocessing functions to the two splits via `with_transform`:
# "train" gets the augmenting pipeline, "test" the deterministic one. Each
# function adds a "pixel_values" entry to the batches it receives.
train_dataset = dataset["train"].with_transform(preprocess_train)
val_dataset = dataset["test"].with_transform(preprocess_val)



    # DataLoaders creation:
def collate_fn(examples):
    """Merge a list of per-example dicts into one model-ready batch.

    Returns a dict with "pixel_values" (the examples' tensors stacked along a
    new first dimension) and "labels" (a tensor built from each example's
    `label_column_name` value).
    """
    images, targets = [], []
    for sample in examples:
        images.append(sample["pixel_values"])
        targets.append(sample[label_column_name])
    return {"pixel_values": torch.stack(images), "labels": torch.tensor(targets)}


# Training batches are reshuffled on every pass over the loader.
train_dataloader = DataLoader(
    train_dataset,
    shuffle=True,
    batch_size=per_device_batch_size,
    collate_fn=collate_fn,
)
# Validation batches keep a fixed order: accuracy does not depend on order, and
# a stable order makes the "inspect the first batch" cell further down show the
# same examples on every run.
val_dataloader = DataLoader(
    val_dataset,
    shuffle=False,
    batch_size=per_device_batch_size,
    collate_fn=collate_fn,
)

In [15]:
# Number of batches in one pass over the training loader.
len(train_dataloader)

2525

In [17]:
# Parameters whose name contains one of these markers get no weight decay.
no_decay = ["bias", "LayerNorm.weight"]

# Two parameter groups: everything else (decayed with `weight_decay`) and the
# `no_decay` matches (decay fixed at 0.0).
optimizer_grouped_parameters = [
    {
        "params": [
            param
            for name, param in model.named_parameters()
            if all(marker not in name for marker in no_decay)
        ],
        "weight_decay": weight_decay,
    },
    {
        "params": [
            param
            for name, param in model.named_parameters()
            if any(marker in name for marker in no_decay)
        ],
        "weight_decay": 0.0,
    },
]

optimizer = torch.optim.AdamW(optimizer_grouped_parameters, lr=learning_rate)

In [18]:
# Learning-rate schedule of type `lr_scheduler_type`, configured for
# `max_train_steps` optimizer updates with `num_warmup_steps` warm-up steps.
lr_scheduler = get_scheduler(
        name=lr_scheduler_type,
        optimizer=optimizer,
        num_warmup_steps=num_warmup_steps,
        num_training_steps=max_train_steps
    )



In [19]:
# One optimizer update per training batch, so the number of updates per epoch
# equals the loader length (already an integer, no rounding needed).
num_update_steps_per_epoch = len(train_dataloader)

In [20]:
# Number of epochs needed to reach `max_train_steps`, rounded up, so the last
# epoch may contain more batches than the step budget has left.
num_train_epochs = math.ceil(max_train_steps / num_update_steps_per_epoch)
# One accuracy metric object, used for both training and validation accuracy.
metric = evaluate.load("accuracy")

# NOTE(review): the training loop in this notebook reads `outputs.loss` from
# the model; `loss_func` is not referenced there — confirm whether it is needed.
loss_func = torch.nn.CrossEntropyLoss()

In [21]:
# Hand the training objects to the Accelerator and rebind the prepared versions.
# NOTE(review): `val_dataloader` is not passed here; the loops move validation
# batches with `.to(device)` themselves — confirm this is intended.
model, optimizer, train_dataloader, lr_scheduler = accelerator.prepare(model, optimizer, train_dataloader, lr_scheduler)

In [22]:
# Batch size per update; currently just the per-device value (the process-count
# and gradient-accumulation factors are commented out).
total_batch_size = per_device_batch_size# * accelerator.num_processes * args.gradient_accumulation_steps


In [None]:
# Training / validation loop.
#
# Each epoch: train over `train_dataloader` while accumulating training
# accuracy in `metric`, then evaluate on `val_dataloader` with the same metric
# object (`metric.compute()` is called between the two phases, so the two
# accuracies are computed from separate sets of batches).
progress_bar = tqdm(range(max_train_steps), disable=not accelerator.is_local_main_process)
completed_steps = 0
starting_epoch = 0

progress_bar.update(completed_steps)

for i in range(starting_epoch, num_train_epochs):
    model.train()
    total_loss = 0
    num_batches = 0  # batches actually processed this epoch (last epoch may be partial)
    for step, batch in enumerate(train_dataloader):
        with accelerator.accumulate(model):
            X = batch["pixel_values"].to(device)
            y = batch["labels"].to(device)

            outputs = model(pixel_values=X, labels=y)

            loss = outputs.loss
            preds = outputs.logits.argmax(dim=-1)
            metric.add_batch(predictions=preds, references=y)

            # Detach so the running sum does not keep the autograd graph alive.
            total_loss += loss.detach().float()
            num_batches += 1

            accelerator.backward(loss)
            optimizer.step()
            lr_scheduler.step()
            optimizer.zero_grad()

        if accelerator.sync_gradients:
            progress_bar.update(1)
            completed_steps += 1

        # `num_train_epochs` is rounded up, so without this check the final
        # epoch would run past `max_train_steps`, the number of updates the
        # scheduler and the progress bar were built for.
        if completed_steps >= max_train_steps:
            break

    tr_acc = metric.compute()['accuracy']
    model.eval()

    with torch.no_grad():
        for step, batch in enumerate(val_dataloader):
            X = batch["pixel_values"].to(device)
            y = batch["labels"].to(device)

            outputs = model(pixel_values=X, labels=y)
            preds = outputs.logits.argmax(dim=-1)
            metric.add_batch(predictions=preds, references=y)

    val_acc = metric.compute()['accuracy']

    # Average over the batches actually seen, not the full loader length, so a
    # partial final epoch reports a correct mean loss.
    mean_loss = float(total_loss) / max(num_batches, 1)
    print(f"Epoch {i} loss: {mean_loss} | acc : {tr_acc:.3f} | val_acc : {val_acc:.3f}")

    

 13%|█▎        | 2525/20000 [18:56<2:23:10,  2.03it/s]

Epoch 0 loss: 2.1594863861386138 | acc : 0.468 | val_acc : 0.612


 25%|██▌       | 5050/20000 [43:35<1:42:56,  2.42it/s]

Epoch 1 loss: 1.4483147045173268 | acc : 0.629 | val_acc : 0.702


 38%|███▊      | 7575/20000 [1:09:14<1:21:57,  2.53it/s]

Epoch 2 loss: 1.2382741916769802 | acc : 0.680 | val_acc : 0.751


 42%|████▏     | 8378/20000 [1:20:08<1:18:01,  2.48it/s]  

In [24]:
# Run the model on the first validation batch only, leaving `X`, `y`,
# `outputs` and `preds` in the namespace for inspection in the next cells.
model.eval()

with torch.no_grad():
    for step, batch in enumerate(val_dataloader):
        X = batch["pixel_values"].to(device)
        y = batch["labels"].to(device)

        outputs = model(pixel_values=X, labels=y)
        preds = outputs.logits.argmax(dim=-1)
        break  # one batch is enough for a spot check

In [None]:
# Print predicted and reference class indices one above the other for comparison.
print("pred :   ",preds)
print("target : ",y )

In [37]:
import matplotlib.pyplot as plt

In [None]:
# Visual spot-check: for each validation example (in shuffled order) print the
# class name, true class index and predicted class index, then show the image.
model.eval()  # make sure the model is in evaluation mode for inference
for i in val_dataset.shuffle():

    # Same RGB conversion the validation preprocessing applies before the
    # transforms.
    input_data = val_transforms(i["image"].convert("RGB"))
    # The label column of this dataset is `label_column_name`; the previously
    # hard-coded "labels" key does not match it.
    input_label = i[label_column_name]
    input_data = torch.unsqueeze(input_data, dim=0)  # add the batch dimension

    # Inference only: no gradients needed.
    with torch.no_grad():
        output = model(input_data.to(device))
    pred = output.logits.argmax(-1).item()
    print(f" name: {labels[input_label]} real class: {input_label}, predicted class: {pred} ")

    plt.imshow(i["image"])
    plt.show()

In [None]:
# Scratch cell: display whatever `i` is currently bound to.
i


In [None]:
# Scratch cell: display the current value of `labels`.
labels

In [None]:
# Scratch cell: display the current value of `pred`.
pred

In [None]:
# Scratch cell: display the current value of `labels` (same expression as an
# earlier cell).
labels

In [None]:
# Scratch cell: inspect the attributes of the training transform pipeline.
train_transforms.__dict__

In [220]:
# The first line has no visible effect: only a cell's last expression is shown.
outputs.logits.shape
# Predicted class index per example (argmax over the last logits dimension).
predictions = outputs.logits.argmax(dim=-1)
# NOTE(review): this rebinds `labels`, which elsewhere in the notebook holds the
# list of class names (used for `labels[input_label]` and the label maps).
# After this cell runs, those cells see `y` instead — confirm whether a
# different name should be used here.
labels = y

In [221]:
# Queue one batch of predictions/references on the metric; the result is
# produced by a later `metric.compute()` call.
metric.add_batch(predictions=predictions, references=labels)

In [None]:
# Compute the metric over everything queued with `add_batch` so far.
metric.compute( )

In [None]:
# Predicted class index per example for the most recent `outputs`.
preds = outputs.logits.argmax(dim=-1)

# Score through `compute`, the same method the rest of the notebook uses on
# `metric`; the previous `metric(y, preds)` called the metric object directly.
metric.compute(predictions=preds, references=y)

In [None]:
# Scratch cell: display the current reference labels `y`.
y

In [None]:
# Scratch cell: display the loss accumulated by the training loop.
total_loss

In [None]:
# Scratch cell: shapes of the current input batch and its labels.
X.shape,y.shape

In [152]:
# NOTE(review): `l` is not assigned anywhere in the visible notebook, so this
# cell raises NameError on a fresh kernel — confirm whether it can be deleted.
l

In [None]:
# Scratch cell: display the most recent model output object.
outputs

In [47]:

from torch.utils.data import Dataset   

In [98]:
import math

In [None]:
# Peek at one example of the training split. The previous `train[]` had an
# empty subscript, which is a syntax error; index 0 is used as the example row.
train = dataset["train"]
train[0]

In [61]:
class HFDataset(Dataset):
    """Map-style torch `Dataset` wrapper around an indexable dataset.

    Each item is an ``(image_tensor, target)`` pair read from one row of
    `hf_dataset`.

    Args:
        hf_dataset: Object supporting ``len()`` and integer row indexing that
            returns a mapping from column name to value.
        feature_name: Column holding the image.
        target_name: Column holding the target. The default is "labels"; pass
            the actual column name when the dataset uses a different one.
    """

    def __init__(self,hf_dataset,feature_name="image",target_name="labels") -> None:
        self.hf_dataset = hf_dataset
        self.feature_name = feature_name
        self.target_name = target_name

    def __len__(self):
        return len(self.hf_dataset)

    def __getitem__(self, index):
        # Index the ROW first and then pick the columns. The previous
        # column-first form (`ds[column][index]`) fetched the entire column,
        # i.e. every image, just to return a single item.
        row = self.hf_dataset[index]
        image = self.process(row[self.feature_name])
        target = row[self.target_name]

        return image,target

    def process (self,image):
        """Convert one image to a tensor with torchvision's `ToTensor`."""
        image = ToTensor()(image)
        return image
        

# Pass the label column explicitly: the rest of the notebook reads labels from
# `label_column_name`, while HFDataset's default target column is "labels".
# NOTE(review): this rebinds `train_dataset`, which an earlier cell used for
# the `with_transform` training split.
train_dataset = HFDataset(dataset["train"], target_name=label_column_name)

In [67]:
# Small shuffled loader (batch size 2) over the wrapper dataset, for a quick check.
dataloader = DataLoader(train_dataset, batch_size=2, shuffle=True)

In [68]:
# Pull a single batch from the loader; it stays bound to `i` after the loop.
for i in dataloader:
    break

In [10]:
def pil_loader(path: str):
    """Open the image file at `path` and return it converted to RGB.

    The file is opened in binary mode and the conversion is performed while
    the file handle is still open.
    """
    with open(path, "rb") as handle:
        return Image.open(handle).convert("RGB")


In [None]:
# Load the pre-trained ViT classification model (only the model is loaded here,
# no image processor).
# NOTE(review): this rebinds `model`, replacing the model used by the cells above.
model_name = "google/vit-base-patch16-224"
model = ViTForImageClassification.from_pretrained(
    model_name
)


In [None]:
# Scratch cell: display the result of splitting the train split with default
# arguments; the result is not assigned, so `dataset` itself is unchanged.
dataset["train"].train_test_split()

In [None]:
# Show the first two training images. Slice the rows first and then pick the
# column, so only two rows are read; the previous column-first form asked for
# the whole "image" column before slicing it.
dataset["train"][:2]["image"]