In [62]:
import torch 
from transformers import ViTFeatureExtractor, ViTForImageClassification, ViTImageProcessor
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import matplotlib.pyplot as plt
import numpy as np 
from PIL import Image
import pandas as pd
import os 
#import wandb
from tqdm import tqdm
from functools import partial

In [63]:
# Class names are the entries of the training folder. os.listdir() returns them
# in arbitrary, platform-dependent order, so sort them to get a class -> index
# mapping that is stable across machines and runs.
# NOTE(review): the imagefolder ClassLabel printed later in this notebook lists
# its names alphabetically; sorting here presumably keeps both mappings aligned
# -- confirm.
classes = sorted(os.listdir("final_dataset/train"))
# Both objects are loaded from the same pretrained ViT checkpoint.
feature_extractor = ViTFeatureExtractor.from_pretrained('google/vit-base-patch16-224')
image_processor = ViTImageProcessor.from_pretrained('google/vit-base-patch16-224')



In [64]:
# Display the loaded preprocessing configuration (resize / rescale / normalize settings).
feature_extractor

ViTFeatureExtractor {
  "do_normalize": true,
  "do_rescale": true,
  "do_resize": true,
  "image_mean": [
    0.5,
    0.5,
    0.5
  ],
  "image_processor_type": "ViTFeatureExtractor",
  "image_std": [
    0.5,
    0.5,
    0.5
  ],
  "resample": 2,
  "rescale_factor": 0.00392156862745098,
  "size": {
    "height": 224,
    "width": 224
  }
}

In [65]:
# Root folders of the two dataset splits; each one is passed to ImageDataset as `root`.
train_dir = "final_dataset/train"
test_dir = "final_dataset/test"

In [66]:
class ImageDataset(Dataset):
    """Image-folder dataset with one sub-directory of ``root`` per class.

    Each sample is whatever ``feature_extractor`` returns for one RGB image,
    with an extra ``"labels"`` entry holding the class index as a tensor.
    """

    def __init__(self, root, feature_extractor, labels):
        """Index every file found under ``root/<class_name>/`` (recursively).

        Args:
            root: Directory whose immediate entries are used as class names.
            feature_extractor: Callable invoked per image as
                ``feature_extractor(img, return_tensors="pt")``.
            labels: Sequence of class names; a sample's label id is the
                position of its class name in this sequence.
        """
        self.feature_extractor = feature_extractor
        self.files = []
        self.labels = labels

        # get all the subdirectories in the root folder
        subdirs = sorted(os.listdir(root))

        for subdir in subdirs:
            for path, _, files in os.walk(os.path.join(root, subdir)):
                for file in files:
                    # os.walk descends into nested folders, so the file lives in
                    # `path`, which is not necessarily root/subdir itself. Joining
                    # with `path` keeps nested files reachable; the label is still
                    # the top-level class directory.
                    self.files.append((os.path.join(path, file), subdir))

    def __getitem__(self, index):
        """Load, preprocess and label the sample at ``index``.

        Raises:
            ValueError: if the sample's class directory is not in ``self.labels``.
        """
        # NOTE(review): the modulo wraps out-of-range indices instead of raising
        # IndexError, so a plain `for sample in dataset` loop never terminates.
        path, label = self.files[index % len(self.files)]
        # Close the file handle as soon as the pixel data has been converted.
        with Image.open(path) as handle:
            img = handle.convert("RGB")
        encoded = self.feature_extractor(img, return_tensors="pt")
        encoded["labels"] = torch.tensor(self.labels.index(label))
        return encoded

    def __len__(self):
        """Return the number of indexed image files."""
        return len(self.files)

In [67]:
train_dataset = ImageDataset(train_dir, feature_extractor, classes)
# Keep the complete test split under its own name so that re-running this cell
# always splits the full set instead of re-splitting an already-split subset.
full_test_dataset = ImageDataset(test_dir, feature_extractor, classes)


# 80% of the test folder stays as the test set, the remainder becomes validation.
test_train_size = int(0.8 * len(full_test_dataset))
test_test_size = len(full_test_dataset) - test_train_size


# Seed the split so the same images land in test / validation on every run.
split_generator = torch.Generator().manual_seed(42)
test_dataset, val_dataset = torch.utils.data.random_split(
    full_test_dataset, [test_train_size, test_test_size], generator=split_generator
)

In [68]:
batch_size = 32
# os.cpu_count() may return None when the count cannot be determined; fall back
# to 0, which makes the DataLoader load batches in the main process.
num_workers = os.cpu_count() or 0

# Only the training loader is shuffled; validation and test batches are served
# in a fixed order so evaluation runs are repeatable.
train_loader = DataLoader(train_dataset, batch_size=batch_size, num_workers=num_workers, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, num_workers=num_workers, shuffle=False)
test_loader = DataLoader(test_dataset, batch_size=batch_size, num_workers=num_workers, shuffle=False)


train_loader, val_loader, test_loader

(<torch.utils.data.dataloader.DataLoader at 0x1f101ea3d00>,
 <torch.utils.data.dataloader.DataLoader at 0x1f101ea3550>,
 <torch.utils.data.dataloader.DataLoader at 0x1f101ea3430>)

In [69]:
# Load the same image folder through the Hugging Face `datasets` "imagefolder"
# builder; the DatasetDict displayed below has a "train" and a "test" split with
# 'image' and 'label' features.
# The commented-out lines in this cell are leftover scratch code for inspecting
# a batch from the custom DataLoader.
#batch = next(enumerate(train_loader))
from datasets import load_dataset
ds = load_dataset("imagefolder", data_dir="final_dataset")
ds


#for batch in train_loader:
   # print(batch)
   # break

#batch = train_loader[0]
#batch_images = batch["pixel_values"]
#batch_labels = batch["labels"]

#plt.figure(figsize=(16,16))

Resolving data files:   0%|          | 0/1838 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/961 [00:00<?, ?it/s]

Found cached dataset imagefolder (C:/Users/shoba/.cache/huggingface/datasets/imagefolder/default-6a4e43a2077b7a98/0.0.0/37fbb85cc714a338bea574ac6c7d0b5be5aff46c1862c1989b20e0771199e93f)


  0%|          | 0/2 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['image', 'label'],
        num_rows: 1838
    })
    test: Dataset({
        features: ['image', 'label'],
        num_rows: 960
    })
})

In [70]:
# Inspect the label feature of the training split (a ClassLabel holding the class names).
# NOTE(review): `labels` is rebound to a plain list of names in a later cell.
labels = ds["train"].features["label"]
labels

ClassLabel(names=['Adult', 'Airplane', 'Alpaca', 'Bird', 'Bus', 'Car', 'Cat', 'Child', 'Elephant', 'Flower', 'Giraffe', 'Horse', 'Monkey', 'Panda', 'Reptile', 'Vessel'], id=None)

In [71]:
def transform(examples):
  """Preprocess a batch of dataset examples for the ViT model.

  Every image in ``examples["image"]`` is converted to RGB and run through the
  module-level ``image_processor``; the values of ``examples["label"]`` are
  attached to the result under the ``"labels"`` key.
  """
  rgb_images = []
  for picture in examples["image"]:
    rgb_images.append(picture.convert("RGB"))
  encoded = image_processor(rgb_images, return_tensors="pt")
  # carry the labels along with the pixel data
  encoded["labels"] = examples["label"]
  return encoded

# Attach transform() with with_transform() so it is applied on the fly whenever
# examples are accessed, rather than preprocessing the whole dataset up front.
dataset = ds.with_transform(transform)

In [72]:
# Rebind `labels` to the plain list of class names; the model cell below uses
# len(labels) and enumerate(labels) to size and name the classifier outputs.
labels = ds["train"].features["label"].names
labels

['Adult',
 'Airplane',
 'Alpaca',
 'Bird',
 'Bus',
 'Car',
 'Cat',
 'Child',
 'Elephant',
 'Flower',
 'Giraffe',
 'Horse',
 'Monkey',
 'Panda',
 'Reptile',
 'Vessel']

In [73]:
# Sanity check: print the tensor shape and label of the first transformed
# training sample, then stop after that one item.
for item in dataset["train"]:
  print(item["pixel_values"].shape)
  print(item["labels"])
  break

torch.Size([3, 224, 224])
0


In [74]:
def collate_fn(batch):
  """Merge a list of per-sample dicts into a single batch dict.

  The ``"pixel_values"`` tensors are stacked along a new leading dimension and
  the ``"labels"`` values are packed into one tensor.
  """
  pixel_tensors = []
  label_ids = []
  for sample in batch:
    pixel_tensors.append(sample["pixel_values"])
    label_ids.append(sample["labels"])
  return {
      "pixel_values": torch.stack(pixel_tensors),
      "labels": torch.tensor(label_ids),
  }

In [75]:
from evaluate import load
import numpy as np

# metric objects from the evaluate module, loaded once and reused by compute_metrics
accuracy = load("accuracy")
f1 = load("f1")

def compute_metrics(eval_pred):
  """Return accuracy and macro-averaged F1 for one evaluation pass.

  ``eval_pred.predictions`` is reduced to class ids with an argmax over axis 1
  and compared against ``eval_pred.label_ids``. The two metric result dicts are
  merged into one; on a key clash the F1 entry would win.
  """
  predicted_ids = np.argmax(eval_pred.predictions, axis=1)
  reference_ids = eval_pred.label_ids
  metrics = {}
  metrics.update(accuracy.compute(predictions=predicted_ids, references=reference_ids))
  metrics.update(f1.compute(predictions=predicted_ids, references=reference_ids, average="macro"))
  return metrics

In [76]:
# load the ViT model
# The checkpoint's classifier head has 1000 outputs; ignore_mismatched_sizes=True
# lets it be replaced by a newly initialized head with len(labels) outputs (see
# the warning printed below this cell).
# NOTE(review): a later cell rebinds `model` with another from_pretrained() call,
# so this instance appears to be discarded -- confirm whether this cell is needed.
model = ViTForImageClassification.from_pretrained(
    'google/vit-base-patch16-224',
    num_labels=len(labels),
    id2label={str(i): c for i, c in enumerate(labels)},
    label2id={c: str(i) for i, c in enumerate(labels)},
    ignore_mismatched_sizes=True,
)

Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([1000, 768]) in the checkpoint and torch.Size([16, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([1000]) in the checkpoint and torch.Size([16]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [77]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
# Settings for the hand-written training loop (currently disabled further down).
# The Trainer run is configured separately through TrainingArguments.
num_epochs = 5
lr = 2e-4
eval_steps = 100
record_steps= 10
save_checkpoint = 5

#wandb.config.update({"lr": lr, "num_epochs": num_epochs})

# Build the label maps from `labels`, the class-name list taken from the
# imagefolder dataset. That is the source the Trainer's label ids index into and
# the one the previous model cell uses. `classes` comes from os.listdir(), whose
# order is not guaranteed to match.
model = ViTForImageClassification.from_pretrained('google/vit-base-patch16-224', num_labels=len(labels),
                                                   id2label={str(i): c for i, c in enumerate(labels)},
                                                   label2id={c: str(i) for i, c in enumerate(labels)},
                                                   ignore_mismatched_sizes=True).to(device)

# Optimizer for the manual loop; it is not passed to the Trainer constructed below.
optim = torch.optim.AdamW(model.parameters(), lr=lr)

# NOTE(review): not referenced anywhere else in the visible notebook.
training_stats_step = 10

Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized because the shapes did not match:
- classifier.weight: found shape torch.Size([1000, 768]) in the checkpoint and torch.Size([16, 768]) in the model instantiated
- classifier.bias: found shape torch.Size([1000]) in the checkpoint and torch.Size([16]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [78]:
from transformers import TrainingArguments

# Trainer configuration. Evaluation, checkpointing and logging all use the same
# 1000-step interval.
training_args = TrainingArguments(
  output_dir="./vit-base-custom", # output directory
  # output_dir="./vit-base-skin-cancer",
  per_device_train_batch_size=32, # batch size per device during training
  evaluation_strategy="steps",    # evaluation strategy to adopt during training
  num_train_epochs=50,             # total number of training epochs
  # fp16=True,                    # use mixed precision
  save_steps=1000,                # number of update steps before saving checkpoint
  eval_steps=1000,                # number of update steps before evaluating
  logging_steps=1000,             # number of update steps before logging
  # save_steps=50,
  # eval_steps=50,
  # logging_steps=50,
  save_total_limit=2,             # limit the total amount of checkpoints on disk
  remove_unused_columns=False,    # keep the raw "image" column: transform() reads examples["image"] on the fly
  push_to_hub=False,              # do not push the model to the hub
  report_to='tensorboard',        # report metrics to tensorboard
  load_best_model_at_end=True,    # load the best model at the end of training
)

In [79]:
from transformers import Trainer

# Wire model, arguments, collator, metrics and both dataset splits into the Trainer.
# Evaluation during training runs on the "test" split of the transformed dataset.
trainer = Trainer(
    model=model,                        # the instantiated 🤗 Transformers model to be trained
    args=training_args,                 # training arguments, defined above
    data_collator=collate_fn,           # the data collator that will be used for batching
    compute_metrics=compute_metrics,    # the metrics function that will be used for evaluation
    train_dataset=dataset["train"],     # training dataset
    eval_dataset=dataset["test"], # evaluation dataset
    tokenizer=image_processor,          # the processor that will be used for preprocessing the images
)

In [80]:
# start training: runs the fine-tuning configured by `training_args` on dataset["train"]
trainer.train()



  0%|          | 0/2900 [00:00<?, ?it/s]

In [None]:
# Evaluate on the "test" split -- the same split the Trainer was given as eval_dataset.
trainer.evaluate(dataset["test"])

  0%|          | 0/120 [00:00<?, ?it/s]

{'eval_loss': 0.09072335064411163,
 'eval_accuracy': 0.9833333333333333,
 'eval_f1': 0.983288697020956,
 'eval_runtime': 395.7325,
 'eval_samples_per_second': 2.426,
 'eval_steps_per_second': 0.303,
 'epoch': 3.0}

In [None]:
# Per-layer overview of the model for an input of size (32, 3, 224, 224):
# input/output sizes, parameter counts and trainable flags.
from torchinfo import summary
summary(model, 
        input_size=(32, 3, 224, 224), # make sure this is "input_size", not "input_shape" (batch_size, color_channels, height, width)
        verbose=0,
        col_names=["input_size", "output_size", "num_params", "trainable"],
        col_width=20,
        row_settings=["var_names"]
)

  action_fn=lambda data: sys.getsizeof(data.storage()),
  return super().__sizeof__() + self.nbytes()


Layer (type (var_name))                                                Input Shape          Output Shape         Param #              Trainable
ViTForImageClassification (ViTForImageClassification)                  [32, 3, 224, 224]    [32, 16]             --                   True
├─ViTModel (vit)                                                       [32, 3, 224, 224]    [32, 197, 768]       --                   True
│    └─ViTEmbeddings (embeddings)                                      [32, 3, 224, 224]    [32, 197, 768]       152,064              True
│    │    └─ViTPatchEmbeddings (patch_embeddings)                      [32, 3, 224, 224]    [32, 196, 768]       590,592              True
│    │    └─Dropout (dropout)                                          [32, 197, 768]       [32, 197, 768]       --                   --
│    └─ViTEncoder (encoder)                                            [32, 197, 768]       [32, 197, 768]       --                   True
│    │    └─ModuleList (

In [None]:
# Disabled cell: the code below is wrapped in a string literal, so nothing in it
# executes and `scheduler` is never defined. The disabled training loop further
# down calls scheduler.step().
'''from torch.optim.lr_scheduler import StepLR

scheduler = StepLR(optim, step_size=1, gamma=0.6)'''

'from torch.optim.lr_scheduler import StepLR\n\nscheduler = StepLR(optim, step_size=1, gamma=0.6)'

In [None]:
# Disabled cell: the evaluate() helper below sits inside a string literal and is
# never defined at runtime.
# NOTE(review): before re-enabling, note that it returns only `accuracy` while
# the disabled callers unpack two values (`val_acc, val_loss = evaluate(...)`),
# and that the commented-out avg_loss line divides by len(epoch_loss), a name
# that is not defined inside this function.
'''def evaluate(model, eval_loader, eval_dataset):
    correct = 0
    eval_loss = []
    
    model.eval()
    for batch in tqdm(eval_loader):
        # extracting images and labels from batch 
        batch_images = batch["pixel_values"].squeeze(1).to(device)
        batch_labels = batch["labels"].to(device)
        
        # not training the model
        with torch.no_grad():
            outputs = model(pixel_values=batch_images, labels=batch_labels)
            loss = outputs[0]
            eval_loss.append(loss.item())
            correct += (torch.argmax(outputs["logits"], dim=1) == batch_labels).sum().item()
    
    # return eval accuracy and loss
    accuracy = (100 * correct / len(eval_dataset))
    #avg_loss = (sum(eval_loss) / len(epoch_loss))
    
    return accuracy#, avg_loss'''

'def evaluate(model, eval_loader, eval_dataset):\n    correct = 0\n    eval_loss = []\n    \n    model.eval()\n    for batch in tqdm(eval_loader):\n        # extracting images and labels from batch \n        batch_images = batch["pixel_values"].squeeze(1).to(device)\n        batch_labels = batch["labels"].to(device)\n        \n        # not training the model\n        with torch.no_grad():\n            outputs = model(pixel_values=batch_images, labels=batch_labels)\n            loss = outputs[0]\n            eval_loss.append(loss.item())\n            correct += (torch.argmax(outputs["logits"], dim=1) == batch_labels).sum().item()\n    \n    # return eval accuracy and loss\n    accuracy = (100 * correct / len(eval_dataset))\n    #avg_loss = (sum(eval_loss) / len(epoch_loss))\n    \n    return accuracy#, avg_loss'

In [None]:
# Disabled cell: the manual training loop below sits inside a string literal and
# does not run; training is done by the Trainer instead.
# NOTE(review): before re-enabling, note that it calls scheduler.step() and
# evaluate(), which are only defined inside other disabled cells, and that it
# unpacks two return values from evaluate(), which returns a single value.
'''tqdm = partial(tqdm, position=0, leave=True)


for epoch in range(1, num_epochs+1):
    # storing loss and accuracy across the epoch
    epoch_loss = []
    epoch_acc = []
    
    print(f"Epoch {epoch}")
    for index, batch in enumerate(tqdm(train_loader)):
        model.train()
        optim.zero_grad()

        # extract images and labels from batch
        batch_images = batch["pixel_values"].squeeze(1).to(device)
        batch_labels = batch["labels"].to(device)
        size = len(batch_images)
        
        outputs = model(pixel_values=batch_images, labels=batch_labels)
        
        loss = outputs[0]
        epoch_loss.append(loss.item())
        loss.backward()
        optim.step()
        
        correct = (torch.argmax(outputs["logits"], dim=1) == batch_labels).sum().item()
        acc = (100 * correct) / size
        epoch_acc.append(acc)
        
        # log the training metrics
        #if index % record_steps == 0:
           # wandb.log({'loss': loss, "acc" : acc})
        
    
    # calculate summary stats for each epoch 
    avg_accuracy = (sum(epoch_acc) / len(epoch_acc))
    avg_loss = (sum(epoch_loss) / len(epoch_loss))
    
    # we decay the loss over time 
    scheduler.step()
    
    # save checkpoints using torchscript 
    if epoch % save_checkpoint == 0:
        model.save_pretrained(f"Epoch {epoch}")
    
    # finding validation accuracy and loss
    val_acc, val_loss = evaluate(model, val_loader, val_dataset)
    #wandb.log({'validation accuracy': val_acc, "validation loss" : val_loss})
    
    # summary stats at the end of the episode
    print("evaluating on validation set")
    print(f"val loss: {round(val_loss, 4)}, val acc: {round(val_acc, 4)}%")
    print(f"Epoch: {epoch}  avg loss: {round(avg_loss, 4)} avg acc: {round(avg_accuracy, 4)}%")'''

'tqdm = partial(tqdm, position=0, leave=True)\n\n\nfor epoch in range(1, num_epochs+1):\n    # storing loss and accuracy across the epoch\n    epoch_loss = []\n    epoch_acc = []\n    \n    print(f"Epoch {epoch}")\n    for index, batch in enumerate(tqdm(train_loader)):\n        model.train()\n        optim.zero_grad()\n\n        # extract images and labels from batch\n        batch_images = batch["pixel_values"].squeeze(1).to(device)\n        batch_labels = batch["labels"].to(device)\n        size = len(batch_images)\n        \n        outputs = model(pixel_values=batch_images, labels=batch_labels)\n        \n        loss = outputs[0]\n        epoch_loss.append(loss.item())\n        loss.backward()\n        optim.step()\n        \n        correct = (torch.argmax(outputs["logits"], dim=1) == batch_labels).sum().item()\n        acc = (100 * correct) / size\n        epoch_acc.append(acc)\n        \n        # log the training metrics\n        #if index % record_steps == 0:\n           # wa

In [None]:
# Disabled cell: test-set evaluation for the manual loop, kept as a string literal.
# NOTE(review): it unpacks two values from evaluate(), whose disabled definition
# returns only the accuracy.
'''test_acc, test_loss = evaluate(model, test_loader, test_dataset)
print(f"testing acc: {round(test_acc, 4)}%, testing loss: {round(test_loss, 4)}%")'''

'test_acc, test_loss = evaluate(model, test_loader, test_dataset)\nprint(f"testing acc: {round(test_acc, 4)}%, testing loss: {round(test_loss, 4)}%")'