In [1]:
import numpy as np
from skimage import exposure
from skimage.filters import unsharp_mask
from skimage import io
from PIL import Image
import os
import torch
import torchvision.transforms as transforms
from torchvision.transforms import v2,ToTensor, ToPILImage, InterpolationMode
from torchvision.datasets import ImageFolder
from torch.utils.data import DataLoader, Dataset
import torch.nn as nn
import matplotlib.pyplot as plt
from distutils.dir_util import copy_tree
from transformers import ViTImageProcessor, ViTForImageClassification
from transformers import TrainingArguments, Trainer
from datasets import load_dataset
import random
%matplotlib inline

2024-05-24 12:25:56.256810: E external/local_xla/xla/stream_executor/cuda/cuda_dnn.cc:9261] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
2024-05-24 12:25:56.256921: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:607] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
2024-05-24 12:25:56.393327: E external/local_xla/xla/stream_executor/cuda/cuda_blas.cc:1515] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


In [2]:
# Directory of the RAF-DB image dataset as mounted under Kaggle's input folder;
# it is passed to the dataset loader as `data_dir`.
dataset_path = "/kaggle/input/raf-db-dataset"

In [3]:
# Build the dataset with the generic "imagefolder" loader pointed at the image tree.
dataset = load_dataset("imagefolder", data_dir=dataset_path)

# Class names in label-id order, read from the train split's "label" feature.
labels = dataset["train"].features["label"].names

# Two-way lookup between integer class ids and class names (handed to the model config later).
idx2label = dict(enumerate(labels))
label2idx = {class_name: class_id for class_id, class_name in idx2label.items()}

Resolving data files:   0%|          | 0/24543 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/6137 [00:00<?, ?it/s]

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

In [4]:
def show_images(images):
    """Draw the given images side by side in one row, with the axes hidden.

    Args:
        images: Sized sequence of images in any form ``plt.imshow`` accepts.
    """
    plt.figure(figsize=(15, 15))
    column_count = len(images)
    # Subplot positions are 1-based, so enumerate from 1.
    for slot, picture in enumerate(images, start=1):
        plt.subplot(1, column_count, slot)
        plt.imshow(picture)
        plt.axis("off")
        
def show_images_with_labels(images, labels):
    """Draw the given images in one row, each titled with its label, axes hidden.

    Args:
        images: Sized sequence of images in any form ``plt.imshow`` accepts.
        labels: Titles paired positionally with ``images``; pairing stops at the
            shorter of the two sequences.
    """
    plt.figure(figsize=(15, 15))
    # The grid always has one column per image, even if fewer labels are given.
    column_count = len(images)
    slot = 0
    for picture, caption in zip(images, labels):
        slot += 1  # subplot positions are 1-based
        plt.subplot(1, column_count, slot)
        plt.imshow(picture)
        plt.title(caption)
        plt.axis("off")

In [5]:
# Pick five distinct training indices and fetch each example exactly once.
# Indexing the split loads (and decodes) the whole row, so reading 'image' and
# 'label' from the same fetched row avoids loading every sampled image twice.
random_indices = random.sample(range(len(dataset["train"])), 5)
random_samples = [dataset["train"][i] for i in random_indices]
random_images = [sample['image'] for sample in random_samples]
random_labels = [sample['label'] for sample in random_samples]

In [6]:
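# Optional sanity check (disabled): uncomment to preview the sampled images titled with their labels.
# show_images_with_labels(random_images, random_labels)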

In [7]:
# Hub identifier of the pretrained checkpoint; used to load both the image
# processor and the classification model.
model_name = 'google/vit-base-patch16-224'

In [8]:
class HistogramEqualizer(torch.nn.Module):
    """Transform that runs scikit-image's ``exposure.equalize_adapthist`` on an image.

    Written as an ``nn.Module`` so it can be placed inside a torchvision
    ``Compose`` pipeline.
    """

    def forward(self, img):
        """Equalize ``img`` and return the result as an 8-bit PIL image.

        Args:
            img: Image convertible with ``np.asarray`` (e.g. a PIL image).

        Returns:
            A ``PIL.Image.Image`` built from the equalized pixels.
        """
        pixels = np.asarray(img)
        equalized = exposure.equalize_adapthist(pixels)
        # Rescale to the 0-255 byte range (assumes the float result lies in [0, 1]);
        # the uint8 cast drops the fractional part.
        as_bytes = np.uint8(equalized * 255)
        return Image.fromarray(as_bytes)

In [9]:
class UnsharpMasking(torch.nn.Module):
    """Transform that sharpens an image with scikit-image's ``unsharp_mask``.

    Written as an ``nn.Module`` so it can be placed inside a torchvision
    ``Compose`` pipeline.
    """

    def forward(self, img):
        """Sharpen ``img`` and return the result as an 8-bit PIL image.

        Args:
            img: Image convertible with ``np.asarray``; the channel dimension
                is taken to be axis 2 (height x width x channels).

        Returns:
            A ``PIL.Image.Image`` built from the sharpened pixels.
        """
        pixels = np.asarray(img)
        # radius=5 and amount=1 are the blur radius and sharpening strength.
        sharpened = unsharp_mask(pixels, radius=5, amount=1, channel_axis=2)
        # Rescale to the 0-255 byte range (assumes the float result lies in [0, 1]);
        # the uint8 cast drops the fractional part.
        as_bytes = np.uint8(sharpened * 255)
        return Image.fromarray(as_bytes)

In [10]:
# Load the preprocessing settings published with the checkpoint; they supply the
# normalization statistics and the input resolution used by both pipelines.
processor = ViTImageProcessor.from_pretrained(model_name, return_tensors = 'pt')

image_mean = processor.image_mean
image_std = processor.image_std
size = processor.size["height"]

normalize = v2.Normalize(mean=image_mean, std=image_std)

# Training pipeline: contrast equalization and sharpening, resize to the
# checkpoint's resolution, random augmentations, then tensor conversion and
# normalization.
# NOTE(review): torchvision flags v2.ToTensor as deprecated; consider migrating
# to v2.ToImage + v2.ToDtype(torch.float32, scale=True) in a follow-up.
train_transform = v2.Compose([
    HistogramEqualizer(),
    UnsharpMasking(),
    v2.Resize(
        (processor.size["height"], processor.size["width"]),
        interpolation=InterpolationMode.BILINEAR,
    ),
    v2.RandomHorizontalFlip(p=0.4),
    v2.RandomVerticalFlip(p=0.1),
    v2.RandomApply(transforms=[v2.RandomRotation(degrees=(0, 90))], p=0.5),
    v2.RandomApply(transforms=[v2.ColorJitter(brightness=0.3, hue=0.1)], p=0.3),
    v2.RandomApply(transforms=[v2.GaussianBlur(kernel_size=(5, 9))], p=0.3),
    v2.ToTensor(),
    normalize,
])

# Evaluation pipeline: the same deterministic steps, without any random augmentation.
test_transform = v2.Compose([
    HistogramEqualizer(),
    UnsharpMasking(),
    v2.Resize(
        (processor.size["height"], processor.size["width"]),
        interpolation=InterpolationMode.BILINEAR,
    ),
    v2.ToTensor(),
    normalize,
])

preprocessor_config.json:   0%|          | 0.00/160 [00:00<?, ?B/s]



In [11]:
def train_transforms(examples):
    """Add a ``pixel_values`` entry produced by the training pipeline.

    Args:
        examples: Batch dict whose ``'image'`` entry is a list of images
            exposing ``.convert`` (PIL-style).

    Returns:
        The same dict, mutated in place, with ``'pixel_values'`` holding one
        transformed result per input image.
    """
    # Force three channels first so every image enters the pipeline as RGB.
    rgb_images = (picture.convert("RGB") for picture in examples['image'])
    examples['pixel_values'] = list(map(train_transform, rgb_images))
    return examples

def test_transforms(examples):
    """Add a ``pixel_values`` entry produced by the evaluation pipeline.

    Args:
        examples: Batch dict whose ``'image'`` entry is a list of images
            exposing ``.convert`` (PIL-style).

    Returns:
        The same dict, mutated in place, with ``'pixel_values'`` holding one
        transformed result per input image.
    """
    processed = []
    for picture in examples['image']:
        # Force three channels first so every image enters the pipeline as RGB.
        processed.append(test_transform(picture.convert("RGB")))
    examples['pixel_values'] = processed
    return examples

# Attach the per-split transform functions: the train split gets the augmenting
# pipeline, the test split the deterministic one.
dataset['train'].set_transform(train_transforms)
dataset['test'].set_transform(test_transforms)

In [12]:
# Load the pretrained checkpoint with this dataset's label maps.
# ignore_mismatched_sizes=True allows the checkpoint's classifier weights to be
# skipped when their shapes differ from the head built for our label set.
model = ViTForImageClassification.from_pretrained(
    model_name,
    id2label=idx2label,
    label2id=label2idx,
    ignore_mismatched_sizes=True,
)

config.json:   0%|          | 0.00/69.7k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/346M [00:00<?, ?B/s]

Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([1000]) in the checkpoint and torch.Size([7]) in the model instantiated
- classifier.weight: found shape torch.Size([1000, 768]) in the checkpoint and torch.Size([7, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [13]:
from sklearn.metrics import accuracy_score
import numpy as np

def compute_metrics(eval_pred):
    """Compute classification accuracy for an evaluation pass.

    Args:
        eval_pred: Pair ``(predictions, labels)``; ``predictions`` holds
            per-class scores with classes along axis 1, ``labels`` the
            reference class ids.

    Returns:
        ``{"accuracy": <fraction of examples predicted correctly>}``.
    """
    scores, labels = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    # scikit-learn's signature is accuracy_score(y_true, y_pred). Accuracy is
    # symmetric, so the value is unchanged, but passing the arguments by name
    # keeps this correct if an asymmetric metric is swapped in later.
    return {"accuracy": accuracy_score(y_true=labels, y_pred=predicted_ids)}

In [14]:
def collate_fn(examples):
    """Assemble a list of dataset examples into one model-ready batch.

    Args:
        examples: List of dicts, each with a ``"pixel_values"`` tensor and a
            ``"label"`` value.

    Returns:
        Dict with ``"pixel_values"`` (the tensors stacked along a new leading
        dimension) and ``"labels"`` (a tensor of the label values).
    """
    image_tensors = []
    targets = []
    for item in examples:
        image_tensors.append(item["pixel_values"])
        targets.append(item["label"])
    return {
        "pixel_values": torch.stack(image_tensors),
        "labels": torch.tensor(targets),
    }

In [15]:
# Run name; also passed to TrainingArguments as its first positional argument.
experiment_name = "vit-base-face-recognition"
# Directory under Kaggle's working folder where the fine-tuned model is saved.
dir_ft = f'/kaggle/working/{experiment_name}/'

In [16]:
# Metric key (as returned by compute_metrics) used to pick the best checkpoint.
metric_name = "accuracy"

# Define Train Parameters
args = TrainingArguments(
    experiment_name,
    use_cpu = False,
    # Evaluate, log and save once per epoch; evaluation and save cadence match so
    # the best checkpoint can be reloaded when training finishes.
    evaluation_strategy="epoch",
    logging_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=96,
    per_device_eval_batch_size=96,
    num_train_epochs=10,
    weight_decay=0.01,
    load_best_model_at_end=True,
    metric_for_best_model=metric_name,
    logging_dir='logs',
    # Keep the raw 'image' column: the on-the-fly transforms read it to build
    # 'pixel_values'.
    remove_unused_columns=False,
    save_strategy = "epoch")

# Train
trainer = Trainer(
    model,
    args,
    train_dataset=dataset['train'],
    eval_dataset=dataset['test'],
    data_collator=collate_fn,
    compute_metrics=compute_metrics,
    tokenizer=processor,
)

# NOTE: a trailing `DataLoaderConfiguration(...)` call used to follow here. The
# name was never imported (NameError on a fresh kernel) and its result was never
# passed to anything, so the line was removed.


In [17]:
import wandb

# SECURITY: never hardcode the W&B API key in the notebook. A key was previously
# committed in this cell and must be treated as leaked -- revoke and rotate it.
# The key is now read from the WANDB_API_KEY environment variable (on Kaggle,
# populate it from a notebook secret before running this cell). When the
# variable is unset, key=None lets wandb fall back to its own credential lookup.
wandb.login(key=os.environ.get("WANDB_API_KEY"))

[34m[1mwandb[0m: W&B API key is configured. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [18]:
# Run fine-tuning with the configured Trainer; the returned TrainOutput is
# displayed as the cell result.
trainer.train()

[34m[1mwandb[0m: Currently logged in as: [33mm-motawie[0m ([33mnile-uni[0m). Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: wandb version 0.17.0 is available!  To upgrade, please run:
[34m[1mwandb[0m:  $ pip install wandb --upgrade
[34m[1mwandb[0m: Tracking run with wandb version 0.16.6
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/kaggle/working/wandb/run-20240524_122727-mgjr1my0[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off syncing.
[34m[1mwandb[0m: Syncing run [33mstill-salad-15[0m
[34m[1mwandb[0m: ⭐️ View project at [34m[4mhttps://wandb.ai/nile-uni/huggingface[0m
[34m[1mwandb[0m: 🚀 View run at [34m[4mhttps://wandb.ai/nile-uni/huggingface/runs/mgjr1my0[0m


Epoch,Training Loss,Validation Loss,Accuracy
1,1.06,0.66894,0.76369
2,0.6862,0.589666,0.793677
3,0.5444,0.525129,0.822686
4,0.4583,0.502638,0.825293
5,0.3968,0.497981,0.83116
6,0.3438,0.507806,0.823664
7,0.3045,0.499821,0.831486
8,0.2799,0.508232,0.830834
9,0.2605,0.509881,0.831812
10,0.2452,0.509104,0.83279


TrainOutput(global_step=2560, training_loss=0.4579571053385735, metrics={'train_runtime': 10421.283, 'train_samples_per_second': 23.55, 'train_steps_per_second': 0.246, 'total_flos': 1.901893635657437e+19, 'train_loss': 0.4579571053385735, 'epoch': 10.0})

In [19]:
# Write the trainer's model to the experiment's output directory (dir_ft).
trainer.save_model(dir_ft)