In [None]:
!pip install datasets pillow rich transformers evaluate accelerate torchvision scikit-learn numpy pytorch-lightning ipywidgets huggingface_hub --quiet

In [None]:
from huggingface_hub import interpreter_login

interpreter_login()

In [None]:
import datasets
from datasets import load_dataset

In [None]:
# inspect(datasets.Image, help=True)
food = load_dataset("food101", split="train")

In [None]:
print(f'food: {food}\n')

labels = food.features['label'].names
first_label = food[0]['label']
first_image = food[0]['image']
print(f'all labels: {labels}\n')
print(f'food[0]: {food[0]}')
print(f'Example label: {first_label}')
print(f'Example image filename: {first_image}')

In [None]:
hot_dog = food.filter(lambda x: x['label'] == 49)
print(f'hot_dog: {hot_dog}')

In [None]:
# Set all hot dogs to have the label 0
hot_dog = hot_dog.map(lambda x: {'image': x['image'], 'label': 0}, num_proc=8)
# Set all other images to have the label 1
not_hot_dog = food.filter(lambda x: x['label'] != 49, num_proc=8).map(lambda x: {'image': x['image'], 'label': 1}, num_proc=8)

In [None]:
# Check pre and post filtering lengths
print(f'hot_dog: {len(hot_dog)}')
print(f'not_hot_dog (before): {len(not_hot_dog)}')
not_hot_dog = not_hot_dog.select(range(len(hot_dog)))
print(f'not_hot_dog: {len(not_hot_dog)}')

In [None]:
# Recombine datasets of hotdog and not_hot_dog
all_food = datasets.concatenate_datasets([hot_dog, not_hot_dog])

In [None]:
# Save the dataset to disk
all_food.save_to_disk("hotdog_not_hotdog")

In [None]:
# Split the dataset into training and testing
all_food = all_food.train_test_split(test_size=0.2)
print(f'all_food: {all_food}')


In [None]:
from transformers import AutoImageProcessor

checkpoint = "google/vit-base-patch16-224-in21k"
image_processor = AutoImageProcessor.from_pretrained(checkpoint)

In [None]:
from torchvision.transforms import RandomResizedCrop, Compose, Normalize, ToTensor

# Define the image transformations and normalization to prevent overfitting
normalize = Normalize(mean=image_processor.image_mean, std=image_processor.image_std)
size = (
    image_processor.size["shortest_edge"]
    if "shortest_edge" in image_processor.size
    else (image_processor.size["height"], image_processor.size["width"])
)
_transforms = Compose([RandomResizedCrop(size), ToTensor(), normalize])

In [None]:
def transforms(examples):
    examples["pixel_values"] = [_transforms(img.convert("RGB")) for img in examples["image"]]
    del examples["image"]
    return examples

In [None]:
# Apply the transformations to the dataset and save it to the original variable
all_food = all_food.with_transform(transforms)

In [None]:
from transformers import DefaultDataCollator

# Define the data collator which will be used to batch the data
data_collator = DefaultDataCollator()

In [None]:
import evaluate

# Load the accuracy metric from the hub
accuracy = evaluate.load('accuracy')

import numpy as np


# Create a function to compute the accuracy of the model
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

In [None]:
print(f'all_food: {all_food}')

# Provide a label to id and id to label mapping
labels = all_food['train'].features["label"].names
label_to_id, id_to_label = dict(), dict()
for i, label in enumerate(labels):
    label_to_id[label] = str(i)
    id_to_label[str(i)] = label

In [None]:
print(f'all_food: {all_food}')

In [None]:
from transformers import AutoModelForImageClassification, TrainingArguments, Trainer

model = AutoModelForImageClassification.from_pretrained(
    checkpoint,
    num_labels=len(labels),
    id2label=id_to_label,
    label2id=label_to_id,
)

In [28]:
training_args = TrainingArguments(
    output_dir="not_hotdog_vit_base_patch16_224_in21k",
    remove_unused_columns=False,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-5,
    per_device_train_batch_size=16,
    gradient_accumulation_steps=4,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    warmup_ratio=0.1,
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    push_to_hub=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=all_food["train"],
    eval_dataset=all_food["test"],
    tokenizer=image_processor,
    compute_metrics=compute_metrics,
)

# Train the model!
trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


  0%|          | 0/54 [00:00<?, ?it/s]

{'loss': 4.467, 'grad_norm': 2.793170690536499, 'learning_rate': 4.5833333333333334e-05, 'epoch': 0.53}


  0%|          | 0/19 [00:00<?, ?it/s]

{'eval_loss': 3.165264129638672, 'eval_accuracy': 0.94, 'eval_runtime': 11.9627, 'eval_samples_per_second': 25.078, 'eval_steps_per_second': 1.588, 'epoch': 0.96}
{'loss': 3.4884, 'grad_norm': 3.6933586597442627, 'learning_rate': 3.541666666666667e-05, 'epoch': 1.07}
{'loss': 2.745, 'grad_norm': 3.675457000732422, 'learning_rate': 2.5e-05, 'epoch': 1.6}


  0%|          | 0/19 [00:00<?, ?it/s]

{'eval_loss': 2.318723201751709, 'eval_accuracy': 0.9433333333333334, 'eval_runtime': 10.4579, 'eval_samples_per_second': 28.686, 'eval_steps_per_second': 1.817, 'epoch': 1.97}
{'loss': 2.3875, 'grad_norm': 3.6425845623016357, 'learning_rate': 1.4583333333333335e-05, 'epoch': 2.13}
{'loss': 2.2085, 'grad_norm': 3.5709142684936523, 'learning_rate': 4.166666666666667e-06, 'epoch': 2.67}


  0%|          | 0/19 [00:00<?, ?it/s]

{'eval_loss': 2.1579627990722656, 'eval_accuracy': 0.9433333333333334, 'eval_runtime': 10.4603, 'eval_samples_per_second': 28.68, 'eval_steps_per_second': 1.816, 'epoch': 2.88}
{'train_runtime': 408.8327, 'train_samples_per_second': 8.806, 'train_steps_per_second': 0.132, 'train_loss': 2.9908782287880227, 'epoch': 2.88}


TrainOutput(global_step=54, training_loss=2.9908782287880227, metrics={'train_runtime': 408.8327, 'train_samples_per_second': 8.806, 'train_steps_per_second': 0.132, 'train_loss': 2.9908782287880227, 'epoch': 2.88})