In [74]:
!pip install datasets pillow rich transformers evaluate accelerate torchvision scikit-learn numpy pytorch-lightning ipywidgets huggingface_hub --quiet

In [75]:
from huggingface_hub import interpreter_login

interpreter_login()


    _|    _|  _|    _|    _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|_|_|_|    _|_|      _|_|_|  _|_|_|_|
    _|    _|  _|    _|  _|        _|          _|    _|_|    _|  _|            _|        _|    _|  _|        _|
    _|_|_|_|  _|    _|  _|  _|_|  _|  _|_|    _|    _|  _|  _|  _|  _|_|      _|_|_|    _|_|_|_|  _|        _|_|_|
    _|    _|  _|    _|  _|    _|  _|    _|    _|    _|    _|_|  _|    _|      _|        _|    _|  _|        _|
    _|    _|    _|_|      _|_|_|    _|_|_|  _|_|_|  _|      _|    _|_|_|      _|        _|    _|    _|_|_|  _|_|_|_|

    A token is already saved on your machine. Run `huggingface-cli whoami` to get more information or `huggingface-cli logout` if you want to log out.
    Setting a new token will erase the existing one.
    To login, `huggingface_hub` requires a token generated from https://huggingface.co/settings/tokens .
Token is valid (permission: write).
Your token has been saved in your configured git credential helpers (osxkeychain).

In [76]:
import datasets
from datasets import load_dataset

In [77]:
# inspect(datasets.Image, help=True)
food = load_dataset("food101", split="train")

In [78]:
print(f'food: {food}\n')

labels = food.features['label'].names
first_label = food[0]['label']
first_image = food[0]['image']
print(f'all labels: {labels}\n')
print(f'food[0]: {food[0]}')
print(f'Example label: {first_label}')
print(f'Example image filename: {first_image}')

food: Dataset({
    features: ['image', 'label'],
    num_rows: 75750
})

all labels: ['apple_pie', 'baby_back_ribs', 'baklava', 'beef_carpaccio', 'beef_tartare', 'beet_salad', 'beignets', 'bibimbap', 'bread_pudding', 'breakfast_burrito', 'bruschetta', 'caesar_salad', 'cannoli', 'caprese_salad', 'carrot_cake', 'ceviche', 'cheesecake', 'cheese_plate', 'chicken_curry', 'chicken_quesadilla', 'chicken_wings', 'chocolate_cake', 'chocolate_mousse', 'churros', 'clam_chowder', 'club_sandwich', 'crab_cakes', 'creme_brulee', 'croque_madame', 'cup_cakes', 'deviled_eggs', 'donuts', 'dumplings', 'edamame', 'eggs_benedict', 'escargots', 'falafel', 'filet_mignon', 'fish_and_chips', 'foie_gras', 'french_fries', 'french_onion_soup', 'french_toast', 'fried_calamari', 'fried_rice', 'frozen_yogurt', 'garlic_bread', 'gnocchi', 'greek_salad', 'grilled_cheese_sandwich', 'grilled_salmon', 'guacamole', 'gyoza', 'hamburger', 'hot_and_sour_soup', 'hot_dog', 'huevos_rancheros', 'hummus', 'ice_cream', 'lasagna', '

In [79]:
hot_dog = food.filter(lambda x: x['label'] == 49)
print(f'hot_dog: {hot_dog}')

hot_dog: Dataset({
    features: ['image', 'label'],
    num_rows: 750
})


In [80]:
# Set all hot dogs to have the label 0
hot_dog = hot_dog.map(lambda x: {'image': x['image'], 'label': 0}, num_proc=8)
# Set all other images to have the label 1
not_hot_dog = food.filter(lambda x: x['label'] != 49, num_proc=8).map(lambda x: {'image': x['image'], 'label': 1}, num_proc=8)

In [81]:
# Shuffle the not_hot_dog dataset
not_hot_dog = not_hot_dog.shuffle(seed=42)

In [100]:
# Add some pizza to the not_hot_dog dataset
pizza = food.filter(lambda x: x['label'] == 63, num_proc=8).map(lambda x: {'image': x['image'], 'label': 1}, num_proc=8)
# Configure the percentage pizza to be added
pizza = pizza.train_test_split(test_size=0.5)['train']

# Now add the pizza to the not_hot_dog dataset
not_hot_dog_with_pizza = datasets.concatenate_datasets([not_hot_dog, pizza])

# re-shuffle the not_hot_dog_with_pizza dataset and set it to the non_hot_dog dataset
not_hot_dog = not_hot_dog_with_pizza.shuffle(seed=42)

In [86]:
# Check pre and post filtering lengths
print(f'hot_dog: {len(hot_dog)}')
print(f'not_hot_dog (before): {len(not_hot_dog)}')
not_hot_dog = not_hot_dog.select(range(len(hot_dog)))
print(f'not_hot_dog: {len(not_hot_dog)}')

hot_dog: 750
not_hot_dog (before): 75750
not_hot_dog: 750


In [87]:
# Recombine datasets of hotdog and not_hot_dog
all_food = datasets.concatenate_datasets([hot_dog, not_hot_dog])


In [88]:
# Split the dataset into training and testing
all_food = all_food.train_test_split(test_size=0.2)
print(f'all_food: {all_food}')


all_food: DatasetDict({
    train: Dataset({
        features: ['image', 'label'],
        num_rows: 1200
    })
    test: Dataset({
        features: ['image', 'label'],
        num_rows: 300
    })
})


In [89]:
from transformers import AutoImageProcessor

# Transfer learning to load the pre-trained google vit model
checkpoint = "google/vit-base-patch16-224-in21k"
image_processor = AutoImageProcessor.from_pretrained(checkpoint)

In [90]:
from torchvision.transforms import RandomResizedCrop, Compose, Normalize, ToTensor

# Define the image transformations and normalization to prevent overfitting
normalize = Normalize(mean=image_processor.image_mean, std=image_processor.image_std)
size = (
    image_processor.size["shortest_edge"]
    if "shortest_edge" in image_processor.size
    else (image_processor.size["height"], image_processor.size["width"])
)
_transforms = Compose([RandomResizedCrop(size), ToTensor(), normalize])

In [91]:
def transforms(examples):
    examples["pixel_values"] = [_transforms(img.convert("RGB")) for img in examples["image"]]
    del examples["image"]
    return examples

In [92]:
# Apply the transformations to the dataset and save it to the original variable
all_food = all_food.with_transform(transforms)

In [93]:
from transformers import DefaultDataCollator

# Define the data collator which will be used to batch the data
data_collator = DefaultDataCollator()

In [94]:
import evaluate

# Load the accuracy metric from the hub
accuracy = evaluate.load('accuracy')

import numpy as np


# Create a function to compute the accuracy of the model
def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    predictions = np.argmax(predictions, axis=1)
    return accuracy.compute(predictions=predictions, references=labels)

In [95]:
print(f'all_food: {all_food}')

# Provide a label to id and id to label mapping
label_to_id = {"hot_dog": 0, "not_hot_dog": 1}
id_to_label = {0: "hot_dog", 1: "not_hot_dog"}

all_food: DatasetDict({
    train: Dataset({
        features: ['image', 'label'],
        num_rows: 1200
    })
    test: Dataset({
        features: ['image', 'label'],
        num_rows: 300
    })
})


In [96]:
# # Add gradient boosting to detect edge cases
# from sklearn.ensemble import GradientBoostingClassifier

# # Define the model
# model = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0, max_depth=1, random_state=0)

# # Train the model
# model.fit(all_food["train"]["pixel_values"], all_food["train"]["label"])


In [97]:
print(f'all_food: {all_food}')

all_food: DatasetDict({
    train: Dataset({
        features: ['image', 'label'],
        num_rows: 1200
    })
    test: Dataset({
        features: ['image', 'label'],
        num_rows: 300
    })
})


In [98]:
from transformers import AutoModelForImageClassification, TrainingArguments, Trainer

model = AutoModelForImageClassification.from_pretrained(
    checkpoint,
    num_labels=2,
    id2label=id_to_label,
    label2id=label_to_id,
)

Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [99]:
training_args = TrainingArguments(
    output_dir="not_hotdog",
    remove_unused_columns=False,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    learning_rate=5e-6,
    per_device_train_batch_size=16,
    gradient_accumulation_steps=6,
    per_device_eval_batch_size=16,
    num_train_epochs=10,
    warmup_ratio=0.1,
    logging_steps=10,
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    push_to_hub=True,
)

trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=all_food["train"],
    eval_dataset=all_food["test"],
    tokenizer=image_processor,
    compute_metrics=compute_metrics,
)

# Train the model!
trainer.train()

dataloader_config = DataLoaderConfiguration(dispatch_batches=None, split_batches=False, even_batches=True, use_seedable_sampler=True)


  0%|          | 0/120 [00:00<?, ?it/s]

{'loss': 0.6999, 'grad_norm': 0.9236725568771362, 'learning_rate': 4.166666666666667e-06, 'epoch': 0.8}


  0%|          | 0/19 [00:00<?, ?it/s]

{'eval_loss': 0.6830431222915649, 'eval_accuracy': 0.6166666666666667, 'eval_runtime': 11.5518, 'eval_samples_per_second': 25.97, 'eval_steps_per_second': 1.645, 'epoch': 0.96}
{'loss': 0.6815, 'grad_norm': 1.0281648635864258, 'learning_rate': 4.62962962962963e-06, 'epoch': 1.6}


  0%|          | 0/19 [00:00<?, ?it/s]

{'eval_loss': 0.6534063816070557, 'eval_accuracy': 0.7133333333333334, 'eval_runtime': 11.4935, 'eval_samples_per_second': 26.102, 'eval_steps_per_second': 1.653, 'epoch': 2.0}
{'loss': 0.6505, 'grad_norm': 0.9051032662391663, 'learning_rate': 4.166666666666667e-06, 'epoch': 2.4}


  0%|          | 0/19 [00:00<?, ?it/s]

{'eval_loss': 0.6296890377998352, 'eval_accuracy': 0.72, 'eval_runtime': 11.0955, 'eval_samples_per_second': 27.038, 'eval_steps_per_second': 1.712, 'epoch': 2.96}
{'loss': 0.6274, 'grad_norm': 1.0089205503463745, 'learning_rate': 3.7037037037037037e-06, 'epoch': 3.2}
{'loss': 0.613, 'grad_norm': 0.8708637952804565, 'learning_rate': 3.240740740740741e-06, 'epoch': 4.0}


  0%|          | 0/19 [00:00<?, ?it/s]

{'eval_loss': 0.6032102704048157, 'eval_accuracy': 0.7666666666666667, 'eval_runtime': 11.1982, 'eval_samples_per_second': 26.79, 'eval_steps_per_second': 1.697, 'epoch': 4.0}
{'loss': 0.5935, 'grad_norm': 1.046048879623413, 'learning_rate': 2.7777777777777783e-06, 'epoch': 4.8}


  0%|          | 0/19 [00:00<?, ?it/s]

{'eval_loss': 0.5850763916969299, 'eval_accuracy': 0.79, 'eval_runtime': 13.9234, 'eval_samples_per_second': 21.546, 'eval_steps_per_second': 1.365, 'epoch': 4.96}
{'loss': 0.5805, 'grad_norm': 0.8641161918640137, 'learning_rate': 2.314814814814815e-06, 'epoch': 5.6}


  0%|          | 0/19 [00:00<?, ?it/s]

{'eval_loss': 0.5685244798660278, 'eval_accuracy': 0.8133333333333334, 'eval_runtime': 12.4622, 'eval_samples_per_second': 24.073, 'eval_steps_per_second': 1.525, 'epoch': 6.0}
{'loss': 0.559, 'grad_norm': 0.9292294979095459, 'learning_rate': 1.8518518518518519e-06, 'epoch': 6.4}


  0%|          | 0/19 [00:00<?, ?it/s]

{'eval_loss': 0.5546382665634155, 'eval_accuracy': 0.8533333333333334, 'eval_runtime': 13.6445, 'eval_samples_per_second': 21.987, 'eval_steps_per_second': 1.392, 'epoch': 6.96}
{'loss': 0.5596, 'grad_norm': 0.8832235932350159, 'learning_rate': 1.3888888888888892e-06, 'epoch': 7.2}
{'loss': 0.5524, 'grad_norm': 0.9021896719932556, 'learning_rate': 9.259259259259259e-07, 'epoch': 8.0}


  0%|          | 0/19 [00:00<?, ?it/s]

{'eval_loss': 0.5484529137611389, 'eval_accuracy': 0.8466666666666667, 'eval_runtime': 12.8302, 'eval_samples_per_second': 23.382, 'eval_steps_per_second': 1.481, 'epoch': 8.0}
{'loss': 0.5432, 'grad_norm': 0.9815702438354492, 'learning_rate': 4.6296296296296297e-07, 'epoch': 8.8}


  0%|          | 0/19 [00:00<?, ?it/s]

{'eval_loss': 0.5444304943084717, 'eval_accuracy': 0.8533333333333334, 'eval_runtime': 12.516, 'eval_samples_per_second': 23.969, 'eval_steps_per_second': 1.518, 'epoch': 8.96}
{'loss': 0.5401, 'grad_norm': 0.9436311721801758, 'learning_rate': 0.0, 'epoch': 9.6}


  0%|          | 0/19 [00:00<?, ?it/s]

{'eval_loss': 0.5427874326705933, 'eval_accuracy': 0.85, 'eval_runtime': 12.7112, 'eval_samples_per_second': 23.601, 'eval_steps_per_second': 1.495, 'epoch': 9.6}
{'train_runtime': 1482.7839, 'train_samples_per_second': 8.093, 'train_steps_per_second': 0.081, 'train_loss': 0.6000442981719971, 'epoch': 9.6}


TrainOutput(global_step=120, training_loss=0.6000442981719971, metrics={'train_runtime': 1482.7839, 'train_samples_per_second': 8.093, 'train_steps_per_second': 0.081, 'train_loss': 0.6000442981719971, 'epoch': 9.6})