In [1]:
# https://huggingface.co/docs/transformers/tasks/image_classification

!pip install transformers datasets evaluate
!pip install --upgrade transformers

Collecting transformers
  Downloading transformers-4.34.1-py3-none-any.whl (7.7 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.7/7.7 MB[0m [31m55.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets
  Downloading datasets-2.14.6-py3-none-any.whl (493 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m493.7/493.7 kB[0m [31m42.6 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub<1.0,>=0.16.4 (from transformers)
  Downloading huggingface_hub-0.18.0-py3-none-any.whl (301 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m302.0/302.0 kB[0m [31m31.4 MB/s[0m eta [36m0:00:00[0m
Collecting tokenizers<0.15,>=0.14 (from transformers)
  Downloading tokenizers-0.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl 

In [2]:
import evaluate
import numpy as np
from datasets import load_dataset
from huggingface_hub import notebook_login
from torchvision.transforms import RandomResizedCrop, Compose, Normalize, ToTensor
from transformers import AutoImageProcessor, DefaultDataCollator, AutoModelForImageClassification, TrainingArguments, Trainer

In [None]:
# Log in to the Hugging Face Hub. A token is needed later in this notebook because
# the training arguments set push_to_hub=True and the trainer pushes the model.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [3]:
# https://huggingface.co/docs/datasets/v1.11.0/package_reference/loading_methods.html#datasets.load_dataset
# Keep only the first 5000 examples of the food101 "train" split.
# Note: the recorded output shows the full archive is still downloaded and both
# splits are generated before the slice is taken.
food = load_dataset("food101", split="train[:5000]")

Downloading builder script:   0%|          | 0.00/6.21k [00:00<?, ?B/s]

Downloading metadata:   0%|          | 0.00/5.56k [00:00<?, ?B/s]

Downloading readme:   0%|          | 0.00/10.3k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/5.00G [00:00<?, ?B/s]

Downloading data files:   0%|          | 0/2 [00:00<?, ?it/s]

Downloading data:   0%|          | 0.00/1.47M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/489k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/75750 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/25250 [00:00<?, ? examples/s]

In [4]:
# Shuffle and split the 5000 examples into 80% train / 20% test.
# The seed is fixed so the same examples land in each split on every run;
# without it, reported accuracy is measured on a different test set each time.
food = food.train_test_split(test_size=0.2, seed=42)

In [5]:
# Peek at the first training example: a dict holding a PIL "image" and an integer "label".
food["train"][0]

{'image': <PIL.JpegImagePlugin.JpegImageFile image mode=RGB size=512x512>,
 'label': 53}

In [6]:
# Build lookup tables in both directions between class names and class ids,
# so the model can report predictions by name. Ids are stored as strings.

# class names, listed in id order
labels = food["train"].features["label"].names
print(labels)

# name -> id (as str) and id (as str) -> name
label2id = {name: str(idx) for idx, name in enumerate(labels)}
id2label = {str(idx): name for idx, name in enumerate(labels)}

['apple_pie', 'baby_back_ribs', 'baklava', 'beef_carpaccio', 'beef_tartare', 'beet_salad', 'beignets', 'bibimbap', 'bread_pudding', 'breakfast_burrito', 'bruschetta', 'caesar_salad', 'cannoli', 'caprese_salad', 'carrot_cake', 'ceviche', 'cheesecake', 'cheese_plate', 'chicken_curry', 'chicken_quesadilla', 'chicken_wings', 'chocolate_cake', 'chocolate_mousse', 'churros', 'clam_chowder', 'club_sandwich', 'crab_cakes', 'creme_brulee', 'croque_madame', 'cup_cakes', 'deviled_eggs', 'donuts', 'dumplings', 'edamame', 'eggs_benedict', 'escargots', 'falafel', 'filet_mignon', 'fish_and_chips', 'foie_gras', 'french_fries', 'french_onion_soup', 'french_toast', 'fried_calamari', 'fried_rice', 'frozen_yogurt', 'garlic_bread', 'gnocchi', 'greek_salad', 'grilled_cheese_sandwich', 'grilled_salmon', 'guacamole', 'gyoza', 'hamburger', 'hot_and_sour_soup', 'hot_dog', 'huevos_rancheros', 'hummus', 'ice_cream', 'lasagna', 'lobster_bisque', 'lobster_roll_sandwich', 'macaroni_and_cheese', 'macarons', 'miso_sou

In [7]:
# Spot-check both lookup tables: id "79" -> class name, and "prime_rib" -> id.
id2label[str(79)], label2id["prime_rib"]

('prime_rib', '79')

In [None]:
# Load the image processor that belongs to the pretrained ViT checkpoint.
# Its size, image_mean and image_std settings are read by the transform cell below.
checkpoint = "google/vit-base-patch16-224-in21k"
image_processor = AutoImageProcessor.from_pretrained(checkpoint)

In [None]:
# Training-time augmentation pipeline: take a random crop resized to the model's
# input size, convert it to a tensor, then normalize with the processor's
# per-channel mean and standard deviation.

# The processor describes its target size either as a single "shortest_edge"
# value or as an explicit height/width pair.
if "shortest_edge" in image_processor.size:
    size = image_processor.size["shortest_edge"]
else:
    size = (image_processor.size["height"], image_processor.size["width"])

normalize = Normalize(mean=image_processor.image_mean, std=image_processor.image_std)
_transforms = Compose([RandomResizedCrop(size), ToTensor(), normalize])

In [None]:
# create a preprocessing func to apply the transforms, and return the pixel_values
# which are the inputs to the model, of the image
def transforms(examples):
    """Batch transform: replace the raw images with model-ready tensors.

    Every image in examples["image"] is converted to RGB and passed through the
    module-level `_transforms` pipeline. The results are stored under
    "pixel_values", the "image" entry is removed, and the same `examples`
    mapping is returned.
    """
    pixel_values = []
    for picture in examples["image"]:
        pixel_values.append(_transforms(picture.convert("RGB")))
    examples["pixel_values"] = pixel_values
    del examples["image"]
    return examples

In [None]:
# Attach the preprocessing function to both splits.
# NOTE(review): with_transform is documented as running the function on the fly when
# rows are accessed, not eagerly over the whole dataset -- confirm against the datasets docs.
food = food.with_transform(transforms)

In [None]:
# "very simple data collator that simply collates batches of dict-like objects and
# performs special handling for potential keys named label and label_ids"
# No custom batching logic is configured; the collator is used with its defaults.
data_collator = DefaultDataCollator()

In [None]:
# Load the "accuracy" metric from the evaluate library; compute_metrics below calls it.
accuracy = evaluate.load("accuracy")

In [None]:
# create a function that passes your predictions and labels to calculate the accuracy
def compute_metrics(eval_pred):
    """Score one evaluation pass with the module-level `accuracy` metric.

    `eval_pred` unpacks into (predictions, labels). The predicted class of each
    row is the index of its largest value along axis 1. Returns whatever
    `accuracy.compute` returns for those predictions against the labels.
    """
    scores, references = eval_pred
    predicted_ids = np.argmax(scores, axis=1)
    return accuracy.compute(predictions=predicted_ids, references=references)

In [None]:
# Load the pretrained ViT checkpoint with a classification head sized for this dataset.
# The head weights (classifier.weight / classifier.bias) are not part of the checkpoint
# and are newly initialized, as the warning printed under this cell reports.
model = AutoModelForImageClassification.from_pretrained(
    checkpoint,             # pretrained checkpoint name: google/vit-base-patch16-224-in21k
    num_labels=len(labels), # one output per class name
    id2label=id2label,      # id -> class name, so predictions can be reported by name
    label2id=label2id       # class name -> id
)

Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224-in21k and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Install/upgrade accelerate before building the Trainer, then print the installed version.
# NOTE(review): presumably added because Trainer on PyTorch requires accelerate -- confirm.
# NOTE(review): these installs run after transformers was already imported; a runtime
# restart may be needed for them to take effect. Consider moving them to the first cell.
!pip install accelerate -U
!pip install transformers[torch]

import accelerate
accelerate.__version__



'0.24.0'

In [None]:
# https://huggingface.co/docs/transformers/v4.34.1/en/main_classes/trainer#transformers.TrainingArguments
# Training configuration. With 16 images per step and 4 accumulation steps, each
# optimizer update sees 64 images per device.
training_args = TrainingArguments(
    output_dir="my_awesome_food_model", # local directory for checkpoints; also the name of the Hub repo pushed to
    remove_unused_columns=False,        # keep the "image" column, which the transform needs to build pixel_values
    evaluation_strategy="epoch",        # run evaluation on eval_dataset at the end of every epoch
    save_strategy="epoch",              # save a checkpoint at the end of every epoch (same schedule as evaluation)
    learning_rate=5e-5,                 # peak learning rate reached after warmup
    per_device_train_batch_size=16,     # 16 images per device per training step
    gradient_accumulation_steps=4,      # accumulate gradients over 4 steps before each optimizer update
    per_device_eval_batch_size=16,      # 16 images per device per evaluation batch
    num_train_epochs=3,                 # 3 passes over the training split
    warmup_ratio=0.1,                   # first 10% of training steps ramp the learning rate up from 0
    logging_steps=10,                   # log training loss every 10 update steps
    load_best_model_at_end=True,        # after training, reload the checkpoint with the best metric value
    metric_for_best_model="accuracy",   # metric (from compute_metrics) used to pick the best checkpoint
    push_to_hub=True                    # push this model to the hub every time the model is saved
)

trainer = Trainer(
    model=model,                        # the ViT classifier loaded with AutoModelForImageClassification
    args=training_args,                 # the TrainingArguments defined above
    data_collator=data_collator,        # DefaultDataCollator
    train_dataset=food["train"],        # train part of the dataset, 80%
    eval_dataset=food["test"],          # test part of the dataset, 20%
    tokenizer=image_processor,          # image processor, passed here so it is saved with each checkpoint
    compute_metrics=compute_metrics     # turns eval predictions into an accuracy score
)

# Run fine-tuning; the run recorded below took about 520 seconds.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
0,2.7593,2.553481,0.837
2,1.6207,1.638704,0.911


TrainOutput(global_step=186, training_loss=2.45877472559611, metrics={'train_runtime': 520.7807, 'train_samples_per_second': 23.042, 'train_steps_per_second': 0.357, 'total_flos': 9.232831524962304e+17, 'train_loss': 2.45877472559611, 'epoch': 2.98})

In [None]:
# Upload the trained model to the Hugging Face Hub; the repo URL is returned and shown below.
trainer.push_to_hub()

'https://huggingface.co/REDACTED/my_awesome_food_model/tree/main/'

In [None]:
# Load the first 10 examples of the food101 validation split for a quick
# inference check.
ds = load_dataset("food101", split="validation[:10]")
# Index the row first, then the column: ds["image"][0] would decode every
# image in the column only to keep the first one.
image = ds[0]["image"]

In [None]:
from transformers import pipeline

# NOTE(review): a bare name with no "user/" prefix is tried as a local path first, so
# this most likely loads the ./my_awesome_food_model directory written by the Trainer
# rather than someone else's Hub repo -- confirm.
classifier = pipeline("image-classification", model="my_awesome_food_model")
classifier(image)

[{'score': 0.3021170198917389, 'label': 'beignets'},
 {'score': 0.016185035929083824, 'label': 'hamburger'},
 {'score': 0.0160551555454731, 'label': 'chicken_wings'},
 {'score': 0.014358025044202805, 'label': 'ramen'},
 {'score': 0.013701029121875763, 'label': 'bruschetta'}]

In [None]:
# Archive the whole training output directory (every checkpoint included) into one zip file.
!zip -r /content/my_awesome_food_model.zip /content/my_awesome_food_model

  adding: content/my_awesome_food_model/ (stored 0%)
  adding: content/my_awesome_food_model/checkpoint-125/ (stored 0%)
  adding: content/my_awesome_food_model/checkpoint-125/trainer_state.json (deflated 71%)
  adding: content/my_awesome_food_model/checkpoint-125/scheduler.pt (deflated 56%)
  adding: content/my_awesome_food_model/checkpoint-125/rng_state.pth (deflated 25%)
  adding: content/my_awesome_food_model/checkpoint-125/config.json (deflated 66%)
  adding: content/my_awesome_food_model/checkpoint-125/training_args.bin (deflated 51%)
  adding: content/my_awesome_food_model/checkpoint-125/preprocessor_config.json (deflated 47%)
  adding: content/my_awesome_food_model/checkpoint-125/optimizer.pt (deflated 8%)
  adding: content/my_awesome_food_model/checkpoint-125/pytorch_model.bin (deflated 7%)
  adding: content/my_awesome_food_model/config.json (deflated 66%)
  adding: content/my_awesome_food_model/checkpoint-62/ (stored 0%)
  adding: content/my_awesome_food_model/checkpoint-62/t

In [None]:
# Colab-only: hand the zip archive to the browser as a file download.
from google.colab import files
files.download("/content/my_awesome_food_model.zip")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [12]:
from transformers import pipeline

# Smoke test: load the fine-tuned model by its "user/name" repo id and classify an image given by URL.
classifier = pipeline("image-classification", model="REDACTED/my_awesome_food_model")
# NOTE(review): the original note expected "guacamole", but the URL's filename is Kraut1.png
# and the recorded output below is the beignets result -- confirm which image was actually run.
classifier("https://www.wienerschnitzel.com/wp-content/uploads/revslider/9-4-crowd-oc-sept-coupons-1/Kraut1.png")
# Alternative input: the beignets image from the task guide.
# classifier("https://huggingface.co/datasets/huggingface/documentation-images/resolve/main/beignets-task-guide.png")

[{'score': 0.3021528422832489, 'label': 'beignets'},
 {'score': 0.016174709424376488, 'label': 'hamburger'},
 {'score': 0.016062289476394653, 'label': 'chicken_wings'},
 {'score': 0.014321570284664631, 'label': 'ramen'},
 {'score': 0.013702934607863426, 'label': 'bruschetta'}]