<a target="_blank" href="https://colab.research.google.com/github/mrdbourke/learn-huggingface/blob/main/notebooks/hugging_face_text_classification_tutorial.ipynb">
  <img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/>
</a>

In [1]:
# Goal
# Start with dataset
  # Could generate this dataset or pre-existing
  # Can have the dataset labelled manually or labelled with an LLM
  # Could label this dataset manually or have it zero-shot labelled
# Build a custom text classifier on labelled data
  # Test text classifier on labelled data vs zero-shot model

In [2]:
# Next:
 # Create a small dataset with GPT4o, e.g. 50x spam/not_spam emails and train a classifier on it ✅
    # Done, see notebook: https://colab.research.google.com/drive/14xr3KN_HINY5LjV0s2E-4i7v0o_XI3U8?usp=sharing 
 # Save the dataset to Hugging Face Datasets ✅
    # Done, see dataset: https://huggingface.co/datasets/mrdbourke/learn_hf_food_not_food_image_captions
 # Train a classifier on it
 # Save the model to the Hugging Face Model Hub
 # Create a with Gradio and test the model in the wild 

In [5]:
try:
  import datasets, evaluate, accelerate
except:
  !pip install -U datasets, evaluate, accelerate
  # !pip install -U datasets, evaluate, accelerate
  import datasets, evaluate, accelerate

from datasets import Dataset

import random
import pandas as pd

import transformers

# from google.colab import drive
# drive.mount('/content/drive')

In [11]:
# Load the dataset
dataset = datasets.load_dataset("mrdbourke/learn_hf_food_not_food_image_captions")

In [12]:
import random

random_idx = random.randint(0, len(dataset["train"]))
random_sample = dataset["train"][random_idx]

print(f"[INFO] Random sample from dataset:\n{random_sample}")

[INFO] Random sample from dataset:
{'text': 'Barbecue grill waiting on a patio', 'label': 'not_food'}


In [13]:
# Turn labels into 0 or 1 (e.g. 0 for "not_food", 1 for "food"), see: https://huggingface.co/docs/datasets/en/process#map
def map_labels_to_number(example):
  example["label"] = 0 if example["label"] == "not_food" else 1
  return example

dataset = dataset["train"].map(map_labels_to_number)
dataset[:5]

{'text': ['Creamy cauliflower curry with garlic naan, featuring tender cauliflower in a rich sauce with cream and spices, served with garlic naan bread.',
  'Set of books stacked on a desk',
  'Watching TV together, a family has their dog stretched out on the floor',
  'Wooden dresser with a mirror reflecting the room',
  'Lawn mower stored in a shed'],
 'label': [1, 0, 0, 0, 0]}

In [15]:
dataset.shuffle()[:5]

{'text': ['Silverware organizer keeping cutlery tidy in a kitchen drawer',
  'Gluten-free sushi roll using tamari sauce instead of soy sauce.',
  'A bowl of sliced oranges with a sprinkle of cinnamon and a side of cloves',
  'A slice of veggie pizza loaded with colorful and nutritious vegetables',
  'Two people sitting at a dining room table with a newspaper on it'],
 'label': [0, 1, 1, 1, 0]}

In [16]:
# Create train/test splits, see: https://huggingface.co/docs/datasets/en/process#split
dataset = dataset.train_test_split(test_size=0.2)

In [17]:
dataset

DatasetDict({
    train: Dataset({
        features: ['text', 'label'],
        num_rows: 200
    })
    test: Dataset({
        features: ['text', 'label'],
        num_rows: 50
    })
})

In [21]:
random_idx_train = random.randint(0, len(dataset["train"]))
random_sample_train = dataset["train"][random_idx_train]

random_idx_test = random.randint(0, len(dataset["test"]))
random_sample_test = dataset["test"][random_idx_test]

print(f"[INFO] Random sample from training dataset:\n{random_sample_train}")
print(f"[INFO] Random sample from testing dataset:\n{random_sample_test}")

[INFO] Random sample from training dataset:
{'text': "Wooden cutting board with a chef's knife ready for use", 'label': 0}
[INFO] Random sample from testing dataset:
{'text': 'Fennel in a bowl, sprinkled with lemon zest and served with a side of olive oil for a light, refreshing dish.', 'label': 1}


## Preprocess

See docs: https://huggingface.co/docs/transformers/en/tasks/sequence_classification#preprocess

In [22]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("distilbert/distilbert-base-uncased")

def preprocess_function(examples):
  return tokenizer(examples["text"], truncation=True)



tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/483 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

In [23]:
tokenized_dataset = dataset.map(preprocess_function, batched=True)
tokenized_dataset

Map:   0%|          | 0/200 [00:00<?, ? examples/s]

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 200
    })
    test: Dataset({
        features: ['text', 'label', 'input_ids', 'attention_mask'],
        num_rows: 50
    })
})

In [24]:
tokenized_dataset["train"][0], tokenized_dataset["test"][0]

({'text': 'Set of books stacked on a desk',
  'label': 0,
  'input_ids': [101, 2275, 1997, 2808, 16934, 2006, 1037, 4624, 102],
  'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1]},
 {'text': 'Set of tea towels folded in a kitchen',
  'label': 0,
  'input_ids': [101, 2275, 1997, 5572, 24213, 6999, 1999, 1037, 3829, 102],
  'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1]})

In [25]:
# Collate examples and pad them each batch
from transformers import DataCollatorWithPadding
data_collator = DataCollatorWithPadding(tokenizer=tokenizer)
data_collator

DataCollatorWithPadding(tokenizer=DistilBertTokenizerFast(name_or_path='distilbert/distilbert-base-uncased', vocab_size=30522, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("[PAD]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	100: AddedToken("[UNK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	101: AddedToken("[CLS]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	102: AddedToken("[SEP]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	103: AddedToken("[MASK]", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
}, padding=True, max_length=None, pad_to_multiple_of=None, return_tensor

## Evaluation

See: https://huggingface.co/docs/transformers/en/tasks/sequence_classification#evaluate

In [26]:
import evaluate
import numpy as np

accuracy = evaluate.load("accuracy")

def compute_metrics(eval_pred):
  predictions, labels = eval_pred
  predictions = np.argmax(predictions, axis=1)
  return accuracy.compute(predictions=predictions, references=labels)

Downloading builder script:   0%|          | 0.00/4.20k [00:00<?, ?B/s]

## Train

See: https://huggingface.co/docs/transformers/en/tasks/sequence_classification#train

3 steps for training:

1. Define model
2. Define training arguments
3. Pass training arguments to Trainer
4. Call `train()`

In [27]:
# Create mapping from id2label and label2id
id2label = {0: "not_food", 1: "food"}
label2id = {"not_food": 0, "food": 1}

In [28]:
from transformers import AutoModelForSequenceClassification, Trainer, TrainingArguments

model = AutoModelForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path="distilbert/distilbert-base-uncased",
    num_labels=2,
    id2label=id2label,
    label2id=label2id
)

model.safetensors:   0%|          | 0.00/268M [00:00<?, ?B/s]

Some weights of DistilBertForSequenceClassification were not initialized from the model checkpoint at distilbert/distilbert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight', 'pre_classifier.bias', 'pre_classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [38]:
# Create training arguments
# See: https://huggingface.co/docs/transformers/v4.40.2/en/main_classes/trainer#transformers.TrainingArguments
# TODO: Turn off Weights & Biases logging? Or add it in?
training_args = TrainingArguments(
    output_dir="food_not_food_text_model", # TODO: change this path to model save path, e.g. 'learn_hf_food_not_food_text_classifier_model' 
    learning_rate=2e-5,
    per_device_train_batch_size=32,
    per_device_eval_batch_size=32,
    num_train_epochs=10,
    weight_decay=0.01,
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # push_to_hub=True
)

In [39]:
# Setup Trainer
# Note: Trainer applies dynamic padding by default when you pass `tokenizer` to it.
# In this case, you don't need to specify a data collator explicitly.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    tokenizer=tokenizer,
    #data_collator=data_collator,
    compute_metrics=compute_metrics
)

In [40]:
# Batch size 16
#  [ 391/15234 00:22 < 14:27, 17.12 it/s, Epoch 0.05/2]

# Batch size 32
# [ 724/7618 01:08 < 10:51, 10.58 it/s, Epoch 0.19/2]

# Batch size 64
#  [ 150/3810 00:31 < 12:52, 4.74 it/s, Epoch 0.08/2]

In [41]:
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy
1,No log,0.076706,0.98
2,No log,0.110371,0.98
3,No log,0.116438,0.98
4,No log,0.11824,0.98
5,No log,0.123329,0.98
6,No log,0.127266,0.98
7,No log,0.128845,0.98
8,No log,0.12917,0.98
9,No log,0.12997,0.98
10,No log,0.130197,0.98


TrainOutput(global_step=70, training_loss=0.0031478273017065865, metrics={'train_runtime': 7.0065, 'train_samples_per_second': 285.449, 'train_steps_per_second': 9.991, 'total_flos': 17222831628384.0, 'train_loss': 0.0031478273017065865, 'epoch': 10.0})

In [42]:
# Optional: push the model to Hugging Face Hub for re-use later
# Note: Requires Hugging Face login
# trainer.push_to_hub()

In [43]:
# Save model
# See: https://discuss.huggingface.co/t/how-to-save-my-model-to-use-it-later/20568/4
# TODO: Make a models/ dir to save models to (so we don't have to commit them to git)
trainer.save_model("learn_hf_food_not_food_text_classifier_model")

In [55]:
# TK - Push model to hub (for later re-use)
# TODO: Push this model to the hub to be able to use it later

## Inference

Making predictions on our own text options.

See: https://huggingface.co/docs/transformers/en/tasks/sequence_classification#inference

In [44]:
sample_text = "A delicious photo of a plate of scrambled eggs, bacon and toast"

### Pipeline mode

In [46]:
from transformers import pipeline

food_not_food_classifier = pipeline(task="text-classification", model="./learn_hf_food_not_food_text_classifier_model")
food_not_food_classifier(sample_text)

[{'label': 'food', 'score': 0.9857270121574402}]

In [47]:
sample_text_not_food = "A yellow tractor driving over the hill"
food_not_food_classifier(sample_text_not_food)

[{'label': 'not_food', 'score': 0.9952113032341003}]

In [48]:
# Predicting works with lists
# Can find the examples with highest confidence and keep those
sentences = [
    "I whipped up a fresh batch of code, but it seems to have a syntax error.",
    "We need to marinate these ideas overnight before presenting them to the client.",
    "The new software is definitely a spicy upgrade, taking some time to get used to.",
    "Her social media post was the perfect recipe for a viral sensation.",
    "He served up a rebuttal full of facts, leaving his opponent speechless.",
    "The team needs to simmer down a bit before tackling the next challenge.",
    "Our budget is a bit thin, so we'll have to use budget-friendly materials for this project.",
    "The presentation was a delicious blend of humor and information, keeping the audience engaged.",
    "I'm feeling overwhelmed by this workload – it's a real information buffet.",
    "We're brainstorming new content ideas, hoping to cook up something innovative."
]

food_not_food_classifier(sentences)

[{'label': 'food', 'score': 0.5004892349243164},
 {'label': 'not_food', 'score': 0.8031824827194214},
 {'label': 'food', 'score': 0.5688744187355042},
 {'label': 'food', 'score': 0.517037570476532},
 {'label': 'not_food', 'score': 0.6362245678901672},
 {'label': 'not_food', 'score': 0.7544245719909668},
 {'label': 'not_food', 'score': 0.7407550811767578},
 {'label': 'not_food', 'score': 0.5384439826011658},
 {'label': 'not_food', 'score': 0.8630059957504272},
 {'label': 'not_food', 'score': 0.9562842845916748}]

### PyTorch mode

In [50]:
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("learn_hf_food_not_food_text_classifier_model")
inputs = tokenizer(sample_text, return_tensors="pt")

In [53]:
import torch
from transformers import AutoModelForSequenceClassification

model = AutoModelForSequenceClassification.from_pretrained("learn_hf_food_not_food_text_classifier_model")
with torch.no_grad():
  logits = model(**inputs).logits

In [54]:
# Get predicted class
predicted_class_id = logits.argmax().item()
print(f"Text: {sample_text}")
print(f"Predicted label: {model.config.id2label[predicted_class_id]}")

Text: A delicious photo of a plate of scrambled eggs, bacon and toast
Predicted label: food


In [56]:
# TODO: Make a demo of the model with Gradio and test it in the wild