- Fine-tuning the DistilBERT model on the `dair-ai/emotion` dataset from Hugging Face

Use Conda to install the below libraries in the environment. This environment uses Python 3.11

In [None]:
# Import necessary libraries
import torch
from datasets import load_dataset
import evaluate
from transformers import AutoTokenizer, AutoModelForSequenceClassification, TrainingArguments, Trainer
import numpy as np
import pandas as pd

In [None]:
# --- 1. Load a Hugging Face Dataset ---
print("Loading dataset...")
# Load every split of the 'dair-ai/emotion' text-classification dataset.
# Nothing is subsampled here: the small slices used for quick training are
# selected from these splits afterwards. For an even faster run, take smaller
# slices there or swap in a simpler dataset here.
dataset = load_dataset("dair-ai/emotion")
# Each example has a "text" string and an integer "label" (an emotion class)

In [None]:
# Work on small, reproducibly shuffled slices so the whole run stays quick.
# Adjust the sizes below to trade speed against model quality.
#   train      - examples the model learns patterns from
#   validation - used to tune hyperparameters (learning rate, batch size)
#                and to watch for overfitting
#   test       - held back for one final, unbiased evaluation after training
shuffle_seed = 42
subset_sizes = {"train": 1000, "validation": 200, "test": 200}

train_dataset, eval_dataset, test_dataset = (
    dataset[split_name].shuffle(seed=shuffle_seed).select(range(row_count))
    for split_name, row_count in subset_sizes.items()
)

for split_label, subset in (
    ("Train", train_dataset),
    ("Validation", eval_dataset),
    ("Test", test_dataset),
):
    print(f"{split_label} dataset size: {len(subset)}")

# Show one raw example to confirm the columns look as expected
print("\nSample from train dataset:")
print(train_dataset[0])

In [None]:
# --- 2. Load Pre-trained Tokenizer and Model ---
print("\nLoading tokenizer and model...")
model_name = "distilbert/distilbert-base-uncased"  # small, fast checkpoint

# The tokenizer that belongs to the checkpoint converts raw text into the
# numeric inputs the model consumes (input_ids and attention_mask are the
# columns used later in this notebook).
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Size the classification head from the dataset's label feature rather than
# hard-coding the number of classes.
label_count = dataset["train"].features["label"].num_classes
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=label_count)

# Pick the compute device: Apple Silicon (MPS) first, then CUDA, then CPU.
if torch.backends.mps.is_available():
    device_kind, device_description = "mps", "Apple Silicon (MPS)"
elif torch.cuda.is_available():
    device_kind, device_description = "cuda", "CUDA (GPU)"
else:
    device_kind, device_description = "cpu", "CPU"
device = torch.device(device_kind)
print(f"Using {device_description} for training: {device}")

model.to(device)

In [None]:
# --- 3. Preprocess the Dataset ---
print("\nTokenizing and preprocessing dataset...")


def tokenize_function(examples):
    """Tokenize the "text" field of a batch of examples.

    Args:
        examples: A batch as passed by ``Dataset.map(..., batched=True)``;
            it must contain a "text" entry holding the raw strings.

    Returns:
        The tokenizer output for the batch (input_ids, attention_mask),
        truncated to the tokenizer's maximum length but not padded.
    """
    # No padding here on purpose. padding="max_length" would stretch every
    # short text to the tokenizer's maximum length and make each training
    # step pay for mostly-padding tokens. The Trainer is given the tokenizer,
    # so its default collator pads each batch only up to that batch's
    # longest sequence.
    return tokenizer(examples["text"], truncation=True)

# Tokenize each split in batches (faster than one example at a time), then
# rename 'label' to 'labels', the target column name the Trainer expects.
tokenized_train_dataset = train_dataset.map(tokenize_function, batched=True).rename_column("label", "labels")
tokenized_eval_dataset = eval_dataset.map(tokenize_function, batched=True).rename_column("label", "labels")
tokenized_test_dataset = test_dataset.map(tokenize_function, batched=True).rename_column("label", "labels")

# Expose only the model inputs and targets, as PyTorch tensors.
# set_format is called for its in-place effect, so no reassignment is needed.
tensor_columns = ["input_ids", "attention_mask", "labels"]
for tokenized_split in (tokenized_train_dataset, tokenized_eval_dataset, tokenized_test_dataset):
    tokenized_split.set_format("torch", columns=list(tensor_columns))

There was a version mismatch between `transformers` and `accelerate` in this environment; reinstalling both with conda resolved it. The cell below prints the installed versions.

In [None]:
import accelerate
import transformers

# Print the installed versions of the two packages that must be compatible
for package_label, package in (("Accelerate", accelerate), ("Transformers", transformers)):
    print(f"{package_label}: {package.__version__}")

In [None]:
# --- 4. Define Metrics ---
print("\nDefining evaluation metrics...")
# Accuracy from the `evaluate` library; other standard metrics such as "f1",
# "precision" or "recall" can be loaded the same way.
metric = evaluate.load("accuracy")


def compute_metrics(eval_pred):
    """Score one evaluation pass; the Trainer calls this during evaluation.

    Args:
        eval_pred: Object that unpacks into ``(logits, labels)``: the model's
            raw per-class scores and the true label ids.

    Returns:
        The result of ``metric.compute`` for the predicted vs. true labels.
    """
    class_scores, true_labels = eval_pred
    # The predicted class is the index of the largest score on the last axis
    predicted_classes = np.argmax(class_scores, axis=-1)
    return metric.compute(references=true_labels, predictions=predicted_classes)

# --- 5. Configure Training Arguments ---
print("\nSetting up Training Arguments...")
training_args = TrainingArguments(
    # -- Where output goes --
    output_dir="./results",            # model checkpoints and logs
    logging_dir="./logs",              # log files
    report_to="none",                  # no online tracker (e.g., wandb, mlflow)
    # -- Optimisation (kept small for a quick run) --
    num_train_epochs=3,                # passes over the training subset
    per_device_train_batch_size=16,    # training batch size per device
    per_device_eval_batch_size=16,     # evaluation batch size per device
    # NOTE(review): the recorded run has only 189 optimizer steps in total,
    # so 100 warmup steps cover more than half of training -- consider
    # lowering this if the loss plateaus.
    warmup_steps=100,                  # learning-rate warmup steps
    weight_decay=0.01,                 # weight-decay strength
    # -- Logging / evaluation / checkpoint cadence --
    logging_steps=50,                  # log every 50 update steps
    eval_strategy="epoch",             # evaluate at the end of each epoch
    save_strategy="epoch",             # checkpoint at the end of each epoch
    # -- Best-checkpoint selection --
    load_best_model_at_end=True,       # reload the best checkpoint after training
    metric_for_best_model="accuracy",  # metric used to compare checkpoints
)

In [None]:
# --- 6. Create and Train the Trainer ---
print("\nInitializing and training the Trainer...")
# Trainer is Hugging Face's core training class: it ties together the model,
# the hyperparameters, the train/validation data and the metric function.
trainer_options = {
    "model": model,
    "args": training_args,
    "train_dataset": tokenized_train_dataset,
    "eval_dataset": tokenized_eval_dataset,
    "compute_metrics": compute_metrics,
    # Handing over the tokenizer lets the Trainer pad batches with it.
    # NOTE(review): newer transformers releases rename this argument to
    # `processing_class` -- confirm against the installed version.
    "tokenizer": tokenizer,
}
trainer = Trainer(**trainer_options)

trainer.train()

## Training Results Summary

`TrainOutput(global_step=189, training_loss=1.1334309249958665, metrics={'train_runtime': 293.2748, 'train_samples_per_second': 10.229, 'train_steps_per_second': 0.644, 'total_flos': 397430544384000.0, 'train_loss': 1.1334309249958665, 'epoch': 3.0})`

### Training Metrics Overview
|Metric|Value|Assessment|
|-------|----|----------|
|Average Training Loss|1.133|Mean over all 189 steps (not the loss at the final step); moderate — could improve with an adjusted learning rate, more epochs, etc.|
|Training Steps|189|Completed successfully|
|Epochs Completed|3.0|Full Cycle complete|

### Performance Metrics
|Performance Indicator|Value|Notes|
|---|---|----|
|Total Runtime|293.27 seconds/~5 min|Reasonable for small dataset|
|Samples/Second|10.23|Training Throughput|
|Steps/Second|.644|Processing Speed|
|Total FLOPs|397.4 trillion|Computational operations|


In [None]:
# --- 7. Evaluate the Trained Model ---
print("\nEvaluating the model on the test set...")
# Prefix the metric keys with "test" so they are not mistaken for the
# validation metrics, which the Trainer reports with the default "eval" prefix.
results = trainer.evaluate(tokenized_test_dataset, metric_key_prefix="test")
print(f"Test Set Results: {results}")

# --- 8. Make Predictions (Optional) ---
print("\nMaking predictions on a sample...")
# Number of test examples to show; clamped so a smaller test split still works
num_samples = min(5, len(test_dataset))

# Slice the rows once. Indexing a column first (test_dataset["text"][i])
# inside a loop re-reads the whole column for every single element.
sample_rows = test_dataset[:num_samples]
sample_texts = sample_rows["text"]
sample_labels = sample_rows["label"]

# Predict on the same leading rows of the tokenized test set, so predictions
# line up with sample_texts / sample_labels by position.
predictions = trainer.predict(tokenized_test_dataset.select(range(num_samples)))
predicted_labels = np.argmax(predictions.predictions, axis=-1)

# Map label IDs back to their human-readable class names
label_names = dataset["train"].features["label"].names

print("\nSample Predictions:")
for text, true_id, predicted_id in zip(sample_texts, sample_labels, predicted_labels):
    true_label_name = label_names[true_id]
    predicted_label_name = label_names[predicted_id]
    print(f"Text: \"{text}\"")
    print(f"True Label: {true_label_name}, Predicted Label: {predicted_label_name}\n")

The `./results` folder contains one checkpoint per epoch (3 in total). Each checkpoint folder is named after the global training step at which it was saved; the last one is `checkpoint-189`.

Files in each checkpoint:

## Model Files
|File|Purpose|Size|
|---|---|-----|
| model.safetensors | model weights | largest file|
|config.json|model architecture and hyperparameters|Small|

## Tokenizer files
|File|Purpose|
|---|---|
| tokenizer.json | Complete fast-tokenizer definition (vocabulary plus normalization and pre-tokenization rules) |
|tokenizer_config.json|Tokenizer settings|
|special_tokens_map.json|Special tokens (CLS, SEP, PAD, etc)|
|vocab.txt|vocab file (for some tokenizers)|

## Training State Files
|File | Purpose | When You Need It| 
|---|---|---|
|training_args.bin |Your TrainingArguments settings | Resuming training| 
|trainer_state.json | Training progress, loss history | Resuming training|
|optimizer.pt | Optimizer state (Adam, etc.) | Resuming training| 
| scheduler.pt | Learning rate scheduler state| Resuming training |
| rng_state.pth | Random number generator state | Reproducible resuming |

## Important:
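- Use the below to load the fine-tuned model for inference. This needs only the model and tokenizer files; the training state files are needed only when resuming training, e.g. with `trainer.train(resume_from_checkpoint="./results/checkpoint-189")`: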

```python
# Load for inference - only needs the model + tokenizer files of a checkpoint
# (the optimizer, scheduler and RNG state files are not read here)
from transformers import AutoModelForSequenceClassification, AutoTokenizer

# checkpoint-189 is the checkpoint saved at the last training step (global_step=189)
model = AutoModelForSequenceClassification.from_pretrained("./results/checkpoint-189")
tokenizer = AutoTokenizer.from_pretrained("./results/checkpoint-189")

# Now you can make predictions: tokenize new text with `tokenizer` and pass it to `model`
```
