In [1]:
# Mount Google Drive under /content/drive; later cells read the dataset archive
# from it and write checkpoints/model outputs back to it.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
# Create the local folder the dataset will be extracted into
# (-p: also succeeds when the folder already exists, so the cell can be re-run)
!mkdir -p /content/data

In [3]:
# Unzip dataset from Google Drive into Colab local storage.
# -q: quiet; -n: never overwrite files that already exist, so re-running this
# cell in a live runtime skips extracted files instead of stopping at unzip's
# interactive "replace? [y]es, [n]o ..." prompt.
!unzip -q -n "/content/drive/MyDrive/ecoscan/data/garbage_classification.zip" -d /content/data/

In [None]:
# %pip (not !pip) installs into the environment of the running kernel.
# transformers is pinned to the version this notebook was executed with;
# the training cells rely on its argument names (eval_strategy, processing_class).
%pip install -q "transformers==4.55.2" accelerate datasets evaluate scikit-learn pillow

[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/84.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
# Smoke test: confirm the freshly installed packages import, then report
# whether a CUDA device is visible and which transformers version is active.
import torch, transformers, sklearn, PIL
print("CUDA:", torch.cuda.is_available())
print("Transformers:", transformers.__version__)

CUDA: True
Transformers: 4.55.2


**Goal**: Set up imports, reproducibility, device (CPU/GPU), and project paths for training and model outputs.

In [None]:
# Imports & basic setup
import os, json, random
import pandas as pd
from pathlib import Path
import numpy as np
from PIL import Image
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
import torch
from torch.utils.data import Dataset, DataLoader
from transformers import (
    AutoImageProcessor,
    TrainingArguments,
    Trainer,
    ViTForImageClassification,
)
from transformers.trainer_callback import EarlyStoppingCallback
from transformers.trainer_utils import IntervalStrategy, SaveStrategy

# --- Reproducibility: one shared seed for the Python, NumPy and PyTorch RNGs ---
SEED = 42
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

# (Optional) make CUDA runs more deterministic
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

# --- Device: use the GPU when one is visible, otherwise fall back to CPU ---
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Device:", device)
if device.type == "cuda":
    print("GPU:", torch.cuda.get_device_name(0))

# --- Paths ---
# Outputs (checkpoints, final model, metadata) go to Google Drive
ROOT_DRIVE = Path("/content/drive/MyDrive/ecoscan")
OUT_DIR = ROOT_DRIVE.joinpath("models", "vit_ecoscan_v1")
OUT_DIR.mkdir(parents=True, exist_ok=True)

# Dataset lives in Colab local storage (fast)
DATA_DIR = Path("/content/data/garbage_classification")

print("DATA_DIR:", DATA_DIR.resolve())
print("OUT_DIR :", OUT_DIR.resolve())

# Safety check: stop here, before any training code runs, if the dataset is missing
assert DATA_DIR.exists(), f"Dataset folder not found at {DATA_DIR}. Check your path."



Device: cuda
GPU: NVIDIA A100-SXM4-40GB
DATA_DIR: /content/data/garbage_classification
OUT_DIR : /content/drive/MyDrive/ecoscan/models/vit_ecoscan_v1


**Goal**: Scan dataset folders (one class per folder), count images per class, and create a stratified 80/20 train/validation split

In [None]:
# Scan dataset & create stratified train/val split
import collections

IMG_EXTS = {".jpg", ".jpeg", ".png", ".bmp", ".webp"}
IGNORE_DIRS = {'.ipynb_checkpoints', '__MACOSX'}

# One class per sub-folder; sorted so class ids are stable from run to run
classes = sorted(d.name for d in DATA_DIR.iterdir() if d.is_dir() and d.name not in IGNORE_DIRS)
print("Classes:", classes, "\nNum classes:", len(classes))

# Mappings for later use in Trainer
label2id = {name: i for i, name in enumerate(classes)}
id2label = {i: name for i, name in enumerate(classes)}

# Collect all (image_path, class_id) pairs.
# Directory traversal order depends on the filesystem, so the paths are sorted:
# otherwise the same SEED could produce a different train/val split after the
# archive is extracted again.
samples = []
for cid, cname in enumerate(classes):
    class_dir = DATA_DIR / cname
    for p in sorted(class_dir.rglob("*")):
        if p.is_file() and p.suffix.lower() in IMG_EXTS:
            samples.append((p, cid))  # keep as Path for now

print("Total images found:", len(samples))
assert len(samples) > 0, "No images found. Check your dataset folder structure."

# Every class needs at least 2 images for a stratified split. Iterate over
# `classes` (not over the counter) so that class folders containing zero
# images are reported too instead of slipping through unnoticed.
raw_counts = collections.Counter(cid for _, cid in samples)
too_small = {name: raw_counts[i] for i, name in enumerate(classes) if raw_counts[i] < 2}
assert not too_small, f"Some classes have <2 images (needed for stratify): {too_small}"

# Build parallel lists for the split
paths  = [path for path, _ in samples]
labels = [cid for _, cid in samples]

# Stratified 80/20 split to keep the class balance in both subsets
train_paths, val_paths, y_train, y_val = train_test_split(
    paths, labels, test_size=0.2, random_state=SEED, stratify=labels
)

print(f"Train: {len(train_paths)} images | Val: {len(val_paths)} images")

# Per-class counts (sanity check)
def count_by_class(lbls, class_names=None):
    """Count how many labels fall into each class, keyed by class name.

    Args:
        lbls: iterable of integer class ids.
        class_names: sequence mapping class id -> class name. When omitted,
            the notebook-level ``classes`` list is used.

    Returns:
        dict ``{class name: count}`` ordered by ascending class id. Classes
        that do not occur in ``lbls`` are omitted.
    """
    names = classes if class_names is None else class_names
    tally = collections.Counter(lbls)
    return {names[cid]: tally[cid] for cid in sorted(tally)}

# The split above is stratified, so both subsets should show the same class proportions
print("Train per class:", count_by_class(y_train))
print("Val  per class :", count_by_class(y_val))


Classes: ['battery', 'brown-glass', 'cardboard', 'clothes', 'electronics', 'green-glass', 'metal_packaging', 'oil', 'organic', 'paper', 'plastic', 'shoes', 'tetrapak', 'trash', 'white-glass'] 
Num classes: 15
Total images found: 16022
Train: 12817 images | Val: 3205 images
Train per class: {'battery': 756, 'brown-glass': 486, 'cardboard': 713, 'clothes': 4260, 'electronics': 156, 'green-glass': 503, 'metal_packaging': 615, 'oil': 146, 'organic': 788, 'paper': 840, 'plastic': 692, 'shoes': 1581, 'tetrapak': 103, 'trash': 558, 'white-glass': 620}
Val  per class : {'battery': 189, 'brown-glass': 121, 'cardboard': 178, 'clothes': 1065, 'electronics': 39, 'green-glass': 126, 'metal_packaging': 154, 'oil': 37, 'organic': 197, 'paper': 210, 'plastic': 173, 'shoes': 396, 'tetrapak': 26, 'trash': 139, 'white-glass': 155}


In [None]:
# ViT processor, label maps (reused), and Dataset

MODEL_NAME = "google/vit-base-patch16-224"

# Processor: resize/normalize to ViT format (224x224, mean/std)
# use_fast=True avoids the warning and speeds up preprocessing
processor = AutoImageProcessor.from_pretrained(MODEL_NAME, use_fast=True)

# classes, label2id and id2label already exist: they were built by the
# dataset-scan cell and are reused here unchanged.

# Save label maps (useful for inference/app)
meta_dir = OUT_DIR / "meta"
meta_dir.mkdir(parents=True, exist_ok=True)
for fname, mapping in (("id2label.json", id2label), ("label2id.json", label2id)):
    with open(meta_dir / fname, "w") as f:
        json.dump(mapping, f, indent=2)

# Custom Dataset: reads a path, opens image, applies processor, returns tensors
class EcoScanDataset(torch.utils.data.Dataset):
    """Map-style dataset that loads an image from disk and runs it through the processor.

    Args:
        paths: sequence of image paths (``Path`` or ``str``).
        labels: sequence of integer class ids, parallel to ``paths``.
        processor: callable invoked as ``processor(images=img, return_tensors="pt")``.
        train: when True, apply light random augmentation before the processor.

    Each item is a dict holding the processor outputs with the batch dimension
    removed, plus a ``"labels"`` entry (scalar long tensor).
    """

    def __init__(self, paths, labels, processor, train=False):
        self.paths = paths          # list[Path|str]
        self.labels = labels        # list[int]
        self.processor = processor
        self.train = train

    def __len__(self):
        return len(self.paths)

    def __getitem__(self, idx):
        # The context manager closes the image file as soon as the pixels have
        # been converted, instead of leaving that to garbage collection.
        with Image.open(self.paths[idx]) as src:
            img = src.convert("RGB")

        # Light augmentation for train only
        if self.train:
            # Horizontal flip with probability 0.5
            if np.random.rand() < 0.5:
                img = img.transpose(Image.FLIP_LEFT_RIGHT)
            # With probability 0.3: random square crop whose side is 90% of the
            # SHORTER image side, stretched back to the original (w, h).
            # For non-square images this also changes the aspect ratio.
            if np.random.rand() < 0.3:
                w, h = img.size
                crop = max(1, int(min(w, h) * 0.9))  # never a zero-sized crop on tiny images
                left = np.random.randint(0, w - crop + 1)
                top  = np.random.randint(0, h - crop + 1)
                img = img.crop((left, top, left + crop, top + crop)).resize((w, h))

        # Processor -> tensors with a leading batch dimension of 1; drop that dimension
        enc = self.processor(images=img, return_tensors="pt")
        item = {k: v.squeeze(0) for k, v in enc.items()}
        item["labels"] = torch.tensor(self.labels[idx], dtype=torch.long)
        return item

# Build train/val datasets
# train=True switches on the random flip/crop augmentation; validation images are not augmented
train_ds = EcoScanDataset(train_paths, y_train, processor, train=True)
val_ds   = EcoScanDataset(val_paths,   y_val,   processor, train=False)

# Sanity check: fetch one item and inspect its keys, tensor shape and label id
sample = train_ds[0]
print("Sample keys:", sample.keys())
print("pixel_values shape:", sample["pixel_values"].shape, "| label:", sample["labels"].item())



The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


preprocessor_config.json:   0%|          | 0.00/160 [00:00<?, ?B/s]

config.json: 0.00B [00:00, ?B/s]

Sample keys: dict_keys(['pixel_values', 'labels'])
pixel_values shape: torch.Size([3, 224, 224]) | label: 14


This cell prepares the dataset so the Vision Transformer can read the images. The processor resizes each image to 224×224, normalizes colors, and turns it into numbers (tensors). The dictionaries id2label and label2id just translate between class names (like “plastic”) and numbers (like 10). The custom Dataset class loads each image, applies the processor, and adds the correct label. We then create two datasets: one for training and one for validation. Finally, checking a sample shows that each item has pixel_values (the image as numbers) and labels (the class ID). The shape [3, 224, 224] simply means 3 color channels (RGB) and an image size of 224×224.

**Goal**: Prepare DataLoaders so the model can efficiently iterate over training and validation samples in mini-batches

In [None]:
# Quick sanity check: batch shapes from the training dataset
# Uses its own constant so that re-running this cell can never overwrite the
# training BATCH_SIZE defined in the TrainingArguments cell.
SANITY_BATCH_SIZE = 16

tmp_loader = DataLoader(train_ds, batch_size=SANITY_BATCH_SIZE, shuffle=True)
batch = next(iter(tmp_loader))
print("Batch pixel_values shape:", batch["pixel_values"].shape)  # [B, 3, 224, 224]
print("Batch labels shape     :", batch["labels"].shape)         # [B]
del tmp_loader  # not needed by HuggingFace Trainer


Batch pixel_values shape: torch.Size([16, 3, 224, 224])
Batch labels shape     : torch.Size([16])


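This cell builds a temporary DataLoader (`tmp_loader`) only to check the batch shapes; it is deleted afterwards because the Hugging Face Trainer creates its own loaders. Grouping images into mini-batches makes training faster and easier to handle, even on CPU. The loader also shuffles the images so the model does not memorize the order. The final print shows that each batch has shape [B, 3, 224, 224] for the images (B = batch size, 16) and [B] for the labels.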

**Goal**: Load a pre-trained ViT model and adapt it for our number of classes.

In [None]:

# Load pre-trained ViT and replace the classification head with ours
model = ViTForImageClassification.from_pretrained(
    MODEL_NAME,
    num_labels=len(classes),     # our number of classes
    id2label=id2label,           # id -> class
    label2id=label2id,           # class -> id
    ignore_mismatched_sizes=True  # checkpoint head has a different output size; let it be re-initialised
)

# Print model config to check it worked (label maps and num_labels should match ours)
print(model.config)


model.safetensors:   0%|          | 0.00/346M [00:00<?, ?B/s]

Some weights of ViTForImageClassification were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized because the shapes did not match:
- classifier.bias: found shape torch.Size([1000]) in the checkpoint and torch.Size([15]) in the model instantiated
- classifier.weight: found shape torch.Size([1000, 768]) in the checkpoint and torch.Size([15, 768]) in the model instantiated
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


ViTConfig {
  "architectures": [
    "ViTForImageClassification"
  ],
  "attention_probs_dropout_prob": 0.0,
  "encoder_stride": 16,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "id2label": {
    "0": "battery",
    "1": "brown-glass",
    "2": "cardboard",
    "3": "clothes",
    "4": "electronics",
    "5": "green-glass",
    "6": "metal_packaging",
    "7": "oil",
    "8": "organic",
    "9": "paper",
    "10": "plastic",
    "11": "shoes",
    "12": "tetrapak",
    "13": "trash",
    "14": "white-glass"
  },
  "image_size": 224,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "label2id": {
    "battery": 0,
    "brown-glass": 1,
    "cardboard": 2,
    "clothes": 3,
    "electronics": 4,
    "green-glass": 5,
    "metal_packaging": 6,
    "oil": 7,
    "organic": 8,
    "paper": 9,
    "plastic": 10,
    "shoes": 11,
    "tetrapak": 12,
    "trash": 13,
    "white-glass": 14
  },
  "layer_norm_eps": 1e-12,
  "model_type": "vit",
  "num_a

This cell loads the Vision Transformer (ViT) model pre-trained on ImageNet, but adapts it to our custom dataset. By default, the original model has a classification head with 1000 output neurons (for 1000 ImageNet classes). Since our dataset only has 15 classes, we use the parameter ignore_mismatched_sizes=True to tell Hugging Face to discard the old head and create a new one with the correct number of classes. This way, we can fine-tune the pre-trained features of ViT on our task without dimension mismatches.

**Goal**: Define training arguments for fine-tuning our ViT model

In [None]:
# Training Arguments (A100-optimized; save to Drive OUT_DIR defined above)

BATCH_SIZE = 64  # try 64 on A100 (fall back to 32 if you get OOM)

# Mixed precision is chosen from the hardware actually present instead of being
# hard-coded: bf16 where the GPU supports it (e.g. A100), fp16 on other CUDA
# GPUs, full precision on CPU. A fixed bf16=True makes TrainingArguments raise
# on runtimes without bf16 support.
USE_BF16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()
USE_FP16 = torch.cuda.is_available() and not USE_BF16

training_args = TrainingArguments(
    output_dir=str(OUT_DIR),                 # checkpoints in Drive
    logging_dir=str(OUT_DIR / "logs"),       # logs in Drive

    # Eval / saving once per epoch (both strategies must match for load_best_model_at_end)
    eval_strategy="epoch",
    save_strategy="epoch",
    save_total_limit=2,                      # keep last 2 checkpoints
    load_best_model_at_end=True,
    metric_for_best_model="accuracy",
    greater_is_better=True,

    # Core hyperparameters
    num_train_epochs=3,
    per_device_train_batch_size=BATCH_SIZE,
    per_device_eval_batch_size=BATCH_SIZE,
    learning_rate=5e-5,
    weight_decay=0.01,
    warmup_ratio=0.05,                       # small LR warmup helps stability

    # Performance tweaks
    bf16=USE_BF16,                           # bfloat16 when supported (A100)
    fp16=USE_FP16,                           # fp16 fallback on GPUs without bf16
    dataloader_pin_memory=True,
    dataloader_num_workers=4,                # speed up data loading
    logging_steps=50,                        # fewer log writes → less overhead
    report_to="none",
    seed=SEED,
    save_safetensors=True,                   # smaller/safer checkpoint files
)

print("TrainingArguments ready →", training_args.eval_strategy, training_args.save_strategy)
print("Checkpoints will be saved to:", OUT_DIR)



TrainingArguments ready → IntervalStrategy.EPOCH SaveStrategy.EPOCH
Checkpoints will be saved to: /content/drive/MyDrive/ecoscan/models/vit_ecoscan_v1


This cell defines how our model training will run. We tell Hugging Face where to save the model and logs, how often to evaluate and save checkpoints (once per epoch), and which metric to use to keep the best version (accuracy). We also set basic hyperparameters like number of epochs, batch size, learning rate, and how often to show progress. At the end, the print confirms that evaluation and saving are synchronized (both once per epoch), which is required for restoring the best checkpoint at the end of training and means the training is correctly configured.


**Goal**: create the Trainer object that connects the model, data, and training configuration.

In [None]:
# Metrics & Trainer

# Accuracy metric
def compute_metrics(eval_pred):
    """Compute top-1 accuracy for the Trainer's evaluation loop.

    Args:
        eval_pred: either an object exposing ``predictions`` and ``label_ids``
            attributes (such as ``transformers.EvalPrediction``) or a plain
            ``(logits, labels)`` pair.

    Returns:
        ``{"accuracy": float}`` - the fraction of samples whose arg-max
        prediction equals the label.
    """
    # Duck-typed instead of importing EvalPrediction inside a broad try/except:
    # no import on every call and no silently swallowed errors.
    if hasattr(eval_pred, "predictions") and hasattr(eval_pred, "label_ids"):
        logits, labels = eval_pred.predictions, eval_pred.label_ids
    else:
        logits, labels = eval_pred

    # Predictions may arrive as a tuple/list of outputs; the logits come first
    if isinstance(logits, (tuple, list)):
        logits = logits[0]

    preds = np.argmax(logits, axis=-1)
    acc = (preds == np.asarray(labels)).mean()
    return {"accuracy": float(acc)}

# Create the Trainer: wires together the model, the training arguments,
# both datasets, the image processor and the accuracy metric.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    processing_class=processor,
    compute_metrics=compute_metrics,
    # NOTE(review): training_args runs 3 epochs with one evaluation per epoch, so a
    # patience of 2 can presumably only trigger at the final epoch - confirm this is intended.
    callbacks=[EarlyStoppingCallback(early_stopping_patience=2)]
)

print("Trainer created successfully")


Trainer created successfully


  trainer = Trainer(


In [None]:
# ---- SAFE RESUME TRAINING BLOCK ----
import time
from transformers.trainer_utils import get_last_checkpoint

# cudnn.benchmark is intentionally NOT switched on here: the setup cell
# disables it (together with cudnn.deterministic = True) for reproducibility,
# and re-enabling it would silently undo that choice.

# Make sure the output directory exists. A separate name is used on purpose:
# OUT_DIR must stay a Path, because earlier cells build sub-paths with OUT_DIR / "...".
run_dir = Path(training_args.output_dir)
run_dir.mkdir(parents=True, exist_ok=True)

# Detect the most recent checkpoint, if any.
# NOTE(review): if run_dir already holds checkpoints from a finished run, training
# resumes from them and may perform no further steps - use a new output_dir
# for a fresh experiment.
last_ckpt = get_last_checkpoint(str(run_dir))
if last_ckpt is not None:
    print(" Found checkpoint → resuming from:", last_ckpt)
else:
    print("No checkpoint found → training from scratch.")
resume_arg = last_ckpt  # None when there is no checkpoint

print("🚀 Starting training...")
t0 = time.perf_counter()  # monotonic clock for measuring elapsed time
train_result = trainer.train(resume_from_checkpoint=resume_arg)
t_secs = time.perf_counter() - t0
print(f" Training finished in {t_secs/60:.1f} min ({t_secs/3600:.2f} h)")

# ---- SAVE MODEL + PROCESSOR + TRAINER STATE ----
trainer.save_model(str(run_dir))          # model weights + config
processor.save_pretrained(str(run_dir))   # preprocessor config, needed at inference time
trainer.save_state()                      # Trainer state, written into output_dir

# ---- EVALUATION ----
eval_metrics = trainer.evaluate()
print("📊 Validation metrics:", eval_metrics)

# ---- SUMMARY JSON ----
summary = {
    "runtime_sec": float(t_secs),
    "runtime_min": float(t_secs / 60),
    "runtime_hours": float(t_secs / 3600),
    "epochs": float(training_args.num_train_epochs),
    "batch_size": int(training_args.per_device_train_batch_size),
    "train_images": len(train_ds),
    "val_images": len(val_ds),
    **{k: float(v) for k, v in eval_metrics.items()},
}
with open(run_dir / "run_summary.json", "w") as f:
    json.dump(summary, f, indent=2)

print(" Saved to:", run_dir)



No checkpoint found → training from scratch.
🚀 Starting training...


Epoch,Training Loss,Validation Loss,Accuracy
1,0.1064,0.061728,0.983151
2,0.0377,0.04093,0.990016
3,0.0153,0.037245,0.988768


 Training finished in 1.8 min (0.03 h)


📊 Validation metrics: {'eval_loss': 0.04093022644519806, 'eval_accuracy': 0.9900156006240249, 'eval_runtime': 6.0218, 'eval_samples_per_second': 532.235, 'eval_steps_per_second': 8.469, 'epoch': 3.0}
 Saved to: /content/drive/MyDrive/ecoscan/models/vit_ecoscan_v1


## Training Results Summary

- The Vision Transformer (ViT) was fine-tuned successfully on the **garbage classification dataset** (15 classes, ~16k images).  
- **Training completed in ~1.8 minutes** on an NVIDIA A100 GPU.  
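- **Final performance** (the best checkpoint by accuracy, epoch 2, is the one restored at the end of training):
  - Training Loss: **0.038** at epoch 2 (**0.015** at epoch 3)
  - Validation Loss: **0.041** at epoch 2 (**0.037** at epoch 3)
  - Validation Accuracy: **99.0%** at epoch 2 (98.9% at epoch 3)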

###  Interpretation
- The model shows **very high accuracy** (≈99%) on the validation set, indicating it learned to distinguish the waste categories effectively.  
- The **training and validation loss decreased smoothly**, suggesting good convergence and **no severe overfitting**.  
- These results demonstrate that the ViT is a strong choice for this dataset.  
- Next steps should include:
  - **Per-class evaluation** (precision, recall, F1) to check whether smaller classes (e.g., *electronics*, *oil*) are as well recognized as dominant ones (e.g., *clothes*).  
  - **Confusion matrix** inspection to detect potential misclassifications between visually similar categories (e.g., *green-glass vs brown-glass*).  
  - Exporting the model and testing it in a **real-world app scenario** (uploading an image and predicting the category).  

Overall, the model is ready for deployment and further evaluation in production-like settings.
