In [None]:
#Build a zero-shot image-classification pipeline around the pretrained CLIP checkpoint (weights requested in bfloat16)
import torch
from transformers import pipeline

clip = pipeline(
    task="zero-shot-image-classification",
    model="openai/clip-vit-base-patch16",
    dtype=torch.bfloat16,
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json: 0.00B [00:00, ?B/s]

pytorch_model.bin:   0%|          | 0.00/599M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/905 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/599M [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.
Device set to use cuda:0


In [None]:
#Pick the GPU when torch can see one; otherwise fall back to the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

In [None]:
#Count every parameter of the CLIP model wrapped by the pipeline
model = clip.model
total_params = sum(tensor.numel() for tensor in model.parameters())
print(f"Total parameters: {total_params:,}")

Total parameters: 149,620,737


In [None]:
#Keep the CLIP backbone fixed and build the small trainable head that sits on top of its embeddings
import torch
import torch.nn as nn

# Turn off gradient tracking for every CLIP parameter so only the head below is trained
model.requires_grad_(False)

num_classes = 10  # SVHN
in_dim = model.config.projection_dim

# Two linear layers with a ReLU and light dropout in between
classifier = nn.Sequential(
    nn.Linear(in_features=in_dim, out_features=256),
    nn.ReLU(inplace=True),
    nn.Dropout(p=0.1),
    nn.Linear(in_features=256, out_features=num_classes),
)
classifier = classifier.to(device)

In [None]:
#Load the datasets for SVHN for training
!pip install torch torchvision scipy
import torch
import torchvision
from torchvision import datasets
from torch.utils.data import DataLoader


train_ds = datasets.SVHN(root=".", split="train", download=True, transform=None)
test_ds  = datasets.SVHN(root=".", split="test",  download=True, transform=None)

def collate_fn(batch):
    """Collate (image, label) samples without stacking the images.

    Args:
        batch: Sequence of (image, label) pairs produced by the dataset.

    Returns:
        A tuple (images, labels): images is a plain list in batch order,
        labels is a 1-D int64 tensor in the same order.
    """
    pictures, targets = zip(*batch)
    target_tensor = torch.tensor(targets, dtype=torch.long)
    return list(pictures), target_tensor

# Training batches are reshuffled every epoch; test batches keep dataset order
train_loader = DataLoader(
    train_ds,
    batch_size=128,
    shuffle=True,
    num_workers=2,
    collate_fn=collate_fn,
)
test_loader = DataLoader(
    test_ds,
    batch_size=256,
    shuffle=False,
    num_workers=2,
    collate_fn=collate_fn,
)



In [None]:
#Load the processor for the CLIP checkpoint; extract_features uses it to turn images into pixel tensors
from transformers import AutoProcessor

# Same checkpoint name that is passed to pipeline() when the model is loaded
model_name = "openai/clip-vit-base-patch16"
processor = AutoProcessor.from_pretrained(model_name)
print("Processor loaded successfully!")

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Processor loaded successfully!


In [None]:
from torch.optim import AdamW
from torch.nn.functional import normalize
import torch.nn as nn

#Classification loss, plus an optimizer that updates the classifier head only
criterion = nn.CrossEntropyLoss()
opt = AdamW(
    classifier.parameters(),
    lr=1e-3,
    weight_decay=1e-4,
)

# Extract features (embeddings) from the image
def extract_features(images):
    """Embed a batch of images with the CLIP image encoder, without tracking gradients.

    Args:
        images: Batch of images in a form the CLIP processor accepts
            (the data loaders in this notebook pass a list of dataset images).

    Returns:
        A float32 tensor with one L2-normalized embedding per image.
    """
    with torch.no_grad():
        # Use the processor to preprocess the images
        inputs = processor(images=images, return_tensors="pt").to(device)
        # Ensure pixel_values have the correct dtype for the model
        pixel_values = inputs.pixel_values.to(dtype=model.dtype)
        feats = model.get_image_features(pixel_values=pixel_values)
        # The backbone may run in reduced precision (bfloat16 is requested when it
        # is loaded) while the classifier head keeps float32 weights. Cast before
        # normalizing so the head always receives float32; no-op for a float32 model.
        feats = normalize(feats.float(), dim=-1)
    return feats

#Run one optimisation pass over the training set: frozen CLIP features in, classifier logits out
def train_epoch():
    """Train the classifier head for one pass over train_loader.

    Returns:
        A tuple (mean per-sample loss, accuracy) over all samples seen in the pass.
    """
    classifier.train()
    seen = 0
    hits = 0
    loss_sum = 0.0
    for batch_images, batch_labels in train_loader:
        # Move the labels to the compute device as int64
        batch_labels = batch_labels.to(device, dtype=torch.long)
        batch_size = batch_labels.size(0)

        logits = classifier(extract_features(batch_images))
        loss = criterion(logits, batch_labels)

        opt.zero_grad()
        loss.backward()
        opt.step()

        # Weight the batch loss by the batch size so the final ratio is per sample
        loss_sum += loss.item() * batch_size
        predictions = logits.argmax(dim=1).cpu()
        hits += (predictions == batch_labels.cpu()).sum().item()
        seen += batch_size
    return loss_sum / seen, hits / seen

#Score the classifier on the held-out test set; no gradients are tracked and no weights are updated
@torch.no_grad()
def eval_epoch():
    """Evaluate the classifier head over test_loader.

    Returns:
        A tuple (mean per-sample loss, accuracy) over all test samples.
    """
    classifier.eval()
    seen = 0
    hits = 0
    loss_sum = 0.0
    for batch_images, batch_labels in test_loader:
        # Move the labels to the compute device as int64
        batch_labels = batch_labels.to(device, dtype=torch.long)
        batch_size = batch_labels.size(0)

        logits = classifier(extract_features(batch_images))
        loss = criterion(logits, batch_labels)

        # Weight the batch loss by the batch size so the final ratio is per sample
        loss_sum += loss.item() * batch_size
        predictions = logits.argmax(dim=1).cpu()
        hits += (predictions == batch_labels.cpu()).sum().item()
        seen += batch_size
    return loss_sum / seen, hits / seen

# Alternate one training pass and one evaluation pass, reporting metrics after each epoch
for epoch in range(5):
    tr_loss, tr_acc = train_epoch()
    te_loss, te_acc = eval_epoch()
    print(
        f"Epoch {epoch+1}: train loss {tr_loss:.4f} acc {tr_acc:.3f}"
        f" | test loss {te_loss:.4f} acc {te_acc:.3f}"
    )

Epoch 1: train loss 1.6266 acc 0.456 | test loss 1.2395 acc 0.609
Epoch 2: train loss 1.3241 acc 0.561 | test loss 1.1325 acc 0.636
Epoch 3: train loss 1.2422 acc 0.589 | test loss 1.0661 acc 0.658
Epoch 4: train loss 1.1852 acc 0.607 | test loss 1.0125 acc 0.675
Epoch 5: train loss 1.1433 acc 0.620 | test loss 0.9737 acc 0.684
