## CLIP ViT-B/32 Zero-Shot Analysis - CIFAR-10

In [1]:
import torch
from torchvision import datasets
from torch.utils.data import DataLoader, ConcatDataset
import clip


  from pkg_resources import packaging


In [None]:
# Pick the best available accelerator: CUDA first, then Apple-silicon MPS,
# otherwise fall back to the CPU.
# `torch.backends.mps.is_available()` is the documented availability check;
# `torch.mps.is_available()` is not present in every PyTorch release and would
# raise AttributeError on machines without CUDA when it is missing.
if torch.cuda.is_available():
    device = "cuda"
elif torch.backends.mps.is_available():
    device = "mps"
else:
    device = "cpu"
print("Device:", device)
# clip.load returns the model together with the image transform that is later
# passed to the datasets as `transform=preprocess`.
model, preprocess = clip.load("ViT-B/32", device=device)

Device: cuda


### CIFAR-10 DataLoader (full set: train + test splits)

In [None]:
# Build both CIFAR-10 splits (train first, then test). Each is downloaded into
# ./data when missing and uses CLIP's own `preprocess` transform.
train_dataset, test_dataset = (
    datasets.CIFAR10(
        root="./data",
        train=is_train,
        download=True,
        transform=preprocess,
    )
    for is_train in (True, False)
)

# The evaluation set is the concatenation of both splits.
full_dataset = ConcatDataset([train_dataset, test_dataset])

# Fixed iteration order (no shuffling), 128 images per batch, 2 worker processes.
loader = DataLoader(full_dataset, batch_size=128, shuffle=False, num_workers=2)

### Generating text embeddings of prompts with ensembling

In [4]:
# Class names; a name's position in this list is used as its integer label.
cifar10_classes = [
    "airplane",
    "automobile",
    "bird",
    "cat",
    "deer",
    "dog",
    "frog",
    "horse",
    "ship",
    "truck",
]

# Prompts used in the paper "Learning Transferable Visual Models From Natural Language Supervision"
# The same nine phrasings appear twice: once with the indefinite article ...
_templates_indefinite = [
    'a photo of a {}.',
    'a blurry photo of a {}.',
    'a black and white photo of a {}.',
    'a low contrast photo of a {}.',
    'a high contrast photo of a {}.',
    'a bad photo of a {}.',
    'a good photo of a {}.',
    'a photo of a small {}.',
    'a photo of a big {}.',
]
# ... and once with the definite article.
_templates_definite = [
    'a photo of the {}.',
    'a blurry photo of the {}.',
    'a black and white photo of the {}.',
    'a low contrast photo of the {}.',
    'a high contrast photo of the {}.',
    'a bad photo of the {}.',
    'a good photo of the {}.',
    'a photo of the small {}.',
    'a photo of the big {}.',
]
# Full prompt set: "a ..." variants first, then "the ..." variants.
templates = _templates_indefinite + _templates_definite

def get_text_features(templates, classes, model, device):
    """Build one unit-norm text embedding per class, with optional prompt ensembling.

    For every class name, each template is filled in with the name, the
    prompts are tokenized with ``clip.tokenize`` and encoded with
    ``model.encode_text``. Each prompt embedding is L2-normalized, the
    normalized embeddings are averaged, and the mean is normalized again.
    When ``templates`` is None the bare class name is the only prompt.

    Args:
        templates: Format strings with a single ``{}`` placeholder, or None.
            It is iterated once per class, so it must be re-iterable
            (e.g. a list), not a one-shot generator.
        classes: Iterable of class-name strings.
        model: Object exposing ``encode_text`` (the CLIP model in this file).
        device: Device the tokenized prompts are moved to before encoding.

    Returns:
        A tensor with one row per class, in the order of ``classes``.
    """
    all_text_features = []
    # Inference only: without no_grad the encoder records an autograd graph,
    # wasting memory and leaving the returned embeddings attached to it.
    with torch.no_grad():
        for classname in classes:
            if templates is not None:
                texts = [template.format(classname) for template in templates]
            else:
                texts = [classname]
            tokenized = clip.tokenize(texts).to(device)
            text_features = model.encode_text(tokenized)
            # Normalize each prompt embedding before averaging so every prompt
            # contributes with equal weight.
            text_features /= text_features.norm(dim=-1, keepdim=True)
            class_feature = text_features.mean(dim=0)
            # The mean of unit vectors is not unit length; re-normalize it.
            class_feature /= class_feature.norm()
            all_text_features.append(class_feature)
    return torch.stack(all_text_features, dim=0)

# Baseline: each class is represented by its bare name only (no templates).
plain_text_features = get_text_features(
    templates=None,
    classes=cifar10_classes,
    model=model,
    device=device,
)
plain_text_features = plain_text_features.cpu()

# Ensemble: each class is represented by the combined embedding of all templates.
ensemble_text_features = get_text_features(
    templates=templates,
    classes=cifar10_classes,
    model=model,
    device=device,
)
ensemble_text_features = ensemble_text_features.cpu()

### Zero-shot classification

In [5]:
def zero_shot_classification(loader, text_features, model, device, classes):
    """Classify every image in ``loader`` by its most similar class text embedding.

    Each batch is encoded with ``model.encode_image`` and L2-normalized, then
    compared to ``text_features`` by dot product. The arg-max class is the
    prediction.

    Args:
        loader: Iterable yielding ``(images, labels)`` batches, where labels
            are integer class indices into ``classes``.
        text_features: Tensor with one row per class, in the order of ``classes``.
        model: Object exposing ``eval()`` and ``encode_image``.
        device: Device on which images are encoded and similarities computed.
        classes: Sequence of class names; index ``i`` names label ``i``.

    Returns:
        ``(accuracy, class_accuracies)``: overall accuracy in percent and a
        dict mapping each class name to its accuracy in percent. A value is
        NaN when there are no samples to compute it from (an empty loader,
        or a class that never occurs).
    """
    num_classes = len(classes)
    # Per-class counters kept on the CPU; updated once per batch instead of
    # once per sample, avoiding a host/device sync for every image.
    class_correct = torch.zeros(num_classes, dtype=torch.long)
    class_total = torch.zeros(num_classes, dtype=torch.long)

    # Loop-invariant: move the class embeddings to the device a single time.
    text_features = text_features.to(device)

    model.eval()
    with torch.no_grad():
        for images, labels in loader:
            images = images.to(device)
            labels = labels.cpu()

            # Encode images
            image_features = model.encode_image(images)
            image_features /= image_features.norm(dim=-1, keepdim=True)

            # Compute similarity with text features
            similarity = 100.0 * image_features @ text_features.T
            preds = similarity.argmax(dim=1).cpu()

            hits = preds == labels
            class_total += torch.bincount(labels, minlength=num_classes)
            class_correct += torch.bincount(labels[hits], minlength=num_classes)

    total = int(class_total.sum())
    correct = int(class_correct.sum())
    # Guard against an empty loader instead of dividing by zero.
    accuracy = correct / total * 100 if total else float("nan")

    class_accuracies = {}
    per_class_counts = zip(classes, class_correct.tolist(), class_total.tolist())
    for classname, n_correct, n_total in per_class_counts:
        # A class that never appears has no defined accuracy.
        if n_total:
            class_accuracies[classname] = 100.0 * n_correct / n_total
        else:
            class_accuracies[classname] = float("nan")

    return accuracy, class_accuracies

In [6]:
# Evaluate both prompt strategies on the same loader.
plain_accuracy, plain_class_accuracies = zero_shot_classification(
    loader, plain_text_features, model, device, cifar10_classes
)
ensemble_accuracy, ensemble_class_accuracies = zero_shot_classification(
    loader, ensemble_text_features, model, device, cifar10_classes
)

# One (headline, per-class results) pair per strategy; the second headline
# starts with a newline so the two reports are separated by a blank line.
reports = (
    (
        f"Zero-shot accuracy on CIFAR-10 (plain labels): {plain_accuracy:.2f}%",
        plain_class_accuracies,
    ),
    (
        f"\nZero-shot accuracy on CIFAR-10 (label ensembling): {ensemble_accuracy:.2f}%",
        ensemble_class_accuracies,
    ),
)
for headline, per_class in reports:
    print(headline)
    print("Per-class accuracy:")
    for name, value in per_class.items():
        print(f"    {name:10s}: {value:.2f}%")

Zero-shot accuracy on CIFAR-10 (plain labels): 87.49%
Per-class accuracy:
    airplane  : 83.37%
    automobile: 89.83%
    bird      : 90.88%
    cat       : 79.30%
    deer      : 80.70%
    dog       : 85.85%
    frog      : 77.40%
    horse     : 97.45%
    ship      : 96.30%
    truck     : 93.80%

Zero-shot accuracy on CIFAR-10 (label ensembling): 89.68%
Per-class accuracy:
    airplane  : 92.02%
    automobile: 93.62%
    bird      : 89.18%
    cat       : 85.23%
    deer      : 80.85%
    dog       : 88.30%
    frog      : 81.62%
    horse     : 97.35%
    ship      : 95.55%
    truck     : 93.03%
