# Interacting with CLIP

This notebook shows how to download and run CLIP models, calculate the similarity between arbitrary image and text inputs, and perform zero-shot image classification.

# Preparation for Colab

Make sure you're running a GPU runtime; if not, select "GPU" as the hardware accelerator under Runtime > Change Runtime Type in the menu. The next cells print the CUDA version reported by `nvcc` and contain the command that installs the matching PyTorch 1.7.1 build (commented out by default; uncomment it to run the install).

In [1]:
import re
import subprocess

# Ask nvcc for the CUDA toolkit release. On a runtime without nvcc on PATH
# check_output raises OSError (FileNotFoundError), and a broken toolkit can
# exit non-zero; both are treated as "no CUDA" instead of aborting the notebook.
try:
    nvcc_output = subprocess.check_output(["nvcc", "--version"]).decode("UTF-8")
except (OSError, subprocess.CalledProcessError):
    nvcc_output = ""

# nvcc prints a fragment such as "release 10.1,"; keep only the version number.
# CUDA_version stays None when no such fragment is present.
release_match = re.search(r"release\s+(\d+(?:\.\d+)*)", nvcc_output)
CUDA_version = release_match.group(1) if release_match else None
print("CUDA version:", CUDA_version)

# Wheel suffix appended to the torch/torchvision version pins in the install
# command. CUDA 10.2 maps to the un-suffixed wheel; any other detected release
# keeps the previous "+cu110" fallback; no CUDA at all selects the CPU wheel.
TORCH_SUFFIX_BY_CUDA = {
    "10.0": "+cu100",
    "10.1": "+cu101",
    "10.2": "",
}

if CUDA_version is None:
    torch_version_suffix = "+cpu"
else:
    torch_version_suffix = TORCH_SUFFIX_BY_CUDA.get(CUDA_version, "+cu110")

CUDA version: 9.1


In [2]:
# Installs torch 1.7.1 and torchvision 0.8.2 with the wheel suffix stored in
# torch_version_suffix, plus the ftfy and regex packages. The command is left
# commented out; remove the leading "#" to run it on a fresh runtime.
#! pip install torch==1.7.1{torch_version_suffix} torchvision==0.8.2{torch_version_suffix} -f https://download.pytorch.org/whl/torch_stable.html ftfy regex

In [3]:
import numpy as np
import torch

# Report the PyTorch version the kernel actually imported, so it can be
# compared against the 1.7.1 pin used by the install command.
print("Torch version:", torch.__version__)

Torch version: 1.7.1


# Downloading the model

CLIP models are distributed as TorchScript modules.

In [4]:
# Download URL of each available CLIP checkpoint, keyed by model name.
MODELS = {
    "ViT-B/32": (
        "https://openaipublic.azureedge.net/clip/models/"
        "40d365715913c9da98579312b702a82c18be219cc2a73407c4526f58eba950af/"
        "ViT-B-32.pt"
    ),
}

In [5]:
# Downloads the ViT-B/32 checkpoint listed in MODELS to ./model.pt. The command
# is left commented out: the visible cells below obtain the model through
# clip.load('ViT-B/32', ...) and never reference model.pt.
#! wget {MODELS["ViT-B/32"]} -O model.pt

# Run a model

In [6]:
import os
import clip
import torch
from PIL import Image
from helpers import get_classes

In [7]:
# Load the model
# Use the GPU when PyTorch can see one and fall back to the CPU otherwise, so
# the cell works on both kinds of runtime instead of always running on the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"
model, preprocess = clip.load('ViT-B/32', device)

In [8]:
# Build the two model inputs: one preprocessed image and one tokenized prompt
# per candidate class.
image = Image.open('shark.jpg')
image_input = preprocess(image).unsqueeze(0).to(device)  # unsqueeze(0) adds a leading batch axis

classes = get_classes("cifar100_classes.txt")
prompts = [f"a photo of a {c}" for c in classes]
tokenized_prompts = [clip.tokenize(prompt) for prompt in prompts]
text_inputs = torch.cat(tokenized_prompts).to(device)

In [9]:
# Encode both modalities with gradient tracking disabled; the two encoder
# calls are independent of each other.
with torch.no_grad():
    text_features = model.encode_text(text_inputs)
    image_features = model.encode_image(image_input)

In [10]:
# Rank the labels by similarity to the image and keep the best five.
# Both feature sets are scaled to unit length in place, so the matrix product
# below is a (scaled) cosine similarity; softmax turns it into a distribution
# over the labels.
image_features.div_(image_features.norm(dim=-1, keepdim=True))
text_features.div_(text_features.norm(dim=-1, keepdim=True))
logits = (100.0 * image_features) @ text_features.T
similarity = torch.softmax(logits, dim=-1)
values, indices = torch.topk(similarity[0], 5)

# Show each of the five labels with its probability as a percentage.
print("\nTop predictions:\n")
for value, index in zip(values, indices):
    label = classes[index]
    percent = 100 * value.item()
    print(f"{label:>16s}: {percent:.2f}%")


Top predictions:

         dolphin: 57.62%
           shark: 36.93%
           whale: 2.86%
   aquarium_fish: 0.83%
             sea: 0.46%
