Let's start by using a standard OpenCLIP model.

In [None]:
# import packages
import numpy as np
import torch
import open_clip
from open_clip import tokenizer

In [None]:
# Let's check out the example image we will be using for classification
import matplotlib.pyplot as plt
from skimage import data, data_dir
import os
from PIL import Image
%matplotlib inline

# Fetch scikit-image's astronaut sample and show it with the axes hidden.
img = data.astronaut()

_, ax = plt.subplots()
ax.imshow(img)
ax.axis('off')
plt.show()

In [None]:
# install clip_benchmark from source: https://github.com/AlexKoff88/CLIP_benchmark/tree/openvino_alt
from clip_benchmark.models import load_clip

# Load the model and its preprocessing callable through clip_benchmark
# (model type "openvino_clip", running on CPU).
# NOTE(review): `name` is assigned but never passed to load_clip below --
# `model_dir` is given as the second argument instead; confirm this is intended.
name = "ViT-B-16-plus-240"
pretrained = "laion400m_e32"
model_dir = "openvino_tome"
# load_clip returns a 3-tuple; the third element is not needed here.
model, preprocess, _ = load_clip(
    "openvino_clip", model_dir, pretrained, cache_dir=None, device="cpu"
)

# Open the astronaut sample from scikit-image's data directory as an RGB PIL
# image, preprocess it, and add a leading dimension of size 1 with
# unsqueeze(0) (presumably the batch dimension the model expects).
img = Image.open(os.path.join(data_dir, "astronaut.png")).convert("RGB")
img_preprocessed = preprocess(img).unsqueeze(0)

# Candidate captions, keyed by a short label that is later used on the plot.
descriptions = {
    "page": "a page of text about segmentation",
    "chelsea": "a facial photo of a tabby cat",
    "astronaut": "a portrait of an astronaut with the American flag",
    "rocket": "a rocket standing on a launchpad",
    "motorcycle_right": "a red motorcycle standing in a garage",
    "camera": "a person looking at a camera on a tripod",
    "horse": "a black-and-white silhouette of a horse",
    "coffee": "a cup of coffee on a saucer",
}
# Materialise the captions as a real list: a dict view is not a sequence,
# while the tokenizer is documented to take a string or a list of strings.
texts = list(descriptions.values())

text_processed = tokenizer.tokenize(texts)

In [None]:
# Run the image and the tokenized captions through the model. This is
# inference only, so autograd tracking is switched off. The third value
# returned by the model is not used here.
with torch.no_grad():
    img_embedding, text_embedding, _ = model(img_preprocessed, text_processed)

# Image-text similarities scaled by a fixed factor of 100, then a softmax over
# the last dimension (presumably one entry per caption -- confirm shapes).
probs = (100 * img_embedding @ text_embedding.T).softmax(dim=-1)

# softmax yields fractions in [0, 1], but the y-axis is labelled in percent,
# so the plotted heights are scaled by 100. `probs` itself is left as
# fractions.
percentages = 100 * probs.squeeze().detach().cpu().numpy()
plt.bar(list(descriptions), percentages)
plt.xticks(rotation=45)
plt.ylabel('Probability (%)')
plt.show()