This tutorial demonstrates how to use int8 operations with OpenCLIP.

Usually matrix multiplies are conducted in float16 or bfloat16, but int8 operations are faster.

For more information please see https://github.com/mlfoundations/open_clip#int8-support

We ran this on an A100 GPU.

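Note that this tutorial requires additional pip installs on top of those required for standard OpenCLIP: scikit-image (installed in the next cell, used for the example image), plus the `tomeov` and `openvino` packages that are imported in later cells.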

In [None]:
# Extra install for this notebook: scikit-image supplies the `skimage.data`
# example image that is loaded further down.
# NOTE(review): later cells also import `tomeov` and `openvino`, which are not
# installed here — confirm they are already available in the environment.
%pip install scikit-image

Let's start by using a standard OpenCLIP model.

In [None]:
# import packages
import numpy as np
import torch
import open_clip
from open_clip import tokenizer

In [None]:
# Choose the architecture and the pretrained checkpoint tag, then build the
# model together with the transform that is later applied to input images.
name, pretrained = "ViT-B-16-plus-240", "laion400m_e32"
model, _, preprocess = open_clip.create_model_and_transforms(
    name,
    pretrained=pretrained,
)

In [None]:
# Let's check out the example image we will be using for classification.
import matplotlib.pyplot as plt
from skimage import data, data_dir
import os
from PIL import Image
%matplotlib inline

# Load the astronaut sample that ships with scikit-image.
img = data.astronaut()

# Display the picture with the axes hidden.
plt.imshow(img)
plt.axis('off')
plt.show()

In [None]:

# preprocess image and text
# Re-open the same sample from scikit-image's data directory as an RGB PIL image
# (presumably because `preprocess` expects a PIL image — confirm).
img = Image.open(os.path.join(data_dir, 'astronaut.png')).convert("RGB")
img_preprocessed = preprocess(img).unsqueeze(0)  # add a leading batch dimension

# Candidate captions, keyed by a short label that is reused as the bar-chart tick.
descriptions = {
    "page": "a page of text about segmentation",
    "chelsea": "a facial photo of a tabby cat",
    "astronaut": "a portrait of an astronaut with the American flag",
    "rocket": "a rocket standing on a launchpad",
    "motorcycle_right": "a red motorcycle standing in a garage",
    "camera": "a person looking at a camera on a tripod",
    "horse": "a black-and-white silhouette of a horse",
    "coffee": "a cup of coffee on a saucer",
}
# Materialize the captions as a list: the tokenizer is documented to take a
# string or a list of strings, not a dict view.
texts = list(descriptions.values())

text_processed = tokenizer.tokenize(texts)

In [None]:
# get predictions from the model
# Inference only: disable autograd so no graph is built for the forward pass.
with torch.no_grad(), torch.cuda.amp.autocast():
    img_embedding, text_embedding, _ = model(img_preprocessed, text_processed)
# Scaled image-to-caption similarities, normalized into a distribution over the captions.
probs = (100 * img_embedding @ text_embedding.T).softmax(dim=-1)
# softmax yields fractions in [0, 1]; scale to percent so the bars match the axis label.
plt.bar(descriptions.keys(), 100 * probs.squeeze().detach().cpu().numpy())
plt.xticks(rotation=45)
plt.ylabel('Probability (%)')
plt.show()

In [None]:
import tomeov
from copy import deepcopy

# Work on a deep copy so that patching does not modify the original `model`.
model_opt = deepcopy(model)

# Patch the copy via tomeov; the return value is not used, so the patch is
# expected to modify `model_opt` itself.
# NOTE(review): presumably enables token merging with 24 as the merge setting —
# confirm against the tomeov documentation.
tomeov.patch_openclip(model_opt, 24)

In [None]:
import time

# Run the patched copy on the same inputs and time the forward pass.
# Inference only: disable autograd so no graph is built (and none is timed).
with torch.no_grad(), torch.cuda.amp.autocast():
    # perf_counter is the clock intended for measuring short intervals;
    # time.time() can jump if the system clock is adjusted.
    start = time.perf_counter()
    img_embedding, text_embedding, _ = model_opt(img_preprocessed, text_processed)
    print(f"elapsed: {time.perf_counter() - start} seconds")
# Scaled image-to-caption similarities, normalized into a distribution over the captions.
probs = (100 * img_embedding @ text_embedding.T).softmax(dim=-1)
# softmax yields fractions in [0, 1]; scale to percent so the bars match the axis label.
plt.bar(descriptions.keys(), 100 * probs.squeeze().detach().cpu().numpy())
plt.xticks(rotation=45)
plt.ylabel('Probability (%)')
plt.show()

In [None]:
from pathlib import Path

# NOTE: the variable name is misspelled ("ouptut") but is kept as-is because
# later cells reference it under this name.
ouptut_dir = "openvino_tome"
# exist_ok avoids the check-then-create race and keeps the cell safe to re-run.
os.makedirs(ouptut_dir, exist_ok=True)

# Record which architecture / pretrained tag the exported files belong to.
with open(Path(ouptut_dir) / "model_index.txt", 'w') as fd:
    fd.write(f"{name},{pretrained}\n")

In [None]:
# NOTE(review): this rebinds `tokenizer`, shadowing the `open_clip.tokenizer`
# module imported at the top of the notebook; re-running the earlier
# `tokenizer.tokenize(...)` cell after this one would likely fail.
tokenizer = open_clip.get_tokenizer(name)

# Sample image and text inputs that the ONNX export cells below pass as
# example arguments.
image = preprocess(Image.open("../../docs/CLIP.png")).unsqueeze(0)
text = tokenizer("a cat")

In [None]:
# Export the vision tower of the patched model to ONNX, using the sample
# `image` batch as the example input.
torch.onnx.export(
    model_opt.visual,
    image,
    "image_encoder.onnx",  # destination file
    input_names=["image"],
    output_names=["image_embedding"],
    # axis 0 of both the input and the output is exported as a named,
    # variable-length "batch" axis
    dynamic_axes={
        "image": {0: "batch"},
        "image_embedding": {0: "batch"},
    },
    opset_version=14,  # ONNX opset to target
)

In [None]:
from openvino.tools.mo import convert_model
import openvino.runtime as ov

# Convert the exported ONNX image encoder and serialize the result into the
# output folder.
opt_image_encoder_path = f"{ouptut_dir}/image_encoder.xml"
ov_encoder = convert_model("image_encoder.onnx")
ov.serialize(ov_encoder, opt_image_encoder_path)

In [None]:
class TextTransformerExportWrapper(torch.nn.Module):
    """Wrap a model so that its text branch is callable as ``forward(text)``.

    The wrapped ``model`` must provide ``token_embedding``,
    ``positional_embedding``, ``transformer``, ``attn_mask``, ``ln_final`` and
    ``text_projection``; these are the only attributes read here.
    """

    def __init__(self, model):
        super().__init__()
        self.model = model

    def forward(self, text):
        """Return the projected feature of each sequence in ``text``.

        For every row of ``text`` the feature is taken at the position of the
        row's largest token id (the eot token is the highest number in each
        sequence) and multiplied by ``text_projection``.
        """
        clip = self.model
        hidden = clip.token_embedding(text) + clip.positional_embedding
        # The transformer is fed sequence-first input: NLD -> LND, then back.
        hidden = clip.transformer(hidden.permute(1, 0, 2), attn_mask=clip.attn_mask)
        hidden = clip.ln_final(hidden.permute(1, 0, 2))  # [batch_size, n_ctx, transformer.width]
        # Select, per row, the feature at the eot position.
        row_index = torch.arange(hidden.shape[0])
        eot_index = text.argmax(dim=-1)
        return hidden[row_index, eot_index] @ clip.text_projection

# Wrap the patched model copy so its text branch can be exported as a module
# taking only the token tensor.
transformer_export_wrapper = TextTransformerExportWrapper(model_opt)

In [None]:
# Export the wrapped text branch to ONNX, using the sample `text` tokens as
# the example input.
torch.onnx.export(
    transformer_export_wrapper,
    text,
    "text_encoder.onnx",  # where to save the model
    opset_version=14,  # the ONNX version to export the model to
    input_names=["input_ids"],  # the model's input names
    output_names=["text_embeds"],  # the model's output names
    dynamic_axes={  # variable length axes
        "input_ids": {0: "batch"},
        # Name the output batch axis as well, mirroring the image encoder
        # export, so the exported graph labels it "batch" explicitly.
        "text_embeds": {0: "batch"},
    }
)

In [None]:
from openvino.tools.mo import convert_model
import openvino.runtime as ov

# Convert the exported ONNX text encoder and serialize the result into the
# output folder.
ov_transformer = convert_model("text_encoder.onnx")
ov.serialize(ov_transformer, f"{ouptut_dir}/text_encoder.xml")