In [1]:
import torch
from PIL import Image
import open_clip
import os

In [2]:
name = "ViT-B-16-plus-240"
pretrained = "laion400m_e32"
model, train_transform, eval_transform = open_clip.create_model_and_transforms(name, pretrained=pretrained)

In [None]:
os.makedirs("openvino_fp32")
with open("openvino_fp32/model_index.txt", 'a') as fd:
    fd.write(f"{name}, {pretrained}\n")

In [3]:
tokenizer = open_clip.get_tokenizer('ViT-B-16-plus-240')

image = eval_transform(Image.open("../docs/CLIP.png")).unsqueeze(0)
text = tokenizer(["a diagram", "a dog", "a cat"])

In [5]:
torch.onnx.export(
    model.visual,
    image,
    "image_encoder.onnx",  # where to save the model
    opset_version=14,  # the ONNX version to export the model to
    input_names=["image"],  # the model's input names
    output_names=["image_embedding"],  # the model's output names
    dynamic_axes={  # variable length axes
        "image": {0: "batch", 1: "num_channels", 2: "height", 3: "width"},
        "image_embedding": {0: "batch"},
    }
)

In [6]:
from openvino.tools.mo import convert_model
import openvino.runtime as ov

ov_encoder = convert_model("image_encoder.onnx")
ov.serialize(ov_encoder, "openvino_fp32/image_encoder.xml")

In [7]:
cast_dtype = model.transformer.get_cast_dtype()

x = model.token_embedding(text).to(cast_dtype)  # [batch_size, n_ctx, d_model]

x = x + model.positional_embedding.to(cast_dtype)
x = x.permute(1, 0, 2)  # NLD -> LND


In [8]:
torch.onnx.export(
    model.transformer,
    (x, model.attn_mask),
    "text_encoder.onnx",  # where to save the model
    opset_version=14,  # the ONNX version to export the model to
    input_names=["input_ids", "attention_mask"],  # the model's input names
    output_names=["text_embeds"],  # the model's output names
    dynamic_axes={  # variable length axes
        "input_ids": {0: "batch", 1: "sequence"},
        "attention_mask": {0: "batch", 1: "sequence"},
    }
)

In [9]:
from openvino.tools.mo import convert_model
import openvino.runtime as ov

ov_transformer = convert_model("text_encoder.onnx")
ov.serialize(ov_transformer, "openvino_fp32/text_encoder.xml")



In [10]:
!benchmark_app -m image_encoder.onnx -shape "image[1,3,240,240]" -api sync

[Step 1/11] Parsing and validating input arguments
[ INFO ] Parsing input parameters
[Step 2/11] Loading OpenVINO Runtime
[ INFO ] OpenVINO:
[ INFO ] Build ................................. 2023.0.0-9771-d7f47aa1228
[ INFO ] 
[ INFO ] Device info:
[ INFO ] CPU
[ INFO ] Build ................................. 2023.0.0-9771-d7f47aa1228
[ INFO ] 
[ INFO ] 
[Step 3/11] Setting device configuration
[Step 4/11] Reading model files
[ INFO ] Loading model files
[ INFO ] Read model took 617.92 ms
[ INFO ] Original model I/O parameters:
[ INFO ] Model inputs:
[ INFO ]     image (node: image) : f32 / [...] / [?,?,?,?]
[ INFO ] Model outputs:
[ INFO ]     image_embedding (node: image_embedding) : f32 / [...] / [?,640]
[Step 5/11] Resizing model to match image sizes and given batch
[ INFO ] Model batch size: 1
[ INFO ] Reshaping model: 'image': [1,3,240,240]
[ INFO ] Reshape model took 66.17 ms
[Step 6/11] Configuring input of the model
[ INFO ] Model inputs:
[ INFO ]     image (node: image) : u8 /

In [11]:
!benchmark_app -m text_encoder.onnx -shape "input_ids[77,1,640],attention_mask[77,77]" -api sync

[Step 1/11] Parsing and validating input arguments
[ INFO ] Parsing input parameters
[Step 2/11] Loading OpenVINO Runtime
[ INFO ] OpenVINO:
[ INFO ] Build ................................. 2023.0.0-9771-d7f47aa1228
[ INFO ] 
[ INFO ] Device info:
[ INFO ] CPU
[ INFO ] Build ................................. 2023.0.0-9771-d7f47aa1228
[ INFO ] 
[ INFO ] 
[Step 3/11] Setting device configuration
[Step 4/11] Reading model files
[ INFO ] Loading model files
[ INFO ] Read model took 479.99 ms
[ INFO ] Original model I/O parameters:
[ INFO ] Model inputs:
[ INFO ]     input_ids (node: input_ids) : f32 / [...] / [?,?,640]
[ INFO ]     attention_mask (node: attention_mask) : f32 / [...] / [?,?]
[ INFO ] Model outputs:
[ INFO ]     text_embeds (node: text_embeds) : f32 / [...] / [?,?,640]
[Step 5/11] Resizing model to match image sizes and given batch
[ INFO ] Model batch size: 1
[ INFO ] Reshaping model: 'input_ids': [77,1,640], 'attention_mask': [77,77]
[ INFO ] Reshape model took 83.06 ms
[S

In [12]:
torch.onnx.export(
    model,
    (image, text),
    "model.onnx",  # where to save the model
    opset_version=14,  # the ONNX version to export the model to
    input_names=["image", "text"],  # the model's input names
    output_names=["image_embedding"],  # the model's output names
    dynamic_axes={  # variable length axes
        "image": {0: "batch", 1: "num_channels", 2: "height", 3: "width"},
        "text": {0: "batch"},
    }
)

  _C._jit_pass_onnx_node_shape_type_inference(node, params_dict, opset_version)
  _C._jit_pass_onnx_graph_shape_type_inference(
  _C._jit_pass_onnx_graph_shape_type_inference(


In [13]:
from openvino.tools.mo import convert_model
import openvino.runtime as ov

ov_transformer = convert_model("model.onnx")
ov.serialize(ov_transformer, "model.xml")



In [16]:
!benchmark_app -m model.xml -shape "image[1,3,240,240],text[1,77]" -api sync

[Step 1/11] Parsing and validating input arguments
[ INFO ] Parsing input parameters
[Step 2/11] Loading OpenVINO Runtime
[ INFO ] OpenVINO:
[ INFO ] Build ................................. 2023.0.0-9771-d7f47aa1228
[ INFO ] 
[ INFO ] Device info:
[ INFO ] CPU
[ INFO ] Build ................................. 2023.0.0-9771-d7f47aa1228
[ INFO ] 
[ INFO ] 
[Step 3/11] Setting device configuration
[Step 4/11] Reading model files
[ INFO ] Loading model files
[ INFO ] Read model took 549.20 ms
[ INFO ] Original model I/O parameters:
[ INFO ] Model inputs:
[ INFO ]     image (node: image) : f32 / [...] / [?,?,?,?]
[ INFO ]     text (node: text) : i64 / [...] / [?,77]
[ INFO ] Model outputs:
[ INFO ]     image_embedding (node: image_embedding) : f32 / [...] / [?,640]
[ INFO ]     3690 (node: 3690) : f32 / [...] / [?,640]
[ INFO ]     3691 (node: 3691) : f32 / [...] / []
[Step 5/11] Resizing model to match image sizes and given batch
[ INFO ] Model batch size: 1
[ INFO ] Reshaping model: 'image