In [None]:
# Path of the ONNX model file handed to ort.InferenceSession further down.
onnx_model_path = "./model/openvino_model_quant_st.onnx"
# Execution-provider name that add_ep_for_device matches against the
# ep_name of the entries returned by ort.get_ep_devices().
ExecutionProvider="OpenVINOExecutionProvider"

In [None]:
from PIL import Image
import requests
 
from transformers import CLIPProcessor
import onnxruntime as ort
import numpy as np
import torch
 
# Load the CLIP processor (tokenizer + image preprocessing) with use_fast=False.
processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14", use_fast=False)

# Fetch the sample image. The timeout keeps the script from hanging forever on
# a stalled connection, and raise_for_status() surfaces HTTP errors here
# instead of as a confusing image-decoding failure inside PIL.
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
response = requests.get(url, stream=True, timeout=30)
response.raise_for_status()
image = Image.open(response.raw)

# Ten prompts: the cat/dog pair repeated five times.
# NOTE(review): presumably the exported model expects a text batch of 10 — confirm.
texts = ["a photo of a cat", "a photo of a dog"] * 5

# Pad/truncate every prompt to max_length=77 and return NumPy arrays, which
# are later fed directly to the ONNX Runtime session.
inputs = processor(
    text=texts,
    images=image,
    return_tensors="np",
    padding="max_length",
    max_length=77,
    truncation=True,
)
 

def add_ep_for_device(session_options, ep_name, device_type, ep_options=None):
    """Register ``ep_name`` on ``session_options`` for each matching device.

    Every entry returned by ``ort.get_ep_devices()`` whose ``ep_name`` equals
    *ep_name* and whose ``device.type`` equals *device_type* is passed, one at
    a time, to ``session_options.add_provider_for_devices``. When no entry
    matches, nothing is registered and nothing is printed.

    Args:
        session_options: Object exposing ``add_provider_for_devices``.
        ep_name: Execution-provider name to match.
        device_type: Value compared against ``ep_device.device.type``.
        ep_options: Provider options forwarded with each registration; an
            empty dict is used when ``None``.
    """
    provider_options = {} if ep_options is None else ep_options
    matching = [
        candidate
        for candidate in ort.get_ep_devices()
        if candidate.ep_name == ep_name and candidate.device.type == device_type
    ]
    for candidate in matching:
        print(f"Adding {ep_name} for {device_type}")
        session_options.add_provider_for_devices([candidate], provider_options)
 
opts = ort.SessionOptions()

# Register the configured execution provider, restricted to NPU devices.
add_ep_for_device(opts, ExecutionProvider, ort.OrtHardwareDeviceType.NPU)
# Raise explicitly instead of using `assert`: assertions are stripped under
# `python -O`, which would let the script continue with no provider registered.
if not opts.has_providers():
    raise RuntimeError(
        f"No provider was registered for {ExecutionProvider} on an NPU device"
    )

session = ort.InferenceSession(onnx_model_path, sess_options=opts)

# session.run returns a list with one array per requested output name, so
# despite its name this variable holds a single-element list.
logits_per_image = session.run(
    ["logits_per_image"],
    {
        # Token tensors are cast to int64 before being fed to the model.
        "input_ids": inputs["input_ids"].astype(np.int64),
        "attention_mask": inputs["attention_mask"].astype(np.int64),
        "pixel_values": inputs["pixel_values"],
    },
)

# Softmax over dim 1 of the only output.
# NOTE(review): presumably dim 1 indexes the text prompts — confirm.
probs = torch.tensor(logits_per_image[0]).softmax(dim=1)
print("Label probs:", probs)