In [None]:
# Path of the ONNX model file handed to the inference session.
onnx_model_path = "./model/openvino_model_quant_st.onnx"
# Name of the execution provider to select when configuring the session.
ExecutionProvider = "OpenVINOExecutionProvider"

In [None]:
def _fix_winrt_runtime():
    """This function removes the msvcp140.dll from the winrt-runtime package.
    So it does not cause issues with other libraries.
    """
    from importlib import metadata
    from pathlib import Path
    site_packages_path = Path(str(metadata.distribution('winrt-runtime').locate_file('')))
    dll_path = site_packages_path / 'winrt' / 'msvcp140.dll'
    if dll_path.exists():
        dll_path.unlink()
            
def _get_ep_paths() -> dict[str, str]:
    from winui3.microsoft.windows.applicationmodel.dynamicdependency.bootstrap import (
        InitializeOptions,
        initialize
    )
    import winui3.microsoft.windows.ai.machinelearning as winml
    eps = {}
    with initialize(options = InitializeOptions.ON_NO_MATCH_SHOW_UI):
        catalog = winml.ExecutionProviderCatalog.get_default()
        providers = catalog.find_all_providers()
        for provider in providers:
            provider.ensure_ready_async().get()
            eps[provider.name] = provider.library_path
            # DO NOT call provider.try_register in python. That will register to the native env.
    return eps

def _register_execution_providers_to_onnxruntime():
    """Register every library reported by ``_get_ep_paths`` with ONNX Runtime.

    For each ``(name, library path)`` pair a progress line is printed and
    ``onnxruntime.register_execution_provider_library`` is called with that
    name and path.
    """
    import onnxruntime as ort

    # NOTE(review): running this twice in one kernel may fail if ONNX Runtime
    # rejects a name that is already registered; confirm before re-running.
    for ep_name, library_path in _get_ep_paths().items():
        print(f"----register ort ep---- {ep_name} {library_path}")
        ort.register_execution_provider_library(ep_name, library_path)


# Backward-compatible alias: the function was first defined under this
# misspelled name, so keep it callable for any existing caller.
_regsiter_executino_providers_to_onnxruntime = _register_execution_providers_to_onnxruntime

# Remove the bundled DLL first, then register the execution provider libraries.
_fix_winrt_runtime()
_register_execution_providers_to_onnxruntime()

In [None]:
from PIL import Image
import requests
 
from transformers import CLIPProcessor
import onnxruntime as ort
import numpy as np
import torch
 
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32", use_fast=False)

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
# A timeout (seconds) keeps the cell from hanging forever on a stalled
# connection; raise_for_status() surfaces an HTTP error here instead of
# letting PIL fail later while trying to decode an error page.
response = requests.get(url, stream=True, timeout=30)
response.raise_for_status()
image = Image.open(response.raw)

# Ten prompts: the cat/dog pair repeated five times.
# NOTE(review): presumably this matches a fixed text batch size of the model; confirm.
texts = ["a photo of a cat", "a photo of a dog"] * 5
inputs = processor(
    text=texts,
    images=image,
    return_tensors="np",
    padding="max_length",
    max_length=77,
    truncation=True,
)
 

def add_ep_for_device(session_options, ep_name, device_type, ep_options=None):
    """Attach matching execution-provider devices to ``session_options``.

    Every entry from ``ort.get_ep_devices()`` whose ``ep_name`` equals
    ``ep_name`` and whose ``device.type`` equals ``device_type`` is added
    through ``session_options.add_provider_for_devices``, one call per match.
    A line is printed for each match.

    Args:
        session_options: Object exposing ``add_provider_for_devices``.
        ep_name: Execution provider name to match.
        device_type: Device type to match.
        ep_options: Options passed along with each device; an empty dict is
            used when this is ``None``.

    Returns:
        None. Nothing is added when no device matches.
    """
    # Lazy filter, so each match is printed and added before the next one is examined.
    matching_devices = (
        candidate
        for candidate in ort.get_ep_devices()
        if candidate.ep_name == ep_name and candidate.device.type == device_type
    )
    for candidate in matching_devices:
        print(f"Adding {ep_name} for {device_type}")
        session_options.add_provider_for_devices(
            [candidate],
            ep_options if ep_options is not None else {},
        )
 
opts = ort.SessionOptions()

add_ep_for_device(opts, ExecutionProvider, ort.OrtHardwareDeviceType.NPU)
# Fail early with a readable message when no matching NPU device was added.
assert opts.has_providers(), (
    f"No NPU device found for {ExecutionProvider}; "
    "check that the execution provider library was registered."
)

session = ort.InferenceSession(onnx_model_path, sess_options=opts)
# Only "logits_per_image" is requested; the result is indexed with [0] below.
logits_per_image = session.run(
    ["logits_per_image"],
    {
        "input_ids": inputs['input_ids'].astype(np.int64),
        "attention_mask": inputs['attention_mask'].astype(np.int64),
        "pixel_values": inputs['pixel_values'],
    },
)

# NOTE(review): presumably dim 1 indexes the text prompts, so each row sums to 1; confirm.
probs = torch.tensor(logits_per_image[0]).softmax(dim=1)
print("Label probs:", probs)