In [None]:
# Path to the ONNX model file that the inference session below loads.
onnx_model_path = "./model/model.onnx"
# Name of the ONNX Runtime execution provider to register on the session options.
ExecutionProvider="QNNExecutionProvider"

### Data Processor

In [None]:
from transformers import CLIPProcessor

# Load the pretrained CLIP processor; get_text_embedding uses it to turn text into model inputs.
processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

### Model Inference with ORT-QNN

In [None]:
import onnxruntime as ort
import torch
import numpy as np

def add_ep_for_device(session_options, ep_name, device_type, ep_options=None):
    """Register an execution provider on ``session_options`` for matching devices.

    Every entry returned by ``ort.get_ep_devices()`` whose ``ep_name`` equals
    ``ep_name`` and whose ``device.type`` equals ``device_type`` is added to the
    session options, one ``add_provider_for_devices`` call per matching entry.
    If nothing matches, the session options are left untouched.

    Args:
        session_options: The ``ort.SessionOptions`` instance to modify.
        ep_name: Execution provider name to look for.
        device_type: Hardware device type to look for.
        ep_options: Optional provider options; an empty dict is passed when None.
    """
    # Lazy filter: candidates are examined one at a time, in the order reported.
    matching_devices = (
        candidate
        for candidate in ort.get_ep_devices()
        if candidate.ep_name == ep_name and candidate.device.type == device_type
    )
    for matched in matching_devices:
        print(f"Adding {ep_name} for {device_type}")
        provider_options = ep_options if ep_options is not None else {}
        session_options.add_provider_for_devices([matched], provider_options)


# Create the session options and register the configured execution provider for NPU devices.
session_options = ort.SessionOptions()
add_ep_for_device(
    session_options,
    ep_name=ExecutionProvider,
    device_type=ort.OrtHardwareDeviceType.NPU,
)

# The model at onnx_model_path is a model with QNN EPContext nodes.
text_model = ort.InferenceSession(onnx_model_path, sess_options=session_options)

def get_text_embedding(text):
    """Run ``text`` through the processor and the ONNX text model.

    The text is padded/truncated to a fixed length, the resulting
    ``input_ids`` and ``attention_mask`` arrays are cast to int32, and the
    first output of ``text_model.run`` is returned as a torch tensor.

    Args:
        text: The text passed to the processor's ``text`` argument.

    Returns:
        A ``torch.Tensor`` wrapping the first model output (via ``torch.from_numpy``).
    """
    tokenized = processor(
        text=text,
        padding="max_length",
        # NOTE(review): 77 is hard-coded; presumably the fixed sequence length
        # the exported model expects — confirm against the model.
        max_length=77,
        truncation=True,
        add_special_tokens=True,
        return_tensors="np",
    )
    # Both model inputs are fed as int32 arrays.
    feeds = {
        input_name: tokenized[input_name].astype(np.int32)
        for input_name in ("input_ids", "attention_mask")
    }
    model_outputs = text_model.run(None, feeds)
    return torch.from_numpy(model_outputs[0])

def calculate_score(emb_1, emb_2):
    """Return the scaled cosine similarity between two embeddings.

    Each embedding is L2-normalized along its last dimension, then the
    normalized embeddings are multiplied (``emb_1 @ emb_2.T``) and scaled
    by 100.

    The normalization is done out of place, so the tensors passed in (and any
    NumPy buffers they share memory with) are left unmodified.

    Args:
        emb_1: First embedding tensor.
        emb_2: Second embedding tensor.

    Returns:
        A tensor holding the similarity score(s) multiplied by 100.
    """
    # Use `x / norm` rather than `x /= norm` to avoid mutating the caller's tensors.
    emb_1 = emb_1 / torch.norm(emb_1, dim=-1, keepdim=True)
    emb_2 = emb_2 / torch.norm(emb_2, dim=-1, keepdim=True)
    return torch.matmul(emb_1, emb_2.T) * 100.0

# Get the source embedding and calculate the similarity score for each target.
# Targets are processed one by one because, due to static quantization, the
# batch size was fixed to 1.
def ask(source, targets):
    """Print the similarity score between ``source`` and each entry of ``targets``.

    The source embedding is computed once; each target is then embedded and
    scored against it individually.

    Args:
        source: The source text.
        targets: An iterable of target texts to compare against ``source``.

    Returns:
        None. Scores are printed, one line per target.
    """
    source_emb = get_text_embedding(source)
    for i, target in enumerate(targets):
        target_emb = get_text_embedding(target)
        score = calculate_score(source_emb, target_emb)
        print(f"Similarity score of sentence {i}：{score.item()}")


### Play with Samples

In [None]:
ask("a photo containing two cats", ["a photo of tshirt", "a photo of two cats"])