### Data Processor

In [None]:
from transformers import CLIPProcessor

# Load the preprocessing pipeline published with this CLIP checkpoint; it is
# used later to turn input images into the arrays fed to the vision model.
processor = CLIPProcessor.from_pretrained("laion/CLIP-ViT-B-32-laion2B-s34B-b79K")

### Model Inference with ORT-QNN

In [None]:
from pathlib import Path

# Location of the ONNX vision model, made absolute (resolved against the
# current working directory) before it is handed to the inference session.
vision_model_path = Path("./model/model.onnx").resolve()

In [None]:
import onnxruntime as ort
import torch
import numpy as np

# Build the ONNX Runtime session that runs the vision model through the QNN
# execution provider; "backend_path" names the QNN backend library to load.
options = ort.SessionOptions()
# Left disabled on purpose. NOTE(review): enabling this entry presumably makes
# session creation fail rather than fall back to the CPU provider - confirm
# against the ONNX Runtime docs before turning it on.
# options.add_session_config_entry("session.disable_cpu_ep_fallback", "1")
vision_model = ort.InferenceSession(vision_model_path,
    sess_options=options,
    providers=["QNNExecutionProvider"],
    provider_options=[{"backend_path": "QnnHtp.dll"}])

def get_image_embedding(image):
    """Run the vision model on ``image`` and return its first output as a tensor.

    The image is preprocessed with the module-level ``processor`` (NumPy
    output), passed to the ``vision_model`` session under the
    ``"pixel_values"`` input name, and the first session output is wrapped
    with ``torch.from_numpy`` (which shares memory with the NumPy array).
    """
    preprocessed = processor(images=image, return_tensors="np")
    feed = {"pixel_values": preprocessed["pixel_values"]}
    # Passing None as the output list asks the session for all of its outputs.
    session_outputs = vision_model.run(None, feed)
    embedding = session_outputs[0]
    return torch.from_numpy(embedding)

def calculate_score(emb_1, emb_2):
    """Return the cosine similarity of two embeddings, scaled by 100.

    Each embedding is L2-normalized along its last dimension and the
    normalized ``emb_1`` is then matrix-multiplied with the transpose of the
    normalized ``emb_2``.

    Args:
        emb_1: Tensor of embeddings; the last dimension is the feature axis.
        emb_2: Tensor of embeddings with the same feature size as ``emb_1``.

    Returns:
        Tensor holding ``100 * cos(emb_1, emb_2)`` for every pair of rows.
    """
    # Normalize out of place: the in-place ``/=`` used previously overwrote the
    # caller's tensors (and any NumPy buffer they share memory with via
    # ``torch.from_numpy``), so callers reusing an embedding saw it modified.
    normalized_1 = emb_1 / torch.norm(emb_1, dim=-1, keepdim=True)
    normalized_2 = emb_2 / torch.norm(emb_2, dim=-1, keepdim=True)
    return torch.matmul(normalized_1, normalized_2.T) * 100.0

# Embed the source image, then score it against each target image in turn.
# Images are processed one at a time because static quantization fixed the
# model's batch size to 1.
def ask(source, targets):
    """Return the softmax over the similarity scores of ``source`` vs. ``targets``.

    Args:
        source: Image compared against every target.
        targets: Iterable of images, each scored against ``source``.

    Returns:
        Tensor of the per-target scores after a softmax along dimension 0.
    """
    reference_embedding = get_image_embedding(source)
    similarities = [
        calculate_score(reference_embedding, get_image_embedding(candidate))
        for candidate in targets
    ]
    return torch.softmax(torch.tensor(similarities), dim=0)

### Play with Samples

In [None]:
import requests
from PIL import Image

# Download the source sample image.
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() reports HTTP errors instead of handing an error page to PIL.
response = requests.get(url, stream=True, timeout=30)
response.raise_for_status()
image = Image.open(response.raw)
image

In [None]:
# Download the first target sample image.
url = "http://images.cocodataset.org/train2017/000000208833.jpg"
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() reports HTTP errors instead of handing an error page to PIL.
response = requests.get(url, stream=True, timeout=30)
response.raise_for_status()
image1 = Image.open(response.raw)
image1

In [None]:
# Download the second target sample image.
url = "http://images.cocodataset.org/train2017/000000125690.jpg"
# The timeout keeps the cell from hanging forever on a stalled connection, and
# raise_for_status() reports HTTP errors instead of handing an error page to PIL.
response = requests.get(url, stream=True, timeout=30)
response.raise_for_status()
image2 = Image.open(response.raw)
image2

In [None]:
# Score the source sample image against the two target images; the displayed
# result is the softmax over their similarity scores.
ask(image, [image1, image2])