### Data Processor

In [None]:
from transformers import CLIPProcessor

# Checkpoint whose preprocessing configuration is loaded into the shared processor.
CLIP_CHECKPOINT = "openai/clip-vit-base-patch16"
processor = CLIPProcessor.from_pretrained(CLIP_CHECKPOINT)

### Model Inference with ORT-QNN

In [None]:
from pathlib import Path

# TODO: both paths below currently point at the same file. Manually replace
# one of them so the text model and the vision model each get their own path.
text_model_path = Path("./output_model/model.onnx").resolve()
vision_model_path = Path("./output_model/model.onnx").resolve()
# Bare expression: presumably here so the notebook cell echoes the resolved paths.
(text_model_path, vision_model_path)

In [None]:
import onnxruntime as ort
import torch
import numpy as np

# QNN sessions: the text and vision models are opened with identical settings.
options = ort.SessionOptions()
# Optional (left disabled, as in the original cell):
# options.add_session_config_entry("session.disable_cpu_ep_fallback", "1")


def _open_qnn_session(model_path):
    """Create an inference session for *model_path* on the QNN execution provider.

    The session uses the shared module-level ``options`` and the
    ``QnnHtp.dll`` backend.
    """
    return ort.InferenceSession(
        model_path,
        sess_options=options,
        providers=["QNNExecutionProvider"],
        provider_options=[{"backend_path": "QnnHtp.dll"}],
    )


text_model = _open_qnn_session(text_model_path)
vision_model = _open_qnn_session(vision_model_path)

def get_image_embedding(image):
    """Return the vision model's first output for *image* as a torch tensor.

    The image is preprocessed by the module-level ``processor`` into NumPy
    arrays, and its ``pixel_values`` entry is fed to ``vision_model``.
    """
    pixel_values = processor(images=image, return_tensors="np")["pixel_values"]
    vision_outputs = vision_model.run(None, {"pixel_values": pixel_values})
    return torch.from_numpy(vision_outputs[0])

def _create_4d_mask(mask, input_shape, masked_value=-50.0):
    """Turn a 2-D attention mask into a 4-D float mask.

    Args:
        mask: Tensor indexed as ``(batch, seq)``; non-zero entries mark
            positions that should be kept.
        input_shape: Pair ``(batch, seq_len)`` used as the expansion target.
        masked_value: Value written at positions where ``mask`` is 0.

    Returns:
        Float tensor of shape ``(batch, 1, seq_len, seq_len)`` that is 0.0
        where ``mask`` is 1 and ``masked_value`` where ``mask`` is 0.
    """
    n_batch, n_tokens = input_shape
    # Insert two singleton axes, then broadcast each row over the query axis.
    broadcast = mask[:, None, None, :].expand(n_batch, 1, n_tokens, n_tokens)
    # Flip the mask: 1.0 now marks a position that must be masked out.
    to_mask = 1.0 - broadcast.float()
    return to_mask.masked_fill(to_mask.bool(), masked_value)

def get_text_embedding(text):
    """Return the text model's first output for *text* as a torch tensor.

    The text is tokenized by the module-level ``processor``, padded or
    truncated to 77 tokens. The 2-D attention mask is converted to a 4-D
    mask with ``_create_4d_mask``, and the token ids are cast to int32
    before both are fed to ``text_model``.
    """
    encoded = processor(
        text=text,
        padding="max_length",
        # Fixed length; the original note pointed at text_model.sequence_length.
        max_length=77,
        truncation=True,
        add_special_tokens=True,
        return_tensors="pt",
    )
    token_ids = encoded["input_ids"]
    mask_4d = _create_4d_mask(encoded["attention_mask"], token_ids.shape)
    feeds = {
        "input_ids": token_ids.numpy().astype(np.int32),
        "attention_mask": mask_4d.numpy(),
    }
    text_outputs = text_model.run(None, feeds)
    return torch.from_numpy(text_outputs[0])

def calculate_score(text_emb, image_emb):
    """Return the scaled cosine similarity between text and image embeddings.

    Both inputs are L2-normalized along their last dimension, and the
    result is ``text @ image.T`` multiplied by 100.0.

    The normalization is done out of place, so the caller's tensors are
    left untouched. This matters because ``image_emb`` is typically reused
    across several captions, and because tensors built with
    ``torch.from_numpy`` share memory with the underlying NumPy array.

    Args:
        text_emb: Text embedding tensor; the last dimension is the feature axis.
        image_emb: Image embedding tensor; the last dimension is the feature axis.

    Returns:
        Tensor of similarity scores scaled by 100.0.
    """
    image_unit = image_emb / torch.norm(image_emb, dim=-1, keepdim=True)
    text_unit = text_emb / torch.norm(text_emb, dim=-1, keepdim=True)
    return torch.matmul(text_unit, image_unit.T) * 100.0

# Get the image embedding and calculate the similarity score for each caption.
# Captions are processed one at a time because static quantization fixed the batch size to 1.
def ask(image, captions):
    """Score each caption against *image* and return the softmax over captions.

    The image is embedded once. Each caption is then embedded and scored
    individually, and the collected scores are packed into a tensor and
    normalized with a softmax along dimension 0.
    """
    image_emb = get_image_embedding(image)
    per_caption = [
        calculate_score(get_text_embedding(caption), image_emb)
        for caption in captions
    ]
    return torch.tensor(per_caption).softmax(dim=0)

### Play with Samples

In [None]:
import requests
from PIL import Image

# Download the sample image and open it with PIL.
url = "http://images.cocodataset.org/val2017/000000039769.jpg"
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
response = requests.get(url, stream=True, timeout=30)
# Fail here on an HTTP error instead of handing an error page to PIL.
response.raise_for_status()
image = Image.open(response.raw)
image

In [None]:
# Candidate captions to rank against the sample image.
captions = ["a photo of tshirt", "a photo of two cats"]
ask(image, captions)