### Data Processor

In [None]:
from transformers import CLIPProcessor

# Load the preprocessing pipeline published with the
# "openai/clip-vit-large-patch14" checkpoint. It is called later in this file
# to turn text into "input_ids" and "attention_mask" tensors.
processor = CLIPProcessor.from_pretrained("openai/clip-vit-large-patch14")

### Model Inference with ORT-QNN

In [None]:
from pathlib import Path

# Absolute path of the ONNX text model; the relative path is resolved against
# the current working directory at the time this cell runs.
text_model_path = Path("./model/model.onnx").resolve()

In [None]:
import onnxruntime as ort
import torch
import numpy as np

# Create the ONNX Runtime session that runs the text model through the QNN
# execution provider, using the "QnnHtp.dll" backend library.
options = ort.SessionOptions()
# Optional: uncomment to set "session.disable_cpu_ep_fallback" to "1"
# (presumably makes the session refuse to fall back to the CPU provider;
# confirm against the ONNX Runtime documentation).
# options.add_session_config_entry("session.disable_cpu_ep_fallback", "1")
text_model = ort.InferenceSession(
    text_model_path,
    sess_options=options,
    providers=["QNNExecutionProvider"],
    provider_options=[{"backend_path": "QnnHtp.dll"}],
)

def _create_4d_mask(mask, input_shape, masked_value=-50.0):
    """Turn a 2-D attention mask into a 4-D additive mask.

    Args:
        mask: Tensor indexed as ``(batch, seq_len)``; non-zero entries mark
            positions to keep, zeros mark positions to mask out.
        input_shape: ``(batch, seq_len)`` pair used as the target sizes.
        masked_value: Value written at every masked-out position.

    Returns:
        A float tensor of shape ``(batch, 1, seq_len, seq_len)`` holding 0.0
        where ``mask`` is 1 and ``masked_value`` where ``mask`` is 0.
    """
    n_batch, n_tokens = input_shape
    # Insert two singleton axes, then broadcast the key axis over every query row.
    per_query = mask.unsqueeze(1).unsqueeze(2).expand(
        n_batch, 1, n_tokens, n_tokens
    )
    # After inversion, 1.0 marks a padded position and 0.0 a real token.
    padding = 1.0 - per_query.float()
    is_padding = padding.bool()
    return padding.masked_fill(is_padding, masked_value)

def get_text_embedding(text):
    """Run the ONNX text model on ``text`` and return its first output.

    The text is tokenized with the module-level ``processor``, padded or
    truncated to exactly 77 tokens, and fed to the module-level
    ``text_model`` session together with a 4-D additive attention mask.

    Args:
        text: The text passed to ``processor`` as its ``text`` argument.

    Returns:
        A torch tensor wrapping the first output of the session run.
    """
    encoded = processor(
        text=text,
        padding="max_length",
        # Hard-coded length; the original note suggests this should follow
        # text_model.sequence_length.
        max_length=77,
        truncation=True,
        add_special_tokens=True,
        return_tensors="pt",
    )
    token_ids = encoded["input_ids"]
    additive_mask = _create_4d_mask(encoded["attention_mask"], token_ids.shape)

    # The session is fed NumPy arrays; token ids are cast to int32.
    feeds = {
        "input_ids": token_ids.numpy().astype(np.int32),
        "attention_mask": additive_mask.numpy(),
    }
    results = text_model.run(None, feeds)
    return torch.from_numpy(results[0])

def calculate_score(emb_1, emb_2):
    """Return the cosine similarity of two embedding batches, scaled by 100.

    Both inputs are L2-normalised along their last dimension before the
    product is taken. The normalisation is done out of place, so the
    caller's tensors (and any NumPy buffers they share memory with) are
    left untouched.

    Args:
        emb_1: Tensor whose last dimension holds the embedding features.
        emb_2: Tensor whose last dimension holds the embedding features.

    Returns:
        ``matmul(normalised emb_1, normalised emb_2 transposed) * 100.0``.
        Rows with zero norm yield NaN, because no epsilon is applied.
    """
    unit_1 = emb_1 / torch.norm(emb_1, dim=-1, keepdim=True)
    unit_2 = emb_2 / torch.norm(emb_2, dim=-1, keepdim=True)
    return torch.matmul(unit_1, unit_2.T) * 100.0

# Get the source embedding and calculate the similarity score for each target.
# The texts are processed one by one because, due to static quantization, the batch size is fixed to 1.
def ask(source, targets):
    """Score every target text against ``source`` and softmax the scores.

    Each text is embedded with its own call to ``get_text_embedding``
    rather than as one batch.

    Args:
        source: The query text.
        targets: Iterable of candidate texts to compare with ``source``.

    Returns:
        A tensor holding the softmax (over dimension 0) of the per-target
        scores, in the same order as ``targets``.
    """
    query_emb = get_text_embedding(source)
    per_target = [
        calculate_score(query_emb, get_text_embedding(candidate))
        for candidate in targets
    ]
    return torch.tensor(per_target).softmax(dim=0)

### Play with Samples

In [None]:
# Compare one query caption against two candidates. ask() returns the softmax
# over the per-candidate scores, in the same order as the candidate list.
ask("a photo containing two cats", ["a photo of tshirt", "a photo of two cats"])