In [None]:
# Execution provider used to run the ONNX model.
ExecutionProvider = "QNNExecutionProvider"

# The OpenVINO provider uses its own quantized model file; every other
# provider loads the default model file.
onnx_model_path = (
    "./model/openvino_model_st_quant.onnx"
    if ExecutionProvider == "OpenVINOExecutionProvider"
    else "./model/model.onnx"
)

In [None]:
# Sample sentence that the tokenizer cell below encodes for both models.
inputs = "This is an example sentence."

In [None]:
import onnxruntime as ort
import torch
import torch.nn.functional as F

from transformers import AutoModel, AutoTokenizer

In [None]:
def mean_pooling(model_output, attention_mask):
    """Average token embeddings along axis 1, weighting each position by the mask.

    Args:
        model_output: Indexable object whose first element holds the token
            embeddings, either as a NumPy array (e.g. the list returned by an
            ONNX Runtime session) or as a torch tensor (e.g. a transformers
            model output).
        attention_mask: torch tensor that, after gaining a trailing axis, can be
            expanded to the shape of the token embeddings; positions with value
            0 contribute nothing to the average.

    Returns:
        torch tensor with axis 1 of the token embeddings reduced to the
        mask-weighted mean.
    """
    # as_tensor accepts NumPy arrays and tensors alike; torch.tensor() on an
    # existing tensor is copy-construction and raises a UserWarning.
    token_embeddings = torch.as_tensor(model_output[0])
    input_mask_expanded = attention_mask.unsqueeze(-1).expand_as(token_embeddings).float()
    summed_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
    # Clamp the divisor so an all-zero mask row does not divide by zero.
    mask_totals = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
    return summed_embeddings / mask_totals

In [None]:
# Tokenizer of the multilingual BERT checkpoint.
tokenizer = AutoTokenizer.from_pretrained("google-bert/bert-base-multilingual-cased")

# Encode the sample sentence as PyTorch tensors, padded or truncated to
# exactly 128 tokens, with special tokens included.
encoded_input = tokenizer(
    inputs,
    add_special_tokens=True,
    truncation=True,
    max_length=128,
    padding="max_length",
    return_tensors="pt",
)

In [None]:
def add_ep_for_device(session_options, ep_name, device_type, ep_options=None):
    """Register an execution provider on every matching device.

    Iterates over the devices reported by ``ort.get_ep_devices()`` and, for each
    one whose EP name and hardware device type both match, adds it to
    ``session_options``.

    Args:
        session_options: Session options object to add the provider devices to.
        ep_name: Execution provider name to match against ``ep_device.ep_name``.
        device_type: Hardware device type to match against
            ``ep_device.device.type``.
        ep_options: Optional provider options passed with each device; an empty
            dict is used when omitted.

    Returns:
        True if at least one device was added, False if nothing matched and
        ``session_options`` was left unchanged.
    """
    provider_options = {} if ep_options is None else ep_options
    added = False
    for ep_device in ort.get_ep_devices():
        if ep_device.ep_name == ep_name and ep_device.device.type == device_type:
            print(f"Adding {ep_name} for {device_type}")
            session_options.add_provider_for_devices([ep_device], provider_options)
            added = True
    if not added:
        # Report the miss so the caller knows the requested EP was never registered.
        print(f"No {ep_name} device of type {device_type} found; session options unchanged")
    return added


# Build an ONNX Runtime session with the selected execution provider
# registered for NPU devices.
session_options = ort.SessionOptions()

add_ep_for_device(session_options, ExecutionProvider, ort.OrtHardwareDeviceType.NPU)

session = ort.InferenceSession(
    onnx_model_path,  # a model with QNN EPContext nodes
    sess_options=session_options,
)

input_ids = encoded_input["input_ids"]
attention_mask = encoded_input["attention_mask"]
token_type_ids = encoded_input["token_type_ids"]

# The feed is named `ort_inputs` so it does not overwrite `inputs`, the
# sentence string read by the tokenizer cell; clobbering it would make that
# cell tokenize a dict when re-run. The tensors are converted to int64 NumPy arrays.
ort_inputs = {
    "input_ids": input_ids.long().cpu().numpy(),
    "attention_mask": attention_mask.long().cpu().numpy(),
    "token_type_ids": token_type_ids.long().cpu().numpy(),
}

# Run the model, then pool and L2-normalise to get the sentence embedding.
outputs = session.run(None, ort_inputs)
embeds_1 = mean_pooling(outputs, encoded_input["attention_mask"])
embeds_1 = F.normalize(embeds_1, p=2, dim=1)

In [None]:
# Compute the text embedding with the original model; it serves as the ground truth.
model = AutoModel.from_pretrained("google-bert/bert-base-multilingual-cased")
model.eval()
with torch.no_grad():
    outputs = model(**encoded_input)
    # Pool over the tokens, then L2-normalise along dim 1.
    embeds_2 = F.normalize(
        mean_pooling(outputs, encoded_input["attention_mask"]),
        p=2,
        dim=1,
    )

In [None]:
# Cosine similarity between the ONNX Runtime embedding and the reference embedding.
similarity = F.cosine_similarity(embeds_1, embeds_2, dim=1).item()
print("Similarity: ", similarity)