In [1]:
import os
from pathlib import Path

from huggingface_hub import hf_hub_download

from onnxruntime import InferenceSession
from transformers import AutoModelWithHeads, AutoTokenizer

import numpy as np

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def onnx_import(base_model, adapter_id, quantized=False):
    """Fetch the ONNX export of an adapter from the Hugging Face Hub.

    The adapter name is built as ``AdapterHub/<base_model>-pf-<adapter_id>``;
    its second "/"-separated segment plus ``-onnx`` is the model id, and the
    file is requested from the ``UKP-SQuARE/<model id>`` repository.

    Args:
        base_model: Base model name, e.g. ``"bert-base-uncased"``.
        adapter_id: Adapter identifier placed after ``-pf-``.
        quantized: If truthy, request ``model_quant.onnx``; otherwise
            ``model.onnx``.

    Returns:
        The value returned by ``hf_hub_download`` for the requested file.

    Side effects:
        Ensures the directory ``onnx/<model id>`` exists relative to the
        current working directory. Nothing in this function writes into it.
    """
    # Only the segment right after the first "/" is kept as the model name.
    model_id = f"AdapterHub/{base_model}-pf-{adapter_id}".split("/")[1] + "-onnx"

    Path("onnx/{}".format(model_id)).mkdir(parents=True, exist_ok=True)

    onnx_file = "model_quant.onnx" if quantized else "model.onnx"
    return hf_hub_download(repo_id="UKP-SQuARE/" + model_id, filename=onnx_file)

In [3]:
def preprocessing(question, context, tokenizer):
    """Tokenize a question/context pair into int64 NumPy arrays.

    Args:
        question: Question text passed as the tokenizer's first argument.
        context: Context text passed as the tokenizer's second argument.
        tokenizer: Callable invoked with ``padding=True``, ``truncation=True``
            and ``return_tensors="np"``; its result must be iterable over
            keys and indexable by key.

    Returns:
        A plain ``dict`` mapping every key of the tokenizer result to a new
        ``np.int64`` array holding that entry's values.
    """
    encoded = tokenizer(
        question,
        context,
        padding=True,
        truncation=True,
        return_tensors="np",
    )

    feed = {}
    for name in encoded:
        # np.array (not asarray) always copies and forces the int64 dtype.
        feed[name] = np.array(encoded[name], dtype=np.int64)
    return feed

def postprocessing(outputs, inputs, tokenizer):
    """Decode the answer span selected by the start/end scores.

    Args:
        outputs: Indexable whose element 0 holds the start scores and whose
            element 1 holds the end scores.
        inputs: Mapping with an ``'input_ids'`` entry that supports
            ``[0, start:stop]`` indexing (e.g. a 2-D NumPy array).
        tokenizer: Object exposing ``decode(token_ids)``.

    Returns:
        Whatever ``tokenizer.decode`` returns for the tokens of row 0 between
        the arg-max start position and the arg-max end position (inclusive).
    """
    # np.argmax without an axis works on the flattened array.
    begin = np.argmax(outputs[0])
    # +1 turns the inclusive end index into an exclusive slice bound.
    stop = np.argmax(outputs[1]) + 1

    span = slice(begin, stop)
    return tokenizer.decode(inputs['input_ids'][0, span])

def onnx_inference(onnx_path, tokenizer, question, context):
    """Answer a question about a context using an ONNX model file.

    A fresh session is obtained from ``get_onnx_model`` on every call, the
    question/context pair is encoded with ``preprocessing``, the session is
    run on a ``dict`` copy of that encoding with ``output_names=None``, and
    the result is turned into text by ``postprocessing``.

    Args:
        onnx_path: Location of the ONNX model, forwarded to ``get_onnx_model``.
        tokenizer: Tokenizer forwarded to both pre- and post-processing.
        question: Question text.
        context: Context text.

    Returns:
        The value produced by ``postprocessing``.
    """
    session = get_onnx_model(onnx_path)
    encoded = preprocessing(question, context, tokenizer)

    raw_outputs = session.run(output_names=None, input_feed=dict(encoded))

    return postprocessing(raw_outputs, encoded, tokenizer)

def get_onnx_model(onnx_path):
    """Create an ``InferenceSession`` for the given model path.

    Args:
        onnx_path: Model location; converted with ``str()`` before use, so
            both strings and path objects are accepted.

    Returns:
        An ``InferenceSession`` constructed with ``CPUExecutionProvider`` as
        its only provider.
    """
    model_file = str(onnx_path)
    cpu_only = ["CPUExecutionProvider"]
    return InferenceSession(model_file, providers=cpu_only)

In [4]:
# --- Example inputs ---------------------------------------------------------
# NOTE(review): 'HARDware' is mixed-case in the context while the recorded
# output shows it lower-cased — presumably intentional with an uncased
# tokenizer; confirm before "fixing" the literal.
question = 'What are advantages of ONNX?'
context = 'ONNX is an open format built to represent machine learning models. The key benefits of using ONNX are interoperability of frameworks and HARDware optimization.'

# --- Model selection --------------------------------------------------------
base_model = 'bert-base-uncased'
head = 'drop'

# The tokenizer comes from the base checkpoint; the ONNX file is the
# (non-quantized) export resolved by onnx_import for the chosen head.
tokenizer = AutoTokenizer.from_pretrained(base_model)
model_path = onnx_import(base_model, adapter_id=head)

# --- Inference --------------------------------------------------------------
answer = onnx_inference(
    onnx_path=model_path,
    tokenizer=tokenizer,
    question=question,
    context=context,
)
print(answer)

interoperability of frameworks and hardware optimization
