In [23]:
# https://huggingface.co/docs/transformers/model_sharing
# https://huggingface.co/docs/optimum/v1.2.1/en/onnxruntime/modeling_ort
# https://huggingface.co/docs/optimum/v1.2.1/en/onnxruntime/modeling_ort#optimum.onnxruntime.ORTModelForQuestionAnswering

# https://github.com/huggingface/optimum/blob/ed95b9fa8019af29ce1904ac3cfef8729eb4f4be/optimum/modeling_base.py#L12
# https://github.com/huggingface/huggingface_hub/blob/main/src/huggingface_hub/hf_api.py#L1458
# https://github.com/huggingface/huggingface_hub/blob/664cfdd25adfb69f429decf19e2d65ed5599f9fd/src/huggingface_hub/utils/_deprecation.py#L7


In [63]:
import os
from typing import Mapping, OrderedDict
from pathlib import Path

from huggingface_hub import HfApi, HfFolder, hf_hub_download
from transformers import AutoModelWithHeads, AutoTokenizer, AutoConfig
from transformers.onnx import OnnxConfig, export

import onnx
from onnxruntime.quantization import quantize_dynamic, QuantType

import pandas as pd

from transformers.models.bert import BertOnnxConfig
from transformers.models.roberta import RobertaOnnxConfig

In [64]:
def push_to_hub(save_dir, repository_id):
    """Upload every file below ``save_dir`` to the Hub repo ``UKP-SQuARE/<repository_id>``.

    The repository is created as a public repo if it does not exist yet.
    Files keep their path relative to ``save_dir`` inside the repository, so
    files with the same name in different sub-directories do not overwrite
    each other. For a flat directory the repo layout is simply the file names.

    Args:
        save_dir: Local directory (``str`` or path-like) whose files are uploaded.
        repository_id: Repository name without the ``UKP-SQuARE/`` organisation prefix.

    Uploads are best-effort with respect to ``KeyError`` and ``NameError``:
    such a failure is reported on stdout and the remaining files are still
    uploaded. Any other exception propagates to the caller.
    """
    huggingface_token = HfFolder.get_token()
    api = HfApi()
    repo_id = f"UKP-SQuARE/{repository_id}"

    api.create_repo(
        token=huggingface_token,
        repo_id=repo_id,
        exist_ok=True,
        private=False,
    )

    for path, _subdirs, files in os.walk(save_dir):
        for name in files:
            local_file_path = os.path.join(path, name)
            # Path inside the repo: relative to save_dir, always with "/" separators.
            hub_file_path = os.path.relpath(local_file_path, save_dir).replace(os.sep, "/")
            try:
                api.upload_file(
                    token=huggingface_token,
                    repo_id=repo_id,
                    path_or_fileobj=os.path.abspath(local_file_path),
                    path_in_repo=hub_file_path,
                )
            except (KeyError, NameError) as err:
                # Keep going with the other files, but do not hide the failure.
                print(f"Skipping {local_file_path}: upload failed with {type(err).__name__}: {err}")


In [65]:
def generate_readme(directory_path, base_model, adapter):
    """Write ``<directory_path>/README.md`` for the ONNX export of ``adapter``.

    The adapter's own ``README.md`` is downloaded from the Hub and copied line
    by line with two changes:

    * a ``- onnx`` entry is inserted directly after the ``tags:`` line;
    * everything from the ``# Adapter ...`` heading up to (not including) the
      ``## Architecture & Training`` heading is replaced by an ONNX title, a
      link back to the adapter and a usage snippet.

    Args:
        directory_path: Existing directory the README is written into.
        base_model: Name of the base model; used for the tokenizer in the snippet.
        adapter: Hub repo id of the adapter in ``<organisation>/<name>`` form.
    """
    model_id = adapter.split("/")[1] + "-onnx"

    readme_path = hf_hub_download(repo_id=adapter, filename="README.md")

    onnx_readme = "{}/README.md".format(directory_path)

    skip = False
    # Read and write as UTF-8 so the result does not depend on the platform's
    # default text encoding.
    with open(readme_path, 'r', encoding='utf-8') as src, open(onnx_readme, 'w', encoding='utf-8') as dst:
        for line in src:
            # Insert onnx tag
            if line == 'tags:\n':
                dst.write(line)
                dst.write('- onnx\n')
                continue

            if line.startswith("# Adapter"):
                # Drop the adapter-specific section; it is replaced below.
                skip = True

                # Insert custom README
                dst.write("# ONNX export of " + line[2:])
                dst.write(f"## Conversion of [{adapter}](https://huggingface.co/{adapter}) for UKP SQuARE\n\n\n")
                dst.write("## Usage\n")
                dst.write("```python\n")
                # Imports make the snippet runnable when copied as-is.
                dst.write("import numpy as np\n")
                dst.write("from huggingface_hub import hf_hub_download\n")
                dst.write("from onnxruntime import InferenceSession\n")
                dst.write("from transformers import AutoTokenizer\n\n")
                dst.write(f"onnx_path = hf_hub_download(repo_id='UKP-SQuARE/{model_id}', filename='model.onnx') # or model_quant.onnx for quantization\n")
                dst.write("onnx_model = InferenceSession(onnx_path, providers=['CPUExecutionProvider'])\n\n")
                dst.write("context = 'ONNX is an open format to represent models. The benefits of using ONNX include interoperability of frameworks and hardware optimization.'\n")
                dst.write("question = 'What are advantages of ONNX?'\n")
                dst.write(f"tokenizer = AutoTokenizer.from_pretrained('{base_model}')\n\n")
                dst.write("inputs = tokenizer(question, context, padding=True, truncation=True, return_tensors='np')\n")
                dst.write("inputs_int64 = {key: np.array(inputs[key], dtype=np.int64) for key in inputs}\n")
                dst.write("outputs = onnx_model.run(input_feed=dict(inputs_int64), output_names=None)\n")
                dst.write("```\n\n")

            # Continue with normal model card
            if line.startswith("## Architecture & Training"):
                skip = False

            if not skip:
                dst.write(line)

In [67]:
class CustomOnnxConfig(OnnxConfig):
    """ONNX config modelled on BertOnnxConfig; can be extended to other QA tasks."""

    @property
    def inputs(self) -> Mapping[str, Mapping[int, str]]:
        """Map each model input name to its dynamic axes (axis index -> axis label)."""
        is_multiple_choice = self.task == "multiple-choice"
        dynamic_axis = (
            {0: "batch", 1: "choice", 2: "sequence"}
            if is_multiple_choice
            else {0: "batch", 1: "sequence"}
        )

        # token_type_ids is always declared, although RoBERTa does not use it.
        input_names = ("input_ids", "attention_mask", "token_type_ids")
        return OrderedDict((name, dynamic_axis) for name in input_names)


def onnx_export(base_model, adapter_id, quantize_model=True):
    """Export ``base_model`` with an AdapterHub adapter to ONNX and push it to the Hub.

    Steps, in order: load tokenizer and model, load and activate the adapter
    ``AdapterHub/<base_model>-pf-<adapter_id>``, export to
    ``onnx/<name>-onnx/model.onnx``, save the model next to it and delete the
    saved ``pytorch_model.bin``, generate the README, optionally write a
    dynamically quantized ``model_quant.onnx``, then upload the directory.

    Args:
        base_model: Name of the base model, e.g. ``bert-base-uncased``.
        adapter_id: Adapter suffix used to build the AdapterHub repo id.
        quantize_model: When true, also create the QInt8-quantized model.
    """
    tokenizer = AutoTokenizer.from_pretrained(base_model)
    model = AutoModelWithHeads.from_pretrained(base_model)

    # Load the adapter from the Hub and make it the active one.
    adapter = f"AdapterHub/{base_model}-pf-{adapter_id}"
    model.active_adapters = model.load_adapter(adapter, source="hf")

    config = AutoConfig.from_pretrained(base_model)

    # Pick the ONNX config class by model-name prefix; anything else falls
    # back to CustomOnnxConfig.
    known_configs = (("bert", BertOnnxConfig), ("roberta", RobertaOnnxConfig))
    onnx_config_cls = next(
        (cls for prefix, cls in known_configs if base_model.startswith(prefix)),
        CustomOnnxConfig,
    )
    onnx_config = onnx_config_cls(config, task="question-answering")

    # Local output directory below onnx/
    model_id = adapter.split("/")[1] + "-onnx"
    directory_path = Path("onnx") / model_id
    directory_path.mkdir(parents=True, exist_ok=True)
    onnx_model_path = directory_path / "model.onnx"

    # ONNX export
    export(tokenizer, model, onnx_config, onnx_config.default_onnx_opset, onnx_model_path)

    # Saving the model produces config.json; the saved weights file is removed again.
    model.save_pretrained(directory_path)
    os.remove(directory_path / "pytorch_model.bin")

    # Model card
    generate_readme(directory_path, base_model, adapter)

    if quantize_model:
        quantize_dynamic(
            onnx_model_path,
            f"{directory_path}/model_quant.onnx",
            weight_type=QuantType.QInt8,
        )

    print("Uploading model to hub... (may take a few minutes)")
    push_to_hub(save_dir=directory_path, repository_id=model_id)

In [68]:
# Only the first three rows of the skills table are exported in this run.
skills = pd.read_csv('square_skills/extractive_qa_skills.csv').head(3)

# Each row names a base model ("Reader Model") and the adapter to export with it.
for reader, adapter in skills[["Reader Model", "Reader Adapter"]].itertuples(index=False, name=None):
    print(f"Exporting: {reader} {adapter}")
    onnx_export(reader, adapter)

Exporting: bert-base-uncased drop
Uploading model to hub... (may take a few minutes)
Exporting: roberta-base drop
Uploading model to hub... (may take a few minutes)
Exporting: bert-base-uncased hotpotqa
Uploading model to hub... (may take a few minutes)
