In [1]:
# https://huggingface.co/docs/transformers/model_sharing
# https://huggingface.co/docs/optimum/v1.2.1/en/onnxruntime/modeling_ort
# https://huggingface.co/docs/optimum/v1.2.1/en/onnxruntime/modeling_ort#optimum.onnxruntime.ORTModelForQuestionAnswering

# https://github.com/huggingface/optimum/blob/ed95b9fa8019af29ce1904ac3cfef8729eb4f4be/optimum/modeling_base.py#L12
# https://github.com/huggingface/huggingface_hub/blob/main/src/huggingface_hub/hf_api.py#L1458
# https://github.com/huggingface/huggingface_hub/blob/664cfdd25adfb69f429decf19e2d65ed5599f9fd/src/huggingface_hub/utils/_deprecation.py#L7


In [1]:
import os
from typing import Mapping, OrderedDict
from pathlib import Path

from huggingface_hub import HfApi, HfFolder, hf_hub_download
from transformers import AutoModelWithHeads, AutoTokenizer, AutoConfig
from transformers.onnx import OnnxConfig, export

import onnx
from onnxruntime.quantization import quantize_dynamic, QuantType

import pandas as pd

from transformers.models.bert import BertOnnxConfig
from transformers.models.roberta import RobertaOnnxConfig
from transformers.models.bart import BartOnnxConfig

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def push_to_hub(save_dir, repository_id):
    """Upload every file below ``save_dir`` to the ``UKP-SQuARE/<repository_id>`` repo.

    The repository is created first (public, ``exist_ok=True``), then each file
    found by walking ``save_dir`` is uploaded individually.

    Args:
        save_dir: Local directory (``str`` or path-like) whose files are uploaded.
        repository_id: Repository name inside the ``UKP-SQuARE`` namespace.

    Notes:
        * Files keep their path relative to ``save_dir`` inside the repository,
          so files with the same name in different sub-directories no longer
          overwrite each other. For a flat directory the result is unchanged.
        * ``KeyError`` / ``NameError`` raised by a single upload are still
          treated as non-fatal (best effort), but are now reported instead of
          being dropped silently. Any other exception propagates to the caller.
    """
    huggingface_token = HfFolder.get_token()
    api = HfApi()
    repo_id = f"UKP-SQuARE/{repository_id}"

    api.create_repo(
        token=huggingface_token,
        repo_id=repo_id,
        exist_ok=True,
        private=False
    )

    for path, _subdirs, files in os.walk(save_dir):
        for name in files:
            local_file_path = os.path.join(path, name)
            # Repository paths always use forward slashes, whatever the local OS uses.
            hub_file_path = os.path.relpath(local_file_path, save_dir).replace(os.sep, "/")
            try:
                api.upload_file(
                    token=huggingface_token,
                    repo_id=repo_id,
                    path_or_fileobj=os.path.join(os.getcwd(), local_file_path),
                    path_in_repo=hub_file_path,
                )
            except (KeyError, NameError) as error:
                # Best effort: keep uploading the remaining files, but leave a trace.
                print(f"Skipping upload of {local_file_path}: {error!r}")


In [3]:
def _usage_snippet(model_id, skill):
    """Return the Python usage example (without Markdown fences) for ``skill``.

    The first two lines (download + ``InferenceSession``) are emitted for every
    skill; the remainder depends on the skill type. An unrecognised skill yields
    only those two common lines.
    """
    onnx_context = "context = 'ONNX is an open format to represent models. The benefits of using ONNX include interoperability of frameworks and hardware optimization.'\n"
    onnx_question = "question = 'What are advantages of ONNX?'\n"
    load_tokenizer = f"tokenizer = AutoTokenizer.from_pretrained('UKP-SQuARE/{model_id}')\n\n"
    encode_pair = "inputs = tokenizer(question, context, padding=True, truncation=True, return_tensors='np')\n"
    run_model = "outputs = onnx_model.run(input_feed=dict(inputs), output_names=None)\n"

    parts = [
        f"onnx_path = hf_hub_download(repo_id='UKP-SQuARE/{model_id}', filename='model.onnx') # or model_quant.onnx for quantization\n",
        "onnx_model = InferenceSession(onnx_path, providers=['CPUExecutionProvider'])\n\n",
    ]

    if skill in ("span-extraction", "abstractive"):
        parts += [onnx_context, onnx_question, load_tokenizer, encode_pair, run_model]

    elif skill == "categorical":
        parts += [
            # The closing quote must come after the final period, otherwise the
            # generated snippet is not valid Python.
            "context = 'English orthography typically represents vowel sounds with the five conventional vowel letters ⟨a, e, i, o, u⟩, as well as ⟨y⟩, which may also be a consonant depending on context. However, outside of abbreviations, there are a handful of words in English that do not have vowels, either because the vowel sounds are not written with vowel letters or because the words themselves are pronounced without vowel sounds.'\n",
            "question = 'can there be a word without a vowel'\n",
            load_tokenizer,
            encode_pair,
            run_model,
        ]

    elif skill == "multiple-choice":
        parts += [
            onnx_context,
            onnx_question,
            # Trailing newline keeps the tokenizer line on its own line.
            'choices = ["Cat", "Horse", "Tiger", "Fish"]\n',
            load_tokenizer,
            # Single-quoted on purpose: the snippet itself contains `" "`.
            'raw_input = [[context, question + " " + choice] for choice in choices]\n',
            'inputs = tokenizer(raw_input, padding=True, truncation=True, return_tensors="np")\n',
            "inputs['token_type_ids'] = np.expand_dims(inputs['token_type_ids'], axis=0)\n",
            "inputs['input_ids'] =  np.expand_dims(inputs['input_ids'], axis=0)\n",
            "inputs['attention_mask'] =  np.expand_dims(inputs['attention_mask'], axis=0)\n",
            run_model,
        ]

    return "".join(parts)


def generate_readme(directory_path, base_model, adapter, skill):
    """Write ``<directory_path>/README.md`` derived from the adapter's model card.

    The adapter's ``README.md`` is fetched with ``hf_hub_download`` and copied
    line by line with these changes:

    * a ``tags:`` line is preceded by ``inference: false`` and followed by an
      ``- onnx`` entry;
    * the line starting with ``# Adapter`` is replaced by an ONNX title, a link
      back to the adapter and a skill-specific usage example;
    * every following line is dropped until a line starting with
      ``## Architecture & Training`` is reached, from which copying resumes.

    Args:
        directory_path: Directory that receives the generated ``README.md``.
        base_model: Not used by this function; kept for call compatibility.
        adapter: Hub repo id of the adapter, ``"<namespace>/<name>"``.
        skill: One of ``"span-extraction"``, ``"categorical"``,
            ``"multiple-choice"`` or ``"abstractive"``. Any other value gets a
            usage example that only loads the ONNX session.
    """
    model_id = adapter.split("/")[1]+"-onnx"

    readme_path = hf_hub_download(repo_id=adapter, filename="README.md")

    onnx_readme = "{}/README.md".format(directory_path)

    skip = False
    # Explicit UTF-8: the generated text contains non-ASCII characters and must
    # not depend on the platform's locale encoding.
    with open(readme_path, 'r', encoding="utf-8") as src, open(onnx_readme, 'w', encoding="utf-8") as dst:
        for line in src:
            # Insert onnx tag
            if line == 'tags:\n':
                dst.write("inference: false\n")
                dst.write(line)
                dst.write('- onnx\n')
                continue

            if line.startswith("# Adapter"):
                skip = True

                # Insert custom README
                dst.write("# ONNX export of " + line[2:])
                dst.write(f"## Conversion of [{adapter}](https://huggingface.co/{adapter}) for UKP SQuARE\n\n\n")
                dst.write("## Usage\n")
                dst.write("```python\n")
                dst.write(_usage_snippet(model_id, skill))
                # Always close the fence, even for an unrecognised skill.
                dst.write("```\n\n")

            # Continue with normal model card
            if line.startswith("## Architecture & Training"):
                skip = False

            if not skip:
                dst.write(line)

In [16]:
class CustomOnnxConfig(OnnxConfig):
    """ONNX config fallback for base models without a dedicated config class.

    Modelled after ``BertOnnxConfig``; can be extended to cover further QA tasks.
    """

    @property
    def inputs(self) -> Mapping[str, Mapping[int, str]]:
        """Map each model input name to its dynamic axes.

        Multiple-choice inputs carry an additional ``choice`` axis between the
        batch and sequence axes. All three inputs share the same axis mapping.
        """
        axes = (
            {0: "batch", 1: "choice", 2: "sequence"}
            if self.task == "multiple-choice"
            else {0: "batch", 1: "sequence"}
        )
        # "token_type_ids" is listed last; Roberta doesn't use this input.
        input_names = ("input_ids", "attention_mask", "token_type_ids")
        return OrderedDict((input_name, axes) for input_name in input_names)


def _select_onnx_config(base_model, skill, config):
    """Build the ONNX export configuration for ``base_model`` and ``skill``.

    Raises:
        NotImplementedError: ``skill`` is ``"abstractive"`` but the base model
            is not BART.
        ValueError: ``skill`` is not one of the supported skill types.
    """
    if skill == "abstractive":
        if base_model.startswith("facebook/bart-base") or base_model.startswith("bart"):
            return BartOnnxConfig(config, task="causal-lm")
        raise NotImplementedError("Only BART is supported for abstractive qa")

    if skill in ("span-extraction", "categorical"):
        # These skills use the config class's default task.
        task_kwargs = {}
    elif skill == "multiple-choice":
        task_kwargs = {"task": "multiple-choice"}
    else:
        raise ValueError(f"Unsupported skill type: {skill!r}")

    if base_model.startswith("bert"):
        config_class = BertOnnxConfig
    elif base_model.startswith("roberta"):
        config_class = RobertaOnnxConfig
    else:
        config_class = CustomOnnxConfig
    return config_class(config, **task_kwargs)


def onnx_export(base_model, adapter_id, skill, quantize_model=True):
    """Export an adapter model to ONNX, write its model card and upload it.

    Steps: load tokenizer, model and config; load and activate the adapter;
    export to ``onnx/<model_id>/model.onnx``; save the model config and
    tokenizer next to it (the PyTorch weights file is removed again); generate
    the README; optionally write a dynamically quantized ``model_quant.onnx``;
    finally push the directory to the Hub.

    Args:
        base_model: Name of the base model. It selects the ONNX config class by
            prefix and, except for ``narrativeqa``, is also the checkpoint that
            is loaded.
        adapter_id: Adapter identifier. ``"narrativeqa"`` is special-cased to
            ``facebook/bart-base`` with ``AdapterHub/narrativeqa``; otherwise
            the adapter repo is ``AdapterHub/<base_model>-pf-<adapter_id>``.
        skill: ``"span-extraction"``, ``"categorical"``, ``"multiple-choice"``
            or ``"abstractive"``.
        quantize_model: Also produce an int8 dynamically quantized model.

    Raises:
        NotImplementedError: abstractive export requested for a non-BART model.
        ValueError: ``skill`` is not a supported skill type.
    """
    if adapter_id == "narrativeqa":
        checkpoint = "facebook/bart-base"
        adapter = "AdapterHub/narrativeqa"
    else:
        checkpoint = base_model
        adapter = f"AdapterHub/{base_model}-pf-{adapter_id}"

    tokenizer = AutoTokenizer.from_pretrained(checkpoint)
    model = AutoModelWithHeads.from_pretrained(checkpoint)

    # The adapter must be loaded and activated for every model, not only for
    # narrativeqa; otherwise the bare base model would be exported.
    adapter_name = model.load_adapter(adapter, source="hf", set_active=True)
    model.set_active_adapters(adapter_name)

    config = AutoConfig.from_pretrained(checkpoint)
    onnx_config = _select_onnx_config(base_model, skill, config)

    # Generate the local directory in onnx/
    model_id = adapter.split("/")[1]+"-onnx"
    directory_path = Path("onnx/{}".format(model_id))
    directory_path.mkdir(parents=True, exist_ok=True)
    onnx_model_path = Path("{}/model.onnx".format(directory_path))

    # Export ONNX model
    export(tokenizer, model, onnx_config, onnx_config.default_onnx_opset, onnx_model_path)

    # Create config.json of vanilla model; the PyTorch weights are not uploaded
    model.save_pretrained(directory_path)
    os.remove(directory_path / "pytorch_model.bin")

    # Save tokenizer
    tokenizer.save_pretrained(directory_path)

    # Create README.md
    generate_readme(directory_path, base_model, adapter, skill)

    if quantize_model:
        quantized_model_path = "{}/model_quant.onnx".format(directory_path)
        quantize_dynamic(onnx_model_path, quantized_model_path, weight_type=QuantType.QInt8)

    print("Uploading model to hub... (may take a few minutes)")
    push_to_hub(
        save_dir = directory_path,
        repository_id = model_id,
    )

# Exporting models and uploading them

In [17]:
# Skill types exported in this run. Supported values are
# "span-extraction", "categorical", "multiple-choice" and "abstractive".
available_skills = ["abstractive"]

# Optional adapter filter, e.g. ["cosmos_qa", "multirc", "quartz", "race", "quail"].
# An empty list exports every adapter listed for the selected skill types.
available_adapters = []

all_skills = pd.read_csv('square_skills/all_skills.csv')

for skill in available_skills:
    skills = all_skills.loc[all_skills["Type"] == skill]

    print(f"Exporting {skill}")
    print("_________________________\n")
    for reader, adapter in zip(skills["Reader Model"], skills["Reader Adapter"]):
        # No filter given -> export everything; otherwise only the listed adapters.
        is_selected = (not available_adapters) or (adapter in available_adapters)
        if not is_selected:
            print(f"Ups. {reader} {adapter} not available yet.")
            continue

        print(f"Exporting: {reader} {adapter}")
        onnx_export(reader, adapter, skill)

Exporting abstractive
_________________________

Exporting: bart-base narrativeqa


Fetching 6 files: 100%|██████████| 6/6 [00:00<00:00, 3016.40it/s]
  if attn_weights.size() != (bsz * self.num_heads, tgt_len, src_len):
  if attention_mask.size() != (bsz, 1, tgt_len, src_len):
  if attn_output.size() != (bsz * self.num_heads, tgt_len, self.head_dim):
  if context.output_adapter_gating_scores:
  if tensor is not None and hidden_states.shape[0] != tensor.shape[0]:
  if input_shape[-1] > 1:
  mask = torch.full((tgt_len, tgt_len), torch.tensor(torch.finfo(dtype).min))
  if getattr(ctx, "output_" + attr, False):
  if input_ids is not None and x.shape[1] == input_ids.shape[1]:
  if len(torch.unique(eos_mask.sum(1))) > 1:


Ignore MatMul due to non constant B: /[/model/decoder/layers.0/self_attn/MatMul]
Ignore MatMul due to non constant B: /[/model/decoder/layers.0/self_attn/MatMul_1]
Ignore MatMul due to non constant B: /[/model/encoder/layers.0/self_attn/MatMul]
Ignore MatMul due to non constant B: /[/model/encoder/layers.0/self_attn/MatMul_1]
Ignore MatMul due to non constant B: /[/model/encoder/layers.1/self_attn/MatMul]
Ignore MatMul due to non constant B: /[/model/encoder/layers.1/self_attn/MatMul_1]
Ignore MatMul due to non constant B: /[/model/encoder/layers.2/self_attn/MatMul]
Ignore MatMul due to non constant B: /[/model/encoder/layers.2/self_attn/MatMul_1]
Ignore MatMul due to non constant B: /[/model/encoder/layers.3/self_attn/MatMul]
Ignore MatMul due to non constant B: /[/model/encoder/layers.3/self_attn/MatMul_1]
Ignore MatMul due to non constant B: /[/model/encoder/layers.4/self_attn/MatMul]
Ignore MatMul due to non constant B: /[/model/encoder/layers.4/self_attn/MatMul_1]
Ignore MatMul du

HfHubHTTPError: 403 Client Error: Forbidden for url: https://huggingface.co/api/models/UKP-SQuARE/narrativeqa-onnx/commit/main (Request ID: Root=1-63c7d840-4959f0a56726503558a116b8)

Forbidden: pass `create_pr=1` as a query parameter to create a Pull Request

# 

# Archive

In [9]:
def generate_readme(directory_path, base_model, adapter):
    """Archived three-argument variant: write ``<directory_path>/README.md``.

    NOTE(review): this notebook defines a four-argument ``generate_readme``
    (with a ``skill`` parameter) earlier; running this cell replaces it.

    The adapter's ``README.md`` is fetched with ``hf_hub_download`` and copied
    line by line with these changes:

    * a ``tags:`` line is preceded by ``inference: false`` and followed by an
      ``- onnx`` entry;
    * the line starting with ``# Adapter`` is replaced by an ONNX title, a link
      back to the adapter and a fixed question/context usage example;
    * every following line is dropped until a line starting with
      ``## Architecture & Training`` is reached, from which copying resumes.

    Args:
        directory_path: Directory that receives the generated ``README.md``.
        base_model: Not used by this function.
        adapter: Hub repo id of the adapter, ``"<namespace>/<name>"``.
    """
    model_id = adapter.split("/")[1]+"-onnx"

    readme_path = hf_hub_download(repo_id=adapter, filename="README.md")

    onnx_readme = "{}/README.md".format(directory_path)

    skip = False
    # Explicit UTF-8 so reading/writing the model card does not depend on the
    # platform's locale encoding.
    with open(readme_path, 'r', encoding="utf-8") as src, open(onnx_readme, 'w', encoding="utf-8") as dst:
        for line in src:
            # Insert onnx tag
            if line == 'tags:\n':
                dst.write("inference: false\n")
                dst.write(line)
                dst.write('- onnx\n')
                continue

            if line.startswith("# Adapter"):
                skip = True

                # Insert custom README
                dst.write("# ONNX export of " + line[2:])
                dst.write(f"## Conversion of [{adapter}](https://huggingface.co/{adapter}) for UKP SQuARE\n\n\n")
                dst.write("## Usage\n")
                dst.write("```python\n")
                dst.write(f"onnx_path = hf_hub_download(repo_id='UKP-SQuARE/{model_id}', filename='model.onnx') # or model_quant.onnx for quantization\n")
                dst.write("onnx_model = InferenceSession(onnx_path, providers=['CPUExecutionProvider'])\n\n")
                dst.write("context = 'ONNX is an open format to represent models. The benefits of using ONNX include interoperability of frameworks and hardware optimization.'\n")
                dst.write("question = 'What are advantages of ONNX?'\n")
                dst.write(f"tokenizer = AutoTokenizer.from_pretrained('UKP-SQuARE/{model_id}')\n\n")
                dst.write("inputs = tokenizer(question, context, padding=True, truncation=True, return_tensors='np')\n")
                dst.write("outputs = onnx_model.run(input_feed=dict(inputs), output_names=None)\n")
                dst.write("```\n\n")

            # Continue with normal model card
            if line.startswith("## Architecture & Training"):
                skip = False

            if not skip:
                dst.write(line)

In [10]:
class CustomOnnxConfig(OnnxConfig):
    """Archived copy of the fallback ONNX config (inspired by ``BertOnnxConfig``).

    Can be extended to support other QA tasks.
    """

    @property
    def inputs(self) -> Mapping[str, Mapping[int, str]]:
        """Return the dynamic axes of each model input, keyed by input name."""
        is_multiple_choice = self.task == "multiple-choice"
        if is_multiple_choice:
            # Multiple-choice batches have an extra "choice" axis.
            shared_axes = {0: "batch", 1: "choice", 2: "sequence"}
        else:
            shared_axes = {0: "batch", 1: "sequence"}

        axes_by_input = OrderedDict()
        axes_by_input["input_ids"] = shared_axes
        axes_by_input["attention_mask"] = shared_axes
        axes_by_input["token_type_ids"] = shared_axes  # Roberta doesn't use this
        return axes_by_input


def onnx_export(base_model, adapter_id, quantize_model=True):
    """Archived three-argument exporter: convert an adapter model to ONNX and upload it.

    Loads ``base_model`` with the adapter ``AdapterHub/<base_model>-pf-<adapter_id>``,
    exports it with a ``question-answering`` ONNX config into
    ``onnx/<model_id>/``, stores config, tokenizer and README alongside,
    optionally adds a dynamically quantized model, and pushes the directory to
    the Hub.

    Args:
        base_model: Checkpoint name; its prefix also selects the ONNX config class.
        adapter_id: Adapter identifier used to build the adapter repo id.
        quantize_model: Also write ``model_quant.onnx`` with int8 weights.
    """
    adapter = f"AdapterHub/{base_model}-pf-{adapter_id}"

    tokenizer = AutoTokenizer.from_pretrained(base_model)
    model = AutoModelWithHeads.from_pretrained(base_model)

    # Load the adapter from the Hub and make it the active one.
    model.active_adapters = model.load_adapter(adapter, source="hf")

    config = AutoConfig.from_pretrained(base_model)

    # Pick the config class by model family; unknown families use the custom one.
    if base_model.startswith("bert"):
        config_class = BertOnnxConfig
    elif base_model.startswith("roberta"):
        config_class = RobertaOnnxConfig
    else:
        config_class = CustomOnnxConfig
    onnx_config = config_class(config, task="question-answering")

    # Local output directory below onnx/
    model_id = adapter.split("/")[1] + "-onnx"
    directory_path = Path("onnx") / model_id
    directory_path.mkdir(parents=True, exist_ok=True)
    onnx_model_path = directory_path / "model.onnx"

    # ONNX export
    export(tokenizer, model, onnx_config, onnx_config.default_onnx_opset, onnx_model_path)

    # Write the model files, then drop the PyTorch weights again
    model.save_pretrained(directory_path)
    os.remove(directory_path / "pytorch_model.bin")

    # Tokenizer files
    tokenizer.save_pretrained(directory_path)

    # Model card
    generate_readme(directory_path, base_model, adapter)

    if quantize_model:
        quantized_model_path = f"{directory_path}/model_quant.onnx"
        quantize_dynamic(onnx_model_path, quantized_model_path, weight_type=QuantType.QInt8)

    print("Uploading model to hub... (may take a few minutes)")
    push_to_hub(save_dir=directory_path, repository_id=model_id)