In [7]:
from transformers import QuestionAnsweringPipeline, AutoAdapterModel, AutoModelWithHeads, AutoTokenizer, AutoConfig
from transformers.onnx import OnnxConfig, validate_model_outputs, export
from transformers.models.bert import BertOnnxConfig
from transformers.onnx.features import FeaturesManager

import onnx
from onnxruntime.quantization import quantize_dynamic, QuantType
from onnxruntime import InferenceSession
import onnxruntime

from onnx_opcounter import calculate_params

import os
import time
import torch
import numpy as np

from datasets import load_metric, load_dataset

from typing import Mapping, OrderedDict
from pathlib import Path
import random
import pandas as pd

from huggingface_hub import hf_hub_download

In [71]:
# working and semi working 
# cosmos_qa, multirc, quartz, race und quail  

# input diff
# commonsense_qa und social_i_qa 


## Helper Code 

In [9]:
def onnx_inference(tokenizer, question, context, choices, onnx_model):
    # onnx_model = InferenceSession(
    #     str(onnx_path), providers=["CPUExecutionProvider"]
    # )

    

    raw_input = [[context, question + " " + choice] for choice in choices]
    inputs = tokenizer(raw_input, padding=True, truncation=True, return_tensors="np")

    inputs['token_type_ids'] = np.expand_dims(inputs['token_type_ids'], axis=0)
    inputs['input_ids'] =  np.expand_dims(inputs['input_ids'], axis=0)
    inputs['attention_mask'] =  np.expand_dims(inputs['attention_mask'], axis=0)

    outputs = onnx_model.run(input_feed=dict(inputs), output_names=None)

    answer_idx = np.argmax(outputs[0])
    return choices[answer_idx]

## Working adapter models

In [3]:
# adapters based on sequence options 
adapter_list = ["cosmos_qa", "multirc", "quartz", "race", "quail"]
adapter = adapter_list[0]
print(f"Using {adapter}")

Using cosmos_qa


In [4]:
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModelWithHeads.from_pretrained("bert-base-uncased")
adapter_name = model.load_adapter(f"AdapterHub/bert-base-uncased-pf-{adapter}", source="hf")

model.active_adapters = adapter_name

def mc_model_inference(question, context, choices):
    outputs = []

    raw_input = [[context, question + " " + choice] for choice in choices]
    inputs = tokenizer(raw_input, padding=True, truncation=True, return_tensors="pt")

    with torch.no_grad():
        outputs = model(**inputs)

    answer_idx = torch.argmax(outputs.logits)
    return choices[answer_idx]

question = "What animal has the most hair?"
context = "Fish are typically not hairy. Cats have 10g of hair. Tigers have 12g of hair. Horses have 100g of hair."

choices = ["Cat", "Horse", "Tiger", "Fish"]
answer = mc_model_inference(question, context, choices)
print(answer)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModelWithHeads: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModelWithHeads from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModelWithHeads from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Fetching 6 files: 100%|██████████| 6/6 [00:00<00:00, 2811.51it/s]


Horse


In [83]:
# export to onnx

In [8]:
def load_model(model_onnx, model_onnx_quant, as_list=False):
    local_onnx_model = onnxruntime.InferenceSession(model_onnx, providers=["CPUExecutionProvider"])
    local_onnx_model_quant = onnxruntime.InferenceSession(model_onnx_quant, providers=["CPUExecutionProvider"])
    
    so = onnxruntime.SessionOptions()
    so.graph_optimization_level = onnxruntime.GraphOptimizationLevel.ORT_ENABLE_ALL
    
    local_onnx_model_opt = onnxruntime.InferenceSession(model_onnx, so)
    local_onnx_model_quant_opt = onnxruntime.InferenceSession(model_onnx_quant, so)
    
    if as_list:
        return [local_onnx_model, local_onnx_model_opt, local_onnx_model_quant, local_onnx_model_quant_opt]
    return local_onnx_model, local_onnx_model_opt, local_onnx_model_quant, local_onnx_model_quant_opt

def repo_builder(reader, adapter):
    repo_id = f"UKP-SQuARE/{reader}-pf-{adapter}-onnx"
    filename_onnx = "model.onnx"
    filename_onnx_quant = "model_quant.onnx"

    model_onnx = hf_hub_download(repo_id=repo_id, filename=filename_onnx)
    model_onnx_quant = hf_hub_download(repo_id=repo_id, filename=filename_onnx_quant)

    return model_onnx, model_onnx_quant

In [20]:
data = load_dataset("cosmos_qa", split=f"validation[:{10}]")

Found cached dataset cosmos_qa (/Users/michaelhermann/.cache/huggingface/datasets/cosmos_qa/default/0.1.0/3e18538cbfdb2c04189b16642715f0f6da3e97ed5df0aadcec3641245b2cf157)


In [21]:
data[0]

{'id': '3BFF0DJK8XA7YNK4QYIGCOG1A95STE##3180JW2OT5AF02OISBX66RFOCTG5J7##A2LTOS0AZ3B28A##Blog_56156##q1_a1##378G7J1SJNCDAAIN46FM2P7T6KZEW2',
 'context': 'Do i need to go for a legal divorce ? I wanted to marry a woman but she is not in the same religion , so i am not concern of the marriage inside church . I will do the marriage registered with the girl who i am going to get married . But legally will there be any complication , like if the other woman comes back one day , will the girl who i am going to get married now will be in trouble or Is there any complication ?',
 'question': 'Why is this person asking about divorce ?',
 'answer0': 'If he gets married in the church he wo nt have to get a divorce .',
 'answer1': 'He wants to get married to a different person .',
 'answer2': 'He wants to know if he does nt like this girl can he divorce her ?',
 'answer3': 'None of the above choices .',
 'label': 1}

In [22]:
#load onnx models
model_onnx, model_onnx_quant = repo_builder("bert-base-uncased", adapter)
onnx_models_list = load_model(model_onnx, model_onnx_quant, as_list=True)
onnx_models_name_helper_list = ["ONNX", "ONNX-OPT", "ONNX Quantized", "ONNX-OPT Quantized"]


# data = load_dataset("cosmos_qa", split=f"validation[:{10}]")
for example in data:
    question = example["question"]
    choices = [example["answer0"], example["answer1"], example["answer2"], example["answer3"]]

    base_answer = mc_model_inference(question, context, choices)

    # eval onnx models
    for onnx_model, onnx_model_name in zip(onnx_models_list, onnx_models_name_helper_list):
        # performance_log(reader , adapter, "seq_length", onnx_model_name, onnx_model, tokenizer, data)
        # answer = mc_model_inference(question, context, choices)
        answer = onnx_inference(tokenizer, question, context, choices, onnx_model)

        print(f"{onnx_model_name}: {answer} {base_answer == answer}")

ONNX: He wants to know if he does nt like this girl can he divorce her ? True
ONNX-OPT: He wants to know if he does nt like this girl can he divorce her ? True
ONNX Quantized: He wants to know if he does nt like this girl can he divorce her ? True
ONNX-OPT Quantized: He wants to know if he does nt like this girl can he divorce her ? True
ONNX: He knows that he will be committing polygamy . True
ONNX-OPT: He knows that he will be committing polygamy . True
ONNX Quantized: He knows that he will be committing polygamy . True
ONNX-OPT Quantized: He knows that he will be committing polygamy . True
ONNX: The bus would arrive at the stop soon True
ONNX-OPT: The bus would arrive at the stop soon True
ONNX Quantized: An ambulance would likely come to the scene False
ONNX-OPT Quantized: An ambulance would likely come to the scene False
ONNX: Medical personnel would come to help the old man True
ONNX-OPT: Medical personnel would come to help the old man True
ONNX Quantized: The bus would arrive a

In [5]:
config = AutoConfig.from_pretrained("bert-base-uncased")
onnx_config = BertOnnxConfig(config, task = "multiple-choice")

onnx_path = Path(f"onnx/{adapter}/model.onnx")

onnx_inputs, onnx_outputs = export(tokenizer, model, onnx_config, onnx_config.default_onnx_opset, onnx_path)

answer = onnx_inference(tokenizer, onnx_path, question, context, choices)
print(answer)

  if context.output_adapter_gating_scores:
  if tensor is not None and hidden_states.shape[0] != tensor.shape[0]:
  if getattr(ctx, "output_" + attr, False):


FileNotFoundError: [Errno 2] No such file or directory: 'onnx/cosmos_qa/model.onnx'

## Non working Adapter models 

In [84]:
# "commonsense_qa", "social_i_qa" 
# ony diff -> need diff amount of choices.

### commonsense_qa

In [85]:
adapter = "commonsense_qa"

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModelWithHeads.from_pretrained("bert-base-uncased")
adapter_name = model.load_adapter(f"AdapterHub/bert-base-uncased-pf-{adapter}", source="hf")
model.active_adapters = adapter_name

def mc_model_inference(question, question_concept, choices):
    outputs = []
    
    raw_input = [[question_concept, question + " " + choice] for choice in choices]
    inputs = tokenizer(raw_input, padding=True, truncation=True, return_tensors="pt")

    with torch.no_grad():
        outputs = model(**inputs)

    answer_idx = torch.argmax(outputs.logits)
    return choices[answer_idx]

question = "The sanctions against the school were a punishing blow, and they seemed to what the efforts the school had made to change?"
question_concept = "punishing"
choices = ["ignore", "enforce", "authoritarian", "yell at", "avoid"] #len 5

answer = mc_model_inference(question, question_concept, choices)
print(answer)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModelWithHeads: ['cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModelWithHeads from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModelWithHeads from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Fetching 6 files: 100%|██████████| 6/6 [00:00<00:00, 4133.00it/s]

ignore





### Social_i_qa

In [86]:
# social_i_qa
adapter = "social_i_qa"

tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModelWithHeads.from_pretrained("bert-base-uncased")
adapter_name = model.load_adapter(f"AdapterHub/bert-base-uncased-pf-{adapter}", source="hf")

model.active_adapters = adapter_name

def mc_model_inference(question, context, choices):
    outputs = []

    raw_input = [[context, question + " " + choice] for choice in choices]
    inputs = tokenizer(raw_input, padding=True, truncation=True, return_tensors="pt")

    with torch.no_grad():
        outputs = model(**inputs)

    answer_idx = torch.argmax(outputs.logits)
    return choices[answer_idx]

question = "What animal has the most hair?"
context = "Fish are typically not hairy. Cats have 10g of hair. Tigers have 12g of hair. Horses have 100g of hair."

choices = ["Cat", "Horse", "Tiger"] ## only diff: 3 instead of 5 or 4 choices
answer = mc_model_inference(question, context, choices)
print(answer)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModelWithHeads: ['cls.seq_relationship.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModelWithHeads from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModelWithHeads from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Fetching 6 files: 100%|██████████| 6/6 [00:00<00:00, 4140.48it/s]

Tiger





In [92]:
# export to onnx

In [87]:
class CustomOnnxConfig(OnnxConfig):
    @property
    def inputs(self) -> Mapping[str, Mapping[int, str]]:
        
        return OrderedDict(
                [
                    ("input_ids", {0: "batch", 1: "choice"}),
                    ("attention_mask", {0: "batch", 1: "choice"}),
                    # ("token_type_ids", {0: "batch", 1: "choice", 2: "sequence"}), # Roberta doesn't use this
                ]
            )

In [94]:
config = AutoConfig.from_pretrained("bert-base-uncased")
# onnx_config = BertOnnxConfig(config, task = "multiple-choice")

onnx_config = CustomOnnxConfig(config)

onnx_path = Path(f"onnx/{adapter}/model.onnx")
print(onnx_path)

onnx/social_i_qa/model.onnx


In [96]:
print(onnx_config.outputs)
print(onnx_config.default_onnx_opset)

OrderedDict([('last_hidden_state', {0: 'batch', 1: 'sequence'})])
11


In [90]:
export(tokenizer, model, onnx_config, onnx_config.default_onnx_opset, onnx_path)

  if context.output_adapter_gating_scores:
  if tensor is not None and hidden_states.shape[0] != tensor.shape[0]:
  if getattr(ctx, "output_" + attr, False):


RuntimeError: shape '[-1, 3]' is invalid for input of size 2

In [None]:
answer = onnx_inference(tokenizer, onnx_path, question, context, choices)
print(answer)