In [6]:
from transformers import QuestionAnsweringPipeline, AutoAdapterModel, AutoModelWithHeads, AutoTokenizer, AutoConfig
from transformers.onnx import OnnxConfig, validate_model_outputs, export
from transformers.models.bert import BertOnnxConfig

import onnx
from onnxruntime.quantization import quantize_dynamic, QuantType
from onnxruntime import InferenceSession
from onnx_opcounter import calculate_params

import os
import torch
import numpy as np

from datasets import load_metric, load_dataset

from typing import Mapping, OrderedDict
from pathlib import Path

### Extractive QA (DROP BERT)

In [2]:
# Basic extractive QA Example
# Sample question/context pair used to smoke-test the PyTorch model.
question = "What is Mary's job?"
context = "Thomas likes animals. Mary likes strawberry ice and is a physician"

# Tokenizer and heads-capable model are both loaded from the same base checkpoint.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
model = AutoModelWithHeads.from_pretrained("bert-base-uncased")
# Fetch the DROP adapter (source="hf") and make it the active adapter on the model.
# NOTE(review): presumably this also installs the QA prediction head that yields
# start/end logits -- confirm against the adapter card.
adapter_name = model.load_adapter("AdapterHub/bert-base-uncased-pf-drop", source="hf")
model.active_adapters = adapter_name

def base_model_inference(question, context):
    """Answer `question` from `context` with the PyTorch model.

    Uses the module-level `tokenizer` and `model`. The answer is the token
    span running from the highest-scoring start position through the
    highest-scoring end position (inclusive) of the first batch row.

    Args:
        question: Question text.
        context: Passage the answer is extracted from.

    Returns:
        The decoded answer span as a string.
    """
    model.eval()
    encoded = tokenizer(question, context, padding=True, truncation=True, return_tensors="pt")

    # Inference only: skip gradient tracking.
    with torch.no_grad():
        prediction = model(**encoded)

    span_start = prediction.start_logits.argmax(dim=1)[0].item()
    # +1 turns the inclusive end position into an exclusive slice bound.
    span_end = prediction.end_logits.argmax(dim=1)[0].item() + 1
    answer_ids = encoded['input_ids'][0, span_start:span_end]
    return tokenizer.decode(answer_ids)

# Run the PyTorch model on the sample pair; the recorded output is "physician".
answer = base_model_inference(question, context)
print(answer)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModelWithHeads: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias', 'cls.predictions.bias', 'cls.predictions.transform.LayerNorm.weight']
- This IS expected if you are initializing BertModelWithHeads from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModelWithHeads from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Fetching 6 files: 100%|██████████| 6/6 [00:00<00:00, 5400.39it/s]

physician





In [3]:
# Export to ONNX with custom config
class DropBertOnnxConfig(OnnxConfig):
    """ONNX export configuration declaring the model's three input tensors."""

    @property
    def inputs(self) -> Mapping[str, Mapping[int, str]]:
        """Map each input name to its dynamic axes.

        Multiple-choice tasks get an extra "choice" axis between batch and
        sequence; every other task uses (batch, sequence).
        """
        dynamic_axis = (
            {0: "batch", 1: "choice", 2: "sequence"}
            if self.task == "multiple-choice"
            else {0: "batch", 1: "sequence"}
        )
        input_names = ("input_ids", "attention_mask", "token_type_ids")
        # All inputs share the same axis mapping object.
        return OrderedDict((name, dynamic_axis) for name in input_names)

# Build the export configuration from the stock BERT config for the QA task.
config = AutoConfig.from_pretrained("bert-base-uncased")
onnx_config = DropBertOnnxConfig(config, task="question-answering")

onnx_path = Path("onnx/dropbert/model.onnx")
# The exporter writes directly to `onnx_path`; create the target directory up
# front so the cell also works on a fresh checkout (Restart & Run All).
onnx_path.parent.mkdir(parents=True, exist_ok=True)

onnx_inputs, onnx_outputs = export(tokenizer, model, onnx_config, onnx_config.default_onnx_opset, onnx_path)

# Reload the exported graph and run the ONNX checker on it.
onnx_model = onnx.load(onnx_path)
onnx.checker.check_model(onnx_model)

  if context.output_adapter_gating_scores:
  if tensor is not None and hidden_states.shape[0] != tensor.shape[0]:
  if getattr(ctx, "output_" + attr, False):


In [4]:
# ONNX model inference
def onnx_inference(onnx_path, question, context):
    """Answer `question` from `context` with an exported ONNX model.

    A new CPU `InferenceSession` is created on every call and the inputs are
    encoded with the module-level `tokenizer`.

    Args:
        onnx_path: Path to the ONNX model file.
        question: Question text.
        context: Passage the answer is extracted from.

    Returns:
        The decoded answer span of the first batch row as a string.
    """
    session = InferenceSession(
        str(onnx_path), providers=["CPUExecutionProvider"]
    )

    inputs = tokenizer(question, context, padding=True, truncation=True, return_tensors="np")
    # output_names=None requests every graph output; the first two are read as
    # the start and end scores.
    outputs = session.run(input_feed=dict(inputs), output_names=None)

    start_scores, end_scores = outputs[0], outputs[1]
    # Take the argmax per row (axis=1) and read row 0, mirroring
    # base_model_inference. A flattened argmax is only correct for a batch of one.
    # Assumes the scores are (batch, sequence) -- TODO confirm for other exports.
    ans_start = int(np.argmax(start_scores, axis=1)[0])
    # +1 turns the inclusive end position into an exclusive slice bound.
    ans_end = int(np.argmax(end_scores, axis=1)[0]) + 1
    return tokenizer.decode(inputs['input_ids'][0, ans_start:ans_end])

# Second sample pair; note that this rebinds the module-level `question` and
# `context` used by the earlier extractive-QA cell.
question = 'Where do I live?'
context = 'My name is Paul and I live next to Darmstadt'

# Recorded output: "next to darmstadt".
answer = onnx_inference(onnx_path, question, context)
print(answer)

next to darmstadt


### Model Quantization

In [10]:
# Quantize base model
# Dynamic quantization of the PyTorch model, restricted to torch.nn.Linear
# modules, with int8 (qint8) as the target dtype.
quantized_model = torch.quantization.quantize_dynamic(
    model, {torch.nn.Linear}, dtype=torch.qint8
)

def print_size_of_model(model):
    """Print the on-disk size, in MiB, of `model`'s serialized state dict.

    The state dict is saved into a private temporary directory that is removed
    afterwards (even if saving or measuring fails), so an unrelated `temp.p`
    in the working directory is never overwritten or deleted.

    Args:
        model: Any object exposing `state_dict()` that `torch.save` can serialize.
    """
    import tempfile

    with tempfile.TemporaryDirectory() as scratch_dir:
        # Keep the original file name so the serialized archive is unchanged.
        checkpoint_path = os.path.join(scratch_dir, "temp.p")
        torch.save(model.state_dict(), checkpoint_path)
        print('Size (MB):', os.path.getsize(checkpoint_path)/(1024*1024))

# Serialized size of the original model, then of the dynamically quantized one.
print_size_of_model(model)
print_size_of_model(quantized_model)

Size (MB): 421.1566400527954
Size (MB): 396.7126741409302


In [9]:
def quantize_onnx_model(onnx_model_path, quantized_model_path):
    """Dynamically quantize an ONNX model file with int8 weights.

    Args:
        onnx_model_path: Path of the full-precision ONNX model to read.
        quantized_model_path: Path the quantized model is written to.
    """
    source_path, target_path = onnx_model_path, quantized_model_path
    quantize_dynamic(source_path, target_path, weight_type=QuantType.QInt8)

    # Report where the result was written.
    print(f"Quantized model saved to:{quantized_model_path}")

# Quantize the exported DROP model and write the result next to the original file.
quantize_onnx_model('onnx/dropbert/model.onnx', 'onnx/dropbert/model_quant.onnx')

# Compare the on-disk sizes (bytes converted to MiB) of both ONNX files.
print('ONNX full precision model size (MB):', os.path.getsize("onnx/dropbert/model.onnx")/(1024*1024))
print('ONNX quantized model size (MB):', os.path.getsize("onnx/dropbert/model_quant.onnx")/(1024*1024))

Ignore MatMul due to non constant B: /[/bert/encoder/layer.0/attention/self/MatMul]
Ignore MatMul due to non constant B: /[/bert/encoder/layer.0/attention/self/MatMul_1]
Ignore MatMul due to non constant B: /[/bert/encoder/layer.1/attention/self/MatMul]
Ignore MatMul due to non constant B: /[/bert/encoder/layer.1/attention/self/MatMul_1]
Ignore MatMul due to non constant B: /[/bert/encoder/layer.2/attention/self/MatMul]
Ignore MatMul due to non constant B: /[/bert/encoder/layer.2/attention/self/MatMul_1]
Ignore MatMul due to non constant B: /[/bert/encoder/layer.3/attention/self/MatMul]
Ignore MatMul due to non constant B: /[/bert/encoder/layer.3/attention/self/MatMul_1]
Ignore MatMul due to non constant B: /[/bert/encoder/layer.4/attention/self/MatMul]
Ignore MatMul due to non constant B: /[/bert/encoder/layer.4/attention/self/MatMul_1]
Ignore MatMul due to non constant B: /[/bert/encoder/layer.5/attention/self/MatMul]
Ignore MatMul due to non constant B: /[/bert/encoder/layer.5/atten

In [17]:
# Parameter count of the PyTorch model, as a reference point for the ONNX files.
print('Number of base model params:', model.num_parameters())

def count_params_onnx(model_path):
    """Return the parameter count `calculate_params` reports for an ONNX file.

    Args:
        model_path: Path of the ONNX model to load.
    """
    return calculate_params(onnx.load(model_path))

# Parameter counts of the full-precision and the quantized ONNX files.
# The recorded output shows the quantized count coming back as a float.
print("ONNX number of full precision model params:", count_params_onnx("onnx/dropbert/model.onnx"))
print("ONNX number of quantized model params:", count_params_onnx("onnx/dropbert/model_quant.onnx"))

Number of base model params: 110378306
ONNX number of full precision model params: 109788226
ONNX number of quantized model params: 109788758.0


### Multiple-Choice QA (Cosmo BERT)

In [None]:
# Rebinds the module-level `tokenizer` (same checkpoint as before) and loads a
# second, separate model instance for the multiple-choice task.
tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")
mc_model = AutoModelWithHeads.from_pretrained("bert-base-uncased")
# Fetch the Cosmos QA adapter (source="hf") and make it the active adapter.
# Note: this also rebinds the module-level `adapter_name`.
adapter_name = mc_model.load_adapter("AdapterHub/bert-base-uncased-pf-cosmos_qa", source="hf")
mc_model.active_adapters = adapter_name

def mc_model_inference(question, context, answer0, answer1, answer2, answer3):
    """Pick the most plausible of four candidate answers with the multiple-choice model.

    Each candidate is encoded as its own sequence pair and the four encodings
    are stacked into one example of shape (1, num_choices, seq_len). The
    candidate with the highest logit is returned.

    Uses the module-level `tokenizer` and `mc_model`.

    Args:
        question: Question text.
        context: Passage the question refers to.
        answer0, answer1, answer2, answer3: The four candidate answers.

    Returns:
        The text of the highest-scoring candidate answer.
    """
    choices = [answer0, answer1, answer2, answer3]

    # One (context, question + candidate) pair per choice.
    # NOTE(review): this pairing follows the usual multiple-choice convention;
    # confirm it matches the preprocessing the adapter was trained with.
    first_segments = [context] * len(choices)
    second_segments = [f"{question} {choice}" for choice in choices]
    encoded = tokenizer(first_segments, second_segments, padding=True, truncation=True, return_tensors="pt")

    # Add a leading batch axis: (num_choices, seq_len) -> (1, num_choices, seq_len).
    inputs = {name: tensor.unsqueeze(0) for name, tensor in encoded.items()}

    mc_model.eval()
    with torch.no_grad():
        outputs = mc_model(**inputs)

    # Assumes the output exposes `logits` with one score per choice -- TODO confirm.
    best_idx = int(torch.argmax(outputs.logits, dim=-1)[0])
    return choices[best_idx]


# Sample multiple-choice example: one question, its context passage and four
# candidate answers. This rebinds the module-level `question`/`context`/`answer`.
question = "What may happen during one of your visits to Conneticut?"
context = "After spending a few days in New York City this week, I ventured into Connecticut to spend the weekend with some friends. We had great weather and I enjoyed reconnecting with them after way too long. We enjoyed some great meals including Lenny & Joe's for amazing fried clams and lobster rolls, the River Tavern in Chester, and O'Rourke's Diner in Middletown (try the \"Irish Embassy\" for breakfast)."
answer0 = "I would avoid old relationships."
answer1= "I would end up going to New York instead."
answer2= "I'd only eat Irish food."
answer3= "I'd eat some amazing food."
# Run the multiple-choice model on the example and show its result.
answer = mc_model_inference(question, context, answer0, answer1, answer2, answer3)
print(answer)


### Export fails due to output mismatch (MultiRC + RoBERTa)
See onnx_example.ipynb

Should be solvable by increasing the validation tolerance (`atol`) from 1e-5 to 3e-5.

### Measure Accuracy & Performance on Dataset
See onnx_example.ipynb