In [1]:
from transformers.onnx.features import FeaturesManager

distilbert_features = list(FeaturesManager.get_supported_features_for_model_type("bert").keys())
print(distilbert_features)

['default', 'masked-lm', 'causal-lm', 'sequence-classification', 'token-classification', 'question-answering']


In [2]:
!python -m transformers.onnx --model=distilbert-base-uncased-distilled-squad --feature=question-answering onnx/

Using framework PyTorch: 1.10.2+cpu
Validating ONNX model...
	-[✓] ONNX model output names match reference model ({'start_logits', 'end_logits'})
	- Validating ONNX Model output "start_logits":
		-[✓] (2, 8) matches (2, 8)
		-[✓] all values close (atol: 1e-05)
	- Validating ONNX Model output "end_logits":
		-[✓] (2, 8) matches (2, 8)
		-[✓] all values close (atol: 1e-05)
All good, model saved at: onnx/model.onnx


In [30]:
import torch

from onnxruntime import (
    InferenceSession, SessionOptions, GraphOptimizationLevel
)
from transformers import (
    TokenClassificationPipeline, AutoTokenizer, AutoModelForTokenClassification,
    DistilBertTokenizer, DistilBertForQuestionAnswering, QuestionAnsweringPipeline,
    AutoModelForQuestionAnswering
)

In [31]:
options = SessionOptions() # initialize session options
options.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL

session = InferenceSession(
    "onnx/model.onnx", sess_options=options, providers=["CPUExecutionProvider"]
)

# disable session.run() fallback mechanism, it prevents for a reset of the execution provider
session.disable_fallback() 

In [36]:
class OnnxQuestionAnsweringPipeline(QuestionAnsweringPipeline):

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        
    
    def _forward(self, model_inputs):

        # This comes from the original implementation of the pipeline
        special_tokens_mask = model_inputs.pop("special_tokens_mask")
        offset_mapping = model_inputs.pop("offset_mapping", None)
        sentence = model_inputs.pop("sentence")

        inputs = {k: v.cpu().detach().numpy() for k, v in model_inputs.items()} # dict of numpy arrays
        outputs_name = session.get_outputs()[0].name # get the name of the output tensor

        logits = session.run(output_names=[outputs_name], input_feed=inputs)[0] # run the session
        logits = torch.tensor(logits) # convert to torch tensor to be compatible with the original implementation

        return {
            "logits": logits,
            "special_tokens_mask": special_tokens_mask,
            "offset_mapping": offset_mapping,
            "sentence": sentence,
            **model_inputs,
        }

    # We need to override the preprocess method because the onnx model is waiting for the attention masks as inputs
    # along with the embeddings.
    def preprocess(self, question, context, offset_mapping=None):
        truncation = True if self.tokenizer.model_max_length and self.tokenizer.model_max_length > 0 else False
        model_inputs = self.tokenizer(
            sentence,
            return_attention_mask=True, # This is the only difference from the original implementation
            return_tensors=self.framework,
            truncation=truncation,
            return_special_tokens_mask=True,
            return_offsets_mapping=self.tokenizer.is_fast,
        )
        if offset_mapping:
            model_inputs["offset_mapping"] = offset_mapping

        model_inputs["context"] = context
        model_inputs["question"] = question


        return model_inputs

In [37]:
model_name_from_hub = "distilbert-base-uncased-distilled-squad"

tokenizer = AutoTokenizer.from_pretrained(model_name_from_hub)
model = AutoModelForQuestionAnswering.from_pretrained(model_name_from_hub)

onnx_pipeline = OnnxQuestionAnsweringPipeline(
    task="question-answering", 
    model=model,
    tokenizer=tokenizer,
    framework="pt",
    aggregation_strategy="simple",
)

In [38]:

result = onnx_pipeline(question='What kind of scan is this?', context='No spinal cord demyelination. Spinal MRI within normal limits.  Right cerebellar hemangioblastoma again seen Approved and ElectronicallySigned by')

print(result)

TypeError: preprocess() missing 1 required positional argument: 'context'

In [17]:
tokenizer = AutoTokenizer.from_pretrained("distilbert-base-uncased-distilled-squad")
model = AutoModelForQuestionAnswering.from_pretrained("distilbert-base-uncased-distilled-squad")


# Convert the model to ONNX format
torch.onnx.export(model,
                  (torch.zeros(1, 128).long(), torch.zeros(1, 128).long()),
                  "qamodel.onnx",
                  input_names=["input_ids", "attention_mask"],
                  output_names=["start_scores", "end_scores"],
                  dynamic_axes={"input_ids": {0: "batch_size", 1: "sequence_length"},
                                "attention_mask": {0: "batch_size", 1: "sequence_length"},
                                "start_scores": {0: "batch_size"},
                                "end_scores": {0: "batch_size"}},
                opset_version=11)




In [18]:
import onnxruntime
# Load the ONNX model
session = onnxruntime.InferenceSession("qamodel.onnx")

In [29]:
from transformers import QuestionAnsweringPipeline
import numpy as np
class ONNXQuestionAnsweringPipeline(QuestionAnsweringPipeline):
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.session = session
    def _forward(self, model_inputs):
        # Run inference with the ONNX model
        input_ids = model_inputs["input_ids"].numpy()
        attention_mask = model_inputs["attention_mask"].numpy()
        print(input_ids)
        print(attention_mask)
        print(model_inputs)
        outputs = self.session.run(
            None, 
            {"input_ids": input_ids, "attention_mask": attention_mask}
            )
        print(outputs)
        start_scores, end_scores = outputs
        # Find the best start and end positions
        start = np.argmax(start_scores)
        end = np.argmax(end_scores)

        # Include the example in the output
        example = model_inputs["example"]
        p_mask = model_inputs["p_mask"].numpy()
        print(example)
        print(p_mask)
        return {
            "start_logits": start_scores, 
            "end_logits": end_scores, 
            "start": start, 
            "end": end,
            "example":example,
            "p_mask": p_mask
        }

    def preprocess(self, example):
            model_inputs = super().preprocess(example)
            print(model_inputs) # Add a print statement here to see the value of model_inputs
            return model_inputs


# Create an instance of the custom pipeline
qa_pipeline = ONNXQuestionAnsweringPipeline(model=model, tokenizer=tokenizer)

# Use the pipeline for question answering with context
result = qa_pipeline(question="What is your name?", context="My name is John.")
print(result)


<generator object QuestionAnsweringPipeline.preprocess at 0x0000029E16F64C80>
[[ 101 2054 2003 2115 2171 1029  102 2026 2171 2003 2198 1012  102]]
[[1 1 1 1 1 1 1 1 1 1 1 1 1]]
{'example': <transformers.data.processors.squad.SquadExample object at 0x0000029E279836A0>, 'is_last': True, 'input_ids': tensor([[ 101, 2054, 2003, 2115, 2171, 1029,  102, 2026, 2171, 2003, 2198, 1012,
          102]]), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]), 'p_mask': tensor([[False,  True,  True,  True,  True,  True,  True, False, False, False,
         False, False,  True]]), 'token_type_ids': array([0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1]), 'cls_index': None, 'example_index': 0, 'unique_id': 0, 'paragraph_len': 0, 'token_is_max_context': 0, 'tokens': [], 'token_to_orig_map': {}, 'start_position': 0, 'end_position': 0, 'is_impossible': False, 'qas_id': None, 'encoding': Encoding(num_tokens=13, attributes=[ids, type_ids, tokens, offsets, attention_mask, special_tokens_mask, overflo

KeyError: 'encoding'

In [None]:
<class 'bool'>