Copyright (c) Microsoft Corporation. All rights reserved.  
Licensed under the MIT License.

## ONNX Runtime Question Answering with DistilBert model


In this tutorial, you will learn the end-to-end steps: obtaining a HuggingFace model, converting it to ONNX format, adding pre/post processing steps to the ONNX model using the onnxruntime-extensions library, and finally plugging the model into a sample Android/iOS mobile app to run inference.

### 0. Prerequisites

You will need to pip install `onnxruntime onnx onnxruntime_extensions transformers` as the necessary libraries.

```
    pip install onnx onnxruntime onnxruntime_extensions
```
```
    pip install transformers
```

To work with Python in Jupyter Notebooks, you must activate an [Anaconda](https://www.anaconda.com/) environment or another Python environment in which you've installed the [Jupyter package](https://pypi.org/project/jupyter/). 

In [1]:
import io
import numpy as np
import onnx
import onnxruntime as ort

###  1. Prepare ONNX Model from HuggingFace DistilBert model

In [2]:
import transformers
from transformers.onnx import FeaturesManager
from pathlib import Path
from onnxruntime.quantization import quantize_dynamic, QuantType

Original model: https://huggingface.co/Xenova/distilbert-base-uncased-distilled-squad

In [3]:
def create_onnx_model_from_huggingface(hf_model_name, onnx_model_path):
    """
        Export a HuggingFace DistilBert question-answering model to an ONNX file.

        hf_model_name: model identifier handed to `from_pretrained` for both the
                       tokenizer and the model.
        onnx_model_path: location where the exported ONNX model is written.
    """
    opset_version = 16  # the ONNX opset version to export the model to

    hf_tokenizer = transformers.AutoTokenizer.from_pretrained(hf_model_name)
    hf_model = transformers.DistilBertForQuestionAnswering.from_pretrained(hf_model_name)

    # Resolve the ONNX config constructor for the "question-answering" feature;
    # the first returned element (the model kind) is not needed here.
    _, build_onnx_config = FeaturesManager.check_supported_model_or_raise(hf_model, feature="question-answering")
    export_config = build_onnx_config(hf_model.config)  # carries input/output names/types info

    # Export the hf model to onnx; the values returned by export() are not used.
    transformers.onnx.export(hf_tokenizer, hf_model, export_config, opset_version, onnx_model_path)

In [4]:
onnx_model_path = Path('distilbert-base-uncased-distilled-squad.onnx')
if onnx_model_path.exists():
    pass  # a previously exported model file is reused as-is
else:
    print("Creating ONNX model from huggingface model...")
    create_onnx_model_from_huggingface('distilbert-base-uncased-distilled-squad', onnx_model_path)

Creating ONNX model from huggingface model...


  mask, torch.tensor(torch.finfo(scores.dtype).min)


verbose: False, log level: Level.ERROR



Check if the output ONNX model is exported successfully.

In [5]:
assert onnx_model_path.exists()  # fail fast if the export cell above did not produce the model file

Quantize the output model.

In [6]:
def quantize_model(model_path: Path):
    """
        Quantize the model, so that it can be run on mobile devices with smaller memory footprint

        The quantized model is written next to the input as `<stem>_quant<suffix>`
        using `quantize_dynamic` with `QuantType.QInt8` weights. The original
        (un-quantized) file at `model_path` is deleted afterwards.

        model_path: path of the ONNX model to quantize.
        Returns the path of the quantized model.
    """
    # Build the whole file name in one step. Chaining
    # with_name(stem + "_quant").with_suffix(suffix) treats everything after the
    # last dot of the stem as a suffix and replaces it, so "model.v1.onnx" would
    # become "model.onnx" instead of "model.v1_quant.onnx".
    quantized_model_path = model_path.with_name(f"{model_path.stem}_quant{model_path.suffix}")
    quantize_dynamic(model_path, quantized_model_path, weight_type=QuantType.QInt8)
    model_path.unlink()
    return quantized_model_path

In [7]:
quantized_model = quantize_model(onnx_model_path)  # path of the "_quant" model; quantize_model deletes the un-quantized file

Ignore MatMul due to non constant B: /[/distilbert/transformer/layer.0/attention/MatMul]
Ignore MatMul due to non constant B: /[/distilbert/transformer/layer.0/attention/MatMul_1]
Ignore MatMul due to non constant B: /[/distilbert/transformer/layer.1/attention/MatMul]
Ignore MatMul due to non constant B: /[/distilbert/transformer/layer.1/attention/MatMul_1]
Ignore MatMul due to non constant B: /[/distilbert/transformer/layer.2/attention/MatMul]
Ignore MatMul due to non constant B: /[/distilbert/transformer/layer.2/attention/MatMul_1]
Ignore MatMul due to non constant B: /[/distilbert/transformer/layer.3/attention/MatMul]
Ignore MatMul due to non constant B: /[/distilbert/transformer/layer.3/attention/MatMul_1]
Ignore MatMul due to non constant B: /[/distilbert/transformer/layer.4/attention/MatMul]
Ignore MatMul due to non constant B: /[/distilbert/transformer/layer.4/attention/MatMul_1]
Ignore MatMul due to non constant B: /[/distilbert/transformer/layer.5/attention/MatMul]
Ignore MatM

###  2. Add pre and post processing steps to ONNX model

In [8]:
from onnxruntime_extensions.tools.pre_post_processing import *
from onnxruntime_extensions.tools import add_pre_post_processing_to_model as add_ppp
from contextlib import contextmanager

In [9]:
def add_pre_post_processing(input_model_path: Path, output_model_path: str, model_name: str = "Xenova/distilbert-base-uncased-distilled-squad"):
    """
    Add pre and post processing to the model, for tokenization and post processing

    input_model_path: path of the ONNX model to extend; it must exist.
    output_model_path: where the model with pre/post processing is saved.
    model_name: HuggingFace identifier of the tokenizer whose vocabulary is used.

    The tokenizer vocabulary is written as JSON to a temporary `vocab.txt` next to
    the input model while the pipeline is built, and removed again afterwards.
    """
    import json

    onnx_opset = 16
    model = onnx.load(str(input_model_path.resolve(strict=True)))
    inputs = [create_named_value("input_text", onnx.TensorProto.STRING, [1, "num_sentences"])]  # Fix the batch size to be 1

    tokenizer = transformers.AutoTokenizer.from_pretrained(model_name)

    @contextmanager
    def temp_vocab_file():
        vocab_file = input_model_path.parent / "vocab.txt"
        try:
            yield vocab_file
        finally:
            # The file is only scratch input for the tokenizer steps; do not leave
            # it behind, whether the pipeline succeeded or raised.
            if vocab_file.exists():
                vocab_file.unlink()

    with temp_vocab_file() as vocab_file:
        with open(str(vocab_file), 'w') as f:
            f.write(json.dumps(tokenizer.vocab))

        pipeline = PrePostProcessor(inputs, onnx_opset)

        tokenizer_args = TokenizerParam(
            vocab_or_file=vocab_file,
            do_lower_case=True,
            tweaked_bos_id=0,
            is_sentence_pair=True,
        )

        pipeline.add_pre_processing(
            [
                BertTokenizer(tokenizer_args), # convert input_text into input_ids, attention_masks, token_type_ids
            ]
        )

        pipeline.add_post_processing(
            [
                (BertTokenizerQADecoder(tokenizer_args), # decode the input_ids to text
                [utils.IoMapEntry("BertTokenizer", producer_idx=0, consumer_idx=2)]) # input_ids
            ]
        )

        # Build the combined model while the vocab file still exists, in case the
        # tokenizer steps read it lazily when the graph is created.
        new_model = pipeline.run(model)

    onnx.save_model(new_model, output_model_path)

In [10]:
output_model_path = str(quantized_model.with_name(quantized_model.stem + "_with_pre_post_processing" + quantized_model.suffix))  # only the file name changes; str.replace(".onnx", ...) would also rewrite ".onnx" inside directory names and be a no-op for other suffixes
add_pre_post_processing(quantized_model, output_model_path)  # saves the model with tokenizer pre/post processing to output_model_path

### 3. Test output ONNX model

In [11]:
from onnxruntime_extensions import get_library_path

In [12]:
def test_onnx_model(model_path: str):
    """
    Load the model at `model_path`, ask one sample question about one sample
    context, and print the answer text returned by the model.
    """
    session_options = ort.SessionOptions()
    session_options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_EXTENDED

    # Note: the tokenizer pre/post processing steps are custom operators provided by
    # onnxruntime-extensions, so its shared library is registered with onnxruntime
    # before the session is created.
    session_options.register_custom_ops_library(get_library_path())
    session = ort.InferenceSession(model_path, session_options)

    question = "What day was the game played on?"
    context = "The game was played on February 7, 2016 at Levi's Stadium in the San Francisco Bay Area at Santa Clara, California."

    # One row holding the (question, context) sentence pair.
    feeds = {'input_text': [[question, context]]}
    text_output = session.run(['text'], feeds)[0]
    print("Answer:  " + text_output[0])

In [13]:
test_onnx_model(output_model_path)  # run the sample question against the model saved by add_pre_post_processing

Answer:  february 7, 2016


2023-10-19 16:12:09.798773 [W:onnxruntime:, graph.cc:3543 CleanUnusedInitializersAndNodeArgs] Removing initializer '_ppp0_i64_0'. It is not used by any node and should be removed from the model.
2023-10-19 16:12:09.818140 [W:onnxruntime:, unsqueeze_elimination.cc:20 Apply] UnsqueezeElimination cannot remove node post_process_7


### 4. Build and run inference with the output model in a mobile application

- **Android**

    **[ORT Question Answering Android Sample application](mobile/examples/question_answering/android)**

    **Note:** You can skip the step about `preparing model` and place the above generated model under `mobile\examples\question_answering\android\app\src\main\res\raw` directory.

    See more information about general prerequisites to build and run an Android application in [README.md/Requirements](mobile/examples/question_answering/android/README.md)


Example code snippet for initializing the ORT session and registering ORT extensions for pre/post processing support:
```kotlin

   // Initialize Ort Session and register the onnxruntime extensions package that contains the custom operators.
    val sessionOptions: OrtSession.SessionOptions = OrtSession.SessionOptions()
    // Pick the execution provider named in `ep`.
    if (ep.contains("CPU")) {
        // No extra execution provider is registered when CPU is selected.
    } else if (ep.contains("NNAPI")) {
        sessionOptions.addNnapi()
    } else if (ep.contains("XNNPACK")) {
        // The provider name was previously misspelled "XNNAPCK".
        // XNNPACK is added with an empty provider-options map.
        val po = mapOf<String, String>()
        sessionOptions.addXnnpack(po)
    }
    sessionOptions.setSessionLogLevel(OrtLoggingLevel.ORT_LOGGING_LEVEL_VERBOSE)
    sessionOptions.setSessionLogVerbosityLevel(0)
    // The custom operators used by the pre/post processing steps live in the extensions library.
    sessionOptions.registerCustomOpLibrary(OrtxPackage.getLibraryPath())
    ortSession = ortEnv.createSession(readModel(), sessionOptions)
```

Example code snippet for performing the Question Answering task with the DistilBERT model in an Android app:
```kotlin

    // Step 1: Prepare (question, article) pair as the input text
    val article = article_seq.toString()
    val question = qustion_seq.toString()

    // Step 2: Create input tensor of shape [1, 2]: one row holding the (question, article) pair
    val shape = longArrayOf(1, 2)
    val inputTensor = OnnxTensor.createTensor(ortEnv, arrayOf(question, article), shape)

    inputTensor.use {
        // Step 3: Call Ort InferenceSession Run
        val output = ortSession.run(Collections.singletonMap("input_text", inputTensor))

        // Step 4: Analyze the inference result
        output.use {
            // Fetch the answer by its output name ("text") rather than by a positional
            // index, so the lookup does not depend on how many outputs the model has.
            val rawOutput = output.get("text").get().value as Array<String>
            outputAnswer = rawOutput[0]
        }
    }
```

- **iOS**

    **[ORT Question Answering iOS Sample application](mobile/examples/question_answering/iOS)**

    **Note:** You can skip the step about `prepare model` and place the above generated model under `mobile/examples/question_answering/ios/ORTQuestionAnswering/ORTQuestionAnswering`. Then add it as a bundle asset to your iOS app.

    See more information about general prerequisites to build and run an iOS application in [README.md/Requirements](mobile/examples/question_answering/iOS/README.md)

Example code snippet for performing Question Answering task in an iOS app:


```c
    // Step 1: Register custom ops (the pre/post processing operators from onnxruntime-extensions)
    const auto ort_log_level = ORT_LOGGING_LEVEL_INFO;
    auto ort_env = Ort::Env(ort_log_level, "ORTQuestionAnswering");
    auto session_options = Ort::SessionOptions(); 
    
    // RegisterCustomOps returns a non-null status on failure.
    if (RegisterCustomOps(session_options, OrtGetApiBase()) != nullptr) {
        throw std::runtime_error("RegisterCustomOps failed");
    }
            
    // Step 2: Load model   
    NSString *model_path = [NSBundle.mainBundle pathForResource:@"distilbert_base_uncased_squad_quant_with_pre_post_processing"
                                                                ofType:@"onnx"];
    if (model_path == nullptr) {
        throw std::runtime_error("Failed to get model path");
    }
            
    // Step 3: Create Ort Inference Session    
    auto sess = Ort::Session(ort_env, [model_path UTF8String], session_options);
            
    // Step 4: Prepare input tensors and input/output names        
    // A [1, 2] string tensor: one row holding the (question, article) pair.
    std::vector<int64_t> input_dims{1, 2};
    Ort::AllocatorWithDefaultOptions ortAllocator;
    auto input_tensor = Ort::Value::CreateTensor(ortAllocator, input_dims.data(), input_dims.size(), ONNX_TENSOR_ELEMENT_DATA_TYPE_STRING);

    // Own the UTF-8 copies in std::string so the char pointers passed below stay valid.
    std::vector<std::string> question_article_vec;
    question_article_vec.reserve(2);
    question_article_vec.push_back([input_user_question UTF8String]);
    question_article_vec.push_back([input_article UTF8String]);
    std::vector<const char*> p_str;
    for (const auto& s : question_article_vec) {
        p_str.push_back(s.c_str());
    }
    input_tensor.FillStringTensor(p_str.data(), p_str.size());
            
    // Step 5: Call inference session run
    constexpr auto input_names = std::array{"input_text"};
    constexpr auto output_names = std::array{"text"};
    const auto outputs = sess.Run(Ort::RunOptions(), input_names.data(),
                                &input_tensor, 1, output_names.data(), 1);
                    
    // Step 6: Analyze model outputs
    if (outputs.size() != 1) {
        throw std::runtime_error("Unexpected number of outputs");
    }        
    // Exactly one output ("text") was requested, so it is at index 0;
    // at(1) would always throw std::out_of_range here.
    const auto &output_tensor = outputs.at(0);
    // NOTE(review): reading a string tensor through GetTensorData<std::string> relies on
    // the runtime's internal string storage - confirm against the ORT version in use.
    const std::string* output_string_raw = output_tensor.GetTensorData<std::string>();
    output_text = [NSString stringWithCString:output_string_raw->c_str() encoding:NSUTF8StringEncoding];
```

**Note**: Model input/output names, sizes, and types may need to be adjusted for the specific model you use. The code above is sample code for demonstration purposes only.