# Converting a Huggingface model to ONNX with tf2onnx

This is a simple example of how to convert a [huggingface](https://huggingface.co/) model to ONNX using [tf2onnx](https://github.com/onnx/tensorflow-onnx).

We use the [TFBertForQuestionAnswering](https://huggingface.co/transformers/model_doc/bert.html#tfbertforquestionanswering) example from huggingface.

Other models work similarly. You'll find additional examples for other models in our unit tests [here](https://github.com/onnx/tensorflow-onnx/blob/master/tests/huggingface.py).

## Install dependencies

In [None]:
!pip install tensorflow transformers tf2onnx onnxruntime

## The Keras code

In [1]:
import os
# Set CUDA_VISIBLE_DEVICES to an empty string before the framework imports
# below — presumably to hide all GPUs so everything runs on the CPU.
os.environ['CUDA_VISIBLE_DEVICES'] = ""

import warnings
# Suppress every Python warning for the rest of the notebook.
warnings.filterwarnings('ignore')

import numpy as np
import onnxruntime as rt
import tensorflow as tf
import tf2onnx

In [2]:
from transformers import BertTokenizer, TFBertForQuestionAnswering

# Load the tokenizer and the question-answering model from the same checkpoint.
tokenizer = BertTokenizer.from_pretrained('bert-base-cased')
model = TFBertForQuestionAnswering.from_pretrained('bert-base-cased')

# Sample question / context pair, tokenized into TensorFlow tensors.
question = "Who was Jim Henson?"
text = "Jim Henson was a nice puppet"
input_dict = tokenizer(question, text, return_tensors='tf')

# Run the Keras model; the bare name on the last line displays the result.
tf_results = model(input_dict)
tf_results



TFQuestionAnsweringModelOutput(loss=None, start_logits=<tf.Tensor: shape=(1, 16), dtype=float32, numpy=
array([[ 0.27443457,  0.02250022, -0.32903647, -0.32448006, -0.26440915,
        -0.03356116, -0.11466929, -0.12272861, -0.23254037, -0.21369037,
         0.02170385, -0.38734213, -0.14865303, -0.04804918,  0.02706608,
        -0.12273058]], dtype=float32)>, end_logits=<tf.Tensor: shape=(1, 16), dtype=float32, numpy=
array([[-0.23549399,  0.11830041, -0.16875415,  0.04315909,  0.00721513,
         0.20957005,  0.00850991, -0.49158442,  0.10791501,  0.07153591,
         0.26274043, -0.15160318, -0.01847767,  0.03389414,  0.25666913,
        -0.49158433]], dtype=float32)>, hidden_states=None, attentions=None)

## Convert to ONNX

In [3]:
# Describe the model inputs: one int32 spec per input name, with both
# dimensions left unspecified (None) so they are not fixed at conversion time.
input_spec = tuple(
    tf.TensorSpec((None, None), tf.int32, name=input_name)
    for input_name in ("input_ids", "token_type_ids", "attention_mask")
)

# Convert the Keras model and write the result to "bert.onnx".
# The two return values are not needed here, so they are discarded.
_, _ = tf2onnx.convert.from_keras(
    model,
    input_signature=input_spec,
    opset=13,
    output_path="bert.onnx",
)



## Test the ONNX model with onnxruntime

In [4]:
# get the names we want as output
output_names = list(tf_results.keys())

# switch the input_dict to numpy
input_dict_np = {k: v.numpy() for k, v in input_dict.items()}

# Create the session with the options object actually passed in, and name the
# execution provider explicitly: onnxruntime >= 1.9 raises a ValueError when
# `providers` is omitted on builds that ship more than one execution provider.
opt = rt.SessionOptions()
sess = rt.InferenceSession("bert.onnx", sess_options=opt, providers=["CPUExecutionProvider"])

# Run inference; the results come back as a list ordered like output_names.
onnx_results = sess.run(output_names, input_dict_np)
onnx_results

[array([[ 0.27443478,  0.02250013, -0.32903633, -0.32448038, -0.26440892,
         -0.03356095, -0.11466938, -0.12272887, -0.2325401 , -0.21369015,
          0.02170385, -0.3873423 , -0.148653  , -0.04804894,  0.02706566,
         -0.1227307 ]], dtype=float32),
 array([[-0.23549382,  0.11830062, -0.16875397,  0.0431588 ,  0.00721494,
          0.2095699 ,  0.00850987, -0.49158436,  0.10791501,  0.07153573,
          0.26274025, -0.15160298, -0.01847767,  0.03389416,  0.25666922,
         -0.49158415]], dtype=float32)]

## Make sure tensorflow and onnxruntime results are the same

In [5]:
# Check each TensorFlow output against the ONNX output at the same position;
# assert_allclose raises AssertionError if any element differs beyond tolerance.
for idx, name in enumerate(output_names):
    tf_output, onnx_output = tf_results[name], onnx_results[idx]
    np.testing.assert_allclose(tf_output, onnx_output, rtol=1e-5, atol=1e-5)