In [1]:
import json
import os

import torch
import tensorflow as tf
import onnx
import onnxsim
import numpy as np
from transformers import ViTImageProcessor, ViTForImageClassification
from PIL import Image
import onnx2tf

2024-11-20 10:54:56.966825: I tensorflow/core/util/port.cc:153] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-11-20 10:54:56.993860: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1732080297.025521   19152 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1732080297.035344   19152 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-11-20 10:54:57.070722: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instr

# Download ViT-B/16 from Hugging Face

In [2]:
# Checkpoint identifier shared by the classifier and its image preprocessor.
model_name = "google/vit-base-patch16-224"

# Load the classifier and put it in evaluation mode in one step
# (`eval()` returns the module itself), then load the matching preprocessor.
model = ViTForImageClassification.from_pretrained(model_name).eval()
processor = ViTImageProcessor.from_pretrained(model_name)

# Final expression: display both objects as the cell output.
(model, processor)

(ViTForImageClassification(
   (vit): ViTModel(
     (embeddings): ViTEmbeddings(
       (patch_embeddings): ViTPatchEmbeddings(
         (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
       )
       (dropout): Dropout(p=0.0, inplace=False)
     )
     (encoder): ViTEncoder(
       (layer): ModuleList(
         (0-11): 12 x ViTLayer(
           (attention): ViTSdpaAttention(
             (attention): ViTSdpaSelfAttention(
               (query): Linear(in_features=768, out_features=768, bias=True)
               (key): Linear(in_features=768, out_features=768, bias=True)
               (value): Linear(in_features=768, out_features=768, bias=True)
               (dropout): Dropout(p=0.0, inplace=False)
             )
             (output): ViTSelfOutput(
               (dense): Linear(in_features=768, out_features=768, bias=True)
               (dropout): Dropout(p=0.0, inplace=False)
             )
           )
           (intermediate): ViTIntermediate(
         

# Convert to ONNX

In [3]:
onnx_path = "vit_b16_huggingface.onnx"

# ONNX opset used for the export. Kept as a single named value so the
# documentation and the call below cannot drift apart.
OPSET_VERSION = 15

# Example input handed to the exporter: one random 3x224x224 image.
dummy_input = torch.randn(1, 3, 224, 224)

# Names assigned to the graph's input and output tensors.
input_names = ['pixel_values']
output_names = ['logits']

# Export the PyTorch model to ONNX.
torch.onnx.export(
    model,
    dummy_input,
    onnx_path,
    input_names=input_names,
    output_names=output_names,
    # Mark axis 0 of both tensors as a named dynamic axis ("batch_size").
    dynamic_axes={
        'pixel_values': {0: 'batch_size'},
        'logits': {0: 'batch_size'}
    },
    do_constant_folding=True,
    opset_version=OPSET_VERSION,
    # NOTE(review): ONNX_FALLTHROUGH presumably lets operators without an ONNX
    # mapping pass through unconverted instead of failing the export --
    # confirm this is intended for a model that must convert cleanly to TF.
    operator_export_type=torch.onnx.OperatorExportTypes.ONNX_FALLTHROUGH
)

# Reload the exported file and run the ONNX checker on it.
onnx_model = onnx.load(onnx_path)
onnx.checker.check_model(onnx_model)
print("ONNX model verified successfully")

ONNX model verified successfully


# Simplify the ONNX model for TensorFlow conversion

In [4]:
onnx_simplified_path = "vit_b16_simplified_huggingface.onnx"

# Load the exported graph under its own name. Rebinding `model` here would
# replace the PyTorch module with an ONNX ModelProto and break a re-run of the
# export cell.
onnx_model_unsimplified = onnx.load(onnx_path)

# `simplify` returns the simplified graph and a flag reporting whether its
# validation check passed.
model_simp, check = onnxsim.simplify(onnx_model_unsimplified)

# Stop here on failure: the conversion cell reads `onnx_simplified_path`, so
# continuing would use a missing or stale file.
if not check:
    raise RuntimeError("ONNX simplification failed")

onnx.save(model_simp, onnx_simplified_path)
print("ONNX model simplified successfully")

ONNX model simplified successfully


# Convert ONNX to TF

In [5]:
tf_path = "vit_b16_huggingface"

# Static shapes for the graph's input and output tensors.
config = {
    "input_shapes": {
        "pixel_values": [1, 3, 224, 224]
    },
    "output_shapes": {
        "logits": [1, 1000]
    }
}

shape_config_json = 'shape_config.json'

# NOTE(review): nothing in this cell reads this file back; the conversion
# below does not receive it. Kept so any external consumer keeps working --
# confirm whether it is still needed.
with open(shape_config_json, 'w') as f:
    json.dump(config, f)

# Convert the simplified ONNX graph into a TensorFlow model under `tf_path`.
#
# NOTE(review): the recorded run of this cell ended with `SystemExit: 1` after
# a ConcatOp shape mismatch ([1,768,1] vs. [1,14,10752]), and the log still
# shows a symbolic batch dimension ('unk__4') inside the graph. Exporting or
# simplifying with a fixed batch size is a candidate fix -- TODO confirm.
onnx2tf.convert(
    input_onnx_file_path=onnx_simplified_path,
    output_folder_path=tf_path,
    output_signaturedefs=True,
    # Request a fixed batch size of 1 for the converted model.
    batch_size=1,
    # Keep the listed input in its original channel-first layout.
    keep_ncw_or_nchw_or_ncdhw_input_names=['pixel_values'],
)


Simplifying[33m...[0m
Finish! Here is the difference:
┏━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━┳━━━━━━━━━━━━━━━━━━┓
┃[1m [0m[1m          [0m[1m [0m┃[1m [0m[1mOriginal Model[0m[1m [0m┃[1m [0m[1mSimplified Model[0m[1m [0m┃
┡━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━╇━━━━━━━━━━━━━━━━━━┩
│ Add        │ 159            │ 159              │
│ Concat     │ 3              │ 3                │
│ Constant   │ 191            │ 191              │
│ Conv       │ 1              │ 1                │
│ Div        │ 37             │ 37               │
│ Equal      │ 1              │ 1                │
│ Erf        │ 12             │ 12               │
│ Expand     │ 1              │ 1                │
│ Gather     │ 2              │ 2                │
│ Gemm       │ 1              │ 1                │
│ MatMul     │ 72             │ 72               │
│ Mul        │ 73             │ 73               │
│ Pow        │ 25             │ 25               │
│ ReduceMean │ 50             │ 50               │
│ Re

2024-11-20 10:56:18.477031: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:152] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)



[32mINFO:[0m [32m2 / 615[0m
[32mINFO:[0m [35monnx_op_type[0m: Shape[35m onnx_op_name[0m: wa/vit/embeddings/Shape
[32mINFO:[0m [36m input_name.1[0m: pixel_values [36mshape[0m: [1, 3, 224, 224] [36mdtype[0m: float32
[32mINFO:[0m [36m output_name.1[0m: wa/vit/embeddings/Shape_output_0 [36mshape[0m: [4] [36mdtype[0m: int64
[32mINFO:[0m [35mtf_op_type[0m: shape_v2
[32mINFO:[0m [34m input.1.x[0m: [34mname[0m: tf.compat.v1.transpose/transpose:0 [34mshape[0m: (1, 224, 224, 3) [34mdtype[0m: <dtype: 'float32'> 
[32mINFO:[0m [34m input.2.out_type[0m: [34mname[0m: int64 [34mshape[0m: () 
[32mINFO:[0m [34m output.1.output[0m: [34mname[0m: tf.compat.v1.shape/wa/vit/embeddings/Shape:0 [34mshape[0m: (4,) [34mdtype[0m: <dtype: 'int64'> 

[32mINFO:[0m [32m3 / 615[0m
[32mINFO:[0m [35monnx_op_type[0m: Conv[35m onnx_op_name[0m: wa/vit/embeddings/patch_embeddings/projection/Conv
[32mINFO:[0m [36m input_name.1[0m: pixel_values [36mshap

2024-11-20 10:56:22.904661: I tensorflow/core/framework/local_rendezvous.cc:405] Local rendezvous is aborting with status: INVALID_ARGUMENT: ConcatOp : Dimension 1 in both shapes must be equal: shape[0] = [1,768,1] vs. shape[1] = [1,14,10752]
2024-11-20 10:56:22.977097: I tensorflow/core/framework/local_rendezvous.cc:405] Local rendezvous is aborting with status: INVALID_ARGUMENT: ConcatOp : Dimension 1 in both shapes must be equal: shape[0] = [1,768,1] vs. shape[1] = [1,14,10752]


[32mINFO:[0m [35mtf_op_type[0m: divide
[32mINFO:[0m [34m input.1.x[0m: [34mname[0m: tf.math.subtract/Sub:0 [34mshape[0m: (None, 768, 197) [34mdtype[0m: <dtype: 'float32'> 
[32mINFO:[0m [34m input.2.y[0m: [34mname[0m: tf.math.sqrt/Sqrt:0 [34mshape[0m: (None, 1, 197) [34mdtype[0m: <dtype: 'float32'> 
[32mINFO:[0m [34m output.1.output[0m: [34mname[0m: tf.math.divide/truediv:0 [34mshape[0m: (None, 768, 197) [34mdtype[0m: <dtype: 'float32'> 

[32mINFO:[0m [32m24 / 615[0m
[32mINFO:[0m [35monnx_op_type[0m: Mul[35m onnx_op_name[0m: wa/vit/encoder/layer.0/layernorm_before/Mul
[32mINFO:[0m [36m input_name.1[0m: wa/vit/encoder/layer.0/layernorm_before/Div_output_0 [36mshape[0m: ['unk__4', 197, 768] [36mdtype[0m: float32
[32mINFO:[0m [36m input_name.2[0m: vit.encoder.layer.0.layernorm_before.weight [36mshape[0m: [768] [36mdtype[0m: float32
[32mINFO:[0m [36m output_name.1[0m: wa/vit/encoder/layer.0/layernorm_before/Mul_output_0 [36msh

SystemExit: 1