From e86c93f32598bdb4aa2a8aeea6d88ca18ddc37bb Mon Sep 17 00:00:00 2001 From: Ryan Hill Date: Sat, 18 Feb 2023 13:42:23 -0800 Subject: [PATCH 01/22] VIT test, saving as a branch, not for checkin --- .../tools/transformers/dev_benchmark.cmd | 14 +++--- .../tools/transformers/fusion_attention.py | 50 +++++++++++++------ .../tools/transformers/huggingface_models.py | 2 + .../python/tools/transformers/import.py | 20 ++++++++ 4 files changed, 65 insertions(+), 21 deletions(-) create mode 100644 onnxruntime/python/tools/transformers/import.py diff --git a/onnxruntime/python/tools/transformers/dev_benchmark.cmd b/onnxruntime/python/tools/transformers/dev_benchmark.cmd index 7a9b3254a1708..c3de6519bd197 100644 --- a/onnxruntime/python/tools/transformers/dev_benchmark.cmd +++ b/onnxruntime/python/tools/transformers/dev_benchmark.cmd @@ -21,25 +21,27 @@ set run_torchscript=false REM Devices to test. REM Attention: You cannot run both CPU and GPU at the same time: gpu need onnxruntime-gpu, and CPU need onnxruntime. -set run_gpu_fp32=false +set run_gpu_fp32=true set run_gpu_fp16=false -set run_cpu_fp32=true -set run_cpu_int8=true +set run_cpu_fp32=false +set run_cpu_int8=false set average_over=100 REM Enable optimizer (use script instead of OnnxRuntime for graph optimization) set use_optimizer=true -set batch_sizes=1 -set sequence_length=8 128 +set batch_sizes=1 4 +set sequence_length=32 64 REM Number of inputs (input_ids, token_type_ids, attention_mask) for ONNX model. REM Note that different input count might lead to different performance set input_counts=1 REM Pretrained transformers models can be a subset of: bert-base-cased roberta-base gpt2 distilgpt2 distilbert-base-uncased -set models_to_test=bert-base-cased +REM set models_to_test=bert-base-cased +set models_to_test="google/vit-base-patch16-224" +REM set models_to_test="google/vit-base-patch32" REM If you have mutliple GPUs, you can choose one GPU for test. 
Here is an example to use the second GPU: REM set CUDA_VISIBLE_DEVICES=1 diff --git a/onnxruntime/python/tools/transformers/fusion_attention.py b/onnxruntime/python/tools/transformers/fusion_attention.py index 342d43306e699..82d8095328765 100644 --- a/onnxruntime/python/tools/transformers/fusion_attention.py +++ b/onnxruntime/python/tools/transformers/fusion_attention.py @@ -419,18 +419,27 @@ def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node): else: return - other_inputs = [] - for i, input in enumerate(start_node.input): - if input not in output_name_to_node: - continue + # Match Vit + vit_nodes = self.model.match_parent_path( + matmul_qkv, ["Transpose", "Reshape", "Add", "MatMul"], [None, None, None, None] + ) + if vit_nodes is not None: + root_input = vit_nodes[3].input[0] + else: + other_inputs = [] + for i, input in enumerate(start_node.input): + if input not in output_name_to_node: + continue + + if input == qkv_nodes[0].output[0]: + continue + other_inputs.append(input) + if len(other_inputs) != 1: + return + + root_input = other_inputs[0] - if input == qkv_nodes[0].output[0]: - continue - other_inputs.append(input) - if len(other_inputs) != 1: - return - root_input = other_inputs[0] """ Match flaubert Mask | @@ -471,11 +480,13 @@ def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node): is_distill = False is_distill_add = False + is_no_add = False qk_paths = { "path1": (["Softmax", "Add", "Div", "MatMul"], [0, 0, None, 0]), "path2": (["Softmax", "Add", "Mul", "MatMul"], [0, 0, None, 0]), "path3": (["Softmax", "Where", "MatMul", "Div"], [0, 0, 2, 0]), "path4": (["Softmax", "Add", "Where", "MatMul"], [0, 0, 0, 2]), + "path5": (["Softmax", "Div", "MatMul"], [0, None, 0]), } qk_nodes = None @@ -487,6 +498,8 @@ def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node): is_distill = True if k == "path4": is_distill_add = True + if k == "path5": + is_no_add = True break if qk_nodes is None: @@ -500,6 +513,8 @@ def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node): (_, where_qk, matmul_qk, _) = qk_nodes elif is_distill_add: (_, add_qk, where_qk, matmul_qk) = qk_nodes + elif is_no_add: + (_, _, matmul_qk) = qk_nodes else: (_, add_qk, _, matmul_qk) = qk_nodes @@ -557,6 +572,8 @@ def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node): if add_qk_str is None: logger.debug(f"fuse_attention: failed to verify shape inference of {add_qk}") return + elif is_no_add: + pass else: _, mask_nodes, _ = self.model.match_parent_paths( add_qk, @@ -569,17 +586,20 @@ def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node): ], output_name_to_node, ) - if mask_nodes is None: - logger.debug("fuse_attention: failed to match mask path") - return +# if mask_nodes is None: +# logger.debug("fuse_attention: failed to match mask path") +# return - if len(mask_nodes) > 1 and mask_nodes[0].op_type == "Mul": + if mask_nodes is not None and len(mask_nodes) > 1 and mask_nodes[0].op_type == "Mul": _, mul_val = self.model.get_constant_input(mask_nodes[0]) if mul_val != -10000: self.mask_filter_value = mul_val if matmul_v.input[0] == root_input and matmul_q.input[0] == root_input and matmul_k.input[0] == root_input: - mask_index = self.attention_mask.process_mask(mask_nodes[-1].input[0]) + if mask_nodes is None: + mask_index = None + else: + mask_index = self.attention_mask.process_mask(mask_nodes[-1].input[0]) attention_last_node = reshape_qkv if einsum_node is None else transpose_qkv diff --git 
a/onnxruntime/python/tools/transformers/huggingface_models.py b/onnxruntime/python/tools/transformers/huggingface_models.py index cdf75efb1e62d..0aabcad3de2a2 100644 --- a/onnxruntime/python/tools/transformers/huggingface_models.py +++ b/onnxruntime/python/tools/transformers/huggingface_models.py @@ -16,6 +16,8 @@ # List of pretrained models: https://huggingface.co/transformers/pretrained_models.html # Pretrained model name to a tuple of input names, opset_version, use_external_data_format, optimization model type MODELS = { + "google/vit-base-patch16-224": (["input_ids"], 12, False, "bert"), + # BERT "bert-base-uncased": ( ["input_ids", "attention_mask", "token_type_ids"], diff --git a/onnxruntime/python/tools/transformers/import.py b/onnxruntime/python/tools/transformers/import.py new file mode 100644 index 0000000000000..9c734e459ae40 --- /dev/null +++ b/onnxruntime/python/tools/transformers/import.py @@ -0,0 +1,20 @@ +from transformers import AutoImageProcessor, ViTModel +import torch +from datasets import load_dataset + +dataset = load_dataset("huggingface/cats-image") +image = dataset["test"]["image"][0] + +image_processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224-in21k") +model = ViTModel.from_pretrained("google/vit-base-patch16-224-in21k") + +inputs = image_processor(image, return_tensors="pt") + +with torch.no_grad(): + outputs = model(**inputs) + +# last_hidden_states = outputs.last_hidden_state +# list(last_hidden_states.shape) + +# print(inputs) +torch.onnx.export(model, inputs['pixel_values'], "TestModel.onnx", verbose=True) From 11b1d5d1b193df9ee46a2daf36d1448aa4609429 Mon Sep 17 00:00:00 2001 From: Ryan Hill Date: Wed, 8 Mar 2023 16:16:05 -0800 Subject: [PATCH 02/22] Push branch for testing on benchmark machine --- .../python/tools/transformers/benchmark.py | 6 ++- .../tools/transformers/onnx_exporter.py | 50 ++++++++++++++++--- .../tools/transformers/run_benchmark.sh | 8 +-- 3 files changed, 52 insertions(+), 12 deletions(-) diff --git a/onnxruntime/python/tools/transformers/benchmark.py b/onnxruntime/python/tools/transformers/benchmark.py index 23f1be3eeed2f..afc392402cd27 100644 --- a/onnxruntime/python/tools/transformers/benchmark.py +++ b/onnxruntime/python/tools/transformers/benchmark.py @@ -258,7 +258,8 @@ def run_onnxruntime( } logger.info( - "Run onnxruntime on {} with input shape {}".format(model_name, [batch_size, sequence_length]) + # "Run onnxruntime on {} with input shape {}".format(model_name, [batch_size, sequence_length]) + "Run onnxruntime on {} with input shape {}".format(model_name, [batch_size, 3, 224, 224]) ) if disable_ort_io_binding: @@ -359,7 +360,8 @@ def run_pytorch( if max_input_size is not None and sequence_length > max_input_size: continue - logger.info("Run PyTorch on {} with input shape {}".format(model_name, [batch_size, sequence_length])) + # logger.info("Run PyTorch on {} with input shape {}".format(model_name, [batch_size, sequence_length])) + logger.info("Run PyTorch on {} with input shape {}".format(model_name, [batch_size, 3, 224, 224])) input_ids = torch.randint( low=0, high=config.vocab_size - 1, diff --git a/onnxruntime/python/tools/transformers/onnx_exporter.py b/onnxruntime/python/tools/transformers/onnx_exporter.py index c4dda99496ebe..f947d04553c30 100644 --- a/onnxruntime/python/tools/transformers/onnx_exporter.py +++ b/onnxruntime/python/tools/transformers/onnx_exporter.py @@ -10,13 +10,23 @@ from pathlib import Path import numpy +import requests import torch from affinity_helper import AffinitySetting 
from benchmark_helper import OptimizerInfo, Precision, create_onnxruntime_session from huggingface_models import MODEL_CLASSES +from PIL import Image from quantize_helper import QuantizeHelper from torch_onnx_export_helper import torch_onnx_export -from transformers import AutoConfig, AutoTokenizer, LxmertConfig, TransfoXLConfig +from transformers import ( + AutoConfig, + AutoModelForImageClassification, + AutoTokenizer, + LxmertConfig, + TransfoXLConfig, + ViTForImageClassification, + ViTImageProcessor, +) sys.path.append(os.path.join(os.path.dirname(__file__), "models", "gpt2")) from gpt2_helper import PRETRAINED_GPT2_MODELS, GPT2ModelNoPastState, TFGPT2ModelNoPastState @@ -49,9 +59,11 @@ def restore_torch_functions(): def create_onnxruntime_input(vocab_size, batch_size, sequence_length, input_names, config, data_type=numpy.int64): - input_ids = numpy.random.randint(low=0, high=vocab_size - 1, size=(batch_size, sequence_length), dtype=data_type) + # input_ids = numpy.random.randint(low=0, high=vocab_size - 1, size=(batch_size, sequence_length), dtype=data_type) + input_ids = numpy.random.rand(batch_size, 3, 224, 224).astype(numpy.float32) - inputs = {"input_ids": input_ids} + # inputs = {"input_ids": input_ids} + inputs = {"pixel_values": input_ids} if "attention_mask" in input_names: attention_mask = numpy.ones([batch_size, sequence_length], dtype=data_type) @@ -106,6 +118,15 @@ def build_dynamic_axes(example_inputs, outputs_flatten): dynamic_axes[output_name].update({j: "seq_len"}) return dynamic_axes, output_names +def build_dynamic_axes_vit(example_inputs, outputs_flatten): + # dynamic_axes={ + # 'pixel_values': {0: 'batch_size', 1: 'num_channels', 2: 'height', 3:'width'}, + # 'logits': {0: 'batch_size', 1: 'sequence_length'} + # } + + dynamic_axes = {key: {0: "pixel_values"} for key in example_inputs.keys()} + output_names = ["logits"] + return dynamic_axes, output_names def validate_onnx_model( onnx_model_path, @@ -439,7 +460,8 @@ def validate_and_optimize_onnx( model_fusion_statistics, ) - return onnx_model_path, is_valid_onnx_model, config.vocab_size + # return onnx_model_path, is_valid_onnx_model, config.vocab_size + return onnx_model_path, is_valid_onnx_model, config.num_labels def export_onnx_model_from_pt( @@ -466,6 +488,7 @@ def export_onnx_model_from_pt( # config, model = load_pt_model_from_tf(model_name) model.cpu() + """ tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_dir) max_input_size = ( tokenizer.max_model_input_sizes[model_name] if model_name in tokenizer.max_model_input_sizes else 1024 @@ -474,13 +497,25 @@ def export_onnx_model_from_pt( example_inputs = tokenizer.encode_plus("This is a sample input", return_tensors="pt") example_inputs = filter_inputs(example_inputs, input_names) + """ + + # url = 'http://images.cocodataset.org/val2017/000000039769.jpg' # Egyptian cats + # image = Image.open(requests.get(url, stream=True).raw) + # processor = ViTImageProcessor.from_pretrained(model_name) + # model = ViTForImageClassification.from_pretrained(model_name) + model = AutoModelForImageClassification.from_pretrained(model_name) + + # example_inputs = processor(images=image, return_tensors="pt") + + max_input_size = 1024 # What to use for ViT? 
+ example_inputs = inputs = { 'pixel_values' : torch.rand(2,3,224,224) } example_outputs = model(**example_inputs) - assert isinstance(example_outputs, (list, tuple)), f"type of output is not list or tuple: {type(example_outputs)}" + # assert isinstance(example_outputs, (list, tuple)), f"type of output is not list or tuple: {type(example_outputs)}" # Flatten is needed for gpt2 and distilgpt2. - example_outputs_flatten = flatten(example_outputs) + example_outputs_flatten = flatten(example_outputs['logits']) example_outputs_flatten = update_flatten_list(example_outputs_flatten, []) onnx_model_path = get_onnx_file_path( @@ -498,7 +533,8 @@ def export_onnx_model_from_pt( logger.info("Exporting ONNX model to {}".format(onnx_model_path)) Path(onnx_model_path).parent.mkdir(parents=True, exist_ok=True) - dynamic_axes, output_names = build_dynamic_axes(example_inputs, example_outputs_flatten) + # dynamic_axes, output_names = build_dynamic_axes(example_inputs, example_outputs_flatten) + dynamic_axes, output_names = build_dynamic_axes_vit(example_inputs, example_outputs_flatten) replace_torch_functions() torch_onnx_export( diff --git a/onnxruntime/python/tools/transformers/run_benchmark.sh b/onnxruntime/python/tools/transformers/run_benchmark.sh index f0422839c11eb..ee33555b8e526 100644 --- a/onnxruntime/python/tools/transformers/run_benchmark.sh +++ b/onnxruntime/python/tools/transformers/run_benchmark.sh @@ -14,7 +14,7 @@ use_package=true # only need once -run_install=true +run_install=false # Engines to test. # To run ort_trt, you need to build and install the onnxruntime-gpu-tensorrt package on your own @@ -49,7 +49,8 @@ layer_number=16 # Batch Sizes and Sequence Lengths batch_sizes="1 4" -sequence_lengths="8 16 32 64 128 256 512 1024" +sequence_lengths="32 64" +# 8 16 32 64 128 256 512 1024" # Number of inputs (input_ids, token_type_ids, attention_mask) for ONNX model. # Not that different input count might lead to different performance @@ -57,7 +58,8 @@ sequence_lengths="8 16 32 64 128 256 512 1024" input_counts=1 # Pretrained transformers models can be a subset of: bert-base-cased roberta-base gpt2 distilgpt2 distilbert-base-uncased -models_to_test="bert-base-cased roberta-base distilbert-base-uncased" +models_to_test="google/vit-base-patch16-224" +# bert-base-cased roberta-base distilbert-base-uncased" # If you have mutliple GPUs, you can choose one GPU for test. Here is an example to use the second GPU: # export CUDA_VISIBLE_DEVICES=1 From e27e1fd73b656bf8489b8e0f89f4c86d40c16c15 Mon Sep 17 00:00:00 2001 From: Ryan Hill Date: Wed, 8 Mar 2023 20:27:50 -0800 Subject: [PATCH 03/22] Fix pytorch for VIT --- onnxruntime/python/tools/transformers/benchmark.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/onnxruntime/python/tools/transformers/benchmark.py b/onnxruntime/python/tools/transformers/benchmark.py index afc392402cd27..7de3e881a621c 100644 --- a/onnxruntime/python/tools/transformers/benchmark.py +++ b/onnxruntime/python/tools/transformers/benchmark.py @@ -334,11 +334,15 @@ def run_pytorch( cache_dir=cache_dir, custom_model_class=model_class, ) + """ tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_dir) max_input_size = ( tokenizer.max_model_input_sizes[model_name] if model_name in tokenizer.max_model_input_sizes else 1024 ) + """ + + max_input_size = 1024 # What to use for ViT? 
logger.debug(f"Model {model}") logger.debug(f"Number of parameters {model.num_parameters()}") @@ -362,6 +366,8 @@ def run_pytorch( # logger.info("Run PyTorch on {} with input shape {}".format(model_name, [batch_size, sequence_length])) logger.info("Run PyTorch on {} with input shape {}".format(model_name, [batch_size, 3, 224, 224])) + input_ids = torch.randn(size=(batch_size, 3, 224, 224),dtype=torch.float, device=device) + """ input_ids = torch.randint( low=0, high=config.vocab_size - 1, @@ -369,6 +375,7 @@ def run_pytorch( dtype=torch.long, device=device, ) + """ try: inference = ( torch.jit.trace(model, input_ids) if torchscript else torch.compile(model) if torch2 else model From b6b313f54099000fb5fe29cdcabc67020ac606dc Mon Sep 17 00:00:00 2001 From: Ryan Hill Date: Tue, 14 Mar 2023 13:51:10 -0700 Subject: [PATCH 04/22] Update --- .../python/tools/transformers/benchmark.py | 39 ++++++++------- .../tools/transformers/onnx_exporter.py | 50 ++++++++----------- 2 files changed, 41 insertions(+), 48 deletions(-) diff --git a/onnxruntime/python/tools/transformers/benchmark.py b/onnxruntime/python/tools/transformers/benchmark.py index 7de3e881a621c..261e42e2a799f 100644 --- a/onnxruntime/python/tools/transformers/benchmark.py +++ b/onnxruntime/python/tools/transformers/benchmark.py @@ -334,15 +334,15 @@ def run_pytorch( cache_dir=cache_dir, custom_model_class=model_class, ) - """ - tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_dir) - max_input_size = ( - tokenizer.max_model_input_sizes[model_name] if model_name in tokenizer.max_model_input_sizes else 1024 - ) - """ + if config.model_type == "vit": + max_input_size = 1024 # What to use for ViT? + else: + tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_dir) - max_input_size = 1024 # What to use for ViT? 
+ max_input_size = ( + tokenizer.max_model_input_sizes[model_name] if model_name in tokenizer.max_model_input_sizes else 1024 + ) logger.debug(f"Model {model}") logger.debug(f"Number of parameters {model.num_parameters()}") @@ -364,18 +364,19 @@ def run_pytorch( if max_input_size is not None and sequence_length > max_input_size: continue - # logger.info("Run PyTorch on {} with input shape {}".format(model_name, [batch_size, sequence_length])) - logger.info("Run PyTorch on {} with input shape {}".format(model_name, [batch_size, 3, 224, 224])) - input_ids = torch.randn(size=(batch_size, 3, 224, 224),dtype=torch.float, device=device) - """ - input_ids = torch.randint( - low=0, - high=config.vocab_size - 1, - size=(batch_size, sequence_length), - dtype=torch.long, - device=device, - ) - """ + if config.model_type == "vit": + logger.info("Run PyTorch on {} with input shape {}".format(model_name, [batch_size, 3, 224, 224])) + input_ids = torch.randn(size=(batch_size, 3, 224, 224),dtype=torch.float, device=device) + else: + logger.info("Run PyTorch on {} with input shape {}".format(model_name, [batch_size, sequence_length])) + input_ids = torch.randint( + low=0, + high=config.vocab_size - 1, + size=(batch_size, sequence_length), + dtype=torch.long, + device=device, + ) + try: inference = ( torch.jit.trace(model, input_ids) if torchscript else torch.compile(model) if torch2 else model diff --git a/onnxruntime/python/tools/transformers/onnx_exporter.py b/onnxruntime/python/tools/transformers/onnx_exporter.py index f947d04553c30..d2bff922a1b77 100644 --- a/onnxruntime/python/tools/transformers/onnx_exporter.py +++ b/onnxruntime/python/tools/transformers/onnx_exporter.py @@ -119,11 +119,6 @@ def build_dynamic_axes(example_inputs, outputs_flatten): return dynamic_axes, output_names def build_dynamic_axes_vit(example_inputs, outputs_flatten): - # dynamic_axes={ - # 'pixel_values': {0: 'batch_size', 1: 'num_channels', 2: 'height', 3:'width'}, - # 'logits': {0: 'batch_size', 1: 'sequence_length'} - # } - dynamic_axes = {key: {0: "pixel_values"} for key in example_inputs.keys()} output_names = ["logits"] return dynamic_axes, output_names @@ -311,6 +306,9 @@ def modelclass_dispatcher(model_name, custom_model_class): def load_pretrained_model(model_name, config, cache_dir, custom_model_class, is_tf_model=False): + if config.model_type=="vit": + return AutoModelForImageClassification.from_pretrained(model_name, config=config, cache_dir=cache_dir) + model_class_name = modelclass_dispatcher(model_name, custom_model_class) if model_class_name == "GPT2ModelNoPastState": @@ -488,34 +486,26 @@ def export_onnx_model_from_pt( # config, model = load_pt_model_from_tf(model_name) model.cpu() - """ - tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_dir) - max_input_size = ( - tokenizer.max_model_input_sizes[model_name] if model_name in tokenizer.max_model_input_sizes else 1024 - ) - - example_inputs = tokenizer.encode_plus("This is a sample input", return_tensors="pt") + if config.model_type == "vit": + max_input_size = 1024 # What to use for ViT? 
- example_inputs = filter_inputs(example_inputs, input_names) - """ - - # url = 'http://images.cocodataset.org/val2017/000000039769.jpg' # Egyptian cats - # image = Image.open(requests.get(url, stream=True).raw) - # processor = ViTImageProcessor.from_pretrained(model_name) - # model = ViTForImageClassification.from_pretrained(model_name) - model = AutoModelForImageClassification.from_pretrained(model_name) - - # example_inputs = processor(images=image, return_tensors="pt") + example_inputs = inputs = { 'pixel_values' : torch.rand(2,3,224,224) } + example_outputs = model(**example_inputs) + else: + tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_dir) + max_input_size = ( + tokenizer.max_model_input_sizes[model_name] if model_name in tokenizer.max_model_input_sizes else 1024 + ) - max_input_size = 1024 # What to use for ViT? + example_inputs = tokenizer.encode_plus("This is a sample input", return_tensors="pt") - example_inputs = inputs = { 'pixel_values' : torch.rand(2,3,224,224) } - example_outputs = model(**example_inputs) + example_inputs = filter_inputs(example_inputs, input_names) - # assert isinstance(example_outputs, (list, tuple)), f"type of output is not list or tuple: {type(example_outputs)}" + example_outputs = model(**example_inputs) + assert isinstance(example_outputs, (list, tuple)), f"type of output is not list or tuple: {type(example_outputs)}" # Flatten is needed for gpt2 and distilgpt2. - example_outputs_flatten = flatten(example_outputs['logits']) + example_outputs_flatten = flatten(example_outputs) example_outputs_flatten = update_flatten_list(example_outputs_flatten, []) onnx_model_path = get_onnx_file_path( @@ -533,8 +523,10 @@ def export_onnx_model_from_pt( logger.info("Exporting ONNX model to {}".format(onnx_model_path)) Path(onnx_model_path).parent.mkdir(parents=True, exist_ok=True) - # dynamic_axes, output_names = build_dynamic_axes(example_inputs, example_outputs_flatten) - dynamic_axes, output_names = build_dynamic_axes_vit(example_inputs, example_outputs_flatten) + if config.model_type == "vit": + dynamic_axes, output_names = build_dynamic_axes_vit(example_inputs, example_outputs_flatten) + else: + dynamic_axes, output_names = build_dynamic_axes(example_inputs, example_outputs_flatten) replace_torch_functions() torch_onnx_export( From 807afeda28afca9098430ecbab8a986674c8e3c7 Mon Sep 17 00:00:00 2001 From: Ryan Hill Date: Tue, 14 Mar 2023 15:37:52 -0700 Subject: [PATCH 05/22] Simplifiy & cleanup --- .../python/tools/transformers/benchmark.py | 44 ++++++++++++------ .../tools/transformers/dev_benchmark.cmd | 14 +++--- .../tools/transformers/fusion_attention.py | 4 +- .../python/tools/transformers/import.py | 20 -------- .../tools/transformers/onnx_exporter.py | 46 ++++++++++++++----- 5 files changed, 70 insertions(+), 58 deletions(-) delete mode 100644 onnxruntime/python/tools/transformers/import.py diff --git a/onnxruntime/python/tools/transformers/benchmark.py b/onnxruntime/python/tools/transformers/benchmark.py index 23f1be3eeed2f..804ae83dcc12e 100644 --- a/onnxruntime/python/tools/transformers/benchmark.py +++ b/onnxruntime/python/tools/transformers/benchmark.py @@ -257,9 +257,14 @@ def run_onnxruntime( "datetime": str(datetime.now()), } - logger.info( - "Run onnxruntime on {} with input shape {}".format(model_name, [batch_size, sequence_length]) - ) + if config.model_type == "vit": + logger.info( + "Run onnxruntime on {} with input shape {}".format(model_name, [batch_size, 3, 224, 224]) + ) + else: + logger.info( + "Run onnxruntime on 
{} with input shape {}".format(model_name, [batch_size, sequence_length]) + ) if disable_ort_io_binding: result = inference_ort( @@ -333,11 +338,15 @@ def run_pytorch( cache_dir=cache_dir, custom_model_class=model_class, ) - tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_dir) - max_input_size = ( - tokenizer.max_model_input_sizes[model_name] if model_name in tokenizer.max_model_input_sizes else 1024 - ) + if config.model_type == "vit": + max_input_size = 1024 # What to use for ViT? + else: + tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_dir) + + max_input_size = ( + tokenizer.max_model_input_sizes[model_name] if model_name in tokenizer.max_model_input_sizes else 1024 + ) logger.debug(f"Model {model}") logger.debug(f"Number of parameters {model.num_parameters()}") @@ -359,14 +368,19 @@ def run_pytorch( if max_input_size is not None and sequence_length > max_input_size: continue - logger.info("Run PyTorch on {} with input shape {}".format(model_name, [batch_size, sequence_length])) - input_ids = torch.randint( - low=0, - high=config.vocab_size - 1, - size=(batch_size, sequence_length), - dtype=torch.long, - device=device, - ) + if config.model_type == "vit": + logger.info("Run PyTorch on {} with input shape {}".format(model_name, [batch_size, 3, 224, 224])) + input_ids = torch.randn(size=(batch_size, 3, 224, 224),dtype=torch.float, device=device) + else: + logger.info("Run PyTorch on {} with input shape {}".format(model_name, [batch_size, sequence_length])) + input_ids = torch.randint( + low=0, + high=config.vocab_size - 1, + size=(batch_size, sequence_length), + dtype=torch.long, + device=device, + ) + try: inference = ( torch.jit.trace(model, input_ids) if torchscript else torch.compile(model) if torch2 else model diff --git a/onnxruntime/python/tools/transformers/dev_benchmark.cmd b/onnxruntime/python/tools/transformers/dev_benchmark.cmd index c3de6519bd197..7a9b3254a1708 100644 --- a/onnxruntime/python/tools/transformers/dev_benchmark.cmd +++ b/onnxruntime/python/tools/transformers/dev_benchmark.cmd @@ -21,27 +21,25 @@ set run_torchscript=false REM Devices to test. REM Attention: You cannot run both CPU and GPU at the same time: gpu need onnxruntime-gpu, and CPU need onnxruntime. -set run_gpu_fp32=true +set run_gpu_fp32=false set run_gpu_fp16=false -set run_cpu_fp32=false -set run_cpu_int8=false +set run_cpu_fp32=true +set run_cpu_int8=true set average_over=100 REM Enable optimizer (use script instead of OnnxRuntime for graph optimization) set use_optimizer=true -set batch_sizes=1 4 -set sequence_length=32 64 +set batch_sizes=1 +set sequence_length=8 128 REM Number of inputs (input_ids, token_type_ids, attention_mask) for ONNX model. REM Note that different input count might lead to different performance set input_counts=1 REM Pretrained transformers models can be a subset of: bert-base-cased roberta-base gpt2 distilgpt2 distilbert-base-uncased -REM set models_to_test=bert-base-cased -set models_to_test="google/vit-base-patch16-224" -REM set models_to_test="google/vit-base-patch32" +set models_to_test=bert-base-cased REM If you have mutliple GPUs, you can choose one GPU for test. 
Here is an example to use the second GPU: REM set CUDA_VISIBLE_DEVICES=1 diff --git a/onnxruntime/python/tools/transformers/fusion_attention.py b/onnxruntime/python/tools/transformers/fusion_attention.py index 82d8095328765..c834862f062ca 100644 --- a/onnxruntime/python/tools/transformers/fusion_attention.py +++ b/onnxruntime/python/tools/transformers/fusion_attention.py @@ -586,10 +586,8 @@ def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node): ], output_name_to_node, ) -# if mask_nodes is None: -# logger.debug("fuse_attention: failed to match mask path") -# return + # ViT models have no mask nodes, so only do them for models that have them if mask_nodes is not None and len(mask_nodes) > 1 and mask_nodes[0].op_type == "Mul": _, mul_val = self.model.get_constant_input(mask_nodes[0]) if mul_val != -10000: diff --git a/onnxruntime/python/tools/transformers/import.py b/onnxruntime/python/tools/transformers/import.py deleted file mode 100644 index 9c734e459ae40..0000000000000 --- a/onnxruntime/python/tools/transformers/import.py +++ /dev/null @@ -1,20 +0,0 @@ -from transformers import AutoImageProcessor, ViTModel -import torch -from datasets import load_dataset - -dataset = load_dataset("huggingface/cats-image") -image = dataset["test"]["image"][0] - -image_processor = AutoImageProcessor.from_pretrained("google/vit-base-patch16-224-in21k") -model = ViTModel.from_pretrained("google/vit-base-patch16-224-in21k") - -inputs = image_processor(image, return_tensors="pt") - -with torch.no_grad(): - outputs = model(**inputs) - -# last_hidden_states = outputs.last_hidden_state -# list(last_hidden_states.shape) - -# print(inputs) -torch.onnx.export(model, inputs['pixel_values'], "TestModel.onnx", verbose=True) diff --git a/onnxruntime/python/tools/transformers/onnx_exporter.py b/onnxruntime/python/tools/transformers/onnx_exporter.py index c4dda99496ebe..c490be2499826 100644 --- a/onnxruntime/python/tools/transformers/onnx_exporter.py +++ b/onnxruntime/python/tools/transformers/onnx_exporter.py @@ -16,7 +16,7 @@ from huggingface_models import MODEL_CLASSES from quantize_helper import QuantizeHelper from torch_onnx_export_helper import torch_onnx_export -from transformers import AutoConfig, AutoTokenizer, LxmertConfig, TransfoXLConfig +from transformers import AutoConfig, AutoModelForImageClassification, AutoTokenizer, LxmertConfig, TransfoXLConfig sys.path.append(os.path.join(os.path.dirname(__file__), "models", "gpt2")) from gpt2_helper import PRETRAINED_GPT2_MODELS, GPT2ModelNoPastState, TFGPT2ModelNoPastState @@ -49,8 +49,12 @@ def restore_torch_functions(): def create_onnxruntime_input(vocab_size, batch_size, sequence_length, input_names, config, data_type=numpy.int64): - input_ids = numpy.random.randint(low=0, high=vocab_size - 1, size=(batch_size, sequence_length), dtype=data_type) + if config.model_type=="vit": + input_ids = numpy.random.rand(batch_size, 3, 224, 224).astype(numpy.float32) + inputs = {"pixel_values": input_ids} + return inputs + input_ids = numpy.random.randint(low=0, high=vocab_size - 1, size=(batch_size, sequence_length), dtype=data_type) inputs = {"input_ids": input_ids} if "attention_mask" in input_names: @@ -106,6 +110,10 @@ def build_dynamic_axes(example_inputs, outputs_flatten): dynamic_axes[output_name].update({j: "seq_len"}) return dynamic_axes, output_names +def build_dynamic_axes_vit(example_inputs, outputs_flatten): + dynamic_axes = {key: {0: "pixel_values"} for key in example_inputs.keys()} + output_names = ["logits"] + return dynamic_axes, 
output_names def validate_onnx_model( onnx_model_path, @@ -290,6 +298,9 @@ def modelclass_dispatcher(model_name, custom_model_class): def load_pretrained_model(model_name, config, cache_dir, custom_model_class, is_tf_model=False): + if config.model_type=="vit": + return AutoModelForImageClassification.from_pretrained(model_name, config=config, cache_dir=cache_dir) + model_class_name = modelclass_dispatcher(model_name, custom_model_class) if model_class_name == "GPT2ModelNoPastState": @@ -439,7 +450,10 @@ def validate_and_optimize_onnx( model_fusion_statistics, ) - return onnx_model_path, is_valid_onnx_model, config.vocab_size + if config.model_type == "vit": + return onnx_model_path, is_valid_onnx_model, config.num_labels + else: + return onnx_model_path, is_valid_onnx_model, config.vocab_size def export_onnx_model_from_pt( @@ -466,18 +480,23 @@ def export_onnx_model_from_pt( # config, model = load_pt_model_from_tf(model_name) model.cpu() - tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_dir) - max_input_size = ( - tokenizer.max_model_input_sizes[model_name] if model_name in tokenizer.max_model_input_sizes else 1024 - ) + if config.model_type == "vit": + max_input_size = 1024 # What to use for ViT? - example_inputs = tokenizer.encode_plus("This is a sample input", return_tensors="pt") + example_inputs = inputs = { 'pixel_values' : torch.rand(2,3,224,224) } + example_outputs = model(**example_inputs) + else: + tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_dir) + max_input_size = ( + tokenizer.max_model_input_sizes[model_name] if model_name in tokenizer.max_model_input_sizes else 1024 + ) - example_inputs = filter_inputs(example_inputs, input_names) + example_inputs = tokenizer.encode_plus("This is a sample input", return_tensors="pt") - example_outputs = model(**example_inputs) + example_inputs = filter_inputs(example_inputs, input_names) - assert isinstance(example_outputs, (list, tuple)), f"type of output is not list or tuple: {type(example_outputs)}" + example_outputs = model(**example_inputs) + assert isinstance(example_outputs, (list, tuple)), f"type of output is not list or tuple: {type(example_outputs)}" # Flatten is needed for gpt2 and distilgpt2. 
example_outputs_flatten = flatten(example_outputs) @@ -498,7 +517,10 @@ def export_onnx_model_from_pt( logger.info("Exporting ONNX model to {}".format(onnx_model_path)) Path(onnx_model_path).parent.mkdir(parents=True, exist_ok=True) - dynamic_axes, output_names = build_dynamic_axes(example_inputs, example_outputs_flatten) + if config.model_type == "vit": + dynamic_axes, output_names = build_dynamic_axes_vit(example_inputs, example_outputs_flatten) + else: + dynamic_axes, output_names = build_dynamic_axes(example_inputs, example_outputs_flatten) replace_torch_functions() torch_onnx_export( From 6054a5d89268260b3a23c4756958a2c5bbf5d57c Mon Sep 17 00:00:00 2001 From: Ryan Hill Date: Tue, 14 Mar 2023 15:40:36 -0700 Subject: [PATCH 06/22] Fix --- .../python/tools/transformers/huggingface_models.py | 5 +++-- onnxruntime/python/tools/transformers/onnx_exporter.py | 2 -- onnxruntime/python/tools/transformers/run_benchmark.sh | 8 +++----- 3 files changed, 6 insertions(+), 9 deletions(-) diff --git a/onnxruntime/python/tools/transformers/huggingface_models.py b/onnxruntime/python/tools/transformers/huggingface_models.py index 0aabcad3de2a2..47595db17d3a6 100644 --- a/onnxruntime/python/tools/transformers/huggingface_models.py +++ b/onnxruntime/python/tools/transformers/huggingface_models.py @@ -16,8 +16,6 @@ # List of pretrained models: https://huggingface.co/transformers/pretrained_models.html # Pretrained model name to a tuple of input names, opset_version, use_external_data_format, optimization model type MODELS = { - "google/vit-base-patch16-224": (["input_ids"], 12, False, "bert"), - # BERT "bert-base-uncased": ( ["input_ids", "attention_mask", "token_type_ids"], @@ -160,4 +158,7 @@ ), # "google/pegasus-xsum": (["input_ids"], 11, False, "bert"), # "google/pegasus-large": (["input_ids"], 11, False, "bert"), + + # ViT + "google/vit-base-patch16-224": (["input_ids"], 12, False, "bert"), } diff --git a/onnxruntime/python/tools/transformers/onnx_exporter.py b/onnxruntime/python/tools/transformers/onnx_exporter.py index e7faca6699747..c490be2499826 100644 --- a/onnxruntime/python/tools/transformers/onnx_exporter.py +++ b/onnxruntime/python/tools/transformers/onnx_exporter.py @@ -10,12 +10,10 @@ from pathlib import Path import numpy -import requests import torch from affinity_helper import AffinitySetting from benchmark_helper import OptimizerInfo, Precision, create_onnxruntime_session from huggingface_models import MODEL_CLASSES -from PIL import Image from quantize_helper import QuantizeHelper from torch_onnx_export_helper import torch_onnx_export from transformers import AutoConfig, AutoModelForImageClassification, AutoTokenizer, LxmertConfig, TransfoXLConfig diff --git a/onnxruntime/python/tools/transformers/run_benchmark.sh b/onnxruntime/python/tools/transformers/run_benchmark.sh index ee33555b8e526..f0422839c11eb 100644 --- a/onnxruntime/python/tools/transformers/run_benchmark.sh +++ b/onnxruntime/python/tools/transformers/run_benchmark.sh @@ -14,7 +14,7 @@ use_package=true # only need once -run_install=false +run_install=true # Engines to test. # To run ort_trt, you need to build and install the onnxruntime-gpu-tensorrt package on your own @@ -49,8 +49,7 @@ layer_number=16 # Batch Sizes and Sequence Lengths batch_sizes="1 4" -sequence_lengths="32 64" -# 8 16 32 64 128 256 512 1024" +sequence_lengths="8 16 32 64 128 256 512 1024" # Number of inputs (input_ids, token_type_ids, attention_mask) for ONNX model. 
# Not that different input count might lead to different performance @@ -58,8 +57,7 @@ sequence_lengths="32 64" input_counts=1 # Pretrained transformers models can be a subset of: bert-base-cased roberta-base gpt2 distilgpt2 distilbert-base-uncased -models_to_test="google/vit-base-patch16-224" -# bert-base-cased roberta-base distilbert-base-uncased" +models_to_test="bert-base-cased roberta-base distilbert-base-uncased" # If you have mutliple GPUs, you can choose one GPU for test. Here is an example to use the second GPU: # export CUDA_VISIBLE_DEVICES=1 From 0d8b4e939d7a4afd7afa17f64622c8eebffbe48b Mon Sep 17 00:00:00 2001 From: Ryan Hill Date: Wed, 22 Mar 2023 20:16:50 -0700 Subject: [PATCH 07/22] Stash changes --- onnxruntime/python/tools/transformers/benchmark.py | 4 ++-- onnxruntime/python/tools/transformers/onnx_exporter.py | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/onnxruntime/python/tools/transformers/benchmark.py b/onnxruntime/python/tools/transformers/benchmark.py index 804ae83dcc12e..3c8114a04247c 100644 --- a/onnxruntime/python/tools/transformers/benchmark.py +++ b/onnxruntime/python/tools/transformers/benchmark.py @@ -340,7 +340,7 @@ def run_pytorch( ) if config.model_type == "vit": - max_input_size = 1024 # What to use for ViT? + max_input_size = 1024 # Just needs to be greater than sequence_length else: tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_dir) @@ -370,7 +370,7 @@ def run_pytorch( if config.model_type == "vit": logger.info("Run PyTorch on {} with input shape {}".format(model_name, [batch_size, 3, 224, 224])) - input_ids = torch.randn(size=(batch_size, 3, 224, 224),dtype=torch.float, device=device) + input_ids = torch.randn(size=(batch_size, 3, 224, 224),dtype=torch.float16 if precision == Precision.FLOAT16 else torch.float32, device=device) else: logger.info("Run PyTorch on {} with input shape {}".format(model_name, [batch_size, sequence_length])) input_ids = torch.randint( diff --git a/onnxruntime/python/tools/transformers/onnx_exporter.py b/onnxruntime/python/tools/transformers/onnx_exporter.py index c490be2499826..41159b8189296 100644 --- a/onnxruntime/python/tools/transformers/onnx_exporter.py +++ b/onnxruntime/python/tools/transformers/onnx_exporter.py @@ -481,7 +481,7 @@ def export_onnx_model_from_pt( model.cpu() if config.model_type == "vit": - max_input_size = 1024 # What to use for ViT? 
+ max_input_size = 1024 # Just needs to be greater than sequence_length example_inputs = inputs = { 'pixel_values' : torch.rand(2,3,224,224) } example_outputs = model(**example_inputs) From 441b555d2355476a63ebabf85e299f41eb314cb5 Mon Sep 17 00:00:00 2001 From: Ryan Hill Date: Tue, 4 Apr 2023 15:16:00 -0700 Subject: [PATCH 08/22] Stash changes --- onnxruntime/python/tools/transformers/huggingface_models.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onnxruntime/python/tools/transformers/huggingface_models.py b/onnxruntime/python/tools/transformers/huggingface_models.py index 47595db17d3a6..cf868a55f6433 100644 --- a/onnxruntime/python/tools/transformers/huggingface_models.py +++ b/onnxruntime/python/tools/transformers/huggingface_models.py @@ -160,5 +160,5 @@ # "google/pegasus-large": (["input_ids"], 11, False, "bert"), # ViT - "google/vit-base-patch16-224": (["input_ids"], 12, False, "bert"), + "google/vit-base-patch16-224": (["input_ids"], 12, False, "vit"), } From fe934a102d7956f6983b69dd01c233d30cad1b3e Mon Sep 17 00:00:00 2001 From: Ryan Hill Date: Thu, 6 Apr 2023 14:22:16 -0700 Subject: [PATCH 09/22] Swin benchmarking --- .../python/tools/transformers/benchmark.py | 6 ++-- .../tools/transformers/fusion_attention.py | 12 ++++---- .../tools/transformers/huggingface_models.py | 5 ++-- .../tools/transformers/onnx_exporter.py | 29 ++++++++----------- .../python/tools/transformers/optimizer.py | 3 +- 5 files changed, 26 insertions(+), 29 deletions(-) diff --git a/onnxruntime/python/tools/transformers/benchmark.py b/onnxruntime/python/tools/transformers/benchmark.py index 992a70fecc601..376cbc22dbe40 100644 --- a/onnxruntime/python/tools/transformers/benchmark.py +++ b/onnxruntime/python/tools/transformers/benchmark.py @@ -262,7 +262,7 @@ def run_onnxruntime( "datetime": str(datetime.now()), } - if config.model_type == "vit": + if config.model_type == "vit" or config.model_type == "swin": logger.info(f"Run onnxruntime on {model_name} with input shape {[batch_size, 3, 224, 224]}") else: logger.info(f"Run onnxruntime on {model_name} with input shape {[batch_size, sequence_length]}") @@ -340,7 +340,7 @@ def run_pytorch( custom_model_class=model_class, ) - if config.model_type == "vit": + if config.model_type == "vit" or config.model_type == "swin": max_input_size = 1024 # Just needs to be greater than sequence_length else: tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_dir) @@ -369,7 +369,7 @@ def run_pytorch( if max_input_size is not None and sequence_length > max_input_size: continue - if config.model_type == "vit": + if config.model_type == "vit" or config.model_type == "swin": logger.info(f"Run PyTorch on {model_name} with input shape {[batch_size, 3, 224, 224]}") input_ids = torch.randn(size=(batch_size, 3, 224, 224),dtype=torch.float16 if precision == Precision.FLOAT16 else torch.float32, device=device) else: diff --git a/onnxruntime/python/tools/transformers/fusion_attention.py b/onnxruntime/python/tools/transformers/fusion_attention.py index 79cbf83e1c214..c477ce08db73a 100644 --- a/onnxruntime/python/tools/transformers/fusion_attention.py +++ b/onnxruntime/python/tools/transformers/fusion_attention.py @@ -458,13 +458,13 @@ def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node): if input not in output_name_to_node: continue - if input == qkv_nodes[0].output[0]: - continue - other_inputs.append(input) - if len(other_inputs) != 1: - return + if input == qkv_nodes[0].output[0]: + continue + other_inputs.append(input) + if 
len(other_inputs) != 1: + return - root_input = other_inputs[0] + root_input = other_inputs[0] """ diff --git a/onnxruntime/python/tools/transformers/huggingface_models.py b/onnxruntime/python/tools/transformers/huggingface_models.py index d00d5cabf33f7..f6899a6527d6a 100644 --- a/onnxruntime/python/tools/transformers/huggingface_models.py +++ b/onnxruntime/python/tools/transformers/huggingface_models.py @@ -156,10 +156,11 @@ False, "bert", ), - "google/vit-base-patch16-224": (["pixel_values"], 12, False, "vit"), # "google/pegasus-xsum": (["input_ids"], 11, False, "bert"), # "google/pegasus-large": (["input_ids"], 11, False, "bert"), # ViT - "google/vit-base-patch16-224": (["input_ids"], 12, False, "vit"), + "google/vit-base-patch16-224": (["pixel_values"], 12, False, "vit"), + # Swin + "microsoft/swin-base-patch4-window7-224": (["pixel_values"], 12, False, "swin"), } diff --git a/onnxruntime/python/tools/transformers/onnx_exporter.py b/onnxruntime/python/tools/transformers/onnx_exporter.py index bb3305ea25b41..b7ad1d40d98a6 100644 --- a/onnxruntime/python/tools/transformers/onnx_exporter.py +++ b/onnxruntime/python/tools/transformers/onnx_exporter.py @@ -16,7 +16,7 @@ from huggingface_models import MODEL_CLASSES from quantize_helper import QuantizeHelper from torch_onnx_export_helper import torch_onnx_export -from transformers import AutoConfig, AutoFeatureExtractor, AutoTokenizer, LxmertConfig, TransfoXLConfig +from transformers import AutoConfig, AutoModelForImageClassification, AutoFeatureExtractor, AutoTokenizer, LxmertConfig, TransfoXLConfig sys.path.append(os.path.join(os.path.dirname(__file__), "models", "gpt2")) from gpt2_helper import PRETRAINED_GPT2_MODELS, GPT2ModelNoPastState, TFGPT2ModelNoPastState # noqa: E402 @@ -110,11 +110,6 @@ def build_dynamic_axes(example_inputs, outputs_flatten): dynamic_axes[output_name].update({j: "seq_len"}) return dynamic_axes, output_names -def build_dynamic_axes_vit(example_inputs, outputs_flatten): - dynamic_axes = {key: {0: "pixel_values"} for key in example_inputs.keys()} - output_names = ["logits"] - return dynamic_axes, output_names - def validate_onnx_model( onnx_model_path, example_inputs, @@ -255,8 +250,8 @@ def optimize_onnx_model( opt_model = optimize_model( onnx_model_path, model_type, - num_heads=num_attention_heads, - hidden_size=hidden_size, + num_heads=num_attention_heads if model_type != "swin" else 0, + hidden_size=hidden_size if model_type != "swin" else 0, opt_level=0, optimization_options=optimization_options, use_gpu=use_gpu, @@ -298,9 +293,6 @@ def modelclass_dispatcher(model_name, custom_model_class): def load_pretrained_model(model_name, config, cache_dir, custom_model_class, is_tf_model=False): - if config.model_type=="vit": - return AutoModelForImageClassification.from_pretrained(model_name, config=config, cache_dir=cache_dir) - model_class_name = modelclass_dispatcher(model_name, custom_model_class) if model_class_name == "GPT2ModelNoPastState": @@ -479,8 +471,11 @@ def export_onnx_model_from_pt( example_inputs = None max_input_size = None - if model_type == "vit": - example_inputs = inputs = { 'pixel_values' : torch.rand(2,3,224,224) } + if model_type == "vit" or model_type == "swin": + image_processor = AutoFeatureExtractor.from_pretrained(model_name, cache_dir=cache_dir) + data = numpy.random.randint(low=0, high=256, size=224 * 224 * 3, dtype=numpy.uint8).reshape(224, 224, 3) + + example_inputs = image_processor(data, return_tensors="pt") else: tokenizer = AutoTokenizer.from_pretrained(model_name, 
cache_dir=cache_dir) max_input_size = ( @@ -489,10 +484,10 @@ def export_onnx_model_from_pt( example_inputs = tokenizer.encode_plus("This is a sample input", return_tensors="pt") - example_inputs = filter_inputs(example_inputs, input_names) + example_inputs = filter_inputs(example_inputs, input_names) - example_outputs = model(**example_inputs) - assert isinstance(example_outputs, (list, tuple)), f"type of output is not list or tuple: {type(example_outputs)}" + example_outputs = model(**example_inputs) + assert isinstance(example_outputs, (list, tuple)), f"type of output is not list or tuple: {type(example_outputs)}" # Flatten is needed for gpt2 and distilgpt2. example_outputs_flatten = flatten(example_outputs) @@ -516,7 +511,7 @@ def export_onnx_model_from_pt( dynamic_axes = None output_names = None - if model_type == "vit": + if model_type == "vit" or model_type == "swin": dynamic_axes, output_names = {key: {0: "pixel_values"} for key in example_inputs}, ["logits"] else: dynamic_axes, output_names = build_dynamic_axes(example_inputs, example_outputs_flatten) diff --git a/onnxruntime/python/tools/transformers/optimizer.py b/onnxruntime/python/tools/transformers/optimizer.py index 99ef58841d6cd..c73de6c52ad09 100644 --- a/onnxruntime/python/tools/transformers/optimizer.py +++ b/onnxruntime/python/tools/transformers/optimizer.py @@ -57,6 +57,7 @@ "vae": (VaeOnnxModel, "pytorch", 1), "clip": (ClipOnnxModel, "pytorch", 1), "vit": (BertOnnxModel, "pytorch", 1), + "swin": (BertOnnxModel, "pytorch", 1), } @@ -160,7 +161,7 @@ def optimize_by_fusion( Returns: object of an optimizer class. """ - if model_type not in ["bert", "unet", "vae", "clip"] and (num_heads == 0 or hidden_size == 0): + if model_type not in ["bert", "vit", "swin", "unet", "vae", "clip"] and (num_heads == 0 or hidden_size == 0): logger.warning(f"Please specify parameters of num_heads and hidden_size for model_type {model_type}") (optimizer_class, producer, _) = MODEL_TYPES[model_type] From dc650905eb87fc32bab1bc857577d7ca01ed2cda Mon Sep 17 00:00:00 2001 From: Ryan Hill Date: Thu, 6 Apr 2023 14:44:22 -0700 Subject: [PATCH 10/22] Add more models, fix one bug --- onnxruntime/python/tools/transformers/fusion_attention.py | 2 -- onnxruntime/python/tools/transformers/huggingface_models.py | 2 ++ onnxruntime/python/tools/transformers/onnx_exporter.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/onnxruntime/python/tools/transformers/fusion_attention.py b/onnxruntime/python/tools/transformers/fusion_attention.py index c477ce08db73a..47af94550910e 100644 --- a/onnxruntime/python/tools/transformers/fusion_attention.py +++ b/onnxruntime/python/tools/transformers/fusion_attention.py @@ -465,8 +465,6 @@ def fuse(self, normalize_node, input_name_to_nodes, output_name_to_node): return root_input = other_inputs[0] - - """ Match flaubert Mask | diff --git a/onnxruntime/python/tools/transformers/huggingface_models.py b/onnxruntime/python/tools/transformers/huggingface_models.py index f6899a6527d6a..408b7b41f809d 100644 --- a/onnxruntime/python/tools/transformers/huggingface_models.py +++ b/onnxruntime/python/tools/transformers/huggingface_models.py @@ -163,4 +163,6 @@ "google/vit-base-patch16-224": (["pixel_values"], 12, False, "vit"), # Swin "microsoft/swin-base-patch4-window7-224": (["pixel_values"], 12, False, "swin"), + "microsoft/swin-small-patch4-window7-224": (["pixel_values"], 12, False, "swin"), + "microsoft/swin-tiny-patch4-window7-224": (["pixel_values"], 12, False, "swin"), } diff --git 
a/onnxruntime/python/tools/transformers/onnx_exporter.py b/onnxruntime/python/tools/transformers/onnx_exporter.py index b7ad1d40d98a6..21895c3e97b1a 100644 --- a/onnxruntime/python/tools/transformers/onnx_exporter.py +++ b/onnxruntime/python/tools/transformers/onnx_exporter.py @@ -442,7 +442,7 @@ def validate_and_optimize_onnx( model_fusion_statistics, ) - return onnx_model_path, is_valid_onnx_model, config.num_labels if model_type == "vit" else config.vocab_size + return onnx_model_path, is_valid_onnx_model, config.num_labels if model_type == "vit" or model_type == "swin" else config.vocab_size def export_onnx_model_from_pt( From c2a48eb9a85ca69aa671176f6c04f236595d8e5c Mon Sep 17 00:00:00 2001 From: Ryan Hill Date: Thu, 6 Apr 2023 15:07:00 -0700 Subject: [PATCH 11/22] Fix swin for benchmark --- onnxruntime/python/tools/transformers/onnx_exporter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onnxruntime/python/tools/transformers/onnx_exporter.py b/onnxruntime/python/tools/transformers/onnx_exporter.py index 21895c3e97b1a..5d1b7555bac38 100644 --- a/onnxruntime/python/tools/transformers/onnx_exporter.py +++ b/onnxruntime/python/tools/transformers/onnx_exporter.py @@ -49,7 +49,7 @@ def restore_torch_functions(): def create_onnxruntime_input(vocab_size, batch_size, sequence_length, input_names, config, data_type=numpy.int64): - if config.model_type=="vit": + if config.model_type=="vit" or config.model_type=="swin": input_ids = numpy.random.rand(batch_size, 3, 224, 224).astype(numpy.float32) inputs = {"pixel_values": input_ids} return inputs From f40e4db701b791191af87bef6352d1609f9f9db0 Mon Sep 17 00:00:00 2001 From: Ryan Hill Date: Thu, 6 Apr 2023 20:21:01 -0700 Subject: [PATCH 12/22] Remove hardcoded 224 for vit/swin --- onnxruntime/python/tools/transformers/benchmark.py | 6 +++--- onnxruntime/python/tools/transformers/onnx_exporter.py | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/onnxruntime/python/tools/transformers/benchmark.py b/onnxruntime/python/tools/transformers/benchmark.py index 376cbc22dbe40..197cc73a4288f 100644 --- a/onnxruntime/python/tools/transformers/benchmark.py +++ b/onnxruntime/python/tools/transformers/benchmark.py @@ -263,7 +263,7 @@ def run_onnxruntime( } if config.model_type == "vit" or config.model_type == "swin": - logger.info(f"Run onnxruntime on {model_name} with input shape {[batch_size, 3, 224, 224]}") + logger.info(f"Run onnxruntime on {model_name} with input shape {[batch_size, 3, config.image_size, config.image_size]}") else: logger.info(f"Run onnxruntime on {model_name} with input shape {[batch_size, sequence_length]}") @@ -370,8 +370,8 @@ def run_pytorch( continue if config.model_type == "vit" or config.model_type == "swin": - logger.info(f"Run PyTorch on {model_name} with input shape {[batch_size, 3, 224, 224]}") - input_ids = torch.randn(size=(batch_size, 3, 224, 224),dtype=torch.float16 if precision == Precision.FLOAT16 else torch.float32, device=device) + logger.info(f"Run PyTorch on {model_name} with input shape {[batch_size, 3, config.image_size, config.image_size]}") + input_ids = torch.randn(size=(batch_size, 3, config.image_size, config.image_size),dtype=torch.float16 if precision == Precision.FLOAT16 else torch.float32, device=device) else: logger.info(f"Run PyTorch on {model_name} with input shape {[batch_size, sequence_length]}") input_ids = torch.randint( diff --git a/onnxruntime/python/tools/transformers/onnx_exporter.py b/onnxruntime/python/tools/transformers/onnx_exporter.py index 
5d1b7555bac38..33e443ff5be0e 100644 --- a/onnxruntime/python/tools/transformers/onnx_exporter.py +++ b/onnxruntime/python/tools/transformers/onnx_exporter.py @@ -50,7 +50,7 @@ def restore_torch_functions(): def create_onnxruntime_input(vocab_size, batch_size, sequence_length, input_names, config, data_type=numpy.int64): if config.model_type=="vit" or config.model_type=="swin": - input_ids = numpy.random.rand(batch_size, 3, 224, 224).astype(numpy.float32) + input_ids = numpy.random.rand(batch_size, 3, config.image_size, config.image_size).astype(numpy.float32) inputs = {"pixel_values": input_ids} return inputs @@ -473,7 +473,7 @@ def export_onnx_model_from_pt( if model_type == "vit" or model_type == "swin": image_processor = AutoFeatureExtractor.from_pretrained(model_name, cache_dir=cache_dir) - data = numpy.random.randint(low=0, high=256, size=224 * 224 * 3, dtype=numpy.uint8).reshape(224, 224, 3) + data = numpy.random.randint(low=0, high=256, size=config.image_size * config.image_size * 3, dtype=numpy.uint8).reshape(config.image_size, config.image_size, 3) example_inputs = image_processor(data, return_tensors="pt") else: From 8ab290a9070889d023b26ad5d768816ca63d41e2 Mon Sep 17 00:00:00 2001 From: Ryan Hill Date: Thu, 6 Apr 2023 20:24:28 -0700 Subject: [PATCH 13/22] Remove unused AutoModelForImageClassification --- onnxruntime/python/tools/transformers/onnx_exporter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onnxruntime/python/tools/transformers/onnx_exporter.py b/onnxruntime/python/tools/transformers/onnx_exporter.py index 33e443ff5be0e..06e8330842e03 100644 --- a/onnxruntime/python/tools/transformers/onnx_exporter.py +++ b/onnxruntime/python/tools/transformers/onnx_exporter.py @@ -16,7 +16,7 @@ from huggingface_models import MODEL_CLASSES from quantize_helper import QuantizeHelper from torch_onnx_export_helper import torch_onnx_export -from transformers import AutoConfig, AutoModelForImageClassification, AutoFeatureExtractor, AutoTokenizer, LxmertConfig, TransfoXLConfig +from transformers import AutoConfig, AutoFeatureExtractor, AutoTokenizer, LxmertConfig, TransfoXLConfig sys.path.append(os.path.join(os.path.dirname(__file__), "models", "gpt2")) from gpt2_helper import PRETRAINED_GPT2_MODELS, GPT2ModelNoPastState, TFGPT2ModelNoPastState # noqa: E402 From 456d95b7c402d42936e2793e8d57982686d69473 Mon Sep 17 00:00:00 2001 From: Ryan Hill Date: Thu, 6 Apr 2023 20:30:35 -0700 Subject: [PATCH 14/22] Add comments, and simplify --- onnxruntime/python/tools/transformers/onnx_exporter.py | 3 ++- onnxruntime/python/tools/transformers/optimizer.py | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/onnxruntime/python/tools/transformers/onnx_exporter.py b/onnxruntime/python/tools/transformers/onnx_exporter.py index 06e8330842e03..2894d2b60af05 100644 --- a/onnxruntime/python/tools/transformers/onnx_exporter.py +++ b/onnxruntime/python/tools/transformers/onnx_exporter.py @@ -250,7 +250,7 @@ def optimize_onnx_model( opt_model = optimize_model( onnx_model_path, model_type, - num_heads=num_attention_heads if model_type != "swin" else 0, + num_heads=num_attention_heads if model_type != "swin" else 0, # For Swin, num_attention_heads is a list, so use 0 for now hidden_size=hidden_size if model_type != "swin" else 0, opt_level=0, optimization_options=optimization_options, @@ -487,6 +487,7 @@ def export_onnx_model_from_pt( example_inputs = filter_inputs(example_inputs, input_names) example_outputs = model(**example_inputs) + assert isinstance(example_outputs, (list, 
tuple)), f"type of output is not list or tuple: {type(example_outputs)}" # Flatten is needed for gpt2 and distilgpt2. diff --git a/onnxruntime/python/tools/transformers/optimizer.py b/onnxruntime/python/tools/transformers/optimizer.py index c73de6c52ad09..de12624821d38 100644 --- a/onnxruntime/python/tools/transformers/optimizer.py +++ b/onnxruntime/python/tools/transformers/optimizer.py @@ -161,7 +161,7 @@ def optimize_by_fusion( Returns: object of an optimizer class. """ - if model_type not in ["bert", "vit", "swin", "unet", "vae", "clip"] and (num_heads == 0 or hidden_size == 0): + if model_type not in ["bert", "swin", "unet", "vae", "clip"] and (num_heads == 0 or hidden_size == 0): logger.warning(f"Please specify parameters of num_heads and hidden_size for model_type {model_type}") (optimizer_class, producer, _) = MODEL_TYPES[model_type] From 50fa2f9f63fc6d40965223ffcae6a59e0fadf894 Mon Sep 17 00:00:00 2001 From: Ryan Hill Date: Fri, 7 Apr 2023 11:10:36 -0700 Subject: [PATCH 15/22] Fix python formatting --- .../python/tools/transformers/benchmark.py | 16 ++++++++++---- .../tools/transformers/onnx_exporter.py | 21 ++++++++++++++----- 2 files changed, 28 insertions(+), 9 deletions(-) diff --git a/onnxruntime/python/tools/transformers/benchmark.py b/onnxruntime/python/tools/transformers/benchmark.py index 197cc73a4288f..60b841685da77 100644 --- a/onnxruntime/python/tools/transformers/benchmark.py +++ b/onnxruntime/python/tools/transformers/benchmark.py @@ -263,7 +263,9 @@ def run_onnxruntime( } if config.model_type == "vit" or config.model_type == "swin": - logger.info(f"Run onnxruntime on {model_name} with input shape {[batch_size, 3, config.image_size, config.image_size]}") + logger.info( + f"Run onnxruntime on {model_name} with input shape {[batch_size, 3, config.image_size, config.image_size]}" + ) else: logger.info(f"Run onnxruntime on {model_name} with input shape {[batch_size, sequence_length]}") @@ -341,7 +343,7 @@ def run_pytorch( ) if config.model_type == "vit" or config.model_type == "swin": - max_input_size = 1024 # Just needs to be greater than sequence_length + max_input_size = 1024 # Just needs to be greater than sequence_length else: tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_dir) @@ -370,8 +372,14 @@ def run_pytorch( continue if config.model_type == "vit" or config.model_type == "swin": - logger.info(f"Run PyTorch on {model_name} with input shape {[batch_size, 3, config.image_size, config.image_size]}") - input_ids = torch.randn(size=(batch_size, 3, config.image_size, config.image_size),dtype=torch.float16 if precision == Precision.FLOAT16 else torch.float32, device=device) + logger.info( + f"Run PyTorch on {model_name} with input shape {[batch_size, 3, config.image_size, config.image_size]}" + ) + input_ids = torch.randn( + size=(batch_size, 3, config.image_size, config.image_size), + dtype=torch.float16 if precision == Precision.FLOAT16 else torch.float32, + device=device + ) else: logger.info(f"Run PyTorch on {model_name} with input shape {[batch_size, sequence_length]}") input_ids = torch.randint( diff --git a/onnxruntime/python/tools/transformers/onnx_exporter.py b/onnxruntime/python/tools/transformers/onnx_exporter.py index 2894d2b60af05..0f237c955f4d5 100644 --- a/onnxruntime/python/tools/transformers/onnx_exporter.py +++ b/onnxruntime/python/tools/transformers/onnx_exporter.py @@ -49,7 +49,7 @@ def restore_torch_functions(): def create_onnxruntime_input(vocab_size, batch_size, sequence_length, input_names, config, data_type=numpy.int64): 
- if config.model_type=="vit" or config.model_type=="swin": + if config.model_type == "vit" or config.model_type == "swin": input_ids = numpy.random.rand(batch_size, 3, config.image_size, config.image_size).astype(numpy.float32) inputs = {"pixel_values": input_ids} return inputs @@ -244,14 +244,19 @@ def optimize_onnx_model( if precision == Precision.INT8: optimization_options.enable_embed_layer_norm = False + # For swin models, the num_attention_heads is a list, which isn't supported yet, so set to 0 for now + if model_type == "swin": + num_attention_heads = 0 + hidden_size = 0 + # Use script to optimize model. # Use opt_level <= 1 for models to be converted to fp16, because some fused op (like FusedGemm) has only fp32 and no fp16. # It is better to be conservative so we use opt_level=0 here, in case MemcpyFromHost is added to the graph by OnnxRuntime. opt_model = optimize_model( onnx_model_path, model_type, - num_heads=num_attention_heads if model_type != "swin" else 0, # For Swin, num_attention_heads is a list, so use 0 for now - hidden_size=hidden_size if model_type != "swin" else 0, + num_heads=num_attention_heads, + hidden_size=hidden_size, opt_level=0, optimization_options=optimization_options, use_gpu=use_gpu, @@ -442,7 +447,11 @@ def validate_and_optimize_onnx( model_fusion_statistics, ) - return onnx_model_path, is_valid_onnx_model, config.num_labels if model_type == "vit" or model_type == "swin" else config.vocab_size + return ( + onnx_model_path, + is_valid_onnx_model, + config.num_labels if model_type == "vit" or model_type == "swin" else config.vocab_size + ) def export_onnx_model_from_pt( @@ -473,7 +482,9 @@ def export_onnx_model_from_pt( if model_type == "vit" or model_type == "swin": image_processor = AutoFeatureExtractor.from_pretrained(model_name, cache_dir=cache_dir) - data = numpy.random.randint(low=0, high=256, size=config.image_size * config.image_size * 3, dtype=numpy.uint8).reshape(config.image_size, config.image_size, 3) + data = numpy.random.randint( + low=0, high=256, size=config.image_size * config.image_size * 3, dtype=numpy.uint8 + ).reshape(config.image_size, config.image_size, 3) example_inputs = image_processor(data, return_tensors="pt") else: From ecd65f47e247e4e59ced64e05bdebc3664ff0d8a Mon Sep 17 00:00:00 2001 From: Ryan Hill Date: Fri, 7 Apr 2023 12:51:03 -0700 Subject: [PATCH 16/22] Fix formatting --- onnxruntime/python/tools/transformers/benchmark.py | 4 ++-- onnxruntime/python/tools/transformers/huggingface_models.py | 1 - 2 files changed, 2 insertions(+), 3 deletions(-) diff --git a/onnxruntime/python/tools/transformers/benchmark.py b/onnxruntime/python/tools/transformers/benchmark.py index 60b841685da77..6909697848505 100644 --- a/onnxruntime/python/tools/transformers/benchmark.py +++ b/onnxruntime/python/tools/transformers/benchmark.py @@ -343,7 +343,7 @@ def run_pytorch( ) if config.model_type == "vit" or config.model_type == "swin": - max_input_size = 1024 # Just needs to be greater than sequence_length + max_input_size = 1024 # Just needs to be greater than sequence_length else: tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_dir) @@ -378,7 +378,7 @@ def run_pytorch( input_ids = torch.randn( size=(batch_size, 3, config.image_size, config.image_size), dtype=torch.float16 if precision == Precision.FLOAT16 else torch.float32, - device=device + device=device, ) else: logger.info(f"Run PyTorch on {model_name} with input shape {[batch_size, sequence_length]}") diff --git 
a/onnxruntime/python/tools/transformers/huggingface_models.py b/onnxruntime/python/tools/transformers/huggingface_models.py index 408b7b41f809d..dcfe4a28ad9af 100644 --- a/onnxruntime/python/tools/transformers/huggingface_models.py +++ b/onnxruntime/python/tools/transformers/huggingface_models.py @@ -158,7 +158,6 @@ ), # "google/pegasus-xsum": (["input_ids"], 11, False, "bert"), # "google/pegasus-large": (["input_ids"], 11, False, "bert"), - # ViT "google/vit-base-patch16-224": (["pixel_values"], 12, False, "vit"), # Swin From 2a336bd3610aec8ddcecf94dd5c149a9ae75eaf6 Mon Sep 17 00:00:00 2001 From: Ryan Hill Date: Fri, 7 Apr 2023 13:03:11 -0700 Subject: [PATCH 17/22] Formatting --- onnxruntime/python/tools/transformers/onnx_exporter.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onnxruntime/python/tools/transformers/onnx_exporter.py b/onnxruntime/python/tools/transformers/onnx_exporter.py index 0f237c955f4d5..309ad87045d05 100644 --- a/onnxruntime/python/tools/transformers/onnx_exporter.py +++ b/onnxruntime/python/tools/transformers/onnx_exporter.py @@ -450,7 +450,7 @@ def validate_and_optimize_onnx( return ( onnx_model_path, is_valid_onnx_model, - config.num_labels if model_type == "vit" or model_type == "swin" else config.vocab_size + config.num_labels if model_type == "vit" or model_type == "swin" else config.vocab_size, ) From dbd9e0ad15a6a911afff02b655dfd22de9b015b1 Mon Sep 17 00:00:00 2001 From: Ryan Hill Date: Fri, 7 Apr 2023 13:09:59 -0700 Subject: [PATCH 18/22] Formatting --- onnxruntime/python/tools/transformers/onnx_exporter.py | 1 + 1 file changed, 1 insertion(+) diff --git a/onnxruntime/python/tools/transformers/onnx_exporter.py b/onnxruntime/python/tools/transformers/onnx_exporter.py index 309ad87045d05..1189846cd4fae 100644 --- a/onnxruntime/python/tools/transformers/onnx_exporter.py +++ b/onnxruntime/python/tools/transformers/onnx_exporter.py @@ -110,6 +110,7 @@ def build_dynamic_axes(example_inputs, outputs_flatten): dynamic_axes[output_name].update({j: "seq_len"}) return dynamic_axes, output_names + def validate_onnx_model( onnx_model_path, example_inputs, From 43df1ed98764d07cb693237aa4e769fc074bdfc2 Mon Sep 17 00:00:00 2001 From: Ryan Hill Date: Fri, 7 Apr 2023 16:04:35 -0700 Subject: [PATCH 19/22] Code review feedback --- onnxruntime/python/tools/transformers/benchmark.py | 14 +++++++------- .../python/tools/transformers/onnx_exporter.py | 8 ++++---- 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/onnxruntime/python/tools/transformers/benchmark.py b/onnxruntime/python/tools/transformers/benchmark.py index 6909697848505..b66bb70cce549 100644 --- a/onnxruntime/python/tools/transformers/benchmark.py +++ b/onnxruntime/python/tools/transformers/benchmark.py @@ -262,7 +262,7 @@ def run_onnxruntime( "datetime": str(datetime.now()), } - if config.model_type == "vit" or config.model_type == "swin": + if config.model_type in ["vit", "swin"]: logger.info( f"Run onnxruntime on {model_name} with input shape {[batch_size, 3, config.image_size, config.image_size]}" ) @@ -342,8 +342,8 @@ def run_pytorch( custom_model_class=model_class, ) - if config.model_type == "vit" or config.model_type == "swin": - max_input_size = 1024 # Just needs to be greater than sequence_length + if config.model_type in ["vit", "swin"]: + sequence_lengths = [1] # Set array to one entry so we iterate once, and ignore any extra lengths else: tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_dir) @@ -368,10 +368,7 @@ def run_pytorch( continue for 
sequence_length in sequence_lengths: - if max_input_size is not None and sequence_length > max_input_size: - continue - - if config.model_type == "vit" or config.model_type == "swin": + if config.model_type in ["vit", "swin"]: logger.info( f"Run PyTorch on {model_name} with input shape {[batch_size, 3, config.image_size, config.image_size]}" ) @@ -381,6 +378,9 @@ def run_pytorch( device=device, ) else: + if sequence_length > max_input_size: + continue + logger.info(f"Run PyTorch on {model_name} with input shape {[batch_size, sequence_length]}") input_ids = torch.randint( low=0, diff --git a/onnxruntime/python/tools/transformers/onnx_exporter.py b/onnxruntime/python/tools/transformers/onnx_exporter.py index 1189846cd4fae..38f7f8cd05f1d 100644 --- a/onnxruntime/python/tools/transformers/onnx_exporter.py +++ b/onnxruntime/python/tools/transformers/onnx_exporter.py @@ -49,7 +49,7 @@ def restore_torch_functions(): def create_onnxruntime_input(vocab_size, batch_size, sequence_length, input_names, config, data_type=numpy.int64): - if config.model_type == "vit" or config.model_type == "swin": + if config.model_type in ["vit", "swin"]: input_ids = numpy.random.rand(batch_size, 3, config.image_size, config.image_size).astype(numpy.float32) inputs = {"pixel_values": input_ids} return inputs @@ -451,7 +451,7 @@ def validate_and_optimize_onnx( return ( onnx_model_path, is_valid_onnx_model, - config.num_labels if model_type == "vit" or model_type == "swin" else config.vocab_size, + config.num_labels if model_type in ["vit", "swin"] else config.vocab_size, ) @@ -481,7 +481,7 @@ def export_onnx_model_from_pt( example_inputs = None max_input_size = None - if model_type == "vit" or model_type == "swin": + if model_type in ["vit", "swin"]: image_processor = AutoFeatureExtractor.from_pretrained(model_name, cache_dir=cache_dir) data = numpy.random.randint( low=0, high=256, size=config.image_size * config.image_size * 3, dtype=numpy.uint8 @@ -524,7 +524,7 @@ def export_onnx_model_from_pt( dynamic_axes = None output_names = None - if model_type == "vit" or model_type == "swin": + if model_type in ["vit", "swin"]: dynamic_axes, output_names = {key: {0: "pixel_values"} for key in example_inputs}, ["logits"] else: dynamic_axes, output_names = build_dynamic_axes(example_inputs, example_outputs_flatten) From a2fc5c5f265c774866c821f80c9aeb8f197e8922 Mon Sep 17 00:00:00 2001 From: Ryan Hill Date: Fri, 7 Apr 2023 16:06:16 -0700 Subject: [PATCH 20/22] Minor fix --- onnxruntime/python/tools/transformers/benchmark.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/onnxruntime/python/tools/transformers/benchmark.py b/onnxruntime/python/tools/transformers/benchmark.py index b66bb70cce549..c194b4b6414f5 100644 --- a/onnxruntime/python/tools/transformers/benchmark.py +++ b/onnxruntime/python/tools/transformers/benchmark.py @@ -378,7 +378,7 @@ def run_pytorch( device=device, ) else: - if sequence_length > max_input_size: + if max_input_size is not None and sequence_length > max_input_size: continue logger.info(f"Run PyTorch on {model_name} with input shape {[batch_size, sequence_length]}") From 7444ed5982f8a2da9465dfda76d6212755ddc960 Mon Sep 17 00:00:00 2001 From: Ryan Hill Date: Fri, 7 Apr 2023 16:57:34 -0700 Subject: [PATCH 21/22] Improve formatting --- onnxruntime/python/tools/transformers/benchmark.py | 6 +++++- .../python/tools/transformers/benchmark_helper.py | 12 +++++++++--- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/onnxruntime/python/tools/transformers/benchmark.py 
b/onnxruntime/python/tools/transformers/benchmark.py
index c194b4b6414f5..a47e1a2f8452f 100644
--- a/onnxruntime/python/tools/transformers/benchmark.py
+++ b/onnxruntime/python/tools/transformers/benchmark.py
@@ -343,7 +343,8 @@ def run_pytorch(
     )

     if config.model_type in ["vit", "swin"]:
-        sequence_lengths = [1]  # Set array to one entry so we iterate once, and ignore any extra lengths
+        # These models don't use sequence lengths, so just pick the first sequence length so that the summary still works
+        sequence_lengths = [ sequence_lengths[0] ]
     else:
         tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_dir)

@@ -786,6 +787,9 @@ def main():
         logger.error("int8 is for CPU only")
         return

+    if len(args.models)==1 and MODELS[args.models[0]][3] in ["vit", "swin"]:
+        args.sequence_lengths = [""]
+
     args.num_threads = sorted({cpu_count if x <= 0 else x for x in args.num_threads})

     logger.info(f"Arguments: {args}")
diff --git a/onnxruntime/python/tools/transformers/benchmark_helper.py b/onnxruntime/python/tools/transformers/benchmark_helper.py
index fc2ec8ad8fd56..eb1a79d2e9d44 100644
--- a/onnxruntime/python/tools/transformers/benchmark_helper.py
+++ b/onnxruntime/python/tools/transformers/benchmark_helper.py
@@ -245,8 +245,11 @@ def output_summary(results, csv_filename, args):
         ]
         data_names = []
         for batch_size in args.batch_sizes:
-            for sequence_length in args.sequence_lengths:
-                data_names.append(f"b{batch_size}_s{sequence_length}")
+            if args.sequence_lengths == [""]:
+                data_names.append(f"b{batch_size}")
+            else:
+                for sequence_length in args.sequence_lengths:
+                    data_names.append(f"b{batch_size}_s{sequence_length}")

         csv_writer = csv.DictWriter(csv_file, fieldnames=header_names + data_names)
         csv_writer.writeheader()
@@ -273,7 +276,10 @@ def output_summary(results, csv_filename, args):
                             assert row[k] == headers[k]
                     b = result["batch_size"]
                     s = result["sequence_length"]
-                    row[f"b{b}_s{s}"] = result["average_latency_ms"]
+                    if s != "":
+                        row[f"b{b}_s{s}"] = result["average_latency_ms"]
+                    else:
+                        row[f"b{b}"] = result["average_latency_ms"]

             if row:
                 csv_writer.writerow(row)

From 5cd613fec5f0c933cf87252a7a3628234a042d9e Mon Sep 17 00:00:00 2001
From: Ryan Hill
Date: Fri, 7 Apr 2023 17:10:08 -0700
Subject: [PATCH 22/22] Python format

---
 onnxruntime/python/tools/transformers/benchmark.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/onnxruntime/python/tools/transformers/benchmark.py b/onnxruntime/python/tools/transformers/benchmark.py
index a47e1a2f8452f..bd9a649ae74fd 100644
--- a/onnxruntime/python/tools/transformers/benchmark.py
+++ b/onnxruntime/python/tools/transformers/benchmark.py
@@ -344,7 +344,7 @@ def run_pytorch(

     if config.model_type in ["vit", "swin"]:
         # These models don't use sequence lengths, so just pick the first sequence length so that the summary still works
-        sequence_lengths = [ sequence_lengths[0] ]
+        sequence_lengths = [sequence_lengths[0]]
     else:
         tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_dir)

@@ -787,7 +787,7 @@ def main():
         logger.error("int8 is for CPU only")
         return

-    if len(args.models)==1 and MODELS[args.models[0]][3] in ["vit", "swin"]:
+    if len(args.models) == 1 and MODELS[args.models[0]][3] in ["vit", "swin"]:
         args.sequence_lengths = [""]

     args.num_threads = sorted({cpu_count if x <= 0 else x for x in args.num_threads})
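
Editor's note for reviewers: as a quick reference, the net effect of the ViT/Swin handling in this branch can be summarized with the short standalone Python sketch below. It is illustration only, not code from the patch: the helper names make_image_inputs and column_name are hypothetical; the shapes and column naming mirror create_onnxruntime_input in onnx_exporter.py and output_summary in benchmark_helper.py as changed above, and image_size 224 is assumed from the google/vit-base-patch16-224 entry in huggingface_models.py.

import numpy

def make_image_inputs(batch_size, image_size):
    # Image models take float32 "pixel_values" of shape [batch, 3, H, W]
    # instead of int64 "input_ids" of shape [batch, sequence_length].
    pixel_values = numpy.random.rand(batch_size, 3, image_size, image_size).astype(numpy.float32)
    return {"pixel_values": pixel_values}

def column_name(batch_size, sequence_length):
    # Summary CSV column naming: image models pass sequence_lengths == [""],
    # so the "_s{seq}" suffix is dropped from the per-batch latency column.
    return f"b{batch_size}" if sequence_length == "" else f"b{batch_size}_s{sequence_length}"

print(make_image_inputs(1, 224)["pixel_values"].shape)  # (1, 3, 224, 224)
print(column_name(1, ""), column_name(1, 128))          # b1 b1_s128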