microsoft · tianleiwu · Jul 14, 2023 · Jul 13, 2023 · Jul 14, 2023
diff --git a/onnxruntime/python/tools/transformers/models/stable_diffusion/README.md b/onnxruntime/python/tools/transformers/models/stable_diffusion/README.md
@@ -277,6 +277,18 @@ Common settings for below test results:
 | torch       | 2.0.0+cu117             | default               | 16         | 14.8            | 32,306              | 16,520               |
 | torch       | 2.0.0+cu117             | compile               | 16         | 12.6            | 32,636              | 16,898               |
 
+#### Results of A100-PCIE-80GB (Ubuntu 20.04)
+| engine      | version                 | provider              | batch size | average latency | first run memory MB | second run memory MB |
+| ----------- | ----------------------- | --------------------- | ---------- | --------------- | ------------------- | -------------------- |
+| tensorrt    | 8.6.1                   | default               | 1          | 1.00            | 9,056               | 9,056                |
+| onnxruntime | 1.16.0 nightly          | tensorrt              | 1          | 1.09            | 11,250              | 11,250               |
+| onnxruntime | 1.16.0 nightly          | tensorrt (cuda graph) | 1          | 0.96            | 11,382              | 11,382               |
+| onnxruntime | 1.16.0 nightly          | cuda                  | 1          | 1.11            | 4,760               | 5,144                |
+| onnxruntime | 1.16.0 nightly          | cuda (cuda graph)     | 1          | 1.04            | 5,230               | 5,390                |
+| tensorrt    | 8.6.1                   | default               | 4          | 3.39            | 9,072               | 9,072                |
+| onnxruntime | 1.16.0 nightly          | tensorrt              | 4          | 3.60            | 11,266              | 11,266               |
+| onnxruntime | 1.16.0 nightly          | tensorrt (cuda graph) | 4          | 3.43            | 11,428              | 11,428               |
+
 #### Results of V100-PCIE-16GB (Ubuntu 20.04)
 
 Results from Standard_NC6s_v3 Azure virtual machine:

diff --git a/onnxruntime/python/tools/transformers/models/stable_diffusion/onnxruntime_cuda_txt2img.py b/onnxruntime/python/tools/transformers/models/stable_diffusion/onnxruntime_cuda_txt2img.py
@@ -27,8 +27,7 @@
 Installation instructions
 pip install torch==1.13.1+cu117 --extra-index-url https://download.pytorch.org/whl/cu117
 pip install --upgrade transformers diffusers>=0.16.0
-pip install --upgrade tensorrt>=8.6.1
-pip install --upgrade polygraphy>=0.47.0 onnx-graphsurgeon --extra-index-url https://pypi.ngc.nvidia.com
+pip install "numpy>=1.24.1" "onnx>=1.13.0" coloredlogs protobuf==3.20.3 psutil sympy
 pip install onnxruntime-gpu
 """
 

diff --git a/...runtime/python/tools/transformers/models/stable_diffusion/onnxruntime_tensorrt_txt2img.py b/...runtime/python/tools/transformers/models/stable_diffusion/onnxruntime_tensorrt_txt2img.py
@@ -644,6 +644,7 @@ def __load_models(self):
 
         self.models["unet"] = UNet(
             self.unet,
+            fp16=True,
             device=self.torch_device,
             max_batch_size=self.max_batch_size,
             embedding_dim=self.embedding_dim,

diff --git a/onnxruntime/python/tools/transformers/models/stable_diffusion/requirements-tensorrt.txt b/onnxruntime/python/tools/transformers/models/stable_diffusion/requirements-tensorrt.txt
@@ -11,7 +11,6 @@ sympy
 tensorrt>=8.6.1
 onnxruntime-gpu>=1.15.1
 py3nvml
-wget
 # cuda-python version shall be compatible with CUDA version of torch and onnxruntime-gpu
 cuda-python==11.7.0
 #To export onnx of stable diffusion, please install PyTorch 1.13.1+cu117