From 44808e143533e694644da13b75a8f8358ee289cf Mon Sep 17 00:00:00 2001 From: saienduri <77521230+saienduri@users.noreply.github.com> Date: Fri, 12 Jul 2024 15:51:47 -0700 Subject: [PATCH] Add in-tree special_models test suite using reworked iree-tooling. (#17883) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With this, we move away from using all the specialized json config files and complex workflows. Instead, we use python scripts which allow us to use custom flags, tolerances, and configurations based on the backend/model. Related PR in TestSuite: https://github.com/nod-ai/SHARK-TestSuite/pull/271 This PR also removes all dependencies on SHARK-TestSuite tooling. Reworked the tools here so that downloading, caching, testing, and benchmarking occurs as intended with tools solely from this repo for iree_special_models. Whenever we are adding test files here, the goal is for an IREE user to be able to clone the repo and run the tests knowing nothing about the SHARK-TestSuite. Also didn't realize, but ireers here already has a process of stamping to check if a file is already produced. I think we have to remove this because it will skip even if there is a newer version of the file available and there's really no point when downloading to a cache because once it's there, it is never removed so not a valuable signal. (Third time's the charm. 
Had to close the last two versions of this PR because couldn't get past a pre-commit check that led me to rebase and add a bunch of commits that weren't mine 🤦 ) ci-exactly: build_all, test_amd_mi300, build_packages, regression_test --------- Signed-off-by: saienduri --- .github/workflows/pkgci_regression_test.yml | 156 ++++--- ...dels_gpu_rocm_gfx90a_additional_flags.json | 25 -- ...dels_gpu_rocm_gfx942_additional_flags.json | 25 -- .../sdxl_prompt_encoder_cpu_llvm_task.json | 22 - .../sdxl_prompt_encoder_gpu_rocm_gfx90a.json | 36 -- .../sdxl_prompt_encoder_gpu_rocm_gfx942.json | 36 -- .../sdxl_scheduled_unet_cpu_llvm_task.json | 23 - .../sdxl_scheduled_unet_gpu_rocm_gfx90a.json | 39 -- .../sdxl_scheduled_unet_gpu_rocm_gfx942.json | 41 -- .../sdxl_vae_decode_cpu_llvm_task.json | 20 - .../sdxl_vae_decode_gpu_rocm_gfx90a.json | 29 -- .../sdxl_vae_decode_gpu_rocm_gfx942.json | 29 -- .../benchmarks/sdxl/benchmark_sdxl_rocm.py | 407 ++++++++++++++++++ experimental/benchmarks/sdxl/conftest.py | 144 +++++++ .../sdxl/sdxl_pipeline_bench_f16.mlir | 23 + .../regression_suite/ireers/artifacts.py | 154 ------- .../{ireers => ireers_tools}/__init__.py | 0 .../ireers_tools/artifacts.py | 234 ++++++++++ .../{ireers => ireers_tools}/fixtures.py | 0 experimental/regression_suite/setup.py | 9 +- .../shark-test-suite-models/conftest.py | 22 + .../shark-test-suite-models/sd3/test_clip.py | 167 +++++++ .../shark-test-suite-models/sd3/test_mmdit.py | 152 +++++++ .../shark-test-suite-models/sd3/test_vae.py | 119 +++++ .../shark-test-suite-models/sdxl/test_clip.py | 154 +++++++ .../shark-test-suite-models/sdxl/test_unet.py | 183 ++++++++ .../shark-test-suite-models/sdxl/test_vae.py | 123 ++++++ .../tests/pregenerated/test_llama2.py | 2 +- .../tests/pregenerated/test_ukernel.py | 2 +- 29 files changed, 1824 insertions(+), 552 deletions(-) delete mode 100644 build_tools/pkgci/external_test_suite/models_gpu_rocm_gfx90a_additional_flags.json delete mode 100644 
build_tools/pkgci/external_test_suite/models_gpu_rocm_gfx942_additional_flags.json delete mode 100644 build_tools/pkgci/external_test_suite/sdxl_prompt_encoder_cpu_llvm_task.json delete mode 100644 build_tools/pkgci/external_test_suite/sdxl_prompt_encoder_gpu_rocm_gfx90a.json delete mode 100644 build_tools/pkgci/external_test_suite/sdxl_prompt_encoder_gpu_rocm_gfx942.json delete mode 100644 build_tools/pkgci/external_test_suite/sdxl_scheduled_unet_cpu_llvm_task.json delete mode 100644 build_tools/pkgci/external_test_suite/sdxl_scheduled_unet_gpu_rocm_gfx90a.json delete mode 100644 build_tools/pkgci/external_test_suite/sdxl_scheduled_unet_gpu_rocm_gfx942.json delete mode 100644 build_tools/pkgci/external_test_suite/sdxl_vae_decode_cpu_llvm_task.json delete mode 100644 build_tools/pkgci/external_test_suite/sdxl_vae_decode_gpu_rocm_gfx90a.json delete mode 100644 build_tools/pkgci/external_test_suite/sdxl_vae_decode_gpu_rocm_gfx942.json create mode 100644 experimental/benchmarks/sdxl/benchmark_sdxl_rocm.py create mode 100644 experimental/benchmarks/sdxl/conftest.py create mode 100644 experimental/benchmarks/sdxl/sdxl_pipeline_bench_f16.mlir delete mode 100644 experimental/regression_suite/ireers/artifacts.py rename experimental/regression_suite/{ireers => ireers_tools}/__init__.py (100%) create mode 100644 experimental/regression_suite/ireers_tools/artifacts.py rename experimental/regression_suite/{ireers => ireers_tools}/fixtures.py (100%) create mode 100644 experimental/regression_suite/shark-test-suite-models/conftest.py create mode 100644 experimental/regression_suite/shark-test-suite-models/sd3/test_clip.py create mode 100644 experimental/regression_suite/shark-test-suite-models/sd3/test_mmdit.py create mode 100644 experimental/regression_suite/shark-test-suite-models/sd3/test_vae.py create mode 100644 experimental/regression_suite/shark-test-suite-models/sdxl/test_clip.py create mode 100644 experimental/regression_suite/shark-test-suite-models/sdxl/test_unet.py 
create mode 100644 experimental/regression_suite/shark-test-suite-models/sdxl/test_vae.py diff --git a/.github/workflows/pkgci_regression_test.yml b/.github/workflows/pkgci_regression_test.yml index 98a6e2b8c8c2..1a61c7677371 100644 --- a/.github/workflows/pkgci_regression_test.yml +++ b/.github/workflows/pkgci_regression_test.yml @@ -90,7 +90,7 @@ jobs: uses: actions/checkout@v4.1.7 with: repository: nod-ai/SHARK-TestSuite - ref: 3603a453b3777fac9af4506a3dc0b3d87587fd47 + ref: a06e730ce325c12db40bb89b43e8e6e897052e96 path: SHARK-TestSuite submodules: false lfs: false @@ -98,6 +98,7 @@ jobs: run: | source ${VENV_DIR}/bin/activate python -m pip install -r SHARK-TestSuite/iree_tests/requirements.txt + pip install --no-compile --pre --upgrade -e SHARK-TestSuite/common_tools - name: Run external tests - ONNX test suite run: | @@ -138,25 +139,14 @@ jobs: # CPU - name: cpu_llvm_task models-config-file: models_cpu_llvm_task.json - sdxl-unet-config-file: sdxl_scheduled_unet_cpu_llvm_task.json - sdxl-vae-config-file: sdxl_vae_decode_cpu_llvm_task.json - sdxl-clip-config-file: sdxl_prompt_encoder_cpu_llvm_task.json runs-on: nodai-amdgpu-w7900-x86-64 # AMD GPU - name: amdgpu_rocm_mi250_gfx90a models-config-file: models_gpu_rocm_gfx90a.json - models-extra-flags-config-file: models_gpu_rocm_gfx90a_additional_flags.json - sdxl-unet-config-file: sdxl_scheduled_unet_gpu_rocm_gfx90a.json - sdxl-vae-config-file: sdxl_vae_decode_gpu_rocm_gfx90a.json - sdxl-clip-config-file: sdxl_prompt_encoder_gpu_rocm_gfx90a.json runs-on: nodai-amdgpu-mi250-x86-64 - name: amdgpu_rocm_mi300_gfx942 models-config-file: models_gpu_rocm_gfx942.json - models-extra-flags-config-file: models_gpu_rocm_gfx942_additional_flags.json - sdxl-unet-config-file: sdxl_scheduled_unet_gpu_rocm_gfx942.json - sdxl-vae-config-file: sdxl_vae_decode_gpu_rocm_gfx942.json - sdxl-clip-config-file: sdxl_prompt_encoder_gpu_rocm_gfx942.json runs-on: nodai-amdgpu-mi300-x86-64 - name: amdgpu_vulkan models-config-file: 
models_gpu_vulkan.json @@ -176,10 +166,6 @@ jobs: IREE_TEST_FILES: ~/iree_tests_cache IREE_TEST_PATH_EXTENSION: ${{ github.workspace }}/build_tools/pkgci/external_test_suite MODELS_CONFIG_FILE_PATH: build_tools/pkgci/external_test_suite/${{ matrix.models-config-file }} - MODELS_EXTRA_FLAGS_CONFIG_FILE_PATH: build_tools/pkgci/external_test_suite/${{ matrix.models-extra-flags-config-file }} - SDXL_UNET_CONFIG_FILE_PATH: build_tools/pkgci/external_test_suite/${{ matrix.sdxl-unet-config-file }} - SDXL_CLIP_CONFIG_FILE_PATH: build_tools/pkgci/external_test_suite/${{ matrix.sdxl-clip-config-file }} - SDXL_VAE_CONFIG_FILE_PATH: build_tools/pkgci/external_test_suite/${{ matrix.sdxl-vae-config-file }} VENV_DIR: ${{ github.workspace }}/venv LD_LIBRARY_PATH: /home/esaimana/Python-3.11.9 steps: @@ -207,33 +193,25 @@ jobs: --artifact-path=${PACKAGE_DOWNLOAD_DIR} \ --fetch-gh-workflow=${{ inputs.artifact_run_id }} - # TODO(#17344): regenerate .mlirbc files, test plat_rdna3_rocm on rocm - # # In-tree tests - # - name: Run experimental/regression_suite tests - # run: | - # source ${VENV_DIR}/bin/activate - # pytest \ - # -rA -s -m "plat_host_cpu and presubmit" \ - # experimental/regression_suite - # Out of tree tests - name: Check out external TestSuite repository uses: actions/checkout@v4.1.7 with: repository: nod-ai/SHARK-TestSuite - ref: 3603a453b3777fac9af4506a3dc0b3d87587fd47 + ref: a06e730ce325c12db40bb89b43e8e6e897052e96 path: SHARK-TestSuite submodules: false lfs: true - name: Install external TestSuite Python requirements run: | source ${VENV_DIR}/bin/activate - python -m pip install -r SHARK-TestSuite/iree_tests/requirements.txt + python3 -m pip install -r SHARK-TestSuite/iree_tests/requirements.txt + pip install --no-compile --pre --upgrade -e SHARK-TestSuite/common_tools - name: Download remote files for real weight model tests run: | source ${VENV_DIR}/bin/activate - python SHARK-TestSuite/iree_tests/download_remote_files.py --root-dir pytorch/models - python 
SHARK-TestSuite/iree_tests/download_remote_files.py --root-dir sharktank + python SHARK-TestSuite/iree_tests/download_remote_files.py --root-dir iree_tests/pytorch/models + python SHARK-TestSuite/iree_tests/download_remote_files.py --root-dir iree_tests/sharktank - name: Run external tests - models with real weights if: "matrix.models-config-file != '' && !cancelled()" @@ -251,61 +229,99 @@ jobs: --durations=0 \ --config-files=${MODELS_CONFIG_FILE_PATH} - - name: Run external tests - models with real weights and additional flags - if: "matrix.models-extra-flags-config-file != '' && !cancelled()" - run: | - source ${VENV_DIR}/bin/activate - pytest SHARK-TestSuite/iree_tests/pytorch/models \ - -rpfE \ - -k real_weights \ - --no-skip-tests-missing-files \ - --capture=no \ - --log-cli-level=info \ - --timeout=1200 \ - --durations=0 \ - --config-files=${MODELS_EXTRA_FLAGS_CONFIG_FILE_PATH} + test_regression_suite: + name: "test_regression_suite :: ${{ matrix.name }}" + runs-on: ${{ matrix.runs-on }} + strategy: + fail-fast: false - - name: "Run external tests - SDXL scheduled unet" - if: "matrix.sdxl-unet-config-file != '' && !cancelled()" + # Note: these jobs should use persistent runners with local caches. + # Downloading test files (50GB+) without a cache can take 20+ minutes. 
+ matrix: + include: + # CPU + - name: cpu_llvm_task + models-config-file: models_cpu_llvm_task.json + backend: cpu + runs-on: nodai-amdgpu-w7900-x86-64 + + # AMD GPU + - name: amdgpu_rocm_mi250_gfx90a + rocm-chip: gfx90a + backend: rocm + runs-on: nodai-amdgpu-mi250-x86-64 + - name: amdgpu_rocm_mi300_gfx942 + rocm-chip: gfx942 + backend: rocm + runs-on: nodai-amdgpu-mi300-x86-64 + env: + PACKAGE_DOWNLOAD_DIR: ${{ github.workspace }}/.packages + IREERS_ARTIFACT_DIR: ${{ github.workspace }}/artifacts + IREE_TEST_FILES: ~/iree_tests_cache + IREE_TEST_PATH_EXTENSION: ${{ github.workspace }}/build_tools/pkgci/external_test_suite + VENV_DIR: ${{ github.workspace }}/venv + LD_LIBRARY_PATH: /home/esaimana/Python-3.11.9 + steps: + # TODO(saienduri): Find alternative to this temporary step that manipulates permission of github actions + # directory to be able to clean after every PR + - name: Pre Checkout MI300 Step + if: contains(matrix.name, 'gfx942') run: | - source ${VENV_DIR}/bin/activate - pytest SHARK-TestSuite/iree_tests/pytorch/models/sdxl-scheduled-unet-3-tank \ - -rpfE \ - -k real_weights \ - --no-skip-tests-missing-files \ - --capture=no \ - --log-cli-level=info \ - --timeout=1200 \ - --durations=0 \ - --config-files=${SDXL_UNET_CONFIG_FILE_PATH} + sudo chmod -R 777 ~/actions-runner/_work + - name: Checking out IREE repository + uses: actions/checkout@v4.1.7 + with: + submodules: false + - uses: actions/setup-python@v5.1.0 + with: + # Must match the subset of versions built in pkgci_build_packages. 
+ python-version: "3.11" + - uses: actions/download-artifact@v4.1.7 + with: + name: linux_x86_64_release_packages + path: ${{ env.PACKAGE_DOWNLOAD_DIR }} + - name: Setup venv + run: | + ./build_tools/pkgci/setup_venv.py ${VENV_DIR} \ + --artifact-path=${PACKAGE_DOWNLOAD_DIR} \ + --fetch-gh-workflow=${{ inputs.artifact_run_id }} + + # TODO(#17344): regenerate .mlirbc files, test plat_rdna3_rocm on rocm + # # In-tree tests + # - name: Run experimental/regression_suite tests + # run: | + # source ${VENV_DIR}/bin/activate + # pytest \ + # -rA -s -m "plat_host_cpu and presubmit" \ + # experimental/regression_suite - - name: "Run external tests - SDXL prompt encoder" - if: "matrix.sdxl-clip-config-file != '' && !cancelled()" + - name: "Running SDXL special model tests" + if: "!cancelled()" run: | source ${VENV_DIR}/bin/activate - pytest SHARK-TestSuite/iree_tests/pytorch/models/sdxl-prompt-encoder-tank \ + pytest ./experimental/regression_suite/shark-test-suite-models/sdxl \ + -k ${{ matrix.backend }} \ -rpfE \ - -k real_weights \ - --no-skip-tests-missing-files \ --capture=no \ --log-cli-level=info \ --timeout=1200 \ - --durations=0 \ - --config-files=${SDXL_CLIP_CONFIG_FILE_PATH} + --durations=0 + env: + ROCM_CHIP: ${{ matrix.rocm-chip }} - - name: "Run external tests - SDXL vae decode" - if: "matrix.sdxl-vae-config-file != '' && !cancelled()" + - name: "Running SD3 special model tests" + if: "!cancelled()" run: | source ${VENV_DIR}/bin/activate - pytest SHARK-TestSuite/iree_tests/pytorch/models/sdxl-vae-decode-tank \ + pytest ./experimental/regression_suite/shark-test-suite-models/sd3 \ + -k ${{ matrix.backend }} \ -rpfE \ - -k real_weights \ - --no-skip-tests-missing-files \ --capture=no \ --log-cli-level=info \ --timeout=1200 \ - --durations=0 \ - --config-files=${SDXL_VAE_CONFIG_FILE_PATH} + --durations=0 + env: + ROCM_CHIP: ${{ matrix.rocm-chip }} # Note: mi250 benchmark times are more lenient than mi300 (allowing about # 10% deviation from observed averages), 
since the mi250 runners we use @@ -314,7 +330,7 @@ jobs: if: contains(matrix.name, 'rocm_mi250_gfx90a') run: | source ${VENV_DIR}/bin/activate - pytest SHARK-TestSuite/iree_tests/benchmarks/sdxl/benchmark_sdxl_rocm.py \ + pytest ./experimental/benchmarks/sdxl/benchmark_sdxl_rocm.py \ --goldentime-rocm-e2e-ms 1450.0 \ --goldentime-rocm-unet-ms 370.0 \ --goldentime-rocm-clip-ms 18.5 \ @@ -336,7 +352,7 @@ jobs: if: contains(matrix.name, 'rocm_mi300_gfx942') run: | source ${VENV_DIR}/bin/activate - pytest SHARK-TestSuite/iree_tests/benchmarks/sdxl/benchmark_sdxl_rocm.py \ + pytest ./experimental/benchmarks/sdxl/benchmark_sdxl_rocm.py \ --goldentime-rocm-e2e-ms 325.0 \ --goldentime-rocm-unet-ms 77.0 \ --goldentime-rocm-clip-ms 15.5 \ diff --git a/build_tools/pkgci/external_test_suite/models_gpu_rocm_gfx90a_additional_flags.json b/build_tools/pkgci/external_test_suite/models_gpu_rocm_gfx90a_additional_flags.json deleted file mode 100644 index 4537b3f28b7d..000000000000 --- a/build_tools/pkgci/external_test_suite/models_gpu_rocm_gfx90a_additional_flags.json +++ /dev/null @@ -1,25 +0,0 @@ -{ - "config_name": "gpu_rocm", - "iree_compile_flags": [ - "--iree-hal-target-backends=rocm", - "--iree-rocm-target-chip=gfx90a", - "--iree-input-demote-f64-to-f32", - "--iree-opt-const-eval=false", - "--iree-codegen-transform-dialect-library=${IREE_TEST_PATH_EXTENSION}/attention_and_matmul_spec.mlir" - ], - "iree_run_module_flags": [ - "--device=hip" - ], - "skip_compile_tests": [ - "pytorch/models/sdxl-scheduled-unet-3-tank", - "pytorch/models/sdxl-prompt-encoder-tank", - "pytorch/models/sdxl-vae-decode-tank" - ], - "skip_run_tests": [], - "expected_compile_failures": [ - // TODO(#17344): need to regenerate .mlirbc - "pytorch/models/opt-125M", - "pytorch/models/resnet50" - ], - "expected_run_failures": [] -} diff --git a/build_tools/pkgci/external_test_suite/models_gpu_rocm_gfx942_additional_flags.json b/build_tools/pkgci/external_test_suite/models_gpu_rocm_gfx942_additional_flags.json 
deleted file mode 100644 index 28950d0643d5..000000000000 --- a/build_tools/pkgci/external_test_suite/models_gpu_rocm_gfx942_additional_flags.json +++ /dev/null @@ -1,25 +0,0 @@ -{ - "config_name": "gpu_rocm", - "iree_compile_flags": [ - "--iree-hal-target-backends=rocm", - "--iree-rocm-target-chip=gfx942", - "--iree-input-demote-f64-to-f32", - "--iree-opt-const-eval=false", - "--iree-codegen-transform-dialect-library=${IREE_TEST_PATH_EXTENSION}/attention_and_matmul_spec.mlir" - ], - "iree_run_module_flags": [ - "--device=hip" - ], - "skip_compile_tests": [ - "pytorch/models/sdxl-scheduled-unet-3-tank", - "pytorch/models/sdxl-prompt-encoder-tank", - "pytorch/models/sdxl-vae-decode-tank" - ], - "skip_run_tests": [], - "expected_compile_failures": [ - // TODO(#17344): need to regenerate .mlirbc - "pytorch/models/opt-125M", - "pytorch/models/resnet50" - ], - "expected_run_failures": [] -} diff --git a/build_tools/pkgci/external_test_suite/sdxl_prompt_encoder_cpu_llvm_task.json b/build_tools/pkgci/external_test_suite/sdxl_prompt_encoder_cpu_llvm_task.json deleted file mode 100644 index cc39c2d53d9e..000000000000 --- a/build_tools/pkgci/external_test_suite/sdxl_prompt_encoder_cpu_llvm_task.json +++ /dev/null @@ -1,22 +0,0 @@ -{ - "config_name": "cpu_llvm_task", - "iree_compile_flags" : [ - "--iree-hal-target-backends=llvm-cpu", - "--iree-llvmcpu-target-cpu-features=host" - ], - "iree_run_module_flags": [ - "--device=local-task", - "--parameters=model=real_weights.irpa", - "--input=1x64xi64=@inference_input.0.bin", - "--input=1x64xi64=@inference_input.1.bin", - "--input=1x64xi64=@inference_input.2.bin", - "--input=1x64xi64=@inference_input.3.bin", - "--expected_output=2x64x2048xf16=@inference_output.0.bin", - "--expected_output=2x1280xf16=@inference_output.1.bin", - "--expected_f16_threshold=1.0f" - ], - "skip_compile_tests": [], - "skip_run_tests": [], - "expected_compile_failures": [], - "expected_run_failures": [] -} diff --git 
a/build_tools/pkgci/external_test_suite/sdxl_prompt_encoder_gpu_rocm_gfx90a.json b/build_tools/pkgci/external_test_suite/sdxl_prompt_encoder_gpu_rocm_gfx90a.json deleted file mode 100644 index 1aabeb85f3fb..000000000000 --- a/build_tools/pkgci/external_test_suite/sdxl_prompt_encoder_gpu_rocm_gfx90a.json +++ /dev/null @@ -1,36 +0,0 @@ -{ - "config_name": "gpu_rocm", - "iree_compile_flags": [ - "--iree-hal-target-backends=rocm", - "--iree-rocm-target-chip=gfx90a", - "--iree-input-type=torch", - "--iree-opt-const-eval=false", - "--iree-global-opt-propagate-transposes=true", - "--iree-opt-outer-dim-concat=true", - "--iree-rocm-waves-per-eu=2", - "--iree-llvmgpu-enable-prefetch", - "--iree-flow-enable-aggressive-fusion", - "--iree-global-opt-enable-fuse-horizontal-contractions=true", - "--iree-opt-aggressively-propagate-transposes=true", - "--iree-codegen-llvmgpu-use-vector-distribution=true", - "--iree-execution-model=async-external", - "--iree-preprocessing-pass-pipeline=builtin.module(iree-preprocessing-transpose-convolution-pipeline, util.func(iree-preprocessing-pad-to-intrinsics{pad-target-type=conv}))", - "--iree-scheduling-dump-statistics-format=json", - "--iree-scheduling-dump-statistics-file=compilation_info.json" - ], - "iree_run_module_flags": [ - "--device=hip", - "--parameters=model=real_weights.irpa", - "--input=1x64xi64=@inference_input.0.bin", - "--input=1x64xi64=@inference_input.1.bin", - "--input=1x64xi64=@inference_input.2.bin", - "--input=1x64xi64=@inference_input.3.bin", - "--expected_output=2x64x2048xf16=@inference_output.0.bin", - "--expected_output=2x1280xf16=@inference_output.1.bin", - "--expected_f16_threshold=1.0f" - ], - "skip_compile_tests": [], - "skip_run_tests": [], - "expected_compile_failures": [], - "expected_run_failures": [] -} diff --git a/build_tools/pkgci/external_test_suite/sdxl_prompt_encoder_gpu_rocm_gfx942.json b/build_tools/pkgci/external_test_suite/sdxl_prompt_encoder_gpu_rocm_gfx942.json deleted file mode 100644 index 
e3dbc9b75b0c..000000000000 --- a/build_tools/pkgci/external_test_suite/sdxl_prompt_encoder_gpu_rocm_gfx942.json +++ /dev/null @@ -1,36 +0,0 @@ -{ - "config_name": "gpu_rocm", - "iree_compile_flags": [ - "--iree-hal-target-backends=rocm", - "--iree-rocm-target-chip=gfx942", - "--iree-input-type=torch", - "--iree-opt-const-eval=false", - "--iree-global-opt-propagate-transposes=true", - "--iree-opt-outer-dim-concat=true", - "--iree-rocm-waves-per-eu=2", - "--iree-llvmgpu-enable-prefetch", - "--iree-flow-enable-aggressive-fusion", - "--iree-global-opt-enable-fuse-horizontal-contractions=true", - "--iree-opt-aggressively-propagate-transposes=true", - "--iree-codegen-llvmgpu-use-vector-distribution=true", - "--iree-execution-model=async-external", - "--iree-preprocessing-pass-pipeline=builtin.module(iree-preprocessing-transpose-convolution-pipeline, util.func(iree-preprocessing-pad-to-intrinsics{pad-target-type=conv}))", - "--iree-scheduling-dump-statistics-format=json", - "--iree-scheduling-dump-statistics-file=compilation_info.json" - ], - "iree_run_module_flags": [ - "--device=hip", - "--parameters=model=real_weights.irpa", - "--input=1x64xi64=@inference_input.0.bin", - "--input=1x64xi64=@inference_input.1.bin", - "--input=1x64xi64=@inference_input.2.bin", - "--input=1x64xi64=@inference_input.3.bin", - "--expected_output=2x64x2048xf16=@inference_output.0.bin", - "--expected_output=2x1280xf16=@inference_output.1.bin", - "--expected_f16_threshold=1.0f" - ], - "skip_compile_tests": [], - "skip_run_tests": [], - "expected_compile_failures": [], - "expected_run_failures": [] -} diff --git a/build_tools/pkgci/external_test_suite/sdxl_scheduled_unet_cpu_llvm_task.json b/build_tools/pkgci/external_test_suite/sdxl_scheduled_unet_cpu_llvm_task.json deleted file mode 100644 index d2dff6f0a9f4..000000000000 --- a/build_tools/pkgci/external_test_suite/sdxl_scheduled_unet_cpu_llvm_task.json +++ /dev/null @@ -1,23 +0,0 @@ -{ - "config_name": "cpu_llvm_task", - "iree_compile_flags": 
[ - "--iree-hal-target-backends=llvm-cpu", - "--iree-llvmcpu-target-cpu-features=host", - "--iree-input-demote-f64-to-f32" - ], - "iree_run_module_flags": [ - "--device=local-task", - "--parameters=model=real_weights.irpa", - "--module=sdxl_scheduled_unet_pipeline_fp16_cpu.vmfb", - "--input=1x4x128x128xf16=@inference_input.0.bin", - "--input=2x64x2048xf16=@inference_input.1.bin", - "--input=2x1280xf16=@inference_input.2.bin", - "--input=1xf16=@inference_input.3.bin", - "--expected_output=1x4x128x128xf16=@inference_output.0.bin", - "--expected_f16_threshold=0.8f" - ], - "skip_compile_tests": [], - "skip_run_tests": [], - "expected_compile_failures": [], - "expected_run_failures": [] -} diff --git a/build_tools/pkgci/external_test_suite/sdxl_scheduled_unet_gpu_rocm_gfx90a.json b/build_tools/pkgci/external_test_suite/sdxl_scheduled_unet_gpu_rocm_gfx90a.json deleted file mode 100644 index 8ac4f1fc895b..000000000000 --- a/build_tools/pkgci/external_test_suite/sdxl_scheduled_unet_gpu_rocm_gfx90a.json +++ /dev/null @@ -1,39 +0,0 @@ -{ - "config_name": "gpu_rocm", - "iree_compile_flags" : [ - "--iree-hal-target-backends=rocm", - "--iree-rocm-target-chip=gfx90a", - "--iree-opt-const-eval=false", - "--iree-codegen-transform-dialect-library=${IREE_TEST_PATH_EXTENSION}/attention_and_matmul_spec.mlir", - "--iree-global-opt-propagate-transposes=true", - "--iree-global-opt-enable-fuse-horizontal-contractions=true", - "--iree-flow-enable-aggressive-fusion=true", - "--iree-opt-aggressively-propagate-transposes=true", - "--iree-opt-outer-dim-concat=true", - "--iree-vm-target-truncate-unsupported-floats", - "--iree-llvmgpu-enable-prefetch=true", - "--iree-opt-data-tiling=false", - "--iree-codegen-gpu-native-math-precision=true", - "--iree-codegen-llvmgpu-use-vector-distribution", - "--iree-rocm-waves-per-eu=2", - "--iree-execution-model=async-external", - "--iree-preprocessing-pass-pipeline=builtin.module(iree-preprocessing-transpose-convolution-pipeline, 
util.func(iree-preprocessing-pad-to-intrinsics))", - "--iree-scheduling-dump-statistics-format=json", - "--iree-scheduling-dump-statistics-file=compilation_info.json" - ], - "iree_run_module_flags": [ - "--device=hip", - "--parameters=model=real_weights.irpa", - "--module=sdxl_scheduled_unet_pipeline_fp16_rocm.vmfb", - "--input=1x4x128x128xf16=@inference_input.0.bin", - "--input=2x64x2048xf16=@inference_input.1.bin", - "--input=2x1280xf16=@inference_input.2.bin", - "--input=1xf16=@inference_input.3.bin", - "--expected_output=1x4x128x128xf16=@inference_output.0.bin", - "--expected_f16_threshold=0.7f" - ], - "skip_compile_tests": [], - "skip_run_tests": [], - "expected_compile_failures": [], - "expected_run_failures": [] -} diff --git a/build_tools/pkgci/external_test_suite/sdxl_scheduled_unet_gpu_rocm_gfx942.json b/build_tools/pkgci/external_test_suite/sdxl_scheduled_unet_gpu_rocm_gfx942.json deleted file mode 100644 index 289e99b2af17..000000000000 --- a/build_tools/pkgci/external_test_suite/sdxl_scheduled_unet_gpu_rocm_gfx942.json +++ /dev/null @@ -1,41 +0,0 @@ -{ - "config_name": "gpu_rocm", - "iree_compile_flags" : [ - "--iree-hal-target-backends=rocm", - "--iree-rocm-target-chip=gfx942", - "--iree-opt-const-eval=false", - "--iree-codegen-transform-dialect-library=${IREE_TEST_PATH_EXTENSION}/attention_and_matmul_spec.mlir", - "--iree-global-opt-propagate-transposes=true", - "--iree-global-opt-enable-fuse-horizontal-contractions=true", - "--iree-flow-enable-aggressive-fusion=true", - "--iree-opt-aggressively-propagate-transposes=true", - "--iree-opt-outer-dim-concat=true", - "--iree-vm-target-truncate-unsupported-floats", - "--iree-llvmgpu-enable-prefetch=true", - "--iree-opt-data-tiling=false", - "--iree-codegen-gpu-native-math-precision=true", - "--iree-codegen-llvmgpu-use-vector-distribution", - "--iree-rocm-waves-per-eu=2", - "--iree-execution-model=async-external", - 
"--iree-preprocessing-pass-pipeline=builtin.module(iree-preprocessing-transpose-convolution-pipeline, util.func(iree-preprocessing-pad-to-intrinsics))", - "--iree-scheduling-dump-statistics-format=json", - "--iree-scheduling-dump-statistics-file=compilation_info.json" - ], - "iree_run_module_flags": [ - "--device=hip", - "--parameters=model=real_weights.irpa", - "--module=sdxl_scheduled_unet_pipeline_fp16_rocm.vmfb", - "--input=1x4x128x128xf16=@inference_input.0.bin", - "--input=2x64x2048xf16=@inference_input.1.bin", - "--input=2x1280xf16=@inference_input.2.bin", - "--input=1xf16=@inference_input.3.bin", - "--expected_output=1x4x128x128xf16=@inference_output.0.bin", - "--expected_f16_threshold=0.7f" - ], - "skip_compile_tests": [], - "skip_run_tests": [], - "expected_compile_failures": [], - "expected_run_failures": [ - "pytorch/models/sdxl-scheduled-unet-3-tank", - ] -} diff --git a/build_tools/pkgci/external_test_suite/sdxl_vae_decode_cpu_llvm_task.json b/build_tools/pkgci/external_test_suite/sdxl_vae_decode_cpu_llvm_task.json deleted file mode 100644 index 0a8a48c4f00d..000000000000 --- a/build_tools/pkgci/external_test_suite/sdxl_vae_decode_cpu_llvm_task.json +++ /dev/null @@ -1,20 +0,0 @@ -{ - "config_name": "cpu_llvm_task", - "iree_compile_flags" : [ - "--iree-hal-target-backends=llvm-cpu", - "--iree-llvmcpu-target-cpu-features=host" - ], - "iree_run_module_flags": [ - "--device=local-task", - "--parameters=model=real_weights.irpa", - "--input=1x4x128x128xf16=@inference_input.0.bin", - "--expected_output=1x3x1024x1024xf16=@inference_output.0.bin", - "--expected_f16_threshold=0.02f" - ], - "skip_compile_tests": [], - "skip_run_tests": [], - "expected_compile_failures": [], - "expected_run_failures": [ - "pytorch/models/sdxl-vae-decode-tank" - ] -} diff --git a/build_tools/pkgci/external_test_suite/sdxl_vae_decode_gpu_rocm_gfx90a.json b/build_tools/pkgci/external_test_suite/sdxl_vae_decode_gpu_rocm_gfx90a.json deleted file mode 100644 index 
690bffa994ea..000000000000 --- a/build_tools/pkgci/external_test_suite/sdxl_vae_decode_gpu_rocm_gfx90a.json +++ /dev/null @@ -1,29 +0,0 @@ -{ - "config_name": "gpu_rocm", - "iree_compile_flags" : [ - "--iree-hal-target-backends=rocm", - "--iree-rocm-target-chip=gfx90a", - "--iree-opt-const-eval=false", - "--iree-global-opt-propagate-transposes=true", - "--iree-opt-outer-dim-concat=true", - "--iree-llvmgpu-enable-prefetch=true", - "--iree-rocm-waves-per-eu=2", - "--iree-flow-enable-aggressive-fusion", - "--iree-codegen-llvmgpu-use-vector-distribution=true", - "--iree-execution-model=async-external", - "--iree-preprocessing-pass-pipeline=builtin.module(iree-preprocessing-transpose-convolution-pipeline, util.func(iree-preprocessing-pad-to-intrinsics))", - "--iree-scheduling-dump-statistics-format=json", - "--iree-scheduling-dump-statistics-file=compilation_info.json" - ], - "iree_run_module_flags": [ - "--device=hip", - "--parameters=model=real_weights.irpa", - "--input=1x4x128x128xf16=@inference_input.0.bin", - "--expected_output=1x3x1024x1024xf16=@inference_output.0.bin", - "--expected_f16_threshold=0.4f" - ], - "skip_compile_tests": [], - "skip_run_tests": [], - "expected_compile_failures": [], - "expected_run_failures": [] -} diff --git a/build_tools/pkgci/external_test_suite/sdxl_vae_decode_gpu_rocm_gfx942.json b/build_tools/pkgci/external_test_suite/sdxl_vae_decode_gpu_rocm_gfx942.json deleted file mode 100644 index 1ea72517f283..000000000000 --- a/build_tools/pkgci/external_test_suite/sdxl_vae_decode_gpu_rocm_gfx942.json +++ /dev/null @@ -1,29 +0,0 @@ -{ - "config_name": "gpu_rocm", - "iree_compile_flags" : [ - "--iree-hal-target-backends=rocm", - "--iree-rocm-target-chip=gfx942", - "--iree-opt-const-eval=false", - "--iree-global-opt-propagate-transposes=true", - "--iree-opt-outer-dim-concat=true", - "--iree-llvmgpu-enable-prefetch=true", - "--iree-rocm-waves-per-eu=2", - "--iree-flow-enable-aggressive-fusion", - 
"--iree-codegen-llvmgpu-use-vector-distribution=true", - "--iree-execution-model=async-external", - "--iree-preprocessing-pass-pipeline=builtin.module(iree-preprocessing-transpose-convolution-pipeline, util.func(iree-preprocessing-pad-to-intrinsics))", - "--iree-scheduling-dump-statistics-format=json", - "--iree-scheduling-dump-statistics-file=compilation_info.json" - ], - "iree_run_module_flags": [ - "--device=hip", - "--parameters=model=real_weights.irpa", - "--input=1x4x128x128xf16=@inference_input.0.bin", - "--expected_output=1x3x1024x1024xf16=@inference_output.0.bin", - "--expected_f16_threshold=0.4f" - ], - "skip_compile_tests": [], - "skip_run_tests": [], - "expected_compile_failures": [], - "expected_run_failures": [] -} diff --git a/experimental/benchmarks/sdxl/benchmark_sdxl_rocm.py b/experimental/benchmarks/sdxl/benchmark_sdxl_rocm.py new file mode 100644 index 000000000000..25a17050cb2a --- /dev/null +++ b/experimental/benchmarks/sdxl/benchmark_sdxl_rocm.py @@ -0,0 +1,407 @@ +# Copyright 2024 The IREE Authors +# +# Licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. 
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +import os +from collections import namedtuple +import logging +from typing import Sequence +import subprocess +import json +from pathlib import Path +import tabulate +from pytest_check import check + +benchmark_dir = os.path.dirname(os.path.realpath(__file__)) +artifacts_dir = os.getenv("IREE_TEST_FILES", default=Path.cwd()) + "/artifacts" +artifacts_dir = Path(os.path.expanduser(artifacts_dir)).resolve() +prompt_encoder_dir = f"{artifacts_dir}/sdxl_clip" +scheduled_unet_dir = f"{artifacts_dir}/sdxl_unet" +vae_decode_dir = f"{artifacts_dir}/sdxl_vae" + + +def run_iree_command(args: Sequence[str] = ()): + command = "Exec:", " ".join(args) + logging.getLogger().info(command) + proc = subprocess.run( + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=False + ) + ( + stdout_v, + stderr_v, + ) = ( + proc.stdout, + proc.stderr, + ) + return_code = proc.returncode + if return_code == 0: + return 0, proc.stdout + logging.getLogger().info(f"Command failed with error: {proc.stderr}") + return 1, proc.stdout + + +def run_sdxl_rocm_benchmark(rocm_chip, gpu_number): + exec_args = [ + "iree-compile", + f"{benchmark_dir}/sdxl_pipeline_bench_f16.mlir", + "--iree-hal-target-backends=rocm", + f"--iree-rocm-target-chip={rocm_chip}", + "--iree-global-opt-propagate-transposes=true", + "--iree-codegen-llvmgpu-use-vector-distribution", + "--iree-codegen-gpu-native-math-precision=true", + "--iree-rocm-waves-per-eu=2", + "--iree-opt-outer-dim-concat=true", + "--iree-llvmgpu-enable-prefetch", + "-o", + f"{benchmark_dir}/sdxl_full_pipeline_fp16_rocm.vmfb", + ] + # iree compile command for full sdxl pipeline + ret_value, stdout = run_iree_command(exec_args) + if ret_value == 1: + return 1, stdout + exec_args = [ + "iree-benchmark-module", + f"--device=hip://{gpu_number}", + "--device_allocator=caching", + f"--module={prompt_encoder_dir}/model.rocm_{rocm_chip}.vmfb", + 
f"--parameters=model={prompt_encoder_dir}/real_weights.irpa", + f"--module={scheduled_unet_dir}/model.rocm_{rocm_chip}.vmfb", + f"--parameters=model={scheduled_unet_dir}/real_weights.irpa", + f"--module={vae_decode_dir}/model.rocm_{rocm_chip}.vmfb", + f"--parameters=model={vae_decode_dir}/real_weights.irpa", + f"--module={benchmark_dir}/sdxl_full_pipeline_fp16_rocm.vmfb", + "--function=tokens_to_image", + "--input=1x4x128x128xf16", + "--input=1xf16", + "--input=1x64xi64", + "--input=1x64xi64", + "--input=1x64xi64", + "--input=1x64xi64", + "--benchmark_repetitions=10", + "--benchmark_min_warmup_time=3.0", + ] + # iree benchmark command for full sdxl pipeline + return run_iree_command(exec_args) + + +def run_sdxl_unet_rocm_benchmark(gpu_number, rocm_chip): + exec_args = [ + "iree-benchmark-module", + f"--device=hip://{gpu_number}", + "--device_allocator=caching", + f"--module={scheduled_unet_dir}/model.rocm_{rocm_chip}.vmfb", + f"--parameters=model={scheduled_unet_dir}/real_weights.irpa", + "--function=run_forward", + "--input=1x4x128x128xf16", + "--input=2x64x2048xf16", + "--input=2x1280xf16", + "--input=2x6xf16", + "--input=1xf16", + "--input=1xi64", + "--benchmark_repetitions=10", + "--benchmark_min_warmup_time=3.0", + ] + # iree benchmark command for full sdxl pipeline + return run_iree_command(exec_args) + + +def run_sdxl_prompt_encoder_rocm_benchmark(gpu_number, rocm_chip): + exec_args = [ + "iree-benchmark-module", + f"--device=hip://{gpu_number}", + "--device_allocator=caching", + f"--module={prompt_encoder_dir}/model.rocm_{rocm_chip}.vmfb", + f"--parameters=model={prompt_encoder_dir}/real_weights.irpa", + "--function=encode_prompts", + "--input=1x64xi64", + "--input=1x64xi64", + "--input=1x64xi64", + "--input=1x64xi64", + "--benchmark_repetitions=10", + "--benchmark_min_warmup_time=3.0", + ] + # iree benchmark command for full sdxl pipeline + return run_iree_command(exec_args) + + +def run_sdxl_vae_decode_rocm_benchmark(gpu_number, rocm_chip): + exec_args = 
[ + "iree-benchmark-module", + f"--device=hip://{gpu_number}", + "--device_allocator=caching", + f"--module={vae_decode_dir}/model.rocm_{rocm_chip}.vmfb", + f"--parameters=model={vae_decode_dir}/real_weights.irpa", + "--function=main", + "--input=1x4x128x128xf16", + "--benchmark_repetitions=10", + "--benchmark_min_warmup_time=3.0", + ] + # iree benchmark command for full sdxl pipeline + return run_iree_command(exec_args) + + +BenchmarkResult = namedtuple( + "BenchmarkResult", "benchmark_name time cpu_time iterations user_counters" +) + + +def decode_output(bench_lines): + benchmark_results = [] + for line in bench_lines: + split = line.split() + if len(split) == 0: + continue + benchmark_name = split[0] + time = " ".join(split[1:3]) + cpu_time = " ".join(split[3:5]) + iterations = split[5] + user_counters = None + if len(split) > 5: + user_counters = split[6] + benchmark_results.append( + BenchmarkResult( + benchmark_name=benchmark_name, + time=time, + cpu_time=cpu_time, + iterations=iterations, + user_counters=user_counters, + ) + ) + return benchmark_results + + +def job_summary_process(ret_value, output): + if ret_value == 1: + logging.getLogger().info("Running SDXL ROCm benchmark failed. 
Exiting") + return + bench_lines = output.decode().split("\n")[3:] + benchmark_results = decode_output(bench_lines) + logging.getLogger().info(benchmark_results) + benchmark_mean_time = float(benchmark_results[10].time.split()[0]) + return benchmark_mean_time + + +def test_sdxl_rocm_benchmark( + goldentime_rocm_e2e, + goldentime_rocm_unet, + goldentime_rocm_clip, + goldentime_rocm_vae, + gpu_number, + rocm_chip, + goldendispatch_rocm_unet, + goldendispatch_rocm_clip, + goldendispatch_rocm_vae, + goldensize_rocm_unet, + goldensize_rocm_clip, + goldensize_rocm_vae, +): + # e2e benchmark + ret_value, output = run_sdxl_rocm_benchmark(rocm_chip, gpu_number) + benchmark_e2e_mean_time = job_summary_process(ret_value, output) + mean_line = ( + f"E2E Benchmark Time: {str(benchmark_e2e_mean_time)} ms" + f" (golden time {goldentime_rocm_e2e} ms)" + ) + logging.getLogger().info(mean_line) + + # unet benchmark + ret_value, output = run_sdxl_unet_rocm_benchmark(gpu_number, rocm_chip) + benchmark_unet_mean_time = job_summary_process(ret_value, output) + mean_line = ( + f"Scheduled Unet Benchmark Time: {str(benchmark_unet_mean_time)} ms" + f" (golden time {goldentime_rocm_unet} ms)" + ) + logging.getLogger().info(mean_line) + + # unet compilation stats check + with open(f"{scheduled_unet_dir}/compilation_info.json", "r") as file: + comp_stats = json.load(file) + unet_dispatch_count = int( + comp_stats["stream-aggregate"]["execution"]["dispatch-count"] + ) + compilation_line = ( + f"Scheduled Unet Dispatch Count: {unet_dispatch_count}" + f" (golden dispatch count {goldendispatch_rocm_unet})" + ) + logging.getLogger().info(compilation_line) + + module_path = f"{scheduled_unet_dir}/model.rocm_{rocm_chip}.vmfb" + unet_binary_size = Path(module_path).stat().st_size + compilation_line = ( + f"Scheduled Unet Binary Size: {unet_binary_size} bytes" + f" (golden binary size {goldensize_rocm_unet} bytes)" + ) + logging.getLogger().info(compilation_line) + + # prompt encoder benchmark + 
ret_value, output = run_sdxl_prompt_encoder_rocm_benchmark(gpu_number, rocm_chip) + benchmark_clip_mean_time = job_summary_process(ret_value, output) + mean_line = ( + f"Prompt Encoder Benchmark Time: {str(benchmark_clip_mean_time)} ms" + f" (golden time {goldentime_rocm_clip} ms)" + ) + logging.getLogger().info(mean_line) + + # prompt encoder compilation stats check + with open(f"{prompt_encoder_dir}/compilation_info.json", "r") as file: + comp_stats = json.load(file) + clip_dispatch_count = int( + comp_stats["stream-aggregate"]["execution"]["dispatch-count"] + ) + compilation_line = ( + f"Prompt Encoder Dispatch Count: {clip_dispatch_count}" + f" (golden dispatch count {goldendispatch_rocm_clip})" + ) + logging.getLogger().info(compilation_line) + + module_path = f"{prompt_encoder_dir}/model.rocm_{rocm_chip}.vmfb" + clip_binary_size = Path(module_path).stat().st_size + compilation_line = ( + f"Prompt Encoder Binary Size: {clip_binary_size} bytes" + f" (golden binary size {goldensize_rocm_clip} bytes)" + ) + logging.getLogger().info(compilation_line) + + # vae decode benchmark + ret_value, output = run_sdxl_vae_decode_rocm_benchmark(gpu_number, rocm_chip) + benchmark_vae_mean_time = job_summary_process(ret_value, output) + mean_line = ( + f"VAE Decode Benchmark Time: {str(benchmark_vae_mean_time)} ms" + f" (golden time {goldentime_rocm_vae} ms)" + ) + logging.getLogger().info(mean_line) + + # vae decode compilation stats check + with open(f"{vae_decode_dir}/compilation_info.json", "r") as file: + comp_stats = json.load(file) + vae_dispatch_count = int( + comp_stats["stream-aggregate"]["execution"]["dispatch-count"] + ) + compilation_line = ( + f"VAE Decode Dispatch Count: {vae_dispatch_count}" + f" (golden dispatch count {goldendispatch_rocm_vae})" + ) + logging.getLogger().info(compilation_line) + + module_path = f"{vae_decode_dir}/model.rocm_{rocm_chip}.vmfb" + vae_binary_size = Path(module_path).stat().st_size + compilation_line = ( + f"VAE Decode Binary Size: 
{vae_binary_size} bytes" + f" (golden binary size {goldensize_rocm_vae} bytes)" + ) + logging.getLogger().info(compilation_line) + + # Create mean time table's header and rows + mean_time_header = ["Benchmark", "Current time (ms)", "Expected/golden time (ms)"] + mean_time_rows = [ + ["E2E†", f"{benchmark_e2e_mean_time}", f"{goldentime_rocm_e2e}"], + ["Scheduled Unet", f"{benchmark_unet_mean_time}", f"{goldentime_rocm_unet}"], + ["Prompt Encoder", f"{benchmark_clip_mean_time}", f"{goldentime_rocm_clip}"], + ["VAE Decode", f"{benchmark_vae_mean_time}", f"{goldentime_rocm_vae}"], + ] + + # Create dispatch count table's header and rows + dispatch_count_header = [ + "Benchmark", + "Current dispatch count", + "Expected/golden dispatch count", + ] + dispatch_count_rows = [ + ["Scheduled Unet", f"{unet_dispatch_count}", f"{goldendispatch_rocm_unet}"], + ["Prompt Encoder", f"{clip_dispatch_count}", f"{goldendispatch_rocm_clip}"], + ["VAE Decode", f"{vae_dispatch_count}", f"{goldendispatch_rocm_vae}"], + ] + + # Create binary size table's header and rows + binary_size_header = [ + "Benchmark", + "Current binary size (bytes)", + "Expected/golden binary size (bytes)", + ] + binary_size_rows = [ + ["Scheduled Unet", f"{unet_binary_size}", f"{goldensize_rocm_unet}"], + ["Prompt Encoder", f"{clip_binary_size}", f"{goldensize_rocm_clip}"], + ["VAE Decode", f"{vae_binary_size}", f"{goldensize_rocm_vae}"], + ] + + # Create mean time table using tabulate + mean_time_full = [mean_time_header] + mean_time_rows + mean_time_table = tabulate.tabulate( + mean_time_full, headers="firstrow", tablefmt="pipe" + ) + + # Create dispatch count table using tabulate + dispatch_count_full = [dispatch_count_header] + dispatch_count_rows + dispatch_count_table = tabulate.tabulate( + dispatch_count_full, headers="firstrow", tablefmt="pipe" + ) + + # Create binary size of compiled artifacts table using tabulate + binary_size_full = [binary_size_header] + binary_size_rows + binary_size_table = 
tabulate.tabulate( + binary_size_full, headers="firstrow", tablefmt="pipe" + ) + + # Write markdown tables to job summary file + with open("job_summary.md", "w") as job_summary: + print("SDXL Benchmark Summary:\n", file=job_summary) + print(mean_time_table, file=job_summary) + print("\n† E2E = Encode + Scheduled Unet * 3 + Decode\n", file=job_summary) + print(dispatch_count_table, file=job_summary) + print("\n", file=job_summary) + print(binary_size_table, file=job_summary) + + # Check all values are either <= than golden values for times and == for compilation statistics. + + check.less_equal( + benchmark_e2e_mean_time, + goldentime_rocm_e2e, + "SDXL e2e benchmark time should not regress", + ) + check.less_equal( + benchmark_unet_mean_time, + goldentime_rocm_unet, + "SDXL unet benchmark time should not regress", + ) + check.equal( + unet_dispatch_count, + goldendispatch_rocm_unet, + "SDXL scheduled unet dispatch count should not regress", + ) + check.less_equal( + unet_binary_size, + goldensize_rocm_unet, + "SDXL scheduled unet binary size should not get bigger", + ) + check.less_equal( + benchmark_clip_mean_time, + goldentime_rocm_clip, + "SDXL prompt encoder benchmark time should not regress", + ) + check.equal( + clip_dispatch_count, + goldendispatch_rocm_clip, + "SDXL prompt encoder dispatch count should not regress", + ) + check.less_equal( + clip_binary_size, + goldensize_rocm_clip, + "SDXL prompt encoder binary size should not get bigger", + ) + check.less_equal( + benchmark_vae_mean_time, + goldentime_rocm_vae, + "SDXL vae decode benchmark time should not regress", + ) + check.equal( + vae_dispatch_count, + goldendispatch_rocm_vae, + "SDXL vae decode dispatch count should not regress", + ) + check.less_equal( + vae_binary_size, + goldensize_rocm_vae, + "SDXL vae decode binary size should not get bigger", + ) diff --git a/experimental/benchmarks/sdxl/conftest.py b/experimental/benchmarks/sdxl/conftest.py new file mode 100644 index 000000000000..9ac43d9995ff 
--- /dev/null +++ b/experimental/benchmarks/sdxl/conftest.py @@ -0,0 +1,144 @@ +import pytest + + +def pytest_addoption(parser): + parser.addoption( + "--goldentime-rocm-e2e-ms", + action="store", + type=float, + help="Golden time to test benchmark", + ) + parser.addoption( + "--goldentime-rocm-unet-ms", + action="store", + type=float, + help="Golden time to test benchmark", + ) + parser.addoption( + "--goldentime-rocm-clip-ms", + action="store", + type=float, + help="Golden time to test benchmark", + ) + parser.addoption( + "--goldentime-rocm-vae-ms", + action="store", + type=float, + help="Golden time to test benchmark", + ) + parser.addoption( + "--goldendispatch-rocm-unet", + action="store", + default=1718, + type=int, + help="Golden dispatch count to test benchmark", + ) + parser.addoption( + "--goldendispatch-rocm-clip", + action="store", + default=1571, + type=int, + help="Golden dispatch count to test benchmark", + ) + parser.addoption( + "--goldendispatch-rocm-vae", + action="store", + default=250, + type=int, + help="Golden dispatch count to test benchmark", + ) + parser.addoption( + "--goldensize-rocm-unet-bytes", + action="store", + default=2088217, + type=int, + help="Golden vmfb size to test benchmark", + ) + parser.addoption( + "--goldensize-rocm-clip-bytes", + action="store", + default=785493, + type=int, + help="Golden vmfb size to test benchmark", + ) + parser.addoption( + "--goldensize-rocm-vae-bytes", + action="store", + default=762067, + type=int, + help="Golden vmfb size to test benchmark", + ) + parser.addoption( + "--gpu-number", + action="store", + default=0, + type=int, + help="IREE GPU device number to test on", + ) + parser.addoption( + "--rocm-chip", + action="store", + default="gfx90a", + type=str, + help="ROCm target chip configuration of GPU", + ) + + +@pytest.fixture +def goldentime_rocm_e2e(request): + return request.config.getoption("--goldentime-rocm-e2e-ms") + + +@pytest.fixture +def goldentime_rocm_unet(request): + return 
request.config.getoption("--goldentime-rocm-unet-ms") + + +@pytest.fixture +def goldentime_rocm_clip(request): + return request.config.getoption("--goldentime-rocm-clip-ms") + + +@pytest.fixture +def goldentime_rocm_vae(request): + return request.config.getoption("--goldentime-rocm-vae-ms") + + +@pytest.fixture +def goldendispatch_rocm_unet(request): + return request.config.getoption("--goldendispatch-rocm-unet") + + +@pytest.fixture +def goldendispatch_rocm_clip(request): + return request.config.getoption("--goldendispatch-rocm-clip") + + +@pytest.fixture +def goldendispatch_rocm_vae(request): + return request.config.getoption("--goldendispatch-rocm-vae") + + +@pytest.fixture +def goldensize_rocm_unet(request): + return request.config.getoption("--goldensize-rocm-unet-bytes") + + +@pytest.fixture +def goldensize_rocm_clip(request): + return request.config.getoption("--goldensize-rocm-clip-bytes") + + +@pytest.fixture +def goldensize_rocm_vae(request): + return request.config.getoption("--goldensize-rocm-vae-bytes") + + +@pytest.fixture +def rocm_chip(request): + return request.config.getoption("--rocm-chip") + + +@pytest.fixture +def gpu_number(request): + return request.config.getoption("--gpu-number") diff --git a/experimental/benchmarks/sdxl/sdxl_pipeline_bench_f16.mlir b/experimental/benchmarks/sdxl/sdxl_pipeline_bench_f16.mlir new file mode 100644 index 000000000000..cbf58e458ff4 --- /dev/null +++ b/experimental/benchmarks/sdxl/sdxl_pipeline_bench_f16.mlir @@ -0,0 +1,23 @@ +module @sdxl_compiled_pipeline { + func.func private @compiled_scheduled_unet.run_initialize(%arg0: tensor<1x4x128x128xf16>) -> (tensor<1x4x128x128xf16>, tensor<2x6xf16>, tensor) attributes {torch.args_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: \22builtins.list\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}]}, {\22type\22: \22builtins.dict\22, 
\22context\22: \22[]\22, \22children_spec\22: []}]}]", torch.return_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]}]"} + func.func private @compiled_scheduled_unet.run_forward(%arg0: tensor<1x4x128x128xf16>, %arg1: tensor<2x64x2048xf16>, %arg2: tensor<2x1280xf16>, %arg3: tensor<2x6xf16>, %arg4: tensor<1xf16>, %arg5: tensor<1xi64>) -> tensor<1x4x128x128xf16> attributes {torch.args_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: \22builtins.list\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]}, {\22type\22: \22builtins.dict\22, \22context\22: \22[]\22, \22children_spec\22: []}]}]", torch.return_schema = "[1, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]"} + func.func private @compiled_clip.encode_prompts(%arg0: tensor<1x64xi64>, %arg1: tensor<1x64xi64>, %arg2: tensor<1x64xi64>, %arg3: tensor<1x64xi64>) -> (tensor<2x64x2048xf16>, tensor<2x1280xf16>) attributes {torch.args_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: \22builtins.list\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, 
{\22type\22: null, \22context\22: null, \22children_spec\22: []}]}, {\22type\22: \22builtins.dict\22, \22context\22: \22[]\22, \22children_spec\22: []}]}]", torch.return_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]}]"} + func.func private @compiled_vae.main(%arg0: tensor<1x4x128x128xf16>) -> tensor<1x3x1024x1024xf16> attributes {torch.args_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: \22builtins.list\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}]}, {\22type\22: \22builtins.dict\22, \22context\22: \22[]\22, \22children_spec\22: []}]}]", torch.return_schema = "[1, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]"} + + func.func @tokens_to_image(%sample: tensor<1x4x128x128xf16>, %guidance_scale: tensor<1xf16>, %t_ids_1: tensor<1x64xi64>, %t_ids_2: tensor<1x64xi64>, %u_ids_1: tensor<1x64xi64>, %u_ids_2: tensor<1x64xi64>) -> tensor<1x3x1024x1024xf16> { + %p_embeds, %t_embeds = func.call @compiled_clip.encode_prompts(%t_ids_1, %t_ids_2, %u_ids_1, %u_ids_2) : (tensor<1x64xi64>, tensor<1x64xi64>, tensor<1x64xi64>, tensor<1x64xi64>) -> (tensor<2x64x2048xf16>, tensor<2x1280xf16>) + %noisy_sample, %time_ids, %steps = func.call @compiled_scheduled_unet.run_initialize(%sample) : (tensor<1x4x128x128xf16>) -> (tensor<1x4x128x128xf16>, tensor<2x6xf16>, tensor) + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %steps_int = tensor.extract %steps[] : tensor + %n_steps = arith.index_cast %steps_int: i64 to index + %res = scf.for %arg0 = %c0 to %n_steps step %c1 iter_args(%arg = %noisy_sample) -> (tensor<1x4x128x128xf16>) { + %step_64 = arith.index_cast %arg0 : index to i64 + %this_step = tensor.from_elements %step_64 : tensor<1xi64> 
+ %inner = func.call @compiled_scheduled_unet.run_forward(%arg, %p_embeds, %t_embeds, %time_ids, %guidance_scale, %this_step) : (tensor<1x4x128x128xf16>, tensor<2x64x2048xf16>, tensor<2x1280xf16>, tensor<2x6xf16>, tensor<1xf16>, tensor<1xi64>) -> tensor<1x4x128x128xf16> + scf.yield %inner : tensor<1x4x128x128xf16> + } + %image = func.call @compiled_vae.main(%res): (tensor<1x4x128x128xf16>) -> tensor<1x3x1024x1024xf16> + return %image : tensor<1x3x1024x1024xf16> + } +} diff --git a/experimental/regression_suite/ireers/artifacts.py b/experimental/regression_suite/ireers/artifacts.py deleted file mode 100644 index 12ad3808e48f..000000000000 --- a/experimental/regression_suite/ireers/artifacts.py +++ /dev/null @@ -1,154 +0,0 @@ -# Copyright 2023 The IREE Authors -# -# Licensed under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -from typing import Any, Callable, Collection, Dict, Union -import functools -from pathlib import Path -from tqdm import tqdm -import urllib.parse -import urllib.request - - -def show_progress(t): - last_b = [0] - - def update_to(b=1, bsize=1, tsize=None): - if tsize is not None: - t.total = tsize - t.update((b - last_b[0]) * bsize) - last_b[0] = b - - return update_to - - -@functools.cache -def get_artifact_root_dir() -> Path: - # TODO: Make configurable. 
- return Path.cwd() / "artifacts" - - -class ArtifactGroup: - """A group of artifacts with a persistent location on disk.""" - - _INSTANCES: Dict[str, "ArtifactGroup"] = {} - - def __init__(self, group_name: str): - self.group_name = group_name - if group_name: - self.directory = get_artifact_root_dir() / group_name - else: - self.directory = get_artifact_root_dir() - self.directory.mkdir(parents=True, exist_ok=True) - - @classmethod - def get(cls, group: Union["ArtifactGroup", str]) -> "ArtifactGroup": - if isinstance(group, ArtifactGroup): - return group - try: - return cls._INSTANCES[group] - except KeyError: - instance = ArtifactGroup(group) - cls._INSTANCES[group] = instance - return instance - - -class Artifact: - """Some form of artifact materialized to disk.""" - - def __init__( - self, - group: Union[ArtifactGroup, str], - name: str, - depends: Collection["Artifact"] = (), - ): - self.group = ArtifactGroup.get(group) - self.name = name - self.depends = tuple(depends) - - @property - def path(self) -> Path: - return self.group.directory / self.name - - def join(self): - """Waits for the artifact to become available.""" - pass - - def __str__(self): - return str(self.path) - - -class ProducedArtifact(Artifact): - def __init__( - self, - group: Union[ArtifactGroup, str], - name: str, - callback: Callable[["ProducedArtifact"], Any], - *, - always_produce: bool = False, - depends: Collection["Artifact"] = (), - ): - self.group = ArtifactGroup.get(group) - super().__init__(group, name, depends) - self.name = name - self.callback = callback - self.always_produce = always_produce - - @property - def stamp_path(self) -> Path: - """Path of a stamp file which indicates successful transfer.""" - return self.path.with_suffix(self.path.suffix + ".stamp") - - def start(self) -> "ProducedArtifact": - if not self.always_produce and self.stamp_path.exists(): - if self.path.exists(): - print(f"Not producing {self} because it has already been produced") - return self - 
self.stamp_path.unlink() - self.callback(self) - if not self.path.exists(): - raise RuntimeError( - f"Artifact {self} succeeded generation but was not produced" - ) - self.stamp() - return self - - def stamp(self): - self.stamp_path.touch() - - -class FetchedArtifact(ProducedArtifact): - """Represents an artifact that is to be fetched.""" - - def __init__(self, group: Union[ArtifactGroup, str], url: str): - name = Path(urllib.parse.urlparse(url).path).name - super().__init__(group, name, FetchedArtifact._callback) - self.url = url - - @staticmethod - def _callback(self: "FetchedArtifact"): - print(f"Downloading {self.url} -> {self.path}", flush=True, end="") - with tqdm( - unit="B", - unit_scale=True, - unit_divisor=1024, - miniters=1, - desc=str(self.path), - ) as t: - urllib.request.urlretrieve(self.url, self.path, reporthook=show_progress(t)) - print(f": Retrieved {self.path.stat().st_size} bytes") - - -class StreamArtifact(Artifact): - def __init__(self, group: Union[ArtifactGroup, str], name: str): - super().__init__(group, name) - self.io = open(self.path, "ab", buffering=0) - - def __del__(self): - self.io.close() - - def write_line(self, line: Union[str, bytes]): - contents = line if isinstance(line, bytes) else line.encode() - self.io.write(contents + b"\n") diff --git a/experimental/regression_suite/ireers/__init__.py b/experimental/regression_suite/ireers_tools/__init__.py similarity index 100% rename from experimental/regression_suite/ireers/__init__.py rename to experimental/regression_suite/ireers_tools/__init__.py diff --git a/experimental/regression_suite/ireers_tools/artifacts.py b/experimental/regression_suite/ireers_tools/artifacts.py new file mode 100644 index 000000000000..056039364341 --- /dev/null +++ b/experimental/regression_suite/ireers_tools/artifacts.py @@ -0,0 +1,234 @@ +# Copyright 2023 The IREE Authors +# +# Licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. 
# Copyright 2023 The IREE Authors
#
# Licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

"""Artifact download/caching helpers for the regression suite (ireers_tools).

Artifacts live under $IREE_TEST_FILES/artifacts/<group>/ and are fetched on
demand; Azure-hosted files are skipped when the local MD5 already matches the
blob's content_md5.
"""

from typing import Any, Callable, Collection, Dict, Union
import functools
import hashlib
import logging
import mmap
import os
import re
import urllib.parse
import urllib.request
from pathlib import Path

# NOTE(review): the original also imported `tqdm`, but nothing in this module
# uses it anymore (the old urlretrieve-based fetch was replaced by the Azure
# client), so the unused import is dropped.
logger = logging.getLogger(__name__)
# Adjust logging levels: keep our INFO messages, quiet the chatty azure SDK.
logging.basicConfig(level=logging.INFO)
for _log_name in list(logging.Logger.manager.loggerDict):
    if _log_name.startswith("azure"):
        logging.getLogger(_log_name).setLevel(logging.WARNING)


def show_progress(t):
    """Return an urlretrieve-style reporthook that advances progress bar `t`.

    `t` is expected to expose `total` and `update()` (tqdm-compatible) —
    the bar object is supplied by the caller.
    """
    last_b = [0]

    def update_to(b=1, bsize=1, tsize=None):
        if tsize is not None:
            t.total = tsize
        t.update((b - last_b[0]) * bsize)
        last_b[0] = b

    return update_to


@functools.cache
def get_artifact_root_dir() -> Path:
    """Root directory for all cached artifacts ($IREE_TEST_FILES/artifacts)."""
    root_path = os.getenv("IREE_TEST_FILES", default=str(Path.cwd())) + "/artifacts"
    return Path(os.path.expanduser(root_path)).resolve()


class ArtifactGroup:
    """A group of artifacts with a persistent location on disk."""

    # Interned instances, keyed by group name.
    _INSTANCES: Dict[str, "ArtifactGroup"] = {}

    def __init__(self, group_name: str):
        self.group_name = group_name
        # An empty group name maps straight to the artifact root.
        if group_name:
            self.directory = get_artifact_root_dir() / group_name
        else:
            self.directory = get_artifact_root_dir()
        self.directory.mkdir(parents=True, exist_ok=True)

    @classmethod
    def get(cls, group: Union["ArtifactGroup", str]) -> "ArtifactGroup":
        """Intern groups by name so each name maps to a single instance."""
        if isinstance(group, ArtifactGroup):
            return group
        if group not in cls._INSTANCES:
            cls._INSTANCES[group] = ArtifactGroup(group)
        return cls._INSTANCES[group]


class Artifact:
    """Some form of artifact materialized to disk."""

    def __init__(
        self,
        group: Union[ArtifactGroup, str],
        name: str,
        depends: Collection["Artifact"] = (),
    ):
        self.group = ArtifactGroup.get(group)
        self.name = name
        self.depends = tuple(depends)

    @property
    def path(self) -> Path:
        return self.group.directory / self.name

    def join(self):
        """Waits for the artifact to become available."""
        pass

    def __str__(self):
        return str(self.path)


class ProducedArtifact(Artifact):
    """An artifact generated on demand by a callback."""

    def __init__(
        self,
        group: Union[ArtifactGroup, str],
        name: str,
        callback: Callable[["ProducedArtifact"], Any],
        *,
        depends: Collection["Artifact"] = (),
    ):
        # Fix: the base __init__ already resolves/assigns group and name; the
        # original redundantly pre-assigned self.group and re-assigned
        # self.name around the super() call.
        super().__init__(group, name, depends)
        self.callback = callback

    def start(self) -> "ProducedArtifact":
        """Run the producer callback and verify it actually created the file."""
        self.callback(self)
        if not self.path.exists():
            raise RuntimeError(
                f"Artifact {self} succeeded generation but was not produced"
            )
        return self


class FetchedArtifact(ProducedArtifact):
    """Represents an artifact that is to be fetched."""

    def __init__(self, group: Union[ArtifactGroup, str], url: str):
        # The on-disk name is the last path component of the URL.
        name = Path(urllib.parse.urlparse(url).path).name
        super().__init__(group, name, FetchedArtifact._callback)
        self.url = url

    def human_readable_size(self, size, decimal_places=2):
        """Format a byte count as e.g. '3.00 GiB' (caps at PiB)."""
        for unit in ["B", "KiB", "MiB", "GiB", "TiB", "PiB"]:
            if size < 1024.0 or unit == "PiB":
                break
            size /= 1024.0
        return f"{size:.{decimal_places}f} {unit}"

    def get_azure_md5(self, remote_file: str, azure_blob_properties):
        """Gets the content_md5 hash for a blob on Azure, if available.

        `azure_blob_properties` is dict-like (azure BlobProperties); returns
        the raw digest bytes or None when the property is absent.
        """
        content_settings = azure_blob_properties.get("content_settings")
        if not content_settings:
            return None
        azure_md5 = content_settings.get("content_md5")
        if not azure_md5:
            logger.warning(
                f"  Remote file '{remote_file}' on Azure is missing the "
                "'content_md5' property, can't check if local matches remote"
            )
        return azure_md5

    def get_local_md5(self, local_file_path: Path):
        """Gets the content_md5 hash for a local file, if it exists."""
        if not local_file_path.exists() or local_file_path.stat().st_size == 0:
            return None
        # Fix: open in binary mode (these are binary artifacts) and avoid
        # shadowing the file handle with the mmap view; the mmap gives a
        # zero-copy buffer for hashing large files.
        with open(local_file_path, "rb") as f, mmap.mmap(
            f.fileno(), 0, access=mmap.ACCESS_READ
        ) as mapped:
            return hashlib.md5(mapped).digest()

    def download_azure_artifact(self: "FetchedArtifact"):
        """Download self.url from Azure blob storage, skipping the transfer
        when the cached local copy's MD5 already matches the remote's.
        """
        # Lazy import so the module stays importable without the azure SDK;
        # it is only required when an Azure URL is actually fetched.
        from azure.storage.blob import BlobClient

        remote_file_name = self.url.rsplit("/", 1)[-1]

        # Extract path components from Azure URL to use with the Azure Storage
        # Blobs client library for Python
        # (https://pypi.org/project/azure-storage-blob/).
        #
        # For example:
        #   https://sharkpublic.blob.core.windows.net/sharkpublic/path/to/blob.txt
        #   account_url:    https://sharkpublic.blob.core.windows.net
        #   container_name: sharkpublic
        #   blob_name:      path/to/blob.txt
        match = re.search(r"(https.+\.net)/([^/]+)/(.+)", self.url)
        if match is None:
            # Fix: the original dereferenced the match unconditionally and
            # would raise an opaque AttributeError on a malformed URL.
            raise ValueError(f"Unrecognized Azure blob URL: '{self.url}'")
        account_url, container_name, blob_name = match.groups()

        with BlobClient(
            account_url,
            container_name,
            blob_name,
            max_chunk_get_size=1024 * 1024 * 32,  # 32 MiB
            max_single_get_size=1024 * 1024 * 32,  # 32 MiB
        ) as blob_client:
            blob_properties = blob_client.get_blob_properties()
            blob_size_str = self.human_readable_size(blob_properties.size)
            azure_md5 = self.get_azure_md5(self.url, blob_properties)
            local_md5 = self.get_local_md5(self.path)

            if azure_md5 and azure_md5 == local_md5:
                logger.info(
                    f"  Skipping '{remote_file_name}' download ({blob_size_str}) "
                    "- local MD5 hash matches"
                )
                return

            # Fix: the original duplicated the whole download body in both
            # branches; only the log message differs.
            suffix = "" if not local_md5 else " (local MD5 does not match)"
            logger.info(
                f"  Downloading '{remote_file_name}' ({blob_size_str}) "
                f"to '{self.path}'{suffix}"
            )
            with open(self.path, mode="wb") as local_blob:
                download_stream = blob_client.download_blob(max_concurrency=4)
                local_blob.write(download_stream.readall())

    @staticmethod
    def _callback(self: "FetchedArtifact"):
        # Dispatch on URL host; only Azure blob storage is supported today.
        if "blob.core.windows.net" in self.url:
            self.download_azure_artifact()
        else:
            raise NotImplementedError(
                f"Unsupported fetched artifact URL schema for '{self.url}'"
            )


class StreamArtifact(Artifact):
    """An artifact written incrementally, line by line (e.g. logs)."""

    def __init__(self, group: Union[ArtifactGroup, str], name: str):
        super().__init__(group, name)
        # Unbuffered append so partial progress survives a crash.
        self.io = open(self.path, "ab", buffering=0)

    def __del__(self):
        self.io.close()

    def write_line(self, line: Union[str, bytes]):
        contents = line if isinstance(line, bytes) else line.encode()
        self.io.write(contents + b"\n")
# --- File 1/2 added by this hunk: shark-test-suite-models/conftest.py ---
# Copyright 2024 The IREE Authors
#
# Licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception


class VmfbManager:
    # Shared slots where the compile tests stash compiled .vmfb artifacts so
    # the dependent run tests (via pytest-depends) can pick them up.
    sdxl_clip_cpu_vmfb = None
    sdxl_vae_cpu_vmfb = None
    sdxl_unet_cpu_vmfb = None
    sdxl_clip_rocm_vmfb = None
    sdxl_vae_rocm_vmfb = None
    sdxl_unet_rocm_vmfb = None
    sdxl_unet_cpu_pipeline_vmfb = None
    sdxl_unet_rocm_pipeline_vmfb = None
    sd3_clip_cpu_vmfb = None
    sd3_vae_cpu_vmfb = None
    sd3_mmdit_cpu_vmfb = None
    sd3_clip_rocm_vmfb = None
    sd3_vae_rocm_vmfb = None
    sd3_mmdit_rocm_vmfb = None


# --- File 2/2 added by this hunk: shark-test-suite-models/sd3/test_clip.py ---
# Copyright 2024 The IREE Authors
#
# Licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

"""Compile/run regression tests for the SD3 CLIP prompt encoder."""

import pytest
from ireers_tools import *
import os
from conftest import VmfbManager

rocm_chip = os.getenv("ROCM_CHIP", default="gfx90a")

###############################################################################
# Fixtures
###############################################################################

sd3_clip_inference_input_0 = fetch_source_fixture(
    "https://sharkpublic.blob.core.windows.net/sharkpublic/sai/sd3-prompt-encoder/inference_input.0.bin",
    group="sd3_clip",
)

sd3_clip_inference_input_1 = fetch_source_fixture(
    "https://sharkpublic.blob.core.windows.net/sharkpublic/sai/sd3-prompt-encoder/inference_input.1.bin",
    group="sd3_clip",
)

sd3_clip_inference_input_2 = fetch_source_fixture(
    "https://sharkpublic.blob.core.windows.net/sharkpublic/sai/sd3-prompt-encoder/inference_input.2.bin",
    group="sd3_clip",
)

sd3_clip_inference_input_3 = fetch_source_fixture(
    "https://sharkpublic.blob.core.windows.net/sharkpublic/sai/sd3-prompt-encoder/inference_input.3.bin",
    group="sd3_clip",
)

sd3_clip_inference_input_4 = fetch_source_fixture(
    "https://sharkpublic.blob.core.windows.net/sharkpublic/sai/sd3-prompt-encoder/inference_input.4.bin",
    group="sd3_clip",
)

sd3_clip_inference_input_5 = fetch_source_fixture(
    "https://sharkpublic.blob.core.windows.net/sharkpublic/sai/sd3-prompt-encoder/inference_input.5.bin",
    group="sd3_clip",
)

sd3_clip_inference_output_0 = fetch_source_fixture(
    "https://sharkpublic.blob.core.windows.net/sharkpublic/sai/sd3-prompt-encoder/inference_output.0.bin",
    group="sd3_clip",
)

sd3_clip_inference_output_1 = fetch_source_fixture(
    "https://sharkpublic.blob.core.windows.net/sharkpublic/sai/sd3-prompt-encoder/inference_output.1.bin",
    group="sd3_clip",
)

sd3_clip_real_weights = fetch_source_fixture(
    "https://sharkpublic.blob.core.windows.net/sharkpublic/sai/sd3-prompt-encoder/real_weights.irpa",
    group="sd3_clip",
)

sd3_clip_mlir = fetch_source_fixture(
    "https://sharkpublic.blob.core.windows.net/sharkpublic/sai/sd3-prompt-encoder/model.mlirbc",
    group="sd3_clip",
)

CPU_COMPILE_FLAGS = [
    "--iree-hal-target-backends=llvm-cpu",
    "--iree-llvmcpu-target-cpu-features=host",
    "--iree-llvmcpu-fail-on-out-of-bounds-stack-allocation=false",
    "--iree-llvmcpu-distribution-size=32",
    "--iree-opt-const-eval=false",
    "--iree-llvmcpu-enable-ukernels=all",
    "--iree-global-opt-enable-quantized-matmul-reassociation",
]


@pytest.fixture
def SD3_CLIP_COMMON_RUN_FLAGS(
    sd3_clip_inference_input_0,
    sd3_clip_inference_input_1,
    sd3_clip_inference_input_2,
    sd3_clip_inference_input_3,
    sd3_clip_inference_input_4,
    sd3_clip_inference_input_5,
    sd3_clip_inference_output_0,
    sd3_clip_inference_output_1,
):
    return [
        f"--input=1x77x2xi64=@{sd3_clip_inference_input_0.path}",
        f"--input=1x77x2xi64=@{sd3_clip_inference_input_1.path}",
        f"--input=1x77x2xi64=@{sd3_clip_inference_input_2.path}",
        f"--input=1x77x2xi64=@{sd3_clip_inference_input_3.path}",
        f"--input=1x77x2xi64=@{sd3_clip_inference_input_4.path}",
        f"--input=1x77x2xi64=@{sd3_clip_inference_input_5.path}",
        f"--expected_output=2x154x4096xf32=@{sd3_clip_inference_output_0.path}",
        f"--expected_output=2x2048xf32=@{sd3_clip_inference_output_1.path}",
    ]


ROCM_COMPILE_FLAGS = [
    "--iree-hal-target-backends=rocm",
    f"--iree-rocm-target-chip={rocm_chip}",
    "--iree-input-type=torch",
    "--iree-opt-const-eval=false",
    "--iree-global-opt-propagate-transposes=true",
    "--iree-opt-outer-dim-concat=true",
    "--iree-rocm-waves-per-eu=2",
    "--iree-llvmgpu-enable-prefetch",
    "--iree-flow-enable-aggressive-fusion",
    "--iree-global-opt-enable-fuse-horizontal-contractions=true",
    "--iree-opt-aggressively-propagate-transposes=true",
    "--iree-codegen-llvmgpu-use-vector-distribution=true",
    "--iree-execution-model=async-external",
    "--iree-preprocessing-pass-pipeline=builtin.module(iree-preprocessing-transpose-convolution-pipeline, util.func(iree-preprocessing-pad-to-intrinsics{pad-target-type=conv}))",
]

###############################################################################
# CPU
###############################################################################


def test_compile_clip_cpu(sd3_clip_mlir):
    VmfbManager.sd3_clip_cpu_vmfb = iree_compile(
        sd3_clip_mlir, "cpu", CPU_COMPILE_FLAGS
    )


@pytest.mark.depends(on=["test_compile_clip_cpu"])
def test_run_clip_cpu(SD3_CLIP_COMMON_RUN_FLAGS, sd3_clip_real_weights):
    iree_run_module(
        VmfbManager.sd3_clip_cpu_vmfb,
        device="local-task",
        function="encode_tokens",
        args=[
            f"--parameters=model={sd3_clip_real_weights.path}",
            "--expected_f32_threshold=0.15f",
        ]
        + SD3_CLIP_COMMON_RUN_FLAGS,
    )


###############################################################################
# ROCM
###############################################################################


@pytest.mark.xfail(
    strict=True,
    reason="Expected compilation to fail",
)
def test_compile_clip_rocm(sd3_clip_mlir):
    VmfbManager.sd3_clip_rocm_vmfb = iree_compile(
        sd3_clip_mlir, f"rocm_{rocm_chip}", ROCM_COMPILE_FLAGS
    )


@pytest.mark.depends(on=["test_compile_clip_rocm"])
def test_run_clip_rocm(SD3_CLIP_COMMON_RUN_FLAGS, sd3_clip_real_weights):
    # Fix: don't `return` the result — pytest tests returning non-None emit
    # PytestReturnNotNoneWarning (an error in future pytest); this also makes
    # the ROCm test consistent with test_run_clip_cpu above.
    iree_run_module(
        VmfbManager.sd3_clip_rocm_vmfb,
        device="hip",
        function="encode_tokens",
        args=[f"--parameters=model={sd3_clip_real_weights.path}"]
        + SD3_CLIP_COMMON_RUN_FLAGS,
    )
# Copyright 2024 The IREE Authors
#
# Licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

"""Compile/run regression tests for the SD3 MMDiT model."""

import pytest
from ireers_tools import *
import os
from pathlib import Path
from conftest import VmfbManager

# Fix: normalize to Path — the original was a str when the env var was set
# but a Path otherwise. Only ever interpolated into flag strings, so the
# observable flag text is unchanged.
iree_test_path_extension = Path(
    os.getenv("IREE_TEST_PATH_EXTENSION", default=str(Path.cwd()))
)
rocm_chip = os.getenv("ROCM_CHIP", default="gfx90a")

###############################################################################
# Fixtures
###############################################################################

sd3_mmdit_inference_input_0 = fetch_source_fixture(
    "https://sharkpublic.blob.core.windows.net/sharkpublic/sai/sd3-mmdit/inference_input.0.bin",
    group="sd3_mmdit",
)

sd3_mmdit_inference_input_1 = fetch_source_fixture(
    "https://sharkpublic.blob.core.windows.net/sharkpublic/sai/sd3-mmdit/inference_input.1.bin",
    group="sd3_mmdit",
)

sd3_mmdit_inference_input_2 = fetch_source_fixture(
    "https://sharkpublic.blob.core.windows.net/sharkpublic/sai/sd3-mmdit/inference_input.2.bin",
    group="sd3_mmdit",
)

sd3_mmdit_inference_input_3 = fetch_source_fixture(
    "https://sharkpublic.blob.core.windows.net/sharkpublic/sai/sd3-mmdit/inference_input.3.bin",
    group="sd3_mmdit",
)

sd3_mmdit_inference_output_0 = fetch_source_fixture(
    "https://sharkpublic.blob.core.windows.net/sharkpublic/sai/sd3-mmdit/inference_output.0.bin",
    group="sd3_mmdit",
)

sd3_mmdit_real_weights = fetch_source_fixture(
    "https://sharkpublic.blob.core.windows.net/sharkpublic/sai/sd3-mmdit/real_weights.irpa",
    group="sd3_mmdit",
)

sd3_mmdit_mlir = fetch_source_fixture(
    "https://sharkpublic.blob.core.windows.net/sharkpublic/sai/sd3-mmdit/model.mlirbc",
    group="sd3_mmdit",
)

CPU_COMPILE_FLAGS = [
    "--iree-hal-target-backends=llvm-cpu",
    "--iree-llvmcpu-target-cpu-features=host",
    "--iree-llvmcpu-fail-on-out-of-bounds-stack-allocation=false",
    "--iree-llvmcpu-distribution-size=32",
    "--iree-opt-const-eval=false",
    "--iree-llvmcpu-enable-ukernels=all",
    "--iree-global-opt-enable-quantized-matmul-reassociation",
]


@pytest.fixture
def SD3_MMDIT_COMMON_RUN_FLAGS(
    sd3_mmdit_inference_input_0,
    sd3_mmdit_inference_input_1,
    sd3_mmdit_inference_input_2,
    sd3_mmdit_inference_input_3,
    sd3_mmdit_inference_output_0,
):
    return [
        f"--input=2x16x128x128xf16=@{sd3_mmdit_inference_input_0.path}",
        f"--input=2x154x4096xf16=@{sd3_mmdit_inference_input_1.path}",
        f"--input=2x2048xf16=@{sd3_mmdit_inference_input_2.path}",
        f"--input=2xf16=@{sd3_mmdit_inference_input_3.path}",
        f"--expected_output=2x16x128x128xf32=@{sd3_mmdit_inference_output_0.path}",
    ]


ROCM_COMPILE_FLAGS = [
    "--iree-hal-target-backends=rocm",
    f"--iree-rocm-target-chip={rocm_chip}",
    "--iree-opt-const-eval=false",
    f"--iree-codegen-transform-dialect-library={iree_test_path_extension}/attention_and_matmul_spec.mlir",
    "--iree-global-opt-propagate-transposes=true",
    "--iree-global-opt-enable-fuse-horizontal-contractions=true",
    "--iree-flow-enable-aggressive-fusion=true",
    "--iree-opt-aggressively-propagate-transposes=true",
    "--iree-opt-outer-dim-concat=true",
    "--iree-vm-target-truncate-unsupported-floats",
    "--iree-llvmgpu-enable-prefetch=true",
    "--iree-opt-data-tiling=false",
    "--iree-codegen-gpu-native-math-precision=true",
    "--iree-codegen-llvmgpu-use-vector-distribution",
    "--iree-rocm-waves-per-eu=2",
    "--iree-execution-model=async-external",
    "--iree-preprocessing-pass-pipeline=builtin.module(iree-preprocessing-transpose-convolution-pipeline, util.func(iree-preprocessing-pad-to-intrinsics))",
]

###############################################################################
# CPU
###############################################################################


def test_compile_mmdit_cpu(sd3_mmdit_mlir):
    VmfbManager.sd3_mmdit_cpu_vmfb = iree_compile(
        sd3_mmdit_mlir, "cpu", CPU_COMPILE_FLAGS
    )


@pytest.mark.xfail(
    strict=True,
    reason="Expected run to fail",
)
@pytest.mark.depends(on=["test_compile_mmdit_cpu"])
def test_run_mmdit_cpu(SD3_MMDIT_COMMON_RUN_FLAGS, sd3_mmdit_real_weights):
    # Fix: pytest tests must not return non-None values
    # (PytestReturnNotNoneWarning); just call.
    iree_run_module(
        VmfbManager.sd3_mmdit_cpu_vmfb,
        device="local-task",
        function="run_forward",
        args=[f"--parameters=model={sd3_mmdit_real_weights.path}"]
        + SD3_MMDIT_COMMON_RUN_FLAGS,
    )


###############################################################################
# ROCM
###############################################################################


@pytest.mark.xfail(
    strict=True,
    reason="Expected compilation to fail",
)
def test_compile_mmdit_rocm(sd3_mmdit_mlir):
    VmfbManager.sd3_mmdit_rocm_vmfb = iree_compile(
        sd3_mmdit_mlir, f"rocm_{rocm_chip}", ROCM_COMPILE_FLAGS
    )


@pytest.mark.depends(on=["test_compile_mmdit_rocm"])
def test_run_mmdit_rocm(SD3_MMDIT_COMMON_RUN_FLAGS, sd3_mmdit_real_weights):
    iree_run_module(
        VmfbManager.sd3_mmdit_rocm_vmfb,
        device="hip",
        function="run_forward",
        args=[f"--parameters=model={sd3_mmdit_real_weights.path}"]
        + SD3_MMDIT_COMMON_RUN_FLAGS,
    )
# Copyright 2024 The IREE Authors
#
# Licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

"""Compile/run regression tests for the SD3 VAE decoder."""

import pytest
from ireers_tools import *
import os
from conftest import VmfbManager

rocm_chip = os.getenv("ROCM_CHIP", default="gfx90a")

###############################################################################
# Fixtures
###############################################################################

sd3_vae_inference_input_0 = fetch_source_fixture(
    "https://sharkpublic.blob.core.windows.net/sharkpublic/sai/sd3-vae/inference_input.0.bin",
    group="sd3_vae",
)

sd3_vae_inference_output_0 = fetch_source_fixture(
    "https://sharkpublic.blob.core.windows.net/sharkpublic/sai/sd3-vae/inference_output.0.bin",
    group="sd3_vae",
)

sd3_vae_real_weights = fetch_source_fixture(
    "https://sharkpublic.blob.core.windows.net/sharkpublic/sai/sd3-vae/real_weights.irpa",
    group="sd3_vae",
)

sd3_vae_mlir = fetch_source_fixture(
    "https://sharkpublic.blob.core.windows.net/sharkpublic/sai/sd3-vae/model.mlirbc",
    group="sd3_vae",
)

CPU_COMPILE_FLAGS = [
    "--iree-hal-target-backends=llvm-cpu",
    "--iree-llvmcpu-target-cpu-features=host",
    "--iree-llvmcpu-fail-on-out-of-bounds-stack-allocation=false",
    "--iree-llvmcpu-distribution-size=32",
    "--iree-opt-const-eval=false",
    "--iree-llvmcpu-enable-ukernels=all",
    "--iree-global-opt-enable-quantized-matmul-reassociation",
]


@pytest.fixture
def SD3_VAE_COMMON_RUN_FLAGS(
    sd3_vae_inference_input_0,
    sd3_vae_inference_output_0,
):
    return [
        f"--input=1x16x128x128xf16=@{sd3_vae_inference_input_0.path}",
        f"--expected_output=3x1024x1024xf32=@{sd3_vae_inference_output_0.path}",
    ]


ROCM_COMPILE_FLAGS = [
    "--iree-hal-target-backends=rocm",
    f"--iree-rocm-target-chip={rocm_chip}",
    "--iree-opt-const-eval=false",
    "--iree-global-opt-propagate-transposes=true",
    "--iree-opt-outer-dim-concat=true",
    "--iree-llvmgpu-enable-prefetch=true",
    "--iree-rocm-waves-per-eu=2",
    "--iree-flow-enable-aggressive-fusion=true",
    "--iree-codegen-llvmgpu-use-vector-distribution=true",
    "--iree-execution-model=async-external",
    "--iree-preprocessing-pass-pipeline=builtin.module(iree-preprocessing-transpose-convolution-pipeline, util.func(iree-preprocessing-pad-to-intrinsics))",
]

###############################################################################
# CPU
###############################################################################


def test_compile_vae_cpu(sd3_vae_mlir):
    VmfbManager.sd3_vae_cpu_vmfb = iree_compile(sd3_vae_mlir, "cpu", CPU_COMPILE_FLAGS)


@pytest.mark.depends(on=["test_compile_vae_cpu"])
def test_run_vae_cpu(SD3_VAE_COMMON_RUN_FLAGS, sd3_vae_real_weights):
    # Fix: pytest tests must not return non-None values
    # (PytestReturnNotNoneWarning); just call.
    iree_run_module(
        VmfbManager.sd3_vae_cpu_vmfb,
        device="local-task",
        function="decode",
        args=[
            f"--parameters=model={sd3_vae_real_weights.path}",
            "--expected_f32_threshold=0.01f",
        ]
        + SD3_VAE_COMMON_RUN_FLAGS,
    )


###############################################################################
# ROCM
###############################################################################


def test_compile_vae_rocm(sd3_vae_mlir):
    VmfbManager.sd3_vae_rocm_vmfb = iree_compile(
        sd3_vae_mlir, f"rocm_{rocm_chip}", ROCM_COMPILE_FLAGS
    )


@pytest.mark.depends(on=["test_compile_vae_rocm"])
def test_run_vae_rocm(SD3_VAE_COMMON_RUN_FLAGS, sd3_vae_real_weights):
    iree_run_module(
        VmfbManager.sd3_vae_rocm_vmfb,
        device="hip",
        function="decode",
        args=[
            f"--parameters=model={sd3_vae_real_weights.path}",
            "--expected_f32_threshold=0.7f",
        ]
        + SD3_VAE_COMMON_RUN_FLAGS,
    )
# Copyright 2024 The IREE Authors
#
# Licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

"""Compile/run regression tests for the SDXL CLIP prompt encoder."""

import pytest
from ireers_tools import *
import os
from conftest import VmfbManager

rocm_chip = os.getenv("ROCM_CHIP", default="gfx90a")

###############################################################################
# Fixtures
###############################################################################

sdxl_clip_inference_input_0 = fetch_source_fixture(
    "https://sharkpublic.blob.core.windows.net/sharkpublic/sai/sdxl-prompt-encoder/inference_input.0.bin",
    group="sdxl_clip",
)

sdxl_clip_inference_input_1 = fetch_source_fixture(
    "https://sharkpublic.blob.core.windows.net/sharkpublic/sai/sdxl-prompt-encoder/inference_input.1.bin",
    group="sdxl_clip",
)

sdxl_clip_inference_input_2 = fetch_source_fixture(
    "https://sharkpublic.blob.core.windows.net/sharkpublic/sai/sdxl-prompt-encoder/inference_input.2.bin",
    group="sdxl_clip",
)

sdxl_clip_inference_input_3 = fetch_source_fixture(
    "https://sharkpublic.blob.core.windows.net/sharkpublic/sai/sdxl-prompt-encoder/inference_input.3.bin",
    group="sdxl_clip",
)

sdxl_clip_inference_output_0 = fetch_source_fixture(
    "https://sharkpublic.blob.core.windows.net/sharkpublic/sai/sdxl-prompt-encoder/inference_output.0.bin",
    group="sdxl_clip",
)

sdxl_clip_inference_output_1 = fetch_source_fixture(
    "https://sharkpublic.blob.core.windows.net/sharkpublic/sai/sdxl-prompt-encoder/inference_output.1.bin",
    group="sdxl_clip",
)

sdxl_clip_real_weights = fetch_source_fixture(
    "https://sharkpublic.blob.core.windows.net/sharkpublic/sai/sdxl-prompt-encoder/real_weights.irpa",
    group="sdxl_clip",
)

sdxl_clip_mlir = fetch_source_fixture(
    "https://sharkpublic.blob.core.windows.net/sharkpublic/sai/sdxl-prompt-encoder/model.mlirbc",
    group="sdxl_clip",
)

CPU_COMPILE_FLAGS = [
    "--iree-hal-target-backends=llvm-cpu",
    "--iree-llvmcpu-target-cpu-features=host",
    "--iree-llvmcpu-fail-on-out-of-bounds-stack-allocation=false",
    "--iree-llvmcpu-distribution-size=32",
    "--iree-opt-const-eval=false",
    "--iree-llvmcpu-enable-ukernels=all",
    "--iree-global-opt-enable-quantized-matmul-reassociation",
]


@pytest.fixture
def SDXL_CLIP_COMMON_RUN_FLAGS(
    sdxl_clip_inference_input_0,
    sdxl_clip_inference_input_1,
    sdxl_clip_inference_input_2,
    sdxl_clip_inference_input_3,
    sdxl_clip_inference_output_0,
    sdxl_clip_inference_output_1,
):
    return [
        f"--input=1x64xi64=@{sdxl_clip_inference_input_0.path}",
        f"--input=1x64xi64=@{sdxl_clip_inference_input_1.path}",
        f"--input=1x64xi64=@{sdxl_clip_inference_input_2.path}",
        f"--input=1x64xi64=@{sdxl_clip_inference_input_3.path}",
        f"--expected_output=2x64x2048xf16=@{sdxl_clip_inference_output_0.path}",
        f"--expected_output=2x1280xf16=@{sdxl_clip_inference_output_1.path}",
    ]


ROCM_COMPILE_FLAGS = [
    "--iree-hal-target-backends=rocm",
    f"--iree-rocm-target-chip={rocm_chip}",
    "--iree-input-type=torch",
    "--iree-opt-const-eval=false",
    "--iree-global-opt-propagate-transposes=true",
    "--iree-opt-outer-dim-concat=true",
    "--iree-rocm-waves-per-eu=2",
    "--iree-llvmgpu-enable-prefetch",
    "--iree-flow-enable-aggressive-fusion",
    "--iree-global-opt-enable-fuse-horizontal-contractions=true",
    "--iree-opt-aggressively-propagate-transposes=true",
    "--iree-codegen-llvmgpu-use-vector-distribution=true",
    "--iree-execution-model=async-external",
    "--iree-preprocessing-pass-pipeline=builtin.module(iree-preprocessing-transpose-convolution-pipeline, util.func(iree-preprocessing-pad-to-intrinsics{pad-target-type=conv}))",
    "--iree-scheduling-dump-statistics-format=json",
    "--iree-scheduling-dump-statistics-file=compilation_info.json",
]

###############################################################################
# CPU
###############################################################################


def test_compile_clip_cpu(sdxl_clip_mlir):
    VmfbManager.sdxl_clip_cpu_vmfb = iree_compile(
        sdxl_clip_mlir, "cpu", CPU_COMPILE_FLAGS
    )


@pytest.mark.depends(on=["test_compile_clip_cpu"])
def test_run_clip_cpu(SDXL_CLIP_COMMON_RUN_FLAGS, sdxl_clip_real_weights):
    iree_run_module(
        VmfbManager.sdxl_clip_cpu_vmfb,
        device="local-task",
        function="encode_prompts",
        args=[
            f"--parameters=model={sdxl_clip_real_weights.path}",
            "--expected_f16_threshold=1.0f",
        ]
        + SDXL_CLIP_COMMON_RUN_FLAGS,
    )


###############################################################################
# ROCM
###############################################################################


def test_compile_clip_rocm(sdxl_clip_mlir):
    VmfbManager.sdxl_clip_rocm_vmfb = iree_compile(
        sdxl_clip_mlir, f"rocm_{rocm_chip}", ROCM_COMPILE_FLAGS
    )


@pytest.mark.depends(on=["test_compile_clip_rocm"])
def test_run_clip_rocm(SDXL_CLIP_COMMON_RUN_FLAGS, sdxl_clip_real_weights):
    # Fix: don't `return` the result — pytest tests returning non-None emit
    # PytestReturnNotNoneWarning; also matches test_run_clip_cpu above.
    iree_run_module(
        VmfbManager.sdxl_clip_rocm_vmfb,
        device="hip",
        function="encode_prompts",
        args=[
            f"--parameters=model={sdxl_clip_real_weights.path}",
            "--expected_f16_threshold=1.0f",
        ]
        + SDXL_CLIP_COMMON_RUN_FLAGS,
    )
# Copyright 2024 The IREE Authors
#
# Licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

"""Compile/run regression tests for the SDXL scheduled UNet (+ pipeline)."""

import pytest
from ireers_tools import *
import os
from conftest import VmfbManager
from pathlib import Path

# Fix: dropped the stray, unused `import setuptools` from the original.
# Fix: normalize to Path — the original was a str when the env var was set
# but a Path otherwise; it is only interpolated into flag strings, so the
# observable flag text is unchanged.
iree_test_path_extension = Path(
    os.getenv("IREE_TEST_PATH_EXTENSION", default=str(Path.cwd()))
)
rocm_chip = os.getenv("ROCM_CHIP", default="gfx90a")

###############################################################################
# Fixtures
###############################################################################

sdxl_unet_inference_input_0 = fetch_source_fixture(
    "https://sharkpublic.blob.core.windows.net/sharkpublic/sai/sdxl-scheduled-unet/inference_input.0.bin",
    group="sdxl_unet",
)

sdxl_unet_inference_input_1 = fetch_source_fixture(
    "https://sharkpublic.blob.core.windows.net/sharkpublic/sai/sdxl-scheduled-unet/inference_input.1.bin",
    group="sdxl_unet",
)

sdxl_unet_inference_input_2 = fetch_source_fixture(
    "https://sharkpublic.blob.core.windows.net/sharkpublic/sai/sdxl-scheduled-unet/inference_input.2.bin",
    group="sdxl_unet",
)

sdxl_unet_inference_input_3 = fetch_source_fixture(
    "https://sharkpublic.blob.core.windows.net/sharkpublic/sai/sdxl-scheduled-unet/inference_input.3.bin",
    group="sdxl_unet",
)

sdxl_unet_inference_output_0 = fetch_source_fixture(
    "https://sharkpublic.blob.core.windows.net/sharkpublic/sai/sdxl-scheduled-unet/inference_output.0.bin",
    group="sdxl_unet",
)

sdxl_unet_real_weights = fetch_source_fixture(
    "https://sharkpublic.blob.core.windows.net/sharkpublic/sai/sdxl-scheduled-unet/real_weights.irpa",
    group="sdxl_unet",
)

sdxl_unet_mlir = fetch_source_fixture(
    "https://sharkpublic.blob.core.windows.net/sharkpublic/sai/sdxl-scheduled-unet/model.mlirbc",
    group="sdxl_unet",
)

sdxl_unet_pipeline_mlir = fetch_source_fixture(
    "https://sharkpublic.blob.core.windows.net/sharkpublic/sai/sdxl-scheduled-unet/sdxl_unet_pipeline_bench_f16.mlir",
    group="sdxl_unet",
)

CPU_COMPILE_FLAGS = [
    "--iree-hal-target-backends=llvm-cpu",
    "--iree-llvmcpu-target-cpu-features=host",
    "--iree-llvmcpu-fail-on-out-of-bounds-stack-allocation=false",
    "--iree-llvmcpu-distribution-size=32",
    "--iree-opt-const-eval=false",
    "--iree-llvmcpu-enable-ukernels=all",
    "--iree-global-opt-enable-quantized-matmul-reassociation",
]


@pytest.fixture
def SDXL_UNET_COMMON_RUN_FLAGS(
    sdxl_unet_inference_input_0,
    sdxl_unet_inference_input_1,
    sdxl_unet_inference_input_2,
    sdxl_unet_inference_input_3,
    sdxl_unet_inference_output_0,
):
    return [
        f"--input=1x4x128x128xf16=@{sdxl_unet_inference_input_0.path}",
        f"--input=2x64x2048xf16=@{sdxl_unet_inference_input_1.path}",
        f"--input=2x1280xf16=@{sdxl_unet_inference_input_2.path}",
        f"--input=1xf16=@{sdxl_unet_inference_input_3.path}",
        f"--expected_output=1x4x128x128xf16=@{sdxl_unet_inference_output_0.path}",
    ]


ROCM_COMPILE_FLAGS = [
    "--iree-hal-target-backends=rocm",
    f"--iree-rocm-target-chip={rocm_chip}",
    "--iree-opt-const-eval=false",
    f"--iree-codegen-transform-dialect-library={iree_test_path_extension}/attention_and_matmul_spec.mlir",
    "--iree-global-opt-propagate-transposes=true",
    "--iree-global-opt-enable-fuse-horizontal-contractions=true",
    "--iree-flow-enable-aggressive-fusion=true",
    "--iree-opt-aggressively-propagate-transposes=true",
    "--iree-opt-outer-dim-concat=true",
    "--iree-vm-target-truncate-unsupported-floats",
    "--iree-llvmgpu-enable-prefetch=true",
    "--iree-opt-data-tiling=false",
    "--iree-codegen-gpu-native-math-precision=true",
    "--iree-codegen-llvmgpu-use-vector-distribution",
    "--iree-rocm-waves-per-eu=2",
    "--iree-execution-model=async-external",
    "--iree-preprocessing-pass-pipeline=builtin.module(iree-preprocessing-transpose-convolution-pipeline, util.func(iree-preprocessing-pad-to-intrinsics))",
    "--iree-scheduling-dump-statistics-format=json",
    "--iree-scheduling-dump-statistics-file=compilation_info.json",
]

ROCM_PIPELINE_COMPILE_FLAGS = [
    "--iree-hal-target-backends=rocm",
    f"--iree-rocm-target-chip={rocm_chip}",
    "--verify=false",
    "--iree-opt-const-eval=false",
]

###############################################################################
# CPU
###############################################################################


def test_compile_unet_pipeline_cpu(sdxl_unet_pipeline_mlir):
    VmfbManager.sdxl_unet_cpu_pipeline_vmfb = iree_compile(
        sdxl_unet_pipeline_mlir,
        "cpu",
        CPU_COMPILE_FLAGS,
    )


def test_compile_unet_cpu(sdxl_unet_mlir):
    VmfbManager.sdxl_unet_cpu_vmfb = iree_compile(
        sdxl_unet_mlir, "cpu", CPU_COMPILE_FLAGS
    )


@pytest.mark.depends(on=["test_compile_unet_pipeline_cpu", "test_compile_unet_cpu"])
def test_run_unet_cpu(SDXL_UNET_COMMON_RUN_FLAGS, sdxl_unet_real_weights):
    # Fix: pytest tests must not return non-None values
    # (PytestReturnNotNoneWarning); just call.
    iree_run_module(
        VmfbManager.sdxl_unet_cpu_vmfb,
        device="local-task",
        function="produce_image_latents",
        args=[
            f"--parameters=model={sdxl_unet_real_weights.path}",
            f"--module={VmfbManager.sdxl_unet_cpu_pipeline_vmfb.path}",
            "--expected_f16_threshold=0.8f",
        ]
        + SDXL_UNET_COMMON_RUN_FLAGS,
    )


###############################################################################
# ROCM
###############################################################################


def test_compile_unet_pipeline_rocm(sdxl_unet_pipeline_mlir):
    VmfbManager.sdxl_unet_rocm_pipeline_vmfb = iree_compile(
        sdxl_unet_pipeline_mlir,
        f"rocm_{rocm_chip}",
        ROCM_PIPELINE_COMPILE_FLAGS,
    )


def test_compile_unet_rocm(sdxl_unet_mlir):
    VmfbManager.sdxl_unet_rocm_vmfb = iree_compile(
        sdxl_unet_mlir, f"rocm_{rocm_chip}", ROCM_COMPILE_FLAGS
    )


@pytest.mark.depends(on=["test_compile_unet_pipeline_rocm", "test_compile_unet_rocm"])
def test_run_unet_rocm(SDXL_UNET_COMMON_RUN_FLAGS, sdxl_unet_real_weights):
    iree_run_module(
        VmfbManager.sdxl_unet_rocm_vmfb,
        device="hip",
        function="produce_image_latents",
        args=[
            f"--parameters=model={sdxl_unet_real_weights.path}",
            f"--module={VmfbManager.sdxl_unet_rocm_pipeline_vmfb.path}",
            "--expected_f16_threshold=0.7f",
        ]
        + SDXL_UNET_COMMON_RUN_FLAGS,
    )
"--iree-llvmcpu-enable-ukernels=all", + "--iree-global-opt-enable-quantized-matmul-reassociation", +] + + +@pytest.fixture +def SDXL_VAE_COMMON_RUN_FLAGS( + sdxl_vae_inference_input_0, + sdxl_vae_inference_output_0, +): + return [ + f"--input=1x4x128x128xf16=@{sdxl_vae_inference_input_0.path}", + f"--expected_output=1x3x1024x1024xf16=@{sdxl_vae_inference_output_0.path}", + ] + + +ROCM_COMPILE_FLAGS = [ + "--iree-hal-target-backends=rocm", + f"--iree-rocm-target-chip={rocm_chip}", + "--iree-opt-const-eval=false", + "--iree-global-opt-propagate-transposes=true", + "--iree-opt-outer-dim-concat=true", + "--iree-llvmgpu-enable-prefetch=true", + "--iree-rocm-waves-per-eu=2", + "--iree-flow-enable-aggressive-fusion=true", + "--iree-codegen-llvmgpu-use-vector-distribution=true", + "--iree-execution-model=async-external", + "--iree-preprocessing-pass-pipeline=builtin.module(iree-preprocessing-transpose-convolution-pipeline, util.func(iree-preprocessing-pad-to-intrinsics))", + "--iree-scheduling-dump-statistics-format=json", + "--iree-scheduling-dump-statistics-file=compilation_info.json", +] + +############################################################################### +# CPU +############################################################################### + + +def test_compile_vae_cpu(sdxl_vae_mlir): + VmfbManager.sdxl_vae_cpu_vmfb = iree_compile( + sdxl_vae_mlir, "cpu", CPU_COMPILE_FLAGS + ) + + +@pytest.mark.depends(on=["test_compile_vae_cpu"]) +def test_run_vae_cpu(SDXL_VAE_COMMON_RUN_FLAGS, sdxl_vae_real_weights): + return iree_run_module( + VmfbManager.sdxl_vae_cpu_vmfb, + device="local-task", + function="main", + args=[ + f"--parameters=model={sdxl_vae_real_weights.path}", + "--expected_f16_threshold=0.02f", + ] + + SDXL_VAE_COMMON_RUN_FLAGS, + ) + + +############################################################################### +# ROCM +############################################################################### + + +def test_compile_vae_rocm(sdxl_vae_mlir): 
+ VmfbManager.sdxl_vae_rocm_vmfb = iree_compile( + sdxl_vae_mlir, f"rocm_{rocm_chip}", ROCM_COMPILE_FLAGS + ) + + +@pytest.mark.depends(on=["test_compile_vae_rocm"]) +def test_run_vae_rocm(SDXL_VAE_COMMON_RUN_FLAGS, sdxl_vae_real_weights): + return iree_run_module( + VmfbManager.sdxl_vae_rocm_vmfb, + device="hip", + function="main", + args=[ + f"--parameters=model={sdxl_vae_real_weights.path}", + "--expected_f16_threshold=0.4f", + ] + + SDXL_VAE_COMMON_RUN_FLAGS, + ) diff --git a/experimental/regression_suite/tests/pregenerated/test_llama2.py b/experimental/regression_suite/tests/pregenerated/test_llama2.py index af9c8d2958c6..0db2abf9e6a4 100644 --- a/experimental/regression_suite/tests/pregenerated/test_llama2.py +++ b/experimental/regression_suite/tests/pregenerated/test_llama2.py @@ -5,7 +5,7 @@ # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception import pytest -from ireers import * +from ireers_tools import * ############################################################################### # Fixtures diff --git a/experimental/regression_suite/tests/pregenerated/test_ukernel.py b/experimental/regression_suite/tests/pregenerated/test_ukernel.py index 90e183a59e55..f9c92c66de22 100644 --- a/experimental/regression_suite/tests/pregenerated/test_ukernel.py +++ b/experimental/regression_suite/tests/pregenerated/test_ukernel.py @@ -5,7 +5,7 @@ # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception import pytest -from ireers import * +from ireers_tools import * ############################################################################### # Fixtures