From 44808e143533e694644da13b75a8f8358ee289cf Mon Sep 17 00:00:00 2001 From: saienduri <77521230+saienduri@users.noreply.github.com> Date: Fri, 12 Jul 2024 15:51:47 -0700 Subject: [PATCH] Add in-tree special_models test suite using reworked iree-tooling. (#17883) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit With this, we move away from using all the specialized json config files and complex workflows. Instead, we use python scripts which allow us to use custom flags, tolerances, and configurations based on the backend/model. Related PR in TestSuite: https://github.com/nod-ai/SHARK-TestSuite/pull/271 This PR also removes all dependencies on SHARK-TestSuite tooling. Reworked the tools here so that downloading, caching, testing, and benchmarking occurs as intended with tools solely from this repo for iree_special_models. Whenever we are adding test files here, the goal is for an IREE user to be able to clone the repo and run the tests knowing nothing about the SHARK-TestSuite. Also didn't realize, but ireers here already has a process of stamping to check if a file is already produced. I think we have to remove this because it will skip even if there is a newer version of the file available and there's really no point when downloading to a cache because once it's there, it is never removed so not a valuable signal. (Third time's the charm. 
Had to close the last two versions of this PR because couldn't get past a pre-commit check that led me to rebase and add a bunch of commits that weren't mine 🤦 ) ci-exactly: build_all, test_amd_mi300, build_packages, regression_test --------- Signed-off-by: saienduri --- .github/workflows/pkgci_regression_test.yml | 156 ++++--- ...dels_gpu_rocm_gfx90a_additional_flags.json | 25 -- ...dels_gpu_rocm_gfx942_additional_flags.json | 25 -- .../sdxl_prompt_encoder_cpu_llvm_task.json | 22 - .../sdxl_prompt_encoder_gpu_rocm_gfx90a.json | 36 -- .../sdxl_prompt_encoder_gpu_rocm_gfx942.json | 36 -- .../sdxl_scheduled_unet_cpu_llvm_task.json | 23 - .../sdxl_scheduled_unet_gpu_rocm_gfx90a.json | 39 -- .../sdxl_scheduled_unet_gpu_rocm_gfx942.json | 41 -- .../sdxl_vae_decode_cpu_llvm_task.json | 20 - .../sdxl_vae_decode_gpu_rocm_gfx90a.json | 29 -- .../sdxl_vae_decode_gpu_rocm_gfx942.json | 29 -- .../benchmarks/sdxl/benchmark_sdxl_rocm.py | 407 ++++++++++++++++++ experimental/benchmarks/sdxl/conftest.py | 144 +++++++ .../sdxl/sdxl_pipeline_bench_f16.mlir | 23 + .../regression_suite/ireers/artifacts.py | 154 ------- .../{ireers => ireers_tools}/__init__.py | 0 .../ireers_tools/artifacts.py | 234 ++++++++++ .../{ireers => ireers_tools}/fixtures.py | 0 experimental/regression_suite/setup.py | 9 +- .../shark-test-suite-models/conftest.py | 22 + .../shark-test-suite-models/sd3/test_clip.py | 167 +++++++ .../shark-test-suite-models/sd3/test_mmdit.py | 152 +++++++ .../shark-test-suite-models/sd3/test_vae.py | 119 +++++ .../shark-test-suite-models/sdxl/test_clip.py | 154 +++++++ .../shark-test-suite-models/sdxl/test_unet.py | 183 ++++++++ .../shark-test-suite-models/sdxl/test_vae.py | 123 ++++++ .../tests/pregenerated/test_llama2.py | 2 +- .../tests/pregenerated/test_ukernel.py | 2 +- 29 files changed, 1824 insertions(+), 552 deletions(-) delete mode 100644 build_tools/pkgci/external_test_suite/models_gpu_rocm_gfx90a_additional_flags.json delete mode 100644 
build_tools/pkgci/external_test_suite/models_gpu_rocm_gfx942_additional_flags.json delete mode 100644 build_tools/pkgci/external_test_suite/sdxl_prompt_encoder_cpu_llvm_task.json delete mode 100644 build_tools/pkgci/external_test_suite/sdxl_prompt_encoder_gpu_rocm_gfx90a.json delete mode 100644 build_tools/pkgci/external_test_suite/sdxl_prompt_encoder_gpu_rocm_gfx942.json delete mode 100644 build_tools/pkgci/external_test_suite/sdxl_scheduled_unet_cpu_llvm_task.json delete mode 100644 build_tools/pkgci/external_test_suite/sdxl_scheduled_unet_gpu_rocm_gfx90a.json delete mode 100644 build_tools/pkgci/external_test_suite/sdxl_scheduled_unet_gpu_rocm_gfx942.json delete mode 100644 build_tools/pkgci/external_test_suite/sdxl_vae_decode_cpu_llvm_task.json delete mode 100644 build_tools/pkgci/external_test_suite/sdxl_vae_decode_gpu_rocm_gfx90a.json delete mode 100644 build_tools/pkgci/external_test_suite/sdxl_vae_decode_gpu_rocm_gfx942.json create mode 100644 experimental/benchmarks/sdxl/benchmark_sdxl_rocm.py create mode 100644 experimental/benchmarks/sdxl/conftest.py create mode 100644 experimental/benchmarks/sdxl/sdxl_pipeline_bench_f16.mlir delete mode 100644 experimental/regression_suite/ireers/artifacts.py rename experimental/regression_suite/{ireers => ireers_tools}/__init__.py (100%) create mode 100644 experimental/regression_suite/ireers_tools/artifacts.py rename experimental/regression_suite/{ireers => ireers_tools}/fixtures.py (100%) create mode 100644 experimental/regression_suite/shark-test-suite-models/conftest.py create mode 100644 experimental/regression_suite/shark-test-suite-models/sd3/test_clip.py create mode 100644 experimental/regression_suite/shark-test-suite-models/sd3/test_mmdit.py create mode 100644 experimental/regression_suite/shark-test-suite-models/sd3/test_vae.py create mode 100644 experimental/regression_suite/shark-test-suite-models/sdxl/test_clip.py create mode 100644 experimental/regression_suite/shark-test-suite-models/sdxl/test_unet.py 
create mode 100644 experimental/regression_suite/shark-test-suite-models/sdxl/test_vae.py diff --git a/.github/workflows/pkgci_regression_test.yml b/.github/workflows/pkgci_regression_test.yml index 98a6e2b8c8c2..1a61c7677371 100644 --- a/.github/workflows/pkgci_regression_test.yml +++ b/.github/workflows/pkgci_regression_test.yml @@ -90,7 +90,7 @@ jobs: uses: actions/checkout@v4.1.7 with: repository: nod-ai/SHARK-TestSuite - ref: 3603a453b3777fac9af4506a3dc0b3d87587fd47 + ref: a06e730ce325c12db40bb89b43e8e6e897052e96 path: SHARK-TestSuite submodules: false lfs: false @@ -98,6 +98,7 @@ jobs: run: | source ${VENV_DIR}/bin/activate python -m pip install -r SHARK-TestSuite/iree_tests/requirements.txt + pip install --no-compile --pre --upgrade -e SHARK-TestSuite/common_tools - name: Run external tests - ONNX test suite run: | @@ -138,25 +139,14 @@ jobs: # CPU - name: cpu_llvm_task models-config-file: models_cpu_llvm_task.json - sdxl-unet-config-file: sdxl_scheduled_unet_cpu_llvm_task.json - sdxl-vae-config-file: sdxl_vae_decode_cpu_llvm_task.json - sdxl-clip-config-file: sdxl_prompt_encoder_cpu_llvm_task.json runs-on: nodai-amdgpu-w7900-x86-64 # AMD GPU - name: amdgpu_rocm_mi250_gfx90a models-config-file: models_gpu_rocm_gfx90a.json - models-extra-flags-config-file: models_gpu_rocm_gfx90a_additional_flags.json - sdxl-unet-config-file: sdxl_scheduled_unet_gpu_rocm_gfx90a.json - sdxl-vae-config-file: sdxl_vae_decode_gpu_rocm_gfx90a.json - sdxl-clip-config-file: sdxl_prompt_encoder_gpu_rocm_gfx90a.json runs-on: nodai-amdgpu-mi250-x86-64 - name: amdgpu_rocm_mi300_gfx942 models-config-file: models_gpu_rocm_gfx942.json - models-extra-flags-config-file: models_gpu_rocm_gfx942_additional_flags.json - sdxl-unet-config-file: sdxl_scheduled_unet_gpu_rocm_gfx942.json - sdxl-vae-config-file: sdxl_vae_decode_gpu_rocm_gfx942.json - sdxl-clip-config-file: sdxl_prompt_encoder_gpu_rocm_gfx942.json runs-on: nodai-amdgpu-mi300-x86-64 - name: amdgpu_vulkan models-config-file: 
models_gpu_vulkan.json @@ -176,10 +166,6 @@ jobs: IREE_TEST_FILES: ~/iree_tests_cache IREE_TEST_PATH_EXTENSION: ${{ github.workspace }}/build_tools/pkgci/external_test_suite MODELS_CONFIG_FILE_PATH: build_tools/pkgci/external_test_suite/${{ matrix.models-config-file }} - MODELS_EXTRA_FLAGS_CONFIG_FILE_PATH: build_tools/pkgci/external_test_suite/${{ matrix.models-extra-flags-config-file }} - SDXL_UNET_CONFIG_FILE_PATH: build_tools/pkgci/external_test_suite/${{ matrix.sdxl-unet-config-file }} - SDXL_CLIP_CONFIG_FILE_PATH: build_tools/pkgci/external_test_suite/${{ matrix.sdxl-clip-config-file }} - SDXL_VAE_CONFIG_FILE_PATH: build_tools/pkgci/external_test_suite/${{ matrix.sdxl-vae-config-file }} VENV_DIR: ${{ github.workspace }}/venv LD_LIBRARY_PATH: /home/esaimana/Python-3.11.9 steps: @@ -207,33 +193,25 @@ jobs: --artifact-path=${PACKAGE_DOWNLOAD_DIR} \ --fetch-gh-workflow=${{ inputs.artifact_run_id }} - # TODO(#17344): regenerate .mlirbc files, test plat_rdna3_rocm on rocm - # # In-tree tests - # - name: Run experimental/regression_suite tests - # run: | - # source ${VENV_DIR}/bin/activate - # pytest \ - # -rA -s -m "plat_host_cpu and presubmit" \ - # experimental/regression_suite - # Out of tree tests - name: Check out external TestSuite repository uses: actions/checkout@v4.1.7 with: repository: nod-ai/SHARK-TestSuite - ref: 3603a453b3777fac9af4506a3dc0b3d87587fd47 + ref: a06e730ce325c12db40bb89b43e8e6e897052e96 path: SHARK-TestSuite submodules: false lfs: true - name: Install external TestSuite Python requirements run: | source ${VENV_DIR}/bin/activate - python -m pip install -r SHARK-TestSuite/iree_tests/requirements.txt + python3 -m pip install -r SHARK-TestSuite/iree_tests/requirements.txt + pip install --no-compile --pre --upgrade -e SHARK-TestSuite/common_tools - name: Download remote files for real weight model tests run: | source ${VENV_DIR}/bin/activate - python SHARK-TestSuite/iree_tests/download_remote_files.py --root-dir pytorch/models - python 
SHARK-TestSuite/iree_tests/download_remote_files.py --root-dir sharktank + python SHARK-TestSuite/iree_tests/download_remote_files.py --root-dir iree_tests/pytorch/models + python SHARK-TestSuite/iree_tests/download_remote_files.py --root-dir iree_tests/sharktank - name: Run external tests - models with real weights if: "matrix.models-config-file != '' && !cancelled()" @@ -251,61 +229,99 @@ jobs: --durations=0 \ --config-files=${MODELS_CONFIG_FILE_PATH} - - name: Run external tests - models with real weights and additional flags - if: "matrix.models-extra-flags-config-file != '' && !cancelled()" - run: | - source ${VENV_DIR}/bin/activate - pytest SHARK-TestSuite/iree_tests/pytorch/models \ - -rpfE \ - -k real_weights \ - --no-skip-tests-missing-files \ - --capture=no \ - --log-cli-level=info \ - --timeout=1200 \ - --durations=0 \ - --config-files=${MODELS_EXTRA_FLAGS_CONFIG_FILE_PATH} + test_regression_suite: + name: "test_regression_suite :: ${{ matrix.name }}" + runs-on: ${{ matrix.runs-on }} + strategy: + fail-fast: false - - name: "Run external tests - SDXL scheduled unet" - if: "matrix.sdxl-unet-config-file != '' && !cancelled()" + # Note: these jobs should use persistent runners with local caches. + # Downloading test files (50GB+) without a cache can take 20+ minutes. 
+ matrix: + include: + # CPU + - name: cpu_llvm_task + models-config-file: models_cpu_llvm_task.json + backend: cpu + runs-on: nodai-amdgpu-w7900-x86-64 + + # AMD GPU + - name: amdgpu_rocm_mi250_gfx90a + rocm-chip: gfx90a + backend: rocm + runs-on: nodai-amdgpu-mi250-x86-64 + - name: amdgpu_rocm_mi300_gfx942 + rocm-chip: gfx942 + backend: rocm + runs-on: nodai-amdgpu-mi300-x86-64 + env: + PACKAGE_DOWNLOAD_DIR: ${{ github.workspace }}/.packages + IREERS_ARTIFACT_DIR: ${{ github.workspace }}/artifacts + IREE_TEST_FILES: ~/iree_tests_cache + IREE_TEST_PATH_EXTENSION: ${{ github.workspace }}/build_tools/pkgci/external_test_suite + VENV_DIR: ${{ github.workspace }}/venv + LD_LIBRARY_PATH: /home/esaimana/Python-3.11.9 + steps: + # TODO(saienduri): Find alternative to this temporary step that manipulates permission of github actions + # directory to be able to clean after every PR + - name: Pre Checkout MI300 Step + if: contains(matrix.name, 'gfx942') run: | - source ${VENV_DIR}/bin/activate - pytest SHARK-TestSuite/iree_tests/pytorch/models/sdxl-scheduled-unet-3-tank \ - -rpfE \ - -k real_weights \ - --no-skip-tests-missing-files \ - --capture=no \ - --log-cli-level=info \ - --timeout=1200 \ - --durations=0 \ - --config-files=${SDXL_UNET_CONFIG_FILE_PATH} + sudo chmod -R 777 ~/actions-runner/_work + - name: Checking out IREE repository + uses: actions/checkout@v4.1.7 + with: + submodules: false + - uses: actions/setup-python@v5.1.0 + with: + # Must match the subset of versions built in pkgci_build_packages. 
+ python-version: "3.11" + - uses: actions/download-artifact@v4.1.7 + with: + name: linux_x86_64_release_packages + path: ${{ env.PACKAGE_DOWNLOAD_DIR }} + - name: Setup venv + run: | + ./build_tools/pkgci/setup_venv.py ${VENV_DIR} \ + --artifact-path=${PACKAGE_DOWNLOAD_DIR} \ + --fetch-gh-workflow=${{ inputs.artifact_run_id }} + + # TODO(#17344): regenerate .mlirbc files, test plat_rdna3_rocm on rocm + # # In-tree tests + # - name: Run experimental/regression_suite tests + # run: | + # source ${VENV_DIR}/bin/activate + # pytest \ + # -rA -s -m "plat_host_cpu and presubmit" \ + # experimental/regression_suite - - name: "Run external tests - SDXL prompt encoder" - if: "matrix.sdxl-clip-config-file != '' && !cancelled()" + - name: "Running SDXL special model tests" + if: "!cancelled()" run: | source ${VENV_DIR}/bin/activate - pytest SHARK-TestSuite/iree_tests/pytorch/models/sdxl-prompt-encoder-tank \ + pytest ./experimental/regression_suite/shark-test-suite-models/sdxl \ + -k ${{ matrix.backend }} \ -rpfE \ - -k real_weights \ - --no-skip-tests-missing-files \ --capture=no \ --log-cli-level=info \ --timeout=1200 \ - --durations=0 \ - --config-files=${SDXL_CLIP_CONFIG_FILE_PATH} + --durations=0 + env: + ROCM_CHIP: ${{ matrix.rocm-chip }} - - name: "Run external tests - SDXL vae decode" - if: "matrix.sdxl-vae-config-file != '' && !cancelled()" + - name: "Running SD3 special model tests" + if: "!cancelled()" run: | source ${VENV_DIR}/bin/activate - pytest SHARK-TestSuite/iree_tests/pytorch/models/sdxl-vae-decode-tank \ + pytest ./experimental/regression_suite/shark-test-suite-models/sd3 \ + -k ${{ matrix.backend }} \ -rpfE \ - -k real_weights \ - --no-skip-tests-missing-files \ --capture=no \ --log-cli-level=info \ --timeout=1200 \ - --durations=0 \ - --config-files=${SDXL_VAE_CONFIG_FILE_PATH} + --durations=0 + env: + ROCM_CHIP: ${{ matrix.rocm-chip }} # Note: mi250 benchmark times are more lenient than mi300 (allowing about # 10% deviation from observed averages), 
since the mi250 runners we use @@ -314,7 +330,7 @@ jobs: if: contains(matrix.name, 'rocm_mi250_gfx90a') run: | source ${VENV_DIR}/bin/activate - pytest SHARK-TestSuite/iree_tests/benchmarks/sdxl/benchmark_sdxl_rocm.py \ + pytest ./experimental/benchmarks/sdxl/benchmark_sdxl_rocm.py \ --goldentime-rocm-e2e-ms 1450.0 \ --goldentime-rocm-unet-ms 370.0 \ --goldentime-rocm-clip-ms 18.5 \ @@ -336,7 +352,7 @@ jobs: if: contains(matrix.name, 'rocm_mi300_gfx942') run: | source ${VENV_DIR}/bin/activate - pytest SHARK-TestSuite/iree_tests/benchmarks/sdxl/benchmark_sdxl_rocm.py \ + pytest ./experimental/benchmarks/sdxl/benchmark_sdxl_rocm.py \ --goldentime-rocm-e2e-ms 325.0 \ --goldentime-rocm-unet-ms 77.0 \ --goldentime-rocm-clip-ms 15.5 \ diff --git a/build_tools/pkgci/external_test_suite/models_gpu_rocm_gfx90a_additional_flags.json b/build_tools/pkgci/external_test_suite/models_gpu_rocm_gfx90a_additional_flags.json deleted file mode 100644 index 4537b3f28b7d..000000000000 --- a/build_tools/pkgci/external_test_suite/models_gpu_rocm_gfx90a_additional_flags.json +++ /dev/null @@ -1,25 +0,0 @@ -{ - "config_name": "gpu_rocm", - "iree_compile_flags": [ - "--iree-hal-target-backends=rocm", - "--iree-rocm-target-chip=gfx90a", - "--iree-input-demote-f64-to-f32", - "--iree-opt-const-eval=false", - "--iree-codegen-transform-dialect-library=${IREE_TEST_PATH_EXTENSION}/attention_and_matmul_spec.mlir" - ], - "iree_run_module_flags": [ - "--device=hip" - ], - "skip_compile_tests": [ - "pytorch/models/sdxl-scheduled-unet-3-tank", - "pytorch/models/sdxl-prompt-encoder-tank", - "pytorch/models/sdxl-vae-decode-tank" - ], - "skip_run_tests": [], - "expected_compile_failures": [ - // TODO(#17344): need to regenerate .mlirbc - "pytorch/models/opt-125M", - "pytorch/models/resnet50" - ], - "expected_run_failures": [] -} diff --git a/build_tools/pkgci/external_test_suite/models_gpu_rocm_gfx942_additional_flags.json b/build_tools/pkgci/external_test_suite/models_gpu_rocm_gfx942_additional_flags.json 
deleted file mode 100644 index 28950d0643d5..000000000000 --- a/build_tools/pkgci/external_test_suite/models_gpu_rocm_gfx942_additional_flags.json +++ /dev/null @@ -1,25 +0,0 @@ -{ - "config_name": "gpu_rocm", - "iree_compile_flags": [ - "--iree-hal-target-backends=rocm", - "--iree-rocm-target-chip=gfx942", - "--iree-input-demote-f64-to-f32", - "--iree-opt-const-eval=false", - "--iree-codegen-transform-dialect-library=${IREE_TEST_PATH_EXTENSION}/attention_and_matmul_spec.mlir" - ], - "iree_run_module_flags": [ - "--device=hip" - ], - "skip_compile_tests": [ - "pytorch/models/sdxl-scheduled-unet-3-tank", - "pytorch/models/sdxl-prompt-encoder-tank", - "pytorch/models/sdxl-vae-decode-tank" - ], - "skip_run_tests": [], - "expected_compile_failures": [ - // TODO(#17344): need to regenerate .mlirbc - "pytorch/models/opt-125M", - "pytorch/models/resnet50" - ], - "expected_run_failures": [] -} diff --git a/build_tools/pkgci/external_test_suite/sdxl_prompt_encoder_cpu_llvm_task.json b/build_tools/pkgci/external_test_suite/sdxl_prompt_encoder_cpu_llvm_task.json deleted file mode 100644 index cc39c2d53d9e..000000000000 --- a/build_tools/pkgci/external_test_suite/sdxl_prompt_encoder_cpu_llvm_task.json +++ /dev/null @@ -1,22 +0,0 @@ -{ - "config_name": "cpu_llvm_task", - "iree_compile_flags" : [ - "--iree-hal-target-backends=llvm-cpu", - "--iree-llvmcpu-target-cpu-features=host" - ], - "iree_run_module_flags": [ - "--device=local-task", - "--parameters=model=real_weights.irpa", - "--input=1x64xi64=@inference_input.0.bin", - "--input=1x64xi64=@inference_input.1.bin", - "--input=1x64xi64=@inference_input.2.bin", - "--input=1x64xi64=@inference_input.3.bin", - "--expected_output=2x64x2048xf16=@inference_output.0.bin", - "--expected_output=2x1280xf16=@inference_output.1.bin", - "--expected_f16_threshold=1.0f" - ], - "skip_compile_tests": [], - "skip_run_tests": [], - "expected_compile_failures": [], - "expected_run_failures": [] -} diff --git 
a/build_tools/pkgci/external_test_suite/sdxl_prompt_encoder_gpu_rocm_gfx90a.json b/build_tools/pkgci/external_test_suite/sdxl_prompt_encoder_gpu_rocm_gfx90a.json deleted file mode 100644 index 1aabeb85f3fb..000000000000 --- a/build_tools/pkgci/external_test_suite/sdxl_prompt_encoder_gpu_rocm_gfx90a.json +++ /dev/null @@ -1,36 +0,0 @@ -{ - "config_name": "gpu_rocm", - "iree_compile_flags": [ - "--iree-hal-target-backends=rocm", - "--iree-rocm-target-chip=gfx90a", - "--iree-input-type=torch", - "--iree-opt-const-eval=false", - "--iree-global-opt-propagate-transposes=true", - "--iree-opt-outer-dim-concat=true", - "--iree-rocm-waves-per-eu=2", - "--iree-llvmgpu-enable-prefetch", - "--iree-flow-enable-aggressive-fusion", - "--iree-global-opt-enable-fuse-horizontal-contractions=true", - "--iree-opt-aggressively-propagate-transposes=true", - "--iree-codegen-llvmgpu-use-vector-distribution=true", - "--iree-execution-model=async-external", - "--iree-preprocessing-pass-pipeline=builtin.module(iree-preprocessing-transpose-convolution-pipeline, util.func(iree-preprocessing-pad-to-intrinsics{pad-target-type=conv}))", - "--iree-scheduling-dump-statistics-format=json", - "--iree-scheduling-dump-statistics-file=compilation_info.json" - ], - "iree_run_module_flags": [ - "--device=hip", - "--parameters=model=real_weights.irpa", - "--input=1x64xi64=@inference_input.0.bin", - "--input=1x64xi64=@inference_input.1.bin", - "--input=1x64xi64=@inference_input.2.bin", - "--input=1x64xi64=@inference_input.3.bin", - "--expected_output=2x64x2048xf16=@inference_output.0.bin", - "--expected_output=2x1280xf16=@inference_output.1.bin", - "--expected_f16_threshold=1.0f" - ], - "skip_compile_tests": [], - "skip_run_tests": [], - "expected_compile_failures": [], - "expected_run_failures": [] -} diff --git a/build_tools/pkgci/external_test_suite/sdxl_prompt_encoder_gpu_rocm_gfx942.json b/build_tools/pkgci/external_test_suite/sdxl_prompt_encoder_gpu_rocm_gfx942.json deleted file mode 100644 index 
e3dbc9b75b0c..000000000000 --- a/build_tools/pkgci/external_test_suite/sdxl_prompt_encoder_gpu_rocm_gfx942.json +++ /dev/null @@ -1,36 +0,0 @@ -{ - "config_name": "gpu_rocm", - "iree_compile_flags": [ - "--iree-hal-target-backends=rocm", - "--iree-rocm-target-chip=gfx942", - "--iree-input-type=torch", - "--iree-opt-const-eval=false", - "--iree-global-opt-propagate-transposes=true", - "--iree-opt-outer-dim-concat=true", - "--iree-rocm-waves-per-eu=2", - "--iree-llvmgpu-enable-prefetch", - "--iree-flow-enable-aggressive-fusion", - "--iree-global-opt-enable-fuse-horizontal-contractions=true", - "--iree-opt-aggressively-propagate-transposes=true", - "--iree-codegen-llvmgpu-use-vector-distribution=true", - "--iree-execution-model=async-external", - "--iree-preprocessing-pass-pipeline=builtin.module(iree-preprocessing-transpose-convolution-pipeline, util.func(iree-preprocessing-pad-to-intrinsics{pad-target-type=conv}))", - "--iree-scheduling-dump-statistics-format=json", - "--iree-scheduling-dump-statistics-file=compilation_info.json" - ], - "iree_run_module_flags": [ - "--device=hip", - "--parameters=model=real_weights.irpa", - "--input=1x64xi64=@inference_input.0.bin", - "--input=1x64xi64=@inference_input.1.bin", - "--input=1x64xi64=@inference_input.2.bin", - "--input=1x64xi64=@inference_input.3.bin", - "--expected_output=2x64x2048xf16=@inference_output.0.bin", - "--expected_output=2x1280xf16=@inference_output.1.bin", - "--expected_f16_threshold=1.0f" - ], - "skip_compile_tests": [], - "skip_run_tests": [], - "expected_compile_failures": [], - "expected_run_failures": [] -} diff --git a/build_tools/pkgci/external_test_suite/sdxl_scheduled_unet_cpu_llvm_task.json b/build_tools/pkgci/external_test_suite/sdxl_scheduled_unet_cpu_llvm_task.json deleted file mode 100644 index d2dff6f0a9f4..000000000000 --- a/build_tools/pkgci/external_test_suite/sdxl_scheduled_unet_cpu_llvm_task.json +++ /dev/null @@ -1,23 +0,0 @@ -{ - "config_name": "cpu_llvm_task", - "iree_compile_flags": 
[ - "--iree-hal-target-backends=llvm-cpu", - "--iree-llvmcpu-target-cpu-features=host", - "--iree-input-demote-f64-to-f32" - ], - "iree_run_module_flags": [ - "--device=local-task", - "--parameters=model=real_weights.irpa", - "--module=sdxl_scheduled_unet_pipeline_fp16_cpu.vmfb", - "--input=1x4x128x128xf16=@inference_input.0.bin", - "--input=2x64x2048xf16=@inference_input.1.bin", - "--input=2x1280xf16=@inference_input.2.bin", - "--input=1xf16=@inference_input.3.bin", - "--expected_output=1x4x128x128xf16=@inference_output.0.bin", - "--expected_f16_threshold=0.8f" - ], - "skip_compile_tests": [], - "skip_run_tests": [], - "expected_compile_failures": [], - "expected_run_failures": [] -} diff --git a/build_tools/pkgci/external_test_suite/sdxl_scheduled_unet_gpu_rocm_gfx90a.json b/build_tools/pkgci/external_test_suite/sdxl_scheduled_unet_gpu_rocm_gfx90a.json deleted file mode 100644 index 8ac4f1fc895b..000000000000 --- a/build_tools/pkgci/external_test_suite/sdxl_scheduled_unet_gpu_rocm_gfx90a.json +++ /dev/null @@ -1,39 +0,0 @@ -{ - "config_name": "gpu_rocm", - "iree_compile_flags" : [ - "--iree-hal-target-backends=rocm", - "--iree-rocm-target-chip=gfx90a", - "--iree-opt-const-eval=false", - "--iree-codegen-transform-dialect-library=${IREE_TEST_PATH_EXTENSION}/attention_and_matmul_spec.mlir", - "--iree-global-opt-propagate-transposes=true", - "--iree-global-opt-enable-fuse-horizontal-contractions=true", - "--iree-flow-enable-aggressive-fusion=true", - "--iree-opt-aggressively-propagate-transposes=true", - "--iree-opt-outer-dim-concat=true", - "--iree-vm-target-truncate-unsupported-floats", - "--iree-llvmgpu-enable-prefetch=true", - "--iree-opt-data-tiling=false", - "--iree-codegen-gpu-native-math-precision=true", - "--iree-codegen-llvmgpu-use-vector-distribution", - "--iree-rocm-waves-per-eu=2", - "--iree-execution-model=async-external", - "--iree-preprocessing-pass-pipeline=builtin.module(iree-preprocessing-transpose-convolution-pipeline, 
util.func(iree-preprocessing-pad-to-intrinsics))", - "--iree-scheduling-dump-statistics-format=json", - "--iree-scheduling-dump-statistics-file=compilation_info.json" - ], - "iree_run_module_flags": [ - "--device=hip", - "--parameters=model=real_weights.irpa", - "--module=sdxl_scheduled_unet_pipeline_fp16_rocm.vmfb", - "--input=1x4x128x128xf16=@inference_input.0.bin", - "--input=2x64x2048xf16=@inference_input.1.bin", - "--input=2x1280xf16=@inference_input.2.bin", - "--input=1xf16=@inference_input.3.bin", - "--expected_output=1x4x128x128xf16=@inference_output.0.bin", - "--expected_f16_threshold=0.7f" - ], - "skip_compile_tests": [], - "skip_run_tests": [], - "expected_compile_failures": [], - "expected_run_failures": [] -} diff --git a/build_tools/pkgci/external_test_suite/sdxl_scheduled_unet_gpu_rocm_gfx942.json b/build_tools/pkgci/external_test_suite/sdxl_scheduled_unet_gpu_rocm_gfx942.json deleted file mode 100644 index 289e99b2af17..000000000000 --- a/build_tools/pkgci/external_test_suite/sdxl_scheduled_unet_gpu_rocm_gfx942.json +++ /dev/null @@ -1,41 +0,0 @@ -{ - "config_name": "gpu_rocm", - "iree_compile_flags" : [ - "--iree-hal-target-backends=rocm", - "--iree-rocm-target-chip=gfx942", - "--iree-opt-const-eval=false", - "--iree-codegen-transform-dialect-library=${IREE_TEST_PATH_EXTENSION}/attention_and_matmul_spec.mlir", - "--iree-global-opt-propagate-transposes=true", - "--iree-global-opt-enable-fuse-horizontal-contractions=true", - "--iree-flow-enable-aggressive-fusion=true", - "--iree-opt-aggressively-propagate-transposes=true", - "--iree-opt-outer-dim-concat=true", - "--iree-vm-target-truncate-unsupported-floats", - "--iree-llvmgpu-enable-prefetch=true", - "--iree-opt-data-tiling=false", - "--iree-codegen-gpu-native-math-precision=true", - "--iree-codegen-llvmgpu-use-vector-distribution", - "--iree-rocm-waves-per-eu=2", - "--iree-execution-model=async-external", - 
"--iree-preprocessing-pass-pipeline=builtin.module(iree-preprocessing-transpose-convolution-pipeline, util.func(iree-preprocessing-pad-to-intrinsics))", - "--iree-scheduling-dump-statistics-format=json", - "--iree-scheduling-dump-statistics-file=compilation_info.json" - ], - "iree_run_module_flags": [ - "--device=hip", - "--parameters=model=real_weights.irpa", - "--module=sdxl_scheduled_unet_pipeline_fp16_rocm.vmfb", - "--input=1x4x128x128xf16=@inference_input.0.bin", - "--input=2x64x2048xf16=@inference_input.1.bin", - "--input=2x1280xf16=@inference_input.2.bin", - "--input=1xf16=@inference_input.3.bin", - "--expected_output=1x4x128x128xf16=@inference_output.0.bin", - "--expected_f16_threshold=0.7f" - ], - "skip_compile_tests": [], - "skip_run_tests": [], - "expected_compile_failures": [], - "expected_run_failures": [ - "pytorch/models/sdxl-scheduled-unet-3-tank", - ] -} diff --git a/build_tools/pkgci/external_test_suite/sdxl_vae_decode_cpu_llvm_task.json b/build_tools/pkgci/external_test_suite/sdxl_vae_decode_cpu_llvm_task.json deleted file mode 100644 index 0a8a48c4f00d..000000000000 --- a/build_tools/pkgci/external_test_suite/sdxl_vae_decode_cpu_llvm_task.json +++ /dev/null @@ -1,20 +0,0 @@ -{ - "config_name": "cpu_llvm_task", - "iree_compile_flags" : [ - "--iree-hal-target-backends=llvm-cpu", - "--iree-llvmcpu-target-cpu-features=host" - ], - "iree_run_module_flags": [ - "--device=local-task", - "--parameters=model=real_weights.irpa", - "--input=1x4x128x128xf16=@inference_input.0.bin", - "--expected_output=1x3x1024x1024xf16=@inference_output.0.bin", - "--expected_f16_threshold=0.02f" - ], - "skip_compile_tests": [], - "skip_run_tests": [], - "expected_compile_failures": [], - "expected_run_failures": [ - "pytorch/models/sdxl-vae-decode-tank" - ] -} diff --git a/build_tools/pkgci/external_test_suite/sdxl_vae_decode_gpu_rocm_gfx90a.json b/build_tools/pkgci/external_test_suite/sdxl_vae_decode_gpu_rocm_gfx90a.json deleted file mode 100644 index 
690bffa994ea..000000000000 --- a/build_tools/pkgci/external_test_suite/sdxl_vae_decode_gpu_rocm_gfx90a.json +++ /dev/null @@ -1,29 +0,0 @@ -{ - "config_name": "gpu_rocm", - "iree_compile_flags" : [ - "--iree-hal-target-backends=rocm", - "--iree-rocm-target-chip=gfx90a", - "--iree-opt-const-eval=false", - "--iree-global-opt-propagate-transposes=true", - "--iree-opt-outer-dim-concat=true", - "--iree-llvmgpu-enable-prefetch=true", - "--iree-rocm-waves-per-eu=2", - "--iree-flow-enable-aggressive-fusion", - "--iree-codegen-llvmgpu-use-vector-distribution=true", - "--iree-execution-model=async-external", - "--iree-preprocessing-pass-pipeline=builtin.module(iree-preprocessing-transpose-convolution-pipeline, util.func(iree-preprocessing-pad-to-intrinsics))", - "--iree-scheduling-dump-statistics-format=json", - "--iree-scheduling-dump-statistics-file=compilation_info.json" - ], - "iree_run_module_flags": [ - "--device=hip", - "--parameters=model=real_weights.irpa", - "--input=1x4x128x128xf16=@inference_input.0.bin", - "--expected_output=1x3x1024x1024xf16=@inference_output.0.bin", - "--expected_f16_threshold=0.4f" - ], - "skip_compile_tests": [], - "skip_run_tests": [], - "expected_compile_failures": [], - "expected_run_failures": [] -} diff --git a/build_tools/pkgci/external_test_suite/sdxl_vae_decode_gpu_rocm_gfx942.json b/build_tools/pkgci/external_test_suite/sdxl_vae_decode_gpu_rocm_gfx942.json deleted file mode 100644 index 1ea72517f283..000000000000 --- a/build_tools/pkgci/external_test_suite/sdxl_vae_decode_gpu_rocm_gfx942.json +++ /dev/null @@ -1,29 +0,0 @@ -{ - "config_name": "gpu_rocm", - "iree_compile_flags" : [ - "--iree-hal-target-backends=rocm", - "--iree-rocm-target-chip=gfx942", - "--iree-opt-const-eval=false", - "--iree-global-opt-propagate-transposes=true", - "--iree-opt-outer-dim-concat=true", - "--iree-llvmgpu-enable-prefetch=true", - "--iree-rocm-waves-per-eu=2", - "--iree-flow-enable-aggressive-fusion", - 
"--iree-codegen-llvmgpu-use-vector-distribution=true", - "--iree-execution-model=async-external", - "--iree-preprocessing-pass-pipeline=builtin.module(iree-preprocessing-transpose-convolution-pipeline, util.func(iree-preprocessing-pad-to-intrinsics))", - "--iree-scheduling-dump-statistics-format=json", - "--iree-scheduling-dump-statistics-file=compilation_info.json" - ], - "iree_run_module_flags": [ - "--device=hip", - "--parameters=model=real_weights.irpa", - "--input=1x4x128x128xf16=@inference_input.0.bin", - "--expected_output=1x3x1024x1024xf16=@inference_output.0.bin", - "--expected_f16_threshold=0.4f" - ], - "skip_compile_tests": [], - "skip_run_tests": [], - "expected_compile_failures": [], - "expected_run_failures": [] -} diff --git a/experimental/benchmarks/sdxl/benchmark_sdxl_rocm.py b/experimental/benchmarks/sdxl/benchmark_sdxl_rocm.py new file mode 100644 index 000000000000..25a17050cb2a --- /dev/null +++ b/experimental/benchmarks/sdxl/benchmark_sdxl_rocm.py @@ -0,0 +1,407 @@ +# Copyright 2024 The IREE Authors +# +# Licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. 
+# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +import os +from collections import namedtuple +import logging +from typing import Sequence +import subprocess +import json +from pathlib import Path +import tabulate +from pytest_check import check + +benchmark_dir = os.path.dirname(os.path.realpath(__file__)) +artifacts_dir = os.getenv("IREE_TEST_FILES", default=Path.cwd()) + "/artifacts" +artifacts_dir = Path(os.path.expanduser(artifacts_dir)).resolve() +prompt_encoder_dir = f"{artifacts_dir}/sdxl_clip" +scheduled_unet_dir = f"{artifacts_dir}/sdxl_unet" +vae_decode_dir = f"{artifacts_dir}/sdxl_vae" + + +def run_iree_command(args: Sequence[str] = ()): + command = "Exec:", " ".join(args) + logging.getLogger().info(command) + proc = subprocess.run( + args, stdout=subprocess.PIPE, stderr=subprocess.PIPE, check=False + ) + ( + stdout_v, + stderr_v, + ) = ( + proc.stdout, + proc.stderr, + ) + return_code = proc.returncode + if return_code == 0: + return 0, proc.stdout + logging.getLogger().info(f"Command failed with error: {proc.stderr}") + return 1, proc.stdout + + +def run_sdxl_rocm_benchmark(rocm_chip, gpu_number): + exec_args = [ + "iree-compile", + f"{benchmark_dir}/sdxl_pipeline_bench_f16.mlir", + "--iree-hal-target-backends=rocm", + f"--iree-rocm-target-chip={rocm_chip}", + "--iree-global-opt-propagate-transposes=true", + "--iree-codegen-llvmgpu-use-vector-distribution", + "--iree-codegen-gpu-native-math-precision=true", + "--iree-rocm-waves-per-eu=2", + "--iree-opt-outer-dim-concat=true", + "--iree-llvmgpu-enable-prefetch", + "-o", + f"{benchmark_dir}/sdxl_full_pipeline_fp16_rocm.vmfb", + ] + # iree compile command for full sdxl pipeline + ret_value, stdout = run_iree_command(exec_args) + if ret_value == 1: + return 1, stdout + exec_args = [ + "iree-benchmark-module", + f"--device=hip://{gpu_number}", + "--device_allocator=caching", + f"--module={prompt_encoder_dir}/model.rocm_{rocm_chip}.vmfb", + 
f"--parameters=model={prompt_encoder_dir}/real_weights.irpa", + f"--module={scheduled_unet_dir}/model.rocm_{rocm_chip}.vmfb", + f"--parameters=model={scheduled_unet_dir}/real_weights.irpa", + f"--module={vae_decode_dir}/model.rocm_{rocm_chip}.vmfb", + f"--parameters=model={vae_decode_dir}/real_weights.irpa", + f"--module={benchmark_dir}/sdxl_full_pipeline_fp16_rocm.vmfb", + "--function=tokens_to_image", + "--input=1x4x128x128xf16", + "--input=1xf16", + "--input=1x64xi64", + "--input=1x64xi64", + "--input=1x64xi64", + "--input=1x64xi64", + "--benchmark_repetitions=10", + "--benchmark_min_warmup_time=3.0", + ] + # iree benchmark command for full sdxl pipeline + return run_iree_command(exec_args) + + +def run_sdxl_unet_rocm_benchmark(gpu_number, rocm_chip): + exec_args = [ + "iree-benchmark-module", + f"--device=hip://{gpu_number}", + "--device_allocator=caching", + f"--module={scheduled_unet_dir}/model.rocm_{rocm_chip}.vmfb", + f"--parameters=model={scheduled_unet_dir}/real_weights.irpa", + "--function=run_forward", + "--input=1x4x128x128xf16", + "--input=2x64x2048xf16", + "--input=2x1280xf16", + "--input=2x6xf16", + "--input=1xf16", + "--input=1xi64", + "--benchmark_repetitions=10", + "--benchmark_min_warmup_time=3.0", + ] + # iree benchmark command for full sdxl pipeline + return run_iree_command(exec_args) + + +def run_sdxl_prompt_encoder_rocm_benchmark(gpu_number, rocm_chip): + exec_args = [ + "iree-benchmark-module", + f"--device=hip://{gpu_number}", + "--device_allocator=caching", + f"--module={prompt_encoder_dir}/model.rocm_{rocm_chip}.vmfb", + f"--parameters=model={prompt_encoder_dir}/real_weights.irpa", + "--function=encode_prompts", + "--input=1x64xi64", + "--input=1x64xi64", + "--input=1x64xi64", + "--input=1x64xi64", + "--benchmark_repetitions=10", + "--benchmark_min_warmup_time=3.0", + ] + # iree benchmark command for full sdxl pipeline + return run_iree_command(exec_args) + + +def run_sdxl_vae_decode_rocm_benchmark(gpu_number, rocm_chip): + exec_args = 
[ + "iree-benchmark-module", + f"--device=hip://{gpu_number}", + "--device_allocator=caching", + f"--module={vae_decode_dir}/model.rocm_{rocm_chip}.vmfb", + f"--parameters=model={vae_decode_dir}/real_weights.irpa", + "--function=main", + "--input=1x4x128x128xf16", + "--benchmark_repetitions=10", + "--benchmark_min_warmup_time=3.0", + ] + # iree benchmark command for full sdxl pipeline + return run_iree_command(exec_args) + + +BenchmarkResult = namedtuple( + "BenchmarkResult", "benchmark_name time cpu_time iterations user_counters" +) + + +def decode_output(bench_lines): + benchmark_results = [] + for line in bench_lines: + split = line.split() + if len(split) == 0: + continue + benchmark_name = split[0] + time = " ".join(split[1:3]) + cpu_time = " ".join(split[3:5]) + iterations = split[5] + user_counters = None + if len(split) > 5: + user_counters = split[6] + benchmark_results.append( + BenchmarkResult( + benchmark_name=benchmark_name, + time=time, + cpu_time=cpu_time, + iterations=iterations, + user_counters=user_counters, + ) + ) + return benchmark_results + + +def job_summary_process(ret_value, output): + if ret_value == 1: + logging.getLogger().info("Running SDXL ROCm benchmark failed. 
Exiting") + return + bench_lines = output.decode().split("\n")[3:] + benchmark_results = decode_output(bench_lines) + logging.getLogger().info(benchmark_results) + benchmark_mean_time = float(benchmark_results[10].time.split()[0]) + return benchmark_mean_time + + +def test_sdxl_rocm_benchmark( + goldentime_rocm_e2e, + goldentime_rocm_unet, + goldentime_rocm_clip, + goldentime_rocm_vae, + gpu_number, + rocm_chip, + goldendispatch_rocm_unet, + goldendispatch_rocm_clip, + goldendispatch_rocm_vae, + goldensize_rocm_unet, + goldensize_rocm_clip, + goldensize_rocm_vae, +): + # e2e benchmark + ret_value, output = run_sdxl_rocm_benchmark(rocm_chip, gpu_number) + benchmark_e2e_mean_time = job_summary_process(ret_value, output) + mean_line = ( + f"E2E Benchmark Time: {str(benchmark_e2e_mean_time)} ms" + f" (golden time {goldentime_rocm_e2e} ms)" + ) + logging.getLogger().info(mean_line) + + # unet benchmark + ret_value, output = run_sdxl_unet_rocm_benchmark(gpu_number, rocm_chip) + benchmark_unet_mean_time = job_summary_process(ret_value, output) + mean_line = ( + f"Scheduled Unet Benchmark Time: {str(benchmark_unet_mean_time)} ms" + f" (golden time {goldentime_rocm_unet} ms)" + ) + logging.getLogger().info(mean_line) + + # unet compilation stats check + with open(f"{scheduled_unet_dir}/compilation_info.json", "r") as file: + comp_stats = json.load(file) + unet_dispatch_count = int( + comp_stats["stream-aggregate"]["execution"]["dispatch-count"] + ) + compilation_line = ( + f"Scheduled Unet Dispatch Count: {unet_dispatch_count}" + f" (golden dispatch count {goldendispatch_rocm_unet})" + ) + logging.getLogger().info(compilation_line) + + module_path = f"{scheduled_unet_dir}/model.rocm_{rocm_chip}.vmfb" + unet_binary_size = Path(module_path).stat().st_size + compilation_line = ( + f"Scheduled Unet Binary Size: {unet_binary_size} bytes" + f" (golden binary size {goldensize_rocm_unet} bytes)" + ) + logging.getLogger().info(compilation_line) + + # prompt encoder benchmark + 
ret_value, output = run_sdxl_prompt_encoder_rocm_benchmark(gpu_number, rocm_chip) + benchmark_clip_mean_time = job_summary_process(ret_value, output) + mean_line = ( + f"Prompt Encoder Benchmark Time: {str(benchmark_clip_mean_time)} ms" + f" (golden time {goldentime_rocm_clip} ms)" + ) + logging.getLogger().info(mean_line) + + # prompt encoder compilation stats check + with open(f"{prompt_encoder_dir}/compilation_info.json", "r") as file: + comp_stats = json.load(file) + clip_dispatch_count = int( + comp_stats["stream-aggregate"]["execution"]["dispatch-count"] + ) + compilation_line = ( + f"Prompt Encoder Dispatch Count: {clip_dispatch_count}" + f" (golden dispatch count {goldendispatch_rocm_clip})" + ) + logging.getLogger().info(compilation_line) + + module_path = f"{prompt_encoder_dir}/model.rocm_{rocm_chip}.vmfb" + clip_binary_size = Path(module_path).stat().st_size + compilation_line = ( + f"Prompt Encoder Binary Size: {clip_binary_size} bytes" + f" (golden binary size {goldensize_rocm_clip} bytes)" + ) + logging.getLogger().info(compilation_line) + + # vae decode benchmark + ret_value, output = run_sdxl_vae_decode_rocm_benchmark(gpu_number, rocm_chip) + benchmark_vae_mean_time = job_summary_process(ret_value, output) + mean_line = ( + f"VAE Decode Benchmark Time: {str(benchmark_vae_mean_time)} ms" + f" (golden time {goldentime_rocm_vae} ms)" + ) + logging.getLogger().info(mean_line) + + # vae decode compilation stats check + with open(f"{vae_decode_dir}/compilation_info.json", "r") as file: + comp_stats = json.load(file) + vae_dispatch_count = int( + comp_stats["stream-aggregate"]["execution"]["dispatch-count"] + ) + compilation_line = ( + f"VAE Decode Dispatch Count: {vae_dispatch_count}" + f" (golden dispatch count {goldendispatch_rocm_vae})" + ) + logging.getLogger().info(compilation_line) + + module_path = f"{vae_decode_dir}/model.rocm_{rocm_chip}.vmfb" + vae_binary_size = Path(module_path).stat().st_size + compilation_line = ( + f"VAE Decode Binary Size: 
{vae_binary_size} bytes" + f" (golden binary size {goldensize_rocm_vae} bytes)" + ) + logging.getLogger().info(compilation_line) + + # Create mean time table's header and rows + mean_time_header = ["Benchmark", "Current time (ms)", "Expected/golden time (ms)"] + mean_time_rows = [ + ["E2E†", f"{benchmark_e2e_mean_time}", f"{goldentime_rocm_e2e}"], + ["Scheduled Unet", f"{benchmark_unet_mean_time}", f"{goldentime_rocm_unet}"], + ["Prompt Encoder", f"{benchmark_clip_mean_time}", f"{goldentime_rocm_clip}"], + ["VAE Decode", f"{benchmark_vae_mean_time}", f"{goldentime_rocm_vae}"], + ] + + # Create dispatch count table's header and rows + dispatch_count_header = [ + "Benchmark", + "Current dispatch count", + "Expected/golden dispatch count", + ] + dispatch_count_rows = [ + ["Scheduled Unet", f"{unet_dispatch_count}", f"{goldendispatch_rocm_unet}"], + ["Prompt Encoder", f"{clip_dispatch_count}", f"{goldendispatch_rocm_clip}"], + ["VAE Decode", f"{vae_dispatch_count}", f"{goldendispatch_rocm_vae}"], + ] + + # Create binary size table's header and rows + binary_size_header = [ + "Benchmark", + "Current binary size (bytes)", + "Expected/golden binary size (bytes)", + ] + binary_size_rows = [ + ["Scheduled Unet", f"{unet_binary_size}", f"{goldensize_rocm_unet}"], + ["Prompt Encoder", f"{clip_binary_size}", f"{goldensize_rocm_clip}"], + ["VAE Decode", f"{vae_binary_size}", f"{goldensize_rocm_vae}"], + ] + + # Create mean time table using tabulate + mean_time_full = [mean_time_header] + mean_time_rows + mean_time_table = tabulate.tabulate( + mean_time_full, headers="firstrow", tablefmt="pipe" + ) + + # Create dispatch count table using tabulate + dispatch_count_full = [dispatch_count_header] + dispatch_count_rows + dispatch_count_table = tabulate.tabulate( + dispatch_count_full, headers="firstrow", tablefmt="pipe" + ) + + # Create binary size of compiled artifacts table using tabulate + binary_size_full = [binary_size_header] + binary_size_rows + binary_size_table = 
tabulate.tabulate( + binary_size_full, headers="firstrow", tablefmt="pipe" + ) + + # Write markdown tables to job summary file + with open("job_summary.md", "w") as job_summary: + print("SDXL Benchmark Summary:\n", file=job_summary) + print(mean_time_table, file=job_summary) + print("\n† E2E = Encode + Scheduled Unet * 3 + Decode\n", file=job_summary) + print(dispatch_count_table, file=job_summary) + print("\n", file=job_summary) + print(binary_size_table, file=job_summary) + + # Check all values are either <= than golden values for times and == for compilation statistics. + + check.less_equal( + benchmark_e2e_mean_time, + goldentime_rocm_e2e, + "SDXL e2e benchmark time should not regress", + ) + check.less_equal( + benchmark_unet_mean_time, + goldentime_rocm_unet, + "SDXL unet benchmark time should not regress", + ) + check.equal( + unet_dispatch_count, + goldendispatch_rocm_unet, + "SDXL scheduled unet dispatch count should not regress", + ) + check.less_equal( + unet_binary_size, + goldensize_rocm_unet, + "SDXL scheduled unet binary size should not get bigger", + ) + check.less_equal( + benchmark_clip_mean_time, + goldentime_rocm_clip, + "SDXL prompt encoder benchmark time should not regress", + ) + check.equal( + clip_dispatch_count, + goldendispatch_rocm_clip, + "SDXL prompt encoder dispatch count should not regress", + ) + check.less_equal( + clip_binary_size, + goldensize_rocm_clip, + "SDXL prompt encoder binary size should not get bigger", + ) + check.less_equal( + benchmark_vae_mean_time, + goldentime_rocm_vae, + "SDXL vae decode benchmark time should not regress", + ) + check.equal( + vae_dispatch_count, + goldendispatch_rocm_vae, + "SDXL vae decode dispatch count should not regress", + ) + check.less_equal( + vae_binary_size, + goldensize_rocm_vae, + "SDXL vae decode binary size should not get bigger", + ) diff --git a/experimental/benchmarks/sdxl/conftest.py b/experimental/benchmarks/sdxl/conftest.py new file mode 100644 index 000000000000..9ac43d9995ff 
--- /dev/null +++ b/experimental/benchmarks/sdxl/conftest.py @@ -0,0 +1,144 @@ +import pytest + + +def pytest_addoption(parser): + parser.addoption( + "--goldentime-rocm-e2e-ms", + action="store", + type=float, + help="Golden time to test benchmark", + ) + parser.addoption( + "--goldentime-rocm-unet-ms", + action="store", + type=float, + help="Golden time to test benchmark", + ) + parser.addoption( + "--goldentime-rocm-clip-ms", + action="store", + type=float, + help="Golden time to test benchmark", + ) + parser.addoption( + "--goldentime-rocm-vae-ms", + action="store", + type=float, + help="Golden time to test benchmark", + ) + parser.addoption( + "--goldendispatch-rocm-unet", + action="store", + default=1718, + type=int, + help="Golden dispatch count to test benchmark", + ) + parser.addoption( + "--goldendispatch-rocm-clip", + action="store", + default=1571, + type=int, + help="Golden dispatch count to test benchmark", + ) + parser.addoption( + "--goldendispatch-rocm-vae", + action="store", + default=250, + type=int, + help="Golden dispatch count to test benchmark", + ) + parser.addoption( + "--goldensize-rocm-unet-bytes", + action="store", + default=2088217, + type=int, + help="Golden vmfb size to test benchmark", + ) + parser.addoption( + "--goldensize-rocm-clip-bytes", + action="store", + default=785493, + type=int, + help="Golden vmfb size to test benchmark", + ) + parser.addoption( + "--goldensize-rocm-vae-bytes", + action="store", + default=762067, + type=int, + help="Golden vmfb size to test benchmark", + ) + parser.addoption( + "--gpu-number", + action="store", + default=0, + type=int, + help="IREE GPU device number to test on", + ) + parser.addoption( + "--rocm-chip", + action="store", + default="gfx90a", + type=str, + help="ROCm target chip configuration of GPU", + ) + + +@pytest.fixture +def goldentime_rocm_e2e(request): + return request.config.getoption("--goldentime-rocm-e2e-ms") + + +@pytest.fixture +def goldentime_rocm_unet(request): + return 
request.config.getoption("--goldentime-rocm-unet-ms") + + +@pytest.fixture +def goldentime_rocm_clip(request): + return request.config.getoption("--goldentime-rocm-clip-ms") + + +@pytest.fixture +def goldentime_rocm_vae(request): + return request.config.getoption("--goldentime-rocm-vae-ms") + + +@pytest.fixture +def goldendispatch_rocm_unet(request): + return request.config.getoption("--goldendispatch-rocm-unet") + + +@pytest.fixture +def goldendispatch_rocm_clip(request): + return request.config.getoption("--goldendispatch-rocm-clip") + + +@pytest.fixture +def goldendispatch_rocm_vae(request): + return request.config.getoption("--goldendispatch-rocm-vae") + + +@pytest.fixture +def goldensize_rocm_unet(request): + return request.config.getoption("--goldensize-rocm-unet-bytes") + + +@pytest.fixture +def goldensize_rocm_clip(request): + return request.config.getoption("--goldensize-rocm-clip-bytes") + + +@pytest.fixture +def goldensize_rocm_vae(request): + return request.config.getoption("--goldensize-rocm-vae-bytes") + + +@pytest.fixture +def rocm_chip(request): + return request.config.getoption("--rocm-chip") + + +@pytest.fixture +def gpu_number(request): + return request.config.getoption("--gpu-number") diff --git a/experimental/benchmarks/sdxl/sdxl_pipeline_bench_f16.mlir b/experimental/benchmarks/sdxl/sdxl_pipeline_bench_f16.mlir new file mode 100644 index 000000000000..cbf58e458ff4 --- /dev/null +++ b/experimental/benchmarks/sdxl/sdxl_pipeline_bench_f16.mlir @@ -0,0 +1,23 @@ +module @sdxl_compiled_pipeline { + func.func private @compiled_scheduled_unet.run_initialize(%arg0: tensor<1x4x128x128xf16>) -> (tensor<1x4x128x128xf16>, tensor<2x6xf16>, tensor) attributes {torch.args_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: \22builtins.list\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}]}, {\22type\22: \22builtins.dict\22, 
\22context\22: \22[]\22, \22children_spec\22: []}]}]", torch.return_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]}]"} + func.func private @compiled_scheduled_unet.run_forward(%arg0: tensor<1x4x128x128xf16>, %arg1: tensor<2x64x2048xf16>, %arg2: tensor<2x1280xf16>, %arg3: tensor<2x6xf16>, %arg4: tensor<1xf16>, %arg5: tensor<1xi64>) -> tensor<1x4x128x128xf16> attributes {torch.args_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: \22builtins.list\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]}, {\22type\22: \22builtins.dict\22, \22context\22: \22[]\22, \22children_spec\22: []}]}]", torch.return_schema = "[1, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]"} + func.func private @compiled_clip.encode_prompts(%arg0: tensor<1x64xi64>, %arg1: tensor<1x64xi64>, %arg2: tensor<1x64xi64>, %arg3: tensor<1x64xi64>) -> (tensor<2x64x2048xf16>, tensor<2x1280xf16>) attributes {torch.args_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: \22builtins.list\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}, 
{\22type\22: null, \22context\22: null, \22children_spec\22: []}]}, {\22type\22: \22builtins.dict\22, \22context\22: \22[]\22, \22children_spec\22: []}]}]", torch.return_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]}]"} + func.func private @compiled_vae.main(%arg0: tensor<1x4x128x128xf16>) -> tensor<1x3x1024x1024xf16> attributes {torch.args_schema = "[1, {\22type\22: \22builtins.tuple\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: \22builtins.list\22, \22context\22: \22null\22, \22children_spec\22: [{\22type\22: null, \22context\22: null, \22children_spec\22: []}]}, {\22type\22: \22builtins.dict\22, \22context\22: \22[]\22, \22children_spec\22: []}]}]", torch.return_schema = "[1, {\22type\22: null, \22context\22: null, \22children_spec\22: []}]"} + + func.func @tokens_to_image(%sample: tensor<1x4x128x128xf16>, %guidance_scale: tensor<1xf16>, %t_ids_1: tensor<1x64xi64>, %t_ids_2: tensor<1x64xi64>, %u_ids_1: tensor<1x64xi64>, %u_ids_2: tensor<1x64xi64>) -> tensor<1x3x1024x1024xf16> { + %p_embeds, %t_embeds = func.call @compiled_clip.encode_prompts(%t_ids_1, %t_ids_2, %u_ids_1, %u_ids_2) : (tensor<1x64xi64>, tensor<1x64xi64>, tensor<1x64xi64>, tensor<1x64xi64>) -> (tensor<2x64x2048xf16>, tensor<2x1280xf16>) + %noisy_sample, %time_ids, %steps = func.call @compiled_scheduled_unet.run_initialize(%sample) : (tensor<1x4x128x128xf16>) -> (tensor<1x4x128x128xf16>, tensor<2x6xf16>, tensor) + %c0 = arith.constant 0 : index + %c1 = arith.constant 1 : index + %steps_int = tensor.extract %steps[] : tensor + %n_steps = arith.index_cast %steps_int: i64 to index + %res = scf.for %arg0 = %c0 to %n_steps step %c1 iter_args(%arg = %noisy_sample) -> (tensor<1x4x128x128xf16>) { + %step_64 = arith.index_cast %arg0 : index to i64 + %this_step = tensor.from_elements %step_64 : tensor<1xi64> 
+ %inner = func.call @compiled_scheduled_unet.run_forward(%arg, %p_embeds, %t_embeds, %time_ids, %guidance_scale, %this_step) : (tensor<1x4x128x128xf16>, tensor<2x64x2048xf16>, tensor<2x1280xf16>, tensor<2x6xf16>, tensor<1xf16>, tensor<1xi64>) -> tensor<1x4x128x128xf16> + scf.yield %inner : tensor<1x4x128x128xf16> + } + %image = func.call @compiled_vae.main(%res): (tensor<1x4x128x128xf16>) -> tensor<1x3x1024x1024xf16> + return %image : tensor<1x3x1024x1024xf16> + } +} diff --git a/experimental/regression_suite/ireers/artifacts.py b/experimental/regression_suite/ireers/artifacts.py deleted file mode 100644 index 12ad3808e48f..000000000000 --- a/experimental/regression_suite/ireers/artifacts.py +++ /dev/null @@ -1,154 +0,0 @@ -# Copyright 2023 The IREE Authors -# -# Licensed under the Apache License v2.0 with LLVM Exceptions. -# See https://llvm.org/LICENSE.txt for license information. -# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception - -from typing import Any, Callable, Collection, Dict, Union -import functools -from pathlib import Path -from tqdm import tqdm -import urllib.parse -import urllib.request - - -def show_progress(t): - last_b = [0] - - def update_to(b=1, bsize=1, tsize=None): - if tsize is not None: - t.total = tsize - t.update((b - last_b[0]) * bsize) - last_b[0] = b - - return update_to - - -@functools.cache -def get_artifact_root_dir() -> Path: - # TODO: Make configurable. 
- return Path.cwd() / "artifacts" - - -class ArtifactGroup: - """A group of artifacts with a persistent location on disk.""" - - _INSTANCES: Dict[str, "ArtifactGroup"] = {} - - def __init__(self, group_name: str): - self.group_name = group_name - if group_name: - self.directory = get_artifact_root_dir() / group_name - else: - self.directory = get_artifact_root_dir() - self.directory.mkdir(parents=True, exist_ok=True) - - @classmethod - def get(cls, group: Union["ArtifactGroup", str]) -> "ArtifactGroup": - if isinstance(group, ArtifactGroup): - return group - try: - return cls._INSTANCES[group] - except KeyError: - instance = ArtifactGroup(group) - cls._INSTANCES[group] = instance - return instance - - -class Artifact: - """Some form of artifact materialized to disk.""" - - def __init__( - self, - group: Union[ArtifactGroup, str], - name: str, - depends: Collection["Artifact"] = (), - ): - self.group = ArtifactGroup.get(group) - self.name = name - self.depends = tuple(depends) - - @property - def path(self) -> Path: - return self.group.directory / self.name - - def join(self): - """Waits for the artifact to become available.""" - pass - - def __str__(self): - return str(self.path) - - -class ProducedArtifact(Artifact): - def __init__( - self, - group: Union[ArtifactGroup, str], - name: str, - callback: Callable[["ProducedArtifact"], Any], - *, - always_produce: bool = False, - depends: Collection["Artifact"] = (), - ): - self.group = ArtifactGroup.get(group) - super().__init__(group, name, depends) - self.name = name - self.callback = callback - self.always_produce = always_produce - - @property - def stamp_path(self) -> Path: - """Path of a stamp file which indicates successful transfer.""" - return self.path.with_suffix(self.path.suffix + ".stamp") - - def start(self) -> "ProducedArtifact": - if not self.always_produce and self.stamp_path.exists(): - if self.path.exists(): - print(f"Not producing {self} because it has already been produced") - return self - 
self.stamp_path.unlink() - self.callback(self) - if not self.path.exists(): - raise RuntimeError( - f"Artifact {self} succeeded generation but was not produced" - ) - self.stamp() - return self - - def stamp(self): - self.stamp_path.touch() - - -class FetchedArtifact(ProducedArtifact): - """Represents an artifact that is to be fetched.""" - - def __init__(self, group: Union[ArtifactGroup, str], url: str): - name = Path(urllib.parse.urlparse(url).path).name - super().__init__(group, name, FetchedArtifact._callback) - self.url = url - - @staticmethod - def _callback(self: "FetchedArtifact"): - print(f"Downloading {self.url} -> {self.path}", flush=True, end="") - with tqdm( - unit="B", - unit_scale=True, - unit_divisor=1024, - miniters=1, - desc=str(self.path), - ) as t: - urllib.request.urlretrieve(self.url, self.path, reporthook=show_progress(t)) - print(f": Retrieved {self.path.stat().st_size} bytes") - - -class StreamArtifact(Artifact): - def __init__(self, group: Union[ArtifactGroup, str], name: str): - super().__init__(group, name) - self.io = open(self.path, "ab", buffering=0) - - def __del__(self): - self.io.close() - - def write_line(self, line: Union[str, bytes]): - contents = line if isinstance(line, bytes) else line.encode() - self.io.write(contents + b"\n") diff --git a/experimental/regression_suite/ireers/__init__.py b/experimental/regression_suite/ireers_tools/__init__.py similarity index 100% rename from experimental/regression_suite/ireers/__init__.py rename to experimental/regression_suite/ireers_tools/__init__.py diff --git a/experimental/regression_suite/ireers_tools/artifacts.py b/experimental/regression_suite/ireers_tools/artifacts.py new file mode 100644 index 000000000000..056039364341 --- /dev/null +++ b/experimental/regression_suite/ireers_tools/artifacts.py @@ -0,0 +1,234 @@ +# Copyright 2023 The IREE Authors +# +# Licensed under the Apache License v2.0 with LLVM Exceptions. +# See https://llvm.org/LICENSE.txt for license information. 
# Copyright 2023 The IREE Authors
#
# Licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

"""Artifact download/caching helpers for the regression suite (ireers_tools).

Artifacts live under $IREE_TEST_FILES/artifacts/<group>/ and are fetched on
demand; Azure-hosted files are skipped when the local MD5 already matches the
blob's content_md5.
"""

from typing import Any, Callable, Collection, Dict, Union
import functools
import hashlib
import logging
import mmap
import os
import re
import urllib.parse
import urllib.request
from pathlib import Path

# NOTE(review): the original also imported `tqdm`, but nothing in this module
# uses it anymore (the old urlretrieve-based fetch was replaced by the Azure
# client), so the unused import is dropped.
logger = logging.getLogger(__name__)
# Adjust logging levels: keep our INFO messages, quiet the chatty azure SDK.
logging.basicConfig(level=logging.INFO)
for _log_name in list(logging.Logger.manager.loggerDict):
    if _log_name.startswith("azure"):
        logging.getLogger(_log_name).setLevel(logging.WARNING)


def show_progress(t):
    """Return an urlretrieve-style reporthook that advances progress bar `t`.

    `t` is expected to expose `total` and `update()` (tqdm-compatible) —
    the bar object is supplied by the caller.
    """
    last_b = [0]

    def update_to(b=1, bsize=1, tsize=None):
        if tsize is not None:
            t.total = tsize
        t.update((b - last_b[0]) * bsize)
        last_b[0] = b

    return update_to


@functools.cache
def get_artifact_root_dir() -> Path:
    """Root directory for all cached artifacts ($IREE_TEST_FILES/artifacts)."""
    root_path = os.getenv("IREE_TEST_FILES", default=str(Path.cwd())) + "/artifacts"
    return Path(os.path.expanduser(root_path)).resolve()


class ArtifactGroup:
    """A group of artifacts with a persistent location on disk."""

    # Interned instances, keyed by group name.
    _INSTANCES: Dict[str, "ArtifactGroup"] = {}

    def __init__(self, group_name: str):
        self.group_name = group_name
        # An empty group name maps straight to the artifact root.
        if group_name:
            self.directory = get_artifact_root_dir() / group_name
        else:
            self.directory = get_artifact_root_dir()
        self.directory.mkdir(parents=True, exist_ok=True)

    @classmethod
    def get(cls, group: Union["ArtifactGroup", str]) -> "ArtifactGroup":
        """Intern groups by name so each name maps to a single instance."""
        if isinstance(group, ArtifactGroup):
            return group
        if group not in cls._INSTANCES:
            cls._INSTANCES[group] = ArtifactGroup(group)
        return cls._INSTANCES[group]


class Artifact:
    """Some form of artifact materialized to disk."""

    def __init__(
        self,
        group: Union[ArtifactGroup, str],
        name: str,
        depends: Collection["Artifact"] = (),
    ):
        self.group = ArtifactGroup.get(group)
        self.name = name
        self.depends = tuple(depends)

    @property
    def path(self) -> Path:
        return self.group.directory / self.name

    def join(self):
        """Waits for the artifact to become available."""
        pass

    def __str__(self):
        return str(self.path)


class ProducedArtifact(Artifact):
    """An artifact generated on demand by a callback."""

    def __init__(
        self,
        group: Union[ArtifactGroup, str],
        name: str,
        callback: Callable[["ProducedArtifact"], Any],
        *,
        depends: Collection["Artifact"] = (),
    ):
        # Fix: the base __init__ already resolves/assigns group and name; the
        # original redundantly pre-assigned self.group and re-assigned
        # self.name around the super() call.
        super().__init__(group, name, depends)
        self.callback = callback

    def start(self) -> "ProducedArtifact":
        """Run the producer callback and verify it actually created the file."""
        self.callback(self)
        if not self.path.exists():
            raise RuntimeError(
                f"Artifact {self} succeeded generation but was not produced"
            )
        return self


class FetchedArtifact(ProducedArtifact):
    """Represents an artifact that is to be fetched."""

    def __init__(self, group: Union[ArtifactGroup, str], url: str):
        # The on-disk name is the last path component of the URL.
        name = Path(urllib.parse.urlparse(url).path).name
        super().__init__(group, name, FetchedArtifact._callback)
        self.url = url

    def human_readable_size(self, size, decimal_places=2):
        """Format a byte count as e.g. '3.00 GiB' (caps at PiB)."""
        for unit in ["B", "KiB", "MiB", "GiB", "TiB", "PiB"]:
            if size < 1024.0 or unit == "PiB":
                break
            size /= 1024.0
        return f"{size:.{decimal_places}f} {unit}"

    def get_azure_md5(self, remote_file: str, azure_blob_properties):
        """Gets the content_md5 hash for a blob on Azure, if available.

        `azure_blob_properties` is dict-like (azure BlobProperties); returns
        the raw digest bytes or None when the property is absent.
        """
        content_settings = azure_blob_properties.get("content_settings")
        if not content_settings:
            return None
        azure_md5 = content_settings.get("content_md5")
        if not azure_md5:
            logger.warning(
                f"  Remote file '{remote_file}' on Azure is missing the "
                "'content_md5' property, can't check if local matches remote"
            )
        return azure_md5

    def get_local_md5(self, local_file_path: Path):
        """Gets the content_md5 hash for a local file, if it exists."""
        if not local_file_path.exists() or local_file_path.stat().st_size == 0:
            return None
        # Fix: open in binary mode (these are binary artifacts) and avoid
        # shadowing the file handle with the mmap view; the mmap gives a
        # zero-copy buffer for hashing large files.
        with open(local_file_path, "rb") as f, mmap.mmap(
            f.fileno(), 0, access=mmap.ACCESS_READ
        ) as mapped:
            return hashlib.md5(mapped).digest()

    def download_azure_artifact(self: "FetchedArtifact"):
        """Download self.url from Azure blob storage, skipping the transfer
        when the cached local copy's MD5 already matches the remote's.
        """
        # Lazy import so the module stays importable without the azure SDK;
        # it is only required when an Azure URL is actually fetched.
        from azure.storage.blob import BlobClient

        remote_file_name = self.url.rsplit("/", 1)[-1]

        # Extract path components from Azure URL to use with the Azure Storage
        # Blobs client library for Python
        # (https://pypi.org/project/azure-storage-blob/).
        #
        # For example:
        #   https://sharkpublic.blob.core.windows.net/sharkpublic/path/to/blob.txt
        #   account_url:    https://sharkpublic.blob.core.windows.net
        #   container_name: sharkpublic
        #   blob_name:      path/to/blob.txt
        match = re.search(r"(https.+\.net)/([^/]+)/(.+)", self.url)
        if match is None:
            # Fix: the original dereferenced the match unconditionally and
            # would raise an opaque AttributeError on a malformed URL.
            raise ValueError(f"Unrecognized Azure blob URL: '{self.url}'")
        account_url, container_name, blob_name = match.groups()

        with BlobClient(
            account_url,
            container_name,
            blob_name,
            max_chunk_get_size=1024 * 1024 * 32,  # 32 MiB
            max_single_get_size=1024 * 1024 * 32,  # 32 MiB
        ) as blob_client:
            blob_properties = blob_client.get_blob_properties()
            blob_size_str = self.human_readable_size(blob_properties.size)
            azure_md5 = self.get_azure_md5(self.url, blob_properties)
            local_md5 = self.get_local_md5(self.path)

            if azure_md5 and azure_md5 == local_md5:
                logger.info(
                    f"  Skipping '{remote_file_name}' download ({blob_size_str}) "
                    "- local MD5 hash matches"
                )
                return

            # Fix: the original duplicated the whole download body in both
            # branches; only the log message differs.
            suffix = "" if not local_md5 else " (local MD5 does not match)"
            logger.info(
                f"  Downloading '{remote_file_name}' ({blob_size_str}) "
                f"to '{self.path}'{suffix}"
            )
            with open(self.path, mode="wb") as local_blob:
                download_stream = blob_client.download_blob(max_concurrency=4)
                local_blob.write(download_stream.readall())

    @staticmethod
    def _callback(self: "FetchedArtifact"):
        # Dispatch on URL host; only Azure blob storage is supported today.
        if "blob.core.windows.net" in self.url:
            self.download_azure_artifact()
        else:
            raise NotImplementedError(
                f"Unsupported fetched artifact URL schema for '{self.url}'"
            )


class StreamArtifact(Artifact):
    """An artifact written incrementally, line by line (e.g. logs)."""

    def __init__(self, group: Union[ArtifactGroup, str], name: str):
        super().__init__(group, name)
        # Unbuffered append so partial progress survives a crash.
        self.io = open(self.path, "ab", buffering=0)

    def __del__(self):
        self.io.close()

    def write_line(self, line: Union[str, bytes]):
        contents = line if isinstance(line, bytes) else line.encode()
        self.io.write(contents + b"\n")
# --- File 1/2 added by this hunk: shark-test-suite-models/conftest.py ---
# Copyright 2024 The IREE Authors
#
# Licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception


class VmfbManager:
    # Shared slots where the compile tests stash compiled .vmfb artifacts so
    # the dependent run tests (via pytest-depends) can pick them up.
    sdxl_clip_cpu_vmfb = None
    sdxl_vae_cpu_vmfb = None
    sdxl_unet_cpu_vmfb = None
    sdxl_clip_rocm_vmfb = None
    sdxl_vae_rocm_vmfb = None
    sdxl_unet_rocm_vmfb = None
    sdxl_unet_cpu_pipeline_vmfb = None
    sdxl_unet_rocm_pipeline_vmfb = None
    sd3_clip_cpu_vmfb = None
    sd3_vae_cpu_vmfb = None
    sd3_mmdit_cpu_vmfb = None
    sd3_clip_rocm_vmfb = None
    sd3_vae_rocm_vmfb = None
    sd3_mmdit_rocm_vmfb = None


# --- File 2/2 added by this hunk: shark-test-suite-models/sd3/test_clip.py ---
# Copyright 2024 The IREE Authors
#
# Licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

"""Compile/run regression tests for the SD3 CLIP prompt encoder."""

import pytest
from ireers_tools import *
import os
from conftest import VmfbManager

rocm_chip = os.getenv("ROCM_CHIP", default="gfx90a")

###############################################################################
# Fixtures
###############################################################################

sd3_clip_inference_input_0 = fetch_source_fixture(
    "https://sharkpublic.blob.core.windows.net/sharkpublic/sai/sd3-prompt-encoder/inference_input.0.bin",
    group="sd3_clip",
)

sd3_clip_inference_input_1 = fetch_source_fixture(
    "https://sharkpublic.blob.core.windows.net/sharkpublic/sai/sd3-prompt-encoder/inference_input.1.bin",
    group="sd3_clip",
)

sd3_clip_inference_input_2 = fetch_source_fixture(
    "https://sharkpublic.blob.core.windows.net/sharkpublic/sai/sd3-prompt-encoder/inference_input.2.bin",
    group="sd3_clip",
)

sd3_clip_inference_input_3 = fetch_source_fixture(
    "https://sharkpublic.blob.core.windows.net/sharkpublic/sai/sd3-prompt-encoder/inference_input.3.bin",
    group="sd3_clip",
)

sd3_clip_inference_input_4 = fetch_source_fixture(
    "https://sharkpublic.blob.core.windows.net/sharkpublic/sai/sd3-prompt-encoder/inference_input.4.bin",
    group="sd3_clip",
)

sd3_clip_inference_input_5 = fetch_source_fixture(
    "https://sharkpublic.blob.core.windows.net/sharkpublic/sai/sd3-prompt-encoder/inference_input.5.bin",
    group="sd3_clip",
)

sd3_clip_inference_output_0 = fetch_source_fixture(
    "https://sharkpublic.blob.core.windows.net/sharkpublic/sai/sd3-prompt-encoder/inference_output.0.bin",
    group="sd3_clip",
)

sd3_clip_inference_output_1 = fetch_source_fixture(
    "https://sharkpublic.blob.core.windows.net/sharkpublic/sai/sd3-prompt-encoder/inference_output.1.bin",
    group="sd3_clip",
)

sd3_clip_real_weights = fetch_source_fixture(
    "https://sharkpublic.blob.core.windows.net/sharkpublic/sai/sd3-prompt-encoder/real_weights.irpa",
    group="sd3_clip",
)

sd3_clip_mlir = fetch_source_fixture(
    "https://sharkpublic.blob.core.windows.net/sharkpublic/sai/sd3-prompt-encoder/model.mlirbc",
    group="sd3_clip",
)

CPU_COMPILE_FLAGS = [
    "--iree-hal-target-backends=llvm-cpu",
    "--iree-llvmcpu-target-cpu-features=host",
    "--iree-llvmcpu-fail-on-out-of-bounds-stack-allocation=false",
    "--iree-llvmcpu-distribution-size=32",
    "--iree-opt-const-eval=false",
    "--iree-llvmcpu-enable-ukernels=all",
    "--iree-global-opt-enable-quantized-matmul-reassociation",
]


@pytest.fixture
def SD3_CLIP_COMMON_RUN_FLAGS(
    sd3_clip_inference_input_0,
    sd3_clip_inference_input_1,
    sd3_clip_inference_input_2,
    sd3_clip_inference_input_3,
    sd3_clip_inference_input_4,
    sd3_clip_inference_input_5,
    sd3_clip_inference_output_0,
    sd3_clip_inference_output_1,
):
    return [
        f"--input=1x77x2xi64=@{sd3_clip_inference_input_0.path}",
        f"--input=1x77x2xi64=@{sd3_clip_inference_input_1.path}",
        f"--input=1x77x2xi64=@{sd3_clip_inference_input_2.path}",
        f"--input=1x77x2xi64=@{sd3_clip_inference_input_3.path}",
        f"--input=1x77x2xi64=@{sd3_clip_inference_input_4.path}",
        f"--input=1x77x2xi64=@{sd3_clip_inference_input_5.path}",
        f"--expected_output=2x154x4096xf32=@{sd3_clip_inference_output_0.path}",
        f"--expected_output=2x2048xf32=@{sd3_clip_inference_output_1.path}",
    ]


ROCM_COMPILE_FLAGS = [
    "--iree-hal-target-backends=rocm",
    f"--iree-rocm-target-chip={rocm_chip}",
    "--iree-input-type=torch",
    "--iree-opt-const-eval=false",
    "--iree-global-opt-propagate-transposes=true",
    "--iree-opt-outer-dim-concat=true",
    "--iree-rocm-waves-per-eu=2",
    "--iree-llvmgpu-enable-prefetch",
    "--iree-flow-enable-aggressive-fusion",
    "--iree-global-opt-enable-fuse-horizontal-contractions=true",
    "--iree-opt-aggressively-propagate-transposes=true",
    "--iree-codegen-llvmgpu-use-vector-distribution=true",
    "--iree-execution-model=async-external",
    "--iree-preprocessing-pass-pipeline=builtin.module(iree-preprocessing-transpose-convolution-pipeline, util.func(iree-preprocessing-pad-to-intrinsics{pad-target-type=conv}))",
]

###############################################################################
# CPU
###############################################################################


def test_compile_clip_cpu(sd3_clip_mlir):
    VmfbManager.sd3_clip_cpu_vmfb = iree_compile(
        sd3_clip_mlir, "cpu", CPU_COMPILE_FLAGS
    )


@pytest.mark.depends(on=["test_compile_clip_cpu"])
def test_run_clip_cpu(SD3_CLIP_COMMON_RUN_FLAGS, sd3_clip_real_weights):
    iree_run_module(
        VmfbManager.sd3_clip_cpu_vmfb,
        device="local-task",
        function="encode_tokens",
        args=[
            f"--parameters=model={sd3_clip_real_weights.path}",
            "--expected_f32_threshold=0.15f",
        ]
        + SD3_CLIP_COMMON_RUN_FLAGS,
    )


###############################################################################
# ROCM
###############################################################################


@pytest.mark.xfail(
    strict=True,
    reason="Expected compilation to fail",
)
def test_compile_clip_rocm(sd3_clip_mlir):
    VmfbManager.sd3_clip_rocm_vmfb = iree_compile(
        sd3_clip_mlir, f"rocm_{rocm_chip}", ROCM_COMPILE_FLAGS
    )


@pytest.mark.depends(on=["test_compile_clip_rocm"])
def test_run_clip_rocm(SD3_CLIP_COMMON_RUN_FLAGS, sd3_clip_real_weights):
    # Fix: don't `return` the result — pytest tests returning non-None emit
    # PytestReturnNotNoneWarning (an error in future pytest); this also makes
    # the ROCm test consistent with test_run_clip_cpu above.
    iree_run_module(
        VmfbManager.sd3_clip_rocm_vmfb,
        device="hip",
        function="encode_tokens",
        args=[f"--parameters=model={sd3_clip_real_weights.path}"]
        + SD3_CLIP_COMMON_RUN_FLAGS,
    )
# Copyright 2024 The IREE Authors
#
# Licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

"""Compile/run regression tests for the SD3 MMDiT model."""

import pytest
from ireers_tools import *
import os
from pathlib import Path
from conftest import VmfbManager

# Fix: normalize to Path — the original was a str when the env var was set
# but a Path otherwise. Only ever interpolated into flag strings, so the
# observable flag text is unchanged.
iree_test_path_extension = Path(
    os.getenv("IREE_TEST_PATH_EXTENSION", default=str(Path.cwd()))
)
rocm_chip = os.getenv("ROCM_CHIP", default="gfx90a")

###############################################################################
# Fixtures
###############################################################################

sd3_mmdit_inference_input_0 = fetch_source_fixture(
    "https://sharkpublic.blob.core.windows.net/sharkpublic/sai/sd3-mmdit/inference_input.0.bin",
    group="sd3_mmdit",
)

sd3_mmdit_inference_input_1 = fetch_source_fixture(
    "https://sharkpublic.blob.core.windows.net/sharkpublic/sai/sd3-mmdit/inference_input.1.bin",
    group="sd3_mmdit",
)

sd3_mmdit_inference_input_2 = fetch_source_fixture(
    "https://sharkpublic.blob.core.windows.net/sharkpublic/sai/sd3-mmdit/inference_input.2.bin",
    group="sd3_mmdit",
)

sd3_mmdit_inference_input_3 = fetch_source_fixture(
    "https://sharkpublic.blob.core.windows.net/sharkpublic/sai/sd3-mmdit/inference_input.3.bin",
    group="sd3_mmdit",
)

sd3_mmdit_inference_output_0 = fetch_source_fixture(
    "https://sharkpublic.blob.core.windows.net/sharkpublic/sai/sd3-mmdit/inference_output.0.bin",
    group="sd3_mmdit",
)

sd3_mmdit_real_weights = fetch_source_fixture(
    "https://sharkpublic.blob.core.windows.net/sharkpublic/sai/sd3-mmdit/real_weights.irpa",
    group="sd3_mmdit",
)

sd3_mmdit_mlir = fetch_source_fixture(
    "https://sharkpublic.blob.core.windows.net/sharkpublic/sai/sd3-mmdit/model.mlirbc",
    group="sd3_mmdit",
)

CPU_COMPILE_FLAGS = [
    "--iree-hal-target-backends=llvm-cpu",
    "--iree-llvmcpu-target-cpu-features=host",
    "--iree-llvmcpu-fail-on-out-of-bounds-stack-allocation=false",
    "--iree-llvmcpu-distribution-size=32",
    "--iree-opt-const-eval=false",
    "--iree-llvmcpu-enable-ukernels=all",
    "--iree-global-opt-enable-quantized-matmul-reassociation",
]


@pytest.fixture
def SD3_MMDIT_COMMON_RUN_FLAGS(
    sd3_mmdit_inference_input_0,
    sd3_mmdit_inference_input_1,
    sd3_mmdit_inference_input_2,
    sd3_mmdit_inference_input_3,
    sd3_mmdit_inference_output_0,
):
    return [
        f"--input=2x16x128x128xf16=@{sd3_mmdit_inference_input_0.path}",
        f"--input=2x154x4096xf16=@{sd3_mmdit_inference_input_1.path}",
        f"--input=2x2048xf16=@{sd3_mmdit_inference_input_2.path}",
        f"--input=2xf16=@{sd3_mmdit_inference_input_3.path}",
        f"--expected_output=2x16x128x128xf32=@{sd3_mmdit_inference_output_0.path}",
    ]


ROCM_COMPILE_FLAGS = [
    "--iree-hal-target-backends=rocm",
    f"--iree-rocm-target-chip={rocm_chip}",
    "--iree-opt-const-eval=false",
    f"--iree-codegen-transform-dialect-library={iree_test_path_extension}/attention_and_matmul_spec.mlir",
    "--iree-global-opt-propagate-transposes=true",
    "--iree-global-opt-enable-fuse-horizontal-contractions=true",
    "--iree-flow-enable-aggressive-fusion=true",
    "--iree-opt-aggressively-propagate-transposes=true",
    "--iree-opt-outer-dim-concat=true",
    "--iree-vm-target-truncate-unsupported-floats",
    "--iree-llvmgpu-enable-prefetch=true",
    "--iree-opt-data-tiling=false",
    "--iree-codegen-gpu-native-math-precision=true",
    "--iree-codegen-llvmgpu-use-vector-distribution",
    "--iree-rocm-waves-per-eu=2",
    "--iree-execution-model=async-external",
    "--iree-preprocessing-pass-pipeline=builtin.module(iree-preprocessing-transpose-convolution-pipeline, util.func(iree-preprocessing-pad-to-intrinsics))",
]

###############################################################################
# CPU
###############################################################################


def test_compile_mmdit_cpu(sd3_mmdit_mlir):
    VmfbManager.sd3_mmdit_cpu_vmfb = iree_compile(
        sd3_mmdit_mlir, "cpu", CPU_COMPILE_FLAGS
    )


@pytest.mark.xfail(
    strict=True,
    reason="Expected run to fail",
)
@pytest.mark.depends(on=["test_compile_mmdit_cpu"])
def test_run_mmdit_cpu(SD3_MMDIT_COMMON_RUN_FLAGS, sd3_mmdit_real_weights):
    # Fix: pytest tests must not return non-None values
    # (PytestReturnNotNoneWarning); just call.
    iree_run_module(
        VmfbManager.sd3_mmdit_cpu_vmfb,
        device="local-task",
        function="run_forward",
        args=[f"--parameters=model={sd3_mmdit_real_weights.path}"]
        + SD3_MMDIT_COMMON_RUN_FLAGS,
    )


###############################################################################
# ROCM
###############################################################################


@pytest.mark.xfail(
    strict=True,
    reason="Expected compilation to fail",
)
def test_compile_mmdit_rocm(sd3_mmdit_mlir):
    VmfbManager.sd3_mmdit_rocm_vmfb = iree_compile(
        sd3_mmdit_mlir, f"rocm_{rocm_chip}", ROCM_COMPILE_FLAGS
    )


@pytest.mark.depends(on=["test_compile_mmdit_rocm"])
def test_run_mmdit_rocm(SD3_MMDIT_COMMON_RUN_FLAGS, sd3_mmdit_real_weights):
    iree_run_module(
        VmfbManager.sd3_mmdit_rocm_vmfb,
        device="hip",
        function="run_forward",
        args=[f"--parameters=model={sd3_mmdit_real_weights.path}"]
        + SD3_MMDIT_COMMON_RUN_FLAGS,
    )
# Copyright 2024 The IREE Authors
#
# Licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

"""Compile/run regression tests for the SD3 VAE decoder."""

import pytest
from ireers_tools import *
import os
from conftest import VmfbManager

rocm_chip = os.getenv("ROCM_CHIP", default="gfx90a")

###############################################################################
# Fixtures
###############################################################################

sd3_vae_inference_input_0 = fetch_source_fixture(
    "https://sharkpublic.blob.core.windows.net/sharkpublic/sai/sd3-vae/inference_input.0.bin",
    group="sd3_vae",
)

sd3_vae_inference_output_0 = fetch_source_fixture(
    "https://sharkpublic.blob.core.windows.net/sharkpublic/sai/sd3-vae/inference_output.0.bin",
    group="sd3_vae",
)

sd3_vae_real_weights = fetch_source_fixture(
    "https://sharkpublic.blob.core.windows.net/sharkpublic/sai/sd3-vae/real_weights.irpa",
    group="sd3_vae",
)

sd3_vae_mlir = fetch_source_fixture(
    "https://sharkpublic.blob.core.windows.net/sharkpublic/sai/sd3-vae/model.mlirbc",
    group="sd3_vae",
)

CPU_COMPILE_FLAGS = [
    "--iree-hal-target-backends=llvm-cpu",
    "--iree-llvmcpu-target-cpu-features=host",
    "--iree-llvmcpu-fail-on-out-of-bounds-stack-allocation=false",
    "--iree-llvmcpu-distribution-size=32",
    "--iree-opt-const-eval=false",
    "--iree-llvmcpu-enable-ukernels=all",
    "--iree-global-opt-enable-quantized-matmul-reassociation",
]


@pytest.fixture
def SD3_VAE_COMMON_RUN_FLAGS(
    sd3_vae_inference_input_0,
    sd3_vae_inference_output_0,
):
    return [
        f"--input=1x16x128x128xf16=@{sd3_vae_inference_input_0.path}",
        f"--expected_output=3x1024x1024xf32=@{sd3_vae_inference_output_0.path}",
    ]


ROCM_COMPILE_FLAGS = [
    "--iree-hal-target-backends=rocm",
    f"--iree-rocm-target-chip={rocm_chip}",
    "--iree-opt-const-eval=false",
    "--iree-global-opt-propagate-transposes=true",
    "--iree-opt-outer-dim-concat=true",
    "--iree-llvmgpu-enable-prefetch=true",
    "--iree-rocm-waves-per-eu=2",
    "--iree-flow-enable-aggressive-fusion=true",
    "--iree-codegen-llvmgpu-use-vector-distribution=true",
    "--iree-execution-model=async-external",
    "--iree-preprocessing-pass-pipeline=builtin.module(iree-preprocessing-transpose-convolution-pipeline, util.func(iree-preprocessing-pad-to-intrinsics))",
]

###############################################################################
# CPU
###############################################################################


def test_compile_vae_cpu(sd3_vae_mlir):
    VmfbManager.sd3_vae_cpu_vmfb = iree_compile(sd3_vae_mlir, "cpu", CPU_COMPILE_FLAGS)


@pytest.mark.depends(on=["test_compile_vae_cpu"])
def test_run_vae_cpu(SD3_VAE_COMMON_RUN_FLAGS, sd3_vae_real_weights):
    # Fix: pytest tests must not return non-None values
    # (PytestReturnNotNoneWarning); just call.
    iree_run_module(
        VmfbManager.sd3_vae_cpu_vmfb,
        device="local-task",
        function="decode",
        args=[
            f"--parameters=model={sd3_vae_real_weights.path}",
            "--expected_f32_threshold=0.01f",
        ]
        + SD3_VAE_COMMON_RUN_FLAGS,
    )


###############################################################################
# ROCM
###############################################################################


def test_compile_vae_rocm(sd3_vae_mlir):
    VmfbManager.sd3_vae_rocm_vmfb = iree_compile(
        sd3_vae_mlir, f"rocm_{rocm_chip}", ROCM_COMPILE_FLAGS
    )


@pytest.mark.depends(on=["test_compile_vae_rocm"])
def test_run_vae_rocm(SD3_VAE_COMMON_RUN_FLAGS, sd3_vae_real_weights):
    iree_run_module(
        VmfbManager.sd3_vae_rocm_vmfb,
        device="hip",
        function="decode",
        args=[
            f"--parameters=model={sd3_vae_real_weights.path}",
            "--expected_f32_threshold=0.7f",
        ]
        + SD3_VAE_COMMON_RUN_FLAGS,
    )
# Copyright 2024 The IREE Authors
#
# Licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

"""Compile/run regression tests for the SDXL CLIP prompt encoder."""

import pytest
from ireers_tools import *
import os
from conftest import VmfbManager

rocm_chip = os.getenv("ROCM_CHIP", default="gfx90a")

###############################################################################
# Fixtures
###############################################################################

sdxl_clip_inference_input_0 = fetch_source_fixture(
    "https://sharkpublic.blob.core.windows.net/sharkpublic/sai/sdxl-prompt-encoder/inference_input.0.bin",
    group="sdxl_clip",
)

sdxl_clip_inference_input_1 = fetch_source_fixture(
    "https://sharkpublic.blob.core.windows.net/sharkpublic/sai/sdxl-prompt-encoder/inference_input.1.bin",
    group="sdxl_clip",
)

sdxl_clip_inference_input_2 = fetch_source_fixture(
    "https://sharkpublic.blob.core.windows.net/sharkpublic/sai/sdxl-prompt-encoder/inference_input.2.bin",
    group="sdxl_clip",
)

sdxl_clip_inference_input_3 = fetch_source_fixture(
    "https://sharkpublic.blob.core.windows.net/sharkpublic/sai/sdxl-prompt-encoder/inference_input.3.bin",
    group="sdxl_clip",
)

sdxl_clip_inference_output_0 = fetch_source_fixture(
    "https://sharkpublic.blob.core.windows.net/sharkpublic/sai/sdxl-prompt-encoder/inference_output.0.bin",
    group="sdxl_clip",
)

sdxl_clip_inference_output_1 = fetch_source_fixture(
    "https://sharkpublic.blob.core.windows.net/sharkpublic/sai/sdxl-prompt-encoder/inference_output.1.bin",
    group="sdxl_clip",
)

sdxl_clip_real_weights = fetch_source_fixture(
    "https://sharkpublic.blob.core.windows.net/sharkpublic/sai/sdxl-prompt-encoder/real_weights.irpa",
    group="sdxl_clip",
)

sdxl_clip_mlir = fetch_source_fixture(
    "https://sharkpublic.blob.core.windows.net/sharkpublic/sai/sdxl-prompt-encoder/model.mlirbc",
    group="sdxl_clip",
)

CPU_COMPILE_FLAGS = [
    "--iree-hal-target-backends=llvm-cpu",
    "--iree-llvmcpu-target-cpu-features=host",
    "--iree-llvmcpu-fail-on-out-of-bounds-stack-allocation=false",
    "--iree-llvmcpu-distribution-size=32",
    "--iree-opt-const-eval=false",
    "--iree-llvmcpu-enable-ukernels=all",
    "--iree-global-opt-enable-quantized-matmul-reassociation",
]


@pytest.fixture
def SDXL_CLIP_COMMON_RUN_FLAGS(
    sdxl_clip_inference_input_0,
    sdxl_clip_inference_input_1,
    sdxl_clip_inference_input_2,
    sdxl_clip_inference_input_3,
    sdxl_clip_inference_output_0,
    sdxl_clip_inference_output_1,
):
    return [
        f"--input=1x64xi64=@{sdxl_clip_inference_input_0.path}",
        f"--input=1x64xi64=@{sdxl_clip_inference_input_1.path}",
        f"--input=1x64xi64=@{sdxl_clip_inference_input_2.path}",
        f"--input=1x64xi64=@{sdxl_clip_inference_input_3.path}",
        f"--expected_output=2x64x2048xf16=@{sdxl_clip_inference_output_0.path}",
        f"--expected_output=2x1280xf16=@{sdxl_clip_inference_output_1.path}",
    ]


ROCM_COMPILE_FLAGS = [
    "--iree-hal-target-backends=rocm",
    f"--iree-rocm-target-chip={rocm_chip}",
    "--iree-input-type=torch",
    "--iree-opt-const-eval=false",
    "--iree-global-opt-propagate-transposes=true",
    "--iree-opt-outer-dim-concat=true",
    "--iree-rocm-waves-per-eu=2",
    "--iree-llvmgpu-enable-prefetch",
    "--iree-flow-enable-aggressive-fusion",
    "--iree-global-opt-enable-fuse-horizontal-contractions=true",
    "--iree-opt-aggressively-propagate-transposes=true",
    "--iree-codegen-llvmgpu-use-vector-distribution=true",
    "--iree-execution-model=async-external",
    "--iree-preprocessing-pass-pipeline=builtin.module(iree-preprocessing-transpose-convolution-pipeline, util.func(iree-preprocessing-pad-to-intrinsics{pad-target-type=conv}))",
    "--iree-scheduling-dump-statistics-format=json",
    "--iree-scheduling-dump-statistics-file=compilation_info.json",
]

###############################################################################
# CPU
###############################################################################


def test_compile_clip_cpu(sdxl_clip_mlir):
    VmfbManager.sdxl_clip_cpu_vmfb = iree_compile(
        sdxl_clip_mlir, "cpu", CPU_COMPILE_FLAGS
    )


@pytest.mark.depends(on=["test_compile_clip_cpu"])
def test_run_clip_cpu(SDXL_CLIP_COMMON_RUN_FLAGS, sdxl_clip_real_weights):
    iree_run_module(
        VmfbManager.sdxl_clip_cpu_vmfb,
        device="local-task",
        function="encode_prompts",
        args=[
            f"--parameters=model={sdxl_clip_real_weights.path}",
            "--expected_f16_threshold=1.0f",
        ]
        + SDXL_CLIP_COMMON_RUN_FLAGS,
    )


###############################################################################
# ROCM
###############################################################################


def test_compile_clip_rocm(sdxl_clip_mlir):
    VmfbManager.sdxl_clip_rocm_vmfb = iree_compile(
        sdxl_clip_mlir, f"rocm_{rocm_chip}", ROCM_COMPILE_FLAGS
    )


@pytest.mark.depends(on=["test_compile_clip_rocm"])
def test_run_clip_rocm(SDXL_CLIP_COMMON_RUN_FLAGS, sdxl_clip_real_weights):
    # Fix: don't `return` the result — pytest tests returning non-None emit
    # PytestReturnNotNoneWarning; also matches test_run_clip_cpu above.
    iree_run_module(
        VmfbManager.sdxl_clip_rocm_vmfb,
        device="hip",
        function="encode_prompts",
        args=[
            f"--parameters=model={sdxl_clip_real_weights.path}",
            "--expected_f16_threshold=1.0f",
        ]
        + SDXL_CLIP_COMMON_RUN_FLAGS,
    )
# Copyright 2024 The IREE Authors
#
# Licensed under the Apache License v2.0 with LLVM Exceptions.
# See https://llvm.org/LICENSE.txt for license information.
# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception

"""Compile/run regression tests for the SDXL scheduled UNet (+ pipeline)."""

import pytest
from ireers_tools import *
import os
from conftest import VmfbManager
from pathlib import Path

# Fix: dropped the stray, unused `import setuptools` from the original.
# Fix: normalize to Path — the original was a str when the env var was set
# but a Path otherwise; it is only interpolated into flag strings, so the
# observable flag text is unchanged.
iree_test_path_extension = Path(
    os.getenv("IREE_TEST_PATH_EXTENSION", default=str(Path.cwd()))
)
rocm_chip = os.getenv("ROCM_CHIP", default="gfx90a")

###############################################################################
# Fixtures
###############################################################################

sdxl_unet_inference_input_0 = fetch_source_fixture(
    "https://sharkpublic.blob.core.windows.net/sharkpublic/sai/sdxl-scheduled-unet/inference_input.0.bin",
    group="sdxl_unet",
)

sdxl_unet_inference_input_1 = fetch_source_fixture(
    "https://sharkpublic.blob.core.windows.net/sharkpublic/sai/sdxl-scheduled-unet/inference_input.1.bin",
    group="sdxl_unet",
)

sdxl_unet_inference_input_2 = fetch_source_fixture(
    "https://sharkpublic.blob.core.windows.net/sharkpublic/sai/sdxl-scheduled-unet/inference_input.2.bin",
    group="sdxl_unet",
)

sdxl_unet_inference_input_3 = fetch_source_fixture(
    "https://sharkpublic.blob.core.windows.net/sharkpublic/sai/sdxl-scheduled-unet/inference_input.3.bin",
    group="sdxl_unet",
)

sdxl_unet_inference_output_0 = fetch_source_fixture(
    "https://sharkpublic.blob.core.windows.net/sharkpublic/sai/sdxl-scheduled-unet/inference_output.0.bin",
    group="sdxl_unet",
)

sdxl_unet_real_weights = fetch_source_fixture(
    "https://sharkpublic.blob.core.windows.net/sharkpublic/sai/sdxl-scheduled-unet/real_weights.irpa",
    group="sdxl_unet",
)

sdxl_unet_mlir = fetch_source_fixture(
    "https://sharkpublic.blob.core.windows.net/sharkpublic/sai/sdxl-scheduled-unet/model.mlirbc",
    group="sdxl_unet",
)

sdxl_unet_pipeline_mlir = fetch_source_fixture(
    "https://sharkpublic.blob.core.windows.net/sharkpublic/sai/sdxl-scheduled-unet/sdxl_unet_pipeline_bench_f16.mlir",
    group="sdxl_unet",
)

CPU_COMPILE_FLAGS = [
    "--iree-hal-target-backends=llvm-cpu",
    "--iree-llvmcpu-target-cpu-features=host",
    "--iree-llvmcpu-fail-on-out-of-bounds-stack-allocation=false",
    "--iree-llvmcpu-distribution-size=32",
    "--iree-opt-const-eval=false",
    "--iree-llvmcpu-enable-ukernels=all",
    "--iree-global-opt-enable-quantized-matmul-reassociation",
]


@pytest.fixture
def SDXL_UNET_COMMON_RUN_FLAGS(
    sdxl_unet_inference_input_0,
    sdxl_unet_inference_input_1,
    sdxl_unet_inference_input_2,
    sdxl_unet_inference_input_3,
    sdxl_unet_inference_output_0,
):
    return [
        f"--input=1x4x128x128xf16=@{sdxl_unet_inference_input_0.path}",
        f"--input=2x64x2048xf16=@{sdxl_unet_inference_input_1.path}",
        f"--input=2x1280xf16=@{sdxl_unet_inference_input_2.path}",
        f"--input=1xf16=@{sdxl_unet_inference_input_3.path}",
        f"--expected_output=1x4x128x128xf16=@{sdxl_unet_inference_output_0.path}",
    ]


ROCM_COMPILE_FLAGS = [
    "--iree-hal-target-backends=rocm",
    f"--iree-rocm-target-chip={rocm_chip}",
    "--iree-opt-const-eval=false",
    f"--iree-codegen-transform-dialect-library={iree_test_path_extension}/attention_and_matmul_spec.mlir",
    "--iree-global-opt-propagate-transposes=true",
    "--iree-global-opt-enable-fuse-horizontal-contractions=true",
    "--iree-flow-enable-aggressive-fusion=true",
    "--iree-opt-aggressively-propagate-transposes=true",
    "--iree-opt-outer-dim-concat=true",
    "--iree-vm-target-truncate-unsupported-floats",
    "--iree-llvmgpu-enable-prefetch=true",
    "--iree-opt-data-tiling=false",
    "--iree-codegen-gpu-native-math-precision=true",
    "--iree-codegen-llvmgpu-use-vector-distribution",
    "--iree-rocm-waves-per-eu=2",
    "--iree-execution-model=async-external",
    "--iree-preprocessing-pass-pipeline=builtin.module(iree-preprocessing-transpose-convolution-pipeline, util.func(iree-preprocessing-pad-to-intrinsics))",
    "--iree-scheduling-dump-statistics-format=json",
    "--iree-scheduling-dump-statistics-file=compilation_info.json",
]

ROCM_PIPELINE_COMPILE_FLAGS = [
    "--iree-hal-target-backends=rocm",
    f"--iree-rocm-target-chip={rocm_chip}",
    "--verify=false",
    "--iree-opt-const-eval=false",
]

###############################################################################
# CPU
###############################################################################


def test_compile_unet_pipeline_cpu(sdxl_unet_pipeline_mlir):
    VmfbManager.sdxl_unet_cpu_pipeline_vmfb = iree_compile(
        sdxl_unet_pipeline_mlir,
        "cpu",
        CPU_COMPILE_FLAGS,
    )


def test_compile_unet_cpu(sdxl_unet_mlir):
    VmfbManager.sdxl_unet_cpu_vmfb = iree_compile(
        sdxl_unet_mlir, "cpu", CPU_COMPILE_FLAGS
    )


@pytest.mark.depends(on=["test_compile_unet_pipeline_cpu", "test_compile_unet_cpu"])
def test_run_unet_cpu(SDXL_UNET_COMMON_RUN_FLAGS, sdxl_unet_real_weights):
    # Fix: pytest tests must not return non-None values
    # (PytestReturnNotNoneWarning); just call.
    iree_run_module(
        VmfbManager.sdxl_unet_cpu_vmfb,
        device="local-task",
        function="produce_image_latents",
        args=[
            f"--parameters=model={sdxl_unet_real_weights.path}",
            f"--module={VmfbManager.sdxl_unet_cpu_pipeline_vmfb.path}",
            "--expected_f16_threshold=0.8f",
        ]
        + SDXL_UNET_COMMON_RUN_FLAGS,
    )


###############################################################################
# ROCM
###############################################################################


def test_compile_unet_pipeline_rocm(sdxl_unet_pipeline_mlir):
    VmfbManager.sdxl_unet_rocm_pipeline_vmfb = iree_compile(
        sdxl_unet_pipeline_mlir,
        f"rocm_{rocm_chip}",
        ROCM_PIPELINE_COMPILE_FLAGS,
    )


def test_compile_unet_rocm(sdxl_unet_mlir):
    VmfbManager.sdxl_unet_rocm_vmfb = iree_compile(
        sdxl_unet_mlir, f"rocm_{rocm_chip}", ROCM_COMPILE_FLAGS
    )


@pytest.mark.depends(on=["test_compile_unet_pipeline_rocm", "test_compile_unet_rocm"])
def test_run_unet_rocm(SDXL_UNET_COMMON_RUN_FLAGS, sdxl_unet_real_weights):
    iree_run_module(
        VmfbManager.sdxl_unet_rocm_vmfb,
        device="hip",
        function="produce_image_latents",
        args=[
            f"--parameters=model={sdxl_unet_real_weights.path}",
            f"--module={VmfbManager.sdxl_unet_rocm_pipeline_vmfb.path}",
            "--expected_f16_threshold=0.7f",
        ]
        + SDXL_UNET_COMMON_RUN_FLAGS,
    )
"--iree-llvmcpu-enable-ukernels=all", + "--iree-global-opt-enable-quantized-matmul-reassociation", +] + + +@pytest.fixture +def SDXL_VAE_COMMON_RUN_FLAGS( + sdxl_vae_inference_input_0, + sdxl_vae_inference_output_0, +): + return [ + f"--input=1x4x128x128xf16=@{sdxl_vae_inference_input_0.path}", + f"--expected_output=1x3x1024x1024xf16=@{sdxl_vae_inference_output_0.path}", + ] + + +ROCM_COMPILE_FLAGS = [ + "--iree-hal-target-backends=rocm", + f"--iree-rocm-target-chip={rocm_chip}", + "--iree-opt-const-eval=false", + "--iree-global-opt-propagate-transposes=true", + "--iree-opt-outer-dim-concat=true", + "--iree-llvmgpu-enable-prefetch=true", + "--iree-rocm-waves-per-eu=2", + "--iree-flow-enable-aggressive-fusion=true", + "--iree-codegen-llvmgpu-use-vector-distribution=true", + "--iree-execution-model=async-external", + "--iree-preprocessing-pass-pipeline=builtin.module(iree-preprocessing-transpose-convolution-pipeline, util.func(iree-preprocessing-pad-to-intrinsics))", + "--iree-scheduling-dump-statistics-format=json", + "--iree-scheduling-dump-statistics-file=compilation_info.json", +] + +############################################################################### +# CPU +############################################################################### + + +def test_compile_vae_cpu(sdxl_vae_mlir): + VmfbManager.sdxl_vae_cpu_vmfb = iree_compile( + sdxl_vae_mlir, "cpu", CPU_COMPILE_FLAGS + ) + + +@pytest.mark.depends(on=["test_compile_vae_cpu"]) +def test_run_vae_cpu(SDXL_VAE_COMMON_RUN_FLAGS, sdxl_vae_real_weights): + return iree_run_module( + VmfbManager.sdxl_vae_cpu_vmfb, + device="local-task", + function="main", + args=[ + f"--parameters=model={sdxl_vae_real_weights.path}", + "--expected_f16_threshold=0.02f", + ] + + SDXL_VAE_COMMON_RUN_FLAGS, + ) + + +############################################################################### +# ROCM +############################################################################### + + +def test_compile_vae_rocm(sdxl_vae_mlir): 
+ VmfbManager.sdxl_vae_rocm_vmfb = iree_compile( + sdxl_vae_mlir, f"rocm_{rocm_chip}", ROCM_COMPILE_FLAGS + ) + + +@pytest.mark.depends(on=["test_compile_vae_rocm"]) +def test_run_vae_rocm(SDXL_VAE_COMMON_RUN_FLAGS, sdxl_vae_real_weights): + return iree_run_module( + VmfbManager.sdxl_vae_rocm_vmfb, + device="hip", + function="main", + args=[ + f"--parameters=model={sdxl_vae_real_weights.path}", + "--expected_f16_threshold=0.4f", + ] + + SDXL_VAE_COMMON_RUN_FLAGS, + ) diff --git a/experimental/regression_suite/tests/pregenerated/test_llama2.py b/experimental/regression_suite/tests/pregenerated/test_llama2.py index af9c8d2958c6..0db2abf9e6a4 100644 --- a/experimental/regression_suite/tests/pregenerated/test_llama2.py +++ b/experimental/regression_suite/tests/pregenerated/test_llama2.py @@ -5,7 +5,7 @@ # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception import pytest -from ireers import * +from ireers_tools import * ############################################################################### # Fixtures diff --git a/experimental/regression_suite/tests/pregenerated/test_ukernel.py b/experimental/regression_suite/tests/pregenerated/test_ukernel.py index 90e183a59e55..f9c92c66de22 100644 --- a/experimental/regression_suite/tests/pregenerated/test_ukernel.py +++ b/experimental/regression_suite/tests/pregenerated/test_ukernel.py @@ -5,7 +5,7 @@ # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception import pytest -from ireers import * +from ireers_tools import * ############################################################################### # Fixtures