diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
index 8c69c434e9d9..74b3d90316ca 100644
--- a/.github/workflows/ci.yml
+++ b/.github/workflows/ci.yml
@@ -430,6 +430,50 @@ jobs:
         run: |
           ./build_tools/cmake/ctest_all.sh ${BUILD_DIR}
 
+  test_amd_mi300:
+    needs: [setup, build_all]
+    if: contains(fromJson(needs.setup.outputs.enabled-jobs), 'test_amd_mi300')
+    env:
+      BUILD_DIR: build-tests
+      INSTALL_DIR: ${{ needs.build_all.outputs.install-dir }}
+      INSTALL_DIR_ARCHIVE: ${{ needs.build_all.outputs.install-dir-archive }}
+      INSTALL_DIR_GCS_URL: ${{ needs.build_all.outputs.install-dir-gcs-url }}
+      IREE_CPU_DISABLE: 1
+      IREE_VULKAN_DISABLE: 1
+      IREE_CUDA_DISABLE: 1
+      IREE_HIP_DISABLE: 0
+      IREE_HIP_TEST_TARGET_CHIP: "gfx942"
+      LD_LIBRARY_PATH: /home/esaimana/Python-3.11.9
+    runs-on: nodai-amdgpu-mi300-x86-64
+    steps:
+      - name: Pre Checkout MI300 Step
+        if: contains(matrix.name, 'gfx942')
+        run: |
+          sudo chmod -R 777 ~/actions-runner/_work
+      - name: "Checking out repository"
+        uses: actions/checkout@v4.1.7
+      - name: "Checking out runtime submodules"
+        run: ./build_tools/scripts/git/update_runtime_submodules.sh
+      - name: "Downloading install dir archive"
+        run: wget "${INSTALL_DIR_GCS_URL}" -O "${INSTALL_DIR_ARCHIVE}"
+      - name: "Extracting install directory"
+        run: tar -xf "${INSTALL_DIR_ARCHIVE}"
+      - name: "Building tests"
+        run: |
+          ./build_tools/pkgci/build_tests_using_package.sh ${INSTALL_DIR}
+      - name: "Running GPU tests"
+        env:
+          IREE_CTEST_LABEL_REGEX: ^requires-gpu|^driver=hip$
+          IREE_NVIDIA_SM80_TESTS_DISABLE: 1
+          IREE_MULTI_DEVICE_TESTS_DISABLE: 0
+          IREE_AMD_RDNA3_TESTS_DISABLE: 1
+          IREE_NVIDIA_GPU_TESTS_DISABLE: 0
+          IREE_CUDA_DISABLE: 1
+          IREE_CPU_DISABLE: 1
+          IREE_HIP_DISABLE: 0
+        run: |
+          ./build_tools/cmake/ctest_all.sh ${BUILD_DIR}
+
   test_amd_w7900:
     needs: [setup, build_all]
     if: contains(fromJson(needs.setup.outputs.enabled-jobs), 'test_amd_w7900')
@@ -939,6 +983,7 @@ jobs:
       - test_nvidia_gpu
       - test_nvidia_a100
      - test_amd_mi250
+      - test_amd_mi300
      - test_amd_w7900
 
   # Configurations
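Note on the "Running GPU tests" step in the new test_amd_mi300 job: test selection is driven by the IREE_CTEST_LABEL_REGEX label filter together with the IREE_*_DISABLE / IREE_*_TESTS_DISABLE toggles, all of which are consumed by build_tools/cmake/ctest_all.sh. As a rough sketch only (not copied from that script), and assuming it forwards the regex to ctest's label filter and maps the disable toggles onto label exclusions, the resulting invocation looks something like:

    # Hypothetical expansion of ctest_all.sh for this job's environment;
    # the exclusion pattern is illustrative, not taken from the script.
    ctest --test-dir build-tests \
      --output-on-failure \
      --label-regex '^requires-gpu|^driver=hip$' \
      --label-exclude 'requires-gpu-sm80|driver=cuda|driver=vulkan'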
diff --git a/.github/workflows/pkgci_regression_test.yml b/.github/workflows/pkgci_regression_test.yml
index 99ee321502e0..905038241aab 100644
--- a/.github/workflows/pkgci_regression_test.yml
+++ b/.github/workflows/pkgci_regression_test.yml
@@ -144,13 +144,20 @@ jobs:
             runs-on: nodai-amdgpu-w7900-x86-64
 
           # AMD GPU
-          - name: amdgpu_rocm_gfx90a
+          - name: amdgpu_rocm_mi250_gfx90a
             models-config-file: models_gpu_rocm_gfx90a.json
             models-extra-flags-config-file: models_gpu_rocm_gfx90a_additional_flags.json
             sdxl-unet-config-file: sdxl_scheduled_unet_gpu_rocm_gfx90a.json
             sdxl-vae-config-file: sdxl_vae_decode_gpu_rocm_gfx90a.json
             sdxl-clip-config-file: sdxl_prompt_encoder_gpu_rocm_gfx90a.json
             runs-on: nodai-amdgpu-mi250-x86-64
+          - name: amdgpu_rocm_mi300_gfx942
+            models-config-file: models_gpu_rocm_gfx942.json
+            models-extra-flags-config-file: models_gpu_rocm_gfx942_additional_flags.json
+            sdxl-unet-config-file: sdxl_scheduled_unet_gpu_rocm_gfx942.json
+            sdxl-vae-config-file: sdxl_vae_decode_gpu_rocm_gfx942.json
+            sdxl-clip-config-file: sdxl_prompt_encoder_gpu_rocm_gfx942.json
+            runs-on: nodai-amdgpu-mi300-x86-64
           - name: amdgpu_vulkan
             models-config-file: models_gpu_vulkan.json
             runs-on: nodai-amdgpu-w7900-x86-64
@@ -174,7 +181,14 @@
       SDXL_CLIP_CONFIG_FILE_PATH: build_tools/pkgci/external_test_suite/${{ matrix.sdxl-clip-config-file }}
       SDXL_VAE_CONFIG_FILE_PATH: build_tools/pkgci/external_test_suite/${{ matrix.sdxl-vae-config-file }}
       VENV_DIR: ${{ github.workspace }}/venv
+      LD_LIBRARY_PATH: /home/esaimana/Python-3.11.9
     steps:
+      # TODO(saienduri): Find alternative to this temporary step that manipulates permission of github actions
+      # directory to be able to clean after every PR
+      - name: Pre Checkout MI300 Step
+        if: contains(matrix.name, 'gfx942')
+        run: |
+          sudo chmod -R 777 ~/actions-runner/_work
       - name: Checking out IREE repository
         uses: actions/checkout@v4.1.7
         with:
@@ -293,8 +307,8 @@
             --durations=0 \
             --config-files=${SDXL_VAE_CONFIG_FILE_PATH}
 
-      - name: "Running SDXL rocm pipeline benchmark"
-        if: contains(matrix.name, 'rocm')
+      - name: "Running SDXL rocm pipeline benchmark (mi250)"
+        if: contains(matrix.name, 'rocm_mi250_gfx90a')
         run: |
           source ${VENV_DIR}/bin/activate
           pytest SHARK-TestSuite/iree_tests/benchmarks/sdxl/benchmark_sdxl_rocm.py \
@@ -313,3 +327,25 @@
            --log-cli-level=info \
            --retries 7
           echo "$(<job_summary.md)" >> $GITHUB_STEP_SUMMARY
+          rm job_summary.md
+
+      - name: "Running SDXL rocm pipeline benchmark (mi300)"
+        if: contains(matrix.name, 'rocm_mi300_gfx942')
+        run: |
+          source ${VENV_DIR}/bin/activate
+          pytest SHARK-TestSuite/iree_tests/benchmarks/sdxl/benchmark_sdxl_rocm.py \
+            --goldentime-rocm-e2e-ms 320 \
+            --goldentime-rocm-unet-ms 77 \
+            --goldentime-rocm-clip-ms 15 \
+            --goldentime-rocm-vae-ms 74 \
+            --goldendispatch-rocm-unet 1714 \
+            --goldendispatch-rocm-clip 1569 \
+            --goldendispatch-rocm-vae 248 \
+            --goldensize-rocm-unet-bytes 2054938 \
+            --goldensize-rocm-clip-bytes 780328 \
+            --goldensize-rocm-vae-bytes 758509 \
+            --gpu-number 0 \
+            --rocm-chip gfx942 \
+            --log-cli-level=info \
+            --retries 7
+          echo "$(<job_summary.md)" >> $GITHUB_STEP_SUMMARY
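The amdgpu_rocm_mi300_gfx942 matrix entry above reuses the existing pytest-based runner and only swaps in the gfx942 config files added below. As a rough illustration of what those configs encode, the iree_compile_flags list in models_gpu_rocm_gfx942.json corresponds to an iree-compile invocation along these lines (the input and output file names here are placeholders, not paths from the test suite):

    # Placeholder file names; the flags mirror iree_compile_flags in
    # models_gpu_rocm_gfx942.json below.
    iree-compile model.mlirbc \
      --iree-hal-target-backends=rocm \
      --iree-rocm-target-chip=gfx942 \
      --iree-input-demote-f64-to-f32 \
      -o model_rocm_gfx942.vmfb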
+ "sharktank/llama/open-llama-3b-v2-f16" + ], + "expected_run_failures": [] +} diff --git a/build_tools/pkgci/external_test_suite/models_gpu_rocm_gfx942_additional_flags.json b/build_tools/pkgci/external_test_suite/models_gpu_rocm_gfx942_additional_flags.json new file mode 100644 index 000000000000..28950d0643d5 --- /dev/null +++ b/build_tools/pkgci/external_test_suite/models_gpu_rocm_gfx942_additional_flags.json @@ -0,0 +1,25 @@ +{ + "config_name": "gpu_rocm", + "iree_compile_flags": [ + "--iree-hal-target-backends=rocm", + "--iree-rocm-target-chip=gfx942", + "--iree-input-demote-f64-to-f32", + "--iree-opt-const-eval=false", + "--iree-codegen-transform-dialect-library=${IREE_TEST_PATH_EXTENSION}/attention_and_matmul_spec.mlir" + ], + "iree_run_module_flags": [ + "--device=hip" + ], + "skip_compile_tests": [ + "pytorch/models/sdxl-scheduled-unet-3-tank", + "pytorch/models/sdxl-prompt-encoder-tank", + "pytorch/models/sdxl-vae-decode-tank" + ], + "skip_run_tests": [], + "expected_compile_failures": [ + // TODO(#17344): need to regenerate .mlirbc + "pytorch/models/opt-125M", + "pytorch/models/resnet50" + ], + "expected_run_failures": [] +} diff --git a/build_tools/pkgci/external_test_suite/sdxl_prompt_encoder_gpu_rocm_gfx942.json b/build_tools/pkgci/external_test_suite/sdxl_prompt_encoder_gpu_rocm_gfx942.json new file mode 100644 index 000000000000..e3dbc9b75b0c --- /dev/null +++ b/build_tools/pkgci/external_test_suite/sdxl_prompt_encoder_gpu_rocm_gfx942.json @@ -0,0 +1,36 @@ +{ + "config_name": "gpu_rocm", + "iree_compile_flags": [ + "--iree-hal-target-backends=rocm", + "--iree-rocm-target-chip=gfx942", + "--iree-input-type=torch", + "--iree-opt-const-eval=false", + "--iree-global-opt-propagate-transposes=true", + "--iree-opt-outer-dim-concat=true", + "--iree-rocm-waves-per-eu=2", + "--iree-llvmgpu-enable-prefetch", + "--iree-flow-enable-aggressive-fusion", + "--iree-global-opt-enable-fuse-horizontal-contractions=true", + "--iree-opt-aggressively-propagate-transposes=true", + "--iree-codegen-llvmgpu-use-vector-distribution=true", + "--iree-execution-model=async-external", + "--iree-preprocessing-pass-pipeline=builtin.module(iree-preprocessing-transpose-convolution-pipeline, util.func(iree-preprocessing-pad-to-intrinsics{pad-target-type=conv}))", + "--iree-scheduling-dump-statistics-format=json", + "--iree-scheduling-dump-statistics-file=compilation_info.json" + ], + "iree_run_module_flags": [ + "--device=hip", + "--parameters=model=real_weights.irpa", + "--input=1x64xi64=@inference_input.0.bin", + "--input=1x64xi64=@inference_input.1.bin", + "--input=1x64xi64=@inference_input.2.bin", + "--input=1x64xi64=@inference_input.3.bin", + "--expected_output=2x64x2048xf16=@inference_output.0.bin", + "--expected_output=2x1280xf16=@inference_output.1.bin", + "--expected_f16_threshold=1.0f" + ], + "skip_compile_tests": [], + "skip_run_tests": [], + "expected_compile_failures": [], + "expected_run_failures": [] +} diff --git a/build_tools/pkgci/external_test_suite/sdxl_scheduled_unet_gpu_rocm_gfx942.json b/build_tools/pkgci/external_test_suite/sdxl_scheduled_unet_gpu_rocm_gfx942.json new file mode 100644 index 000000000000..289e99b2af17 --- /dev/null +++ b/build_tools/pkgci/external_test_suite/sdxl_scheduled_unet_gpu_rocm_gfx942.json @@ -0,0 +1,41 @@ +{ + "config_name": "gpu_rocm", + "iree_compile_flags" : [ + "--iree-hal-target-backends=rocm", + "--iree-rocm-target-chip=gfx942", + "--iree-opt-const-eval=false", + 
"--iree-codegen-transform-dialect-library=${IREE_TEST_PATH_EXTENSION}/attention_and_matmul_spec.mlir", + "--iree-global-opt-propagate-transposes=true", + "--iree-global-opt-enable-fuse-horizontal-contractions=true", + "--iree-flow-enable-aggressive-fusion=true", + "--iree-opt-aggressively-propagate-transposes=true", + "--iree-opt-outer-dim-concat=true", + "--iree-vm-target-truncate-unsupported-floats", + "--iree-llvmgpu-enable-prefetch=true", + "--iree-opt-data-tiling=false", + "--iree-codegen-gpu-native-math-precision=true", + "--iree-codegen-llvmgpu-use-vector-distribution", + "--iree-rocm-waves-per-eu=2", + "--iree-execution-model=async-external", + "--iree-preprocessing-pass-pipeline=builtin.module(iree-preprocessing-transpose-convolution-pipeline, util.func(iree-preprocessing-pad-to-intrinsics))", + "--iree-scheduling-dump-statistics-format=json", + "--iree-scheduling-dump-statistics-file=compilation_info.json" + ], + "iree_run_module_flags": [ + "--device=hip", + "--parameters=model=real_weights.irpa", + "--module=sdxl_scheduled_unet_pipeline_fp16_rocm.vmfb", + "--input=1x4x128x128xf16=@inference_input.0.bin", + "--input=2x64x2048xf16=@inference_input.1.bin", + "--input=2x1280xf16=@inference_input.2.bin", + "--input=1xf16=@inference_input.3.bin", + "--expected_output=1x4x128x128xf16=@inference_output.0.bin", + "--expected_f16_threshold=0.7f" + ], + "skip_compile_tests": [], + "skip_run_tests": [], + "expected_compile_failures": [], + "expected_run_failures": [ + "pytorch/models/sdxl-scheduled-unet-3-tank", + ] +} diff --git a/build_tools/pkgci/external_test_suite/sdxl_vae_decode_gpu_rocm_gfx942.json b/build_tools/pkgci/external_test_suite/sdxl_vae_decode_gpu_rocm_gfx942.json new file mode 100644 index 000000000000..1ea72517f283 --- /dev/null +++ b/build_tools/pkgci/external_test_suite/sdxl_vae_decode_gpu_rocm_gfx942.json @@ -0,0 +1,29 @@ +{ + "config_name": "gpu_rocm", + "iree_compile_flags" : [ + "--iree-hal-target-backends=rocm", + "--iree-rocm-target-chip=gfx942", + "--iree-opt-const-eval=false", + "--iree-global-opt-propagate-transposes=true", + "--iree-opt-outer-dim-concat=true", + "--iree-llvmgpu-enable-prefetch=true", + "--iree-rocm-waves-per-eu=2", + "--iree-flow-enable-aggressive-fusion", + "--iree-codegen-llvmgpu-use-vector-distribution=true", + "--iree-execution-model=async-external", + "--iree-preprocessing-pass-pipeline=builtin.module(iree-preprocessing-transpose-convolution-pipeline, util.func(iree-preprocessing-pad-to-intrinsics))", + "--iree-scheduling-dump-statistics-format=json", + "--iree-scheduling-dump-statistics-file=compilation_info.json" + ], + "iree_run_module_flags": [ + "--device=hip", + "--parameters=model=real_weights.irpa", + "--input=1x4x128x128xf16=@inference_input.0.bin", + "--expected_output=1x3x1024x1024xf16=@inference_output.0.bin", + "--expected_f16_threshold=0.4f" + ], + "skip_compile_tests": [], + "skip_run_tests": [], + "expected_compile_failures": [], + "expected_run_failures": [] +}